From 8967f0cca55964060bf54ab15be52012ddda22ab Mon Sep 17 00:00:00 2001 From: Kevin Brown Date: Tue, 28 Apr 2026 05:36:52 +0300 Subject: [PATCH 1/5] WP-1: Newsletter intake from Resend --- core/migrations/0002_newsletter_intake.py | 83 ++++++++++++ core/models.py | 63 +++++++++ core/newsletters.py | 155 ++++++++++++++++++++++ core/serializers.py | 40 +++++- core/settings_types.py | 4 + core/tasks.py | 76 ++++++++++- core/tests/test_newsletters.py | 138 +++++++++++++++++++ core/urls.py | 4 +- core/views.py | 113 ++++++++++++++++ newsletter_maker/settings/__init__.py | 6 + newsletter_maker/settings/base.py | 6 + requirements.txt | 1 + 12 files changed, 681 insertions(+), 8 deletions(-) create mode 100644 core/migrations/0002_newsletter_intake.py create mode 100644 core/newsletters.py create mode 100644 core/tests/test_newsletters.py diff --git a/core/migrations/0002_newsletter_intake.py b/core/migrations/0002_newsletter_intake.py new file mode 100644 index 00000000..86381058 --- /dev/null +++ b/core/migrations/0002_newsletter_intake.py @@ -0,0 +1,83 @@ +from django.db import migrations, models +import django.db.models.deletion + +import core.models + + +class Migration(migrations.Migration): + + dependencies = [ + ("core", "0001_initial"), + ] + + operations = [ + migrations.AddField( + model_name="project", + name="intake_enabled", + field=models.BooleanField(default=False), + ), + migrations.AddField( + model_name="project", + name="intake_token", + field=models.CharField(default=core.models.generate_project_intake_token, editable=False, max_length=64, unique=True), + ), + migrations.AddField( + model_name="content", + name="source_metadata", + field=models.JSONField(blank=True, default=dict), + ), + migrations.CreateModel( + name="IntakeAllowlist", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("sender_email", models.EmailField(max_length=254)), + ("confirmed_at", models.DateTimeField(blank=True, null=True)), + ("confirmation_token", models.CharField(default=core.models.generate_confirmation_token, max_length=64, unique=True)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ( + "project", + models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="intake_allowlist", to="core.project"), + ), + ], + options={ + "ordering": ["sender_email"], + }, + ), + migrations.CreateModel( + name="NewsletterIntake", + fields=[ + ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("sender_email", models.EmailField(max_length=254)), + ("subject", models.CharField(max_length=512)), + ("received_at", models.DateTimeField(auto_now_add=True)), + ("raw_html", models.TextField(blank=True)), + ("raw_text", models.TextField(blank=True)), + ("message_id", models.CharField(max_length=255, unique=True)), + ( + "status", + models.CharField( + choices=[("pending", "Pending"), ("extracted", "Extracted"), ("failed", "Failed"), ("rejected", "Rejected")], + default="pending", + max_length=16, + ), + ), + ("extraction_result", models.JSONField(blank=True, null=True)), + ("error_message", models.TextField(blank=True)), + ( + "project", + models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="newsletter_intakes", to="core.project"), + ), + ], + options={ + "ordering": ["-received_at"], + }, + ), + migrations.AddIndex( + model_name="newsletterintake", + index=models.Index(fields=["project", "sender_email", "status"], name="core_newsle_project_2c63fb_idx"), + ), + migrations.AddConstraint( + model_name="intakeallowlist", + constraint=models.UniqueConstraint(fields=("project", "sender_email"), name="core_allowlist_unique_project_sender"), + ), + ] \ No newline at end of file diff --git a/core/models.py b/core/models.py index e23f7dfe..3b260940 100644 --- a/core/models.py +++ b/core/models.py @@ -1,8 +1,18 @@ +import secrets + from django.conf import settings from django.contrib.auth.models import Group from django.db import models +def generate_project_intake_token() -> str: + return secrets.token_hex(16) + + +def generate_confirmation_token() -> str: + return secrets.token_urlsafe(24) + + class EntityType(models.TextChoices): INDIVIDUAL = "individual", "Individual" VENDOR = "vendor", "Vendor" @@ -26,6 +36,13 @@ class SourcePluginName(models.TextChoices): REDDIT = "reddit", "Reddit" +class NewsletterIntakeStatus(models.TextChoices): + PENDING = "pending", "Pending" + EXTRACTED = "extracted", "Extracted" + FAILED = "failed", "Failed" + REJECTED = "rejected", "Rejected" + + class RunStatus(models.TextChoices): RUNNING = "running", "Running" SUCCESS = "success", "Success" @@ -47,6 +64,8 @@ class Project(models.Model): group = models.ForeignKey(Group, on_delete=models.CASCADE, related_name="projects") topic_description = models.TextField() content_retention_days = models.PositiveIntegerField(default=365) + intake_token = models.CharField(max_length=64, unique=True, default=generate_project_intake_token, editable=False) + intake_enabled = models.BooleanField(default=False) created_at = models.DateTimeField(auto_now_add=True) class Meta: @@ -107,6 +126,7 @@ class Content(models.Model): content_text = models.TextField() relevance_score = models.FloatField(null=True, blank=True) embedding_id = models.CharField(max_length=64, blank=True) + source_metadata = models.JSONField(default=dict, blank=True) is_reference = models.BooleanField(default=False) is_active = models.BooleanField(default=True) @@ -123,6 +143,49 @@ def __str__(self) -> str: return self.title +class IntakeAllowlist(models.Model): + project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="intake_allowlist") + sender_email = models.EmailField() + confirmed_at = models.DateTimeField(null=True, blank=True) + confirmation_token = models.CharField(max_length=64, unique=True, default=generate_confirmation_token) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + ordering = ["sender_email"] + constraints = [ + models.UniqueConstraint(fields=["project", "sender_email"], name="core_allowlist_unique_project_sender"), + ] + + def __str__(self) -> str: + return f"{self.sender_email} for {self.project.name}" + + @property + def is_confirmed(self) -> bool: + return self.confirmed_at is not None + + +class NewsletterIntake(models.Model): + project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="newsletter_intakes") + sender_email = models.EmailField() + subject = models.CharField(max_length=512) + received_at = models.DateTimeField(auto_now_add=True) + raw_html = models.TextField(blank=True) + raw_text = models.TextField(blank=True) + message_id = models.CharField(max_length=255, unique=True) + status = models.CharField(max_length=16, choices=NewsletterIntakeStatus.choices, default=NewsletterIntakeStatus.PENDING) + extraction_result = models.JSONField(null=True, blank=True) + error_message = models.TextField(blank=True) + + class Meta: + ordering = ["-received_at"] + indexes = [ + models.Index(fields=["project", "sender_email", "status"]), + ] + + def __str__(self) -> str: + return f"{self.subject or self.message_id}" + + class SkillResult(models.Model): content = models.ForeignKey(Content, on_delete=models.CASCADE, related_name="skill_results") project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="skill_results") diff --git a/core/newsletters.py b/core/newsletters.py new file mode 100644 index 00000000..81b5b15a --- /dev/null +++ b/core/newsletters.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +import hashlib +import hmac +import re +from dataclasses import dataclass +from email.utils import parseaddr +from html.parser import HTMLParser +from typing import Any, cast + +from django.conf import settings as django_settings + +from core.settings_types import CoreSettings + +settings = cast(CoreSettings, django_settings) + +SCRIPT_TAG_PATTERN = re.compile(r")<[^<]*)*", re.IGNORECASE | re.DOTALL) +INLINE_HANDLER_PATTERN = re.compile(r"\son[a-z]+=(?:\"[^\"]*\"|'[^']*')", re.IGNORECASE) +URL_PATTERN = re.compile(r"https?://[^\s<>'\"]+") + + +def normalize_sender_email(value: str) -> str: + _, email_address = parseaddr(value) + return email_address.strip().lower() + + +def sanitize_newsletter_html(raw_html: str) -> str: + without_scripts = SCRIPT_TAG_PATTERN.sub("", raw_html) + return INLINE_HANDLER_PATTERN.sub("", without_scripts) + + +def compute_resend_signature(payload: bytes, secret: str) -> str: + return hmac.new(secret.encode("utf-8"), payload, hashlib.sha256).hexdigest() + + +def verify_resend_signature(payload: bytes, provided_signature: str) -> bool: + if not settings.RESEND_WEBHOOK_SECRET or not provided_signature: + return False + expected_signature = compute_resend_signature(payload, settings.RESEND_WEBHOOK_SECRET) + return hmac.compare_digest(expected_signature, provided_signature) + + +def extract_project_token(recipient: str) -> str | None: + _, email_address = parseaddr(recipient) + local_part = email_address.partition("@")[0] + prefix, separator, token = local_part.partition("+") + if prefix != "intake" or separator != "+" or not token: + return None + return token + + +def send_confirmation_email(*, to_email: str, confirm_url: str, project_name: str) -> None: + if not settings.RESEND_API_KEY: + raise RuntimeError("RESEND_API_KEY must be configured to send newsletter confirmation emails.") + + import resend + + resend.api_key = settings.RESEND_API_KEY + resend.Emails.send( + { + "from": settings.RESEND_FROM_EMAIL, + "to": [to_email], + "subject": f"Confirm newsletter intake for {project_name}", + "html": ( + "

Confirm this sender for newsletter ingestion.

" + f'

Confirm sender

' + ), + } + ) + + +@dataclass(slots=True) +class ExtractedNewsletterItem: + url: str + title: str + excerpt: str + position: int + + +class _NewsletterLinkParser(HTMLParser): + def __init__(self): + super().__init__() + self.links: list[dict[str, str]] = [] + self._active_href: str | None = None + self._active_text: list[str] = [] + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag != "a": + return + for name, value in attrs: + if name == "href" and value and value.startswith(("http://", "https://")): + self._active_href = value + self._active_text = [] + return + + def handle_data(self, data: str) -> None: + if self._active_href is not None: + self._active_text.append(data) + + def handle_endtag(self, tag: str) -> None: + if tag != "a" or self._active_href is None: + return + self.links.append( + { + "url": self._active_href, + "title": " ".join(part.strip() for part in self._active_text if part.strip()), + } + ) + self._active_href = None + self._active_text = [] + + +def extract_newsletter_items(*, subject: str, raw_html: str, raw_text: str) -> list[ExtractedNewsletterItem]: + parser = _NewsletterLinkParser() + if raw_html: + parser.feed(raw_html) + + seen_urls: set[str] = set() + extracted_items: list[ExtractedNewsletterItem] = [] + for candidate in parser.links: + url = candidate["url"].strip() + if not url or url in seen_urls: + continue + seen_urls.add(url) + extracted_items.append( + ExtractedNewsletterItem( + url=url, + title=candidate["title"] or subject or url, + excerpt=raw_text[:500].strip(), + position=len(extracted_items) + 1, + ) + ) + + for match in URL_PATTERN.finditer(raw_text): + url = match.group(0).rstrip(".,)") + if url in seen_urls: + continue + seen_urls.add(url) + extracted_items.append( + ExtractedNewsletterItem( + url=url, + title=subject or url, + excerpt=raw_text[:500].strip(), + position=len(extracted_items) + 1, + ) + ) + + return extracted_items + + +def get_resend_payload_data(payload: dict[str, Any]) -> dict[str, Any]: + data = payload.get("data") + if isinstance(data, dict): + return data + return payload \ No newline at end of file diff --git a/core/serializers.py b/core/serializers.py index 7f56ef29..16850b4e 100644 --- a/core/serializers.py +++ b/core/serializers.py @@ -5,6 +5,8 @@ Content, Entity, IngestionRun, + IntakeAllowlist, + NewsletterIntake, Project, ProjectConfig, ReviewQueue, @@ -48,7 +50,16 @@ class ProjectSerializer(ProjectScopedSerializerMixin, serializers.ModelSerialize class Meta: model = Project - fields = ["id", "name", "group", "topic_description", "content_retention_days", "created_at"] + fields = [ + "id", + "name", + "group", + "topic_description", + "content_retention_days", + "intake_token", + "intake_enabled", + "created_at", + ] read_only_fields = ["id", "created_at"] @@ -103,6 +114,7 @@ class Meta: "content_text", "relevance_score", "embedding_id", + "source_metadata", "is_reference", "is_active", ] @@ -205,3 +217,29 @@ def validate(self, attrs): if project and content and content.project_id != project.id: raise serializers.ValidationError({"content": "Content must belong to the selected project."}) return attrs + + +class IntakeAllowlistSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): + class Meta: + model = IntakeAllowlist + fields = ["id", "project", "sender_email", "confirmed_at", "confirmation_token", "created_at"] + read_only_fields = ["id", "project", "confirmation_token", "created_at"] + + +class NewsletterIntakeSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): + class Meta: + model = NewsletterIntake + fields = [ + "id", + "project", + "sender_email", + "subject", + "received_at", + "raw_html", + "raw_text", + "message_id", + "status", + "extraction_result", + "error_message", + ] + read_only_fields = ["id", "project", "received_at", "status", "extraction_result", "error_message"] diff --git a/core/settings_types.py b/core/settings_types.py index b06ac4b6..91dcdc64 100644 --- a/core/settings_types.py +++ b/core/settings_types.py @@ -2,6 +2,7 @@ class CoreSettings(Protocol): + CELERY_TASK_ALWAYS_EAGER: bool QDRANT_URL: str EMBEDDING_MODEL: str EMBEDDING_PROVIDER: str @@ -11,3 +12,6 @@ class CoreSettings(Protocol): OPENROUTER_API_BASE: str OPENROUTER_APP_URL: str OPENROUTER_APP_NAME: str + RESEND_API_KEY: str + RESEND_FROM_EMAIL: str + RESEND_WEBHOOK_SECRET: str diff --git a/core/tasks.py b/core/tasks.py index eb1568c3..0f6e0a5f 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -5,7 +5,8 @@ from django.utils import timezone from core.embeddings import upsert_content_embedding -from core.models import Content, IngestionRun, RunStatus, SourceConfig +from core.models import Content, IngestionRun, IntakeAllowlist, NewsletterIntake, NewsletterIntakeStatus, RunStatus, SourceConfig +from core.newsletters import extract_newsletter_items from core.pipeline import ( RELEVANCE_SKILL_NAME, SUMMARIZATION_SKILL_NAME, @@ -107,12 +108,75 @@ def _ingest_source_config(source_config: SourceConfig) -> tuple[int, int]: published_date=item.published_date, content_text=item.content_text, ) - upsert_content_embedding(content) - if settings.CELERY_TASK_ALWAYS_EAGER: - process_content(content.id) - else: - process_content.delay(content.id) + _schedule_content_processing(content) ingested_count += 1 source_config.last_fetched_at = timezone.now() source_config.save(update_fields=["last_fetched_at"]) return len(fetched_items), ingested_count + + +@shared_task(name="core.tasks.process_newsletter_intake") +def process_newsletter_intake(intake_id: int): + intake = NewsletterIntake.objects.select_related("project").get(pk=intake_id) + + allowlist = IntakeAllowlist.objects.filter( + project=intake.project, + sender_email=intake.sender_email, + confirmed_at__isnull=False, + ).first() + if allowlist is None: + intake.status = NewsletterIntakeStatus.PENDING + intake.error_message = "Sender has not confirmed newsletter intake." + intake.save(update_fields=["status", "error_message"]) + return {"status": intake.status, "items_ingested": 0} + + extracted_items = extract_newsletter_items( + subject=intake.subject, + raw_html=intake.raw_html, + raw_text=intake.raw_text, + ) + ingested_count = 0 + for item in extracted_items: + if Content.objects.filter(project=intake.project, url=item.url).exists(): + continue + content = Content.objects.create( + project=intake.project, + url=item.url, + title=item.title[:512], + author=intake.sender_email[:255], + source_plugin="newsletter", + published_date=timezone.now(), + content_text=item.excerpt or intake.raw_text, + source_metadata={ + "newsletter_intake_id": intake.id, + "sender_email": intake.sender_email, + "position": item.position, + }, + ) + _schedule_content_processing(content) + ingested_count += 1 + + intake.status = NewsletterIntakeStatus.EXTRACTED + intake.error_message = "" + intake.extraction_result = { + "method": "heuristic", + "items": [ + { + "url": item.url, + "title": item.title, + "excerpt": item.excerpt, + "position": item.position, + } + for item in extracted_items + ], + } + intake.save(update_fields=["status", "error_message", "extraction_result"]) + return {"status": intake.status, "items_ingested": ingested_count} + + +def _schedule_content_processing(content: Content) -> None: + upsert_content_embedding(content) + if settings.CELERY_TASK_ALWAYS_EAGER: + process_content(content.id) + else: + process_content.delay(content.id) diff --git a/core/tests/test_newsletters.py b/core/tests/test_newsletters.py new file mode 100644 index 00000000..19661f41 --- /dev/null +++ b/core/tests/test_newsletters.py @@ -0,0 +1,138 @@ +import json + +import pytest +from django.contrib.auth.models import Group +from django.urls import reverse + +from core.models import Content, IntakeAllowlist, NewsletterIntake, NewsletterIntakeStatus, Project +from core.newsletters import compute_resend_signature, extract_newsletter_items, sanitize_newsletter_html + + +pytestmark = pytest.mark.django_db + + +@pytest.fixture +def project(): + group = Group.objects.create(name="newsletter-team") + return Project.objects.create( + name="Newsletter Project", + group=group, + topic_description="Platform engineering", + intake_enabled=True, + ) + + +def test_sanitize_newsletter_html_removes_scripts_and_inline_handlers(): + sanitized = sanitize_newsletter_html('
Read
') + + assert "", + "to": [f"intake+{project.intake_token}@inbox.example.com"], + "subject": "Weekly Digest", + "html": 'Read now', + "text": "Read https://example.com/post", + "message_id": "msg-123", + } + } + raw_payload = json.dumps(payload) + + response = client.post( + reverse("resend-inbound-webhook"), + data=raw_payload, + content_type="application/json", + HTTP_X_RESEND_SIGNATURE=compute_resend_signature(raw_payload.encode("utf-8"), settings.RESEND_WEBHOOK_SECRET), + ) + + assert response.status_code == 202 + intake = NewsletterIntake.objects.get(message_id="msg-123") + allowlist = IntakeAllowlist.objects.get(project=project, sender_email="newsletter@example.com") + assert intake.status == NewsletterIntakeStatus.PENDING + assert allowlist.confirmed_at is None + send_mock.assert_called_once() + + +def test_confirm_newsletter_sender_confirms_allowlist_and_queues_pending_intakes(client, settings, mocker, project): + settings.CELERY_TASK_ALWAYS_EAGER = False + allowlist = IntakeAllowlist.objects.create(project=project, sender_email="newsletter@example.com") + intake = NewsletterIntake.objects.create( + project=project, + sender_email="newsletter@example.com", + subject="Digest", + raw_text="Visit https://example.com/post", + message_id="msg-456", + ) + delay_mock = mocker.patch("core.views.process_newsletter_intake.delay") + + response = client.get(reverse("confirm-newsletter-sender", kwargs={"token": allowlist.confirmation_token})) + + assert response.status_code == 200 + allowlist.refresh_from_db() + assert allowlist.confirmed_at is not None + delay_mock.assert_called_once_with(intake.id) + + +def test_process_newsletter_intake_creates_content_for_confirmed_sender(settings, mocker, project): + settings.CELERY_TASK_ALWAYS_EAGER = False + allowlist = IntakeAllowlist.objects.create( + project=project, + sender_email="newsletter@example.com", + confirmed_at="2026-04-28T00:00:00Z", + ) + intake = NewsletterIntake.objects.create( + project=project, + sender_email=allowlist.sender_email, + subject="Digest", + raw_html='Great Article', + raw_text="Great article https://example.com/article", + message_id="msg-789", + ) + upsert_mock = mocker.patch("core.tasks.upsert_content_embedding") + delay_mock = mocker.patch("core.tasks.process_content.delay") + + from core.tasks import process_newsletter_intake + + result = process_newsletter_intake(intake.id) + + assert result["items_ingested"] == 1 + intake.refresh_from_db() + content = Content.objects.get(project=project, url="https://example.com/article") + assert intake.status == NewsletterIntakeStatus.EXTRACTED + assert intake.extraction_result["method"] == "heuristic" + assert content.source_plugin == "newsletter" + assert content.source_metadata["newsletter_intake_id"] == intake.id + upsert_mock.assert_called_once_with(content) + delay_mock.assert_called_once_with(content.id) \ No newline at end of file diff --git a/core/urls.py b/core/urls.py index 47613a1b..73034bd5 100644 --- a/core/urls.py +++ b/core/urls.py @@ -1,8 +1,10 @@ from django.urls import path -from core.views import healthz_view, readyz_view +from core.views import confirm_newsletter_sender_view, healthz_view, readyz_view, resend_inbound_webhook_view urlpatterns = [ path("healthz/", healthz_view, name="healthz"), path("readyz/", readyz_view, name="readyz"), + path("api/v1/inbound/resend/", resend_inbound_webhook_view, name="resend-inbound-webhook"), + path("api/v1/inbound/confirm//", confirm_newsletter_sender_view, name="confirm-newsletter-sender"), ] diff --git a/core/views.py b/core/views.py index afddb1ea..1d45ffcd 100644 --- a/core/views.py +++ b/core/views.py @@ -1,12 +1,27 @@ from http import HTTPStatus +import json from typing import cast from django.conf import settings as django_settings from django.db import connection +from django.http import HttpRequest from django.http import JsonResponse +from django.shortcuts import get_object_or_404 +from django.utils import timezone +from django.views.decorators.csrf import csrf_exempt +from django.views.decorators.http import require_GET, require_POST from qdrant_client import QdrantClient +from core.models import IntakeAllowlist, NewsletterIntake, NewsletterIntakeStatus, Project +from core.newsletters import ( + get_resend_payload_data, + normalize_sender_email, + sanitize_newsletter_html, + send_confirmation_email, + verify_resend_signature, +) from core.settings_types import CoreSettings +from core.tasks import process_newsletter_intake settings = cast(CoreSettings, django_settings) @@ -45,3 +60,101 @@ def _check_qdrant() -> bool: except Exception: return False return True + + +@csrf_exempt +@require_POST +def resend_inbound_webhook_view(request: HttpRequest): + body = request.body + if len(body) > 1_000_000: + return JsonResponse({"detail": "Payload too large."}, status=HTTPStatus.REQUEST_ENTITY_TOO_LARGE) + + signature = request.headers.get("X-Resend-Signature", "") + if not verify_resend_signature(body, signature): + return JsonResponse({"detail": "Invalid webhook signature."}, status=HTTPStatus.UNAUTHORIZED) + + try: + payload = json.loads(body) + except json.JSONDecodeError: + return JsonResponse({"detail": "Invalid JSON payload."}, status=HTTPStatus.BAD_REQUEST) + + payload_data = get_resend_payload_data(payload) + recipients = payload_data.get("to") or [] + if isinstance(recipients, str): + recipients = [recipients] + + project = None + for recipient in recipients: + token = _extract_project_token_from_recipient(recipient) + if token is None: + continue + project = Project.objects.filter(intake_token=token, intake_enabled=True).first() + if project is not None: + break + + if project is None: + return JsonResponse({"detail": "No matching intake project found."}, status=HTTPStatus.NOT_FOUND) + + sender_email = normalize_sender_email(str(payload_data.get("from", ""))) + message_id = str(payload_data.get("message_id") or payload_data.get("email_id") or payload_data.get("id") or "").strip() + if not sender_email or not message_id: + return JsonResponse({"detail": "Missing sender or message identifier."}, status=HTTPStatus.BAD_REQUEST) + + defaults = { + "project": project, + "sender_email": sender_email, + "subject": str(payload_data.get("subject", ""))[:512], + "raw_html": sanitize_newsletter_html(str(payload_data.get("html", "") or payload_data.get("html_body", ""))), + "raw_text": str(payload_data.get("text", "") or payload_data.get("text_body", "")), + } + intake, created = NewsletterIntake.objects.get_or_create(message_id=message_id, defaults=defaults) + if not created: + return JsonResponse({"id": intake.id, "status": intake.status, "duplicate": True}, status=HTTPStatus.OK) + + allowlist, allowlist_created = IntakeAllowlist.objects.get_or_create( + project=project, + sender_email=sender_email, + ) + + if allowlist.is_confirmed: + _queue_newsletter_intake(intake.id) + return JsonResponse({"id": intake.id, "status": intake.status}, status=HTTPStatus.ACCEPTED) + + if allowlist_created: + confirm_url = request.build_absolute_uri(f"/api/v1/inbound/confirm/{allowlist.confirmation_token}/") + send_confirmation_email(to_email=sender_email, confirm_url=confirm_url, project_name=project.name) + + return JsonResponse({"id": intake.id, "status": intake.status, "confirmation_required": True}, status=HTTPStatus.ACCEPTED) + + +@require_GET +def confirm_newsletter_sender_view(request: HttpRequest, token: str): + allowlist = get_object_or_404(IntakeAllowlist, confirmation_token=token) + if allowlist.confirmed_at is None: + allowlist.confirmed_at = timezone.now() + allowlist.save(update_fields=["confirmed_at"]) + + pending_intake_ids = list( + NewsletterIntake.objects.filter( + project=allowlist.project, + sender_email=allowlist.sender_email, + status=NewsletterIntakeStatus.PENDING, + ).values_list("id", flat=True) + ) + for intake_id in pending_intake_ids: + _queue_newsletter_intake(intake_id) + + return JsonResponse({"status": "confirmed", "queued": len(pending_intake_ids)}) + + +def _extract_project_token_from_recipient(recipient: str) -> str | None: + from core.newsletters import extract_project_token + + return extract_project_token(recipient) + + +def _queue_newsletter_intake(intake_id: int) -> None: + if settings.CELERY_TASK_ALWAYS_EAGER: + process_newsletter_intake(intake_id) + else: + process_newsletter_intake.delay(intake_id) diff --git a/newsletter_maker/settings/__init__.py b/newsletter_maker/settings/__init__.py index e0ed9fe3..21fae6f8 100644 --- a/newsletter_maker/settings/__init__.py +++ b/newsletter_maker/settings/__init__.py @@ -40,6 +40,9 @@ REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, + RESEND_API_KEY, + RESEND_FROM_EMAIL, + RESEND_WEBHOOK_SECRET, REST_FRAMEWORK, ROOT_URLCONF, SECRET_KEY, @@ -113,6 +116,9 @@ "REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT", + "RESEND_API_KEY", + "RESEND_FROM_EMAIL", + "RESEND_WEBHOOK_SECRET", "REDIS_URL", "REST_FRAMEWORK", "ROOT_URLCONF", diff --git a/newsletter_maker/settings/base.py b/newsletter_maker/settings/base.py index d6dbe3ab..20378f79 100644 --- a/newsletter_maker/settings/base.py +++ b/newsletter_maker/settings/base.py @@ -44,6 +44,9 @@ def env_list(name: str, default: str = "") -> list[str]: REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID", "") REDDIT_CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET", "") REDDIT_USER_AGENT = os.getenv("REDDIT_USER_AGENT", "newsletter-maker/0.1") +RESEND_API_KEY = os.getenv("RESEND_API_KEY", "") +RESEND_FROM_EMAIL = os.getenv("RESEND_FROM_EMAIL", "onboarding@resend.dev") +RESEND_WEBHOOK_SECRET = os.getenv("RESEND_WEBHOOK_SECRET", "") INSTALLED_APPS = [ # 1. High-priority middleware dependencies @@ -183,6 +186,9 @@ def env_list(name: str, default: str = "") -> list[str]: "REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT", + "RESEND_API_KEY", + "RESEND_FROM_EMAIL", + "RESEND_WEBHOOK_SECRET", "INSTALLED_APPS", "MIDDLEWARE", "ROOT_URLCONF", diff --git a/requirements.txt b/requirements.txt index d24b45a3..05b9ee2f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,6 +47,7 @@ pytest==9.0.3 python-dotenv==1.2.2 qdrant-client==1.17.1 requests-toolbelt==1.0.0 +resend==2.29.0 ruff==0.15.12 sentence-transformers==5.4.1 structlog==25.5.0 From 7ebd9e7cffb10b485e88e84fc1cfa47efb679d8d Mon Sep 17 00:00:00 2001 From: Kevin Brown Date: Tue, 28 Apr 2026 16:36:10 +0300 Subject: [PATCH 2/5] Resend integration --- core/migrations/0002_newsletter_intake.py | 2 +- core/newsletters.py | 2 +- core/tasks.py | 10 +++++++++- core/tests/test_newsletters.py | 3 +-- core/views.py | 5 ++--- frontend/next-env.d.ts | 2 +- 6 files changed, 15 insertions(+), 9 deletions(-) diff --git a/core/migrations/0002_newsletter_intake.py b/core/migrations/0002_newsletter_intake.py index 86381058..70991ab7 100644 --- a/core/migrations/0002_newsletter_intake.py +++ b/core/migrations/0002_newsletter_intake.py @@ -80,4 +80,4 @@ class Migration(migrations.Migration): model_name="intakeallowlist", constraint=models.UniqueConstraint(fields=("project", "sender_email"), name="core_allowlist_unique_project_sender"), ), - ] \ No newline at end of file + ] diff --git a/core/newsletters.py b/core/newsletters.py index 81b5b15a..ca96516d 100644 --- a/core/newsletters.py +++ b/core/newsletters.py @@ -152,4 +152,4 @@ def get_resend_payload_data(payload: dict[str, Any]) -> dict[str, Any]: data = payload.get("data") if isinstance(data, dict): return data - return payload \ No newline at end of file + return payload diff --git a/core/tasks.py b/core/tasks.py index 0f6e0a5f..812812da 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -5,7 +5,15 @@ from django.utils import timezone from core.embeddings import upsert_content_embedding -from core.models import Content, IngestionRun, IntakeAllowlist, NewsletterIntake, NewsletterIntakeStatus, RunStatus, SourceConfig +from core.models import ( + Content, + IngestionRun, + IntakeAllowlist, + NewsletterIntake, + NewsletterIntakeStatus, + RunStatus, + SourceConfig, +) from core.newsletters import extract_newsletter_items from core.pipeline import ( RELEVANCE_SKILL_NAME, diff --git a/core/tests/test_newsletters.py b/core/tests/test_newsletters.py index 19661f41..a9d1ca69 100644 --- a/core/tests/test_newsletters.py +++ b/core/tests/test_newsletters.py @@ -7,7 +7,6 @@ from core.models import Content, IntakeAllowlist, NewsletterIntake, NewsletterIntakeStatus, Project from core.newsletters import compute_resend_signature, extract_newsletter_items, sanitize_newsletter_html - pytestmark = pytest.mark.django_db @@ -135,4 +134,4 @@ def test_process_newsletter_intake_creates_content_for_confirmed_sender(settings assert content.source_plugin == "newsletter" assert content.source_metadata["newsletter_intake_id"] == intake.id upsert_mock.assert_called_once_with(content) - delay_mock.assert_called_once_with(content.id) \ No newline at end of file + delay_mock.assert_called_once_with(content.id) diff --git a/core/views.py b/core/views.py index 1d45ffcd..eacb612e 100644 --- a/core/views.py +++ b/core/views.py @@ -1,11 +1,10 @@ -from http import HTTPStatus import json +from http import HTTPStatus from typing import cast from django.conf import settings as django_settings from django.db import connection -from django.http import HttpRequest -from django.http import JsonResponse +from django.http import HttpRequest, JsonResponse from django.shortcuts import get_object_or_404 from django.utils import timezone from django.views.decorators.csrf import csrf_exempt diff --git a/frontend/next-env.d.ts b/frontend/next-env.d.ts index 9edff1c7..c4b7818f 100644 --- a/frontend/next-env.d.ts +++ b/frontend/next-env.d.ts @@ -1,6 +1,6 @@ /// /// -import "./.next/types/routes.d.ts"; +import "./.next/dev/types/routes.d.ts"; // NOTE: This file should not be edited // see https://nextjs.org/docs/app/api-reference/config/typescript for more information. From 3ae79dbb57d306c8c766afc12d938a22f73db013 Mon Sep 17 00:00:00 2001 From: Kevin Brown Date: Tue, 28 Apr 2026 16:45:24 +0300 Subject: [PATCH 3/5] Add favicon to frontend layout --- frontend/src/app/layout.tsx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frontend/src/app/layout.tsx b/frontend/src/app/layout.tsx index 50070919..b53deeeb 100644 --- a/frontend/src/app/layout.tsx +++ b/frontend/src/app/layout.tsx @@ -19,6 +19,10 @@ const body = Space_Grotesk({ export const metadata: Metadata = { title: "Newsletter Maker Frontend", description: "Minimal dashboard for reviewing ingested newsletter content.", + icons: { + icon: "/favicon.ico", + shortcut: "/favicon.ico", + }, } export default function RootLayout({ From cd321f1f7116dbff350ce265db600edea1b89882 Mon Sep 17 00:00:00 2001 From: Kevin Brown Date: Tue, 28 Apr 2026 18:24:30 +0300 Subject: [PATCH 4/5] Refactor to django-anymail from resend client and add tests --- .env.example | 28 +++ .vscode/settings.json | 3 + README.md | 6 + core/apps.py | 3 + core/newsletters.py | 134 +++++++++---- core/settings_types.py | 4 +- core/signals.py | 38 ++++ core/tests/test_newsletters.py | 267 +++++++++++++++++++++++--- core/urls.py | 3 +- core/views.py | 95 +-------- newsletter_maker/settings/__init__.py | 16 +- newsletter_maker/settings/base.py | 23 ++- newsletter_maker/urls.py | 1 + requirements.txt | 17 +- 14 files changed, 469 insertions(+), 169 deletions(-) create mode 100644 core/signals.py diff --git a/.env.example b/.env.example index 9df9fcb6..9f0658ce 100644 --- a/.env.example +++ b/.env.example @@ -11,6 +11,7 @@ OPENROUTER_API_KEY= OPENROUTER_API_BASE=https://openrouter.ai/api/v1 OPENROUTER_APP_URL= OPENROUTER_APP_NAME=newsletter-maker + AI_CLASSIFICATION_MODEL=meta-llama/llama-3.1-70b-instruct AI_RELEVANCE_MODEL=qwen/qwen-2.5-72b-instruct AI_SUMMARIZATION_MODEL=google/gemma-3-27b-it @@ -21,14 +22,39 @@ AI_RELEVANCE_REVIEW_THRESHOLD=0.4 AI_RELEVANCE_SUMMARIZE_THRESHOLD=0.7 AI_MAX_NODE_RETRIES=2 AI_REQUEST_TIMEOUT_SECONDS=60 + EMBEDDING_PROVIDER=sentence-transformers EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2 EMBEDDING_TRUST_REMOTE_CODE=false + OLLAMA_URL=http://localhost:11434 + REDDIT_CLIENT_ID= REDDIT_CLIENT_SECRET= REDDIT_USER_AGENT=newsletter-maker/0.1 + +# Outbound mail provider. Use Resend or Amazon SES. +EMAIL_BACKEND=anymail.backends.resend.EmailBackend +DEFAULT_FROM_EMAIL=onboarding@resend.dev +SERVER_EMAIL=onboarding@resend.dev + +# Resend outbound + inbound +RESEND_API_KEY= +RESEND_FROM_EMAIL=onboarding@resend.dev +RESEND_INBOUND_SECRET= + +# Amazon SES outbound + inbound +# EMAIL_BACKEND=anymail.backends.amazon_ses.EmailBackend +# AWS_ACCESS_KEY_ID= +# AWS_SECRET_ACCESS_KEY= +# AWS_DEFAULT_REGION=us-east-1 + +# Shared webhook basic auth for providers that need it. +# Format must be username:password. +ANYMAIL_WEBHOOK_SECRET= + LOG_LEVEL=INFO + CELERY_TASK_ALWAYS_EAGER=false DJANGO_SUPERUSER_USERNAME=admin @@ -40,5 +66,7 @@ NEWSLETTER_API_USERNAME=admin NEWSLETTER_API_PASSWORD=adminpass DEBUG=True + ALLOWED_HOSTS=localhost,127.0.0.1,newslettermaker.tech + FRONTEND_URL=http://localhost:3000 diff --git a/.vscode/settings.json b/.vscode/settings.json index d55b96fa..33023242 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,10 +1,12 @@ { "cSpell.words": [ "ASGI", + "botocore", "buildx", "cbranch", "cfgv", "cstat", + "dateutil", "djlint", "FAVICONS", "Feedly", @@ -33,6 +35,7 @@ "readyz", "Referer", "simplejwt", + "svix", "Unparseable", "unstub", "upserted", diff --git a/README.md b/README.md index d451cbd3..a338db17 100644 --- a/README.md +++ b/README.md @@ -177,3 +177,9 @@ For the default local bootstrap, `.env` also seeds an `admin` superuser in the c This repository is licensed under the GNU Affero General Public License v3.0 or later. See [LICENSE](LICENSE). Based on the current direct dependencies, AGPL is a reasonable fit: the packages in use are permissive or LGPL-compatible licenses such as BSD, MIT, Apache-2.0, and LGPLv3. That said, this is a practical compatibility check, not legal advice, so review it with counsel if you need a formal licensing opinion. + +In our skills/relevance_scoring/SKILL.md, I see that we're passing the tenant id. Are we evaluating the content item's relevance against all content items for a project (tenant)? Or how is the relevance actually calculated? + +I notice too we're still using "tenant_id". We changed our naming scheme from "tenant" to "project". We need to update the skills and anywhere else we're still referring to the old "tenant" naming scheme. + +How many Embedding Dimensions are we using to create embeddings? Where are we specifying that value for Qdrant records? diff --git a/core/apps.py b/core/apps.py index c0ce093b..c2c322dd 100644 --- a/core/apps.py +++ b/core/apps.py @@ -4,3 +4,6 @@ class CoreConfig(AppConfig): default_auto_field = "django.db.models.BigAutoField" name = "core" + + def ready(self) -> None: + import core.signals # noqa: F401 diff --git a/core/newsletters.py b/core/newsletters.py index ca96516d..41e205dd 100644 --- a/core/newsletters.py +++ b/core/newsletters.py @@ -1,15 +1,16 @@ from __future__ import annotations -import hashlib -import hmac import re from dataclasses import dataclass from email.utils import parseaddr from html.parser import HTMLParser -from typing import Any, cast +from typing import Any, Iterable, cast from django.conf import settings as django_settings +from django.core.mail import EmailMultiAlternatives +from django.urls import reverse +from core.models import IntakeAllowlist, NewsletterIntake, Project from core.settings_types import CoreSettings settings = cast(CoreSettings, django_settings) @@ -29,17 +30,6 @@ def sanitize_newsletter_html(raw_html: str) -> str: return INLINE_HANDLER_PATTERN.sub("", without_scripts) -def compute_resend_signature(payload: bytes, secret: str) -> str: - return hmac.new(secret.encode("utf-8"), payload, hashlib.sha256).hexdigest() - - -def verify_resend_signature(payload: bytes, provided_signature: str) -> bool: - if not settings.RESEND_WEBHOOK_SECRET or not provided_signature: - return False - expected_signature = compute_resend_signature(payload, settings.RESEND_WEBHOOK_SECRET) - return hmac.compare_digest(expected_signature, provided_signature) - - def extract_project_token(recipient: str) -> str | None: _, email_address = parseaddr(recipient) local_part = email_address.partition("@")[0] @@ -50,24 +40,101 @@ def extract_project_token(recipient: str) -> str | None: def send_confirmation_email(*, to_email: str, confirm_url: str, project_name: str) -> None: - if not settings.RESEND_API_KEY: - raise RuntimeError("RESEND_API_KEY must be configured to send newsletter confirmation emails.") - - import resend - - resend.api_key = settings.RESEND_API_KEY - resend.Emails.send( - { - "from": settings.RESEND_FROM_EMAIL, - "to": [to_email], - "subject": f"Confirm newsletter intake for {project_name}", - "html": ( - "

Confirm this sender for newsletter ingestion.

" - f'

Confirm sender

' - ), - } + subject = f"Confirm newsletter intake for {project_name}" + text_body = ( + "Confirm this sender for newsletter ingestion.\n\n" + f"Confirm sender: {confirm_url}" + ) + html_body = ( + "

Confirm this sender for newsletter ingestion.

" + f'

Confirm sender

' ) + message = EmailMultiAlternatives( + subject=subject, + body=text_body, + from_email=settings.DEFAULT_FROM_EMAIL, + to=[to_email], + ) + message.attach_alternative(html_body, "text/html") + message.send() + + +def build_confirmation_url(token: str) -> str: + base_url = settings.NEWSLETTER_API_BASE_URL.rstrip("/") + return f"{base_url}{reverse('confirm-newsletter-sender', kwargs={'token': token})}" + + +def process_inbound_newsletter( + *, + recipients: Iterable[str], + sender_email: str, + subject: str, + raw_html: str, + raw_text: str, + message_id: str, +) -> dict[str, Any]: + project = _find_intake_project(recipients) + if project is None: + return {"status": "ignored", "reason": "no_matching_project"} + + normalized_sender_email = normalize_sender_email(sender_email) + normalized_message_id = message_id.strip() + if not normalized_sender_email or not normalized_message_id: + return {"status": "ignored", "reason": "missing_sender_or_message_id"} + + defaults = { + "project": project, + "sender_email": normalized_sender_email, + "subject": subject[:512], + "raw_html": sanitize_newsletter_html(raw_html), + "raw_text": raw_text, + } + intake, created = NewsletterIntake.objects.get_or_create( + message_id=normalized_message_id, + defaults=defaults, + ) + if not created: + return {"id": intake.id, "status": intake.status, "duplicate": True} + + allowlist, allowlist_created = IntakeAllowlist.objects.get_or_create( + project=project, + sender_email=normalized_sender_email, + ) + + if allowlist.is_confirmed: + queue_newsletter_intake(intake.id) + return {"id": intake.id, "status": intake.status} + + if allowlist_created: + send_confirmation_email( + to_email=normalized_sender_email, + confirm_url=build_confirmation_url(allowlist.confirmation_token), + project_name=project.name, + ) + + return {"id": intake.id, "status": intake.status, "confirmation_required": True} + + +def queue_newsletter_intake(intake_id: int) -> None: + from core.tasks import process_newsletter_intake + + if settings.CELERY_TASK_ALWAYS_EAGER: + process_newsletter_intake(intake_id) + else: + process_newsletter_intake.delay(intake_id) + + +def _find_intake_project(recipients: Iterable[str]) -> Project | None: + for recipient in recipients: + token = extract_project_token(recipient) + if token is None: + continue + project = Project.objects.filter(intake_token=token, intake_enabled=True).first() + if project is not None: + return project + return None + @dataclass(slots=True) class ExtractedNewsletterItem: @@ -146,10 +213,3 @@ def extract_newsletter_items(*, subject: str, raw_html: str, raw_text: str) -> l ) return extracted_items - - -def get_resend_payload_data(payload: dict[str, Any]) -> dict[str, Any]: - data = payload.get("data") - if isinstance(data, dict): - return data - return payload diff --git a/core/settings_types.py b/core/settings_types.py index 91dcdc64..413570ec 100644 --- a/core/settings_types.py +++ b/core/settings_types.py @@ -3,6 +3,8 @@ class CoreSettings(Protocol): CELERY_TASK_ALWAYS_EAGER: bool + DEFAULT_FROM_EMAIL: str + NEWSLETTER_API_BASE_URL: str QDRANT_URL: str EMBEDDING_MODEL: str EMBEDDING_PROVIDER: str @@ -13,5 +15,3 @@ class CoreSettings(Protocol): OPENROUTER_APP_URL: str OPENROUTER_APP_NAME: str RESEND_API_KEY: str - RESEND_FROM_EMAIL: str - RESEND_WEBHOOK_SECRET: str diff --git a/core/signals.py b/core/signals.py new file mode 100644 index 00000000..19cbf805 --- /dev/null +++ b/core/signals.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from anymail.signals import inbound +from django.dispatch import receiver + +from core.newsletters import process_inbound_newsletter + + +def _address_to_string(address) -> str: + if address is None: + return "" + addr_spec = getattr(address, "addr_spec", None) + if isinstance(addr_spec, str): + return addr_spec.strip() + return str(address).strip() + + +@receiver(inbound) +def handle_anymail_inbound(sender, event, esp_name, **kwargs): + message = event.message + + recipients: list[str] = [] + if message.envelope_recipient: + recipients.append(message.envelope_recipient) + recipients.extend( + address.addr_spec + for address in getattr(message, "to", []) + if getattr(address, "addr_spec", "") + ) + + process_inbound_newsletter( + recipients=recipients, + sender_email=message.envelope_sender or _address_to_string(getattr(message, "from_email", None)), + subject=message.subject or "", + raw_html=message.html or "", + raw_text=message.text or "", + message_id=str(message.get("Message-ID", "") or event.event_id or ""), + ) \ No newline at end of file diff --git a/core/tests/test_newsletters.py b/core/tests/test_newsletters.py index a9d1ca69..2c6ee964 100644 --- a/core/tests/test_newsletters.py +++ b/core/tests/test_newsletters.py @@ -1,11 +1,21 @@ import json +from base64 import b64encode +from datetime import datetime, timezone +from types import SimpleNamespace import pytest from django.contrib.auth.models import Group +from django.core import mail from django.urls import reverse +from svix.webhooks import Webhook from core.models import Content, IntakeAllowlist, NewsletterIntake, NewsletterIntakeStatus, Project -from core.newsletters import compute_resend_signature, extract_newsletter_items, sanitize_newsletter_html +from core.newsletters import ( + extract_newsletter_items, + sanitize_newsletter_html, + send_confirmation_email, +) +from core.signals import handle_anymail_inbound pytestmark = pytest.mark.django_db @@ -40,48 +50,245 @@ def test_extract_newsletter_items_prefers_anchor_titles_and_dedupes_urls(): assert items[1].title == "Weekly Digest" -def test_resend_inbound_webhook_rejects_invalid_signature(client, settings): - settings.RESEND_WEBHOOK_SECRET = "secret" +class FakeInboundMessage: + def __init__( + self, + *, + envelope_recipient: str | None, + from_email: str, + subject: str, + html: str, + text: str, + message_id: str, + to: list[str] | None = None, + ) -> None: + self.envelope_recipient = envelope_recipient + self.envelope_sender = from_email + self.from_email = SimpleNamespace(addr_spec=from_email) + self.subject = subject + self.html = html + self.text = text + self.to = [SimpleNamespace(addr_spec=address) for address in (to or [])] + self._headers = {"Message-ID": message_id} + + def get(self, key: str, default=None): + return self._headers.get(key, default) + + +def _signed_resend_headers(secret: str, payload: str, *, message_id: str) -> dict[str, str]: + timestamp = datetime.now(timezone.utc) + signature = Webhook(secret).sign( + msg_id=message_id, + timestamp=timestamp, + data=payload, + ) + return { + "HTTP_SVIX_ID": message_id, + "HTTP_SVIX_TIMESTAMP": str(int(timestamp.timestamp())), + "HTTP_SVIX_SIGNATURE": signature, + } + + +def _basic_auth_header(credentials: str) -> str: + encoded = b64encode(credentials.encode("utf-8")).decode("ascii") + return f"Basic {encoded}" + + +def test_handle_anymail_inbound_creates_pending_intake_and_sends_confirmation(settings, mocker, project): + settings.NEWSLETTER_API_BASE_URL = "https://example.com" + send_mock = mocker.patch("core.newsletters.send_confirmation_email") + event = SimpleNamespace( + message=FakeInboundMessage( + envelope_recipient=f"intake+{project.intake_token}@inbox.example.com", + from_email="newsletter@example.com", + subject="Weekly Digest", + html='Read now', + text="Read https://example.com/post", + message_id="msg-123", + ), + event_id=None, + ) + + handle_anymail_inbound(sender=object(), event=event, esp_name="Resend") + + intake = NewsletterIntake.objects.get(message_id="msg-123") + allowlist = IntakeAllowlist.objects.get(project=project, sender_email="newsletter@example.com") + assert intake.status == NewsletterIntakeStatus.PENDING + assert allowlist.confirmed_at is None + send_mock.assert_called_once() + assert send_mock.call_args.kwargs["confirm_url"].startswith("https://example.com/api/v1/inbound/confirm/") + + +def test_handle_anymail_inbound_queues_confirmed_sender(settings, mocker, project): + settings.CELERY_TASK_ALWAYS_EAGER = False + send_mock = mocker.patch("core.newsletters.send_confirmation_email") + delay_mock = mocker.patch("core.tasks.process_newsletter_intake.delay") + IntakeAllowlist.objects.create( + project=project, + sender_email="newsletter@example.com", + confirmed_at="2026-04-28T00:00:00Z", + ) + event = SimpleNamespace( + message=FakeInboundMessage( + envelope_recipient=f"intake+{project.intake_token}@inbox.example.com", + from_email="newsletter@example.com", + subject="Weekly Digest", + html='Read now', + text="Read https://example.com/post", + message_id="msg-456", + ), + event_id=None, + ) + + handle_anymail_inbound(sender=object(), event=event, esp_name="Resend") + + intake = NewsletterIntake.objects.get(message_id="msg-456") + delay_mock.assert_called_once_with(intake.id) + send_mock.assert_not_called() + + +def test_resend_inbound_webhook_posts_to_anymail_url(settings, client, mocker, project): + settings.EMAIL_BACKEND = "django.core.mail.backends.locmem.EmailBackend" + settings.DEFAULT_FROM_EMAIL = "noreply@example.com" + settings.NEWSLETTER_API_BASE_URL = "https://example.com" + settings.RESEND_API_KEY = "re_test_key" + settings.RESEND_INBOUND_SECRET = "whsec_test_secret" + settings.ANYMAIL = { + "RESEND_API_KEY": settings.RESEND_API_KEY, + "RESEND_INBOUND_SECRET": settings.RESEND_INBOUND_SECRET, + } + + api_response = mocker.Mock() + api_response.raise_for_status.return_value = None + api_response.json.return_value = { + "from": "newsletter@example.com", + "to": [f"intake+{project.intake_token}@inbox.example.com"], + "subject": "Weekly Digest", + "text": "Read https://example.com/post", + "html": 'Read now', + "message_id": "", + "headers": { + "Message-ID": "", + }, + "attachments": [], + } + mocker.patch("anymail.webhooks.resend.requests.get", return_value=api_response) + + payload = json.dumps( + { + "type": "email.received", + "created_at": "2026-04-28T12:00:00.000Z", + "data": { + "email_id": "re_email_123", + }, + } + ) response = client.post( - reverse("resend-inbound-webhook"), - data=json.dumps({"data": {}}), + "/anymail/resend/inbound/", + data=payload, content_type="application/json", - HTTP_X_RESEND_SIGNATURE="wrong", + **_signed_resend_headers( + settings.RESEND_INBOUND_SECRET, + payload, + message_id="msg_resend_123", + ), ) - assert response.status_code == 401 - - -def test_resend_inbound_webhook_creates_pending_intake_and_sends_confirmation(client, settings, mocker, project): - settings.RESEND_WEBHOOK_SECRET = "secret" - settings.RESEND_API_KEY = "resend-key" - send_mock = mocker.patch("core.views.send_confirmation_email") - payload = { - "data": { - "from": "Sender ", - "to": [f"intake+{project.intake_token}@inbox.example.com"], - "subject": "Weekly Digest", - "html": 'Read now', - "text": "Read https://example.com/post", - "message_id": "msg-123", - } + assert response.status_code == 200 + intake = NewsletterIntake.objects.get(message_id="") + allowlist = IntakeAllowlist.objects.get(project=project, sender_email="newsletter@example.com") + assert intake.status == NewsletterIntakeStatus.PENDING + assert allowlist.confirmed_at is None + assert len(mail.outbox) == 1 + assert "/api/v1/inbound/confirm/" in mail.outbox[0].body + + +def test_amazon_ses_inbound_webhook_posts_to_anymail_url(settings, client, project): + settings.EMAIL_BACKEND = "django.core.mail.backends.locmem.EmailBackend" + settings.DEFAULT_FROM_EMAIL = "noreply@example.com" + settings.NEWSLETTER_API_BASE_URL = "https://example.com" + settings.ANYMAIL = { + "WEBHOOK_SECRET": "anymail:ses-secret", } - raw_payload = json.dumps(payload) + + payload = json.dumps( + { + "Type": "Notification", + "MessageId": "sns-message-123", + "TopicArn": "arn:aws:sns:us-east-1:123456789012:ses-inbound", + "Message": json.dumps( + { + "notificationType": "Received", + "mail": { + "messageId": "ses-message-123", + "timestamp": "2026-04-28T12:00:00.000Z", + "source": "newsletter@example.com", + }, + "receipt": { + "action": { + "type": "SNS", + "encoding": "UTF-8", + }, + "recipients": [ + f"intake+{project.intake_token}@inbox.example.com", + ], + "spamVerdict": { + "status": "PASS", + }, + }, + "content": ( + "From: newsletter@example.com\n" + f"To: intake+{project.intake_token}@inbox.example.com\n" + "Subject: SES Digest\n" + "Message-ID: \n" + "Content-Type: text/plain; charset=utf-8\n" + "\n" + "Read https://example.com/post\n" + ), + } + ), + } + ) response = client.post( - reverse("resend-inbound-webhook"), - data=raw_payload, + "/anymail/amazon_ses/inbound/", + data=payload, content_type="application/json", - HTTP_X_RESEND_SIGNATURE=compute_resend_signature(raw_payload.encode("utf-8"), settings.RESEND_WEBHOOK_SECRET), + HTTP_X_AMZ_SNS_MESSAGE_TYPE="Notification", + HTTP_X_AMZ_SNS_MESSAGE_ID="sns-message-123", + HTTP_AUTHORIZATION=_basic_auth_header(settings.ANYMAIL["WEBHOOK_SECRET"]), ) - assert response.status_code == 202 - intake = NewsletterIntake.objects.get(message_id="msg-123") + assert response.status_code == 200 + intake = NewsletterIntake.objects.get(message_id="") allowlist = IntakeAllowlist.objects.get(project=project, sender_email="newsletter@example.com") assert intake.status == NewsletterIntakeStatus.PENDING assert allowlist.confirmed_at is None - send_mock.assert_called_once() + assert len(mail.outbox) == 1 + assert "/api/v1/inbound/confirm/" in mail.outbox[0].body + + +def test_send_confirmation_email_uses_django_mail_backend(settings): + settings.EMAIL_BACKEND = "django.core.mail.backends.locmem.EmailBackend" + settings.DEFAULT_FROM_EMAIL = "noreply@example.com" + + send_confirmation_email( + to_email="newsletter@example.com", + confirm_url="https://example.com/confirm/token", + project_name="Platform Engineering Weekly", + ) + + assert len(mail.outbox) == 1 + message = mail.outbox[0] + assert message.subject == "Confirm newsletter intake for Platform Engineering Weekly" + assert message.from_email == "noreply@example.com" + assert message.to == ["newsletter@example.com"] + assert "https://example.com/confirm/token" in message.body + assert any( + mimetype == "text/html" and "Confirm sender" in content + for content, mimetype in message.alternatives + ) def test_confirm_newsletter_sender_confirms_allowlist_and_queues_pending_intakes(client, settings, mocker, project): @@ -94,7 +301,7 @@ def test_confirm_newsletter_sender_confirms_allowlist_and_queues_pending_intakes raw_text="Visit https://example.com/post", message_id="msg-456", ) - delay_mock = mocker.patch("core.views.process_newsletter_intake.delay") + delay_mock = mocker.patch("core.tasks.process_newsletter_intake.delay") response = client.get(reverse("confirm-newsletter-sender", kwargs={"token": allowlist.confirmation_token})) diff --git a/core/urls.py b/core/urls.py index 73034bd5..64b858ae 100644 --- a/core/urls.py +++ b/core/urls.py @@ -1,10 +1,9 @@ from django.urls import path -from core.views import confirm_newsletter_sender_view, healthz_view, readyz_view, resend_inbound_webhook_view +from core.views import confirm_newsletter_sender_view, healthz_view, readyz_view urlpatterns = [ path("healthz/", healthz_view, name="healthz"), path("readyz/", readyz_view, name="readyz"), - path("api/v1/inbound/resend/", resend_inbound_webhook_view, name="resend-inbound-webhook"), path("api/v1/inbound/confirm//", confirm_newsletter_sender_view, name="confirm-newsletter-sender"), ] diff --git a/core/views.py b/core/views.py index eacb612e..3d1191cb 100644 --- a/core/views.py +++ b/core/views.py @@ -1,4 +1,3 @@ -import json from http import HTTPStatus from typing import cast @@ -7,20 +6,12 @@ from django.http import HttpRequest, JsonResponse from django.shortcuts import get_object_or_404 from django.utils import timezone -from django.views.decorators.csrf import csrf_exempt -from django.views.decorators.http import require_GET, require_POST +from django.views.decorators.http import require_GET from qdrant_client import QdrantClient -from core.models import IntakeAllowlist, NewsletterIntake, NewsletterIntakeStatus, Project -from core.newsletters import ( - get_resend_payload_data, - normalize_sender_email, - sanitize_newsletter_html, - send_confirmation_email, - verify_resend_signature, -) +from core.models import IntakeAllowlist, NewsletterIntake, NewsletterIntakeStatus +from core.newsletters import queue_newsletter_intake from core.settings_types import CoreSettings -from core.tasks import process_newsletter_intake settings = cast(CoreSettings, django_settings) @@ -61,71 +52,6 @@ def _check_qdrant() -> bool: return True -@csrf_exempt -@require_POST -def resend_inbound_webhook_view(request: HttpRequest): - body = request.body - if len(body) > 1_000_000: - return JsonResponse({"detail": "Payload too large."}, status=HTTPStatus.REQUEST_ENTITY_TOO_LARGE) - - signature = request.headers.get("X-Resend-Signature", "") - if not verify_resend_signature(body, signature): - return JsonResponse({"detail": "Invalid webhook signature."}, status=HTTPStatus.UNAUTHORIZED) - - try: - payload = json.loads(body) - except json.JSONDecodeError: - return JsonResponse({"detail": "Invalid JSON payload."}, status=HTTPStatus.BAD_REQUEST) - - payload_data = get_resend_payload_data(payload) - recipients = payload_data.get("to") or [] - if isinstance(recipients, str): - recipients = [recipients] - - project = None - for recipient in recipients: - token = _extract_project_token_from_recipient(recipient) - if token is None: - continue - project = Project.objects.filter(intake_token=token, intake_enabled=True).first() - if project is not None: - break - - if project is None: - return JsonResponse({"detail": "No matching intake project found."}, status=HTTPStatus.NOT_FOUND) - - sender_email = normalize_sender_email(str(payload_data.get("from", ""))) - message_id = str(payload_data.get("message_id") or payload_data.get("email_id") or payload_data.get("id") or "").strip() - if not sender_email or not message_id: - return JsonResponse({"detail": "Missing sender or message identifier."}, status=HTTPStatus.BAD_REQUEST) - - defaults = { - "project": project, - "sender_email": sender_email, - "subject": str(payload_data.get("subject", ""))[:512], - "raw_html": sanitize_newsletter_html(str(payload_data.get("html", "") or payload_data.get("html_body", ""))), - "raw_text": str(payload_data.get("text", "") or payload_data.get("text_body", "")), - } - intake, created = NewsletterIntake.objects.get_or_create(message_id=message_id, defaults=defaults) - if not created: - return JsonResponse({"id": intake.id, "status": intake.status, "duplicate": True}, status=HTTPStatus.OK) - - allowlist, allowlist_created = IntakeAllowlist.objects.get_or_create( - project=project, - sender_email=sender_email, - ) - - if allowlist.is_confirmed: - _queue_newsletter_intake(intake.id) - return JsonResponse({"id": intake.id, "status": intake.status}, status=HTTPStatus.ACCEPTED) - - if allowlist_created: - confirm_url = request.build_absolute_uri(f"/api/v1/inbound/confirm/{allowlist.confirmation_token}/") - send_confirmation_email(to_email=sender_email, confirm_url=confirm_url, project_name=project.name) - - return JsonResponse({"id": intake.id, "status": intake.status, "confirmation_required": True}, status=HTTPStatus.ACCEPTED) - - @require_GET def confirm_newsletter_sender_view(request: HttpRequest, token: str): allowlist = get_object_or_404(IntakeAllowlist, confirmation_token=token) @@ -141,19 +67,6 @@ def confirm_newsletter_sender_view(request: HttpRequest, token: str): ).values_list("id", flat=True) ) for intake_id in pending_intake_ids: - _queue_newsletter_intake(intake_id) + queue_newsletter_intake(intake_id) return JsonResponse({"status": "confirmed", "queued": len(pending_intake_ids)}) - - -def _extract_project_token_from_recipient(recipient: str) -> str | None: - from core.newsletters import extract_project_token - - return extract_project_token(recipient) - - -def _queue_newsletter_intake(intake_id: int) -> None: - if settings.CELERY_TASK_ALWAYS_EAGER: - process_newsletter_intake(intake_id) - else: - process_newsletter_intake.delay(intake_id) diff --git a/newsletter_maker/settings/__init__.py b/newsletter_maker/settings/__init__.py index 21fae6f8..6541a70f 100644 --- a/newsletter_maker/settings/__init__.py +++ b/newsletter_maker/settings/__init__.py @@ -25,6 +25,8 @@ ACCOUNT_LOGIN_METHODS, ACCOUNT_SIGNUP_FIELDS, ALLOWED_HOSTS, + ANYMAIL, + ANYMAIL_WEBHOOK_SECRET, AUTH_PASSWORD_VALIDATORS, AUTHENTICATION_BACKENDS, BASE_DIR, @@ -33,20 +35,24 @@ DATABASES, DEBUG, DEFAULT_AUTO_FIELD, + DEFAULT_FROM_EMAIL, DRF_STANDARDIZED_ERRORS, + EMAIL_BACKEND, INSTALLED_APPS, LANGUAGE_CODE, MIDDLEWARE, + NEWSLETTER_API_BASE_URL, REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, RESEND_API_KEY, RESEND_FROM_EMAIL, - RESEND_WEBHOOK_SECRET, + RESEND_INBOUND_SECRET, REST_FRAMEWORK, ROOT_URLCONF, SECRET_KEY, SECURE_PROXY_SSL_HEADER, + SERVER_EMAIL, SITE_ID, STATIC_ROOT, STATIC_URL, @@ -97,8 +103,11 @@ "DATABASES", "DATABASE_URL", "DEBUG", + "DEFAULT_FROM_EMAIL", "DEFAULT_AUTO_FIELD", "DRF_STANDARDIZED_ERRORS", + "EMAIL_BACKEND", + "ANYMAIL", "EMBEDDING_MODEL", "EMBEDDING_PROVIDER", "EMBEDDING_TRUST_REMOTE_CODE", @@ -107,6 +116,7 @@ "LOGGING", "LOG_LEVEL", "MIDDLEWARE", + "NEWSLETTER_API_BASE_URL", "OLLAMA_URL", "OPENROUTER_API_BASE", "OPENROUTER_API_KEY", @@ -117,11 +127,13 @@ "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT", "RESEND_API_KEY", + "RESEND_INBOUND_SECRET", "RESEND_FROM_EMAIL", - "RESEND_WEBHOOK_SECRET", + "ANYMAIL_WEBHOOK_SECRET", "REDIS_URL", "REST_FRAMEWORK", "ROOT_URLCONF", + "SERVER_EMAIL", "SECRET_KEY", "SECURE_PROXY_SSL_HEADER", "SITE_ID", diff --git a/newsletter_maker/settings/base.py b/newsletter_maker/settings/base.py index 20378f79..c1bbe547 100644 --- a/newsletter_maker/settings/base.py +++ b/newsletter_maker/settings/base.py @@ -40,13 +40,25 @@ def env_list(name: str, default: str = "") -> list[str]: DATABASE_URL = os.getenv("DATABASE_URL", f"sqlite:///{BASE_DIR / 'db.sqlite3'}") SITE_ID = int(os.getenv("SITE_ID", "1")) +NEWSLETTER_API_BASE_URL = os.getenv("NEWSLETTER_API_BASE_URL", "http://127.0.0.1:8080") REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID", "") REDDIT_CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET", "") REDDIT_USER_AGENT = os.getenv("REDDIT_USER_AGENT", "newsletter-maker/0.1") RESEND_API_KEY = os.getenv("RESEND_API_KEY", "") RESEND_FROM_EMAIL = os.getenv("RESEND_FROM_EMAIL", "onboarding@resend.dev") -RESEND_WEBHOOK_SECRET = os.getenv("RESEND_WEBHOOK_SECRET", "") +RESEND_INBOUND_SECRET = os.getenv("RESEND_INBOUND_SECRET", "") +ANYMAIL_WEBHOOK_SECRET = os.getenv("ANYMAIL_WEBHOOK_SECRET", "") +DEFAULT_FROM_EMAIL = os.getenv("DEFAULT_FROM_EMAIL", RESEND_FROM_EMAIL) +SERVER_EMAIL = os.getenv("SERVER_EMAIL", DEFAULT_FROM_EMAIL) +EMAIL_BACKEND = os.getenv("EMAIL_BACKEND", "anymail.backends.resend.EmailBackend") +ANYMAIL = {} +if RESEND_API_KEY: + ANYMAIL["RESEND_API_KEY"] = RESEND_API_KEY +if RESEND_INBOUND_SECRET: + ANYMAIL["RESEND_INBOUND_SECRET"] = RESEND_INBOUND_SECRET +if ANYMAIL_WEBHOOK_SECRET: + ANYMAIL["WEBHOOK_SECRET"] = ANYMAIL_WEBHOOK_SECRET INSTALLED_APPS = [ # 1. High-priority middleware dependencies @@ -77,6 +89,7 @@ def env_list(name: str, default: str = "") -> list[str]: "allauth.socialaccount", "allauth.socialaccount.providers.google", "allauth.socialaccount.providers.github", + "anymail", # 5. Utilities & Schema Tools "import_export", # Standard library @@ -183,12 +196,18 @@ def env_list(name: str, default: str = "") -> list[str]: "CSRF_TRUSTED_ORIGINS", "DATABASE_URL", "SITE_ID", + "NEWSLETTER_API_BASE_URL", "REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT", "RESEND_API_KEY", + "RESEND_INBOUND_SECRET", "RESEND_FROM_EMAIL", - "RESEND_WEBHOOK_SECRET", + "ANYMAIL_WEBHOOK_SECRET", + "DEFAULT_FROM_EMAIL", + "SERVER_EMAIL", + "EMAIL_BACKEND", + "ANYMAIL", "INSTALLED_APPS", "MIDDLEWARE", "ROOT_URLCONF", diff --git a/newsletter_maker/urls.py b/newsletter_maker/urls.py index 93e2e7bc..0be6c98e 100644 --- a/newsletter_maker/urls.py +++ b/newsletter_maker/urls.py @@ -16,6 +16,7 @@ def root_redirect_view(request): path("", include("core.urls")), path("", root_redirect_view), path("admin/", admin.site.urls), + path("anymail/", include("anymail.urls")), path("api/docs/", SpectacularSwaggerView.as_view(url_name="schema"), name="swagger-ui"), path("api/schema/", SpectacularAPIView.as_view(), name="schema"), path("api/auth/", include("dj_rest_auth.urls")), diff --git a/requirements.txt b/requirements.txt index 05b9ee2f..cbf1a60e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,15 @@ +boto3==1.42.97 +botocore==1.42.97 celery[redis]==5.6.3 -coverage==7.13.5 cfgv==3.5.0 +coverage==7.13.5 cryptography==47.0.0 cssbeautifier==1.15.4 +Deprecated==1.3.1 dj-database-url==3.1.2 dj-rest-auth==7.2.0 django-allauth==65.16.1 +django-anymail==15.0 django-cors-headers==4.9.0 django-import-export==4.4.0 django-stubs-ext==6.0.3 @@ -24,6 +28,7 @@ gunicorn==25.3.0 httpx==0.28.1 identify==2.6.19 inflection==0.5.1 +jmespath==1.1.0 jsbeautifier==1.15.4 langchain-core==1.3.2 langchain-protocol==0.0.12 @@ -36,6 +41,7 @@ librt==0.9.0 mypy==1.20.2 nodeenv==1.10.0 ormsgpack==1.12.2 +pillow==12.2.0 praw==7.8.1 pre-commit==4.6.0 psycopg[binary]==3.3.3 @@ -47,12 +53,17 @@ pytest==9.0.3 python-dotenv==1.2.2 qdrant-client==1.17.1 requests-toolbelt==1.0.0 -resend==2.29.0 ruff==0.15.12 +s3transfer==0.16.1 sentence-transformers==5.4.1 +standardwebhooks==1.0.1 structlog==25.5.0 -types-pyyaml==6.0.12.20260408 +svix==1.92.2 +types-Deprecated==1.3.1.20260408 +types-python-dateutil==2.9.0.20260408 +types-pyyaml==6.0.12 uritemplate==4.2.0 uuid-utils==0.14.1 watchdog==6.0.0 +wrapt==2.1.2 xxhash==3.7.0 From 0ebffbe95c19a83fc5c30fbe1e29123d35ddbfe4 Mon Sep 17 00:00:00 2001 From: Kevin Brown Date: Tue, 28 Apr 2026 19:03:57 +0300 Subject: [PATCH 5/5] Fix CodeQL alerts on CI --- README.md | 10 +- core/newsletter_extraction.py | 86 +++++++++++ core/newsletters.py | 220 ++++++++++++++++----------- core/signals.py | 2 +- core/tasks.py | 2 +- core/tests/test_newsletters.py | 11 ++ newsletter_maker/settings/swagger.py | 10 +- skills/relevance_scoring/SKILL.md | 4 +- 8 files changed, 238 insertions(+), 107 deletions(-) create mode 100644 core/newsletter_extraction.py diff --git a/README.md b/README.md index a338db17..86fb6b76 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Every AI capability is a standalone, documented module following the Claude Skil Seven skills form the core pipeline: | Skill | Description | -|-------|-------------| +| ----- | ----------- | | **Content Classification** | Categorizes raw content (e.g., tutorial, opinion, release notes) and assigns a confidence score. | | **Relevance Scoring** | Evaluates content usefulness using semantic similarity against a reference corpus and LLM judgment. | | **Deduplication** | Compares new content against recent embeddings to group similar topics and pick the best version. | @@ -175,11 +175,3 @@ For the default local bootstrap, `.env` also seeds an `admin` superuser in the c ## License This repository is licensed under the GNU Affero General Public License v3.0 or later. See [LICENSE](LICENSE). - -Based on the current direct dependencies, AGPL is a reasonable fit: the packages in use are permissive or LGPL-compatible licenses such as BSD, MIT, Apache-2.0, and LGPLv3. That said, this is a practical compatibility check, not legal advice, so review it with counsel if you need a formal licensing opinion. - -In our skills/relevance_scoring/SKILL.md, I see that we're passing the tenant id. Are we evaluating the content item's relevance against all content items for a project (tenant)? Or how is the relevance actually calculated? - -I notice too we're still using "tenant_id". We changed our naming scheme from "tenant" to "project". We need to update the skills and anywhere else we're still referring to the old "tenant" naming scheme. - -How many Embedding Dimensions are we using to create embeddings? Where are we specifying that value for Qdrant records? diff --git a/core/newsletter_extraction.py b/core/newsletter_extraction.py new file mode 100644 index 00000000..87e85113 --- /dev/null +++ b/core/newsletter_extraction.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass +from html.parser import HTMLParser + +URL_PATTERN = re.compile(r"https?://[^\s<>'\"]+") + + +@dataclass(slots=True) +class ExtractedNewsletterItem: + url: str + title: str + excerpt: str + position: int + + +class _NewsletterLinkParser(HTMLParser): + def __init__(self) -> None: + super().__init__() + self.links: list[dict[str, str]] = [] + self._active_href: str | None = None + self._active_text: list[str] = [] + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag != "a": + return + for name, value in attrs: + if name == "href" and value and value.startswith(("http://", "https://")): + self._active_href = value + self._active_text = [] + return + + def handle_data(self, data: str) -> None: + if self._active_href is not None: + self._active_text.append(data) + + def handle_endtag(self, tag: str) -> None: + if tag != "a" or self._active_href is None: + return + self.links.append( + { + "url": self._active_href, + "title": " ".join(part.strip() for part in self._active_text if part.strip()), + } + ) + self._active_href = None + self._active_text = [] + + +def extract_newsletter_items(*, subject: str, raw_html: str, raw_text: str) -> list[ExtractedNewsletterItem]: + parser = _NewsletterLinkParser() + if raw_html: + parser.feed(raw_html) + + seen_urls: set[str] = set() + extracted_items: list[ExtractedNewsletterItem] = [] + for candidate in parser.links: + url = candidate["url"].strip() + if not url or url in seen_urls: + continue + seen_urls.add(url) + extracted_items.append( + ExtractedNewsletterItem( + url=url, + title=candidate["title"] or subject or url, + excerpt=raw_text[:500].strip(), + position=len(extracted_items) + 1, + ) + ) + + for match in URL_PATTERN.finditer(raw_text): + url = match.group(0).rstrip(".,)") + if url in seen_urls: + continue + seen_urls.add(url) + extracted_items.append( + ExtractedNewsletterItem( + url=url, + title=subject or url, + excerpt=raw_text[:500].strip(), + position=len(extracted_items) + 1, + ) + ) + + return extracted_items diff --git a/core/newsletters.py b/core/newsletters.py index 41e205dd..a44b45fe 100644 --- a/core/newsletters.py +++ b/core/newsletters.py @@ -1,23 +1,22 @@ from __future__ import annotations -import re -from dataclasses import dataclass from email.utils import parseaddr +from html import escape from html.parser import HTMLParser from typing import Any, Iterable, cast +from celery import current_app from django.conf import settings as django_settings from django.core.mail import EmailMultiAlternatives from django.urls import reverse from core.models import IntakeAllowlist, NewsletterIntake, Project +from core.newsletter_extraction import extract_newsletter_items from core.settings_types import CoreSettings settings = cast(CoreSettings, django_settings) -SCRIPT_TAG_PATTERN = re.compile(r")<[^<]*)*", re.IGNORECASE | re.DOTALL) -INLINE_HANDLER_PATTERN = re.compile(r"\son[a-z]+=(?:\"[^\"]*\"|'[^']*')", re.IGNORECASE) -URL_PATTERN = re.compile(r"https?://[^\s<>'\"]+") +__all__ = ["extract_newsletter_items"] def normalize_sender_email(value: str) -> str: @@ -26,8 +25,131 @@ def normalize_sender_email(value: str) -> str: def sanitize_newsletter_html(raw_html: str) -> str: - without_scripts = SCRIPT_TAG_PATTERN.sub("", raw_html) - return INLINE_HANDLER_PATTERN.sub("", without_scripts) + without_scripts = _strip_script_blocks(raw_html) + parser = _InlineHandlerStrippingParser() + parser.feed(without_scripts) + parser.close() + return parser.get_html() + + +def _strip_script_blocks(raw_html: str) -> str: + sanitized_parts: list[str] = [] + index = 0 + raw_length = len(raw_html) + while index < raw_length: + script_start = _find_script_start(raw_html, index) + if script_start == -1: + sanitized_parts.append(raw_html[index:]) + break + + sanitized_parts.append(raw_html[index:script_start]) + opening_tag_end = _find_tag_end(raw_html, script_start + 1) + if opening_tag_end == -1: + break + + script_end = _find_script_end(raw_html, opening_tag_end + 1) + if script_end == -1: + break + index = script_end + + return "".join(sanitized_parts) + + +def _find_script_start(raw_html: str, start_index: int) -> int: + search_index = start_index + while True: + candidate = raw_html.lower().find("= len(raw_html) or raw_html[tag_name_end] in " \t\r\n\f/>": + return candidate + search_index = candidate + 1 + + +def _find_script_end(raw_html: str, start_index: int) -> int: + search_index = start_index + lower_html = raw_html.lower() + while True: + candidate = lower_html.find("": + search_index = candidate + 1 + continue + closing_tag_end = _find_tag_end(raw_html, candidate + 1) + if closing_tag_end == -1: + return len(raw_html) + return closing_tag_end + 1 + + +def _find_tag_end(raw_html: str, start_index: int) -> int: + quote_char: str | None = None + for index in range(start_index, len(raw_html)): + current_char = raw_html[index] + if quote_char is not None: + if current_char == quote_char: + quote_char = None + continue + if current_char in {'"', "'"}: + quote_char = current_char + continue + if current_char == ">": + return index + return -1 + + +class _InlineHandlerStrippingParser(HTMLParser): + def __init__(self) -> None: + super().__init__(convert_charrefs=False) + self._parts: list[str] = [] + + def get_html(self) -> str: + return "".join(self._parts) + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + self._parts.append(self._render_tag(tag, attrs)) + + def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + rendered = self._render_tag(tag, attrs) + if rendered.endswith(">"): + rendered = f"{rendered[:-1]} />" + self._parts.append(rendered) + + def handle_endtag(self, tag: str) -> None: + self._parts.append(f"") + + def handle_data(self, data: str) -> None: + self._parts.append(data) + + def handle_entityref(self, name: str) -> None: + self._parts.append(f"&{name};") + + def handle_charref(self, name: str) -> None: + self._parts.append(f"&#{name};") + + def handle_comment(self, data: str) -> None: + self._parts.append(f"") + + def handle_decl(self, decl: str) -> None: + self._parts.append(f"") + + def unknown_decl(self, data: str) -> None: + self._parts.append(f"") + + @staticmethod + def _render_tag(tag: str, attrs: list[tuple[str, str | None]]) -> str: + rendered_attrs: list[str] = [] + for name, value in attrs: + if name.lower().startswith("on"): + continue + if value is None: + rendered_attrs.append(name) + continue + rendered_attrs.append(f'{name}="{escape(value, quote=True)}"') + attr_suffix = f" {' '.join(rendered_attrs)}" if rendered_attrs else "" + return f"<{tag}{attr_suffix}>" def extract_project_token(recipient: str) -> str | None: @@ -117,10 +239,9 @@ def process_inbound_newsletter( def queue_newsletter_intake(intake_id: int) -> None: - from core.tasks import process_newsletter_intake - + process_newsletter_intake = current_app.tasks["core.tasks.process_newsletter_intake"] if settings.CELERY_TASK_ALWAYS_EAGER: - process_newsletter_intake(intake_id) + process_newsletter_intake.apply(args=(intake_id,), throw=True) else: process_newsletter_intake.delay(intake_id) @@ -134,82 +255,3 @@ def _find_intake_project(recipients: Iterable[str]) -> Project | None: if project is not None: return project return None - - -@dataclass(slots=True) -class ExtractedNewsletterItem: - url: str - title: str - excerpt: str - position: int - - -class _NewsletterLinkParser(HTMLParser): - def __init__(self): - super().__init__() - self.links: list[dict[str, str]] = [] - self._active_href: str | None = None - self._active_text: list[str] = [] - - def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: - if tag != "a": - return - for name, value in attrs: - if name == "href" and value and value.startswith(("http://", "https://")): - self._active_href = value - self._active_text = [] - return - - def handle_data(self, data: str) -> None: - if self._active_href is not None: - self._active_text.append(data) - - def handle_endtag(self, tag: str) -> None: - if tag != "a" or self._active_href is None: - return - self.links.append( - { - "url": self._active_href, - "title": " ".join(part.strip() for part in self._active_text if part.strip()), - } - ) - self._active_href = None - self._active_text = [] - - -def extract_newsletter_items(*, subject: str, raw_html: str, raw_text: str) -> list[ExtractedNewsletterItem]: - parser = _NewsletterLinkParser() - if raw_html: - parser.feed(raw_html) - - seen_urls: set[str] = set() - extracted_items: list[ExtractedNewsletterItem] = [] - for candidate in parser.links: - url = candidate["url"].strip() - if not url or url in seen_urls: - continue - seen_urls.add(url) - extracted_items.append( - ExtractedNewsletterItem( - url=url, - title=candidate["title"] or subject or url, - excerpt=raw_text[:500].strip(), - position=len(extracted_items) + 1, - ) - ) - - for match in URL_PATTERN.finditer(raw_text): - url = match.group(0).rstrip(".,)") - if url in seen_urls: - continue - seen_urls.add(url) - extracted_items.append( - ExtractedNewsletterItem( - url=url, - title=subject or url, - excerpt=raw_text[:500].strip(), - position=len(extracted_items) + 1, - ) - ) - - return extracted_items diff --git a/core/signals.py b/core/signals.py index 19cbf805..8a4327f6 100644 --- a/core/signals.py +++ b/core/signals.py @@ -35,4 +35,4 @@ def handle_anymail_inbound(sender, event, esp_name, **kwargs): raw_html=message.html or "", raw_text=message.text or "", message_id=str(message.get("Message-ID", "") or event.event_id or ""), - ) \ No newline at end of file + ) diff --git a/core/tasks.py b/core/tasks.py index 812812da..bed6c143 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -14,7 +14,7 @@ RunStatus, SourceConfig, ) -from core.newsletters import extract_newsletter_items +from core.newsletter_extraction import extract_newsletter_items from core.pipeline import ( RELEVANCE_SKILL_NAME, SUMMARIZATION_SKILL_NAME, diff --git a/core/tests/test_newsletters.py b/core/tests/test_newsletters.py index 2c6ee964..068f4a02 100644 --- a/core/tests/test_newsletters.py +++ b/core/tests/test_newsletters.py @@ -38,6 +38,17 @@ def test_sanitize_newsletter_html_removes_scripts_and_inline_handlers(): assert "onclick=" not in sanitized +def test_sanitize_newsletter_html_removes_scripts_with_malformed_end_tags(): + sanitized = sanitize_newsletter_html( + '
Read
' + ) + + assert "alert(1)" not in sanitized + assert "= 0.85: use the similarity score directly. - Similarity < 0.5: use the similarity score directly.