Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ OPENROUTER_API_KEY=
OPENROUTER_API_BASE=https://openrouter.ai/api/v1
OPENROUTER_APP_URL=
OPENROUTER_APP_NAME=newsletter-maker

AI_CLASSIFICATION_MODEL=meta-llama/llama-3.1-70b-instruct
AI_RELEVANCE_MODEL=qwen/qwen-2.5-72b-instruct
AI_SUMMARIZATION_MODEL=google/gemma-3-27b-it
Expand All @@ -21,14 +22,39 @@ AI_RELEVANCE_REVIEW_THRESHOLD=0.4
AI_RELEVANCE_SUMMARIZE_THRESHOLD=0.7
AI_MAX_NODE_RETRIES=2
AI_REQUEST_TIMEOUT_SECONDS=60

EMBEDDING_PROVIDER=sentence-transformers
EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
EMBEDDING_TRUST_REMOTE_CODE=false

OLLAMA_URL=http://localhost:11434

REDDIT_CLIENT_ID=
REDDIT_CLIENT_SECRET=
REDDIT_USER_AGENT=newsletter-maker/0.1

# Outbound mail provider. Use Resend or Amazon SES.
EMAIL_BACKEND=anymail.backends.resend.EmailBackend
DEFAULT_FROM_EMAIL=onboarding@resend.dev
SERVER_EMAIL=onboarding@resend.dev

# Resend outbound + inbound
RESEND_API_KEY=
RESEND_FROM_EMAIL=onboarding@resend.dev
RESEND_INBOUND_SECRET=

# Amazon SES outbound + inbound
# EMAIL_BACKEND=anymail.backends.amazon_ses.EmailBackend
# AWS_ACCESS_KEY_ID=
# AWS_SECRET_ACCESS_KEY=
# AWS_DEFAULT_REGION=us-east-1

# Shared webhook basic auth for providers that need it.
# Format must be username:password.
ANYMAIL_WEBHOOK_SECRET=

LOG_LEVEL=INFO

CELERY_TASK_ALWAYS_EAGER=false

DJANGO_SUPERUSER_USERNAME=admin
Expand All @@ -40,5 +66,7 @@ NEWSLETTER_API_USERNAME=admin
NEWSLETTER_API_PASSWORD=adminpass

DEBUG=True

ALLOWED_HOSTS=localhost,127.0.0.1,newslettermaker.tech

FRONTEND_URL=http://localhost:3000
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
{
"cSpell.words": [
"ASGI",
"botocore",
"buildx",
"cbranch",
"cfgv",
"cstat",
"dateutil",
"djlint",
"FAVICONS",
"Feedly",
Expand Down Expand Up @@ -33,6 +35,7 @@
"readyz",
"Referer",
"simplejwt",
"svix",
"Unparseable",
"unstub",
"upserted",
Expand Down
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Every AI capability is a standalone, documented module following the Claude Skil
Seven skills form the core pipeline:

| Skill | Description |
|-------|-------------|
| ----- | ----------- |
| **Content Classification** | Categorizes raw content (e.g., tutorial, opinion, release notes) and assigns a confidence score. |
| **Relevance Scoring** | Evaluates content usefulness using semantic similarity against a reference corpus and LLM judgment. |
| **Deduplication** | Compares new content against recent embeddings to group similar topics and pick the best version. |
Expand Down Expand Up @@ -175,5 +175,3 @@ For the default local bootstrap, `.env` also seeds an `admin` superuser in the c
## License

This repository is licensed under the GNU Affero General Public License v3.0 or later. See [LICENSE](LICENSE).

Based on the current direct dependencies, AGPL is a reasonable fit: the packages in use are permissive or LGPL-compatible licenses such as BSD, MIT, Apache-2.0, and LGPLv3. That said, this is a practical compatibility check, not legal advice, so review it with counsel if you need a formal licensing opinion.
3 changes: 3 additions & 0 deletions core/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
class CoreConfig(AppConfig):
    """Django application configuration for the ``core`` app."""

    default_auto_field = "django.db.models.BigAutoField"
    name = "core"

    def ready(self) -> None:
        """Import ``core.signals`` once the app is ready."""
        # Imported only for its import-time side effects (presumably signal
        # receiver registration -- confirm in core/signals.py). The imported
        # name is never used, hence the F401 suppression.
        import core.signals  # noqa: F401
83 changes: 83 additions & 0 deletions core/migrations/0002_newsletter_intake.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from django.db import migrations, models
import django.db.models.deletion

import core.models


def _assign_unique_intake_tokens(apps, schema_editor):
    """Give every pre-existing project its own freshly generated intake token.

    ``AddField`` evaluates a callable default only once and writes that single
    value to every existing row, so the rows must be re-tokenised one by one
    before the unique constraint can be applied.
    """
    Project = apps.get_model("core", "Project")
    projects = Project.objects.using(schema_editor.connection.alias)
    for project in projects.all().iterator():
        project.intake_token = core.models.generate_project_intake_token()
        project.save(update_fields=["intake_token"])


class Migration(migrations.Migration):
    """Add newsletter intake support: project intake fields, content metadata, allowlist and intake tables.

    ``Project.intake_token`` is introduced in three steps (nullable add, per-row
    backfill, then tighten to unique/non-null) because adding a unique field
    with a callable default directly fails as soon as the table holds more than
    one row.

    NOTE(review): this mixes a data step with schema changes in one migration.
    Django's documentation advises splitting such steps into separate migration
    files on PostgreSQL if "pending trigger events" errors are observed.
    """

    dependencies = [
        ("core", "0001_initial"),
    ]

    operations = [
        migrations.AddField(
            model_name="project",
            name="intake_enabled",
            field=models.BooleanField(default=False),
        ),
        # Step 1: add the column without the unique constraint so existing rows
        # can temporarily share the single default value Django generates.
        migrations.AddField(
            model_name="project",
            name="intake_token",
            field=models.CharField(default=core.models.generate_project_intake_token, editable=False, max_length=64, null=True),
        ),
        # Step 2: replace the shared value with a distinct token per project.
        # Reversing is a no-op because the column itself is dropped on rollback.
        migrations.RunPython(_assign_unique_intake_tokens, migrations.RunPython.noop),
        # Step 3: tighten to the final definition declared on the model.
        migrations.AlterField(
            model_name="project",
            name="intake_token",
            field=models.CharField(default=core.models.generate_project_intake_token, editable=False, max_length=64, unique=True),
        ),
        migrations.AddField(
            model_name="content",
            name="source_metadata",
            field=models.JSONField(blank=True, default=dict),
        ),
        migrations.CreateModel(
            name="IntakeAllowlist",
            fields=[
                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                ("sender_email", models.EmailField(max_length=254)),
                ("confirmed_at", models.DateTimeField(blank=True, null=True)),
                ("confirmation_token", models.CharField(default=core.models.generate_confirmation_token, max_length=64, unique=True)),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                (
                    "project",
                    models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="intake_allowlist", to="core.project"),
                ),
            ],
            options={
                "ordering": ["sender_email"],
            },
        ),
        migrations.CreateModel(
            name="NewsletterIntake",
            fields=[
                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                ("sender_email", models.EmailField(max_length=254)),
                ("subject", models.CharField(max_length=512)),
                ("received_at", models.DateTimeField(auto_now_add=True)),
                ("raw_html", models.TextField(blank=True)),
                ("raw_text", models.TextField(blank=True)),
                ("message_id", models.CharField(max_length=255, unique=True)),
                (
                    "status",
                    models.CharField(
                        choices=[("pending", "Pending"), ("extracted", "Extracted"), ("failed", "Failed"), ("rejected", "Rejected")],
                        default="pending",
                        max_length=16,
                    ),
                ),
                ("extraction_result", models.JSONField(blank=True, null=True)),
                ("error_message", models.TextField(blank=True)),
                (
                    "project",
                    models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="newsletter_intakes", to="core.project"),
                ),
            ],
            options={
                "ordering": ["-received_at"],
            },
        ),
        migrations.AddIndex(
            model_name="newsletterintake",
            index=models.Index(fields=["project", "sender_email", "status"], name="core_newsle_project_2c63fb_idx"),
        ),
        migrations.AddConstraint(
            model_name="intakeallowlist",
            constraint=models.UniqueConstraint(fields=("project", "sender_email"), name="core_allowlist_unique_project_sender"),
        ),
    ]
63 changes: 63 additions & 0 deletions core/models.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
import secrets

from django.conf import settings
from django.contrib.auth.models import Group
from django.db import models


def generate_project_intake_token() -> str:
    """Return a random token: 16 cryptographically secure bytes as 32 lowercase hex characters."""
    random_bytes = secrets.token_bytes(16)
    return random_bytes.hex()


def generate_confirmation_token() -> str:
    """Return a random URL-safe text token built from 24 cryptographically secure bytes."""
    entropy_bytes = 24  # 24 bytes encode to exactly 32 base64url characters
    token = secrets.token_urlsafe(entropy_bytes)
    return token


class EntityType(models.TextChoices):
INDIVIDUAL = "individual", "Individual"
VENDOR = "vendor", "Vendor"
Expand All @@ -26,6 +36,13 @@ class SourcePluginName(models.TextChoices):
REDDIT = "reddit", "Reddit"


class NewsletterIntakeStatus(models.TextChoices):
    """Allowed values for ``NewsletterIntake.status``.

    Each member is declared as ``(stored value, human-readable label)``.
    """

    PENDING = "pending", "Pending"  # default status of a new NewsletterIntake row
    EXTRACTED = "extracted", "Extracted"
    FAILED = "failed", "Failed"
    REJECTED = "rejected", "Rejected"


class RunStatus(models.TextChoices):
RUNNING = "running", "Running"
SUCCESS = "success", "Success"
Expand All @@ -47,6 +64,8 @@ class Project(models.Model):
group = models.ForeignKey(Group, on_delete=models.CASCADE, related_name="projects")
topic_description = models.TextField()
content_retention_days = models.PositiveIntegerField(default=365)
intake_token = models.CharField(max_length=64, unique=True, default=generate_project_intake_token, editable=False)
intake_enabled = models.BooleanField(default=False)
created_at = models.DateTimeField(auto_now_add=True)

class Meta:
Expand Down Expand Up @@ -107,6 +126,7 @@ class Content(models.Model):
content_text = models.TextField()
relevance_score = models.FloatField(null=True, blank=True)
embedding_id = models.CharField(max_length=64, blank=True)
source_metadata = models.JSONField(default=dict, blank=True)
is_reference = models.BooleanField(default=False)
is_active = models.BooleanField(default=True)

Expand All @@ -123,6 +143,49 @@ def __str__(self) -> str:
return self.title


class IntakeAllowlist(models.Model):
    """A sender email address recorded on a project's intake allowlist.

    Each ``(project, sender_email)`` pair may appear at most once.
    """

    project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="intake_allowlist")
    sender_email = models.EmailField()
    # NULL until the entry is confirmed; see ``is_confirmed``.
    confirmed_at = models.DateTimeField(null=True, blank=True)
    # Random, globally unique token generated per row.
    # NOTE(review): presumably embedded in a confirmation link -- verify against the caller.
    confirmation_token = models.CharField(max_length=64, unique=True, default=generate_confirmation_token)
    created_at = models.DateTimeField(auto_now_add=True)

    class Meta:
        ordering = ["sender_email"]
        constraints = [
            models.UniqueConstraint(fields=["project", "sender_email"], name="core_allowlist_unique_project_sender"),
        ]

    def __str__(self) -> str:
        """Return ``"<sender_email> for <project name>"``."""
        return f"{self.sender_email} for {self.project.name}"

    @property
    def is_confirmed(self) -> bool:
        """Whether ``confirmed_at`` has been set."""
        return self.confirmed_at is not None


class NewsletterIntake(models.Model):
    """A received newsletter message stored for a project, with its raw bodies and processing status."""

    project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="newsletter_intakes")
    sender_email = models.EmailField()
    subject = models.CharField(max_length=512)
    # Set automatically when the row is first created.
    received_at = models.DateTimeField(auto_now_add=True)
    raw_html = models.TextField(blank=True)
    raw_text = models.TextField(blank=True)
    # Unique across all projects, not just within one project.
    message_id = models.CharField(max_length=255, unique=True)
    status = models.CharField(max_length=16, choices=NewsletterIntakeStatus.choices, default=NewsletterIntakeStatus.PENDING)
    # NULL until a result is stored.
    # NOTE(review): JSON schema is not defined in this model -- confirm with the writer.
    extraction_result = models.JSONField(null=True, blank=True)
    error_message = models.TextField(blank=True)

    class Meta:
        # Newest messages first.
        ordering = ["-received_at"]
        indexes = [
            models.Index(fields=["project", "sender_email", "status"]),
        ]

    def __str__(self) -> str:
        """Return the subject, falling back to the message id when the subject is empty."""
        return f"{self.subject or self.message_id}"


class SkillResult(models.Model):
content = models.ForeignKey(Content, on_delete=models.CASCADE, related_name="skill_results")
project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="skill_results")
Expand Down
86 changes: 86 additions & 0 deletions core/newsletter_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from __future__ import annotations

import re
from dataclasses import dataclass
from html.parser import HTMLParser

# Matches "http://" or "https://" followed by every character up to the next
# whitespace, angle bracket, or quote character.
URL_PATTERN = re.compile(r"https?://[^\s<>'\"]+")


@dataclass(slots=True)
class ExtractedNewsletterItem:
    """One link extracted from a newsletter message."""

    url: str  # absolute http(s) URL of the link
    title: str  # anchor text, or a fallback chosen by the extractor
    excerpt: str  # short text excerpt attached to the item
    position: int  # 1-based order in which the link was extracted


class _NewsletterLinkParser(HTMLParser):
def __init__(self) -> None:
super().__init__()
self.links: list[dict[str, str]] = []
self._active_href: str | None = None
self._active_text: list[str] = []

def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if tag != "a":
return
for name, value in attrs:
if name == "href" and value and value.startswith(("http://", "https://")):
self._active_href = value
self._active_text = []
return

def handle_data(self, data: str) -> None:
if self._active_href is not None:
self._active_text.append(data)

def handle_endtag(self, tag: str) -> None:
if tag != "a" or self._active_href is None:
return
self.links.append(
{
"url": self._active_href,
"title": " ".join(part.strip() for part in self._active_text if part.strip()),
}
)
self._active_href = None
self._active_text = []


def extract_newsletter_items(*, subject: str, raw_html: str, raw_text: str) -> list[ExtractedNewsletterItem]:
    """Extract the distinct links found in a newsletter message.

    Links from ``<a href>`` elements in *raw_html* come first, in document
    order, followed by bare URLs found in *raw_text*. Each URL is reported
    once; ``position`` numbers the items from 1 in extraction order.

    Args:
        subject: Message subject, used as a title fallback.
        raw_html: HTML body of the message; may be empty.
        raw_text: Plain-text body of the message; may be empty.

    Returns:
        The extracted items. HTML links use their anchor text as title, then
        the subject, then the URL. Plain-text links use the subject, then the
        URL. Every item carries the same excerpt: the first 500 characters of
        *raw_text*, stripped.
    """
    parser = _NewsletterLinkParser()
    if raw_html:
        parser.feed(raw_html)
        # Force processing of anything the parser is still buffering.
        parser.close()

    # Identical for every item, so compute it once rather than per link.
    excerpt = raw_text[:500].strip()
    seen_urls: set[str] = set()
    extracted_items: list[ExtractedNewsletterItem] = []

    def _append_item(url: str, title: str) -> None:
        """Record *url* unless it is empty or was already extracted."""
        if not url or url in seen_urls:
            return
        seen_urls.add(url)
        extracted_items.append(
            ExtractedNewsletterItem(
                url=url,
                title=title,
                excerpt=excerpt,
                position=len(extracted_items) + 1,
            )
        )

    for candidate in parser.links:
        url = candidate["url"].strip()
        _append_item(url, candidate["title"] or subject or url)

    for match in URL_PATTERN.finditer(raw_text):
        # Drop sentence punctuation and closing brackets that the pattern
        # swallows when a URL ends a sentence or sits inside "(...)"/"[...]".
        url = match.group(0).rstrip(".,;:!?)]")
        # Skip remnants such as "http://" that have nothing after the scheme.
        if not url.partition("://")[2]:
            continue
        _append_item(url, subject or url)

    return extracted_items
Loading
Loading