diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 00000000..1ae4fcfb --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,71 @@ +# Newsletter Maker Project Instructions + +You are working in Newsletter Maker, a Django + DRF + Celery + Qdrant backend with a Next.js App Router frontend. + +## Repository Shape + +- Backend runtime code lives in `core/`. +- Django project settings and top-level URLs live in `newsletter_maker/`. +- Backend tests live primarily in `core/tests/` and `tests/`. +- Frontend application code lives in `frontend/src/app/`, shared UI in `frontend/src/components/`, and shared API/types/helpers in `frontend/src/lib/`. +- Operational and architecture docs live in `docs/`. + +## Working Norms + +- Prefer the smallest correct slice of work. Not every request requires both backend and frontend changes. +- For user-facing product features, assess the full path: model or worker changes, serializer or API changes, frontend types and data access, UI updates, and tests. +- For admin-only, ingestion-only, worker-only, documentation-only, or settings-only changes, stay in the affected layer. Do not scaffold unnecessary frontend code. +- Preserve existing naming. This repo uses `project`, not `tenant`. + +## Backend Conventions + +- Project scoping is a core invariant. Most API resources are nested under `/api/v1/projects/{project_id}/...`. +- Reuse the established DRF patterns in `core/api.py`, `core/api_urls.py`, and `core/serializers.py`: + - `ProjectOwnedQuerysetMixin` for nested viewsets + - serializer context containing `project` + - explicit validation for cross-project foreign keys +- Keep viewsets and views thin. Put operational logic in `core/tasks.py`, `core/pipeline.py`, `core/newsletters.py`, `core/plugins/`, or nearby helpers. +- Preserve existing API field shapes. Backend serializers and frontend types currently use `snake_case`; do not introduce ad hoc `camelCase` transforms. 
+- When API behavior changes, update drf-spectacular schema metadata in `core/api.py`. +- When changing ingestion, newsletter intake, AI processing, or embeddings, preserve the handoff between database state, Celery tasks, and Qdrant state. + +## Frontend Conventions + +- The frontend uses Next.js App Router. +- Shared backend-facing types belong in `frontend/src/lib/types.ts`. +- Shared backend-facing data access belongs in `frontend/src/lib/api.ts` unless there is a clear reason to add a route handler under `frontend/src/app/api/`. +- Keep reusable UI in `frontend/src/components/` and page assembly in `frontend/src/app/`. +- Preserve existing backend payload shapes in TypeScript types and UI code unless the backend contract is intentionally changing. + +## Documentation Standards + +- Python uses Google-style docstrings with PEP 257 conventions. +- Add or improve module docstrings plus public classes, public functions, and non-obvious helpers. +- Do not add noisy boilerplate to trivial `__str__` methods, simple properties, or obvious one-line helpers unless the surrounding file genuinely benefits. +- TypeScript and React code should use JSDoc for exported utilities, hooks, route handlers, and non-trivial components when behavior is not obvious from the type signature alone. +- If architecture or workflow behavior changes, update the most relevant docs in `docs/`, especially `docs/DEVELOPER_GUIDE.md`, `docs/IMPLEMENTATION_OVERVIEW.md`, `docs/MODELS.md`, `docs/RELEVANCE_SCORING.md`, or `docs/LOGGING.md`. + +## Testing And Validation + +- Backend tests use `pytest`. +- Frontend tests use `vitest`. +- Prefer focused validation commands over full-suite runs when the change is localized. 
+- Common commands in this repo: + - `pytest core/tests/...` + - `python manage.py check` + - `just backend-lint` + - `cd frontend && npm run test:run` + - `cd frontend && npm run typecheck` + - `just frontend-lint` +- Prefer existing `just` tasks when they cover the needed validation flow. + +## Skill Usage + +Use the workspace skills in `.github/skills/` when they match the task: + +- `docstring-enforcer`: documentation passes or doc cleanup across multiple files. +- `coverage-auditor`: closing backend or frontend test gaps. +- `bridge-scaffolder`: features that span Django API work and Next.js consumption. +- `project-api-patterns`: adding or changing project-scoped DRF endpoints. +- `source-plugin-patterns`: adding or changing ingestion plugins or source-config behavior. +- `ai-pipeline-patterns`: changing embeddings, relevance scoring, newsletter intake, or Celery-driven AI workflow behavior. diff --git a/.github/instructions/backend-python.instructions.md b/.github/instructions/backend-python.instructions.md new file mode 100644 index 00000000..5a15b56a --- /dev/null +++ b/.github/instructions/backend-python.instructions.md @@ -0,0 +1,37 @@ +--- +name: "Backend Python Guidelines" +description: "Use when editing Django, DRF, Celery, plugin, management command, or backend test code in Python. Covers project scoping, workflow placement, docstrings, and focused validation for core/, newsletter_maker/, tests/, and manage.py." +applyTo: + - "core/**/*.py" + - "newsletter_maker/**/*.py" + - "tests/**/*.py" + - "manage.py" +--- + +# Backend Python Guidelines + +- Preserve `project` as the scoping boundary. Do not reintroduce `tenant` naming. +- Keep Django views and DRF viewsets thin. Put operational logic in nearby helpers such as `core/tasks.py`, `core/pipeline.py`, `core/newsletters.py`, `core/plugins/`, or focused modules next to the owning workflow. +- For nested API resources, follow the patterns in `core/api.py`, `core/api_urls.py`, and `core/serializers.py`. 
+- Enforce cross-project relationship validation in serializers instead of trusting the client. +- Preserve existing API field names in `snake_case` unless the contract is intentionally changing across backend and frontend. +- Use Google-style docstrings with PEP 257 conventions for public modules, classes, functions, and non-obvious helpers. +- Keep changes small and local. Do not create generic `services.py` or `utils.py` files unless the repo already needs that extraction. +- When changing ingestion, embeddings, newsletter intake, or review behavior, keep the database, Celery, and Qdrant handoff coherent. + +## Validation + +- Prefer focused checks first: + - `pytest core/tests/...` + - `python manage.py check` + - `just backend-lint` +- Mock external systems such as Reddit, feed parsing, OpenRouter, email providers, and Qdrant in tests. + +## Good Anchors + +- `core/models.py` +- `core/serializers.py` +- `core/api.py` +- `core/tasks.py` +- `core/pipeline.py` +- `core/newsletters.py` diff --git a/.github/instructions/documentation.instructions.md b/.github/instructions/documentation.instructions.md new file mode 100644 index 00000000..a1aa8d0c --- /dev/null +++ b/.github/instructions/documentation.instructions.md @@ -0,0 +1,36 @@ +--- +name: "Documentation Guidelines" +description: "Use when editing architecture, onboarding, operations, or product documentation in docs/**/*.md, README.md, or other repo markdown docs. Covers keeping docs aligned with code, preserving project terminology, and updating cross-links when workflows change." +applyTo: + - "docs/**/*.md" + - "README.md" +--- + +# Documentation Guidelines + +- Write docs to match the current codebase, not an aspirational future design, unless the file is explicitly a roadmap or planning document. +- Use `project`, not `tenant`, unless a historical or comparative note explicitly requires the old term. +- Prefer concrete file and workflow references over vague architectural summaries. 
+- When a behavior changes, update the closest existing document instead of adding a new overlapping explanation. +- Keep `docs/DEVELOPER_GUIDE.md` current when the best "where to look first" path changes for contributors. +- Keep `README.md` high-level. Put detailed runtime, workflow, or operator guidance in `docs/` and link to it. +- When documenting backend behavior, align the wording with the real implementation in files like `core/models.py`, `core/tasks.py`, `core/pipeline.py`, `core/newsletters.py`, `core/api.py`, and `newsletter_maker/settings/`. +- When documenting frontend behavior, align the wording with the real implementation in `frontend/src/app/`, `frontend/src/components/`, and `frontend/src/lib/`. +- If a doc mentions commands, prefer the repo's real commands from `justfile`, `package.json`, or `manage.py`. +- If a code change affects logging, relevance scoring, ingestion, newsletter intake, or onboarding, check whether `docs/LOGGING.md`, `docs/RELEVANCE_SCORING.md`, `docs/IMPLEMENTATION_OVERVIEW.md`, or `docs/DEVELOPER_GUIDE.md` should change too. + +## Style + +- Favor short sections and direct statements over long narrative paragraphs. +- Use bullets for operational steps and comparisons. +- Avoid copying large blocks of code into docs when a file path and a short explanation are enough. +- Keep terminology consistent across docs, admin UI descriptions, and code comments. 
+ +## Good Anchors + +- `docs/DEVELOPER_GUIDE.md` +- `docs/IMPLEMENTATION_OVERVIEW.md` +- `docs/MODELS.md` +- `docs/RELEVANCE_SCORING.md` +- `docs/LOGGING.md` +- `README.md` diff --git a/.github/instructions/frontend-app.instructions.md b/.github/instructions/frontend-app.instructions.md new file mode 100644 index 00000000..9abdfb2e --- /dev/null +++ b/.github/instructions/frontend-app.instructions.md @@ -0,0 +1,31 @@ +--- +name: "Frontend App Router Guidelines" +description: "Use when editing Next.js App Router pages, route handlers, shared frontend API helpers, or TypeScript UI code in frontend/src/. Covers file placement, backend contract preservation, typing, and frontend validation." +applyTo: + - "frontend/src/**/*.ts" + - "frontend/src/**/*.tsx" +--- + +# Frontend App Router Guidelines + +- This frontend uses Next.js App Router, not the Pages Router. +- Keep backend-facing types in `frontend/src/lib/types.ts` and shared server-side API access in `frontend/src/lib/api.ts` unless a route handler in `frontend/src/app/api/` is the correct boundary. +- Reuse the existing backend contract. This repo currently consumes `snake_case` fields from Django; do not silently rename payload keys in the frontend. +- Keep reusable UI in `frontend/src/components/` and page composition in `frontend/src/app/`. +- Prefer strong explicit types over loose `Record` shapes when the contract is known. +- Add JSDoc for exported utilities, route handlers, hooks, and non-trivial components when behavior is not obvious from the signature. +- When a change depends on new backend fields or endpoints, update the corresponding types and API helpers in the same change. 
+ +## Validation + +- Prefer focused checks first: + - `cd frontend && npm run test:run` + - `cd frontend && npm run typecheck` + - `just frontend-lint` + +## Good Anchors + +- `frontend/src/lib/types.ts` +- `frontend/src/lib/api.ts` +- `frontend/src/app/` +- `frontend/src/components/` diff --git a/.github/skills/ai-pipeline-patterns/SKILL.md b/.github/skills/ai-pipeline-patterns/SKILL.md new file mode 100644 index 00000000..122d3282 --- /dev/null +++ b/.github/skills/ai-pipeline-patterns/SKILL.md @@ -0,0 +1,53 @@ +--- +name: ai-pipeline-patterns +description: "Use when changing the AI workflow, embeddings, Qdrant integration, newsletter intake, relevance scoring, summarization, classification, Celery handoff, or Anymail inbound processing. Trigger phrases include pipeline, relevance scoring, embeddings, Qdrant, newsletter intake, summarization, classification, OpenRouter, and review queue." +--- + +# AI Pipeline Patterns Skill + +Use this skill when working on the ingestion and AI processing pipeline. + +## Scope + +This skill covers the interaction between: + +- `core/pipeline.py` +- `core/embeddings.py` +- `core/tasks.py` +- `core/newsletters.py` +- `core/newsletter_extraction.py` +- `core/signals.py` +- `core/views.py` +- related models, tests, and docs + +## Rules + +- Preserve the separation between persisted state, background execution, and vector storage. +- Keep Qdrant access inside the embedding layer unless there is a strong reason to expand that boundary. +- Keep newsletter intake routing and confirmation logic in the newsletter intake modules, not in unrelated views or serializers. +- Prefer explicit fallbacks when LLM or external-service calls can fail. +- Respect the current thresholds and routing semantics around relevance scoring, summarization eligibility, and review-queue creation unless the change intentionally redefines them. 
+- When changing provider behavior, preserve provider-agnostic boundaries where they already exist, such as Anymail for inbound and Django mail abstractions for outbound. +- Update docs when pipeline behavior changes in a way a new developer or operator would need to understand. + +## Implementation Guidance + +- For embedding or vector-search changes, start in `core/embeddings.py` and then verify the downstream callers in `core/pipeline.py` and `core/tasks.py`. +- For newsletter intake changes, start in `core/newsletters.py` or `core/signals.py` and follow the path through `NewsletterIntake`, `IntakeAllowlist`, and `process_newsletter_intake`. +- For project routing or user-facing confirmation behavior, verify both `core/views.py` and top-level URL registration. +- For AI-skill output changes, check how `SkillResult` rows are persisted and how admin or frontend surfaces consume them. + +## Validation + +- Prefer focused checks first: + - `pytest core/tests/test_pipeline.py` + - `pytest core/tests/test_tasks.py` + - `pytest core/tests/test_newsletters.py` + - `python manage.py check` +- If a change affects serializers, admin, or API surfaces too, expand validation to the nearest related test module. + +## References + +- `docs/DEVELOPER_GUIDE.md` +- `docs/RELEVANCE_SCORING.md` +- `docs/LOGGING.md` diff --git a/.github/skills/bridge-scaffolder/SKILL.md b/.github/skills/bridge-scaffolder/SKILL.md new file mode 100644 index 00000000..85679292 --- /dev/null +++ b/.github/skills/bridge-scaffolder/SKILL.md @@ -0,0 +1,28 @@ +--- +name: bridge-scaffolder +description: "Use when creating or changing a feature that spans both the Django API and the Next.js frontend. Trigger phrases include full-stack feature, add endpoint and UI, wire frontend to backend, project dashboard change, and bridge serializer to frontend types." +--- + +# Bridge Scaffolder Skill + +Use this skill when a change genuinely crosses the backend and frontend boundary. 
+ +## Rules + +- **Django Side:** Follow the existing patterns in `core/api.py`, `core/api_urls.py`, and `core/serializers.py`. +- Most nested resources should stay project-scoped under `/api/v1/projects/{project_id}/...`. +- Keep business logic out of viewsets. Use `core/tasks.py`, `core/pipeline.py`, `core/newsletters.py`, `core/plugins/`, or nearby helpers for real workflow logic. +- **Next.js Side:** Update `frontend/src/lib/types.ts`, `frontend/src/lib/api.ts`, and the relevant pages, components, or route handlers under `frontend/src/app/`. +- Preserve the existing `snake_case` payload shape unless the backend contract is intentionally changing. + +## Implementation Guidance + +- Check `core/api_urls.py` for the current route topology and `core/api.py` for the schema helper patterns. +- Keep serializer validation aligned with project scoping and cross-project relationship rules. +- If the frontend consumes the new field or endpoint, reflect it in `frontend/src/lib/types.ts` and the corresponding API helpers. +- Update docs when the feature changes a core workflow or user-facing behavior. + +## Related Guidance + +- Use `project-api-patterns` when the backend portion is primarily a new or changed DRF resource. +- Use `coverage-auditor` immediately after scaffolding to add targeted backend and frontend tests. diff --git a/.github/skills/coverage-auditor/SKILL.md b/.github/skills/coverage-auditor/SKILL.md new file mode 100644 index 00000000..59f12b67 --- /dev/null +++ b/.github/skills/coverage-auditor/SKILL.md @@ -0,0 +1,39 @@ +--- +name: coverage-auditor +description: "Use when adding or updating tests for Django, DRF, Celery, admin, serializer, plugin, or Next.js code, or when closing a coverage gap. Trigger phrases include add tests, improve coverage, pytest, vitest, missing branch, serializer test, admin test, and route handler test." +--- + +# Coverage Auditor Skill + +Use this skill to add the smallest effective tests around the changed behavior. 
+ +## Rules + +- **Backend:** Use `pytest` with the existing patterns in `core/tests/` and `tests/`. +- **Frontend:** Use `vitest` and the existing co-located `__tests__` structure under `frontend/src/`. +- Mock external integrations such as feed parsing, Reddit, email providers, Qdrant, or LLM calls instead of relying on live services. +- Prefer focused behavior tests over broad snapshot-style tests. +- For defensive branches that are intentionally unreachable, document exclusions explicitly with `# pragma: no cover` or `/* v8 ignore next */` only when justified. + +## Testing Guidance + +- Analyze the changed file first and cover the real branch points: validation failures, access control, empty states, duplicate handling, service failures, and success paths. +- For backend work, add tests near the closest existing module, such as: + - `core/tests/test_serializers.py` + - `core/tests/test_admin.py` + - `core/tests/test_tasks.py` + - `core/tests/test_newsletters.py` + - `core/tests/test_pipeline.py` +- For frontend work, extend the nearest existing `__tests__` file under `frontend/src/components/` or `frontend/src/lib/`. +- When adding new frontend tests, keep imports sorted to satisfy the repo's ESLint import-order rules. If you hit `Run autofix to sort these imports!`, fix the import block or run file-scoped ESLint before moving on. +- After changing tests, run the narrowest relevant validation command first. +- For Provider/Wrapper components, do not test state or initialization. Instead, render a consumer child component and assert that the context values or library configurations (like QueryClient options) match the expected project defaults. +- For Server Components, prioritize unit testing exported helper functions (like deriveSourceStatus) for logic, and use integration tests to verify that API data is mapped correctly to the UI components (badges, tables, etc.). Mock all library API calls using vi.mock. 
+- Out of Scope: Root layouts, static metadata, font configurations, and pure "pass-through" providers. +- In Scope: Layouts with conditional rendering, breadcrumb logic, or role-based access checks. + +## References + +- Backend commands are captured in `justfile` as `just backend-test`, `just backend-test-coverage`, and `just backend-lint`. +- Frontend commands are captured in `frontend/package.json` and the repo `justfile` as `npm run test:run`, `npm run test:coverage`, and `just frontend-lint`. +- Use `python manage.py check` after Django-side structural changes. diff --git a/.github/skills/docstring-enforcer/SKILL.md b/.github/skills/docstring-enforcer/SKILL.md new file mode 100644 index 00000000..6c1d4c30 --- /dev/null +++ b/.github/skills/docstring-enforcer/SKILL.md @@ -0,0 +1,37 @@ +--- +name: docstring-enforcer +description: "Use when adding or revising documentation for Python modules/Django code or exported TypeScript/React utilities. Triggers: docstring, JSDoc, document this file, lib utilities, improve docs, and explain intent." +--- + +# Docstring Enforcer Skill + +Use this skill for meaningful documentation that explains *why* code exists and *how* it handles edge cases. + +## Rules + +### 🐍 Python (Google Style) + +- **Standard:** Use Google-style docstrings and PEP 257. +- **Structure:** Include `Args:`, `Returns:`, and `Raises:` only when they provide value. Do not add empty sections. +- **Django Specifics:** Document the "why" behind complex QuerySets or signal receivers. Trivial dunder methods or obvious model fields can remain undocumented if self-explanatory. +- **Intent:** Favor workflow context over repeating the function name (e.g., "Invalidates the cache after user logout" vs "Does logout stuff"). + +### ⚛️ TypeScript & Next.js (JSDoc Style) + +- **Standard:** Use JSDoc for exported utilities, hooks, and complex components. 
+- **Frontend `lib/` Policy:** ALL shared utilities in `lib/` must include: + - **@example:** A brief code snippet showing typical usage. + - **Edge Cases:** Describe behavior for `null`, `undefined`, or empty strings in `@param` or `@returns`. +- **Clarity:** Do not just restate TypeScript types. Explain constraints (e.g., "The string must be a valid ISO-8601 date"). + +## References + +- **Gold Standard (Python):** See `core/models.py` and `core/pipeline.py`. +- **Gold Standard (TS/lib):** See `frontend/src/lib/formatters.ts` (if applicable) for example-driven JSDoc. +- **Context:** Check `docs/DEVELOPER_GUIDE.md` to ensure docs align with overall system architecture. + +## Workflow + +1. Analyze the file to understand its role in the Django/Next.js bridge. +2. If documentation is missing or outdated, rewrite it using the styles above. +3. Ensure that if a Django API field changes, the corresponding Next.js JSDoc for that field is also flagged for an update. diff --git a/.github/skills/project-api-patterns/SKILL.md b/.github/skills/project-api-patterns/SKILL.md new file mode 100644 index 00000000..2f1f6fab --- /dev/null +++ b/.github/skills/project-api-patterns/SKILL.md @@ -0,0 +1,33 @@ +--- +name: project-api-patterns +description: "Use when adding or changing Django REST Framework serializers, viewsets, nested routes, schema docs, or project-scoped endpoints in core/api.py, core/api_urls.py, or core/serializers.py. Trigger phrases include project API, nested route, DRF viewset, serializer validation, project_id endpoint, and drf-spectacular docs." +--- + +# Project API Patterns Skill + +Use this skill when changing the project-scoped REST API. + +## Rules + +- Treat `Project` as the isolation boundary. +- Top-level project resources live on the base router; most other resources are nested under `/api/v1/projects/{project_id}/...`. +- Reuse `ProjectOwnedQuerysetMixin` in `core/api.py` for nested resources. 
+- Pass `project` through serializer context and enforce cross-project relationship validation in serializers. +- Keep API field names in `snake_case` to match current serializers and frontend types. +- Update drf-spectacular metadata in `core/api.py` when the endpoint contract changes. + +## Implementation Guidance + +- Add or update serializers in `core/serializers.py`. +- Add or update viewsets and schema decorators in `core/api.py`. +- Register routes in `core/api_urls.py` using the existing nested router pattern. +- If the frontend consumes the API, update `frontend/src/lib/types.ts` and `frontend/src/lib/api.ts`. +- Add or update focused tests near the changed behavior, usually under `core/tests/`. + +## References + +- `core/api.py` +- `core/api_urls.py` +- `core/serializers.py` +- `frontend/src/lib/types.ts` +- `frontend/src/lib/api.ts` diff --git a/.github/skills/source-plugin-patterns/SKILL.md b/.github/skills/source-plugin-patterns/SKILL.md new file mode 100644 index 00000000..dbd0a502 --- /dev/null +++ b/.github/skills/source-plugin-patterns/SKILL.md @@ -0,0 +1,33 @@ +--- +name: source-plugin-patterns +description: "Use when adding or changing ingestion plugins, source configuration validation, RSS or Reddit style fetchers, source health checks, or content ingestion behavior in core/plugins/ and core/tasks.py. Trigger phrases include source plugin, ingestion plugin, add feed source, source config, RSS plugin, Reddit plugin, and health_check." +--- + +# Source Plugin Patterns Skill + +Use this skill when working on the ingestion plugin system. + +## Rules + +- Plugins should conform to the `SourcePlugin` interface in `core/plugins/base.py`. +- Plugin output should be normalized into `ContentItem` objects. +- Keep plugin-specific config validation in the plugin class and route shared resolution through `core/plugins/registry.py`. +- Register any new plugin type in both the registry and the source-plugin enum used by the data model. 
+- Ingestion orchestration belongs in `core/tasks.py`, not inside the plugin registry or API layer. +- External network calls should be mocked in tests. + +## Implementation Guidance + +- Update `core/plugins/base.py` only when the shared plugin contract must change. +- Add or update concrete plugins in `core/plugins/`. +- Update `core/plugins/registry.py` and any related enum or serializer validation paths. +- Confirm `SourceConfig` validation still works through both the serializer and admin paths. +- Add or update focused tests in `core/tests/test_tasks.py`, `core/tests/test_admin.py`, `core/tests/test_serializers.py`, or a new nearby plugin test module. + +## References + +- `core/plugins/base.py` +- `core/plugins/registry.py` +- `core/plugins/rss.py` +- `core/plugins/reddit.py` +- `core/tasks.py` diff --git a/.vscode/settings.json b/.vscode/settings.json index 33023242..550d4b31 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -8,6 +8,8 @@ "cstat", "dateutil", "djlint", + "dnspython", + "falsey", "FAVICONS", "Feedly", "Fraunces", @@ -17,6 +19,7 @@ "httpx", "isready", "jsbeautifier", + "libipld", "libpq", "librt", "mday", @@ -30,10 +33,12 @@ "pylint", "PYTHONDONTWRITEBYTECODE", "PYTHONUNBUFFERED", + "pytokens", "pyyaml", "QDRANT", "readyz", "Referer", + "scaffolder", "simplejwt", "svix", "Unparseable", diff --git a/README.md b/README.md index 86fb6b76..71a02760 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,12 @@ The system is designed for graceful failure, not silent corruption. Unparseable ## Project Documentation +- [Developer Guide](docs/DEVELOPER_GUIDE.md) gives a fast "where to look first" map for new contributors. +- [Implementation Overview](docs/IMPLEMENTATION_OVERVIEW.md) summarizes the main features and current architecture. +- [Data Models](docs/MODELS.md) describes the purpose of each core model. +- [Relevance Scoring](docs/RELEVANCE_SCORING.md) explains how similarity scoring and review thresholds work. 
+- [Logging](docs/LOGGING.md) explains where application logs go in local and containerized environments. + ## Local Development ```bash diff --git a/conftest.py b/conftest.py index 31b9f259..2c29ab71 100644 --- a/conftest.py +++ b/conftest.py @@ -1,6 +1,5 @@ from dotenv import load_dotenv - load_dotenv(".env.test", override=True) diff --git a/core/admin.py b/core/admin.py index 5dbf3a2e..db32403c 100644 --- a/core/admin.py +++ b/core/admin.py @@ -1,3 +1,10 @@ +"""Django admin configuration for the core editorial workflow. + +These admin classes are intentionally richer than default CRUD screens. They expose +the health, traceability, and review information editors and operators need while +running ingestion and AI-assisted content curation. +""" + import json from django.contrib import admin, messages @@ -9,49 +16,62 @@ from unfold.admin import ModelAdmin from core.models import ( - Content, - Entity, - IngestionRun, + Content, + Entity, + IngestionRun, Project, ProjectConfig, - ReviewQueue, - SkillResult, - SourceConfig, - UserFeedback, + ReviewQueue, + SkillResult, + SourceConfig, + UserFeedback, ) from core.plugins import get_plugin_for_source_config, validate_plugin_config @admin.register(Project) class ProjectAdmin(ExportActionMixin, admin.ModelAdmin): - list_display = ("name", "group", "content_retention_days", "created_at") + """Admin configuration for top-level project workspaces.""" + + list_display = ("name", "group", "content_retention_days", "created_at") - # Better navigation - date_hierarchy = "created_at" - list_filter = ("created_at",) + # Better navigation + date_hierarchy = "created_at" + list_filter = ("created_at",) - # Faster searching - search_fields = ("name", "group__name") + # Faster searching + search_fields = ("name", "group__name") - # Performance for large user lists - autocomplete_fields = ("group",) + # Performance for large user lists + autocomplete_fields = ("group",) - # Quick editing - list_editable = ("content_retention_days",) 
+ # Quick editing + list_editable = ("content_retention_days",) @admin.register(ProjectConfig) class ProjectConfigAdmin(admin.ModelAdmin): - list_display = ("project", "upvote_authority_weight", "downvote_authority_weight", "authority_decay_rate") + """Admin configuration for per-project scoring settings.""" + + list_display = ( + "project", + "upvote_authority_weight", + "downvote_authority_weight", + "authority_decay_rate", + ) @admin.register(Entity) class EntityAdmin(admin.ModelAdmin): + """Admin configuration for tracked people, vendors, and organizations.""" + # Replace 'authority_score' with your new method name list_display = ("name", "project", "type", "colored_score", "created_at") @admin.display(description="Authority Score", ordering="authority_score") def colored_score(self, obj): + """Render the authority score with a traffic-light color cue.""" + # Choose a color based on the value if obj.authority_score >= 80: color = "green" @@ -68,22 +88,28 @@ def colored_score(self, obj): class HighValueFilter(admin.SimpleListFilter): - title = 'Content Value' - parameter_name = 'value_tier' + """Filter content down to high-value reference items.""" + + title = "Content Value" + parameter_name = "value_tier" def lookups(self, request, model_admin): - return ( - ('high_value', '🔥 High Value (Score > 80 & Reference)'), - ) + """Return the custom filter options displayed in the admin sidebar.""" + + return (("high_value", "🔥 High Value (Score > 80 & Reference)"),) def queryset(self, request, queryset): - if self.value() == 'high_value': + """Apply the high-value filter when it is selected.""" + + if self.value() == "high_value": return queryset.filter(relevance_score__gt=80, is_reference=True) return queryset @admin.register(Content) class ContentAdmin(admin.ModelAdmin): + """Admin view for curated content plus trace and score context.""" + list_display = ( "display_relevance", "is_active", @@ -99,7 +125,7 @@ class ContentAdmin(admin.ModelAdmin): HighValueFilter, 
("project", admin.RelatedOnlyFieldListFilter), "source_plugin", - "is_active" + "is_active", ) search_fields = ("title", "author", "url") actions = ["generate_newsletter_ideas"] @@ -112,26 +138,36 @@ def preview_content(self, obj): return "-" return format_html( '🔍 View', - preview_text[:500] + preview_text[:500], ) @admin.display(description="AI Trace") def view_trace(self, obj): - """Link to the latest external trace when present, otherwise the internal skill run history.""" + """Link to the latest external trace or fall back to stored skill history.""" from urllib.parse import urlencode from django.conf import settings from django.urls import reverse - latest_skill_result = obj.skill_results.filter( - superseded_by__isnull=True, - ).order_by("-created_at").first() + latest_skill_result = ( + obj.skill_results.filter( + superseded_by__isnull=True, + ) + .order_by("-created_at") + .first() + ) if latest_skill_result is None: return "-" result_data = latest_skill_result.result_data or {} trace_sections = [result_data] - for section_name in ("trace", "langsmith", "langfuse", "observability", "telemetry"): + for section_name in ( + "trace", + "langsmith", + "langfuse", + "observability", + "telemetry", + ): section = result_data.get(section_name) if isinstance(section, dict): trace_sections.append(section) @@ -139,20 +175,36 @@ def view_trace(self, obj): trace_url = "" trace_id = "" for section in trace_sections: - for key in ("trace_url", "traceUrl", "langsmith_run_url", "langfuse_trace_url"): + for key in ( + "trace_url", + "traceUrl", + "langsmith_run_url", + "langfuse_trace_url", + ): value = section.get(key) if isinstance(value, str) and value: trace_url = value break if trace_url: break - for key in ("trace_id", "traceId", "run_id", "runId", "langsmith_run_id", "langfuse_trace_id"): + for key in ( + "trace_id", + "traceId", + "run_id", + "runId", + "langsmith_run_id", + "langfuse_trace_id", + ): value = section.get(key) if isinstance(value, str) and value: 
trace_id = value break - if not trace_url and trace_id and getattr(settings, "AI_TRACE_URL_TEMPLATE", ""): + if ( + not trace_url + and trace_id + and getattr(settings, "AI_TRACE_URL_TEMPLATE", "") + ): trace_url = settings.AI_TRACE_URL_TEMPLATE.format( content_id=obj.id, run_id=trace_id, @@ -182,14 +234,22 @@ def view_trace(self, obj): @admin.display(description="Score") def display_relevance(self, obj): + """Render the relevance score with a coarse color-coded severity band.""" + if obj.relevance_score is None: return "-" - color = "green" if obj.relevance_score > 75 else "orange" if obj.relevance_score > 40 else "red" + color = ( + "green" + if obj.relevance_score > 75 + else "orange" if obj.relevance_score > 40 else "red" + ) return format_html('{}%', color, obj.relevance_score) def changelist_view(self, request, extra_context=None): + """Augment the changelist with content dashboard statistics.""" + queryset = self.get_queryset(request) - metrics = queryset.aggregate(avg_score=Avg('relevance_score')) + metrics = queryset.aggregate(avg_score=Avg("relevance_score")) extra_context = extra_context or {} extra_context["dashboard_stats"] = [ @@ -197,7 +257,7 @@ def changelist_view(self, request, extra_context=None): "title": "Avg Relevance", "value": f"{metrics['avg_score'] or 0:.1f}%", "icon": "insights", - "color": "success" if (metrics['avg_score'] or 0) > 70 else "warning", + "color": "success" if (metrics["avg_score"] or 0) > 70 else "warning", }, { "title": "Total Filtered", @@ -210,6 +270,8 @@ def changelist_view(self, request, extra_context=None): @admin.action(description="Generate Ideas for Newsletter") def generate_newsletter_ideas(self, request, queryset): + """Queue pipeline processing for the selected content items.""" + from core.tasks import process_content content_ids = list(queryset.values_list("id", flat=True)) @@ -224,6 +286,8 @@ def generate_newsletter_ideas(self, request, queryset): @admin.register(SkillResult) class 
SkillResultAdmin(ModelAdmin): + """Admin view for AI skill history, retries, and result inspection.""" + list_display = ( "skill_name", "get_content_link", @@ -237,17 +301,29 @@ class SkillResultAdmin(ModelAdmin): list_filter = ("status", "skill_name", "project", "model_used") search_fields = ("skill_name", "content__title", "model_used", "error_message") actions = ["retry_selected_skills"] - readonly_fields = ("pretty_result_data", "latency_ms", "created_at", "superseded_by") + readonly_fields = ( + "pretty_result_data", + "latency_ms", + "created_at", + "superseded_by", + ) fieldsets = ( - ("Execution Details", { - "fields": ("skill_name", "content", "project", "status", "model_used") - }), - ("AI Output", { - "fields": ("pretty_result_data", "error_message"), - }), - ("Performance Metrics", { - "fields": ("latency_ms", "confidence", "created_at", "superseded_by"), - }), + ( + "Execution Details", + {"fields": ("skill_name", "content", "project", "status", "model_used")}, + ), + ( + "AI Output", + { + "fields": ("pretty_result_data", "error_message"), + }, + ), + ( + "Performance Metrics", + { + "fields": ("latency_ms", "confidence", "created_at", "superseded_by"), + }, + ), ) @admin.action(description="Retry Selected Skills") @@ -257,7 +333,7 @@ def retry_selected_skills(self, request, queryset): self.message_user( request, f"Successfully reset {updated} skills to PENDING for retry.", - messages.SUCCESS + messages.SUCCESS, ) @admin.display(description="Result Preview") @@ -267,50 +343,63 @@ def preview_json(self, obj): return "-" return format_html( '🔍 Preview', - f"{obj.pk}/change/" + f"{obj.pk}/change/", ) @admin.display(description="Content") def get_content_link(self, obj): + """Return a compact content title for the table view.""" + return obj.content.title[:30] + "..." 
if obj.content.title else "Untitled" @admin.display(description="Status") def display_status(self, obj): + """Render the skill status as a colored dot plus label.""" + status_value = str(obj.status).lower() colors = {"completed": "green", "failed": "red", "pending": "orange"} color = colors.get(status_value, "gray") return format_html( '● {}', - color, status_value.upper() + color, + status_value.upper(), ) @admin.display(description="Perf / Conf") def display_performance(self, obj): + """Show latency and confidence together in a compact cell.""" + latency = f"{obj.latency_ms}ms" if obj.latency_ms else "-" conf = f"{int(obj.confidence * 100)}%" if obj.confidence is not None else "-" return f"{latency} / {conf}" @admin.display(description="Current", boolean=True) def is_current(self, obj): + """Return whether this row is the most recent non-superseded result.""" + return obj.superseded_by is None @admin.display(description="Result Data JSON") def pretty_result_data(self, obj): + """Render result JSON in a readable preformatted block.""" + if not obj.result_data: return "No data available" formatted_json = json.dumps(obj.result_data, indent=4) return mark_safe( f'
'
-            f'{formatted_json}'
-            f'
' + f"{formatted_json}" + f"" ) def changelist_view(self, request, extra_context=None): + """Augment the changelist with latency and failure-rate statistics.""" + qs = self.get_queryset(request) extra_context = extra_context or {} - metrics = qs.aggregate(avg_lat=Avg('latency_ms')) - avg_latency = metrics['avg_lat'] or 0 - failure_count = qs.filter(status='failed').count() + metrics = qs.aggregate(avg_lat=Avg("latency_ms")) + avg_latency = metrics["avg_lat"] or 0 + failure_count = qs.filter(status="failed").count() total_count = qs.count() or 1 extra_context["dashboard_stats"] = [ @@ -332,25 +421,31 @@ def changelist_view(self, request, extra_context=None): @admin.register(UserFeedback) class UserFeedbackAdmin(ModelAdmin): + """Admin view for editorial feedback and agreement with AI scoring.""" + list_display = ( "display_feedback", "get_content_title", "get_ai_score", "project", "user", - "created_at" + "created_at", ) list_filter = ("feedback_type", ("project", admin.RelatedOnlyFieldListFilter)) search_fields = ("content__title", "user__email", "user__username") @admin.display(description="Type") def display_feedback(self, obj): + """Render feedback as a thumbs-up or thumbs-down glyph.""" + if str(obj.feedback_type).lower() == "upvote": return format_html('{}', "1.2rem", "👍") return format_html('{}', "1.2rem", "👎") @admin.display(description="Content Title") def get_content_title(self, obj): + """Return a shortened content title for list display.""" + return obj.content.title[:50] + "..." 
@admin.display(description="AI Score") @@ -363,6 +458,8 @@ def get_ai_score(self, obj): return format_html('{}%', color, score) def changelist_view(self, request, extra_context=None): + """Augment the changelist with editorial approval statistics.""" + qs = self.get_queryset(request) extra_context = extra_context or {} upvotes = qs.filter(feedback_type="upvote").count() @@ -387,6 +484,8 @@ def changelist_view(self, request, extra_context=None): @admin.register(IngestionRun) class IngestionRunAdmin(ModelAdmin): + """Admin view for ingestion health, throughput, and timing.""" + list_display = ( "plugin_name", "project", @@ -395,37 +494,39 @@ class IngestionRunAdmin(ModelAdmin): "display_duration", "started_at", ) - list_filter = ("plugin_name", "status", ("project", admin.RelatedOnlyFieldListFilter)) + list_filter = ( + "plugin_name", + "status", + ("project", admin.RelatedOnlyFieldListFilter), + ) search_fields = ("plugin_name", "error_message", "project__name") readonly_fields = ("display_duration", "started_at", "completed_at") fieldsets = ( - ("Run Info", { - "fields": ("plugin_name", "project", "status") - }), - ("Data Metrics", { - "fields": ("items_fetched", "items_ingested", "display_efficiency") - }), - ("Timing", { - "fields": ("started_at", "completed_at", "display_duration") - }), - ("Logs", { - "fields": ("error_message",), - "classes": ("collapse",) - }), + ("Run Info", {"fields": ("plugin_name", "project", "status")}), + ( + "Data Metrics", + {"fields": ("items_fetched", "items_ingested", "display_efficiency")}, + ), + ("Timing", {"fields": ("started_at", "completed_at", "display_duration")}), + ("Logs", {"fields": ("error_message",), "classes": ("collapse",)}), ) @admin.display(description="Status") def display_status(self, obj): + """Render ingestion status as an Unfold badge.""" + status_value = str(obj.status).lower() colors = {"success": "success", "failed": "danger", "running": "info"} return format_html( '{}', colors.get(status_value, 
"warning"), - status_value.upper() + status_value.upper(), ) @admin.display(description="Efficiency (Ingested/Fetched)") def display_efficiency(self, obj): + """Show how much of the fetched content became stored content.""" + if obj.items_fetched == 0: return "0/0" percent = (obj.items_ingested / obj.items_fetched) * 100 @@ -433,11 +534,16 @@ def display_efficiency(self, obj): percent_label = f"({percent:.0f}%)" return format_html( '{} / {} {}', - obj.items_ingested, obj.items_fetched, color, percent_label + obj.items_ingested, + obj.items_fetched, + color, + percent_label, ) @admin.display(description="Duration") def display_duration(self, obj): + """Return human-readable runtime for completed ingestion runs.""" + if not obj.completed_at: return "In Progress..." duration = obj.completed_at - obj.started_at @@ -445,11 +551,13 @@ def display_duration(self, obj): return f"{int(seconds // 60)}m {int(seconds % 60)}s" def changelist_view(self, request, extra_context=None): + """Augment the changelist with ingestion success statistics.""" + qs = self.get_queryset(request) extra_context = extra_context or {} total_runs = qs.count() failed_runs = qs.filter(status="failed").count() - total_ingested = sum(qs.values_list('items_ingested', flat=True)) + total_ingested = sum(qs.values_list("items_ingested", flat=True)) extra_context["dashboard_stats"] = [ { @@ -469,6 +577,8 @@ def changelist_view(self, request, extra_context=None): @admin.register(SourceConfig) class SourceConfigAdmin(ModelAdmin): + """Admin view for source-plugin configuration and connectivity checks.""" + list_display = ( "plugin_name", "project", @@ -476,35 +586,51 @@ class SourceConfigAdmin(ModelAdmin): "is_active", "last_fetched_at", ) - list_filter = ("is_active", "plugin_name", ("project", admin.RelatedOnlyFieldListFilter)) + list_filter = ( + "is_active", + "plugin_name", + ("project", admin.RelatedOnlyFieldListFilter), + ) list_editable = ("is_active",) search_fields = ("plugin_name", "project__name") 
actions = ["test_source_connection"] readonly_fields = ("last_fetched_at", "pretty_config") fieldsets = ( - ("Core Settings", { - "fields": ("plugin_name", "project", "is_active") - }), - ("Configuration", { - "fields": ("pretty_config", "config"), - }), - ("Activity", { - "fields": ("last_fetched_at",), - }), + ("Core Settings", {"fields": ("plugin_name", "project", "is_active")}), + ( + "Configuration", + { + "fields": ("pretty_config", "config"), + }, + ), + ( + "Activity", + { + "fields": ("last_fetched_at",), + }, + ), ) @admin.display(description="Status") def display_health(self, obj): + """Infer a human-friendly health state from activity timestamps.""" + if not obj.is_active: return format_html('{}', "gray", "● Paused") if obj.last_fetched_at: hours_since = (timezone.now() - obj.last_fetched_at).total_seconds() / 3600 if hours_since > 24: - return format_html('{}', "red", "● Stale") - return format_html('{}', "green", "● Healthy") + return format_html( + '{}', "red", "● Stale" + ) + return format_html( + '{}', "green", "● Healthy" + ) - return format_html('{}', "orange", "● Never Run") + return format_html( + '{}', "orange", "● Never Run" + ) @admin.display(description="Config Preview") def pretty_config(self, obj): @@ -512,7 +638,9 @@ def pretty_config(self, obj): if not obj.config: return "Empty" formatted_json = json.dumps(obj.config, indent=4) - return mark_safe(f'
{formatted_json}
') + return mark_safe( + f'
{formatted_json}
' + ) @admin.action(description="Test Source Connectivity") def test_source_connection(self, request, queryset): @@ -551,6 +679,8 @@ def test_source_connection(self, request, queryset): ) def changelist_view(self, request, extra_context=None): + """Augment the changelist with source-count and diversity stats.""" + qs = self.get_queryset(request) extra_context = extra_context or {} active_count = qs.filter(is_active=True).count() @@ -565,7 +695,7 @@ def changelist_view(self, request, extra_context=None): }, { "title": "Plugin Variety", - "value": qs.values('plugin_name').distinct().count(), + "value": qs.values("plugin_name").distinct().count(), "icon": "extension", }, ] @@ -574,6 +704,8 @@ def changelist_view(self, request, extra_context=None): @admin.register(ReviewQueue) class ReviewQueueAdmin(ModelAdmin): + """Admin view for items waiting on editorial judgment.""" + list_display = ( "get_content_title", "project", @@ -589,25 +721,39 @@ class ReviewQueueAdmin(ModelAdmin): @admin.display(description="Content") def get_content_title(self, obj): + """Return a shortened content title for list display.""" + return obj.content.title[:50] + "..." 
@admin.display(description="Confidence") def display_confidence(self, obj): - color = "red" if obj.confidence < 0.3 else "orange" if obj.confidence < 0.6 else "green" + """Render confidence as a percentage with risk coloring.""" + + color = ( + "red" + if obj.confidence < 0.3 + else "orange" if obj.confidence < 0.6 else "green" + ) confidence_label = f"{obj.confidence * 100:.0f}%" return format_html('{}', color, confidence_label) @admin.action(description="Approve selected items") def mark_as_approved(self, request, queryset): + """Resolve selected review items as approved.""" + queryset.update(resolved=True, resolution="APPROVED") self.message_user(request, "Selected items approved.", messages.SUCCESS) @admin.action(description="Reject selected items") def mark_as_rejected(self, request, queryset): + """Resolve selected review items as rejected.""" + queryset.update(resolved=True, resolution="REJECTED") self.message_user(request, "Selected items rejected.", messages.WARNING) def changelist_view(self, request, extra_context=None): + """Augment the changelist with pending-volume and confidence stats.""" + qs = self.get_queryset(request) extra_context = extra_context or {} pending_count = qs.filter(resolved=False).count() diff --git a/core/api.py b/core/api.py index ebe03e1d..0d79df79 100644 --- a/core/api.py +++ b/core/api.py @@ -1,3 +1,10 @@ +"""REST API viewsets and OpenAPI documentation helpers for the core app. + +This module exposes the project-scoped CRUD surface used by the frontend and by +external clients. It also centralizes the drf-spectacular helpers that keep the +generated schema consistent across similar viewsets. 
+""" + from typing import Any from drf_spectacular.utils import ( @@ -222,7 +229,20 @@ ) -def build_success_response(response, description: str, examples: list[OpenApiExample] | None = None): +def build_success_response( + response, description: str, examples: list[OpenApiExample] | None = None +): + """Build a reusable OpenAPI success response object. + + Args: + response: Serializer, inline serializer, or response object for the schema. + description: Human-readable description shown in the generated docs. + examples: Optional example payloads to attach to the response. + + Returns: + A configured ``OpenApiResponse`` instance. + """ + response_kwargs = { "response": response, "description": description, @@ -242,6 +262,22 @@ def build_crud_action_overrides( create_examples: list[OpenApiExample] | None = None, create_response_examples: list[OpenApiExample] | None = None, ): + """Generate common schema overrides for CRUD-style viewset actions. + + Args: + serializer_class: Serializer used by the viewset. + resource_plural: Human-readable plural name for the resource. + resource_singular: Human-readable singular name for the resource. + list_examples: Optional examples for list responses. + retrieve_examples: Optional examples for retrieve responses. + create_examples: Optional request examples for create actions. + create_response_examples: Optional examples for create responses. + + Returns: + A mapping suitable for ``action_overrides`` on the documentation helpers + below. + """ + overrides: dict[str, dict[str, Any]] = { "list": { "responses": { @@ -275,19 +311,25 @@ def build_crud_action_overrides( }, "update": { "responses": { - 200: build_success_response(serializer_class, f"The updated {resource_singular}."), + 200: build_success_response( + serializer_class, f"The updated {resource_singular}." 
+ ), 403: AUTHENTICATION_REQUIRED_RESPONSE, } }, "partial_update": { "responses": { - 200: build_success_response(serializer_class, f"The updated {resource_singular}."), + 200: build_success_response( + serializer_class, f"The updated {resource_singular}." + ), 403: AUTHENTICATION_REQUIRED_RESPONSE, } }, "destroy": { "responses": { - 204: OpenApiResponse(description=f"The {resource_singular} was deleted."), + 204: OpenApiResponse( + description=f"The {resource_singular} was deleted." + ), 403: AUTHENTICATION_REQUIRED_RESPONSE, } }, @@ -304,6 +346,19 @@ def document_group_access_viewset( tag: str, action_overrides: dict[str, dict] | None = None, ): + """Decorate a viewset with schema metadata for group-access resources. + + Args: + resource_plural: Human-readable plural label for the resource. + resource_singular: Human-readable singular label for the resource. + create_description: Detailed description for the create action. + tag: OpenAPI tag applied to each action. + action_overrides: Optional per-action schema overrides. + + Returns: + A class decorator produced by ``extend_schema_view``. + """ + action_overrides = action_overrides or {} def schema(action: str, **kwargs): @@ -314,7 +369,9 @@ def schema(action: str, **kwargs): responses = dict(schema_kwargs.get("responses", {})) responses.update(override_responses) schema_kwargs["responses"] = responses - schema_kwargs.update({key: value for key, value in action_override.items() if key != "responses"}) + schema_kwargs.update( + {key: value for key, value in action_override.items() if key != "responses"} + ) return extend_schema(**schema_kwargs) return extend_schema_view( @@ -358,6 +415,19 @@ def document_project_owned_viewset( tag: str, action_overrides: dict[str, dict] | None = None, ): + """Decorate a nested project-scoped viewset with consistent schema metadata. + + Args: + resource_plural: Human-readable plural label for the resource. + resource_singular: Human-readable singular label for the resource. 
+ create_description: Detailed description for the create action. + tag: OpenAPI tag applied to each action. + action_overrides: Optional per-action schema overrides. + + Returns: + A class decorator produced by ``extend_schema_view``. + """ + parameters = [PROJECT_ID_PARAMETER] action_overrides = action_overrides or {} @@ -369,7 +439,9 @@ def schema(action: str, **kwargs): responses = dict(schema_kwargs.get("responses", {})) responses.update(override_responses) schema_kwargs["responses"] = responses - schema_kwargs.update({key: value for key, value in action_override.items() if key != "responses"}) + schema_kwargs.update( + {key: value for key, value in action_override.items() if key != "responses"} + ) return extend_schema(**schema_kwargs) return extend_schema_view( @@ -413,29 +485,46 @@ def schema(action: str, **kwargs): class ProjectOwnedQuerysetMixin: + """Scope nested viewsets to the authenticated user's selected project.""" + queryset: Any = None def get_project(self): + """Return the project referenced by ``project_id`` after access checks. + + Raises: + AssertionError: If the nested route does not supply ``project_id``. + NotFound: If the project does not exist or the user lacks access. 
+ """ + project_id = self.kwargs.get("project_id") if project_id is None: - raise AssertionError("project_id must be present in nested project-scoped routes") + raise AssertionError( + "project_id must be present in nested project-scoped routes" + ) try: return Project.objects.get(pk=project_id, group__user=self.request.user) except Project.DoesNotExist as exc: raise NotFound("Project not found.") from exc def get_queryset(self): + """Filter the configured queryset down to the current project.""" + queryset = self.queryset if queryset is None: raise AssertionError("queryset must be set on project-scoped viewsets") return queryset.filter(project=self.get_project()) def get_serializer_context(self): + """Inject the resolved project into serializer context.""" + context = super().get_serializer_context() context["project"] = self.get_project() return context def perform_create(self, serializer): + """Ensure nested resources are always created under the current project.""" + serializer.save(project=self.get_project()) @@ -454,11 +543,15 @@ def perform_create(self, serializer): ), ) class ProjectViewSet(viewsets.ModelViewSet): + """Manage projects accessible through the current user's group memberships.""" + serializer_class = ProjectSerializer queryset = Project.objects.select_related("group") lookup_url_kwarg = "id" def get_queryset(self): + """Limit projects to those visible through the authenticated user.""" + return self.queryset.filter(group__user=self.request.user).distinct() @@ -474,6 +567,8 @@ def get_queryset(self): ), ) class ProjectConfigViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): + """Manage per-project scoring and authority configuration.""" + serializer_class = ProjectConfigSerializer queryset = ProjectConfig.objects.select_related("project") @@ -490,6 +585,8 @@ class ProjectConfigViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): ), ) class EntityViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): + """Manage tracked 
entities associated with a project.""" + serializer_class = EntitySerializer queryset = Entity.objects.select_related("project") @@ -509,6 +606,8 @@ class EntityViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): ), ) class ContentViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): + """Browse project content and trigger ad hoc AI processing for it.""" + serializer_class = ContentSerializer queryset = Content.objects.select_related("project", "entity") @@ -529,6 +628,13 @@ class ContentViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): ) @action(detail=True, methods=["post"], url_path=r"skills/(?P[^/.]+)") def run_skill(self, request, *args, **kwargs): + """Execute one supported ad hoc skill for a content item. + + Relevance and summarization requests are queued through Celery, while the + other supported skills execute inline and return their ``SkillResult`` + immediately. + """ + from core.pipeline import execute_ad_hoc_skill from core.tasks import queue_content_skill @@ -551,11 +657,15 @@ def run_skill(self, request, *args, **kwargs): content = self.get_object() if skill_name in {RELEVANCE_SKILL_NAME, SUMMARIZATION_SKILL_NAME}: skill_result = queue_content_skill(content, skill_name) - serializer = SkillResultSerializer(skill_result, context=self.get_serializer_context()) + serializer = SkillResultSerializer( + skill_result, context=self.get_serializer_context() + ) return Response(serializer.data, status=status.HTTP_202_ACCEPTED) skill_result = execute_ad_hoc_skill(content, skill_name) - serializer = SkillResultSerializer(skill_result, context=self.get_serializer_context()) + serializer = SkillResultSerializer( + skill_result, context=self.get_serializer_context() + ) return Response(serializer.data, status=status.HTTP_201_CREATED) @@ -572,6 +682,8 @@ def run_skill(self, request, *args, **kwargs): ), ) class SkillResultViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): + """Inspect persisted AI skill outputs for project content.""" + 
serializer_class = SkillResultSerializer queryset = SkillResult.objects.select_related("content", "project", "superseded_by") @@ -588,10 +700,14 @@ class SkillResultViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): ), ) class UserFeedbackViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): + """Capture editor feedback on project content items.""" + serializer_class = UserFeedbackSerializer queryset = UserFeedback.objects.select_related("content", "project", "user") def perform_create(self, serializer): + """Attach the authenticated user automatically to new feedback rows.""" + serializer.save(project=self.get_project(), user=self.request.user) @@ -607,6 +723,8 @@ def perform_create(self, serializer): ), ) class IngestionRunViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): + """Inspect ingestion-run history for a project.""" + serializer_class = IngestionRunSerializer queryset = IngestionRun.objects.select_related("project") @@ -630,6 +748,8 @@ class IngestionRunViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): ), ) class SourceConfigViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): + """Manage source-plugin configuration for a project.""" + serializer_class = SourceConfigSerializer queryset = SourceConfig.objects.select_related("project") @@ -646,5 +766,7 @@ class SourceConfigViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): ), ) class ReviewQueueViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): + """Inspect and manage content awaiting manual review.""" + serializer_class = ReviewQueueSerializer queryset = ReviewQueue.objects.select_related("content", "project") diff --git a/core/api_urls.py b/core/api_urls.py index d7cc1cfa..2c6184e3 100644 --- a/core/api_urls.py +++ b/core/api_urls.py @@ -19,14 +19,24 @@ router.register("projects", ProjectViewSet, basename="project") project_router = NestedSimpleRouter(router, r"projects", lookup="project") -project_router.register(r"project-configs", 
ProjectConfigViewSet, basename="project-config") +project_router.register( + r"project-configs", ProjectConfigViewSet, basename="project-config" +) project_router.register(r"entities", EntityViewSet, basename="project-entity") project_router.register(r"contents", ContentViewSet, basename="project-content") -project_router.register(r"skill-results", SkillResultViewSet, basename="project-skill-result") +project_router.register( + r"skill-results", SkillResultViewSet, basename="project-skill-result" +) project_router.register(r"feedback", UserFeedbackViewSet, basename="project-feedback") -project_router.register(r"ingestion-runs", IngestionRunViewSet, basename="project-ingestion-run") -project_router.register(r"source-configs", SourceConfigViewSet, basename="project-source-config") -project_router.register(r"review-queue", ReviewQueueViewSet, basename="project-review-queue") +project_router.register( + r"ingestion-runs", IngestionRunViewSet, basename="project-ingestion-run" +) +project_router.register( + r"source-configs", SourceConfigViewSet, basename="project-source-config" +) +project_router.register( + r"review-queue", ReviewQueueViewSet, basename="project-review-queue" +) urlpatterns = [ *router.urls, diff --git a/core/embeddings.py b/core/embeddings.py index 5bc5f444..051adb25 100644 --- a/core/embeddings.py +++ b/core/embeddings.py @@ -1,3 +1,11 @@ +"""Embedding generation and Qdrant vector-store helpers. + +The rest of the application treats this module as the integration boundary for +vector search. It normalizes provider differences, creates per-project Qdrant +collections, and stores the payload fields later used by relevance scoring and +related-content search. 
+""" + from __future__ import annotations from abc import ABC, abstractmethod @@ -10,12 +18,12 @@ from django.utils.dateparse import parse_datetime from qdrant_client import QdrantClient from qdrant_client.models import ( - Distance, - FieldCondition, - Filter, - MatchValue, - PointStruct, - VectorParams, + Distance, + FieldCondition, + Filter, + MatchValue, + PointStruct, + VectorParams, ) from core.models import Content @@ -26,26 +34,46 @@ def get_sentence_transformer_class(): - global SentenceTransformer + """Lazily import and cache the sentence-transformer class. + + Returns: + The ``SentenceTransformer`` class from the optional dependency. + """ - if SentenceTransformer is None: - from sentence_transformers import SentenceTransformer as sentence_transformer_class + global SentenceTransformer - SentenceTransformer = sentence_transformer_class + if SentenceTransformer is None: + from sentence_transformers import ( + SentenceTransformer as sentence_transformer_class, + ) + + SentenceTransformer = sentence_transformer_class - return SentenceTransformer + return SentenceTransformer class EmbeddingProvider(ABC): + """Abstract interface implemented by all embedding backends.""" + @abstractmethod def embed_text(self, text: str) -> list[float]: + """Embed normalized text into a dense vector.""" + raise NotImplementedError def get_embedding_dimension(self) -> int: + """Infer the output vector size for the provider. + + Returns: + The number of dimensions produced by ``embed_text``. 
+ """ + return len(self.embed_text("dimension probe")) class SentenceTransformerEmbeddingProvider(EmbeddingProvider): + """Embedding provider backed by ``sentence-transformers`` models.""" + def __init__(self): sentence_transformer_class = get_sentence_transformer_class() self.model = sentence_transformer_class( @@ -54,14 +82,22 @@ def __init__(self): ) def embed_text(self, text: str) -> list[float]: + """Encode text with normalized sentence-transformer embeddings.""" + return self.model.encode(text, normalize_embeddings=True).tolist() def get_embedding_dimension(self) -> int: + """Return the model's native embedding dimension without probing text.""" + return int(self.model.get_sentence_embedding_dimension()) class OllamaEmbeddingProvider(EmbeddingProvider): + """Embedding provider backed by an Ollama server.""" + def embed_text(self, text: str) -> list[float]: + """Request embeddings from the Ollama HTTP API.""" + normalized_text = normalize_text(text) response = httpx.post( f"{settings.OLLAMA_URL.rstrip('/')}/api/embed", @@ -81,9 +117,19 @@ def embed_text(self, text: str) -> list[float]: class OpenRouterEmbeddingProvider(EmbeddingProvider): + """Embedding provider backed by OpenRouter's embeddings endpoint.""" + def embed_text(self, text: str) -> list[float]: + """Request embeddings from OpenRouter using the configured model. + + Raises: + RuntimeError: If the OpenRouter API key is not configured. + """ + if not settings.OPENROUTER_API_KEY: - raise RuntimeError("OPENROUTER_API_KEY must be set when using the openrouter embedding provider.") + raise RuntimeError( + "OPENROUTER_API_KEY must be set when using the openrouter embedding provider." 
+ ) headers = { "Authorization": f"Bearer {settings.OPENROUTER_API_KEY}", "Content-Type": "application/json", @@ -107,16 +153,29 @@ def embed_text(self, text: str) -> list[float]: def collection_name_for_project(project_id: int) -> str: + """Return the Qdrant collection name for a project.""" + return f"project_{project_id}_content" @lru_cache(maxsize=1) def get_qdrant_client() -> QdrantClient: + """Create and cache the shared Qdrant client instance.""" + return QdrantClient(url=settings.QDRANT_URL, timeout=10, check_compatibility=False) @lru_cache(maxsize=1) def get_embedding_provider() -> EmbeddingProvider: + """Resolve the configured embedding provider implementation. + + Returns: + The provider instance selected by ``EMBEDDING_PROVIDER``. + + Raises: + ValueError: If the configured provider name is unsupported. + """ + provider_name = settings.EMBEDDING_PROVIDER if provider_name == "sentence-transformers": return SentenceTransformerEmbeddingProvider() @@ -128,14 +187,27 @@ def get_embedding_provider() -> EmbeddingProvider: def get_embedding_dimension() -> int: + """Return the current embedding model's output dimension.""" + return get_embedding_provider().get_embedding_dimension() def embed_text(text: str) -> list[float]: + """Normalize and embed arbitrary text with the active provider.""" + return get_embedding_provider().embed_text(normalize_text(text)) def upsert_content_embedding(content: Content) -> str: + """Write or update a content embedding in the project's Qdrant collection. + + Args: + content: The content row whose embedding should be stored. + + Returns: + The Qdrant point identifier associated with the content row. + """ + client = get_qdrant_client() ensure_project_collection(content.project_id) embedding_id = content.embedding_id or str(uuid4()) @@ -173,9 +245,26 @@ def search_similar( is_reference: bool | None = None, exclude_content_id: int | None = None, ): + """Search a project's Qdrant collection for nearest-neighbor matches. 
+ + Args: + project_id: Project whose collection should be queried. + query_vector: Embedded query vector to compare against stored points. + limit: Maximum number of results to return. + is_reference: Optional filter limiting matches to reference or non-reference + content. + exclude_content_id: Optional content ID to exclude from the result set. + + Returns: + A list of scored Qdrant points. Returns an empty list when the collection + does not exist yet. + """ + if not project_collection_exists(project_id): return [] - query_filter = build_search_filter(is_reference=is_reference, exclude_content_id=exclude_content_id) + query_filter = build_search_filter( + is_reference=is_reference, exclude_content_id=exclude_content_id + ) client = cast(Any, get_qdrant_client()) return client.search( collection_name=collection_name_for_project(project_id), @@ -186,7 +275,11 @@ def search_similar( ) -def search_similar_content(content: Content, limit: int = 10, *, is_reference: bool | None = None): +def search_similar_content( + content: Content, limit: int = 10, *, is_reference: bool | None = None +): + """Find content similar to an existing content row within the same project.""" + return search_similar( content.project_id, embed_text(build_content_embedding_text(content)), @@ -196,7 +289,21 @@ def search_similar_content(content: Content, limit: int = 10, *, is_reference: b ) -def get_reference_similarity(project_id: int, vector: list[float], limit: int = 5) -> float: +def get_reference_similarity( + project_id: int, vector: list[float], limit: int = 5 +) -> float: + """Average the top reference-item similarity scores for a vector. + + Args: + project_id: Project whose reference corpus should be searched. + vector: Embedded representation of the candidate content. + limit: Number of reference matches to average. + + Returns: + The mean cosine similarity of the top matching reference items, or ``0.0`` + when the project has no reference corpus. 
+ """ + scored_points = search_similar(project_id, vector, limit=limit, is_reference=True) if not scored_points: return 0.0 @@ -204,17 +311,23 @@ def get_reference_similarity(project_id: int, vector: list[float], limit: int = def ensure_project_collection(project_id: int) -> None: + """Create the per-project Qdrant collection when it does not yet exist.""" + client = get_qdrant_client() collection_name = collection_name_for_project(project_id) if project_collection_exists(project_id): return client.create_collection( collection_name=collection_name, - vectors_config=VectorParams(size=get_embedding_dimension(), distance=Distance.COSINE), + vectors_config=VectorParams( + size=get_embedding_dimension(), distance=Distance.COSINE + ), ) def project_collection_exists(project_id: int) -> bool: + """Return whether the project's Qdrant collection already exists.""" + try: get_qdrant_client().get_collection(collection_name_for_project(project_id)) except Exception: @@ -223,10 +336,14 @@ def project_collection_exists(project_id: int) -> bool: def build_content_embedding_text(content: Content) -> str: + """Build the text blob used to generate content embeddings.""" + return "\n\n".join(part for part in [content.title, content.content_text] if part) def normalize_text(text: str) -> str: + """Trim input text and replace empty input with a stable placeholder.""" + normalized_text = text.strip() if not normalized_text: return "empty content" @@ -234,6 +351,8 @@ def normalize_text(text: str) -> str: def serialize_published_date(value) -> str: + """Convert supported published-date values into a string payload for Qdrant.""" + if hasattr(value, "isoformat"): return value.isoformat() if isinstance(value, str): @@ -244,12 +363,20 @@ def serialize_published_date(value) -> str: return str(value) -def build_search_filter(*, is_reference: bool | None = None, exclude_content_id: int | None = None) -> Filter | None: +def build_search_filter( + *, is_reference: bool | None = None, 
exclude_content_id: int | None = None +) -> Filter | None: + """Build a Qdrant filter for reference scoping and self-exclusion.""" + conditions = [] if is_reference is not None: - conditions.append(FieldCondition(key="is_reference", match=MatchValue(value=is_reference))) + conditions.append( + FieldCondition(key="is_reference", match=MatchValue(value=is_reference)) + ) if exclude_content_id is not None: - conditions.append(FieldCondition(key="content_id", match=MatchValue(value=exclude_content_id))) + conditions.append( + FieldCondition(key="content_id", match=MatchValue(value=exclude_content_id)) + ) if not conditions: return None must_conditions = conditions if exclude_content_id is None else conditions[:-1] diff --git a/core/llm.py b/core/llm.py index 2bf545f3..cf1b8920 100644 --- a/core/llm.py +++ b/core/llm.py @@ -19,9 +19,13 @@ class OpenRouterJSONResponse: latency_ms: int -def openrouter_chat_json(*, model: str, system_prompt: str, user_prompt: str) -> OpenRouterJSONResponse: +def openrouter_chat_json( + *, model: str, system_prompt: str, user_prompt: str +) -> OpenRouterJSONResponse: if not settings.OPENROUTER_API_KEY: - raise RuntimeError("OPENROUTER_API_KEY must be configured for OpenRouter chat completions.") + raise RuntimeError( + "OPENROUTER_API_KEY must be configured for OpenRouter chat completions." + ) headers = { "Authorization": f"Bearer {settings.OPENROUTER_API_KEY}", diff --git a/core/management/commands/embedding_smoke.py b/core/management/commands/embedding_smoke.py index 431cb625..2735740f 100644 --- a/core/management/commands/embedding_smoke.py +++ b/core/management/commands/embedding_smoke.py @@ -27,12 +27,22 @@ def handle(self, *args, **options): try: content = Content.objects.get(pk=content_id) except Content.DoesNotExist as exc: - raise CommandError(f"Content with id {content_id} does not exist.") from exc + raise CommandError( + f"Content with id {content_id} does not exist." 
+ ) from exc embedding_id = upsert_content_embedding(content) - self.stdout.write(self.style.SUCCESS(f"Upserted embedding for content {content_id}: {embedding_id}")) + self.stdout.write( + self.style.SUCCESS( + f"Upserted embedding for content {content_id}: {embedding_id}" + ) + ) return vector = embed_text(text) preview = ", ".join(f"{value:.4f}" for value in vector[:5]) - self.stdout.write(self.style.SUCCESS(f"Embedding generated successfully. Dimension: {len(vector)}")) + self.stdout.write( + self.style.SUCCESS( + f"Embedding generated successfully. Dimension: {len(vector)}" + ) + ) self.stdout.write(f"Preview: [{preview}]") diff --git a/core/management/commands/seed_demo.py b/core/management/commands/seed_demo.py index d3d058aa..2a1a535e 100644 --- a/core/management/commands/seed_demo.py +++ b/core/management/commands/seed_demo.py @@ -646,7 +646,9 @@ def _seed_pipeline_state( ) content.content_type = article["content_type"] content.relevance_score = relevance_score - content.is_active = relevance_score >= settings.AI_RELEVANCE_REVIEW_THRESHOLD + content.is_active = ( + relevance_score >= settings.AI_RELEVANCE_REVIEW_THRESHOLD + ) content_updates.append(content) skill_results.append( @@ -746,7 +748,10 @@ def _seed_feedback(self, project: Project, contents: list[Content]) -> int: active_contents = sorted( [content for content in contents if content.is_active], - key=lambda content: (content.relevance_score or 0.0, content.published_date), + key=lambda content: ( + content.relevance_score or 0.0, + content.published_date, + ), reverse=True, ) feedback_count = 0 @@ -889,7 +894,9 @@ def _build_demo_content(self) -> list[dict[str, Any]]: def _build_generated_rss_content(self) -> list[dict[str, Any]]: articles = [] for index in range(147): - band = self._band_for_index(index, relevant_cutoff=87, borderline_cutoff=122) + band = self._band_for_index( + index, relevant_cutoff=87, borderline_cutoff=122 + ) publication = RSS_PUBLICATIONS[index % len(RSS_PUBLICATIONS)] 
topic = RSS_TOPIC_BLUEPRINTS[index % len(RSS_TOPIC_BLUEPRINTS)] relevance_score = self._relevance_score(index, band) @@ -899,12 +906,18 @@ def _build_generated_rss_content(self) -> list[dict[str, Any]]: f"https://{publication['host']}/2026/04/" f"{topic['slug']}-{index + 1:03d}" ), - "title": self._rss_title(publication["label"], topic["headline"], band), + "title": self._rss_title( + publication["label"], topic["headline"], band + ), "author": f"{publication['label']} Editorial", "source_plugin": SourcePluginName.RSS, - "content_text": self._rss_body(publication["label"], topic["body"], band), + "content_text": self._rss_body( + publication["label"], topic["body"], band + ), "days_ago": 1 + (index % 30), - "content_type": self._content_type_for_band(topic["content_type"], band), + "content_type": self._content_type_for_band( + topic["content_type"], band + ), "classification_confidence": self._classification_confidence(index), "relevance_score": relevance_score, "entity_name": publication["entity_name"], @@ -928,10 +941,16 @@ def _build_generated_reddit_content(self) -> list[dict[str, Any]]: "title": self._reddit_title(subreddit, topic["headline"], band), "author": f"u/demo_{subreddit}_{index + 1:03d}", "source_plugin": SourcePluginName.REDDIT, - "content_text": self._reddit_body(subreddit, topic["body"], band, index), + "content_text": self._reddit_body( + subreddit, topic["body"], band, index + ), "days_ago": 1 + ((index * 2) % 30), - "content_type": self._content_type_for_band(topic["content_type"], band), - "classification_confidence": self._classification_confidence(index + 200), + "content_type": self._content_type_for_band( + topic["content_type"], band + ), + "classification_confidence": self._classification_confidence( + index + 200 + ), "relevance_score": self._relevance_score(index + 200, band), "entity_name": None, "used_llm": band == "borderline", @@ -940,7 +959,9 @@ def _build_generated_reddit_content(self) -> list[dict[str, Any]]: return articles 
@staticmethod - def _band_for_index(index: int, *, relevant_cutoff: int, borderline_cutoff: int) -> str: + def _band_for_index( + index: int, *, relevant_cutoff: int, borderline_cutoff: int + ) -> str: if index < relevant_cutoff: return "relevant" if index < borderline_cutoff: diff --git a/core/management/commands/sync_embeddings.py b/core/management/commands/sync_embeddings.py index 8cb1836b..4fbea211 100644 --- a/core/management/commands/sync_embeddings.py +++ b/core/management/commands/sync_embeddings.py @@ -8,8 +8,12 @@ class Command(BaseCommand): help = "Backfill Qdrant embeddings for content records." def add_arguments(self, parser): - parser.add_argument("--project-id", type=int, help="Only sync content for one project.") - parser.add_argument("--content-id", type=int, help="Only sync one content record.") + parser.add_argument( + "--project-id", type=int, help="Only sync content for one project." + ) + parser.add_argument( + "--content-id", type=int, help="Only sync one content record." 
+ ) parser.add_argument( "--references-only", action="store_true", @@ -33,4 +37,6 @@ def handle(self, *args, **options): upsert_content_embedding(content) synced_count += 1 - self.stdout.write(self.style.SUCCESS(f"Synced embeddings for {synced_count} content item(s).")) + self.stdout.write( + self.style.SUCCESS(f"Synced embeddings for {synced_count} content item(s).") + ) diff --git a/core/migrations/0002_newsletter_intake.py b/core/migrations/0002_newsletter_intake.py index 70991ab7..bb0fda5b 100644 --- a/core/migrations/0002_newsletter_intake.py +++ b/core/migrations/0002_newsletter_intake.py @@ -19,7 +19,12 @@ class Migration(migrations.Migration): migrations.AddField( model_name="project", name="intake_token", - field=models.CharField(default=core.models.generate_project_intake_token, editable=False, max_length=64, unique=True), + field=models.CharField( + default=core.models.generate_project_intake_token, + editable=False, + max_length=64, + unique=True, + ), ), migrations.AddField( model_name="content", @@ -29,14 +34,33 @@ class Migration(migrations.Migration): migrations.CreateModel( name="IntakeAllowlist", fields=[ - ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), ("sender_email", models.EmailField(max_length=254)), ("confirmed_at", models.DateTimeField(blank=True, null=True)), - ("confirmation_token", models.CharField(default=core.models.generate_confirmation_token, max_length=64, unique=True)), + ( + "confirmation_token", + models.CharField( + default=core.models.generate_confirmation_token, + max_length=64, + unique=True, + ), + ), ("created_at", models.DateTimeField(auto_now_add=True)), ( "project", - models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="intake_allowlist", to="core.project"), + models.ForeignKey( + 
on_delete=django.db.models.deletion.CASCADE, + related_name="intake_allowlist", + to="core.project", + ), ), ], options={ @@ -46,7 +70,15 @@ class Migration(migrations.Migration): migrations.CreateModel( name="NewsletterIntake", fields=[ - ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), ("sender_email", models.EmailField(max_length=254)), ("subject", models.CharField(max_length=512)), ("received_at", models.DateTimeField(auto_now_add=True)), @@ -56,7 +88,12 @@ class Migration(migrations.Migration): ( "status", models.CharField( - choices=[("pending", "Pending"), ("extracted", "Extracted"), ("failed", "Failed"), ("rejected", "Rejected")], + choices=[ + ("pending", "Pending"), + ("extracted", "Extracted"), + ("failed", "Failed"), + ("rejected", "Rejected"), + ], default="pending", max_length=16, ), @@ -65,7 +102,11 @@ class Migration(migrations.Migration): ("error_message", models.TextField(blank=True)), ( "project", - models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="newsletter_intakes", to="core.project"), + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="newsletter_intakes", + to="core.project", + ), ), ], options={ @@ -74,10 +115,16 @@ class Migration(migrations.Migration): ), migrations.AddIndex( model_name="newsletterintake", - index=models.Index(fields=["project", "sender_email", "status"], name="core_newsle_project_2c63fb_idx"), + index=models.Index( + fields=["project", "sender_email", "status"], + name="core_newsle_project_2c63fb_idx", + ), ), migrations.AddConstraint( model_name="intakeallowlist", - constraint=models.UniqueConstraint(fields=("project", "sender_email"), name="core_allowlist_unique_project_sender"), + constraint=models.UniqueConstraint( + fields=("project", "sender_email"), + 
name="core_allowlist_unique_project_sender", + ), ), ] diff --git a/core/models.py b/core/models.py index 3b260940..a7573687 100644 --- a/core/models.py +++ b/core/models.py @@ -1,3 +1,10 @@ +"""Core domain models for projects, ingestion, and editorial review. + +The admin, API, Celery tasks, and AI pipeline all revolve around the models in this +module. Adding model-level docstrings here gives Django admindocs a useful summary +of the core entities new contributors interact with first. +""" + import secrets from django.conf import settings @@ -6,281 +13,445 @@ def generate_project_intake_token() -> str: - return secrets.token_hex(16) + """Generate the stable token used in project-specific intake email aliases. + + Returns: + A random hex token that can be embedded in addresses like + ``intake+@...`` to route inbound newsletters to a project. + """ + + return secrets.token_hex(16) def generate_confirmation_token() -> str: - return secrets.token_urlsafe(24) + """Generate a one-time token for newsletter sender confirmation links. + + Returns: + A URL-safe random token stored on an allowlist entry until the sender + confirms newsletter intake access. 
+ """ + + return secrets.token_urlsafe(24) class EntityType(models.TextChoices): - INDIVIDUAL = "individual", "Individual" - VENDOR = "vendor", "Vendor" - ORGANIZATION = "organization", "Organization" + """Supported types of tracked entities within a project.""" + + INDIVIDUAL = "individual", "Individual" + VENDOR = "vendor", "Vendor" + ORGANIZATION = "organization", "Organization" class SkillStatus(models.TextChoices): - PENDING = "pending", "Pending" - RUNNING = "running", "Running" - COMPLETED = "completed", "Completed" - FAILED = "failed", "Failed" + """Execution states recorded for AI skill runs.""" + + PENDING = "pending", "Pending" + RUNNING = "running", "Running" + COMPLETED = "completed", "Completed" + FAILED = "failed", "Failed" class FeedbackType(models.TextChoices): - UPVOTE = "upvote", "Upvote" - DOWNVOTE = "downvote", "Downvote" + """Editorial feedback signals that tune authority and ranking.""" + + UPVOTE = "upvote", "Upvote" + DOWNVOTE = "downvote", "Downvote" class SourcePluginName(models.TextChoices): - RSS = "rss", "RSS" - REDDIT = "reddit", "Reddit" + """Built-in ingestion plugins that can populate project content.""" + + RSS = "rss", "RSS" + REDDIT = "reddit", "Reddit" class NewsletterIntakeStatus(models.TextChoices): - PENDING = "pending", "Pending" - EXTRACTED = "extracted", "Extracted" - FAILED = "failed", "Failed" - REJECTED = "rejected", "Rejected" + """Lifecycle states for a raw inbound newsletter email.""" + + PENDING = "pending", "Pending" + EXTRACTED = "extracted", "Extracted" + FAILED = "failed", "Failed" + REJECTED = "rejected", "Rejected" class RunStatus(models.TextChoices): - RUNNING = "running", "Running" - SUCCESS = "success", "Success" - FAILED = "failed", "Failed" + """Outcome states for ingestion runs.""" + + RUNNING = "running", "Running" + SUCCESS = "success", "Success" + FAILED = "failed", "Failed" class ReviewReason(models.TextChoices): - LOW_CONFIDENCE_CLASSIFICATION = "low_confidence_classification", "Low Confidence 
Classification" - BORDERLINE_RELEVANCE = "borderline_relevance", "Borderline Relevance" + """Reasons content is pushed to the manual review queue.""" + + LOW_CONFIDENCE_CLASSIFICATION = ( + "low_confidence_classification", + "Low Confidence Classification", + ) + BORDERLINE_RELEVANCE = "borderline_relevance", "Borderline Relevance" class ReviewResolution(models.TextChoices): - HUMAN_APPROVED = "human_approved", "Human Approved" - HUMAN_REJECTED = "human_rejected", "Human Rejected" + """Human outcomes for review queue items.""" + + HUMAN_APPROVED = "human_approved", "Human Approved" + HUMAN_REJECTED = "human_rejected", "Human Rejected" class Project(models.Model): - name = models.CharField(max_length=255) - group = models.ForeignKey(Group, on_delete=models.CASCADE, related_name="projects") - topic_description = models.TextField() - content_retention_days = models.PositiveIntegerField(default=365) - intake_token = models.CharField(max_length=64, unique=True, default=generate_project_intake_token, editable=False) - intake_enabled = models.BooleanField(default=False) - created_at = models.DateTimeField(auto_now_add=True) + """Represents a newsletter workspace owned by a Django auth group. + + A project defines the editorial topic, retention policy, and email-intake + identity used by all downstream ingestion, relevance scoring, and review flows. + Most other core models are scoped to a single project. 
+ """ + + name = models.CharField(max_length=255) + group = models.ForeignKey(Group, on_delete=models.CASCADE, related_name="projects") + topic_description = models.TextField() + content_retention_days = models.PositiveIntegerField(default=365) + intake_token = models.CharField( + max_length=64, + unique=True, + default=generate_project_intake_token, + editable=False, + ) + intake_enabled = models.BooleanField(default=False) + created_at = models.DateTimeField(auto_now_add=True) - class Meta: - ordering = ["name"] + class Meta: + ordering = ["name"] - def __str__(self) -> str: - return self.name + def __str__(self) -> str: + return self.name class ProjectConfig(models.Model): - project = models.OneToOneField(Project, on_delete=models.CASCADE, related_name="config") - upvote_authority_weight = models.FloatField(default=0.1) - downvote_authority_weight = models.FloatField(default=-0.05) - authority_decay_rate = models.FloatField(default=0.95) + """Stores tunable scoring parameters for a single project. - class Meta: - verbose_name = "Project config" - verbose_name_plural = "Project configs" + These values let the application adjust how strongly upvotes, downvotes, and + score decay influence entity authority over time without changing code. 
+ """ - def __str__(self) -> str: - return f"Config for {self.project.name}" + project = models.OneToOneField( + Project, on_delete=models.CASCADE, related_name="config" + ) + upvote_authority_weight = models.FloatField(default=0.1) + downvote_authority_weight = models.FloatField(default=-0.05) + authority_decay_rate = models.FloatField(default=0.95) + + class Meta: + verbose_name = "Project config" + verbose_name_plural = "Project configs" + + def __str__(self) -> str: + return f"Config for {self.project.name}" class Entity(models.Model): - project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="entities") - name = models.CharField(max_length=255) - type = models.CharField(max_length=32, choices=EntityType.choices) - description = models.TextField(blank=True) - authority_score = models.FloatField(default=0.5) - website_url = models.URLField(blank=True) - github_url = models.URLField(blank=True) - linkedin_url = models.URLField(blank=True) - bluesky_handle = models.CharField(max_length=255, blank=True) - mastodon_handle = models.CharField(max_length=255, blank=True) - twitter_handle = models.CharField(max_length=255, blank=True) - created_at = models.DateTimeField(auto_now_add=True) - - class Meta: - ordering = ["name"] - constraints = [ - models.UniqueConstraint(fields=["project", "name"], name="core_entity_unique_project_name"), - ] - - def __str__(self) -> str: - return self.name + """Represents a person, vendor, or organization tracked inside a project. + + Content can optionally link to an entity so authority signals and editorial + curation can accumulate around a known subject instead of isolated articles. 
+ """ + + project = models.ForeignKey( + Project, on_delete=models.CASCADE, related_name="entities" + ) + name = models.CharField(max_length=255) + type = models.CharField(max_length=32, choices=EntityType.choices) + description = models.TextField(blank=True) + authority_score = models.FloatField(default=0.5) + website_url = models.URLField(blank=True) + github_url = models.URLField(blank=True) + linkedin_url = models.URLField(blank=True) + bluesky_handle = models.CharField(max_length=255, blank=True) + mastodon_handle = models.CharField(max_length=255, blank=True) + twitter_handle = models.CharField(max_length=255, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + ordering = ["name"] + constraints = [ + models.UniqueConstraint( + fields=["project", "name"], name="core_entity_unique_project_name" + ), + ] + + def __str__(self) -> str: + return self.name class Content(models.Model): - project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="contents") - url = models.URLField() - title = models.CharField(max_length=512) - author = models.CharField(max_length=255, blank=True) - entity = models.ForeignKey(Entity, null=True, blank=True, on_delete=models.SET_NULL, related_name="contents") - source_plugin = models.CharField(max_length=64) - content_type = models.CharField(max_length=64, blank=True) - published_date = models.DateTimeField() - ingested_at = models.DateTimeField(auto_now_add=True) - content_text = models.TextField() - relevance_score = models.FloatField(null=True, blank=True) - embedding_id = models.CharField(max_length=64, blank=True) - source_metadata = models.JSONField(default=dict, blank=True) - is_reference = models.BooleanField(default=False) - is_active = models.BooleanField(default=True) - - class Meta: - ordering = ["-published_date"] - indexes = [ - models.Index(fields=["project", "-published_date"]), - models.Index(fields=["project", "-relevance_score"]), - models.Index(fields=["project", 
"is_reference"]), - models.Index(fields=["url"]), - ] - - def __str__(self) -> str: - return self.title + """Stores an ingested content item that may appear in a newsletter. + + A content row is the canonical record for fetched articles, newsletter links, + or other source items. It keeps the raw text used for embedding, skill output, + and editorial review, and it also links the row to its Qdrant vector via + ``embedding_id``. + """ + + project = models.ForeignKey( + Project, on_delete=models.CASCADE, related_name="contents" + ) + url = models.URLField() + title = models.CharField(max_length=512) + author = models.CharField(max_length=255, blank=True) + entity = models.ForeignKey( + Entity, + null=True, + blank=True, + on_delete=models.SET_NULL, + related_name="contents", + ) + source_plugin = models.CharField(max_length=64) + content_type = models.CharField(max_length=64, blank=True) + published_date = models.DateTimeField() + ingested_at = models.DateTimeField(auto_now_add=True) + content_text = models.TextField() + relevance_score = models.FloatField(null=True, blank=True) + embedding_id = models.CharField(max_length=64, blank=True) + source_metadata = models.JSONField(default=dict, blank=True) + is_reference = models.BooleanField(default=False) + is_active = models.BooleanField(default=True) + + class Meta: + ordering = ["-published_date"] + indexes = [ + models.Index(fields=["project", "-published_date"]), + models.Index(fields=["project", "-relevance_score"]), + models.Index(fields=["project", "is_reference"]), + models.Index(fields=["url"]), + ] + + def __str__(self) -> str: + return self.title class IntakeAllowlist(models.Model): - project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="intake_allowlist") - sender_email = models.EmailField() - confirmed_at = models.DateTimeField(null=True, blank=True) - confirmation_token = models.CharField(max_length=64, unique=True, default=generate_confirmation_token) - created_at = 
models.DateTimeField(auto_now_add=True) - - class Meta: - ordering = ["sender_email"] - constraints = [ - models.UniqueConstraint(fields=["project", "sender_email"], name="core_allowlist_unique_project_sender"), - ] - - def __str__(self) -> str: - return f"{self.sender_email} for {self.project.name}" - - @property - def is_confirmed(self) -> bool: - return self.confirmed_at is not None + """Tracks who is allowed to send newsletters into a project inbox. + + When the first message arrives from a sender, the system creates an allowlist + entry and emails a confirmation link. After confirmation, future inbound + messages from the same sender can be processed automatically. + """ + + project = models.ForeignKey( + Project, on_delete=models.CASCADE, related_name="intake_allowlist" + ) + sender_email = models.EmailField() + confirmed_at = models.DateTimeField(null=True, blank=True) + confirmation_token = models.CharField( + max_length=64, unique=True, default=generate_confirmation_token + ) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + ordering = ["sender_email"] + constraints = [ + models.UniqueConstraint( + fields=["project", "sender_email"], + name="core_allowlist_unique_project_sender", + ), + ] + + def __str__(self) -> str: + return f"{self.sender_email} for {self.project.name}" + + @property + def is_confirmed(self) -> bool: + """Return whether the sender has confirmed newsletter intake access.""" + + return self.confirmed_at is not None class NewsletterIntake(models.Model): - project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="newsletter_intakes") - sender_email = models.EmailField() - subject = models.CharField(max_length=512) - received_at = models.DateTimeField(auto_now_add=True) - raw_html = models.TextField(blank=True) - raw_text = models.TextField(blank=True) - message_id = models.CharField(max_length=255, unique=True) - status = models.CharField(max_length=16, choices=NewsletterIntakeStatus.choices, 
default=NewsletterIntakeStatus.PENDING) - extraction_result = models.JSONField(null=True, blank=True) - error_message = models.TextField(blank=True) - - class Meta: - ordering = ["-received_at"] - indexes = [ - models.Index(fields=["project", "sender_email", "status"]), - ] - - def __str__(self) -> str: - return f"{self.subject or self.message_id}" + """Stores a raw inbound newsletter email before extraction. + + Intake rows preserve the original email payload, deduplicate by message ID, + and record whether extraction succeeded so the system can reprocess or audit + inbound newsletter handling later. + """ + + project = models.ForeignKey( + Project, on_delete=models.CASCADE, related_name="newsletter_intakes" + ) + sender_email = models.EmailField() + subject = models.CharField(max_length=512) + received_at = models.DateTimeField(auto_now_add=True) + raw_html = models.TextField(blank=True) + raw_text = models.TextField(blank=True) + message_id = models.CharField(max_length=255, unique=True) + status = models.CharField( + max_length=16, + choices=NewsletterIntakeStatus.choices, + default=NewsletterIntakeStatus.PENDING, + ) + extraction_result = models.JSONField(null=True, blank=True) + error_message = models.TextField(blank=True) + + class Meta: + ordering = ["-received_at"] + indexes = [ + models.Index(fields=["project", "sender_email", "status"]), + ] + + def __str__(self) -> str: + return f"{self.subject or self.message_id}" class SkillResult(models.Model): - content = models.ForeignKey(Content, on_delete=models.CASCADE, related_name="skill_results") - project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="skill_results") - skill_name = models.CharField(max_length=64) - status = models.CharField(max_length=16, choices=SkillStatus.choices) - result_data = models.JSONField(null=True, blank=True) - error_message = models.TextField(blank=True) - model_used = models.CharField(max_length=64, blank=True) - latency_ms = 
models.IntegerField(null=True, blank=True) - confidence = models.FloatField(null=True, blank=True) - created_at = models.DateTimeField(auto_now_add=True) - superseded_by = models.ForeignKey( - "self", - null=True, - blank=True, - on_delete=models.SET_NULL, - related_name="supersedes", - ) - - class Meta: - ordering = ["-created_at"] - indexes = [ - models.Index(fields=["content", "skill_name"]), - models.Index(fields=["project", "created_at"]), - ] - - def __str__(self) -> str: - return f"{self.skill_name} for {self.content.title}" + """Persists the output of one AI skill execution for a content item. + + Skill results provide an auditable history of classifications, relevance + scores, summaries, and related-content lookups, including model metadata, + latency, and any superseded reruns. + """ + + content = models.ForeignKey( + Content, on_delete=models.CASCADE, related_name="skill_results" + ) + project = models.ForeignKey( + Project, on_delete=models.CASCADE, related_name="skill_results" + ) + skill_name = models.CharField(max_length=64) + status = models.CharField(max_length=16, choices=SkillStatus.choices) + result_data = models.JSONField(null=True, blank=True) + error_message = models.TextField(blank=True) + model_used = models.CharField(max_length=64, blank=True) + latency_ms = models.IntegerField(null=True, blank=True) + confidence = models.FloatField(null=True, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + superseded_by = models.ForeignKey( + "self", + null=True, + blank=True, + on_delete=models.SET_NULL, + related_name="supersedes", + ) + + class Meta: + ordering = ["-created_at"] + indexes = [ + models.Index(fields=["content", "skill_name"]), + models.Index(fields=["project", "created_at"]), + ] + + def __str__(self) -> str: + return f"{self.skill_name} for {self.content.title}" class UserFeedback(models.Model): - content = models.ForeignKey(Content, on_delete=models.CASCADE, related_name="feedback") - project = 
models.ForeignKey(Project, on_delete=models.CASCADE, related_name="feedback") - user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name="content_feedback") - feedback_type = models.CharField(max_length=16, choices=FeedbackType.choices) - created_at = models.DateTimeField(auto_now_add=True) + """Records an editor's feedback on a specific content item. + + Feedback is stored separately from the content row so the application can use + it as an explicit human signal when adjusting ranking and authority logic. + """ + + content = models.ForeignKey( + Content, on_delete=models.CASCADE, related_name="feedback" + ) + project = models.ForeignKey( + Project, on_delete=models.CASCADE, related_name="feedback" + ) + user = models.ForeignKey( + settings.AUTH_USER_MODEL, + on_delete=models.CASCADE, + related_name="content_feedback", + ) + feedback_type = models.CharField(max_length=16, choices=FeedbackType.choices) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + ordering = ["-created_at"] + constraints = [ + models.UniqueConstraint( + fields=["content", "user"], name="core_feedback_unique_content_user" + ), + ] + + def __str__(self) -> str: + return f"{self.feedback_type} by {self.user}" - class Meta: - ordering = ["-created_at"] - constraints = [ - models.UniqueConstraint(fields=["content", "user"], name="core_feedback_unique_content_user"), - ] - def __str__(self) -> str: - return f"{self.feedback_type} by {self.user}" +class SourceConfig(models.Model): + """Configures one ingestion source for a project. + Each source config selects a plugin, stores its provider-specific settings, + and records the last successful fetch time used for incremental ingestion. 
+ """ -class SourceConfig(models.Model): - project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="source_configs") - plugin_name = models.CharField(max_length=64, choices=SourcePluginName.choices) - config = models.JSONField(default=dict) - is_active = models.BooleanField(default=True) - last_fetched_at = models.DateTimeField(null=True, blank=True) + project = models.ForeignKey( + Project, on_delete=models.CASCADE, related_name="source_configs" + ) + plugin_name = models.CharField(max_length=64, choices=SourcePluginName.choices) + config = models.JSONField(default=dict) + is_active = models.BooleanField(default=True) + last_fetched_at = models.DateTimeField(null=True, blank=True) - class Meta: - ordering = ["plugin_name", "id"] - indexes = [ - models.Index(fields=["project", "plugin_name", "is_active"]), - ] + class Meta: + ordering = ["plugin_name", "id"] + indexes = [ + models.Index(fields=["project", "plugin_name", "is_active"]), + ] - def __str__(self) -> str: - return f"{self.plugin_name} source for {self.project.name}" + def __str__(self) -> str: + return f"{self.plugin_name} source for {self.project.name}" class IngestionRun(models.Model): - project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="ingestion_runs") - plugin_name = models.CharField(max_length=64) - started_at = models.DateTimeField(auto_now_add=True) - completed_at = models.DateTimeField(null=True, blank=True) - status = models.CharField(max_length=16, choices=RunStatus.choices) - items_fetched = models.IntegerField(default=0) - items_ingested = models.IntegerField(default=0) - error_message = models.TextField(blank=True) + """Captures the outcome of one source-ingestion execution. + + Run rows make ingestion observable in the admin by recording the source, + timestamps, item counts, and any error that stopped the fetch. 
+ """ + + project = models.ForeignKey( + Project, on_delete=models.CASCADE, related_name="ingestion_runs" + ) + plugin_name = models.CharField(max_length=64) + started_at = models.DateTimeField(auto_now_add=True) + completed_at = models.DateTimeField(null=True, blank=True) + status = models.CharField(max_length=16, choices=RunStatus.choices) + items_fetched = models.IntegerField(default=0) + items_ingested = models.IntegerField(default=0) + error_message = models.TextField(blank=True) - class Meta: - ordering = ["-started_at"] - indexes = [ - models.Index(fields=["project", "plugin_name", "-started_at"]), - ] + class Meta: + ordering = ["-started_at"] + indexes = [ + models.Index(fields=["project", "plugin_name", "-started_at"]), + ] - def __str__(self) -> str: - return f"{self.plugin_name} for {self.project.name}" + def __str__(self) -> str: + return f"{self.plugin_name} for {self.project.name}" class ReviewQueue(models.Model): - project = models.ForeignKey(Project, on_delete=models.CASCADE, related_name="review_queue_items") - content = models.ForeignKey(Content, on_delete=models.CASCADE, related_name="review_queue_items") - reason = models.CharField(max_length=64, choices=ReviewReason.choices) - confidence = models.FloatField() - created_at = models.DateTimeField(auto_now_add=True) - resolved = models.BooleanField(default=False) - resolution = models.CharField(max_length=64, choices=ReviewResolution.choices, blank=True) - - class Meta: - ordering = ["resolved", "-created_at"] - - def __str__(self) -> str: - return f"{self.reason} for {self.content.title}" + """Tracks content items that require a human decision. + + The AI pipeline adds rows here when classification confidence is low or the + relevance score is borderline. Review outcomes are stored on the queue item so + editors can see why an article was escalated and how it was resolved. 
+ """ + + project = models.ForeignKey( + Project, on_delete=models.CASCADE, related_name="review_queue_items" + ) + content = models.ForeignKey( + Content, on_delete=models.CASCADE, related_name="review_queue_items" + ) + reason = models.CharField(max_length=64, choices=ReviewReason.choices) + confidence = models.FloatField() + created_at = models.DateTimeField(auto_now_add=True) + resolved = models.BooleanField(default=False) + resolution = models.CharField( + max_length=64, choices=ReviewResolution.choices, blank=True + ) + + class Meta: + ordering = ["resolved", "-created_at"] + + def __str__(self) -> str: + return f"{self.reason} for {self.content.title}" diff --git a/core/newsletter_extraction.py b/core/newsletter_extraction.py index 87e85113..51fec279 100644 --- a/core/newsletter_extraction.py +++ b/core/newsletter_extraction.py @@ -1,3 +1,5 @@ +"""Heuristics for extracting article candidates from newsletter emails.""" + from __future__ import annotations import re @@ -9,6 +11,8 @@ @dataclass(slots=True) class ExtractedNewsletterItem: + """Represents one link candidate extracted from a newsletter email.""" + url: str title: str excerpt: str @@ -16,6 +20,8 @@ class ExtractedNewsletterItem: class _NewsletterLinkParser(HTMLParser): + """Collect anchor tags with HTTP(S) targets from newsletter HTML.""" + def __init__(self) -> None: super().__init__() self.links: list[dict[str, str]] = [] @@ -41,14 +47,29 @@ def handle_endtag(self, tag: str) -> None: self.links.append( { "url": self._active_href, - "title": " ".join(part.strip() for part in self._active_text if part.strip()), + "title": " ".join( + part.strip() for part in self._active_text if part.strip() + ), } ) self._active_href = None self._active_text = [] -def extract_newsletter_items(*, subject: str, raw_html: str, raw_text: str) -> list[ExtractedNewsletterItem]: +def extract_newsletter_items( + *, subject: str, raw_html: str, raw_text: str +) -> list[ExtractedNewsletterItem]: + """Extract ordered 
newsletter items from HTML anchors and plain-text URLs. + + Args: + subject: Subject line used as a fallback title. + raw_html: HTML body of the newsletter email. + raw_text: Plain-text body of the newsletter email. + + Returns: + A de-duplicated ordered list of extracted article candidates. + """ + parser = _NewsletterLinkParser() if raw_html: parser.feed(raw_html) diff --git a/core/newsletters.py b/core/newsletters.py index a44b45fe..c7fb4665 100644 --- a/core/newsletters.py +++ b/core/newsletters.py @@ -1,3 +1,10 @@ +"""Newsletter intake helpers for inbound email processing. + +This module normalizes inbound sender data, sanitizes HTML before storage, +deduplicates raw email messages, and hands confirmed messages off to the Celery +task that extracts content items from a newsletter email. +""" + from __future__ import annotations from email.utils import parseaddr @@ -20,11 +27,22 @@ def normalize_sender_email(value: str) -> str: + """Normalize a sender header into a lowercase bare email address.""" + _, email_address = parseaddr(value) return email_address.strip().lower() def sanitize_newsletter_html(raw_html: str) -> str: + """Remove script content and inline event handlers from newsletter HTML. + + Args: + raw_html: Raw HTML body captured from the inbound message. + + Returns: + A sanitized HTML fragment safe to persist and render in the admin. 
+ """ + without_scripts = _strip_script_blocks(raw_html) parser = _InlineHandlerStrippingParser() parser.feed(without_scripts) @@ -33,6 +51,8 @@ def sanitize_newsletter_html(raw_html: str) -> str: def _strip_script_blocks(raw_html: str) -> str: + """Remove complete ```` boundary for a previously found script tag.""" + search_index = start_index lower_html = raw_html.lower() while True: @@ -85,6 +109,8 @@ def _find_script_end(raw_html: str, start_index: int) -> int: def _find_tag_end(raw_html: str, start_index: int) -> int: + """Find the closing ``>`` for a tag while respecting quoted attributes.""" + quote_char: str | None = None for index in range(start_index, len(raw_html)): current_char = raw_html[index] @@ -101,11 +127,15 @@ def _find_tag_end(raw_html: str, start_index: int) -> int: class _InlineHandlerStrippingParser(HTMLParser): + """HTML parser that rebuilds markup without inline JavaScript handlers.""" + def __init__(self) -> None: super().__init__(convert_charrefs=False) self._parts: list[str] = [] def get_html(self) -> str: + """Return the reconstructed sanitized HTML string.""" + return "".join(self._parts) def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: @@ -140,6 +170,8 @@ def unknown_decl(self, data: str) -> None: @staticmethod def _render_tag(tag: str, attrs: list[tuple[str, str | None]]) -> str: + """Render a tag while omitting attributes like ``onclick``.""" + rendered_attrs: list[str] = [] for name, value in attrs: if name.lower().startswith("on"): @@ -153,6 +185,16 @@ def _render_tag(tag: str, attrs: list[tuple[str, str | None]]) -> str: def extract_project_token(recipient: str) -> str | None: + """Extract the project intake token from an inbound recipient address. + + Args: + recipient: Email recipient such as ``intake+@example.com``. + + Returns: + The embedded project token, or ``None`` when the address does not match the + intake alias format. 
+ """ + _, email_address = parseaddr(recipient) local_part = email_address.partition("@")[0] prefix, separator, token = local_part.partition("+") @@ -161,7 +203,11 @@ def extract_project_token(recipient: str) -> str | None: return token -def send_confirmation_email(*, to_email: str, confirm_url: str, project_name: str) -> None: +def send_confirmation_email( + *, to_email: str, confirm_url: str, project_name: str +) -> None: + """Send the confirmation email required for new newsletter senders.""" + subject = f"Confirm newsletter intake for {project_name}" text_body = ( "Confirm this sender for newsletter ingestion.\n\n" @@ -183,6 +229,8 @@ def send_confirmation_email(*, to_email: str, confirm_url: str, project_name: st def build_confirmation_url(token: str) -> str: + """Build the absolute confirmation URL for an allowlist token.""" + base_url = settings.NEWSLETTER_API_BASE_URL.rstrip("/") return f"{base_url}{reverse('confirm-newsletter-sender', kwargs={'token': token})}" @@ -196,6 +244,21 @@ def process_inbound_newsletter( raw_text: str, message_id: str, ) -> dict[str, Any]: + """Persist and route one inbound newsletter message. + + Args: + recipients: Recipient addresses from the inbound email payload. + sender_email: Envelope sender or normalized message sender. + subject: Newsletter email subject. + raw_html: Raw HTML body captured from the provider webhook. + raw_text: Raw plain-text body captured from the provider webhook. + message_id: Provider message identifier used for deduplication. + + Returns: + A status payload describing whether the message was ignored, queued, or is + waiting for sender confirmation. 
+ """ + project = _find_intake_project(recipients) if project is None: return {"status": "ignored", "reason": "no_matching_project"} @@ -239,7 +302,15 @@ def process_inbound_newsletter( def queue_newsletter_intake(intake_id: int) -> None: - process_newsletter_intake = current_app.tasks["core.tasks.process_newsletter_intake"] + """Dispatch newsletter extraction for a stored intake row. + + Args: + intake_id: Primary key of the stored ``NewsletterIntake`` row. + """ + + process_newsletter_intake = current_app.tasks[ + "core.tasks.process_newsletter_intake" + ] if settings.CELERY_TASK_ALWAYS_EAGER: process_newsletter_intake.apply(args=(intake_id,), throw=True) else: @@ -247,11 +318,15 @@ def queue_newsletter_intake(intake_id: int) -> None: def _find_intake_project(recipients: Iterable[str]) -> Project | None: + """Resolve the first enabled project referenced by the recipient list.""" + for recipient in recipients: token = extract_project_token(recipient) if token is None: continue - project = Project.objects.filter(intake_token=token, intake_enabled=True).first() + project = Project.objects.filter( + intake_token=token, intake_enabled=True + ).first() if project is not None: return project return None diff --git a/core/pipeline.py b/core/pipeline.py index ac5527c4..c14974ee 100644 --- a/core/pipeline.py +++ b/core/pipeline.py @@ -1,3 +1,10 @@ +"""Content-classification, relevance, and summarization workflow helpers. + +This module contains the runtime implementation of the app's AI pipeline. It owns +the LangGraph orchestration, the heuristics and LLM fallbacks for each skill, and +the persistence of skill results and manual-review queue items. 
+""" + from __future__ import annotations import logging @@ -8,7 +15,12 @@ from django.conf import settings from langgraph.graph import END, StateGraph -from core.embeddings import build_content_embedding_text, embed_text, get_reference_similarity, search_similar_content +from core.embeddings import ( + build_content_embedding_text, + embed_text, + get_reference_similarity, + search_similar_content, +) from core.llm import openrouter_chat_json from core.models import Content, ReviewQueue, ReviewReason, SkillResult, SkillStatus @@ -32,6 +44,8 @@ class PipelineState(TypedDict, total=False): + """State payload passed between LangGraph pipeline nodes.""" + content_id: int project_id: int classification: dict[str, Any] | None @@ -42,6 +56,13 @@ class PipelineState(TypedDict, total=False): @lru_cache(maxsize=1) def get_ingestion_graph(): + """Build and cache the LangGraph workflow used for content processing. + + Returns: + A compiled state graph that classifies content, scores relevance, and then + routes the item to summarization, archival, or human review. + """ + graph = StateGraph(PipelineState) graph.add_node("classify", classify_node) graph.add_node("score_relevance", relevance_node) @@ -66,6 +87,15 @@ def get_ingestion_graph(): def process_content_pipeline(content_id: int) -> PipelineState: + """Run the end-to-end ingestion pipeline for one content item. + + Args: + content_id: Primary key of the content row to process. + + Returns: + The final pipeline state returned by the compiled LangGraph workflow. 
+ """ + content = Content.objects.select_related("project").get(pk=content_id) initial_state: PipelineState = { "content_id": content.id, @@ -76,8 +106,12 @@ def process_content_pipeline(content_id: int) -> PipelineState: def classify_node(state: PipelineState) -> PipelineState: + """Classify the content item and persist the resulting skill output.""" + content = _get_content(state) - classification = _execute_with_retries(CLASSIFICATION_SKILL_NAME, lambda: run_content_classification(content)) + classification = _execute_with_retries( + CLASSIFICATION_SKILL_NAME, lambda: run_content_classification(content) + ) content.content_type = classification["content_type"] content.save(update_fields=["content_type"]) _create_skill_result( @@ -99,8 +133,12 @@ def classify_node(state: PipelineState) -> PipelineState: def relevance_node(state: PipelineState) -> PipelineState: + """Score content relevance, persist the score, and keep the item active.""" + content = _get_content(state) - relevance = _execute_with_retries(RELEVANCE_SKILL_NAME, lambda: run_relevance_scoring(content)) + relevance = _execute_with_retries( + RELEVANCE_SKILL_NAME, lambda: run_relevance_scoring(content) + ) content.relevance_score = relevance["relevance_score"] content.is_active = True content.save(update_fields=["relevance_score", "is_active"]) @@ -117,8 +155,12 @@ def relevance_node(state: PipelineState) -> PipelineState: def summarize_node(state: PipelineState) -> PipelineState: + """Generate and store a newsletter-ready summary for relevant content.""" + content = _get_content(state) - summary = _execute_with_retries(SUMMARIZATION_SKILL_NAME, lambda: run_summarization(content)) + summary = _execute_with_retries( + SUMMARIZATION_SKILL_NAME, lambda: run_summarization(content) + ) _create_skill_result( content, skill_name=SUMMARIZATION_SKILL_NAME, @@ -131,6 +173,8 @@ def summarize_node(state: PipelineState) -> PipelineState: def archive_node(state: PipelineState) -> PipelineState: + """Mark a low-value 
content item inactive so it drops out of active flows.""" + content = _get_content(state) content.is_active = False content.save(update_fields=["is_active"]) @@ -138,19 +182,34 @@ def archive_node(state: PipelineState) -> PipelineState: def queue_review_node(state: PipelineState) -> PipelineState: + """Create or refresh a manual review item for borderline relevance.""" + content = _get_content(state) relevance = state.get("relevance") or {} _upsert_review_queue_item( content, reason=ReviewReason.BORDERLINE_RELEVANCE, - confidence=float(relevance.get("relevance_score", settings.AI_RELEVANCE_REVIEW_THRESHOLD)), + confidence=float( + relevance.get("relevance_score", settings.AI_RELEVANCE_REVIEW_THRESHOLD) + ), ) content.is_active = True content.save(update_fields=["is_active"]) return {"status": "review"} -def route_by_relevance(state: PipelineState) -> Literal["relevant", "borderline", "irrelevant"]: +def route_by_relevance( + state: PipelineState, +) -> Literal["relevant", "borderline", "irrelevant"]: + """Choose the next workflow branch from the computed relevance score. + + Args: + state: Current pipeline state, including the relevance result when present. + + Returns: + The route name consumed by LangGraph to continue processing. + """ + relevance = state.get("relevance") or {} score = float(relevance.get("relevance_score", 0.0)) if score >= settings.AI_RELEVANCE_SUMMARIZE_THRESHOLD: @@ -161,6 +220,16 @@ def route_by_relevance(state: PipelineState) -> Literal["relevant", "borderline" def run_content_classification(content: Content) -> dict[str, Any]: + """Classify a content item into a newsletter-oriented content type. + + Args: + content: The content row being classified. + + Returns: + A normalized payload containing the selected content type, confidence, + explanation, and model metadata. 
+ """ + if settings.OPENROUTER_API_KEY: try: response = openrouter_chat_json( @@ -180,7 +249,9 @@ def run_content_classification(content: Content) -> dict[str, Any]: return { "content_type": content_type, "confidence": confidence, - "explanation": str(payload.get("explanation", "LLM-based classification.")), + "explanation": str( + payload.get("explanation", "LLM-based classification.") + ), "model_used": response.model, "latency_ms": response.latency_ms, } @@ -193,9 +264,24 @@ def run_content_classification(content: Content) -> dict[str, Any]: def run_relevance_scoring(content: Content) -> dict[str, Any]: + """Score how relevant a content item is to its project's topic. + + The function first measures similarity to the project's reference corpus in + Qdrant. Only borderline scores are sent to the LLM for adjudication. + + Args: + content: The content row being scored. + + Returns: + A payload containing the relevance score, explanation, and model metadata. + """ + vector = embed_text(build_content_embedding_text(content)) similarity = float(get_reference_similarity(content.project_id, vector)) - if similarity >= settings.AI_RELEVANCE_HIGH_THRESHOLD or similarity < settings.AI_RELEVANCE_LOW_THRESHOLD: + if ( + similarity >= settings.AI_RELEVANCE_HIGH_THRESHOLD + or similarity < settings.AI_RELEVANCE_LOW_THRESHOLD + ): return { "relevance_score": similarity, "explanation": f"Reference corpus similarity score is {similarity:.2f}; no LLM adjudication was required.", @@ -221,8 +307,12 @@ def run_relevance_scoring(content: Content) -> dict[str, Any]: ) payload = response.payload return { - "relevance_score": _clamp_score(payload.get("relevance_score", similarity)), - "explanation": str(payload.get("explanation", "LLM-based relevance adjudication.")), + "relevance_score": _clamp_score( + payload.get("relevance_score", similarity) + ), + "explanation": str( + payload.get("explanation", "LLM-based relevance adjudication.") + ), "used_llm": True, "model_used": 
response.model, "latency_ms": response.latency_ms, @@ -246,6 +336,15 @@ def run_relevance_scoring(content: Content) -> dict[str, Any]: def run_summarization(content: Content) -> dict[str, Any]: + """Generate a concise newsletter summary for a content item. + + Args: + content: The content row to summarize. + + Returns: + A payload containing the summary text plus model metadata. + """ + if settings.OPENROUTER_API_KEY: try: response = openrouter_chat_json( @@ -260,7 +359,9 @@ def run_summarization(content: Content) -> dict[str, Any]: ), ) return { - "summary": _normalize_summary(str(response.payload.get("summary", "")), content), + "summary": _normalize_summary( + str(response.payload.get("summary", "")), content + ), "model_used": response.model, "latency_ms": response.latency_ms, } @@ -277,6 +378,19 @@ def run_summarization(content: Content) -> dict[str, Any]: def execute_ad_hoc_skill(content: Content, skill_name: str) -> SkillResult: + """Run one supported skill immediately for a single content item. + + Args: + content: The content row to evaluate. + skill_name: Name of the skill to execute. + + Returns: + The persisted skill-result row for the ad hoc execution. + + Raises: + ValueError: If the requested skill name is not supported. + """ + if skill_name == CLASSIFICATION_SKILL_NAME: return _execute_ad_hoc_classification(content) if skill_name == RELEVANCE_SKILL_NAME: @@ -289,6 +403,19 @@ def execute_ad_hoc_skill(content: Content, skill_name: str) -> SkillResult: def create_pending_skill_result(content: Content, skill_name: str) -> SkillResult: + """Create a placeholder skill-result row for async ad hoc execution. + + Args: + content: The content row the skill will operate on. + skill_name: Supported async skill name. + + Returns: + A pending skill-result row that can be updated by a Celery worker. + + Raises: + ValueError: If the skill cannot be executed asynchronously. 
+ """ + if skill_name not in ASYNC_AD_HOC_SKILL_NAMES: raise ValueError(f"Unsupported async skill name: {skill_name}") return _create_skill_result( @@ -298,8 +425,26 @@ def create_pending_skill_result(content: Content, skill_name: str) -> SkillResul ) -def execute_background_skill_result(skill_result_id: int, skill_name: str) -> SkillResult: - skill_result = SkillResult.objects.select_related("content", "content__project").get(pk=skill_result_id) +def execute_background_skill_result( + skill_result_id: int, skill_name: str +) -> SkillResult: + """Execute an async ad hoc skill and update its persisted result row. + + Args: + skill_result_id: Primary key of the pending skill-result row. + skill_name: Expected skill name for the row being executed. + + Returns: + The updated skill-result row after success or failure. + + Raises: + ValueError: If the stored skill name does not match the requested one or if + the skill name is unsupported. + """ + + skill_result = SkillResult.objects.select_related( + "content", "content__project" + ).get(pk=skill_result_id) if skill_result.skill_name != skill_name: raise ValueError( f"Skill result {skill_result.id} is for {skill_result.skill_name}, not {skill_name}." 
@@ -344,8 +489,12 @@ def execute_background_skill_result(skill_result_id: int, skill_name: str) -> Sk def _execute_ad_hoc_classification(content: Content) -> SkillResult: + """Run classification immediately and persist success or failure.""" + try: - classification = _execute_with_retries(CLASSIFICATION_SKILL_NAME, lambda: run_content_classification(content)) + classification = _execute_with_retries( + CLASSIFICATION_SKILL_NAME, lambda: run_content_classification(content) + ) content.content_type = classification["content_type"] content.save(update_fields=["content_type"]) if classification["confidence"] < settings.AI_CLASSIFICATION_REVIEW_THRESHOLD: @@ -364,10 +513,14 @@ def _execute_ad_hoc_classification(content: Content) -> SkillResult: confidence=classification["confidence"], ) except Exception as exc: - return _create_failed_skill_result(content, skill_name=CLASSIFICATION_SKILL_NAME, error_message=str(exc)) + return _create_failed_skill_result( + content, skill_name=CLASSIFICATION_SKILL_NAME, error_message=str(exc) + ) def _execute_ad_hoc_relevance(content: Content) -> SkillResult: + """Run relevance scoring immediately and persist success or failure.""" + try: relevance, relevance_score = _run_ad_hoc_relevance(content) return _create_skill_result( @@ -380,10 +533,14 @@ def _execute_ad_hoc_relevance(content: Content) -> SkillResult: confidence=relevance_score, ) except Exception as exc: - return _create_failed_skill_result(content, skill_name=RELEVANCE_SKILL_NAME, error_message=str(exc)) + return _create_failed_skill_result( + content, skill_name=RELEVANCE_SKILL_NAME, error_message=str(exc) + ) def _execute_ad_hoc_summarization(content: Content) -> SkillResult: + """Run summarization immediately and persist success or failure.""" + try: summary = _run_ad_hoc_summarization(content) return _create_skill_result( @@ -395,10 +552,14 @@ def _execute_ad_hoc_summarization(content: Content) -> SkillResult: latency_ms=summary["latency_ms"], ) except Exception as exc: - 
return _create_failed_skill_result(content, skill_name=SUMMARIZATION_SKILL_NAME, error_message=str(exc)) + return _create_failed_skill_result( + content, skill_name=SUMMARIZATION_SKILL_NAME, error_message=str(exc) + ) def _execute_ad_hoc_related_content(content: Content) -> SkillResult: + """Find similar non-reference content and store the match list as a skill result.""" + try: matches = search_similar_content(content, limit=5, is_reference=False) related_items = [_serialize_related_match(match) for match in matches] @@ -416,16 +577,26 @@ def _execute_ad_hoc_related_content(content: Content) -> SkillResult: confidence=top_score, ) except Exception as exc: - return _create_failed_skill_result(content, skill_name=RELATED_CONTENT_SKILL_NAME, error_message=str(exc)) + return _create_failed_skill_result( + content, skill_name=RELATED_CONTENT_SKILL_NAME, error_message=str(exc) + ) def _run_ad_hoc_relevance(content: Content) -> tuple[dict[str, Any], float]: - relevance = _execute_with_retries(RELEVANCE_SKILL_NAME, lambda: run_relevance_scoring(content)) + """Apply ad hoc relevance scoring and update the content row accordingly.""" + + relevance = _execute_with_retries( + RELEVANCE_SKILL_NAME, lambda: run_relevance_scoring(content) + ) relevance_score = float(relevance["relevance_score"]) content.relevance_score = relevance_score content.is_active = relevance_score >= settings.AI_RELEVANCE_REVIEW_THRESHOLD content.save(update_fields=["relevance_score", "is_active"]) - if settings.AI_RELEVANCE_REVIEW_THRESHOLD <= relevance_score < settings.AI_RELEVANCE_SUMMARIZE_THRESHOLD: + if ( + settings.AI_RELEVANCE_REVIEW_THRESHOLD + <= relevance_score + < settings.AI_RELEVANCE_SUMMARIZE_THRESHOLD + ): _upsert_review_queue_item( content, reason=ReviewReason.BORDERLINE_RELEVANCE, @@ -435,15 +606,43 @@ def _run_ad_hoc_relevance(content: Content) -> tuple[dict[str, Any], float]: def _run_ad_hoc_summarization(content: Content) -> dict[str, Any]: + """Run summarization only when the 
content has already cleared the score gate. + + Args: + content: The content row to summarize. + + Returns: + The summarization payload returned by ``run_summarization``. + + Raises: + ValueError: If the content has not yet reached the relevance threshold + required for summarization. + """ + if (content.relevance_score or 0.0) < settings.AI_RELEVANCE_SUMMARIZE_THRESHOLD: raise ValueError( "Summarization requires relevance_score >= " f"{settings.AI_RELEVANCE_SUMMARIZE_THRESHOLD:.2f}. Run relevance scoring first or review the content." ) - return _execute_with_retries(SUMMARIZATION_SKILL_NAME, lambda: run_summarization(content)) + return _execute_with_retries( + SUMMARIZATION_SKILL_NAME, lambda: run_summarization(content) + ) def _execute_with_retries(skill_name: str, fn): + """Retry a skill callable up to the configured retry budget. + + Args: + skill_name: Name used for logging failed attempts. + fn: Zero-argument callable that performs the skill work. + + Returns: + The value returned by ``fn`` when one attempt succeeds. + + Raises: + Exception: Re-raises the final exception after all retries fail. 
+ """ + last_exc: Exception | None = None for attempt in range(settings.AI_MAX_NODE_RETRIES + 1): try: @@ -459,6 +658,8 @@ def _execute_with_retries(skill_name: str, fn): def _serialize_related_match(match: Any) -> dict[str, Any]: + """Convert a Qdrant match object into the API-friendly related-content shape.""" + payload = dict(getattr(match, "payload", {}) or {}) return { "content_id": payload.get("content_id"), @@ -475,10 +676,23 @@ def _heuristic_classification(content: Content) -> dict[str, Any]: keyword_sets = { "release_notes": ("release notes", "changelog", "version", "released"), "tutorial": ("tutorial", "how to", "guide", "walkthrough", "step-by-step"), - "product_announcement": ("announcing", "launch", "launched", "available now", "introducing"), + "product_announcement": ( + "announcing", + "launch", + "launched", + "available now", + "introducing", + ), "event": ("conference", "summit", "meetup", "webinar", "event"), "opinion": ("opinion", "why i", "lessons learned", "thoughts", "editorial"), - "technical_article": ("architecture", "engineering", "platform", "infrastructure", "devops", "kubernetes"), + "technical_article": ( + "architecture", + "engineering", + "platform", + "infrastructure", + "devops", + "kubernetes", + ), } best_type = "other" best_score = 0 @@ -498,7 +712,11 @@ def _heuristic_classification(content: Content) -> dict[str, Any]: def _heuristic_summary(content: Content) -> str: - sentences = [segment.strip() for segment in re.split(r"(?<=[.!?])\s+", content.content_text.strip()) if segment.strip()] + sentences = [ + segment.strip() + for segment in re.split(r"(?<=[.!?])\s+", content.content_text.strip()) + if segment.strip() + ] if not sentences: return f"{content.title}: no summary was available from the source content." 
summary = " ".join(sentences[:2]) @@ -526,8 +744,12 @@ def _get_content(state: PipelineState) -> Content: return Content.objects.select_related("project").get(pk=state["content_id"]) -def _upsert_review_queue_item(content: Content, *, reason: ReviewReason, confidence: float) -> ReviewQueue: - existing = ReviewQueue.objects.filter(content=content, reason=reason, resolved=False).first() +def _upsert_review_queue_item( + content: Content, *, reason: ReviewReason, confidence: float +) -> ReviewQueue: + existing = ReviewQueue.objects.filter( + content=content, reason=reason, resolved=False + ).first() if existing: existing.confidence = confidence existing.save(update_fields=["confidence"]) @@ -551,7 +773,9 @@ def _create_skill_result( latency_ms: int | None = None, confidence: float | None = None, ) -> SkillResult: - previous = SkillResult.objects.filter(content=content, skill_name=skill_name, superseded_by__isnull=True).first() + previous = SkillResult.objects.filter( + content=content, skill_name=skill_name, superseded_by__isnull=True + ).first() skill_result = SkillResult.objects.create( content=content, project=content.project, @@ -569,7 +793,9 @@ def _create_skill_result( return skill_result -def _create_failed_skill_result(content: Content, *, skill_name: str, error_message: str) -> SkillResult: +def _create_failed_skill_result( + content: Content, *, skill_name: str, error_message: str +) -> SkillResult: return _create_skill_result( content, skill_name=skill_name, diff --git a/core/plugins/__init__.py b/core/plugins/__init__.py index 2f0094c2..29184f7d 100644 --- a/core/plugins/__init__.py +++ b/core/plugins/__init__.py @@ -1,3 +1,5 @@ +"""Public plugin-registry helpers used by the rest of the application.""" + from core.plugins.registry import get_plugin_for_source_config, validate_plugin_config __all__ = ["get_plugin_for_source_config", "validate_plugin_config"] diff --git a/core/plugins/base.py b/core/plugins/base.py index 51fa67d2..072a0d48 100644 --- 
a/core/plugins/base.py +++ b/core/plugins/base.py @@ -1,3 +1,5 @@ +"""Base types and shared behavior for ingestion source plugins.""" + from __future__ import annotations from abc import ABC, abstractmethod @@ -8,6 +10,8 @@ @dataclass(slots=True) class ContentItem: + """Normalized content payload returned by source plugins.""" + url: str title: str author: str @@ -17,14 +21,31 @@ class ContentItem: class SourcePlugin(ABC): + """Abstract base class implemented by all ingestion source plugins.""" + required_config_fields: tuple[str, ...] = () def __init__(self, source_config): + """Bind a plugin instance to the saved source configuration and project.""" + self.source_config = source_config self.project = source_config.project @classmethod def validate_config(cls, config: object) -> dict: + """Validate and normalize raw JSON configuration for a plugin. + + Args: + config: Raw configuration object submitted through admin or API. + + Returns: + A normalized configuration dictionary. + + Raises: + ValueError: If the config is not a mapping or required fields are + missing. 
+ """ + if not isinstance(config, dict): raise ValueError("Config must be a JSON object.") normalized_config = dict(config) @@ -35,13 +56,19 @@ def validate_config(cls, config: object) -> dict: @abstractmethod def fetch_new_content(self, since: datetime | None) -> list[ContentItem]: + """Fetch content newer than the given timestamp.""" + raise NotImplementedError @abstractmethod def health_check(self) -> bool: + """Return whether the remote source is reachable and usable.""" + raise NotImplementedError def match_entity_for_url(self, url: str): + """Match a fetched URL to a tracked entity based on hostname equality.""" + target_hostname = self._normalize_hostname(url) if not target_hostname: return None @@ -52,5 +79,7 @@ def match_entity_for_url(self, url: str): @staticmethod def _normalize_hostname(url: str) -> str: + """Normalize a URL hostname for entity matching.""" + hostname = urlparse(url).hostname or "" return hostname.removeprefix("www.").lower() diff --git a/core/plugins/reddit.py b/core/plugins/reddit.py index 7529fab0..c91f5168 100644 --- a/core/plugins/reddit.py +++ b/core/plugins/reddit.py @@ -1,3 +1,5 @@ +"""Reddit source plugin used for trend and discussion ingestion.""" + from __future__ import annotations from datetime import UTC, datetime @@ -10,10 +12,14 @@ class RedditSourcePlugin(SourcePlugin): + """Fetch posts from a configured subreddit listing.""" + required_config_fields = ("subreddit",) @classmethod def validate_config(cls, config: object) -> dict: + """Validate Reddit-specific config such as listing and limit values.""" + normalized_config = super().validate_config(config) listing = normalized_config.get("listing", "both") if listing not in {"new", "hot", "both"}: @@ -25,6 +31,8 @@ def validate_config(cls, config: object) -> dict: return normalized_config def fetch_new_content(self, since: datetime | None) -> list[ContentItem]: + """Fetch subreddit submissions newer than ``since`` and normalize them.""" + subreddit = 
self._client().subreddit(self.source_config.config["subreddit"]) items: list[ContentItem] = [] seen_submission_ids: set[str] = set() @@ -37,7 +45,8 @@ def fetch_new_content(self, since: datetime | None) -> list[ContentItem]: continue items.append( ContentItem( - url=submission.url or f"https://www.reddit.com{submission.permalink}", + url=submission.url + or f"https://www.reddit.com{submission.permalink}", title=submission.title.strip(), author=str(submission.author) if submission.author else "", published_date=published_date, @@ -48,14 +57,20 @@ def fetch_new_content(self, since: datetime | None) -> list[ContentItem]: return items def health_check(self) -> bool: + """Verify that the configured subreddit can be queried successfully.""" + subreddit = self._client().subreddit(self.source_config.config["subreddit"]) next(subreddit.new(limit=1), None) return True def match_entity_for_url(self, url: str): + """Skip entity matching for Reddit because posts are not entity-driven.""" + return None def _iter_submissions(self, subreddit): + """Yield submissions from the configured listing modes without duplicates.""" + listing = self.source_config.config.get("listing", "both") limit = self.source_config.config.get("limit", 25) if listing in {"new", "both"}: @@ -65,6 +80,12 @@ def _iter_submissions(self, subreddit): @staticmethod def _client(): + """Create the authenticated PRAW client from Django settings. + + Raises: + RuntimeError: If Reddit credentials are missing. 
+ """ + if not settings.REDDIT_CLIENT_ID or not settings.REDDIT_CLIENT_SECRET: raise RuntimeError("Reddit credentials are not configured.") return praw.Reddit( diff --git a/core/plugins/registry.py b/core/plugins/registry.py index d13bb3a0..9a5db0a0 100644 --- a/core/plugins/registry.py +++ b/core/plugins/registry.py @@ -1,3 +1,5 @@ +"""Registry helpers for resolving source-plugin implementations by name.""" + from typing import Any from core.models import SourcePluginName @@ -11,14 +13,26 @@ def get_plugin_for_source_config(source_config): + """Instantiate the plugin configured for a saved source configuration.""" + return _get_plugin_class(source_config.plugin_name)(source_config) -def validate_plugin_config(plugin_name: SourcePluginName | str, config: object) -> dict[str, Any]: +def validate_plugin_config( + plugin_name: SourcePluginName | str, config: object +) -> dict[str, Any]: + """Validate plugin config using the plugin class registered for the name.""" + return _get_plugin_class(plugin_name).validate_config(config) def _get_plugin_class(plugin_name: SourcePluginName | str): + """Resolve a plugin enum value or string into its registered class. + + Raises: + ValueError: If the plugin name is not supported. 
+ """ + try: return PLUGIN_REGISTRY[SourcePluginName(plugin_name)] except KeyError as exc: diff --git a/core/plugins/rss.py b/core/plugins/rss.py index 9b4776af..75bd906b 100644 --- a/core/plugins/rss.py +++ b/core/plugins/rss.py @@ -1,3 +1,5 @@ +"""RSS source plugin used to ingest feed entries into project content.""" + from __future__ import annotations from datetime import UTC, datetime @@ -11,9 +13,13 @@ class RSSSourcePlugin(SourcePlugin): + """Fetch content from a configured RSS or Atom feed.""" + required_config_fields = ("feed_url",) def fetch_new_content(self, since: datetime | None) -> list[ContentItem]: + """Parse the feed and return entries newer than ``since``.""" + parsed_feed = feedparser.parse(self.source_config.config["feed_url"]) items: list[ContentItem] = [] for entry in parsed_feed.entries: @@ -24,7 +30,11 @@ def fetch_new_content(self, since: datetime | None) -> list[ContentItem]: title = (getattr(entry, "title", "") or "").strip() if not url or not title: continue - summary = getattr(entry, "summary", "") or getattr(entry, "description", "") or title + summary = ( + getattr(entry, "summary", "") + or getattr(entry, "description", "") + or title + ) items.append( ContentItem( url=url, @@ -38,11 +48,15 @@ def fetch_new_content(self, since: datetime | None) -> list[ContentItem]: return items def health_check(self) -> bool: + """Treat the feed as healthy when it returns at least one entry.""" + parsed_feed = feedparser.parse(self.source_config.config["feed_url"]) return bool(getattr(parsed_feed, "entries", [])) @staticmethod def _published_date_for_entry(entry) -> datetime: + """Choose the best available published timestamp for a feed entry.""" + for field_name in ("published_parsed", "updated_parsed", "created_parsed"): parsed_value = getattr(entry, field_name, None) if parsed_value: @@ -51,6 +65,8 @@ def _published_date_for_entry(entry) -> datetime: @staticmethod def _struct_time_to_datetime(parsed_value: struct_time) -> datetime: + """Convert 
``feedparser`` time tuples into timezone-aware datetimes.""" + return datetime( parsed_value.tm_year, parsed_value.tm_mon, diff --git a/core/serializers.py b/core/serializers.py index 16850b4e..57e22d54 100644 --- a/core/serializers.py +++ b/core/serializers.py @@ -1,3 +1,10 @@ +"""DRF serializers for project-scoped core models. + +These serializers enforce the project's access rules at the API boundary. They do +more than simple field translation: several serializers limit related querysets to +the active project and validate that cross-project relationships cannot be posted. +""" + from django.contrib.auth.models import Group from rest_framework import serializers @@ -18,18 +25,32 @@ class ProjectScopedSerializerMixin: + """Limit serializer relationship fields to objects the current user can access.""" + def _filter_related_queryset(self, request): + """Constrain related-field querysets using the request user and project context.""" + user = request.user project = self.context.get("project") if "group" in self.fields: self.fields["group"].queryset = Group.objects.filter(user=user) if "project" in self.fields: - self.fields["project"].queryset = Project.objects.filter(group__user=user).distinct() + self.fields["project"].queryset = Project.objects.filter( + group__user=user + ).distinct() if "entity" in self.fields: - entity_queryset = Entity.objects.filter(project=project) if project else Entity.objects.filter(project__group__user=user) + entity_queryset = ( + Entity.objects.filter(project=project) + if project + else Entity.objects.filter(project__group__user=user) + ) self.fields["entity"].queryset = entity_queryset if "content" in self.fields: - content_queryset = Content.objects.filter(project=project) if project else Content.objects.filter(project__group__user=user) + content_queryset = ( + Content.objects.filter(project=project) + if project + else Content.objects.filter(project__group__user=user) + ) self.fields["content"].queryset = content_queryset if 
"superseded_by" in self.fields: skill_result_queryset = ( @@ -40,6 +61,8 @@ def _filter_related_queryset(self, request): self.fields["superseded_by"].queryset = skill_result_queryset def __init__(self, *args, **kwargs): + """Initialize the serializer and scope relation fields when authenticated.""" + super().__init__(*args, **kwargs) request = self.context.get("request") if request and request.user.is_authenticated: @@ -47,6 +70,7 @@ def __init__(self, *args, **kwargs): class ProjectSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): + """Serialize top-level project records.""" class Meta: model = Project @@ -63,7 +87,11 @@ class Meta: read_only_fields = ["id", "created_at"] -class ProjectConfigSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): +class ProjectConfigSerializer( + ProjectScopedSerializerMixin, serializers.ModelSerializer +): + """Serialize per-project authority and scoring settings.""" + class Meta: model = ProjectConfig fields = [ @@ -77,6 +105,8 @@ class Meta: class EntitySerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): + """Serialize tracked entities for a project.""" + class Meta: model = Entity fields = [ @@ -98,6 +128,8 @@ class Meta: class ContentSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): + """Serialize ingested content items and enforce project/entity consistency.""" + class Meta: model = Content fields = [ @@ -121,14 +153,24 @@ class Meta: read_only_fields = ["id", "project", "ingested_at", "embedding_id"] def validate(self, attrs): - project = self.context.get("project") or attrs.get("project") or getattr(self.instance, "project", None) + """Reject entity assignments that point at a different project.""" + + project = ( + self.context.get("project") + or attrs.get("project") + or getattr(self.instance, "project", None) + ) entity = attrs.get("entity") or getattr(self.instance, "entity", None) if project and entity and entity.project_id != project.id: - 
raise serializers.ValidationError({"entity": "Entity must belong to the selected project."}) + raise serializers.ValidationError( + {"entity": "Entity must belong to the selected project."} + ) return attrs class SkillResultSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): + """Serialize persisted AI skill executions for content.""" + class Meta: model = SkillResult fields = [ @@ -148,14 +190,24 @@ class Meta: read_only_fields = ["id", "project", "created_at"] def validate(self, attrs): - project = self.context.get("project") or attrs.get("project") or getattr(self.instance, "project", None) + """Reject skill results whose content does not belong to the active project.""" + + project = ( + self.context.get("project") + or attrs.get("project") + or getattr(self.instance, "project", None) + ) content = attrs.get("content") or getattr(self.instance, "content", None) if project and content and content.project_id != project.id: - raise serializers.ValidationError({"content": "Content must belong to the selected project."}) + raise serializers.ValidationError( + {"content": "Content must belong to the selected project."} + ) return attrs class UserFeedbackSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): + """Serialize editor feedback attached to a content item.""" + user = serializers.PrimaryKeyRelatedField(read_only=True) class Meta: @@ -164,14 +216,24 @@ class Meta: read_only_fields = ["id", "project", "user", "created_at"] def validate(self, attrs): - project = self.context.get("project") or attrs.get("project") or getattr(self.instance, "project", None) + """Reject feedback that targets content outside the active project.""" + + project = ( + self.context.get("project") + or attrs.get("project") + or getattr(self.instance, "project", None) + ) content = attrs.get("content") or getattr(self.instance, "content", None) if project and content and content.project_id != project.id: - raise serializers.ValidationError({"content": 
"Content must belong to the selected project."}) + raise serializers.ValidationError( + {"content": "Content must belong to the selected project."} + ) return attrs class IngestionRunSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): + """Serialize ingestion-run audit records.""" + class Meta: model = IngestionRun fields = [ @@ -189,13 +251,26 @@ class Meta: class SourceConfigSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): + """Serialize source-plugin configuration and normalize provider settings.""" + class Meta: model = SourceConfig - fields = ["id", "project", "plugin_name", "config", "is_active", "last_fetched_at"] + fields = [ + "id", + "project", + "plugin_name", + "config", + "is_active", + "last_fetched_at", + ] read_only_fields = ["id", "project", "last_fetched_at"] def validate(self, attrs): - plugin_name = attrs.get("plugin_name") or getattr(self.instance, "plugin_name", None) + """Validate plugin-specific configuration with the plugin registry.""" + + plugin_name = attrs.get("plugin_name") or getattr( + self.instance, "plugin_name", None + ) config = attrs.get("config") or getattr(self.instance, "config", {}) if plugin_name: try: @@ -206,27 +281,61 @@ def validate(self, attrs): class ReviewQueueSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): + """Serialize manual-review queue items for project content.""" + class Meta: model = ReviewQueue - fields = ["id", "project", "content", "reason", "confidence", "created_at", "resolved", "resolution"] + fields = [ + "id", + "project", + "content", + "reason", + "confidence", + "created_at", + "resolved", + "resolution", + ] read_only_fields = ["id", "project", "created_at"] def validate(self, attrs): - project = self.context.get("project") or attrs.get("project") or getattr(self.instance, "project", None) + """Reject review items whose content does not belong to the active project.""" + + project = ( + self.context.get("project") + or 
attrs.get("project") + or getattr(self.instance, "project", None) + ) content = attrs.get("content") or getattr(self.instance, "content", None) if project and content and content.project_id != project.id: - raise serializers.ValidationError({"content": "Content must belong to the selected project."}) + raise serializers.ValidationError( + {"content": "Content must belong to the selected project."} + ) return attrs -class IntakeAllowlistSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): +class IntakeAllowlistSerializer( + ProjectScopedSerializerMixin, serializers.ModelSerializer +): + """Serialize confirmed and pending newsletter sender allowlist entries.""" + class Meta: model = IntakeAllowlist - fields = ["id", "project", "sender_email", "confirmed_at", "confirmation_token", "created_at"] + fields = [ + "id", + "project", + "sender_email", + "confirmed_at", + "confirmation_token", + "created_at", + ] read_only_fields = ["id", "project", "confirmation_token", "created_at"] -class NewsletterIntakeSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): +class NewsletterIntakeSerializer( + ProjectScopedSerializerMixin, serializers.ModelSerializer +): + """Serialize raw inbound newsletter messages captured for a project.""" + class Meta: model = NewsletterIntake fields = [ @@ -242,4 +351,11 @@ class Meta: "extraction_result", "error_message", ] - read_only_fields = ["id", "project", "received_at", "status", "extraction_result", "error_message"] + read_only_fields = [ + "id", + "project", + "received_at", + "status", + "extraction_result", + "error_message", + ] diff --git a/core/signals.py b/core/signals.py index 8a4327f6..c7d39eca 100644 --- a/core/signals.py +++ b/core/signals.py @@ -1,3 +1,5 @@ +"""Signal handlers that adapt Anymail inbound events to project intake logic.""" + from __future__ import annotations from anymail.signals import inbound @@ -7,6 +9,8 @@ def _address_to_string(address) -> str: + """Normalize an Anymail 
address object or string into plain text.""" + if address is None: return "" addr_spec = getattr(address, "addr_spec", None) @@ -17,6 +21,15 @@ def _address_to_string(address) -> str: @receiver(inbound) def handle_anymail_inbound(sender, event, esp_name, **kwargs): + """Translate an inbound Anymail event into the internal intake payload. + + Args: + sender: Signal sender supplied by Anymail. + event: Normalized inbound event object. + esp_name: Name of the email service provider that generated the event. + **kwargs: Additional Anymail signal metadata. + """ + message = event.message recipients: list[str] = [] @@ -30,7 +43,8 @@ def handle_anymail_inbound(sender, event, esp_name, **kwargs): process_inbound_newsletter( recipients=recipients, - sender_email=message.envelope_sender or _address_to_string(getattr(message, "from_email", None)), + sender_email=message.envelope_sender + or _address_to_string(getattr(message, "from_email", None)), subject=message.subject or "", raw_html=message.html or "", raw_text=message.text or "", diff --git a/core/tasks.py b/core/tasks.py index bed6c143..997958d7 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -1,3 +1,5 @@ +"""Celery tasks that drive ingestion, AI processing, and newsletter extraction.""" + import logging from celery import shared_task @@ -29,7 +31,18 @@ @shared_task(name="core.tasks.run_ingestion") def run_ingestion(source_config_id: int): - source_config = SourceConfig.objects.select_related("project").get(pk=source_config_id) + """Fetch new content for one source config and record an ingestion run. + + Args: + source_config_id: Primary key of the source configuration to ingest. + + Returns: + A summary containing fetched and ingested item counts. 
+ """ + + source_config = SourceConfig.objects.select_related("project").get( + pk=source_config_id + ) ingestion_run = IngestionRun.objects.create( project=source_config.project, plugin_name=source_config.plugin_name, @@ -42,20 +55,32 @@ def run_ingestion(source_config_id: int): ingestion_run.completed_at = timezone.now() ingestion_run.error_message = str(exc) ingestion_run.save(update_fields=["status", "completed_at", "error_message"]) - logger.exception("Source ingestion failed", extra={"source_config_id": source_config_id}) + logger.exception( + "Source ingestion failed", extra={"source_config_id": source_config_id} + ) raise ingestion_run.status = RunStatus.SUCCESS ingestion_run.completed_at = timezone.now() ingestion_run.items_fetched = items_fetched ingestion_run.items_ingested = items_ingested - ingestion_run.save(update_fields=["status", "completed_at", "items_fetched", "items_ingested"]) + ingestion_run.save( + update_fields=["status", "completed_at", "items_fetched", "items_ingested"] + ) return {"items_fetched": items_fetched, "items_ingested": items_ingested} @shared_task(name="core.tasks.run_all_ingestions") def run_all_ingestions(): - source_config_ids = list(SourceConfig.objects.filter(is_active=True).values_list("id", flat=True)) + """Queue ingestion for every active source configuration. + + Returns: + The number of source configurations scheduled. 
+ """ + + source_config_ids = list( + SourceConfig.objects.filter(is_active=True).values_list("id", flat=True) + ) for source_config_id in source_config_ids: if settings.CELERY_TASK_ALWAYS_EAGER: run_ingestion(source_config_id) @@ -66,20 +91,37 @@ def run_all_ingestions(): @shared_task(name="core.tasks.process_content") def process_content(content_id: int): + """Run the main AI pipeline for a stored content item.""" + return process_content_pipeline(content_id) @shared_task(name="core.tasks.run_relevance_scoring_skill", ignore_result=True) def run_relevance_scoring_skill(skill_result_id: int): + """Execute a pending ad hoc relevance skill result in the background.""" + return execute_background_skill_result(skill_result_id, RELEVANCE_SKILL_NAME) @shared_task(name="core.tasks.run_summarization_skill", ignore_result=True) def run_summarization_skill(skill_result_id: int): + """Execute a pending ad hoc summarization skill result in the background.""" + return execute_background_skill_result(skill_result_id, SUMMARIZATION_SKILL_NAME) def queue_content_skill(content: Content, skill_name: str): + """Create and dispatch an asynchronous ad hoc skill for a content row. + + Args: + content: The content row to process. + skill_name: Supported async skill name. + + Returns: + The refreshed ``SkillResult`` row after the task has been queued or eagerly + executed. 
+ """ + skill_result = create_pending_skill_result(content, skill_name) if skill_name == RELEVANCE_SKILL_NAME: @@ -100,6 +142,8 @@ def queue_content_skill(content: Content, skill_name: str): def _ingest_source_config(source_config: SourceConfig) -> tuple[int, int]: + """Fetch items from a configured source and create new content rows.""" + plugin = get_plugin_for_source_config(source_config) fetched_items = plugin.fetch_new_content(source_config.last_fetched_at) ingested_count = 0 @@ -125,6 +169,15 @@ def _ingest_source_config(source_config: SourceConfig) -> tuple[int, int]: @shared_task(name="core.tasks.process_newsletter_intake") def process_newsletter_intake(intake_id: int): + """Convert a stored newsletter email into content rows. + + Args: + intake_id: Primary key of the ``NewsletterIntake`` row to process. + + Returns: + A summary containing the final intake status and ingested item count. + """ + intake = NewsletterIntake.objects.select_related("project").get(pk=intake_id) allowlist = IntakeAllowlist.objects.filter( @@ -183,6 +236,8 @@ def process_newsletter_intake(intake_id: int): def _schedule_content_processing(content: Content) -> None: + """Ensure a content row is embedded before it enters the AI pipeline.""" + upsert_content_embedding(content) if settings.CELERY_TASK_ALWAYS_EAGER: process_content(content.id) diff --git a/core/tests/test_admin.py b/core/tests/test_admin.py index a819fdd4..a67b853e 100644 --- a/core/tests/test_admin.py +++ b/core/tests/test_admin.py @@ -36,10 +36,14 @@ @pytest.fixture def source_admin_context(django_user_model): - user = django_user_model.objects.create_user(username="admin-owner", password="testpass123") + user = django_user_model.objects.create_user( + username="admin-owner", password="testpass123" + ) group = Group.objects.create(name="admin-team") user.groups.add(group) - project = Project.objects.create(name="Admin Project", group=group, topic_description="Infra") + project = Project.objects.create( + name="Admin 
Project", group=group, topic_description="Infra" + ) return SimpleNamespace(user=user, group=group, project=project) @@ -55,7 +59,9 @@ def test_test_source_connection_reports_success(source_admin_context, mocker): "core.admin.validate_plugin_config", return_value={"feed_url": "https://example.com/feed.xml"}, ) - get_plugin_mock = mocker.patch("core.admin.get_plugin_for_source_config", return_value=plugin) + get_plugin_mock = mocker.patch( + "core.admin.get_plugin_for_source_config", return_value=plugin + ) admin_instance = SourceConfigAdmin(SourceConfig, AdminSite()) admin_instance.message_user = mocker.Mock() @@ -64,7 +70,9 @@ def test_test_source_connection_reports_success(source_admin_context, mocker): queryset=SourceConfig.objects.filter(pk=source_config.pk), ) - validate_mock.assert_called_once_with(SourcePluginName.RSS, {"feed_url": "https://example.com/feed.xml"}) + validate_mock.assert_called_once_with( + SourcePluginName.RSS, {"feed_url": "https://example.com/feed.xml"} + ) get_plugin_mock.assert_called_once() plugin.health_check.assert_called_once_with() admin_instance.message_user.assert_called_once_with( @@ -99,7 +107,9 @@ def test_test_source_connection_reports_failures(source_admin_context, mocker): ) -def test_source_config_display_health_renders_without_django6_format_html_error(source_admin_context): +def test_source_config_display_health_renders_without_django6_format_html_error( + source_admin_context, +): source_config = SourceConfig.objects.create( project=source_admin_context.project, plugin_name=SourcePluginName.RSS, @@ -114,7 +124,9 @@ def test_source_config_display_health_renders_without_django6_format_html_error( assert "Healthy" in rendered -def test_review_queue_changelist_view_builds_dashboard_stats(source_admin_context, mocker): +def test_review_queue_changelist_view_builds_dashboard_stats( + source_admin_context, mocker +): content = Content.objects.create( project=source_admin_context.project, url="https://example.com/review-item", 
@@ -132,7 +144,9 @@ def test_review_queue_changelist_view_builds_dashboard_stats(source_admin_contex resolved=False, ) admin_instance = ReviewQueueAdmin(ReviewQueue, AdminSite()) - mocker.patch.object(admin_instance, "get_queryset", return_value=ReviewQueue.objects.all()) + mocker.patch.object( + admin_instance, "get_queryset", return_value=ReviewQueue.objects.all() + ) super_changelist_view = mocker.patch( "core.admin.ModelAdmin.changelist_view", side_effect=lambda request, extra_context=None: extra_context, @@ -145,7 +159,9 @@ def test_review_queue_changelist_view_builds_dashboard_stats(source_admin_contex assert response["dashboard_stats"][1]["value"] == "42%" -def test_review_queue_display_confidence_renders_without_django6_format_error(source_admin_context): +def test_review_queue_display_confidence_renders_without_django6_format_error( + source_admin_context, +): content = Content.objects.create( project=source_admin_context.project, url="https://example.com/review-confidence", @@ -169,7 +185,9 @@ def test_review_queue_display_confidence_renders_without_django6_format_error(so assert "42%" in rendered -def test_ingestion_run_display_efficiency_renders_without_django6_format_error(source_admin_context): +def test_ingestion_run_display_efficiency_renders_without_django6_format_error( + source_admin_context, +): run = IngestionRun.objects.create( project=source_admin_context.project, plugin_name=SourcePluginName.RSS, @@ -288,7 +306,9 @@ def test_content_changelist_view_builds_dashboard_stats(source_admin_context, mo relevance_score=40, ) admin_instance = ContentAdmin(Content, AdminSite()) - mocker.patch.object(admin_instance, "get_queryset", return_value=Content.objects.all()) + mocker.patch.object( + admin_instance, "get_queryset", return_value=Content.objects.all() + ) super_changelist_view = mocker.patch( "django.contrib.admin.options.ModelAdmin.changelist_view", side_effect=lambda request, extra_context=None: extra_context, @@ -301,7 +321,9 @@ def 
test_content_changelist_view_builds_dashboard_stats(source_admin_context, mo assert response["dashboard_stats"][1]["value"] == 2 -def test_generate_newsletter_ideas_queues_selected_content(source_admin_context, mocker): +def test_generate_newsletter_ideas_queues_selected_content( + source_admin_context, mocker +): first_content = Content.objects.create( project=source_admin_context.project, url="https://example.com/admin-queue-1", @@ -326,7 +348,9 @@ def test_generate_newsletter_ideas_queues_selected_content(source_admin_context, admin_instance.generate_newsletter_ideas( request=SimpleNamespace(), - queryset=Content.objects.filter(id__in=[first_content.id, second_content.id]).order_by("id"), + queryset=Content.objects.filter( + id__in=[first_content.id, second_content.id] + ).order_by("id"), ) delay_mock.assert_any_call(first_content.id) @@ -347,7 +371,9 @@ def test_generate_newsletter_ideas_queues_selected_content(source_admin_context, (20, "red"), ], ) -def test_entity_colored_score_uses_expected_color(source_admin_context, authority_score, expected_color): +def test_entity_colored_score_uses_expected_color( + source_admin_context, authority_score, expected_color +): entity = Entity.objects.create( project=source_admin_context.project, name=f"Entity {authority_score}", @@ -363,7 +389,9 @@ def test_entity_colored_score_uses_expected_color(source_admin_context, authorit assert str(authority_score) in rendered -def test_high_value_filter_only_returns_high_value_reference_content(source_admin_context): +def test_high_value_filter_only_returns_high_value_reference_content( + source_admin_context, +): high_value = Content.objects.create( project=source_admin_context.project, url="https://example.com/high-value", @@ -400,9 +428,7 @@ def test_high_value_filter_only_returns_high_value_reference_content(source_admi def test_content_view_trace_builds_template_trace_url(source_admin_context, settings): - settings.AI_TRACE_URL_TEMPLATE = ( - 
"https://trace.example/{project_id}/{skill_name}/{skill_result_id}/{trace_id}/{content_id}/{run_id}" - ) + settings.AI_TRACE_URL_TEMPLATE = "https://trace.example/{project_id}/{skill_name}/{skill_result_id}/{trace_id}/{content_id}/{run_id}" content = Content.objects.create( project=source_admin_context.project, url="https://example.com/admin-template-trace", @@ -423,7 +449,10 @@ def test_content_view_trace_builds_template_trace_url(source_admin_context, sett rendered = admin_instance.view_trace(content) - assert f"https://trace.example/{content.project_id}/summarization/{skill_result.id}/trace-123/{content.id}/trace-123" in rendered + assert ( + f"https://trace.example/{content.project_id}/summarization/{skill_result.id}/trace-123/{content.id}/trace-123" + in rendered + ) @pytest.mark.parametrize( @@ -435,7 +464,9 @@ def test_content_view_trace_builds_template_trace_url(source_admin_context, sett (10, "red"), ], ) -def test_content_display_relevance_uses_expected_output(source_admin_context, score, expected_color): +def test_content_display_relevance_uses_expected_output( + source_admin_context, score, expected_color +): content = Content.objects.create( project=source_admin_context.project, url=f"https://example.com/relevance-{score}", @@ -494,7 +525,9 @@ def test_skill_result_admin_helpers_and_dashboard_stats(source_admin_context, mo side_effect=lambda request, extra_context=None: extra_context, ) - admin_instance.retry_selected_skills(SimpleNamespace(), SkillResult.objects.filter(pk=current_result.pk)) + admin_instance.retry_selected_skills( + SimpleNamespace(), SkillResult.objects.filter(pk=current_result.pk) + ) current_result.refresh_from_db() response = admin_instance.changelist_view(SimpleNamespace()) @@ -505,7 +538,10 @@ def test_skill_result_admin_helpers_and_dashboard_stats(source_admin_context, mo "Successfully reset 1 skills to PENDING for retry.", messages.SUCCESS, ) - assert admin_instance.preview_json(current_result) == f'🔍 Preview' + assert ( + 
admin_instance.preview_json(current_result) + == f'🔍 Preview' + ) assert admin_instance.preview_json(superseded_result) == "-" assert admin_instance.get_content_link(current_result).endswith("...") assert "● PENDING" in admin_instance.display_status(current_result) @@ -519,8 +555,12 @@ def test_skill_result_admin_helpers_and_dashboard_stats(source_admin_context, mo assert response["dashboard_stats"][1]["value"] == "0.0%" -def test_user_feedback_admin_helpers_and_dashboard_stats(source_admin_context, django_user_model, mocker): - user = django_user_model.objects.create_user(username="feedback-user", password="testpass123") +def test_user_feedback_admin_helpers_and_dashboard_stats( + source_admin_context, django_user_model, mocker +): + user = django_user_model.objects.create_user( + username="feedback-user", password="testpass123" + ) content = Content.objects.create( project=source_admin_context.project, url="https://example.com/feedback", @@ -550,7 +590,9 @@ def test_user_feedback_admin_helpers_and_dashboard_stats(source_admin_context, d UserFeedback.objects.create( content=other_content, project=source_admin_context.project, - user=django_user_model.objects.create_user(username="feedback-user-2", password="testpass123"), + user=django_user_model.objects.create_user( + username="feedback-user-2", password="testpass123" + ), feedback_type="downvote", ) admin_instance = UserFeedbackAdmin(UserFeedback, AdminSite()) @@ -573,7 +615,9 @@ def test_user_feedback_admin_helpers_and_dashboard_stats(source_admin_context, d assert response["dashboard_stats"][1]["value"] == 2 -def test_ingestion_run_display_duration_handles_running_and_completed(source_admin_context): +def test_ingestion_run_display_duration_handles_running_and_completed( + source_admin_context, +): running_run = IngestionRun.objects.create( project=source_admin_context.project, plugin_name=SourcePluginName.RSS, @@ -589,7 +633,9 @@ def test_ingestion_run_display_duration_handles_running_and_completed(source_adm 
items_ingested=10, ) completed_run.started_at = timezone.now() - timezone.timedelta(minutes=3, seconds=5) - completed_run.completed_at = completed_run.started_at + timezone.timedelta(minutes=3, seconds=5) + completed_run.completed_at = completed_run.started_at + timezone.timedelta( + minutes=3, seconds=5 + ) completed_run.save(update_fields=["started_at", "completed_at"]) admin_instance = IngestionRunAdmin(IngestionRun, AdminSite()) @@ -597,7 +643,9 @@ def test_ingestion_run_display_duration_handles_running_and_completed(source_adm assert admin_instance.display_duration(completed_run) == "3m 5s" -def test_review_queue_actions_update_resolution_and_emit_message(source_admin_context, mocker): +def test_review_queue_actions_update_resolution_and_emit_message( + source_admin_context, mocker +): content = Content.objects.create( project=source_admin_context.project, url="https://example.com/review-action", @@ -624,8 +672,12 @@ def test_review_queue_actions_update_resolution_and_emit_message(source_admin_co admin_instance = ReviewQueueAdmin(ReviewQueue, AdminSite()) admin_instance.message_user = mocker.Mock() - admin_instance.mark_as_approved(SimpleNamespace(), ReviewQueue.objects.filter(pk=approve_item.pk)) - admin_instance.mark_as_rejected(SimpleNamespace(), ReviewQueue.objects.filter(pk=reject_item.pk)) + admin_instance.mark_as_approved( + SimpleNamespace(), ReviewQueue.objects.filter(pk=approve_item.pk) + ) + admin_instance.mark_as_rejected( + SimpleNamespace(), ReviewQueue.objects.filter(pk=reject_item.pk) + ) approve_item.refresh_from_db() reject_item.refresh_from_db() @@ -654,8 +706,12 @@ def test_high_value_filter_lookups_and_noop_queryset(source_admin_context): content_text="noop", ) - assert filter_instance.lookups(None, None) == (("high_value", "🔥 High Value (Score > 80 & Reference)"),) - assert list(filter_instance.queryset(SimpleNamespace(), Content.objects.all())) == [content] + assert filter_instance.lookups(None, None) == ( + ("high_value", "🔥 High Value 
(Score > 80 & Reference)"), + ) + assert list(filter_instance.queryset(SimpleNamespace(), Content.objects.all())) == [ + content + ] def test_content_view_trace_returns_dash_when_no_skill_results(source_admin_context): @@ -673,7 +729,9 @@ def test_content_view_trace_returns_dash_when_no_skill_results(source_admin_cont assert admin_instance.view_trace(content) == "-" -def test_skill_result_admin_handles_unknown_status_and_empty_performance(source_admin_context): +def test_skill_result_admin_handles_unknown_status_and_empty_performance( + source_admin_context, +): content = Content.objects.create( project=source_admin_context.project, url="https://example.com/skill-result-empty", @@ -699,7 +757,9 @@ def test_skill_result_admin_handles_unknown_status_and_empty_performance(source_ assert admin_instance.display_performance(skill_result) == "- / -" -def test_skill_result_changelist_view_uses_warning_and_danger_colors(source_admin_context, mocker): +def test_skill_result_changelist_view_uses_warning_and_danger_colors( + source_admin_context, mocker +): content = Content.objects.create( project=source_admin_context.project, url="https://example.com/skill-result-slow", @@ -752,7 +812,9 @@ def test_user_feedback_admin_upvote_and_orange_score_branches(source_admin_conte assert "orange" in admin_instance.get_ai_score(feedback) -def test_user_feedback_changelist_view_uses_success_color_for_high_approval(source_admin_context, django_user_model, mocker): +def test_user_feedback_changelist_view_uses_success_color_for_high_approval( + source_admin_context, django_user_model, mocker +): first_content = Content.objects.create( project=source_admin_context.project, url="https://example.com/feedback-success-1", @@ -782,7 +844,9 @@ def test_user_feedback_changelist_view_uses_success_color_for_high_approval(sour UserFeedback.objects.create( content=second_content, project=source_admin_context.project, - user=django_user_model.objects.create_user(username="feedback-success-2", 
password="testpass123"), + user=django_user_model.objects.create_user( + username="feedback-success-2", password="testpass123" + ), feedback_type="upvote", ) admin_instance = UserFeedbackAdmin(UserFeedback, AdminSite()) @@ -798,7 +862,9 @@ def test_user_feedback_changelist_view_uses_success_color_for_high_approval(sour assert response["dashboard_stats"][0]["value"] == "100.0%" -def test_ingestion_run_admin_status_efficiency_and_dashboard_branches(source_admin_context, mocker): +def test_ingestion_run_admin_status_efficiency_and_dashboard_branches( + source_admin_context, mocker +): IngestionRun.objects.create( project=source_admin_context.project, plugin_name=SourcePluginName.RSS, @@ -821,15 +887,24 @@ def test_ingestion_run_admin_status_efficiency_and_dashboard_branches(source_adm response = admin_instance.changelist_view(SimpleNamespace()) - assert "danger" in admin_instance.display_status(IngestionRun.objects.filter(status="failed").first()) - assert admin_instance.display_efficiency(IngestionRun.objects.filter(status="failed").first()) == "0/0" + assert "danger" in admin_instance.display_status( + IngestionRun.objects.filter(status="failed").first() + ) + assert ( + admin_instance.display_efficiency( + IngestionRun.objects.filter(status="failed").first() + ) + == "0/0" + ) assert "info" in admin_instance.display_status(running_run) super_changelist_view.assert_called_once() assert response["dashboard_stats"][0]["value"] == "5" assert response["dashboard_stats"][1]["color"] == "warning" -def test_source_config_admin_health_pretty_config_and_dashboard_branches(source_admin_context, mocker): +def test_source_config_admin_health_pretty_config_and_dashboard_branches( + source_admin_context, mocker +): stale_config = SourceConfig.objects.create( project=source_admin_context.project, plugin_name=SourcePluginName.RSS, diff --git a/core/tests/test_api.py b/core/tests/test_api.py index d9063874..d63f8b1a 100644 --- a/core/tests/test_api.py +++ b/core/tests/test_api.py @@ 
-28,8 +28,12 @@ class ProjectScopedApiTests(APITestCase): def setUp(self): user_model = get_user_model() - self.owner = user_model.objects.create_user(username="owner", password="testpass123") - self.other_user = user_model.objects.create_user(username="other", password="testpass123") + self.owner = user_model.objects.create_user( + username="owner", password="testpass123" + ) + self.other_user = user_model.objects.create_user( + username="other", password="testpass123" + ) self.owner_group = Group.objects.create(name="owner-team") self.owner.groups.add(self.owner_group) self.other_group = Group.objects.create(name="other-team") @@ -134,20 +138,30 @@ def test_project_list_is_scoped_to_request_user_groups(self): self.assertEqual(response.json()[0]["id"], self.owner_project.id) def test_entity_list_is_scoped_to_request_user_project(self): - response = self.client.get(reverse("v1:project-entity-list", kwargs={"project_id": self.owner_project.id})) + response = self.client.get( + reverse( + "v1:project-entity-list", kwargs={"project_id": self.owner_project.id} + ) + ) self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(len(response.json()), 1) self.assertEqual(response.json()[0]["id"], self.owner_entity.id) def test_nested_entity_list_rejects_other_users_project(self): - response = self.client.get(reverse("v1:project-entity-list", kwargs={"project_id": self.other_project.id})) + response = self.client.get( + reverse( + "v1:project-entity-list", kwargs={"project_id": self.other_project.id} + ) + ) self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) def test_feedback_create_assigns_current_user(self): response = self.client.post( - reverse("v1:project-feedback-list", kwargs={"project_id": self.owner_project.id}), + reverse( + "v1:project-feedback-list", kwargs={"project_id": self.owner_project.id} + ), { "content": self.owner_content.id, "feedback_type": FeedbackType.UPVOTE, @@ -162,7 +176,9 @@ def 
test_feedback_create_assigns_current_user(self): def test_feedback_rejects_cross_project_content(self): response = self.client.post( - reverse("v1:project-feedback-list", kwargs={"project_id": self.owner_project.id}), + reverse( + "v1:project-feedback-list", kwargs={"project_id": self.owner_project.id} + ), { "content": self.other_content.id, "feedback_type": FeedbackType.DOWNVOTE, @@ -175,7 +191,9 @@ def test_feedback_rejects_cross_project_content(self): def test_content_create_uses_project_from_url(self): response = self.client.post( - reverse("v1:project-content-list", kwargs={"project_id": self.owner_project.id}), + reverse( + "v1:project-content-list", kwargs={"project_id": self.owner_project.id} + ), { "url": "https://example.com/new", "title": "New Content", @@ -194,7 +212,9 @@ def test_content_create_uses_project_from_url(self): self.assertEqual(created_content.project, self.owner_project) @patch("core.tasks.run_relevance_scoring_skill.delay") - def test_content_skill_action_queues_relevance_scoring(self, run_relevance_scoring_delay_mock): + def test_content_skill_action_queues_relevance_scoring( + self, run_relevance_scoring_delay_mock + ): response = self.client.post( f"/api/v1/projects/{self.owner_project.id}/contents/{self.owner_content.id}/skills/relevance_scoring/", @@ -214,7 +234,9 @@ def test_content_skill_action_queues_relevance_scoring(self, run_relevance_scori self.assertEqual(response.json()["status"], SkillStatus.PENDING) @patch("core.tasks.run_summarization_skill.delay") - def test_content_skill_action_queues_summarization(self, run_summarization_delay_mock): + def test_content_skill_action_queues_summarization( + self, run_summarization_delay_mock + ): self.owner_content.relevance_score = 0.25 self.owner_content.save(update_fields=["relevance_score"]) @@ -256,18 +278,41 @@ def test_content_skill_action_runs_find_related(self, search_similar_content_moc self.assertEqual(response.status_code, status.HTTP_201_CREATED) 
self.assertEqual(response.json()["skill_name"], "find_related") self.assertEqual(response.json()["status"], SkillStatus.COMPLETED) - self.assertEqual(response.json()["result_data"]["related_items"][0]["content_id"], self.other_content.id) + self.assertEqual( + response.json()["result_data"]["related_items"][0]["content_id"], + self.other_content.id, + ) def test_authenticated_nested_list_endpoints_smoke(self): list_endpoints = [ - reverse("v1:project-config-list", kwargs={"project_id": self.owner_project.id}), - reverse("v1:project-entity-list", kwargs={"project_id": self.owner_project.id}), - reverse("v1:project-content-list", kwargs={"project_id": self.owner_project.id}), - reverse("v1:project-skill-result-list", kwargs={"project_id": self.owner_project.id}), - reverse("v1:project-feedback-list", kwargs={"project_id": self.owner_project.id}), - reverse("v1:project-ingestion-run-list", kwargs={"project_id": self.owner_project.id}), - reverse("v1:project-source-config-list", kwargs={"project_id": self.owner_project.id}), - reverse("v1:project-review-queue-list", kwargs={"project_id": self.owner_project.id}), + reverse( + "v1:project-config-list", kwargs={"project_id": self.owner_project.id} + ), + reverse( + "v1:project-entity-list", kwargs={"project_id": self.owner_project.id} + ), + reverse( + "v1:project-content-list", kwargs={"project_id": self.owner_project.id} + ), + reverse( + "v1:project-skill-result-list", + kwargs={"project_id": self.owner_project.id}, + ), + reverse( + "v1:project-feedback-list", kwargs={"project_id": self.owner_project.id} + ), + reverse( + "v1:project-ingestion-run-list", + kwargs={"project_id": self.owner_project.id}, + ), + reverse( + "v1:project-source-config-list", + kwargs={"project_id": self.owner_project.id}, + ), + reverse( + "v1:project-review-queue-list", + kwargs={"project_id": self.owner_project.id}, + ), ] for endpoint in list_endpoints: @@ -279,31 +324,52 @@ def test_authenticated_nested_detail_endpoints_smoke(self): 
detail_endpoints = [ reverse( "v1:project-config-detail", - kwargs={"project_id": self.owner_project.id, "pk": self.owner_config.id}, + kwargs={ + "project_id": self.owner_project.id, + "pk": self.owner_config.id, + }, ), reverse( "v1:project-entity-detail", - kwargs={"project_id": self.owner_project.id, "pk": self.owner_entity.id}, + kwargs={ + "project_id": self.owner_project.id, + "pk": self.owner_entity.id, + }, ), reverse( "v1:project-content-detail", - kwargs={"project_id": self.owner_project.id, "pk": self.owner_content.id}, + kwargs={ + "project_id": self.owner_project.id, + "pk": self.owner_content.id, + }, ), reverse( "v1:project-skill-result-detail", - kwargs={"project_id": self.owner_project.id, "pk": self.owner_skill_result.id}, + kwargs={ + "project_id": self.owner_project.id, + "pk": self.owner_skill_result.id, + }, ), reverse( "v1:project-ingestion-run-detail", - kwargs={"project_id": self.owner_project.id, "pk": self.owner_ingestion_run.id}, + kwargs={ + "project_id": self.owner_project.id, + "pk": self.owner_ingestion_run.id, + }, ), reverse( "v1:project-source-config-detail", - kwargs={"project_id": self.owner_project.id, "pk": self.owner_source_config.id}, + kwargs={ + "project_id": self.owner_project.id, + "pk": self.owner_source_config.id, + }, ), reverse( "v1:project-review-queue-detail", - kwargs={"project_id": self.owner_project.id, "pk": self.owner_review_queue.id}, + kwargs={ + "project_id": self.owner_project.id, + "pk": self.owner_review_queue.id, + }, ), ] @@ -327,7 +393,10 @@ def test_authenticated_nested_detail_endpoints_smoke(self): def test_source_config_create_validates_plugin_config(self): response = self.client.post( - reverse("v1:project-source-config-list", kwargs={"project_id": self.owner_project.id}), + reverse( + "v1:project-source-config-list", + kwargs={"project_id": self.owner_project.id}, + ), {"plugin_name": SourcePluginName.RSS, "config": {}}, format="json", ) diff --git a/core/tests/test_embeddings.py 
b/core/tests/test_embeddings.py index af13a765..b9755647 100644 --- a/core/tests/test_embeddings.py +++ b/core/tests/test_embeddings.py @@ -49,10 +49,14 @@ def clear_embedding_provider_cache(): @pytest.fixture def embedding_context(django_user_model): - user = django_user_model.objects.create_user(username="embed-owner", password="testpass123") + user = django_user_model.objects.create_user( + username="embed-owner", password="testpass123" + ) group = Group.objects.create(name="embedding-team") user.groups.add(group) - project = Project.objects.create(name="Embedding Project", group=group, topic_description="Infra") + project = Project.objects.create( + name="Embedding Project", group=group, topic_description="Infra" + ) content = Content.objects.create( project=project, url="https://example.com/embed", @@ -65,7 +69,9 @@ def embedding_context(django_user_model): return SimpleNamespace(user=user, group=group, project=project, content=content) -def test_upsert_content_embedding_persists_embedding_id_and_payload(embedding_context, mocker): +def test_upsert_content_embedding_persists_embedding_id_and_payload( + embedding_context, mocker +): provider_mock = mocker.patch("core.embeddings.get_embedding_provider") client_mock = mocker.patch("core.embeddings.get_qdrant_client") provider_mock.return_value.embed_text.return_value = [0.1, 0.2, 0.3] @@ -82,28 +88,47 @@ def test_upsert_content_embedding_persists_embedding_id_and_payload(embedding_co assert upsert_points[0].payload["content_id"] == embedding_context.content.id assert upsert_points[0].payload["is_reference"] is False -def test_search_similar_returns_qdrant_results_for_project_collection(embedding_context, mocker): + +def test_search_similar_returns_qdrant_results_for_project_collection( + embedding_context, mocker +): client_mock = mocker.patch("core.embeddings.get_qdrant_client") - scored_point = SimpleNamespace(score=0.91, payload={"content_id": embedding_context.content.id}) + scored_point = SimpleNamespace( + 
score=0.91, payload={"content_id": embedding_context.content.id} + ) client_mock.return_value.get_collection.return_value = SimpleNamespace() client_mock.return_value.search.return_value = [scored_point] - results = search_similar(embedding_context.project.id, [0.1, 0.2, 0.3], limit=3, exclude_content_id=embedding_context.content.id) + results = search_similar( + embedding_context.project.id, + [0.1, 0.2, 0.3], + limit=3, + exclude_content_id=embedding_context.content.id, + ) assert results == [scored_point] client_mock.return_value.search.assert_called_once() -def test_search_similar_content_embeds_current_content_and_excludes_self(embedding_context, mocker): - embed_text_mock = mocker.patch("core.embeddings.embed_text", return_value=[0.3, 0.2, 0.1]) + +def test_search_similar_content_embeds_current_content_and_excludes_self( + embedding_context, mocker +): + embed_text_mock = mocker.patch( + "core.embeddings.embed_text", return_value=[0.3, 0.2, 0.1] + ) search_similar_mock = mocker.patch( "core.embeddings.search_similar", return_value=[SimpleNamespace(score=0.88, payload={"content_id": 999})], ) - results = search_similar_content(embedding_context.content, limit=4, is_reference=False) + results = search_similar_content( + embedding_context.content, limit=4, is_reference=False + ) assert len(results) == 1 - embed_text_mock.assert_called_once_with("Embedding Content\n\nThis article covers platform engineering practices.") + embed_text_mock.assert_called_once_with( + "Embedding Content\n\nThis article covers platform engineering practices." 
+ ) search_similar_mock.assert_called_once_with( embedding_context.project.id, [0.3, 0.2, 0.1], @@ -112,6 +137,7 @@ def test_search_similar_content_embeds_current_content_and_excludes_self(embeddi exclude_content_id=embedding_context.content.id, ) + def test_get_reference_similarity_averages_reference_scores(embedding_context, mocker): search_mock = mocker.patch("core.embeddings.search_similar") search_mock.return_value = [SimpleNamespace(score=0.8), SimpleNamespace(score=0.6)] @@ -120,11 +146,15 @@ def test_get_reference_similarity_averages_reference_scores(embedding_context, m assert similarity == 0.7 -def test_get_reference_similarity_returns_zero_when_no_reference_matches(embedding_context): + +def test_get_reference_similarity_returns_zero_when_no_reference_matches( + embedding_context, +): similarity = get_reference_similarity(embedding_context.project.id, [0.1, 0.2, 0.3]) assert similarity == 0.0 + def test_get_embedding_provider_uses_sentence_transformer_backend(settings, mocker): settings.EMBEDDING_PROVIDER = "sentence-transformers" sentence_transformer_mock = mocker.patch("core.embeddings.SentenceTransformer") @@ -134,6 +164,7 @@ def test_get_embedding_provider_uses_sentence_transformer_backend(settings, mock assert provider.__class__.__name__ == "SentenceTransformerEmbeddingProvider" sentence_transformer_mock.assert_called_once() + def test_get_embedding_provider_uses_ollama_backend(settings): settings.EMBEDDING_PROVIDER = "ollama" @@ -141,6 +172,7 @@ def test_get_embedding_provider_uses_ollama_backend(settings): assert provider.__class__.__name__ == "OllamaEmbeddingProvider" + def test_get_embedding_provider_uses_openrouter_backend(settings): settings.EMBEDDING_PROVIDER = "openrouter" @@ -148,17 +180,23 @@ def test_get_embedding_provider_uses_openrouter_backend(settings): assert provider.__class__.__name__ == "OpenRouterEmbeddingProvider" + def test_ollama_embedding_provider_calls_embed_endpoint(settings, mocker): settings.EMBEDDING_PROVIDER = "ollama" 
settings.EMBEDDING_MODEL = "nomic-embed-text" post_mock = mocker.patch("core.embeddings.httpx.post") - post_mock.return_value = SimpleNamespace(status_code=200, json=lambda: {"embeddings": [[0.3, 0.4]]}, raise_for_status=lambda: None) + post_mock.return_value = SimpleNamespace( + status_code=200, + json=lambda: {"embeddings": [[0.3, 0.4]]}, + raise_for_status=lambda: None, + ) vector = embeddings.embed_text("ollama text") assert vector == [0.3, 0.4] assert "/api/embed" in post_mock.call_args.args[0] + def test_openrouter_embedding_provider_calls_embeddings_endpoint(settings, mocker): settings.EMBEDDING_PROVIDER = "openrouter" settings.EMBEDDING_MODEL = "openai/text-embedding-3-small" @@ -183,7 +221,9 @@ def test_openrouter_embedding_provider_requires_api_key(settings): embeddings.embed_text("api text") -def test_ollama_embedding_provider_falls_back_to_legacy_endpoint_on_404(settings, mocker): +def test_ollama_embedding_provider_falls_back_to_legacy_endpoint_on_404( + settings, mocker +): settings.EMBEDDING_PROVIDER = "ollama" settings.EMBEDDING_MODEL = "nomic-embed-text" embed_response = SimpleNamespace(status_code=404) @@ -191,7 +231,9 @@ def test_ollama_embedding_provider_falls_back_to_legacy_endpoint_on_404(settings json=lambda: {"embedding": [0.9, 0.8]}, raise_for_status=lambda: None, ) - post_mock = mocker.patch("core.embeddings.httpx.post", side_effect=[embed_response, legacy_response]) + post_mock = mocker.patch( + "core.embeddings.httpx.post", side_effect=[embed_response, legacy_response] + ) vector = embeddings.embed_text("legacy text") @@ -206,9 +248,13 @@ def test_get_embedding_provider_rejects_unsupported_backend(settings): get_embedding_provider() -def test_ensure_project_collection_skips_create_when_collection_exists(embedding_context, mocker): +def test_ensure_project_collection_skips_create_when_collection_exists( + embedding_context, mocker +): client_mock = mocker.patch("core.embeddings.get_qdrant_client") - exists_mock = 
mocker.patch("core.embeddings.project_collection_exists", return_value=True) + exists_mock = mocker.patch( + "core.embeddings.project_collection_exists", return_value=True + ) embeddings.ensure_project_collection(embedding_context.project.id) @@ -216,7 +262,9 @@ def test_ensure_project_collection_skips_create_when_collection_exists(embedding client_mock.return_value.create_collection.assert_not_called() -def test_project_collection_exists_returns_false_when_lookup_raises(embedding_context, mocker): +def test_project_collection_exists_returns_false_when_lookup_raises( + embedding_context, mocker +): client_mock = mocker.patch("core.embeddings.get_qdrant_client") client_mock.return_value.get_collection.side_effect = RuntimeError("missing") @@ -226,7 +274,10 @@ def test_project_collection_exists_returns_false_when_lookup_raises(embedding_co def test_build_content_embedding_text_skips_blank_parts(embedding_context): embedding_context.content.title = "" - assert build_content_embedding_text(embedding_context.content) == "This article covers platform engineering practices." + assert ( + build_content_embedding_text(embedding_context.content) + == "This article covers platform engineering practices." 
+ ) @pytest.mark.parametrize( @@ -241,7 +292,9 @@ def test_normalize_text_handles_blank_and_trimmed_input(raw_text, expected): def test_serialize_published_date_handles_string_and_fallback_values(): - assert serialize_published_date("2026-04-20T12:00:00Z") == "2026-04-20T12:00:00+00:00" + assert ( + serialize_published_date("2026-04-20T12:00:00Z") == "2026-04-20T12:00:00+00:00" + ) assert serialize_published_date("not-a-date") == "not-a-date" assert serialize_published_date(123) == "123" @@ -260,7 +313,10 @@ def test_build_search_filter_supports_reference_and_exclusion_conditions(): def test_embedding_smoke_command_prints_dimension(mocker, capsys): - embed_text_mock = mocker.patch("core.management.commands.embedding_smoke.embed_text", return_value=[0.1, 0.2, 0.3]) + embed_text_mock = mocker.patch( + "core.management.commands.embedding_smoke.embed_text", + return_value=[0.1, 0.2, 0.3], + ) call_command("embedding_smoke", text="test text") @@ -269,7 +325,10 @@ def test_embedding_smoke_command_prints_dimension(mocker, capsys): def test_embedding_smoke_command_can_upsert_content(embedding_context, mocker, capsys): - upsert_mock = mocker.patch("core.management.commands.embedding_smoke.upsert_content_embedding", return_value="embedding-123") + upsert_mock = mocker.patch( + "core.management.commands.embedding_smoke.upsert_content_embedding", + return_value="embedding-123", + ) call_command("embedding_smoke", content_id=embedding_context.content.id) @@ -278,7 +337,9 @@ def test_embedding_smoke_command_can_upsert_content(embedding_context, mocker, c def test_seed_demo_creates_reference_corpus_and_embeds_demo_content(mocker, capsys): - upsert_mock = mocker.patch("core.management.commands.seed_demo.upsert_content_embedding") + upsert_mock = mocker.patch( + "core.management.commands.seed_demo.upsert_content_embedding" + ) call_command("seed_demo") @@ -287,9 +348,24 @@ def test_seed_demo_creates_reference_corpus_and_embeds_demo_content(mocker, caps assert 
SourceConfig.objects.filter(project=project).count() == 8 assert Content.objects.filter(project=project, is_reference=True).count() == 50 assert Content.objects.filter(project=project, is_reference=False).count() == 200 - assert SkillResult.objects.filter(project=project, skill_name=CLASSIFICATION_SKILL_NAME).count() == 200 - assert SkillResult.objects.filter(project=project, skill_name=RELEVANCE_SKILL_NAME).count() == 200 - assert SkillResult.objects.filter(project=project, skill_name=SUMMARIZATION_SKILL_NAME).count() == 115 + assert ( + SkillResult.objects.filter( + project=project, skill_name=CLASSIFICATION_SKILL_NAME + ).count() + == 200 + ) + assert ( + SkillResult.objects.filter( + project=project, skill_name=RELEVANCE_SKILL_NAME + ).count() + == 200 + ) + assert ( + SkillResult.objects.filter( + project=project, skill_name=SUMMARIZATION_SKILL_NAME + ).count() + == 115 + ) assert ReviewQueue.objects.filter(project=project).exists() assert UserFeedback.objects.filter(project=project).count() == 45 assert IngestionRun.objects.filter(project=project).count() == 6 @@ -344,21 +420,31 @@ def test_sync_embeddings_scopes_to_requested_content_id(embedding_context, mocke published_date="2026-04-21T12:00:00Z", content_text="Sibling body.", ) - upsert_mock = mocker.patch("core.management.commands.sync_embeddings.upsert_content_embedding") + upsert_mock = mocker.patch( + "core.management.commands.sync_embeddings.upsert_content_embedding" + ) stdout = StringIO() - call_command("sync_embeddings", content_id=embedding_context.content.id, stdout=stdout) + call_command( + "sync_embeddings", content_id=embedding_context.content.id, stdout=stdout + ) upsert_mock.assert_called_once_with(embedding_context.content) assert sibling_content.id != embedding_context.content.id assert "Synced embeddings for 1 content item(s)." 
in stdout.getvalue() -def test_sync_embeddings_filters_project_and_references_only(embedding_context, django_user_model, mocker): - other_user = django_user_model.objects.create_user(username="embed-owner-2", password="testpass123") +def test_sync_embeddings_filters_project_and_references_only( + embedding_context, django_user_model, mocker +): + other_user = django_user_model.objects.create_user( + username="embed-owner-2", password="testpass123" + ) other_group = Group.objects.create(name="embedding-team-2") other_user.groups.add(other_group) - other_project = Project.objects.create(name="Other Embedding Project", group=other_group, topic_description="Other") + other_project = Project.objects.create( + name="Other Embedding Project", group=other_group, topic_description="Other" + ) same_project_reference = Content.objects.create( project=embedding_context.project, url="https://example.com/reference-item", @@ -389,7 +475,9 @@ def test_sync_embeddings_filters_project_and_references_only(embedding_context, content_text="Other project reference body.", is_reference=True, ) - upsert_mock = mocker.patch("core.management.commands.sync_embeddings.upsert_content_embedding") + upsert_mock = mocker.patch( + "core.management.commands.sync_embeddings.upsert_content_embedding" + ) call_command( "sync_embeddings", @@ -401,5 +489,7 @@ def test_sync_embeddings_filters_project_and_references_only(embedding_context, def test_sync_embeddings_raises_command_error_when_scope_matches_no_content(): - with pytest.raises(CommandError, match="No content records matched the requested scope"): + with pytest.raises( + CommandError, match="No content records matched the requested scope" + ): call_command("sync_embeddings", project_id=999999) diff --git a/core/tests/test_entrypoints.py b/core/tests/test_entrypoints.py index a2f9d313..9fda75d7 100644 --- a/core/tests/test_entrypoints.py +++ b/core/tests/test_entrypoints.py @@ -10,21 +10,29 @@ def _import_fresh(module_name: str): def 
test_asgi_module_sets_default_settings_and_builds_application(mocker): setdefault_mock = mocker.patch.object(os.environ, "setdefault") - get_app_mock = mocker.patch("django.core.asgi.get_asgi_application", return_value="asgi-app") + get_app_mock = mocker.patch( + "django.core.asgi.get_asgi_application", return_value="asgi-app" + ) module = _import_fresh("newsletter_maker.asgi") - setdefault_mock.assert_called_once_with("DJANGO_SETTINGS_MODULE", "newsletter_maker.settings") + setdefault_mock.assert_called_once_with( + "DJANGO_SETTINGS_MODULE", "newsletter_maker.settings" + ) get_app_mock.assert_called_once_with() assert module.application == "asgi-app" def test_wsgi_module_sets_default_settings_and_builds_application(mocker): setdefault_mock = mocker.patch.object(os.environ, "setdefault") - get_app_mock = mocker.patch("django.core.wsgi.get_wsgi_application", return_value="wsgi-app") + get_app_mock = mocker.patch( + "django.core.wsgi.get_wsgi_application", return_value="wsgi-app" + ) module = _import_fresh("newsletter_maker.wsgi") - setdefault_mock.assert_called_once_with("DJANGO_SETTINGS_MODULE", "newsletter_maker.settings") + setdefault_mock.assert_called_once_with( + "DJANGO_SETTINGS_MODULE", "newsletter_maker.settings" + ) get_app_mock.assert_called_once_with() assert module.application == "wsgi-app" diff --git a/core/tests/test_health.py b/core/tests/test_health.py index 5c8714df..e002d2bd 100644 --- a/core/tests/test_health.py +++ b/core/tests/test_health.py @@ -49,21 +49,29 @@ def test_check_database_returns_true_when_query_succeeds(mocker): def test_check_database_returns_false_when_query_raises(mocker): - mocker.patch("core.views.connection.cursor", side_effect=RuntimeError("db unavailable")) + mocker.patch( + "core.views.connection.cursor", side_effect=RuntimeError("db unavailable") + ) assert _check_database() is False def test_check_qdrant_returns_true_when_client_can_list_collections(mocker, settings): client_cls = mocker.patch("core.views.QdrantClient") 
- client_cls.return_value.get_collections.return_value = SimpleNamespace(collections=[]) + client_cls.return_value.get_collections.return_value = SimpleNamespace( + collections=[] + ) assert _check_qdrant() is True - client_cls.assert_called_once_with(url=settings.QDRANT_URL, timeout=2, check_compatibility=False) + client_cls.assert_called_once_with( + url=settings.QDRANT_URL, timeout=2, check_compatibility=False + ) def test_check_qdrant_returns_false_when_client_errors(mocker): client_cls = mocker.patch("core.views.QdrantClient") - client_cls.return_value.get_collections.side_effect = RuntimeError("qdrant unavailable") + client_cls.return_value.get_collections.side_effect = RuntimeError( + "qdrant unavailable" + ) assert _check_qdrant() is False diff --git a/core/tests/test_llm.py b/core/tests/test_llm.py index e999f8ee..501a0324 100644 --- a/core/tests/test_llm.py +++ b/core/tests/test_llm.py @@ -9,7 +9,9 @@ def test_openrouter_chat_json_requires_api_key(settings): settings.OPENROUTER_API_KEY = "" with pytest.raises(RuntimeError, match="OPENROUTER_API_KEY must be configured"): - openrouter_chat_json(model="test-model", system_prompt="system", user_prompt="user") + openrouter_chat_json( + model="test-model", system_prompt="system", user_prompt="user" + ) def test_openrouter_chat_json_posts_expected_request(settings, mocker): @@ -35,7 +37,10 @@ def test_openrouter_chat_json_posts_expected_request(settings, mocker): assert result.payload == {"summary": "Hello"} assert result.model == "openrouter/test-model" assert result.latency_ms == 123 - assert post_mock.call_args.args[0] == "https://openrouter.example/api/v1/chat/completions" + assert ( + post_mock.call_args.args[0] + == "https://openrouter.example/api/v1/chat/completions" + ) assert post_mock.call_args.kwargs["headers"] == { "Authorization": "Bearer test-key", "Content-Type": "application/json", @@ -59,7 +64,9 @@ def test_extract_json_object_accepts_direct_json_object(): def 
test_extract_json_object_extracts_embedded_json_object_from_text(): - assert _extract_json_object('Here is the result:\n```json\n{"score": 0.7}\n```') == {"score": 0.7} + assert _extract_json_object( + 'Here is the result:\n```json\n{"score": 0.7}\n```' + ) == {"score": 0.7} def test_extract_json_object_rejects_missing_json_object(): diff --git a/core/tests/test_newsletters.py b/core/tests/test_newsletters.py index 068f4a02..54a0466a 100644 --- a/core/tests/test_newsletters.py +++ b/core/tests/test_newsletters.py @@ -9,7 +9,13 @@ from django.urls import reverse from svix.webhooks import Webhook -from core.models import Content, IntakeAllowlist, NewsletterIntake, NewsletterIntakeStatus, Project +from core.models import ( + Content, + IntakeAllowlist, + NewsletterIntake, + NewsletterIntakeStatus, + Project, +) from core.newsletters import ( extract_newsletter_items, sanitize_newsletter_html, @@ -32,7 +38,9 @@ def project(): def test_sanitize_newsletter_html_removes_scripts_and_inline_handlers(): - sanitized = sanitize_newsletter_html('
Read
') + sanitized = sanitize_newsletter_html( + '
Read
' + ) assert " dict[str, str]: +def _signed_resend_headers( + secret: str, payload: str, *, message_id: str +) -> dict[str, str]: timestamp = datetime.now(timezone.utc) signature = Webhook(secret).sign( msg_id=message_id, @@ -105,7 +118,9 @@ def _basic_auth_header(credentials: str) -> str: return f"Basic {encoded}" -def test_handle_anymail_inbound_creates_pending_intake_and_sends_confirmation(settings, mocker, project): +def test_handle_anymail_inbound_creates_pending_intake_and_sends_confirmation( + settings, mocker, project +): settings.NEWSLETTER_API_BASE_URL = "https://example.com" send_mock = mocker.patch("core.newsletters.send_confirmation_email") event = SimpleNamespace( @@ -123,11 +138,15 @@ def test_handle_anymail_inbound_creates_pending_intake_and_sends_confirmation(se handle_anymail_inbound(sender=object(), event=event, esp_name="Resend") intake = NewsletterIntake.objects.get(message_id="msg-123") - allowlist = IntakeAllowlist.objects.get(project=project, sender_email="newsletter@example.com") + allowlist = IntakeAllowlist.objects.get( + project=project, sender_email="newsletter@example.com" + ) assert intake.status == NewsletterIntakeStatus.PENDING assert allowlist.confirmed_at is None send_mock.assert_called_once() - assert send_mock.call_args.kwargs["confirm_url"].startswith("https://example.com/api/v1/inbound/confirm/") + assert send_mock.call_args.kwargs["confirm_url"].startswith( + "https://example.com/api/v1/inbound/confirm/" + ) def test_handle_anymail_inbound_queues_confirmed_sender(settings, mocker, project): @@ -208,7 +227,9 @@ def test_resend_inbound_webhook_posts_to_anymail_url(settings, client, mocker, p assert response.status_code == 200 intake = NewsletterIntake.objects.get(message_id="") - allowlist = IntakeAllowlist.objects.get(project=project, sender_email="newsletter@example.com") + allowlist = IntakeAllowlist.objects.get( + project=project, sender_email="newsletter@example.com" + ) assert intake.status == 
NewsletterIntakeStatus.PENDING assert allowlist.confirmed_at is None assert len(mail.outbox) == 1 @@ -273,7 +294,9 @@ def test_amazon_ses_inbound_webhook_posts_to_anymail_url(settings, client, proje assert response.status_code == 200 intake = NewsletterIntake.objects.get(message_id="") - allowlist = IntakeAllowlist.objects.get(project=project, sender_email="newsletter@example.com") + allowlist = IntakeAllowlist.objects.get( + project=project, sender_email="newsletter@example.com" + ) assert intake.status == NewsletterIntakeStatus.PENDING assert allowlist.confirmed_at is None assert len(mail.outbox) == 1 @@ -292,7 +315,9 @@ def test_send_confirmation_email_uses_django_mail_backend(settings): assert len(mail.outbox) == 1 message = mail.outbox[0] - assert message.subject == "Confirm newsletter intake for Platform Engineering Weekly" + assert ( + message.subject == "Confirm newsletter intake for Platform Engineering Weekly" + ) assert message.from_email == "noreply@example.com" assert message.to == ["newsletter@example.com"] assert "https://example.com/confirm/token" in message.body @@ -302,9 +327,13 @@ def test_send_confirmation_email_uses_django_mail_backend(settings): ) -def test_confirm_newsletter_sender_confirms_allowlist_and_queues_pending_intakes(client, settings, mocker, project): +def test_confirm_newsletter_sender_confirms_allowlist_and_queues_pending_intakes( + client, settings, mocker, project +): settings.CELERY_TASK_ALWAYS_EAGER = False - allowlist = IntakeAllowlist.objects.create(project=project, sender_email="newsletter@example.com") + allowlist = IntakeAllowlist.objects.create( + project=project, sender_email="newsletter@example.com" + ) intake = NewsletterIntake.objects.create( project=project, sender_email="newsletter@example.com", @@ -314,7 +343,11 @@ def test_confirm_newsletter_sender_confirms_allowlist_and_queues_pending_intakes ) delay_mock = mocker.patch("core.tasks.process_newsletter_intake.delay") - response = 
client.get(reverse("confirm-newsletter-sender", kwargs={"token": allowlist.confirmation_token})) + response = client.get( + reverse( + "confirm-newsletter-sender", kwargs={"token": allowlist.confirmation_token} + ) + ) assert response.status_code == 200 allowlist.refresh_from_db() @@ -322,7 +355,9 @@ def test_confirm_newsletter_sender_confirms_allowlist_and_queues_pending_intakes delay_mock.assert_called_once_with(intake.id) -def test_process_newsletter_intake_creates_content_for_confirmed_sender(settings, mocker, project): +def test_process_newsletter_intake_creates_content_for_confirmed_sender( + settings, mocker, project +): settings.CELERY_TASK_ALWAYS_EAGER = False allowlist = IntakeAllowlist.objects.create( project=project, diff --git a/core/tests/test_pipeline.py b/core/tests/test_pipeline.py index fe81f49f..ae513bf3 100644 --- a/core/tests/test_pipeline.py +++ b/core/tests/test_pipeline.py @@ -3,7 +3,14 @@ import pytest from django.contrib.auth.models import Group -from core.models import Content, Project, ReviewQueue, ReviewReason, SkillResult, SkillStatus +from core.models import ( + Content, + Project, + ReviewQueue, + ReviewReason, + SkillResult, + SkillStatus, +) from core.pipeline import ( CLASSIFICATION_SKILL_NAME, RELATED_CONTENT_SKILL_NAME, @@ -30,10 +37,14 @@ @pytest.fixture def pipeline_context(django_user_model): - user = django_user_model.objects.create_user(username="pipeline-owner", password="testpass123") + user = django_user_model.objects.create_user( + username="pipeline-owner", password="testpass123" + ) group = Group.objects.create(name="pipeline-team") user.groups.add(group) - project = Project.objects.create(name="Pipeline Project", group=group, topic_description="Platform engineering") + project = Project.objects.create( + name="Pipeline Project", group=group, topic_description="Platform engineering" + ) content = Content.objects.create( project=project, url="https://example.com/article", @@ -47,7 +58,9 @@ def 
pipeline_context(django_user_model): return SimpleNamespace(user=user, group=group, project=project, content=content) -def test_process_content_runs_full_pipeline_for_relevant_content(pipeline_context, mocker): +def test_process_content_runs_full_pipeline_for_relevant_content( + pipeline_context, mocker +): mocker.patch( "core.pipeline.run_content_classification", return_value={ @@ -84,9 +97,24 @@ def test_process_content_runs_full_pipeline_for_relevant_content(pipeline_contex assert pipeline_context.content.content_type == "release_notes" assert pipeline_context.content.relevance_score == pytest.approx(0.92) assert pipeline_context.content.is_active is True - assert SkillResult.objects.filter(content=pipeline_context.content, skill_name=CLASSIFICATION_SKILL_NAME).count() == 1 - assert SkillResult.objects.filter(content=pipeline_context.content, skill_name=RELEVANCE_SKILL_NAME).count() == 1 - assert SkillResult.objects.filter(content=pipeline_context.content, skill_name=SUMMARIZATION_SKILL_NAME).count() == 1 + assert ( + SkillResult.objects.filter( + content=pipeline_context.content, skill_name=CLASSIFICATION_SKILL_NAME + ).count() + == 1 + ) + assert ( + SkillResult.objects.filter( + content=pipeline_context.content, skill_name=RELEVANCE_SKILL_NAME + ).count() + == 1 + ) + assert ( + SkillResult.objects.filter( + content=pipeline_context.content, skill_name=SUMMARIZATION_SKILL_NAME + ).count() + == 1 + ) assert ReviewQueue.objects.filter(content=pipeline_context.content).count() == 0 @@ -119,7 +147,9 @@ def test_process_content_queues_borderline_items_for_review(pipeline_context, mo assert result["status"] == "review" assert pipeline_context.content.is_active is True summarize_mock.assert_not_called() - review_item = ReviewQueue.objects.get(content=pipeline_context.content, reason=ReviewReason.BORDERLINE_RELEVANCE) + review_item = ReviewQueue.objects.get( + content=pipeline_context.content, reason=ReviewReason.BORDERLINE_RELEVANCE + ) assert review_item.confidence 
== pytest.approx(0.55) @@ -152,10 +182,17 @@ def test_process_content_archives_irrelevant_items(pipeline_context, mocker): assert result["status"] == "archived" assert pipeline_context.content.is_active is False summarize_mock.assert_not_called() - assert ReviewQueue.objects.filter(content=pipeline_context.content, reason=ReviewReason.BORDERLINE_RELEVANCE).count() == 0 + assert ( + ReviewQueue.objects.filter( + content=pipeline_context.content, reason=ReviewReason.BORDERLINE_RELEVANCE + ).count() + == 0 + ) -def test_process_content_adds_review_item_for_low_confidence_classification(pipeline_context, mocker): +def test_process_content_adds_review_item_for_low_confidence_classification( + pipeline_context, mocker +): mocker.patch( "core.pipeline.run_content_classification", return_value={ @@ -231,7 +268,10 @@ def test_run_content_classification_falls_back_to_heuristic_when_openrouter_fail mocker, ): settings.OPENROUTER_API_KEY = "test-key" - mocker.patch("core.pipeline.openrouter_chat_json", side_effect=RuntimeError("llm unavailable")) + mocker.patch( + "core.pipeline.openrouter_chat_json", + side_effect=RuntimeError("llm unavailable"), + ) result = run_content_classification(pipeline_context.content) @@ -246,7 +286,9 @@ def test_run_relevance_scoring_uses_openrouter_for_borderline_similarity( mocker, ): settings.OPENROUTER_API_KEY = "test-key" - mocker.patch("core.pipeline.build_content_embedding_text", return_value="embedding text") + mocker.patch( + "core.pipeline.build_content_embedding_text", return_value="embedding text" + ) mocker.patch("core.pipeline.embed_text", return_value=[0.1, 0.2, 0.3]) mocker.patch("core.pipeline.get_reference_similarity", return_value=0.6) openrouter_mock = mocker.patch( @@ -278,7 +320,9 @@ def test_run_relevance_scoring_skips_llm_for_high_similarity( settings, mocker, ): - mocker.patch("core.pipeline.build_content_embedding_text", return_value="embedding text") + mocker.patch( + "core.pipeline.build_content_embedding_text", 
return_value="embedding text" + ) mocker.patch("core.pipeline.embed_text", return_value=[0.1, 0.2, 0.3]) mocker.patch("core.pipeline.get_reference_similarity", return_value=0.95) openrouter_mock = mocker.patch("core.pipeline.openrouter_chat_json") @@ -301,10 +345,15 @@ def test_run_relevance_scoring_falls_back_when_openrouter_fails( mocker, ): settings.OPENROUTER_API_KEY = "test-key" - mocker.patch("core.pipeline.build_content_embedding_text", return_value="embedding text") + mocker.patch( + "core.pipeline.build_content_embedding_text", return_value="embedding text" + ) mocker.patch("core.pipeline.embed_text", return_value=[0.1, 0.2, 0.3]) mocker.patch("core.pipeline.get_reference_similarity", return_value=0.6) - mocker.patch("core.pipeline.openrouter_chat_json", side_effect=RuntimeError("llm unavailable")) + mocker.patch( + "core.pipeline.openrouter_chat_json", + side_effect=RuntimeError("llm unavailable"), + ) result = run_relevance_scoring(pipeline_context.content) @@ -319,7 +368,10 @@ def test_run_summarization_falls_back_to_heuristic_when_openrouter_fails( mocker, ): settings.OPENROUTER_API_KEY = "test-key" - mocker.patch("core.pipeline.openrouter_chat_json", side_effect=RuntimeError("model unavailable")) + mocker.patch( + "core.pipeline.openrouter_chat_json", + side_effect=RuntimeError("model unavailable"), + ) result = run_summarization(pipeline_context.content) @@ -354,8 +406,12 @@ def test_execute_ad_hoc_classification_supersedes_previous_result_and_updates_re ], ) - first_result = execute_ad_hoc_skill(pipeline_context.content, CLASSIFICATION_SKILL_NAME) - second_result = execute_ad_hoc_skill(pipeline_context.content, CLASSIFICATION_SKILL_NAME) + first_result = execute_ad_hoc_skill( + pipeline_context.content, CLASSIFICATION_SKILL_NAME + ) + second_result = execute_ad_hoc_skill( + pipeline_context.content, CLASSIFICATION_SKILL_NAME + ) first_result.refresh_from_db() pipeline_context.content.refresh_from_db() @@ -371,10 +427,13 @@ def 
test_execute_ad_hoc_classification_supersedes_previous_result_and_updates_re assert first_result.superseded_by_id == second_result.id assert pipeline_context.content.content_type == "tutorial" assert review_item.confidence == pytest.approx(0.45) - assert ReviewQueue.objects.filter( - content=pipeline_context.content, - reason=ReviewReason.LOW_CONFIDENCE_CLASSIFICATION, - ).count() == 1 + assert ( + ReviewQueue.objects.filter( + content=pipeline_context.content, + reason=ReviewReason.LOW_CONFIDENCE_CLASSIFICATION, + ).count() + == 1 + ) def test_execute_ad_hoc_relevance_creates_review_item_for_borderline_scores( @@ -424,7 +483,10 @@ def test_execute_ad_hoc_related_content_returns_failed_result_on_search_error( pipeline_context, mocker, ): - mocker.patch("core.pipeline.search_similar_content", side_effect=RuntimeError("vector index unavailable")) + mocker.patch( + "core.pipeline.search_similar_content", + side_effect=RuntimeError("vector index unavailable"), + ) result = execute_ad_hoc_skill(pipeline_context.content, RELATED_CONTENT_SKILL_NAME) @@ -439,7 +501,9 @@ def test_create_pending_skill_result_rejects_non_async_skill(pipeline_context): def test_execute_background_skill_result_rejects_skill_name_mismatch(pipeline_context): - pending_result = create_pending_skill_result(pipeline_context.content, RELEVANCE_SKILL_NAME) + pending_result = create_pending_skill_result( + pipeline_context.content, RELEVANCE_SKILL_NAME + ) with pytest.raises(ValueError, match="is for relevance_scoring, not summarization"): execute_background_skill_result(pending_result.id, SUMMARIZATION_SKILL_NAME) @@ -451,7 +515,9 @@ def test_execute_background_skill_result_completes_summary_when_requirements_are ): pipeline_context.content.relevance_score = 0.9 pipeline_context.content.save(update_fields=["relevance_score"]) - pending_result = create_pending_skill_result(pipeline_context.content, SUMMARIZATION_SKILL_NAME) + pending_result = create_pending_skill_result( + pipeline_context.content, 
SUMMARIZATION_SKILL_NAME + ) mocker.patch( "core.pipeline.run_summarization", return_value={ @@ -461,20 +527,31 @@ def test_execute_background_skill_result_completes_summary_when_requirements_are }, ) - result = execute_background_skill_result(pending_result.id, SUMMARIZATION_SKILL_NAME) + result = execute_background_skill_result( + pending_result.id, SUMMARIZATION_SKILL_NAME + ) pending_result.refresh_from_db() assert result.status == SkillStatus.COMPLETED assert pending_result.status == SkillStatus.COMPLETED - assert pending_result.result_data == {"summary": "Manual summary output.", "model_used": "heuristic", "latency_ms": 0} + assert pending_result.result_data == { + "summary": "Manual summary output.", + "model_used": "heuristic", + "latency_ms": 0, + } def test_execute_background_skill_result_marks_relevance_failed_when_execution_errors( pipeline_context, mocker, ): - pending_result = create_pending_skill_result(pipeline_context.content, RELEVANCE_SKILL_NAME) - mocker.patch("core.pipeline.run_relevance_scoring", side_effect=RuntimeError("embedding unavailable")) + pending_result = create_pending_skill_result( + pipeline_context.content, RELEVANCE_SKILL_NAME + ) + mocker.patch( + "core.pipeline.run_relevance_scoring", + side_effect=RuntimeError("embedding unavailable"), + ) result = execute_background_skill_result(pending_result.id, RELEVANCE_SKILL_NAME) @@ -485,9 +562,32 @@ def test_execute_background_skill_result_marks_relevance_failed_when_execution_e def test_route_by_relevance_uses_threshold_boundaries(settings): - assert route_by_relevance({"relevance": {"relevance_score": settings.AI_RELEVANCE_SUMMARIZE_THRESHOLD}}) == "relevant" - assert route_by_relevance({"relevance": {"relevance_score": settings.AI_RELEVANCE_REVIEW_THRESHOLD - 0.01}}) == "irrelevant" - assert route_by_relevance({"relevance": {"relevance_score": settings.AI_RELEVANCE_REVIEW_THRESHOLD}}) == "borderline" + assert ( + route_by_relevance( + { + "relevance": { + "relevance_score": 
settings.AI_RELEVANCE_SUMMARIZE_THRESHOLD + } + } + ) + == "relevant" + ) + assert ( + route_by_relevance( + { + "relevance": { + "relevance_score": settings.AI_RELEVANCE_REVIEW_THRESHOLD - 0.01 + } + } + ) + == "irrelevant" + ) + assert ( + route_by_relevance( + {"relevance": {"relevance_score": settings.AI_RELEVANCE_REVIEW_THRESHOLD}} + ) + == "borderline" + ) def test_run_ad_hoc_relevance_updates_existing_review_item(pipeline_context, mocker): @@ -514,7 +614,12 @@ def test_run_ad_hoc_relevance_updates_existing_review_item(pipeline_context, moc existing.refresh_from_db() assert relevance_score == pytest.approx(0.58) assert existing.confidence == pytest.approx(0.58) - assert ReviewQueue.objects.filter(content=pipeline_context.content, reason=ReviewReason.BORDERLINE_RELEVANCE).count() == 1 + assert ( + ReviewQueue.objects.filter( + content=pipeline_context.content, reason=ReviewReason.BORDERLINE_RELEVANCE + ).count() + == 1 + ) def test_execute_with_retries_retries_until_success(settings): @@ -540,7 +645,9 @@ def always_fail(): _execute_with_retries("summarization", always_fail) -def test_pipeline_helper_utilities_cover_serialization_and_summary_edges(pipeline_context): +def test_pipeline_helper_utilities_cover_serialization_and_summary_edges( + pipeline_context, +): empty_content = Content( project=pipeline_context.project, url="https://example.com/empty", @@ -569,7 +676,10 @@ def test_pipeline_helper_utilities_cover_serialization_and_summary_edges(pipelin "source_plugin": None, "score": 0.0, } - assert _heuristic_summary(empty_content) == "Empty Content: no summary was available from the source content." + assert ( + _heuristic_summary(empty_content) + == "Empty Content: no summary was available from the source content." + ) assert _heuristic_summary(long_content).endswith("...") assert _normalize_summary(" ", pipeline_context.content) == ( "Kubernetes Release Notes: summary generation returned no content." 
diff --git a/core/tests/test_plugin_base.py b/core/tests/test_plugin_base.py index 1c3c2240..395be933 100644 --- a/core/tests/test_plugin_base.py +++ b/core/tests/test_plugin_base.py @@ -31,10 +31,14 @@ def health_check(self) -> bool: @pytest.fixture def plugin_context(django_user_model): - user = django_user_model.objects.create_user(username="plugin-owner", password="testpass123") + user = django_user_model.objects.create_user( + username="plugin-owner", password="testpass123" + ) group = Group.objects.create(name="plugin-team") user.groups.add(group) - project = Project.objects.create(name="Plugin Project", group=group, topic_description="Infra") + project = Project.objects.create( + name="Plugin Project", group=group, topic_description="Infra" + ) source_config = SimpleNamespace(project=project, config={"api_key": "secret"}) return SimpleNamespace(project=project, source_config=source_config) @@ -72,14 +76,21 @@ def test_source_plugin_match_entity_for_url_matches_normalized_hostname(plugin_c assert result == matching_entity -def test_source_plugin_match_entity_for_url_returns_none_for_missing_hostname(plugin_context): +def test_source_plugin_match_entity_for_url_returns_none_for_missing_hostname( + plugin_context, +): plugin = DummySourcePlugin(plugin_context.source_config) assert plugin.match_entity_for_url("not-a-valid-url") is None - assert DummySourcePlugin._normalize_hostname("https://www.EXAMPLE.com/path") == "example.com" + assert ( + DummySourcePlugin._normalize_hostname("https://www.EXAMPLE.com/path") + == "example.com" + ) -def test_source_plugin_match_entity_for_url_returns_none_when_no_entity_matches(plugin_context): +def test_source_plugin_match_entity_for_url_returns_none_when_no_entity_matches( + plugin_context, +): Entity.objects.create( project=plugin_context.project, name="Different Entity", diff --git a/core/tests/test_reddit.py b/core/tests/test_reddit.py index bf0a7289..c6243629 100644 --- a/core/tests/test_reddit.py +++ 
b/core/tests/test_reddit.py @@ -13,10 +13,14 @@ @pytest.fixture def reddit_context(django_user_model): - user = django_user_model.objects.create_user(username="reddit-owner", password="testpass123") + user = django_user_model.objects.create_user( + username="reddit-owner", password="testpass123" + ) group = Group.objects.create(name="reddit-team") user.groups.add(group) - project = Project.objects.create(name="Reddit Project", group=group, topic_description="Infra") + project = Project.objects.create( + name="Reddit Project", group=group, topic_description="Infra" + ) source_config = SourceConfig.objects.create( project=project, plugin_name=SourcePluginName.REDDIT, @@ -44,7 +48,9 @@ def test_validate_plugin_config_rejects_unknown_plugin_name(): validate_plugin_config("unknown-plugin", {}) -def test_reddit_fetch_new_content_deduplicates_and_filters_by_since(reddit_context, mocker): +def test_reddit_fetch_new_content_deduplicates_and_filters_by_since( + reddit_context, mocker +): plugin = RedditSourcePlugin(reddit_context.source_config) now = datetime.now(tz=UTC) duplicate_id = "dup-1" @@ -113,7 +119,9 @@ def test_reddit_client_builds_praw_client(settings, mocker): settings.REDDIT_CLIENT_ID = "client-id" settings.REDDIT_CLIENT_SECRET = "client-secret" settings.REDDIT_USER_AGENT = "newsletter-maker-test" - reddit_cls = mocker.patch("core.plugins.reddit.praw.Reddit", return_value="reddit-client") + reddit_cls = mocker.patch( + "core.plugins.reddit.praw.Reddit", return_value="reddit-client" + ) client = RedditSourcePlugin._client() diff --git a/core/tests/test_rss.py b/core/tests/test_rss.py index bc70c4ab..6a291977 100644 --- a/core/tests/test_rss.py +++ b/core/tests/test_rss.py @@ -13,10 +13,14 @@ @pytest.fixture def rss_context(django_user_model): - user = django_user_model.objects.create_user(username="rss-owner", password="testpass123") + user = django_user_model.objects.create_user( + username="rss-owner", password="testpass123" + ) group = 
Group.objects.create(name="rss-team") user.groups.add(group) - project = Project.objects.create(name="RSS Project", group=group, topic_description="Infra") + project = Project.objects.create( + name="RSS Project", group=group, topic_description="Infra" + ) source_config = SourceConfig.objects.create( project=project, plugin_name=SourcePluginName.RSS, @@ -32,8 +36,14 @@ def test_rss_fetch_new_content_filters_invalid_and_old_entries(rss_context, mock parsed_feed = SimpleNamespace( entries=[ SimpleNamespace(link="", title="Missing link", published_parsed=fresh_time), - SimpleNamespace(link="https://example.com/no-title", title=" ", published_parsed=fresh_time), - SimpleNamespace(link="https://example.com/old", title="Old", published_parsed=old_time), + SimpleNamespace( + link="https://example.com/no-title", + title=" ", + published_parsed=fresh_time, + ), + SimpleNamespace( + link="https://example.com/old", title="Old", published_parsed=old_time + ), SimpleNamespace( link="https://example.com/fresh", title=" Fresh entry ", @@ -56,7 +66,9 @@ def test_rss_fetch_new_content_filters_invalid_and_old_entries(rss_context, mock assert items[0].source_plugin == SourcePluginName.RSS -def test_rss_fetch_new_content_uses_title_when_summary_and_description_missing(rss_context, mocker): +def test_rss_fetch_new_content_uses_title_when_summary_and_description_missing( + rss_context, mocker +): parsed_feed = SimpleNamespace( entries=[ SimpleNamespace( @@ -76,7 +88,9 @@ def test_rss_fetch_new_content_uses_title_when_summary_and_description_missing(r def test_rss_health_check_returns_false_for_empty_feed(rss_context, mocker): - mocker.patch("core.plugins.rss.feedparser.parse", return_value=SimpleNamespace(entries=[])) + mocker.patch( + "core.plugins.rss.feedparser.parse", return_value=SimpleNamespace(entries=[]) + ) plugin = RSSSourcePlugin(rss_context.source_config) assert plugin.health_check() is False diff --git a/core/tests/test_serializers.py b/core/tests/test_serializers.py index 
65150745..6efb2e34 100644 --- a/core/tests/test_serializers.py +++ b/core/tests/test_serializers.py @@ -3,7 +3,15 @@ import pytest from django.contrib.auth.models import AnonymousUser, Group -from core.models import Content, Entity, Project, ReviewReason, SkillResult, SourceConfig, SourcePluginName +from core.models import ( + Content, + Entity, + Project, + ReviewReason, + SkillResult, + SourceConfig, + SourcePluginName, +) from core.serializers import ( ContentSerializer, EntitySerializer, @@ -20,16 +28,28 @@ @pytest.fixture def serializer_context(django_user_model): - user = django_user_model.objects.create_user(username="serializer-owner", password="testpass123") - other_user = django_user_model.objects.create_user(username="serializer-other", password="testpass123") + user = django_user_model.objects.create_user( + username="serializer-owner", password="testpass123" + ) + other_user = django_user_model.objects.create_user( + username="serializer-other", password="testpass123" + ) group = Group.objects.create(name="serializer-team") other_group = Group.objects.create(name="serializer-other-team") user.groups.add(group) other_user.groups.add(other_group) - project = Project.objects.create(name="Serializer Project", group=group, topic_description="Infra") - other_project = Project.objects.create(name="Other Serializer Project", group=other_group, topic_description="Data") - entity = Entity.objects.create(project=project, name="Serializer Entity", type="vendor") - other_entity = Entity.objects.create(project=other_project, name="Other Entity", type="vendor") + project = Project.objects.create( + name="Serializer Project", group=group, topic_description="Infra" + ) + other_project = Project.objects.create( + name="Other Serializer Project", group=other_group, topic_description="Data" + ) + entity = Entity.objects.create( + project=project, name="Serializer Entity", type="vendor" + ) + other_entity = Entity.objects.create( + project=other_project, name="Other 
Entity", type="vendor" + ) content = Content.objects.create( project=project, url="https://example.com/serializer-content", @@ -82,7 +102,9 @@ def _request_for(user): return SimpleNamespace(user=user) -def test_project_scoped_serializer_filters_related_querysets_with_project_context(serializer_context): +def test_project_scoped_serializer_filters_related_querysets_with_project_context( + serializer_context, +): serializer = SkillResultSerializer( context={ "request": _request_for(serializer_context.user), @@ -91,12 +113,18 @@ def test_project_scoped_serializer_filters_related_querysets_with_project_contex ) assert list(serializer.fields["content"].queryset) == [serializer_context.content] - assert list(serializer.fields["superseded_by"].queryset) == [serializer_context.skill_result] + assert list(serializer.fields["superseded_by"].queryset) == [ + serializer_context.skill_result + ] assert list(serializer.fields["project"].queryset) == [serializer_context.project] -def test_project_scoped_serializer_filters_related_querysets_without_project_context(serializer_context): - serializer = ContentSerializer(context={"request": _request_for(serializer_context.user)}) +def test_project_scoped_serializer_filters_related_querysets_without_project_context( + serializer_context, +): + serializer = ContentSerializer( + context={"request": _request_for(serializer_context.user)} + ) assert list(serializer.fields["entity"].queryset) == [serializer_context.entity] assert list(serializer.fields["project"].queryset) == [serializer_context.project] @@ -117,7 +145,9 @@ def test_content_serializer_rejects_cross_project_entity(serializer_context): ) assert serializer.is_valid() is False - assert serializer.errors == {"entity": ["Entity must belong to the selected project."]} + assert serializer.errors == { + "entity": ["Entity must belong to the selected project."] + } def test_skill_result_serializer_rejects_cross_project_content(serializer_context): @@ -133,7 +163,9 @@ def 
test_skill_result_serializer_rejects_cross_project_content(serializer_contex ) assert serializer.is_valid() is False - assert serializer.errors == {"content": ["Content must belong to the selected project."]} + assert serializer.errors == { + "content": ["Content must belong to the selected project."] + } def test_review_queue_serializer_rejects_cross_project_content(serializer_context): @@ -149,7 +181,9 @@ def test_review_queue_serializer_rejects_cross_project_content(serializer_contex ) assert serializer.is_valid() is False - assert serializer.errors == {"content": ["Content must belong to the selected project."]} + assert serializer.errors == { + "content": ["Content must belong to the selected project."] + } def test_source_config_serializer_normalizes_valid_config(serializer_context): @@ -166,7 +200,9 @@ def test_source_config_serializer_normalizes_valid_config(serializer_context): ) assert serializer.is_valid(), serializer.errors - assert serializer.validated_data["config"] == {"feed_url": "https://example.com/feed.xml"} + assert serializer.validated_data["config"] == { + "feed_url": "https://example.com/feed.xml" + } def test_source_config_serializer_surfaces_plugin_validation_errors(serializer_context): @@ -189,7 +225,9 @@ def test_source_config_serializer_surfaces_plugin_validation_errors(serializer_c def test_entity_serializer_filters_project_queryset_to_request_user(serializer_context): - serializer = EntitySerializer(context={"request": _request_for(serializer_context.user)}) + serializer = EntitySerializer( + context={"request": _request_for(serializer_context.user)} + ) assert list(serializer.fields["project"].queryset) == [serializer_context.project] @@ -206,7 +244,9 @@ def test_user_feedback_serializer_rejects_cross_project_content(serializer_conte ) assert serializer.is_valid() is False - assert serializer.errors == {"content": ["Content must belong to the selected project."]} + assert serializer.errors == { + "content": ["Content must belong to 
the selected project."] + } def test_review_queue_serializer_accepts_same_project_content(serializer_context): @@ -225,9 +265,13 @@ def test_review_queue_serializer_accepts_same_project_content(serializer_context assert serializer.validated_data["content"] == serializer_context.content -def test_source_config_serializer_skips_plugin_validation_when_plugin_name_missing(serializer_context): +def test_source_config_serializer_skips_plugin_validation_when_plugin_name_missing( + serializer_context, +): serializer = SourceConfigSerializer( - instance=SourceConfig(project=serializer_context.project, plugin_name="", config={}), + instance=SourceConfig( + project=serializer_context.project, plugin_name="", config={} + ), data={"config": {}}, partial=True, context={ @@ -240,6 +284,8 @@ def test_source_config_serializer_skips_plugin_validation_when_plugin_name_missi def test_ingestion_run_serializer_filters_project_queryset(serializer_context): - serializer = IngestionRunSerializer(context={"request": _request_for(serializer_context.user)}) + serializer = IngestionRunSerializer( + context={"request": _request_for(serializer_context.user)} + ) assert list(serializer.fields["project"].queryset) == [serializer_context.project] diff --git a/core/tests/test_tasks.py b/core/tests/test_tasks.py index 953c9008..ff16bdfb 100644 --- a/core/tests/test_tasks.py +++ b/core/tests/test_tasks.py @@ -29,10 +29,14 @@ @pytest.fixture def source_plugin_context(django_user_model): - user = django_user_model.objects.create_user(username="plugin-owner", password="testpass123") + user = django_user_model.objects.create_user( + username="plugin-owner", password="testpass123" + ) group = Group.objects.create(name="plugin-team") user.groups.add(group) - project = Project.objects.create(name="Plugin Project", group=group, topic_description="Infra") + project = Project.objects.create( + name="Plugin Project", group=group, topic_description="Infra" + ) entity = Entity.objects.create( project=project, 
name="Example", @@ -47,9 +51,9 @@ def test_run_ingestion_creates_content_from_rss_entries(source_plugin_context, m process_content_delay_mock = mocker.patch("core.tasks.process_content.delay") parse_mock = mocker.patch("core.plugins.rss.feedparser.parse") source_config = SourceConfig.objects.create( - project=source_plugin_context.project, - plugin_name=SourcePluginName.RSS, - config={"feed_url": "https://example.com/feed.xml"}, + project=source_plugin_context.project, + plugin_name=SourcePluginName.RSS, + config={"feed_url": "https://example.com/feed.xml"}, ) parse_mock.return_value = SimpleNamespace( entries=[ @@ -58,7 +62,9 @@ def test_run_ingestion_creates_content_from_rss_entries(source_plugin_context, m title="Example Post", author="Author", summary="Summary", - published_parsed=datetime(2026, 4, 20, 12, 0, tzinfo=timezone.utc).timetuple(), + published_parsed=datetime( + 2026, 4, 20, 12, 0, tzinfo=timezone.utc + ).timetuple(), ) ] ) @@ -73,17 +79,20 @@ def test_run_ingestion_creates_content_from_rss_entries(source_plugin_context, m upsert_embedding_mock.assert_called_once_with(content) process_content_delay_mock.assert_called_once_with(content.id) assert SourceConfig.objects.get(pk=source_config.id).last_fetched_at is not None - ingestion_run = IngestionRun.objects.get(project=source_plugin_context.project, plugin_name=SourcePluginName.RSS) + ingestion_run = IngestionRun.objects.get( + project=source_plugin_context.project, plugin_name=SourcePluginName.RSS + ) assert ingestion_run.status == RunStatus.SUCCESS + def test_run_ingestion_skips_duplicate_urls(source_plugin_context, mocker): upsert_embedding_mock = mocker.patch("core.tasks.upsert_content_embedding") process_content_delay_mock = mocker.patch("core.tasks.process_content.delay") parse_mock = mocker.patch("core.plugins.rss.feedparser.parse") source_config = SourceConfig.objects.create( - project=source_plugin_context.project, - plugin_name=SourcePluginName.RSS, - config={"feed_url": 
"https://example.com/feed.xml"}, + project=source_plugin_context.project, + plugin_name=SourcePluginName.RSS, + config={"feed_url": "https://example.com/feed.xml"}, ) Content.objects.create( project=source_plugin_context.project, @@ -102,7 +111,9 @@ def test_run_ingestion_skips_duplicate_urls(source_plugin_context, mocker): title="Duplicate Post", author="Author", summary="Summary", - published_parsed=datetime(2026, 4, 20, 12, 0, tzinfo=timezone.utc).timetuple(), + published_parsed=datetime( + 2026, 4, 20, 12, 0, tzinfo=timezone.utc + ).timetuple(), ) ] ) @@ -115,14 +126,15 @@ def test_run_ingestion_skips_duplicate_urls(source_plugin_context, mocker): process_content_delay_mock.assert_not_called() assert Content.objects.filter(url="https://example.com/post-1").count() == 1 + def test_run_ingestion_creates_content_from_reddit_posts(source_plugin_context, mocker): upsert_embedding_mock = mocker.patch("core.tasks.upsert_content_embedding") process_content_delay_mock = mocker.patch("core.tasks.process_content.delay") reddit_mock = mocker.patch("core.plugins.reddit.praw.Reddit") source_config = SourceConfig.objects.create( - project=source_plugin_context.project, - plugin_name=SourcePluginName.REDDIT, - config={"subreddit": "python", "listing": "new", "limit": 5}, + project=source_plugin_context.project, + plugin_name=SourcePluginName.REDDIT, + config={"subreddit": "python", "listing": "new", "limit": 5}, ) submission = SimpleNamespace( id="abc123", @@ -133,7 +145,9 @@ def test_run_ingestion_creates_content_from_reddit_posts(source_plugin_context, author="redditor", created_utc=datetime(2026, 4, 20, 12, 0, tzinfo=timezone.utc).timestamp(), ) - subreddit = SimpleNamespace(new=lambda limit: iter([submission]), hot=lambda limit: iter([])) + subreddit = SimpleNamespace( + new=lambda limit: iter([submission]), hot=lambda limit: iter([]) + ) reddit_mock.return_value.subreddit.return_value = subreddit result = run_ingestion(source_config.id) @@ -146,12 +160,15 @@ def 
test_run_ingestion_creates_content_from_reddit_posts(source_plugin_context, assert content.source_plugin == SourcePluginName.REDDIT assert content.entity is None -def test_run_all_ingestions_enqueues_active_source_configs(source_plugin_context, mocker): + +def test_run_all_ingestions_enqueues_active_source_configs( + source_plugin_context, mocker +): delay_mock = mocker.patch("core.tasks.run_ingestion.delay") active_one = SourceConfig.objects.create( - project=source_plugin_context.project, - plugin_name=SourcePluginName.RSS, - config={"feed_url": "https://example.com/feed.xml"}, + project=source_plugin_context.project, + plugin_name=SourcePluginName.RSS, + config={"feed_url": "https://example.com/feed.xml"}, ) active_two = SourceConfig.objects.create( project=source_plugin_context.project, @@ -173,7 +190,9 @@ def test_run_all_ingestions_enqueues_active_source_configs(source_plugin_context assert delay_mock.call_count == 2 -def test_run_all_ingestions_executes_inline_when_eager(source_plugin_context, settings, mocker): +def test_run_all_ingestions_executes_inline_when_eager( + source_plugin_context, settings, mocker +): settings.CELERY_TASK_ALWAYS_EAGER = True run_ingestion_mock = mocker.patch("core.tasks.run_ingestion") delay_mock = mocker.patch("core.tasks.run_ingestion.delay") @@ -196,19 +215,22 @@ def test_run_all_ingestions_executes_inline_when_eager(source_plugin_context, se assert run_ingestion_mock.call_count == 2 delay_mock.assert_not_called() + def test_run_ingestion_marks_failure_when_plugin_errors(source_plugin_context, mocker): parse_mock = mocker.patch("core.plugins.rss.feedparser.parse") source_config = SourceConfig.objects.create( project=source_plugin_context.project, - plugin_name=SourcePluginName.RSS, - config={"feed_url": "https://example.com/feed.xml"}, + plugin_name=SourcePluginName.RSS, + config={"feed_url": "https://example.com/feed.xml"}, ) parse_mock.side_effect = RuntimeError("feed unavailable") with pytest.raises(RuntimeError, 
match="feed unavailable"): run_ingestion(source_config.id) - ingestion_run = IngestionRun.objects.get(project=source_plugin_context.project, plugin_name=SourcePluginName.RSS) + ingestion_run = IngestionRun.objects.get( + project=source_plugin_context.project, plugin_name=SourcePluginName.RSS + ) assert ingestion_run.status == RunStatus.FAILED assert ingestion_run.error_message == "feed unavailable" @@ -232,7 +254,9 @@ def test_queue_content_skill_enqueues_relevance_task(source_plugin_context, mock delay_mock.assert_called_once_with(skill_result.id) -def test_queue_content_skill_executes_inline_when_eager(source_plugin_context, settings, mocker): +def test_queue_content_skill_executes_inline_when_eager( + source_plugin_context, settings, mocker +): settings.CELERY_TASK_ALWAYS_EAGER = True content = Content.objects.create( project=source_plugin_context.project, @@ -254,7 +278,9 @@ def test_queue_content_skill_executes_inline_when_eager(source_plugin_context, s delay_mock.assert_not_called() -def test_queue_content_skill_executes_summary_inline_when_eager(source_plugin_context, settings, mocker): +def test_queue_content_skill_executes_summary_inline_when_eager( + source_plugin_context, settings, mocker +): settings.CELERY_TASK_ALWAYS_EAGER = True content = Content.objects.create( project=source_plugin_context.project, @@ -277,7 +303,9 @@ def test_queue_content_skill_executes_summary_inline_when_eager(source_plugin_co delay_mock.assert_not_called() -def test_run_relevance_scoring_skill_updates_pending_result(source_plugin_context, mocker): +def test_run_relevance_scoring_skill_updates_pending_result( + source_plugin_context, mocker +): content = Content.objects.create( project=source_plugin_context.project, entity=source_plugin_context.entity, @@ -313,7 +341,9 @@ def test_run_relevance_scoring_skill_updates_pending_result(source_plugin_contex assert content.is_active is True -def 
test_run_summarization_skill_marks_result_failed_when_relevance_is_too_low(source_plugin_context, mocker): +def test_run_summarization_skill_marks_result_failed_when_relevance_is_too_low( + source_plugin_context, mocker +): content = Content.objects.create( project=source_plugin_context.project, entity=source_plugin_context.entity, @@ -338,7 +368,9 @@ def test_run_summarization_skill_marks_result_failed_when_relevance_is_too_low(s assert "Summarization requires relevance_score" in pending_result.error_message -def test_ingest_source_config_truncates_fields_and_processes_inline(source_plugin_context, settings, mocker): +def test_ingest_source_config_truncates_fields_and_processes_inline( + source_plugin_context, settings, mocker +): settings.CELERY_TASK_ALWAYS_EAGER = True plugin = mocker.Mock() plugin.fetch_new_content.return_value = [ diff --git a/core/tests/test_utils.py b/core/tests/test_utils.py index a074b670..bbc12168 100644 --- a/core/tests/test_utils.py +++ b/core/tests/test_utils.py @@ -17,11 +17,17 @@ def test_dashboard_callback_uses_zero_when_no_project_configs(): def test_dashboard_callback_rounds_average_authority_weight(django_user_model): - user = django_user_model.objects.create_user(username="utils-owner", password="testpass123") + user = django_user_model.objects.create_user( + username="utils-owner", password="testpass123" + ) group = Group.objects.create(name="utils-team") user.groups.add(group) - project_one = Project.objects.create(name="Utils Project 1", group=group, topic_description="Infra") - project_two = Project.objects.create(name="Utils Project 2", group=group, topic_description="Data") + project_one = Project.objects.create( + name="Utils Project 1", group=group, topic_description="Infra" + ) + project_two = Project.objects.create( + name="Utils Project 2", group=group, topic_description="Data" + ) ProjectConfig.objects.create(project=project_one, upvote_authority_weight=0.1234) ProjectConfig.objects.create(project=project_two, 
upvote_authority_weight=0.5678) diff --git a/core/urls.py b/core/urls.py index 64b858ae..5744a8fa 100644 --- a/core/urls.py +++ b/core/urls.py @@ -5,5 +5,9 @@ urlpatterns = [ path("healthz/", healthz_view, name="healthz"), path("readyz/", readyz_view, name="readyz"), - path("api/v1/inbound/confirm//", confirm_newsletter_sender_view, name="confirm-newsletter-sender"), + path( + "api/v1/inbound/confirm//", + confirm_newsletter_sender_view, + name="confirm-newsletter-sender", + ), ] diff --git a/core/utils.py b/core/utils.py index 817a1410..0a15ff70 100644 --- a/core/utils.py +++ b/core/utils.py @@ -5,10 +5,14 @@ def dashboard_callback(request, context): # Calculate the average authority weight across all projects. - avg_weight = ProjectConfig.objects.aggregate(Avg("upvote_authority_weight"))["upvote_authority_weight__avg"] + avg_weight = ProjectConfig.objects.aggregate(Avg("upvote_authority_weight"))[ + "upvote_authority_weight__avg" + ] # Add it to the template context - context.update({ - "avg_authority_weight": round(avg_weight, 2) if avg_weight else 0, - }) + context.update( + { + "avg_authority_weight": round(avg_weight, 2) if avg_weight else 0, + } + ) return context diff --git a/core/views.py b/core/views.py index 3d1191cb..33e68c10 100644 --- a/core/views.py +++ b/core/views.py @@ -1,3 +1,5 @@ +"""Operational and newsletter-intake views used outside the REST API.""" + from http import HTTPStatus from typing import cast @@ -17,10 +19,16 @@ def healthz_view(request): - return JsonResponse({"status": "ok", "service": "newsletter-maker"}, status=HTTPStatus.OK) + """Return a lightweight liveness response for load balancers and probes.""" + + return JsonResponse( + {"status": "ok", "service": "newsletter-maker"}, status=HTTPStatus.OK + ) def readyz_view(request): + """Return readiness status based on the database and Qdrant dependencies.""" + checks = { "database": _check_database(), "qdrant": _check_qdrant(), @@ -34,6 +42,8 @@ def readyz_view(request): def 
_check_database() -> bool: + """Verify the application can execute a trivial SQL query.""" + try: with connection.cursor() as cursor: cursor.execute("SELECT 1") @@ -44,8 +54,12 @@ def _check_database() -> bool: def _check_qdrant() -> bool: + """Verify the application can reach the configured Qdrant instance.""" + try: - client = QdrantClient(url=settings.QDRANT_URL, timeout=2, check_compatibility=False) + client = QdrantClient( + url=settings.QDRANT_URL, timeout=2, check_compatibility=False + ) client.get_collections() except Exception: return False @@ -54,6 +68,17 @@ def _check_qdrant() -> bool: @require_GET def confirm_newsletter_sender_view(request: HttpRequest, token: str): + """Confirm a sender and queue any pending newsletter intake rows. + + Args: + request: Incoming confirmation request. + token: Confirmation token stored on the allowlist entry. + + Returns: + A JSON response showing that the sender was confirmed and how many pending + intake rows were queued for processing. + """ + allowlist = get_object_or_404(IntakeAllowlist, confirmation_token=token) if allowlist.confirmed_at is None: allowlist.confirmed_at = timezone.now() diff --git a/frontend/eslint.config.mjs b/frontend/eslint.config.mjs index 2c3732b7..5981cdbd 100644 --- a/frontend/eslint.config.mjs +++ b/frontend/eslint.config.mjs @@ -8,6 +8,7 @@ export default tseslint.config( { ignores: [ ".next/**", + "coverage/**", "node_modules/**", "next-env.d.ts", "tsconfig.tsbuildinfo", diff --git a/frontend/src/app/__tests__/page.test.tsx b/frontend/src/app/__tests__/page.test.tsx new file mode 100644 index 00000000..e73f5a9f --- /dev/null +++ b/frontend/src/app/__tests__/page.test.tsx @@ -0,0 +1,404 @@ +import { render, screen } from "@testing-library/react" +import type { ReactNode } from "react" +import { beforeEach, describe, expect, it, vi } from "vitest" + +import type { + Content, + Entity, + Project, + ReviewQueueItem, + SourceConfig, + UserFeedback, +} from "@/lib/types" + +const { + 
buildDashboardViewMock, + getProjectContentsMock, + getProjectEntitiesMock, + getProjectFeedbackMock, + getProjectsMock, + getProjectReviewQueueMock, + getProjectSourceConfigsMock, + selectProjectMock, +} = vi.hoisted(() => ({ + buildDashboardViewMock: vi.fn(), + getProjectContentsMock: vi.fn(), + getProjectEntitiesMock: vi.fn(), + getProjectFeedbackMock: vi.fn(), + getProjectsMock: vi.fn(), + getProjectReviewQueueMock: vi.fn(), + getProjectSourceConfigsMock: vi.fn(), + selectProjectMock: vi.fn(), +})) + +vi.mock("@/components/app-shell", () => ({ + AppShell: ({ + children, + description, + title, + }: { + children: ReactNode + description: string + title: string + }) => ( +
+

{title}

+

{description}

+ {children} +
+ ), +})) + +vi.mock("@/components/status-badge", () => ({ + StatusBadge: ({ + children, + tone, + }: { + children: ReactNode + tone: string + }) => ( + + {children} + + ), +})) + +vi.mock("@/lib/api", () => ({ + getProjectContents: getProjectContentsMock, + getProjectEntities: getProjectEntitiesMock, + getProjectFeedback: getProjectFeedbackMock, + getProjects: getProjectsMock, + getProjectReviewQueue: getProjectReviewQueueMock, + getProjectSourceConfigs: getProjectSourceConfigsMock, +})) + +vi.mock("@/lib/dashboard-view", () => ({ + buildDashboardView: buildDashboardViewMock, +})) + +vi.mock("@/lib/view-helpers", async () => { + const actual = await vi.importActual( + "@/lib/view-helpers", + ) + + return { + ...actual, + selectProject: selectProjectMock, + } +}) + +function createProject(overrides: Partial = {}): Project { + return { + id: 1, + name: "AI Weekly", + group: 10, + topic_description: "AI news", + content_retention_days: 30, + created_at: "2026-04-01T00:00:00Z", + ...overrides, + } +} + +function createContent(overrides: Partial = {}): Content { + return { + id: 41, + project: 1, + url: "https://example.com/post", + title: "Useful AI briefing", + author: "Ada", + entity: null, + source_plugin: "rss", + content_type: "article", + published_date: "2026-04-28T09:00:00Z", + ingested_at: "2026-04-28T10:00:00Z", + content_text: "A long article body for the dashboard preview.", + relevance_score: 0.84, + embedding_id: "embed-1", + is_reference: false, + is_active: true, + ...overrides, + } +} + +function createReviewQueueItem( + overrides: Partial = {}, +): ReviewQueueItem { + return { + id: 7, + project: 1, + content: 41, + reason: "borderline_relevance", + confidence: 0.61, + created_at: "2026-04-28T12:00:00Z", + resolved: false, + resolution: "", + ...overrides, + } +} + +function createEntity(overrides: Partial = {}): Entity { + return { + id: 3, + project: 1, + name: "OpenAI", + type: "vendor", + description: "LLM vendor", + authority_score: 0.91, + 
website_url: "https://openai.com", + github_url: "", + linkedin_url: "", + bluesky_handle: "", + mastodon_handle: "", + twitter_handle: "openai", + created_at: "2026-04-28T09:30:00Z", + ...overrides, + } +} + +function createSourceConfig( + overrides: Partial = {}, +): SourceConfig { + return { + id: 2, + project: 1, + plugin_name: "rss", + config: { feed_url: "https://example.com/feed.xml" }, + is_active: true, + last_fetched_at: "2026-04-28T07:00:00Z", + ...overrides, + } +} + +function createFeedback(overrides: Partial = {}): UserFeedback { + return { + id: 9, + content: 41, + project: 1, + user: 2, + feedback_type: "upvote", + created_at: "2026-04-28T12:30:00Z", + ...overrides, + } +} + +function createDashboardView(overrides: Record = {}) { + return { + contentMap: new Map(), + contentTypeFilter: "", + contentTypes: [], + daysFilter: 30, + filteredContents: [], + negativeFeedback: 0, + pendingReviewItems: [], + positiveFeedback: 0, + sourceFilter: "", + sources: [], + view: "content", + ...overrides, + } +} + +async function loadHomePageModule() { + return import("../page") +} + +async function renderHomePage( + searchParams: Record = { + project: "1", + }, +) { + const { default: HomePage } = await loadHomePageModule() + + return render( + await HomePage({ + searchParams: Promise.resolve(searchParams), + }), + ) +} + +describe("HomePage", () => { + beforeEach(() => { + const defaultProject = createProject() + const contents = [createContent()] + const reviewQueue = [createReviewQueueItem()] + const entities = [createEntity()] + const sourceConfigs = [createSourceConfig()] + const feedback = [createFeedback()] + + getProjectsMock.mockReset() + getProjectContentsMock.mockReset() + getProjectReviewQueueMock.mockReset() + getProjectEntitiesMock.mockReset() + getProjectSourceConfigsMock.mockReset() + getProjectFeedbackMock.mockReset() + buildDashboardViewMock.mockReset() + selectProjectMock.mockReset() + + getProjectsMock.mockResolvedValue([defaultProject]) + 
getProjectContentsMock.mockResolvedValue(contents) + getProjectReviewQueueMock.mockResolvedValue(reviewQueue) + getProjectEntitiesMock.mockResolvedValue(entities) + getProjectSourceConfigsMock.mockResolvedValue(sourceConfigs) + getProjectFeedbackMock.mockResolvedValue(feedback) + selectProjectMock.mockImplementation((projects: Project[]) => { + return projects[0] ?? null + }) + buildDashboardViewMock.mockReturnValue( + createDashboardView({ + contentMap: new Map([[41, contents[0]]]), + contentTypes: ["article"], + filteredContents: contents, + pendingReviewItems: reviewQueue, + positiveFeedback: 1, + sources: ["rss"], + }), + ) + }) + + it("renders the no-project empty state and skips project-scoped requests", async () => { + getProjectsMock.mockResolvedValue([]) + selectProjectMock.mockReturnValue(null) + + await renderHomePage({}) + + expect(selectProjectMock).toHaveBeenCalledWith([], {}) + expect( + screen.getByText( + "Create a project in Django admin first, then come back here to review ingested content.", + ), + ).toBeInTheDocument() + expect( + screen.getByText("No projects are available for the configured API user."), + ).toBeInTheDocument() + expect(getProjectContentsMock).not.toHaveBeenCalled() + expect(buildDashboardViewMock).not.toHaveBeenCalled() + }) + + it("renders the content view with summaries, flash messages, and content cards", async () => { + const content = createContent({ + title: "Useful AI briefing", + is_reference: true, + is_active: false, + relevance_score: 0.84, + }) + const reviewItem = createReviewQueueItem({ content: content.id }) + const feedback = [ + createFeedback({ feedback_type: "upvote", content: content.id }), + createFeedback({ + id: 10, + feedback_type: "downvote", + content: content.id, + }), + ] + const sourceConfigs = [ + createSourceConfig({ is_active: true }), + createSourceConfig({ id: 3, plugin_name: "reddit", is_active: false }), + ] + + getProjectContentsMock.mockResolvedValue([content]) + 
getProjectReviewQueueMock.mockResolvedValue([reviewItem]) + getProjectSourceConfigsMock.mockResolvedValue(sourceConfigs) + getProjectFeedbackMock.mockResolvedValue(feedback) + buildDashboardViewMock.mockReturnValue( + createDashboardView({ + contentMap: new Map([[content.id, content]]), + contentTypeFilter: "article", + contentTypes: ["article"], + filteredContents: [content], + negativeFeedback: 1, + pendingReviewItems: [reviewItem], + positiveFeedback: 1, + sourceFilter: "rss", + sources: ["reddit", "rss"], + view: "content", + }), + ) + + await renderHomePage({ + contentType: "article", + error: "Filter failed", + message: "Filters applied", + project: "1", + source: "rss", + view: "content", + }) + + expect(buildDashboardViewMock).toHaveBeenCalledWith({ + contents: [content], + feedback, + reviewQueue: [reviewItem], + searchParams: { + contentType: "article", + error: "Filter failed", + message: "Filters applied", + project: "1", + source: "rss", + view: "content", + }, + }) + expect(screen.getByText("Filter failed")).toBeInTheDocument() + expect(screen.getByText("Filters applied")).toBeInTheDocument() + expect(screen.getByText("Useful AI briefing")).toBeInTheDocument() + expect(screen.getByText("1/1")).toBeInTheDocument() + expect( + screen.getAllByText("1", { selector: "p.mt-1.text-3xl.font-bold" }), + ).toHaveLength(5) + expect(screen.getByText("reference")).toBeInTheDocument() + expect(screen.getByText("archived")).toBeInTheDocument() + + const badges = screen.getAllByTestId("status-badge") + expect(badges).toHaveLength(1) + expect(badges[0]).toHaveAttribute("data-tone", "positive") + expect(badges[0]).toHaveTextContent("Relevance 0.84") + }) + + it("renders the empty content state when no content matches the current filters", async () => { + buildDashboardViewMock.mockReturnValue( + createDashboardView({ + filteredContents: [], + pendingReviewItems: [], + view: "content", + }), + ) + + await renderHomePage({ project: "1" }) + + expect( + 
screen.getByText("No content matched the current filters."), + ).toBeInTheDocument() + }) + + it("renders the review view empty state when there are no unresolved items", async () => { + buildDashboardViewMock.mockReturnValue( + createDashboardView({ + pendingReviewItems: [], + view: "review", + }), + ) + + await renderHomePage({ project: "1", view: "review" }) + + expect( + screen.getByText("No unresolved review items for this project right now."), + ).toBeInTheDocument() + }) + + it("renders the review table with fallback content labels when content metadata is missing", async () => { + const reviewItem = createReviewQueueItem({ id: 14, content: 99 }) + buildDashboardViewMock.mockReturnValue( + createDashboardView({ + contentMap: new Map(), + pendingReviewItems: [reviewItem], + view: "review", + }), + ) + + await renderHomePage({ project: "1", view: "review" }) + + expect(screen.getByText("Content #99", { selector: "strong" })).toBeInTheDocument() + expect(screen.getByText("unknown source")).toBeInTheDocument() + expect(screen.getByText("unclassified")).toBeInTheDocument() + expect(screen.getByRole("button", { name: "Approve" })).toBeInTheDocument() + expect(screen.getByRole("button", { name: "Reject" })).toBeInTheDocument() + }) +}) diff --git a/frontend/src/app/admin/health/__tests__/page.test.tsx b/frontend/src/app/admin/health/__tests__/page.test.tsx new file mode 100644 index 00000000..c4b3f9ad --- /dev/null +++ b/frontend/src/app/admin/health/__tests__/page.test.tsx @@ -0,0 +1,271 @@ +import { render, screen } from "@testing-library/react" +import type { ReactNode } from "react" +import { beforeEach, describe, expect, it, vi } from "vitest" + +import type { IngestionRun, Project, SourceConfig } from "@/lib/types" + +const { + getProjectIngestionRunsMock, + getProjectsMock, + getProjectSourceConfigsMock, + selectProjectMock, +} = vi.hoisted(() => ({ + getProjectIngestionRunsMock: vi.fn(), + getProjectsMock: vi.fn(), + getProjectSourceConfigsMock: vi.fn(), + 
selectProjectMock: vi.fn(), +})) + +vi.mock("@/components/app-shell", () => ({ + AppShell: ({ + children, + description, + title, + }: { + children: ReactNode + description: string + title: string + }) => ( +
+

{title}

+

{description}

+ {children} +
+ ), +})) + +vi.mock("@/components/status-badge", () => ({ + StatusBadge: ({ + children, + tone, + }: { + children: ReactNode + tone: string + }) => ( + + {children} + + ), +})) + +vi.mock("@/lib/api", () => ({ + getProjectIngestionRuns: getProjectIngestionRunsMock, + getProjects: getProjectsMock, + getProjectSourceConfigs: getProjectSourceConfigsMock, +})) + +vi.mock("@/lib/view-helpers", async () => { + const actual = await vi.importActual( + "@/lib/view-helpers", + ) + + return { + ...actual, + selectProject: selectProjectMock, + } +}) + +function createProject(overrides: Partial = {}): Project { + return { + id: 1, + name: "AI Weekly", + group: 10, + topic_description: "AI news", + content_retention_days: 30, + created_at: "2026-04-01T00:00:00Z", + ...overrides, + } +} + +function createSourceConfig( + overrides: Partial = {}, +): SourceConfig { + return { + id: 7, + project: 1, + plugin_name: "rss", + config: { feed_url: "https://example.com/feed.xml" }, + is_active: true, + last_fetched_at: "2026-04-28T08:00:00Z", + ...overrides, + } +} + +function createIngestionRun( + overrides: Partial = {}, +): IngestionRun { + return { + id: 22, + project: 1, + plugin_name: "rss", + started_at: "2026-04-28T09:00:00Z", + completed_at: "2026-04-28T09:03:00Z", + status: "success", + items_fetched: 12, + items_ingested: 9, + error_message: "", + ...overrides, + } +} + +async function loadHealthPageModule() { + return import("../page") +} + +async function renderHealthPage( + searchParams: Record = { + project: "1", + }, +) { + const { default: HealthPage } = await loadHealthPageModule() + + return render( + await HealthPage({ + searchParams: Promise.resolve(searchParams), + }), + ) +} + +describe("deriveSourceStatus", () => { + it('returns "idle" for inactive sources', async () => { + const { deriveSourceStatus } = await loadHealthPageModule() + + expect(deriveSourceStatus(false, "success", "2026-04-28T08:00:00Z")).toBe( + "idle", + ) + }) + + it('returns "failing" for 
failed ingestion runs', async () => { + const { deriveSourceStatus } = await loadHealthPageModule() + + expect(deriveSourceStatus(true, "failed", "2026-04-28T08:00:00Z")).toBe( + "failing", + ) + }) + + it('returns "degraded" for running ingestion runs', async () => { + const { deriveSourceStatus } = await loadHealthPageModule() + + expect(deriveSourceStatus(true, "running", "2026-04-28T08:00:00Z")).toBe( + "degraded", + ) + }) + + it('returns "degraded" when the source has never fetched', async () => { + const { deriveSourceStatus } = await loadHealthPageModule() + + expect(deriveSourceStatus(true, null, null)).toBe("degraded") + }) + + it('returns "healthy" when the source is active and has successful history', async () => { + const { deriveSourceStatus } = await loadHealthPageModule() + + expect(deriveSourceStatus(true, "success", "2026-04-28T08:00:00Z")).toBe( + "healthy", + ) + }) +}) + +describe("HealthPage", () => { + beforeEach(() => { + const defaultProject = createProject() + + getProjectsMock.mockReset() + getProjectSourceConfigsMock.mockReset() + getProjectIngestionRunsMock.mockReset() + selectProjectMock.mockReset() + + getProjectsMock.mockResolvedValue([defaultProject]) + getProjectSourceConfigsMock.mockResolvedValue([]) + getProjectIngestionRunsMock.mockResolvedValue([]) + selectProjectMock.mockImplementation((projects: Project[]) => { + return projects[0] ?? 
null + }) + }) + + it("renders the no-project empty state and skips project-scoped API calls", async () => { + getProjectsMock.mockResolvedValue([]) + selectProjectMock.mockReturnValue(null) + + await renderHealthPage({}) + + expect(selectProjectMock).toHaveBeenCalledWith([], {}) + expect( + screen.getByText("No project found for this API user."), + ).toBeInTheDocument() + expect( + screen.getByText("Create a project first in Django admin."), + ).toBeInTheDocument() + expect(getProjectSourceConfigsMock).not.toHaveBeenCalled() + expect(getProjectIngestionRunsMock).not.toHaveBeenCalled() + }) + + it("renders an empty source-configurations row when the project has no sources", async () => { + await renderHealthPage() + + expect( + screen.getByText("No source configurations exist for this project yet."), + ).toBeInTheDocument() + expect(getProjectSourceConfigsMock).toHaveBeenCalledWith(1) + expect(getProjectIngestionRunsMock).toHaveBeenCalledWith(1) + }) + + it("shows a no-runs message for sources without ingestion history", async () => { + getProjectSourceConfigsMock.mockResolvedValue([ + createSourceConfig({ plugin_name: "reddit" }), + ]) + + await renderHealthPage() + + expect(screen.getByText("reddit", { selector: "strong" })).toBeInTheDocument() + expect(screen.getByText("No runs yet")).toBeInTheDocument() + }) + + it("passes the resolved search params to selectProject and renders source names", async () => { + const projects = [createProject({ id: 2, name: "Data Signals" })] + getProjectsMock.mockResolvedValue(projects) + selectProjectMock.mockReturnValue(projects[0]) + getProjectSourceConfigsMock.mockResolvedValue([ + createSourceConfig({ project: 2, plugin_name: "rss" }), + ]) + + await renderHealthPage({ project: "2" }) + + expect(selectProjectMock).toHaveBeenCalledWith(projects, { project: "2" }) + expect(screen.getByText("rss", { selector: "strong" })).toBeInTheDocument() + }) + + it("maps derived health states to badge tones and labels", async () => { + 
const selectedProject = createProject({ id: 3 }) + getProjectsMock.mockResolvedValue([selectedProject]) + selectProjectMock.mockReturnValue(selectedProject) + getProjectSourceConfigsMock.mockResolvedValue([ + createSourceConfig({ + id: 1, + project: 3, + plugin_name: "rss", + last_fetched_at: "2026-04-28T08:00:00Z", + }), + createSourceConfig({ + id: 2, + project: 3, + plugin_name: "reddit", + last_fetched_at: "2026-04-28T08:00:00Z", + }), + ]) + getProjectIngestionRunsMock.mockResolvedValue([ + createIngestionRun({ project: 3, plugin_name: "rss", status: "success" }), + createIngestionRun({ project: 3, plugin_name: "reddit", status: "failed" }), + ]) + + await renderHealthPage({ project: "3" }) + + const badges = screen.getAllByTestId("status-badge") + + expect(badges).toHaveLength(2) + expect(badges[0]).toHaveAttribute("data-tone", "positive") + expect(badges[0]).toHaveTextContent("healthy") + expect(badges[1]).toHaveAttribute("data-tone", "negative") + expect(badges[1]).toHaveTextContent("failing") + }) +}) diff --git a/frontend/src/app/admin/health/page.tsx b/frontend/src/app/admin/health/page.tsx index 33c385b1..57e7d57b 100644 --- a/frontend/src/app/admin/health/page.tsx +++ b/frontend/src/app/admin/health/page.tsx @@ -12,7 +12,19 @@ type HealthPageProps = { searchParams: Promise> } -function deriveSourceStatus( +/** + * Classify a source configuration into the badge status shown on the admin health page. + * + * The page treats disabled sources as `idle`, active sources whose latest run failed as + * `failing`, and active sources that are still running or have no fetch timestamp as `degraded`, + * so operators can spot missing history before the source silently stalls. + * + * @param isActive - Whether the source configuration is enabled for ingestion. + * @param latestRunStatus - Status of the newest ingestion run for the same plugin, or `null` when no run exists. 
+ * @param lastFetchedAt - ISO timestamp for the last successful fetch, or `null` when the source has not fetched yet. + * @returns The health badge state for the source row. + */ +export function deriveSourceStatus( isActive: boolean, latestRunStatus: string | null, lastFetchedAt: string | null, @@ -32,6 +44,18 @@ function deriveSourceStatus( return "healthy" } +/** + * Render the source-by-source ingestion health view for the selected project. + * + * The page resolves the active project from the URL search params, loads source + * configurations and their most recent ingestion runs, and then maps those records to + * a compact health table. When the API user has no available project, the page renders + * a safe empty state instead of attempting project-scoped API calls. + * + * @param props - Async server component props from the App Router. + * @param props.searchParams - Search params promise containing the optional `project` selector. + * @returns The rendered admin health page for the selected project or the empty project state. + */ export default async function HealthPage({ searchParams }: HealthPageProps) { const resolvedSearchParams = await searchParams const projects = await getProjects() @@ -45,7 +69,7 @@ export default async function HealthPage({ searchParams }: HealthPageProps) { projects={[]} selectedProjectId={null} > -
+
Create a project first in Django admin.
@@ -88,7 +112,7 @@ export default async function HealthPage({ searchParams }: HealthPageProps) { {sourceConfigs.length === 0 ? ( -
+
No source configurations exist for this project yet.
diff --git a/frontend/src/app/admin/sources/__tests__/page.test.tsx b/frontend/src/app/admin/sources/__tests__/page.test.tsx new file mode 100644 index 00000000..192791b9 --- /dev/null +++ b/frontend/src/app/admin/sources/__tests__/page.test.tsx @@ -0,0 +1,278 @@ +import { render, screen } from "@testing-library/react" +import type { ReactNode } from "react" +import { beforeEach, describe, expect, it, vi } from "vitest" + +import type { IngestionRun, Project, SourceConfig } from "@/lib/types" + +const { + getProjectIngestionRunsMock, + getProjectsMock, + getProjectSourceConfigsMock, + selectProjectMock, +} = vi.hoisted(() => ({ + getProjectIngestionRunsMock: vi.fn(), + getProjectsMock: vi.fn(), + getProjectSourceConfigsMock: vi.fn(), + selectProjectMock: vi.fn(), +})) + +vi.mock("@/components/app-shell", () => ({ + AppShell: ({ + children, + description, + title, + }: { + children: ReactNode + description: string + title: string + }) => ( +
+

{title}

+

{description}

+ {children} +
+ ), +})) + +vi.mock("@/components/status-badge", () => ({ + StatusBadge: ({ + children, + tone, + }: { + children: ReactNode + tone: string + }) => ( + + {children} + + ), +})) + +vi.mock("@/lib/api", () => ({ + getProjectIngestionRuns: getProjectIngestionRunsMock, + getProjects: getProjectsMock, + getProjectSourceConfigs: getProjectSourceConfigsMock, +})) + +vi.mock("@/lib/view-helpers", async () => { + const actual = await vi.importActual( + "@/lib/view-helpers", + ) + + return { + ...actual, + selectProject: selectProjectMock, + } +}) + +function createProject(overrides: Partial = {}): Project { + return { + id: 1, + name: "AI Weekly", + group: 10, + topic_description: "AI news", + content_retention_days: 30, + created_at: "2026-04-01T00:00:00Z", + ...overrides, + } +} + +function createSourceConfig( + overrides: Partial = {}, +): SourceConfig { + return { + id: 7, + project: 1, + plugin_name: "rss", + config: { feed_url: "https://example.com/feed.xml" }, + is_active: true, + last_fetched_at: "2026-04-28T08:00:00Z", + ...overrides, + } +} + +function createIngestionRun( + overrides: Partial = {}, +): IngestionRun { + return { + id: 22, + project: 1, + plugin_name: "rss", + started_at: "2026-04-28T09:00:00Z", + completed_at: "2026-04-28T09:03:00Z", + status: "success", + items_fetched: 12, + items_ingested: 9, + error_message: "", + ...overrides, + } +} + +async function loadSourcesPageModule() { + return import("../page") +} + +async function renderSourcesPage( + searchParams: Record = { + project: "1", + }, +) { + const { default: SourcesPage } = await loadSourcesPageModule() + + return render( + await SourcesPage({ + searchParams: Promise.resolve(searchParams), + }), + ) +} + +describe("buildLatestRunByPlugin", () => { + it("keeps the first run seen for each plugin", async () => { + const { buildLatestRunByPlugin } = await loadSourcesPageModule() + const newestRssRun = createIngestionRun({ id: 100, plugin_name: "rss" }) + const olderRssRun = 
createIngestionRun({ id: 90, plugin_name: "rss" }) + const redditRun = createIngestionRun({ id: 80, plugin_name: "reddit" }) + + const latestRunByPlugin = buildLatestRunByPlugin([ + newestRssRun, + olderRssRun, + redditRun, + ]) + + expect(latestRunByPlugin.get("rss")).toEqual(newestRssRun) + expect(latestRunByPlugin.get("reddit")).toEqual(redditRun) + }) +}) + +describe("SourcesPage", () => { + beforeEach(() => { + const defaultProject = createProject() + + getProjectsMock.mockReset() + getProjectSourceConfigsMock.mockReset() + getProjectIngestionRunsMock.mockReset() + selectProjectMock.mockReset() + + getProjectsMock.mockResolvedValue([defaultProject]) + getProjectSourceConfigsMock.mockResolvedValue([]) + getProjectIngestionRunsMock.mockResolvedValue([]) + selectProjectMock.mockImplementation((projects: Project[]) => { + return projects[0] ?? null + }) + }) + + it("renders the no-project empty state and skips project-scoped API calls", async () => { + getProjectsMock.mockResolvedValue([]) + selectProjectMock.mockReturnValue(null) + + await renderSourcesPage({}) + + expect(selectProjectMock).toHaveBeenCalledWith([], {}) + expect( + screen.getByText("No project found for this API user."), + ).toBeInTheDocument() + expect( + screen.getByText("Create a project first in Django admin."), + ).toBeInTheDocument() + expect(getProjectSourceConfigsMock).not.toHaveBeenCalled() + expect(getProjectIngestionRunsMock).not.toHaveBeenCalled() + }) + + it("renders flash messages from the search params", async () => { + await renderSourcesPage({ + error: "Could not save source", + message: "Source saved", + project: "1", + }) + + expect(selectProjectMock).toHaveBeenCalledWith( + [expect.objectContaining({ id: 1 })], + { + error: "Could not save source", + message: "Source saved", + project: "1", + }, + ) + expect(screen.getByText("Could not save source")).toBeInTheDocument() + expect(screen.getByText("Source saved")).toBeInTheDocument() + }) + + it("renders the empty source-config 
state when the project has no sources", async () => { + await renderSourcesPage() + + expect( + screen.getByText("No source configurations exist for this project yet."), + ).toBeInTheDocument() + expect(getProjectSourceConfigsMock).toHaveBeenCalledWith(1) + expect(getProjectIngestionRunsMock).toHaveBeenCalledWith(1) + }) + + it("renders source cards with badge tones and the latest run summary", async () => { + const selectedProject = createProject({ id: 3 }) + getProjectsMock.mockResolvedValue([selectedProject]) + selectProjectMock.mockReturnValue(selectedProject) + getProjectSourceConfigsMock.mockResolvedValue([ + createSourceConfig({ + id: 1, + project: 3, + plugin_name: "rss", + is_active: true, + }), + createSourceConfig({ + id: 2, + project: 3, + plugin_name: "reddit", + is_active: false, + }), + ]) + getProjectIngestionRunsMock.mockResolvedValue([ + createIngestionRun({ + id: 9, + project: 3, + plugin_name: "rss", + status: "success", + }), + createIngestionRun({ + id: 8, + project: 3, + plugin_name: "rss", + status: "failed", + }), + createIngestionRun({ + id: 7, + project: 3, + plugin_name: "reddit", + status: "failed", + error_message: "Rate limited", + }), + ]) + + await renderSourcesPage({ project: "3" }) + + expect(screen.getByRole("heading", { name: "rss" })).toBeInTheDocument() + expect( + screen.getByRole("heading", { name: "reddit" }), + ).toBeInTheDocument() + expect(screen.getByText("Latest run: success")).toBeInTheDocument() + expect(screen.getByText("Latest run: failed")).toBeInTheDocument() + expect(screen.getByText("Rate limited")).toBeInTheDocument() + + const badges = screen.getAllByTestId("status-badge") + expect(badges).toHaveLength(2) + expect(badges[0]).toHaveAttribute("data-tone", "positive") + expect(badges[0]).toHaveTextContent("active") + expect(badges[1]).toHaveAttribute("data-tone", "neutral") + expect(badges[1]).toHaveTextContent("disabled") + }) + + it("shows fallback latest-run text when a source has no ingestion history", async 
() => { + getProjectSourceConfigsMock.mockResolvedValue([ + createSourceConfig({ plugin_name: "reddit" }), + ]) + + await renderSourcesPage({ project: "1" }) + + expect(screen.getByText("Latest run: none")).toBeInTheDocument() + expect(screen.getByText("No recent error")).toBeInTheDocument() + }) +}) diff --git a/frontend/src/app/admin/sources/page.tsx b/frontend/src/app/admin/sources/page.tsx index b50b5ebe..69ad525e 100644 --- a/frontend/src/app/admin/sources/page.tsx +++ b/frontend/src/app/admin/sources/page.tsx @@ -16,6 +16,40 @@ type SourcesPageProps = { searchParams: Promise> } +/** + * Build a per-plugin lookup of the newest ingestion run from the records already returned by the API. + * + * The ingestion runs list is expected to arrive in newest-first order. This helper keeps the + * first run seen for each plugin so the page can show one concise status summary beside each + * source configuration without re-sorting or scanning the array repeatedly. + * + * @param ingestionRuns - Ingestion history for the selected project; must be ordered newest first because this helper does not sort. + * @returns A map keyed by plugin name with the first run seen for each plugin, which is the latest run when the input is newest first. + */ +export function buildLatestRunByPlugin( + ingestionRuns: Awaited>, +) { + const latestRunByPlugin = new Map() + for (const ingestionRun of ingestionRuns) { + if (!latestRunByPlugin.has(ingestionRun.plugin_name)) { + latestRunByPlugin.set(ingestionRun.plugin_name, ingestionRun) + } + } + return latestRunByPlugin +} + +/** + * Render the source-configuration admin page for the selected project. + * + * The page resolves the active project from the URL, shows any success or error flash messages + * returned from the source-config routes, and renders both the create form and the editable list + * of existing source configurations. When no project is available, it renders a guarded empty + * state instead of issuing project-scoped API requests. + * + * @param props - Async server component props from the App Router. 
+ * @param props.searchParams - Search params promise containing the optional `project`, `message`, and `error` values. + * @returns The rendered source configuration admin page or the no-project empty state. + */ export default async function SourcesPage({ searchParams }: SourcesPageProps) { const resolvedSearchParams = await searchParams const projects = await getProjects() @@ -29,7 +63,7 @@ export default async function SourcesPage({ searchParams }: SourcesPageProps) { projects={[]} selectedProjectId={null} > -
+
Create a project first in Django admin.
@@ -40,12 +74,7 @@ export default async function SourcesPage({ searchParams }: SourcesPageProps) { getProjectSourceConfigs(selectedProject.id), getProjectIngestionRuns(selectedProject.id), ]) - const latestRunByPlugin = new Map() - for (const ingestionRun of ingestionRuns) { - if (!latestRunByPlugin.has(ingestionRun.plugin_name)) { - latestRunByPlugin.set(ingestionRun.plugin_name, ingestionRun) - } - } + const latestRunByPlugin = buildLatestRunByPlugin(ingestionRuns) const errorMessage = getErrorMessage(resolvedSearchParams) const successMessage = getSuccessMessage(resolvedSearchParams) @@ -58,10 +87,10 @@ export default async function SourcesPage({ searchParams }: SourcesPageProps) { selectedProjectId={selectedProject.id} > {errorMessage ? ( -
{errorMessage}
+
{errorMessage}
) : null} {successMessage ? ( -
{successMessage}
+
{successMessage}
) : null}
@@ -92,7 +121,7 @@ export default async function SourcesPage({ searchParams }: SourcesPageProps) {