From 91f791527065d59a601373b43d0cc1680315cf3d Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Mon, 11 May 2026 16:32:50 -0700 Subject: [PATCH 01/12] [HOP-60] Added Models and registered in admin for 3 obj types --- hospexplorer/ask/admin.py | 54 ++++++++++- ...authorinstitution_documenttype_and_more.py | 94 +++++++++++++++++++ hospexplorer/ask/models.py | 64 +++++++++++++ 3 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 hospexplorer/ask/migrations/0013_documentauthorinstitution_documenttype_and_more.py diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index cd99b3e..3cf6c5c 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -6,7 +6,17 @@ from django.contrib.auth.models import User from django.db import transaction -from ask.models import Conversation, TermsAcceptance, QARecord, SimWorkflow, WebsiteResource, PDFResource +from ask.models import ( + Conversation, + TermsAcceptance, + QARecord, + SimWorkflow, + WebsiteResource, + PDFResource, + DocumentType, + DocumentAuthorInstitution, + InstitutionType, +) from ask.kb_connector import delete_kb_document from ask.tasks import run_kb_resource_upload @@ -181,16 +191,46 @@ def delete_queryset(self, request, queryset): return +@admin.register(DocumentType) +class DocumentTypeAdmin(admin.ModelAdmin): + list_display = ("name",) + search_fields = ("name",) + + +@admin.register(DocumentAuthorInstitution) +class DocumentAuthorInstitutionAdmin(admin.ModelAdmin): + list_display = ("name",) + search_fields = ("name",) + + +@admin.register(InstitutionType) +class InstitutionTypeAdmin(admin.ModelAdmin): + list_display = ("name",) + search_fields = ("name",) + + @admin.register(WebsiteResource) class WebsiteResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): list_display = ("title", "url", "creator", "status", "modified_at") list_filter = ("status",) search_fields = ("title", "url") readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", 
"status", "status_message") + fieldsets = ( + (None, {"fields": ("title", "description", "url")}), + ("Metadata", {"fields": ( + "date_published", "date_published_precision", + "document_type", "document_author_institution", "institution_type", + )}), + ("Status", {"fields": ( + "status", "status_message", "mcp_kb_document_id", + "created_at", "modified_at", "creator", "modifier", + )}), + ) help_texts = { "title": "A short name to identify this website resource.", "description": "Optional details about what this website covers.", "url": "The URL the LLM will use as context when answering questions.", + "date_published_precision": "Granularity of the date above (year / month / day). Leave blank if unknown.", } def get_form(self, request, obj=None, **kwargs): @@ -230,10 +270,22 @@ class PDFResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): list_filter = ("status",) search_fields = ("title",) readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", "status", "status_message") + fieldsets = ( + (None, {"fields": ("title", "description", "file")}), + ("Metadata", {"fields": ( + "date_published", "date_published_precision", + "document_type", "document_author_institution", "institution_type", + )}), + ("Status", {"fields": ( + "status", "status_message", "mcp_kb_document_id", + "created_at", "modified_at", "creator", "modifier", + )}), + ) help_texts = { "title": "A short name to identify this PDF resource.", "description": "Optional details about what this PDF covers.", "file": "The PDF file the LLM will use as context when answering questions.", + "date_published_precision": "Granularity of the date above (year / month / day). 
Leave blank if unknown.", } def get_form(self, request, obj=None, **kwargs): diff --git a/hospexplorer/ask/migrations/0013_documentauthorinstitution_documenttype_and_more.py b/hospexplorer/ask/migrations/0013_documentauthorinstitution_documenttype_and_more.py new file mode 100644 index 0000000..dfe1411 --- /dev/null +++ b/hospexplorer/ask/migrations/0013_documentauthorinstitution_documenttype_and_more.py @@ -0,0 +1,94 @@ +# Generated by Django 6.0.2 on 2026-05-11 23:30 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('ask', '0012_pdfresource_status_pdfresource_status_message_and_more'), + ] + + operations = [ + migrations.CreateModel( + name='DocumentAuthorInstitution', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ], + options={ + 'ordering': ['name'], + }, + ), + migrations.CreateModel( + name='DocumentType', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ], + options={ + 'ordering': ['name'], + }, + ), + migrations.CreateModel( + name='InstitutionType', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=255, unique=True)), + ], + options={ + 'ordering': ['name'], + }, + ), + migrations.AddField( + model_name='pdfresource', + name='date_published', + field=models.DateField(blank=True, null=True), + ), + migrations.AddField( + model_name='pdfresource', + name='date_published_precision', + field=models.CharField(blank=True, choices=[('year', 'Year'), ('month', 'Month'), ('day', 'Day')], default='', max_length=10), + ), + migrations.AddField( + model_name='websiteresource', + name='date_published', + 
field=models.DateField(blank=True, null=True), + ), + migrations.AddField( + model_name='websiteresource', + name='date_published_precision', + field=models.CharField(blank=True, choices=[('year', 'Year'), ('month', 'Month'), ('day', 'Day')], default='', max_length=10), + ), + migrations.AddField( + model_name='pdfresource', + name='document_author_institution', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.documentauthorinstitution'), + ), + migrations.AddField( + model_name='websiteresource', + name='document_author_institution', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.documentauthorinstitution'), + ), + migrations.AddField( + model_name='pdfresource', + name='document_type', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.documenttype'), + ), + migrations.AddField( + model_name='websiteresource', + name='document_type', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.documenttype'), + ), + migrations.AddField( + model_name='pdfresource', + name='institution_type', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.institutiontype'), + ), + migrations.AddField( + model_name='websiteresource', + name='institution_type', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='%(class)s_resources', to='ask.institutiontype'), + ), + ] diff --git a/hospexplorer/ask/models.py b/hospexplorer/ask/models.py index 70fa94b..e51d420 100644 --- a/hospexplorer/ask/models.py +++ b/hospexplorer/ask/models.py @@ -3,6 +3,36 @@ from django.conf import settings from django.db 
import models +class DocumentType(models.Model): + name = models.CharField(max_length=255, unique=True) + + class Meta: + ordering = ["name"] + + def __str__(self): + return self.name + + +class DocumentAuthorInstitution(models.Model): + name = models.CharField(max_length=255, unique=True) + + class Meta: + ordering = ["name"] + + def __str__(self): + return self.name + + +class InstitutionType(models.Model): + name = models.CharField(max_length=255, unique=True) + + class Meta: + ordering = ["name"] + + def __str__(self): + return self.name + + # Abstract Model, fields are inherited by subclasses class Resource(models.Model): class Status(models.TextChoices): @@ -11,6 +41,11 @@ class Status(models.TextChoices): ERROR = "error", "Error" WARNING = "warning", "Warning" + class DatePrecision(models.TextChoices): + YEAR = "year", "Year" + MONTH = "month", "Month" + DAY = "day", "Day" + title = models.CharField(max_length=255) description = models.TextField(blank=True, default="") creator = models.ForeignKey( @@ -34,6 +69,35 @@ class Status(models.TextChoices): ) status_message = models.TextField(blank=True, default="") + date_published = models.DateField(null=True, blank=True) + date_published_precision = models.CharField( + max_length=10, + choices=DatePrecision.choices, + blank=True, + default="", + ) + document_type = models.ForeignKey( + "DocumentType", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="%(class)s_resources", + ) + document_author_institution = models.ForeignKey( + "DocumentAuthorInstitution", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="%(class)s_resources", + ) + institution_type = models.ForeignKey( + "InstitutionType", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="%(class)s_resources", + ) + class Meta: abstract = True From 1eaf9b597e921dde66f744edcc2fe021c94eb716 Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Tue, 12 May 2026 14:20:05 -0700 Subject: [PATCH 02/12] [HOP-63] 
Added csv templates for admin, added csv import funcs with edge cases, incorporated metadata field in resources abstract model and added it in website and pdf resources --- hospexplorer/ask/admin.py | 62 ++++++++++++++++++- hospexplorer/ask/admin_csv.py | 26 ++++++++ hospexplorer/ask/kb_connector.py | 17 +++-- hospexplorer/ask/tasks.py | 24 ++++++- .../admin/ask/lookup_change_list.html | 9 +++ .../admin/ask/lookup_csv_import.html | 27 ++++++++ 6 files changed, 154 insertions(+), 11 deletions(-) create mode 100644 hospexplorer/ask/admin_csv.py create mode 100644 hospexplorer/ask/templates/admin/ask/lookup_change_list.html create mode 100644 hospexplorer/ask/templates/admin/ask/lookup_csv_import.html diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index 3cf6c5c..6ee1a97 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -5,6 +5,9 @@ from django.contrib.auth.admin import UserAdmin from django.contrib.auth.models import User from django.db import transaction +from django.http import HttpResponseRedirect +from django.shortcuts import render +from django.urls import path, reverse from ask.models import ( Conversation, @@ -17,6 +20,7 @@ DocumentAuthorInstitution, InstitutionType, ) +from ask.admin_csv import import_names_csv from ask.kb_connector import delete_kb_document from ask.tasks import run_kb_resource_upload @@ -191,20 +195,72 @@ def delete_queryset(self, request, queryset): return +class LookupCSVImportMixin: + """Adds an Import CSV button + upload view to a lookup ModelAdmin. + + CSV is single-column name. Duplicates are skipped, header row optional. 
+ """ + + change_list_template = "admin/ask/lookup_change_list.html" + + def get_urls(self): + urls = super().get_urls() + info = (self.model._meta.app_label, self.model._meta.model_name) + return [ + path( + "import-csv/", + self.admin_site.admin_view(self.import_csv_view), + name=f"{info[0]}_{info[1]}_import_csv", + ), + ] + urls + + def import_csv_view(self, request): + info = (self.model._meta.app_label, self.model._meta.model_name) + changelist_url = reverse(f"admin:{info[0]}_{info[1]}_changelist") + + if request.method == "POST": + file_obj = request.FILES.get("csv_file") + if file_obj is None: + self.message_user(request, "No file provided.", level="error") + elif not file_obj.name.lower().endswith(".csv"): + self.message_user(request, "File must have a .csv extension.", level="error") + else: + try: + created, skipped = import_names_csv(self.model, file_obj) + except Exception as e: + logger.exception("CSV import failed for %s", self.model.__name__) + self.message_user(request, f"Import failed: {e}", level="error") + else: + self.message_user( + request, + f"Imported {created} new {self.model._meta.verbose_name_plural} " + f"(skipped {skipped} duplicate or empty rows).", + ) + return HttpResponseRedirect(changelist_url) + + context = { + **self.admin_site.each_context(request), + "title": f"Import {self.model._meta.verbose_name_plural} from CSV", + "opts": self.model._meta, + "changelist_url": changelist_url, + } + return render(request, "admin/ask/lookup_csv_import.html", context) + + @admin.register(DocumentType) -class DocumentTypeAdmin(admin.ModelAdmin): +class DocumentTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin): list_display = ("name",) search_fields = ("name",) @admin.register(DocumentAuthorInstitution) -class DocumentAuthorInstitutionAdmin(admin.ModelAdmin): +class DocumentAuthorInstitutionAdmin(LookupCSVImportMixin, admin.ModelAdmin): list_display = ("name",) search_fields = ("name",) @admin.register(InstitutionType) -class 
InstitutionTypeAdmin(admin.ModelAdmin): +class InstitutionTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin): list_display = ("name",) search_fields = ("name",) diff --git a/hospexplorer/ask/admin_csv.py b/hospexplorer/ask/admin_csv.py new file mode 100644 index 0000000..beebfd6 --- /dev/null +++ b/hospexplorer/ask/admin_csv.py @@ -0,0 +1,26 @@ +import csv +import io + + +def import_names_csv(model, file_obj): + """Import a one-column CSV into a model with a ``name`` field. + + Returns ``(created, skipped)``. Blank rows, a leading header row of ``name``, + and rows whose name already exists in the table are all counted as skipped. + """ + text = file_obj.read().decode("utf-8-sig", errors="replace") + reader = csv.reader(io.StringIO(text)) + + created = 0 + skipped = 0 + for row in reader: + name = row[0].strip() if row else "" + if not name or name.lower() == "name": + skipped += 1 + continue + _, was_created = model.objects.get_or_create(name=name) + if was_created: + created += 1 + else: + skipped += 1 + return created, skipped diff --git a/hospexplorer/ask/kb_connector.py b/hospexplorer/ask/kb_connector.py index 94bdf40..39b6ea0 100644 --- a/hospexplorer/ask/kb_connector.py +++ b/hospexplorer/ask/kb_connector.py @@ -1,3 +1,4 @@ +import json import logging import httpx @@ -30,15 +31,16 @@ def list_kb_documents(page=1, page_size=10): return response.json() -def add_website_to_kb(url): +def add_website_to_kb(url, metadata=None): """Send a website URL to the MCP KB server for ingestion. Calls POST /docs/website/add?url={url} on the MCP KB server. - The KB server fetches the page, chunks it, generates embeddings, - and stores it for semantic search. + ``metadata`` (if provided) is sent as a JSON body ``{"metadata": ...}`` so + the KB server can store it on the Document row. 
""" headers = { "Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}", + "Content-Type": "application/json", } endpoint = f"{settings.KB_MCP_HOST}/docs/website/add" @@ -46,6 +48,7 @@ def add_website_to_kb(url): response = client.post( endpoint, params={"url": url}, + json={"metadata": metadata} if metadata is not None else {}, headers=headers, timeout=settings.KB_MCP_TIMEOUT, ) @@ -54,12 +57,12 @@ def add_website_to_kb(url): return response.json() -def add_pdf_to_kb(file_bytes, filename, title, url=None): +def add_pdf_to_kb(file_bytes, filename, title, url=None, metadata=None): """Upload a PDF to the MCP KB server for ingestion. Calls POST /docs/pdf/add on the MCP KB server with multipart form data. - The KB server extracts text, chunks it, generates embeddings, - and stores it for semantic search. + metadata (if provided) is JSON-encoded into a metadata form field so + it can travel alongside the file. """ headers = { "Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}", @@ -70,6 +73,8 @@ def add_pdf_to_kb(file_bytes, filename, title, url=None): data = {"title": title} if url: data["url"] = url + if metadata is not None: + data["metadata"] = json.dumps(metadata) with httpx.Client() as client: response = client.post( diff --git a/hospexplorer/ask/tasks.py b/hospexplorer/ask/tasks.py index e290310..89b53b3 100644 --- a/hospexplorer/ask/tasks.py +++ b/hospexplorer/ask/tasks.py @@ -144,6 +144,23 @@ def run_llm_task(task_id, record_id, conversation_id): close_old_connections() +def _build_resource_metadata(obj): + """Serialize a Resource's metadata fields into a JSON-safe dict. + + FK lookups are flattened to their ``name`` so the MCP payload is + self-describing and doesn't depend on hosp-explorer's local IDs. 
+ """ + return { + "date_published": obj.date_published.isoformat() if obj.date_published else None, + "date_published_precision": obj.date_published_precision or None, + "document_type": obj.document_type.name if obj.document_type_id else None, + "document_author_institution": ( + obj.document_author_institution.name if obj.document_author_institution_id else None + ), + "institution_type": obj.institution_type.name if obj.institution_type_id else None, + } + + def run_kb_resource_upload(model_label, resource_id): """Background thread: push a resource to the MCP KB and record its doc_id. @@ -169,15 +186,18 @@ def run_kb_resource_upload(model_label, resource_id): return try: + metadata = _build_resource_metadata(obj) if model_label == "pdf": obj.file.open("rb") try: file_bytes = obj.file.read() finally: obj.file.close() - result = add_pdf_to_kb(file_bytes, obj.file.name.split("/")[-1], obj.title) + result = add_pdf_to_kb( + file_bytes, obj.file.name.split("/")[-1], obj.title, metadata=metadata, + ) else: - result = add_website_to_kb(obj.url) + result = add_website_to_kb(obj.url, metadata=metadata) obj.mcp_kb_document_id = result.get("doc_id") obj.status = Resource.Status.SUCCESS diff --git a/hospexplorer/ask/templates/admin/ask/lookup_change_list.html b/hospexplorer/ask/templates/admin/ask/lookup_change_list.html new file mode 100644 index 0000000..766c505 --- /dev/null +++ b/hospexplorer/ask/templates/admin/ask/lookup_change_list.html @@ -0,0 +1,9 @@ +{% extends "admin/change_list.html" %} +{% load i18n %} + +{% block object-tools-items %} +
  • + {% trans "Import CSV" %} +
  • + {{ block.super }} +{% endblock %} diff --git a/hospexplorer/ask/templates/admin/ask/lookup_csv_import.html b/hospexplorer/ask/templates/admin/ask/lookup_csv_import.html new file mode 100644 index 0000000..63ae644 --- /dev/null +++ b/hospexplorer/ask/templates/admin/ask/lookup_csv_import.html @@ -0,0 +1,27 @@ +{% extends "admin/base_site.html" %} +{% load i18n %} + +{% block breadcrumbs %} + +{% endblock %} + +{% block content %} +
    + {% csrf_token %} +

    + Upload a one-column CSV. The first column is treated as the + name. A leading header row of name is allowed + and will be skipped. Duplicate names are skipped silently. +

    +

    + +
    +{% endblock %} From b548e0715b7a6e49b4883b7065c3d5595b65074e Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Tue, 19 May 2026 16:26:44 -0700 Subject: [PATCH 03/12] [HOP-65] Removed duplicate 'Upload zip of PDFs' title --- hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html | 1 - 1 file changed, 1 deletion(-) diff --git a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html index 3b11f2c..4be7d0e 100644 --- a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html +++ b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html @@ -10,7 +10,6 @@ {% endblock %} {% block content %} -

    {{ title }}

    Upload a .zip containing PDF files and a single CSV metadata file with columns {{ required_columns_label }}. Each row creates a PDF Resource From 229483706e9e3409fee3e93bbce81d8872df4ee1 Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Wed, 20 May 2026 16:29:06 -0700 Subject: [PATCH 04/12] [HOP-65] Duplicates matched via filename and title --- hospexplorer/ask/admin.py | 20 +++++++++-- .../0013_pdfresource_original_filename.py | 35 +++++++++++++++++++ hospexplorer/ask/models.py | 3 ++ 3 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 hospexplorer/ask/migrations/0013_pdfresource_original_filename.py diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index 52bfa4e..c5fe34d 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -271,6 +271,9 @@ def save_model(self, request, obj, form, change): obj.modifier = request.user obj.status = PDFResource.Status.PROCESSING obj.status_message = "Queued for Knowledge Base upload." + # capture the original name before storage save() mangles it on collision + if not change or "file" in form.changed_data: + obj.original_filename = os.path.basename(obj.file.name) super().save_model(request, obj, form, change) transaction.on_commit( @@ -347,6 +350,12 @@ def _is_real(name): for n in real_names: zip_members.setdefault(os.path.basename(n), n) + # a PDF "already exists" when both its original filename and + # title match a row already imported + existing_pdfs = set( + PDFResource.objects.values_list("original_filename", "title") + ) + total = 0 saved = 0 queued_ids = [] @@ -361,7 +370,12 @@ def _is_real(name): ) continue - member = zip_members.get(filename) or zip_members.get(os.path.basename(filename)) + basename = os.path.basename(filename) + if (basename, title) in existing_pdfs: + messages.warning(request, f"Row {total}: '{filename}' already exists; skipped.") + continue + + member = zip_members.get(filename) or zip_members.get(basename) if not member: messages.warning(request, 
f"Row {total}: '{filename}' not in zip; skipped.") continue @@ -374,13 +388,15 @@ def _is_real(name): obj = PDFResource( title=title, + original_filename=basename, creator=request.user, modifier=request.user, status=PDFResource.Status.PROCESSING, status_message="Queued for Knowledge Base upload.", ) - obj.file.save(os.path.basename(filename), ContentFile(pdf_bytes), save=True) + obj.file.save(basename, ContentFile(pdf_bytes), save=True) saved += 1 + existing_pdfs.add((basename, title)) queued_ids.append(obj.pk) # fire KB uploads after the request transaction commits so background diff --git a/hospexplorer/ask/migrations/0013_pdfresource_original_filename.py b/hospexplorer/ask/migrations/0013_pdfresource_original_filename.py new file mode 100644 index 0000000..5c7aa5c --- /dev/null +++ b/hospexplorer/ask/migrations/0013_pdfresource_original_filename.py @@ -0,0 +1,35 @@ +import os +import re + +from django.db import migrations, models + + +# Django's storage appends "_<7 random alphanumeric chars>" to a file name +# whenever the target name already exists. Strip that to recover the original. 
+_STORAGE_SUFFIX = re.compile(r"_[A-Za-z0-9]{7}$") + + +def backfill_original_filename(apps, schema_editor): + PDFResource = apps.get_model("ask", "PDFResource") + for resource in PDFResource.objects.all(): + if not resource.file or resource.original_filename: + continue + root, ext = os.path.splitext(os.path.basename(resource.file.name)) + resource.original_filename = f"{_STORAGE_SUFFIX.sub('', root)}{ext}" + resource.save(update_fields=["original_filename"]) + + +class Migration(migrations.Migration): + + dependencies = [ + ("ask", "0012_pdfresource_status_pdfresource_status_message_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="pdfresource", + name="original_filename", + field=models.CharField(blank=True, default="", max_length=255), + ), + migrations.RunPython(backfill_original_filename, migrations.RunPython.noop), + ] diff --git a/hospexplorer/ask/models.py b/hospexplorer/ask/models.py index 70fa94b..f80d37d 100644 --- a/hospexplorer/ask/models.py +++ b/hospexplorer/ask/models.py @@ -52,6 +52,9 @@ class Meta: class PDFResource(Resource): file = models.FileField(upload_to="kb_pdfs/") + # Original uploaded filename, kept verbatim — file.name carries a storage-added + # suffix on collision, so it can't be used to detect re-uploads of the same file. 
+ original_filename = models.CharField(max_length=255, blank=True, default="") mcp_kb_document_id = models.IntegerField(null=True, blank=True, help_text="Document ID returned by the MCP Knowledge Base.") class Meta: From 00ee781cdbc420173dc317c44006df2343710d85 Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Thu, 21 May 2026 11:15:10 -0700 Subject: [PATCH 05/12] [HOP-65] Tests work, better comments --- hospexplorer/ask/admin.py | 7 +++++-- hospexplorer/ask/models.py | 3 +-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index c5fe34d..04e423d 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -271,9 +271,12 @@ def save_model(self, request, obj, form, change): obj.modifier = request.user obj.status = PDFResource.Status.PROCESSING obj.status_message = "Queued for Knowledge Base upload." - # capture the original name before storage save() mangles it on collision + + # record the original name so the zip upload's duplicate check sees PDFs + # added through this form too; do it before save() mangles file.name on collision if not change or "file" in form.changed_data: obj.original_filename = os.path.basename(obj.file.name) + super().save_model(request, obj, form, change) transaction.on_commit( @@ -350,7 +353,7 @@ def _is_real(name): for n in real_names: zip_members.setdefault(os.path.basename(n), n) - # a PDF "already exists" when both its original filename and + # a PDF already exists when both its original filename and # title match a row already imported existing_pdfs = set( PDFResource.objects.values_list("original_filename", "title") diff --git a/hospexplorer/ask/models.py b/hospexplorer/ask/models.py index f80d37d..aefb517 100644 --- a/hospexplorer/ask/models.py +++ b/hospexplorer/ask/models.py @@ -52,8 +52,7 @@ class Meta: class PDFResource(Resource): file = models.FileField(upload_to="kb_pdfs/") - # Original uploaded filename, kept verbatim — file.name carries a 
storage-added - # suffix on collision, so it can't be used to detect re-uploads of the same file. + # original upload name, kept so re-uploads can be skipped — Django renames file.name on collision original_filename = models.CharField(max_length=255, blank=True, default="") mcp_kb_document_id = models.IntegerField(null=True, blank=True, help_text="Document ID returned by the MCP Knowledge Base.") From 0eccab59d0df5b38c533f98df00003c271d84300 Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Thu, 21 May 2026 16:49:15 -0700 Subject: [PATCH 06/12] [HOP-63] Added metadata to csv --- hospexplorer/ask/admin.py | 48 +++++- hospexplorer/ask/admin_csv.py | 29 ++++ .../admin/ask/pdfresource/upload_zip.html | 15 +- hospexplorer/ask/tests.py | 138 +++++++++++++++++- hospexplorer/hospexplorer/settings.py | 6 +- 5 files changed, 229 insertions(+), 7 deletions(-) diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index bdbf4ed..3a1c4f9 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -27,7 +27,7 @@ DocumentAuthorInstitution, InstitutionType, ) -from ask.admin_csv import import_names_csv +from ask.admin_csv import import_names_csv, parse_partial_date from ask.kb_connector import delete_kb_document from ask.tasks import run_kb_resource_upload @@ -341,6 +341,50 @@ def save_model(self, request, obj, form, change): ) +# Optional metadata columns the zip-CSV importer reads onto each PDFResource. +# Controlled-list values create the matching lookup row the first time they +# appear, so the available options grow from what the imports actually use. +ZIP_CSV_LOOKUP_COLUMNS = { + "document_type": DocumentType, + "document_author_institution": DocumentAuthorInstitution, + "institution_type": InstitutionType, +} + + +def _apply_zip_csv_metadata(obj, row): + """Populate a resource's metadata fields from one zip-CSV row. + + Every metadata column is optional. 
Returns a list of human-readable + warnings for values that could not be applied — the row is still imported, + just with that field left blank. + """ + warnings = [] + + date_raw = (row.get("date_published") or "").strip() + if date_raw: + parsed_date, precision = parse_partial_date(date_raw) + if parsed_date: + obj.date_published = parsed_date + obj.date_published_precision = precision + else: + warnings.append( + f"invalid date_published '{date_raw}' " + "(use YYYY, YYYY-MM or YYYY-MM-DD); left blank" + ) + + for column, model in ZIP_CSV_LOOKUP_COLUMNS.items(): + value = (row.get(column) or "").strip() + if not value: + continue + if len(value) > 255: + warnings.append(f"{column} value exceeds 255 characters; left blank") + continue + lookup, _ = model.objects.get_or_create(name=value) + setattr(obj, column, lookup) + + return warnings + + @admin.register(PDFResource) class PDFResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): list_display = ("title", "file", "creator", "status", "modified_at") @@ -498,6 +542,8 @@ def _is_real(name): status=PDFResource.Status.PROCESSING, status_message="Queued for Knowledge Base upload.", ) + for warning in _apply_zip_csv_metadata(obj, row): + messages.warning(request, f"Row {total}: {warning}") obj.file.save(os.path.basename(filename), ContentFile(pdf_bytes), save=True) saved += 1 queued_ids.append(obj.pk) diff --git a/hospexplorer/ask/admin_csv.py b/hospexplorer/ask/admin_csv.py index beebfd6..87eda10 100644 --- a/hospexplorer/ask/admin_csv.py +++ b/hospexplorer/ask/admin_csv.py @@ -1,5 +1,34 @@ import csv +import datetime import io +import re + +# A partial ISO date: a 4-digit year, optionally a month, optionally a day. +# Day can only appear when month does, so "year-day" is impossible to express. +_PARTIAL_DATE_RE = re.compile(r"^(\d{4})(?:-(\d{1,2})(?:-(\d{1,2}))?)?$") + + +def parse_partial_date(value): + """Parse a partial ISO date string into a ``(date, precision)`` pair. 
+ + Accepts ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD``. A missing month/day + defaults to 1 so the value still fits a ``DateField``; ``precision`` + ("year", "month" or "day") records how much was actually supplied so the + padding can be ignored later. Blank or unparseable input returns + ``(None, "")``. + """ + match = _PARTIAL_DATE_RE.match((value or "").strip()) + if not match: + return None, "" + year, month, day = match.groups() + try: + if day is not None: + return datetime.date(int(year), int(month), int(day)), "day" + if month is not None: + return datetime.date(int(year), int(month), 1), "month" + return datetime.date(int(year), 1, 1), "year" + except ValueError: + return None, "" def import_names_csv(model, file_obj): diff --git a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html index 3b11f2c..587564b 100644 --- a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html +++ b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html @@ -12,9 +12,18 @@ {% block content %}

    {{ title }}

    - Upload a .zip containing PDF files and a single CSV metadata file - with columns {{ required_columns_label }}. Each row creates a PDF Resource - and pushes the file to the Knowledge Base. + Upload a .zip containing PDF files and a single CSV metadata file. + Each row creates a PDF Resource and pushes the file to the Knowledge Base. +

    +

    + Required CSV columns: {{ required_columns_label }}. +

    +

    + Optional metadata columns (per row, leave blank if unknown): + date_published — a year, YYYY-MM, or + YYYY-MM-DD; document_type, + document_author_institution, institution_type + — controlled-list values, created automatically as rows are imported.

    {% csrf_token %} diff --git a/hospexplorer/ask/tests.py b/hospexplorer/ask/tests.py index 749b619..589b8c0 100644 --- a/hospexplorer/ask/tests.py +++ b/hospexplorer/ask/tests.py @@ -1,12 +1,23 @@ +import datetime +import io import shutil import tempfile +import zipfile from unittest.mock import patch from django.contrib.auth.models import User from django.core.files.base import ContentFile from django.test import TestCase, override_settings +from django.urls import reverse -from ask.models import PDFResource +from ask.admin import _apply_zip_csv_metadata +from ask.admin_csv import parse_partial_date +from ask.models import ( + DocumentAuthorInstitution, + DocumentType, + InstitutionType, + PDFResource, +) class PDFResourceDeletionTests(TestCase): @@ -48,3 +59,128 @@ def test_successful_file_removal_is_not_flagged(self): pdf.file.save("report.pdf", ContentFile(b"%PDF-1.4 test"), save=True) pdf.delete() self.assertFalse(pdf.file_deletion_failed) + + +class ParsePartialDateTests(TestCase): + def test_full_date(self): + self.assertEqual( + parse_partial_date("2024-03-15"), (datetime.date(2024, 3, 15), "day") + ) + + def test_year_month(self): + self.assertEqual( + parse_partial_date("2024-03"), (datetime.date(2024, 3, 1), "month") + ) + + def test_year_only(self): + self.assertEqual( + parse_partial_date("2024"), (datetime.date(2024, 1, 1), "year") + ) + + def test_blank_or_none_returns_empty(self): + self.assertEqual(parse_partial_date(""), (None, "")) + self.assertEqual(parse_partial_date(" "), (None, "")) + self.assertEqual(parse_partial_date(None), (None, "")) + + def test_impossible_calendar_dates_rejected(self): + self.assertEqual(parse_partial_date("2024-13"), (None, "")) + self.assertEqual(parse_partial_date("2024-02-30"), (None, "")) + + def test_non_iso_input_rejected(self): + self.assertEqual(parse_partial_date("March 2024"), (None, "")) + self.assertEqual(parse_partial_date("24-03-15"), (None, "")) + + +class ApplyZipCsvMetadataTests(TestCase): + def 
test_creates_lookups_and_sets_fields(self): + obj = PDFResource(title="Doc") + warnings = _apply_zip_csv_metadata(obj, { + "date_published": "2023-06", + "document_type": "Report", + "document_author_institution": "WHO", + "institution_type": "NGO", + }) + self.assertEqual(warnings, []) + self.assertEqual(obj.date_published, datetime.date(2023, 6, 1)) + self.assertEqual(obj.date_published_precision, "month") + self.assertEqual(obj.document_type.name, "Report") + self.assertEqual(obj.document_author_institution.name, "WHO") + self.assertEqual(obj.institution_type.name, "NGO") + self.assertTrue(DocumentType.objects.filter(name="Report").exists()) + + def test_reuses_existing_lookup_row(self): + existing = DocumentType.objects.create(name="Report") + obj = PDFResource(title="Doc") + _apply_zip_csv_metadata(obj, {"document_type": "Report"}) + self.assertEqual(obj.document_type.pk, existing.pk) + self.assertEqual(DocumentType.objects.filter(name="Report").count(), 1) + + def test_blank_and_missing_columns_are_skipped(self): + obj = PDFResource(title="Doc") + warnings = _apply_zip_csv_metadata(obj, {"document_type": " ", "date_published": ""}) + self.assertEqual(warnings, []) + self.assertIsNone(obj.date_published) + self.assertIsNone(obj.document_type_id) + self.assertEqual(_apply_zip_csv_metadata(PDFResource(title="Doc"), {}), []) + + def test_invalid_date_warns_and_leaves_field_blank(self): + obj = PDFResource(title="Doc") + warnings = _apply_zip_csv_metadata(obj, {"date_published": "not-a-date"}) + self.assertEqual(len(warnings), 1) + self.assertIn("date_published", warnings[0]) + self.assertIsNone(obj.date_published) + + +@override_settings(PDF_ZIP_CSV_COLUMNS=("filename", "title")) +class ZipUploadViewTests(TestCase): + def setUp(self): + media_root = tempfile.mkdtemp() + self.addCleanup(shutil.rmtree, media_root, ignore_errors=True) + override = override_settings(MEDIA_ROOT=media_root) + override.enable() + self.addCleanup(override.disable) + self.admin = 
User.objects.create_superuser("admin", "admin@example.com", "pw") + self.client.force_login(self.admin) + + def _build_zip(self, csv_text, pdfs): + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as archive: + archive.writestr("metadata.csv", csv_text) + for name, content in pdfs.items(): + archive.writestr(name, content) + buf.seek(0) + buf.name = "upload.zip" + return buf + + def test_zip_import_applies_csv_metadata(self): + csv_text = ( + "filename,title,date_published,document_type," + "document_author_institution,institution_type\r\n" + "report.pdf,Annual Report,2022,Report,WHO,NGO\r\n" + ) + zip_file = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 test"}) + + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip_file} + ) + self.assertEqual(response.status_code, 302) + + pdf = PDFResource.objects.get(title="Annual Report") + self.assertEqual(pdf.date_published, datetime.date(2022, 1, 1)) + self.assertEqual(pdf.date_published_precision, "year") + self.assertEqual(pdf.document_type.name, "Report") + self.assertEqual(pdf.document_author_institution.name, "WHO") + self.assertEqual(pdf.institution_type.name, "NGO") + + def test_zip_import_works_without_metadata_columns(self): + csv_text = "filename,title\r\nreport.pdf,Plain Report\r\n" + zip_file = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 test"}) + + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip_file} + ) + self.assertEqual(response.status_code, 302) + + pdf = PDFResource.objects.get(title="Plain Report") + self.assertIsNone(pdf.date_published) + self.assertIsNone(pdf.document_type_id) diff --git a/hospexplorer/hospexplorer/settings.py b/hospexplorer/hospexplorer/settings.py index 4764883..3b11ace 100644 --- a/hospexplorer/hospexplorer/settings.py +++ b/hospexplorer/hospexplorer/settings.py @@ -193,8 +193,10 @@ # the upload view requires both a filename column and a title column, so # PDFResourceAdmin will 
raise ImproperlyConfigured at request time. # -# Extra CSV columns beyond these two are ignored. Changing this does not change -# which PDFResource fields get populated, only title is read +# Only the filename and title columns are configurable here. The importer also +# reads optional, fixed-name metadata columns when present: date_published, +# document_type, document_author_institution, institution_type. Any other +# columns are ignored. PDF_ZIP_CSV_COLUMNS = tuple( column.strip() for column in os.getenv("PDF_ZIP_CSV_COLUMNS", "filename,title").split(",") if column.strip() From 020e6145c6b8d272bebe9fbcc61852d72ba981c6 Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Fri, 22 May 2026 15:06:04 -0700 Subject: [PATCH 07/12] [HOP-63] Added metadata for zip --- hospexplorer/ask/admin.py | 7 ++++++- hospexplorer/ask/tests.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index 3a1c4f9..3ec5a03 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -499,7 +499,12 @@ def _is_real(name): csv_text = archive.read(csv_names[0]).decode("utf-8-sig") reader = csv.DictReader(io.StringIO(csv_text)) - csv_columns = {(name or "").strip() for name in (reader.fieldnames or [])} + # strip header names so the column check and per-row lookups use + # the same keys; otherwise a header like "filename, title" leaves + # stray spaces and every row reads as missing its required fields + if reader.fieldnames: + reader.fieldnames = [(name or "").strip() for name in reader.fieldnames] + csv_columns = set(reader.fieldnames or []) if not required_columns.issubset(csv_columns): missing = ", ".join(sorted(required_columns - csv_columns)) messages.error(request, f"CSV is missing required columns: {missing}.") diff --git a/hospexplorer/ask/tests.py b/hospexplorer/ask/tests.py index 589b8c0..f14ebe3 100644 --- a/hospexplorer/ask/tests.py +++ b/hospexplorer/ask/tests.py @@ -184,3 +184,20 @@ def 
test_zip_import_works_without_metadata_columns(self): pdf = PDFResource.objects.get(title="Plain Report") self.assertIsNone(pdf.date_published) self.assertIsNone(pdf.document_type_id) + + def test_zip_import_tolerates_whitespace_in_csv_header(self): + # spaces after commas in the header row must not cause rows to be skipped + csv_text = ( + "filename, title, date_published, document_type\r\n" + "report.pdf,Spaced Report,2021,Report\r\n" + ) + zip_file = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 test"}) + + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip_file} + ) + self.assertEqual(response.status_code, 302) + + pdf = PDFResource.objects.get(title="Spaced Report") + self.assertEqual(pdf.date_published, datetime.date(2021, 1, 1)) + self.assertEqual(pdf.document_type.name, "Report") From 846c5aa4b3c298c310314e0169554bb4a4df6a8d Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Fri, 22 May 2026 15:34:15 -0700 Subject: [PATCH 08/12] [HOP-65] blanks on files fixed --- .../0013_pdfresource_original_filename.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/hospexplorer/ask/migrations/0013_pdfresource_original_filename.py b/hospexplorer/ask/migrations/0013_pdfresource_original_filename.py index 5c7aa5c..7161cfa 100644 --- a/hospexplorer/ask/migrations/0013_pdfresource_original_filename.py +++ b/hospexplorer/ask/migrations/0013_pdfresource_original_filename.py @@ -1,24 +1,6 @@ -import os -import re - from django.db import migrations, models -# Django's storage appends "_<7 random alphanumeric chars>" to a file name -# whenever the target name already exists. Strip that to recover the original. 
-_STORAGE_SUFFIX = re.compile(r"_[A-Za-z0-9]{7}$") - - -def backfill_original_filename(apps, schema_editor): - PDFResource = apps.get_model("ask", "PDFResource") - for resource in PDFResource.objects.all(): - if not resource.file or resource.original_filename: - continue - root, ext = os.path.splitext(os.path.basename(resource.file.name)) - resource.original_filename = f"{_STORAGE_SUFFIX.sub('', root)}{ext}" - resource.save(update_fields=["original_filename"]) - - class Migration(migrations.Migration): dependencies = [ @@ -31,5 +13,4 @@ class Migration(migrations.Migration): name="original_filename", field=models.CharField(blank=True, default="", max_length=255), ), - migrations.RunPython(backfill_original_filename, migrations.RunPython.noop), ] From ee168a4fc4316ef8b1f32f111253ea0252211795 Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Fri, 22 May 2026 15:58:43 -0700 Subject: [PATCH 09/12] [HOP-66] Makemigrations --merge --- .../ask/migrations/0014_merge_20260522_2257.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 hospexplorer/ask/migrations/0014_merge_20260522_2257.py diff --git a/hospexplorer/ask/migrations/0014_merge_20260522_2257.py b/hospexplorer/ask/migrations/0014_merge_20260522_2257.py new file mode 100644 index 0000000..eaaa6c1 --- /dev/null +++ b/hospexplorer/ask/migrations/0014_merge_20260522_2257.py @@ -0,0 +1,14 @@ +# Generated by Django 6.0.2 on 2026-05-22 22:57 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('ask', '0013_documentauthorinstitution_documenttype_and_more'), + ('ask', '0013_pdfresource_original_filename'), + ] + + operations = [ + ] From 606704ca65b715d3b0feb62c0a0724ba4968c6ad Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Fri, 22 May 2026 16:29:53 -0700 Subject: [PATCH 10/12] [HOP-66] added two checkbox rows for update_file and update_metadata --- hospexplorer/ask/admin.py | 73 +++++++++++++++---- .../admin/ask/pdfresource/upload_zip.html | 12 +++ 2 
files changed, 69 insertions(+), 16 deletions(-) diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index 4a971ed..0df78f1 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -478,6 +478,9 @@ def zip_upload_view(self, request): messages.error(request, "Please select a zip file to upload.") return HttpResponseRedirect(request.path) + update_file = bool(request.POST.get("update_file")) + update_metadata = bool(request.POST.get("update_metadata")) + try: archive = zipfile.ZipFile(zip_file) except zipfile.BadZipFile: @@ -529,7 +532,9 @@ def _is_real(name): total = 0 saved = 0 - queued_ids = [] + updated = 0 + queued_new_ids = [] + queued_replace_ids = [] for row in reader: total += 1 filename = (row.get(filename_col) or "").strip() @@ -542,19 +547,45 @@ def _is_real(name): continue basename = os.path.basename(filename) - if (basename, title) in existing_pdfs: - messages.warning(request, f"Row {total}: '{filename}' already exists; skipped.") - continue + is_update = (basename, title) in existing_pdfs - member = zip_members.get(filename) or zip_members.get(basename) - if not member: - messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.") + if is_update and not (update_file or update_metadata): + messages.warning(request, f"Row {total}: '{filename}' already exists; skipped.") continue - try: - pdf_bytes = archive.read(member) - except KeyError: - messages.warning(request, f"Row {total}: could not read '{filename}'; skipped.") + # only read PDF bytes when we'll actually use them: new rows + # always need them; existing rows only when update_file is set + pdf_bytes = None + if (not is_update) or update_file: + member = zip_members.get(filename) or zip_members.get(basename) + if not member: + messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.") + continue + try: + pdf_bytes = archive.read(member) + except KeyError: + messages.warning(request, f"Row {total}: could not read '{filename}'; 
skipped.") + continue + + if is_update: + match = PDFResource.objects.filter( + original_filename=basename, title=title + ).first() + if match is None: + messages.warning(request, f"Row {total}: '{filename}' lookup failed; skipped.") + continue + if update_metadata: + for warning in _apply_zip_csv_metadata(match, row): + messages.warning(request, f"Row {total}: {warning}") + if update_file: + match.file.delete(save=False) + match.file.save(basename, ContentFile(pdf_bytes), save=False) + match.modifier = request.user + match.status = PDFResource.Status.PROCESSING + match.status_message = "Queued for Knowledge Base re-upload." + match.save() + updated += 1 + queued_replace_ids.append(match.pk) continue obj = PDFResource( @@ -570,12 +601,21 @@ def _is_real(name): obj.file.save(basename, ContentFile(pdf_bytes), save=True) saved += 1 existing_pdfs.add((basename, title)) - queued_ids.append(obj.pk) + queued_new_ids.append(obj.pk) # fire KB uploads after the request transaction commits so background # threads see the just-saved rows - def _start_uploads(ids=tuple(queued_ids)): - for pk in ids: + def _start_uploads( + new_ids=tuple(queued_new_ids), + replace_ids=tuple(queued_replace_ids), + ): + for pk in new_ids: + threading.Thread( + target=run_kb_resource_upload, + args=("pdf", pk), + daemon=True, + ).start() + for pk in replace_ids: threading.Thread( target=run_kb_resource_upload, args=("pdf", pk), @@ -585,8 +625,9 @@ def _start_uploads(ids=tuple(queued_ids)): messages.success( request, - f"Imported {saved} of {total} PDFs. Knowledge Base uploads are running in the " - "background — refresh the list to see each row's final status.", + f"Imported {saved} new and updated {updated} of {total} PDF rows. 
" + "Knowledge Base uploads are running in the background — " + "refresh the list to see each row's final status.", ) return HttpResponseRedirect(changelist_url) diff --git a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html index 0bfe104..e6bd9d5 100644 --- a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html +++ b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html @@ -30,6 +30,18 @@

    +

    + +

    +

    + +

    Cancel From ff0b674883881d1f8c825d362c39c689cf28f615 Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Tue, 26 May 2026 14:30:46 -0700 Subject: [PATCH 11/12] [HOP-66] update_file and update_metadata checkboxes on zip upload, KB doc is replaced (delete + re-add) for updated rows --- hospexplorer/ask/admin.py | 1 + hospexplorer/ask/tasks.py | 22 +++++- hospexplorer/ask/tests.py | 136 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 2 deletions(-) diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index 0df78f1..af53ac6 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -619,6 +619,7 @@ def _start_uploads( threading.Thread( target=run_kb_resource_upload, args=("pdf", pk), + kwargs={"replace": True}, daemon=True, ).start() transaction.on_commit(_start_uploads) diff --git a/hospexplorer/ask/tasks.py b/hospexplorer/ask/tasks.py index 6eded79..ab9c366 100644 --- a/hospexplorer/ask/tasks.py +++ b/hospexplorer/ask/tasks.py @@ -161,15 +161,20 @@ def _build_resource_metadata(obj): } -def run_kb_resource_upload(model_label, resource_id): +def run_kb_resource_upload(model_label, resource_id, replace=False): """Background thread: push a resource to the MCP KB and record its doc_id. Runs outside the admin's atomic save transaction so a slow or timing-out MCP call can't roll back the local row. The object's status/status_message are updated at each phase so the admin can surface progress and errors. + + When ``replace`` is True and the row already has an ``mcp_kb_document_id``, + the existing KB doc is deleted before the new one is added — used by the + zip importer when an "update file" / "update metadata" re-upload would + otherwise leave the old chunks in the KB alongside the new ones. 
""" from ask.models import WebsiteResource, PDFResource, Resource - from ask.kb_connector import add_pdf_to_kb, add_website_to_kb + from ask.kb_connector import add_pdf_to_kb, add_website_to_kb, delete_kb_document if model_label == "pdf": Model = PDFResource @@ -185,6 +190,19 @@ def run_kb_resource_upload(model_label, resource_id): logger.error("run_kb_resource_upload: %s id=%s not found", model_label, resource_id) return + if replace and obj.mcp_kb_document_id: + # best-effort: if delete fails the re-add still happens, leaving a + # stale duplicate in the KB rather than losing the new upload + old_doc_id = obj.mcp_kb_document_id + try: + delete_kb_document(old_doc_id) + except Exception: + logger.warning( + "run_kb_resource_upload: failed to delete old KB doc_id=%s for %s id=%s; " + "re-adding anyway", old_doc_id, model_label, resource_id, + ) + obj.mcp_kb_document_id = None + try: metadata = _build_resource_metadata(obj) if model_label == "pdf": diff --git a/hospexplorer/ask/tests.py b/hospexplorer/ask/tests.py index f14ebe3..d08481a 100644 --- a/hospexplorer/ask/tests.py +++ b/hospexplorer/ask/tests.py @@ -201,3 +201,139 @@ def test_zip_import_tolerates_whitespace_in_csv_header(self): pdf = PDFResource.objects.get(title="Spaced Report") self.assertEqual(pdf.date_published, datetime.date(2021, 1, 1)) self.assertEqual(pdf.document_type.name, "Report") + + def test_zip_update_file_overwrites_existing_pdf(self): + csv_text = "filename,title\r\nreport.pdf,Annual Report\r\n" + zip1 = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 original"}) + self.client.post(reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip1}) + + zip2 = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 updated"}) + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), + {"zip_file": zip2, "update_file": "on"}, + ) + self.assertEqual(response.status_code, 302) + + self.assertEqual(PDFResource.objects.count(), 1) + pdf = PDFResource.objects.get(title="Annual 
Report") + with pdf.file.open("rb") as f: + self.assertEqual(f.read(), b"%PDF-1.4 updated") + + def test_zip_update_metadata_refreshes_fields(self): + csv_text_1 = "filename,title\r\nreport.pdf,Annual Report\r\n" + zip1 = self._build_zip(csv_text_1, {"report.pdf": b"%PDF-1.4 original"}) + self.client.post(reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip1}) + + pdf = PDFResource.objects.get(title="Annual Report") + self.assertIsNone(pdf.date_published) + self.assertIsNone(pdf.document_type_id) + + csv_text_2 = ( + "filename,title,date_published,document_type\r\n" + "report.pdf,Annual Report,2024-03,Report\r\n" + ) + # second zip's bytes must NOT replace the file when only update_metadata is on + zip2 = self._build_zip(csv_text_2, {"report.pdf": b"%PDF-1.4 ignored"}) + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), + {"zip_file": zip2, "update_metadata": "on"}, + ) + self.assertEqual(response.status_code, 302) + + self.assertEqual(PDFResource.objects.count(), 1) + pdf.refresh_from_db() + self.assertEqual(pdf.date_published, datetime.date(2024, 3, 1)) + self.assertEqual(pdf.date_published_precision, "month") + self.assertEqual(pdf.document_type.name, "Report") + with pdf.file.open("rb") as f: + self.assertEqual(f.read(), b"%PDF-1.4 original") + + def test_zip_no_update_flags_preserve_skip_behavior(self): + csv_text = "filename,title\r\nreport.pdf,Annual Report\r\n" + zip1 = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 original"}) + self.client.post(reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip1}) + + zip2 = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 updated"}) + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip2} + ) + self.assertEqual(response.status_code, 302) + + self.assertEqual(PDFResource.objects.count(), 1) + pdf = PDFResource.objects.get(title="Annual Report") + with pdf.file.open("rb") as f: + self.assertEqual(f.read(), b"%PDF-1.4 original") + 
+ +class RunKbResourceUploadReplaceTests(TestCase): + def setUp(self): + media_root = tempfile.mkdtemp() + self.addCleanup(shutil.rmtree, media_root, ignore_errors=True) + override = override_settings(MEDIA_ROOT=media_root) + override.enable() + self.addCleanup(override.disable) + # the real run_kb_resource_upload closes DB connections in finally — + # that kills the TestCase's wrapping transaction connection and breaks + # later tests, so neutralize it for this class + close_patcher = patch("ask.tasks.close_old_connections") + close_patcher.start() + self.addCleanup(close_patcher.stop) + self.user = User.objects.create_user("curator", password="pw") + + def _make_pdf(self, mcp_id): + obj = PDFResource( + title="Annual Report", + original_filename="report.pdf", + creator=self.user, + modifier=self.user, + mcp_kb_document_id=mcp_id, + ) + obj.file.save("report.pdf", ContentFile(b"%PDF-1.4 test"), save=True) + return obj + + def test_replace_deletes_old_doc_then_re_adds(self): + from ask import kb_connector + from ask.tasks import run_kb_resource_upload + + obj = self._make_pdf(mcp_id=42) + with patch.object(kb_connector, "delete_kb_document") as mock_del, patch.object( + kb_connector, "add_pdf_to_kb", return_value={"doc_id": 99} + ) as mock_add: + run_kb_resource_upload("pdf", obj.pk, replace=True) + + mock_del.assert_called_once_with(42) + mock_add.assert_called_once() + obj.refresh_from_db() + self.assertEqual(obj.mcp_kb_document_id, 99) + self.assertEqual(obj.status, PDFResource.Status.SUCCESS) + + def test_replace_without_existing_doc_id_skips_delete(self): + from ask import kb_connector + from ask.tasks import run_kb_resource_upload + + obj = self._make_pdf(mcp_id=None) + with patch.object(kb_connector, "delete_kb_document") as mock_del, patch.object( + kb_connector, "add_pdf_to_kb", return_value={"doc_id": 99} + ) as mock_add: + run_kb_resource_upload("pdf", obj.pk, replace=True) + + mock_del.assert_not_called() + mock_add.assert_called_once() + 
obj.refresh_from_db() + self.assertEqual(obj.mcp_kb_document_id, 99) + + def test_replace_swallows_delete_failure_and_still_adds(self): + from ask import kb_connector + from ask.tasks import run_kb_resource_upload + + obj = self._make_pdf(mcp_id=42) + with patch.object( + kb_connector, "delete_kb_document", side_effect=Exception("boom") + ), patch.object( + kb_connector, "add_pdf_to_kb", return_value={"doc_id": 99} + ) as mock_add: + run_kb_resource_upload("pdf", obj.pk, replace=True) + + mock_add.assert_called_once() + obj.refresh_from_db() + self.assertEqual(obj.mcp_kb_document_id, 99) From e2cb39771b1f4268e4268ccdc7e48207b9072f99 Mon Sep 17 00:00:00 2001 From: Girik1105 Date: Tue, 26 May 2026 15:54:14 -0700 Subject: [PATCH 12/12] [HOP-63] Store date_published as partial ISO string --- hospexplorer/ask/admin.py | 18 +++-- hospexplorer/ask/admin_csv.py | 42 +++++------- .../ask/migrations/0015_iso_date_published.py | 65 +++++++++++++++++++ hospexplorer/ask/models.py | 15 +---- hospexplorer/ask/tasks.py | 3 +- hospexplorer/ask/tests.py | 49 +++++++------- 6 files changed, 117 insertions(+), 75 deletions(-) create mode 100644 hospexplorer/ask/migrations/0015_iso_date_published.py diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index 4a971ed..e4952cc 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -27,7 +27,7 @@ DocumentAuthorInstitution, InstitutionType, ) -from ask.admin_csv import import_names_csv, parse_partial_date +from ask.admin_csv import import_names_csv, normalize_partial_date from ask.kb_connector import delete_kb_document from ask.tasks import run_kb_resource_upload @@ -295,7 +295,7 @@ class WebsiteResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): fieldsets = ( (None, {"fields": ("title", "description", "url")}), ("Metadata", {"fields": ( - "date_published", "date_published_precision", + "date_published", "document_type", "document_author_institution", "institution_type", )}), ("Status", {"fields": ( @@ 
-307,7 +307,7 @@ class WebsiteResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): "title": "A short name to identify this website resource.", "description": "Optional details about what this website covers.", "url": "The URL the LLM will use as context when answering questions.", - "date_published_precision": "Granularity of the date above (year / month / day). Leave blank if unknown.", + "date_published": "Partial ISO date: YYYY, YYYY-MM, or YYYY-MM-DD. Leave blank if unknown.", } def get_form(self, request, obj=None, **kwargs): @@ -362,11 +362,9 @@ def _apply_zip_csv_metadata(obj, row): date_raw = (row.get("date_published") or "").strip() if date_raw: - parsed_date, precision = parse_partial_date(date_raw) - if parsed_date: - obj.date_published = parsed_date - obj.date_published_precision = precision - else: + try: + obj.date_published = normalize_partial_date(date_raw) + except ValueError: warnings.append( f"invalid date_published '{date_raw}' " "(use YYYY, YYYY-MM or YYYY-MM-DD); left blank" @@ -394,7 +392,7 @@ class PDFResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): fieldsets = ( (None, {"fields": ("title", "description", "file")}), ("Metadata", {"fields": ( - "date_published", "date_published_precision", + "date_published", "document_type", "document_author_institution", "institution_type", )}), ("Status", {"fields": ( @@ -406,7 +404,7 @@ class PDFResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): "title": "A short name to identify this PDF resource.", "description": "Optional details about what this PDF covers.", "file": "The PDF file the LLM will use as context when answering questions.", - "date_published_precision": "Granularity of the date above (year / month / day). Leave blank if unknown.", + "date_published": "Partial ISO date: YYYY, YYYY-MM, or YYYY-MM-DD. 
Leave blank if unknown.", } # Column names the bulk-import CSV must define (first = zip member, second = resource title) diff --git a/hospexplorer/ask/admin_csv.py b/hospexplorer/ask/admin_csv.py index 87eda10..9506a7e 100644 --- a/hospexplorer/ask/admin_csv.py +++ b/hospexplorer/ask/admin_csv.py @@ -1,34 +1,28 @@ import csv import datetime import io -import re -# A partial ISO date: a 4-digit year, optionally a month, optionally a day. -# Day can only appear when month does, so "year-day" is impossible to express. -_PARTIAL_DATE_RE = re.compile(r"^(\d{4})(?:-(\d{1,2})(?:-(\d{1,2}))?)?$") +def normalize_partial_date(value): + """Validate a partial ISO 8601 date string and return its trimmed form. -def parse_partial_date(value): - """Parse a partial ISO date string into a ``(date, precision)`` pair. - - Accepts ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD``. A missing month/day - defaults to 1 so the value still fits a ``DateField``; ``precision`` - ("year", "month" or "day") records how much was actually supplied so the - padding can be ignored later. Blank or unparseable input returns - ``(None, "")``. + Accepts ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` (zero-padded). Calendar + correctness is delegated to ``datetime.date.fromisoformat`` by padding + the missing components with ``-01``. Empty / whitespace input returns + ``""``; any other malformed value raises ``ValueError``. 
""" - match = _PARTIAL_DATE_RE.match((value or "").strip()) - if not match: - return None, "" - year, month, day = match.groups() - try: - if day is not None: - return datetime.date(int(year), int(month), int(day)), "day" - if month is not None: - return datetime.date(int(year), int(month), 1), "month" - return datetime.date(int(year), 1, 1), "year" - except ValueError: - return None, "" + s = (value or "").strip() + if not s: + return "" + if len(s) == 4: + datetime.date.fromisoformat(s + "-01-01") + elif len(s) == 7: + datetime.date.fromisoformat(s + "-01") + elif len(s) == 10: + datetime.date.fromisoformat(s) + else: + raise ValueError(f"not a partial ISO date: {value!r}") + return s def import_names_csv(model, file_obj): diff --git a/hospexplorer/ask/migrations/0015_iso_date_published.py b/hospexplorer/ask/migrations/0015_iso_date_published.py new file mode 100644 index 0000000..c18bd65 --- /dev/null +++ b/hospexplorer/ask/migrations/0015_iso_date_published.py @@ -0,0 +1,65 @@ +"""Switch date_published from (DateField + precision enum) to a single +CharField holding a partial ISO 8601 date string. + +The MissingMigration plus an AlterField wouldn't carry the precision +information across, so this migration: adds a temporary CharField, copies +each row's old (date, precision) pair into a partial ISO string, removes +the old fields, then renames the temp field to the canonical name. 
+""" +from django.db import migrations, models + + +def _to_iso_partial(date, precision): + if date is None: + return "" + if precision == "year": + return f"{date.year:04d}" + if precision == "month": + return f"{date.year:04d}-{date.month:02d}" + # "day" — or any other / empty precision, which we treat as a full date + return date.isoformat() + + +def forwards(apps, schema_editor): + for model_name in ("PDFResource", "WebsiteResource"): + Model = apps.get_model("ask", model_name) + for obj in Model.objects.all(): + obj.date_published_iso = _to_iso_partial( + obj.date_published, obj.date_published_precision + ) + obj.save(update_fields=["date_published_iso"]) + + +class Migration(migrations.Migration): + + dependencies = [ + ("ask", "0014_merge_20260526_2133"), + ] + + operations = [ + migrations.AddField( + model_name="pdfresource", + name="date_published_iso", + field=models.CharField(blank=True, default="", max_length=10), + ), + migrations.AddField( + model_name="websiteresource", + name="date_published_iso", + field=models.CharField(blank=True, default="", max_length=10), + ), + migrations.RunPython(forwards, migrations.RunPython.noop), + migrations.RemoveField(model_name="pdfresource", name="date_published"), + migrations.RemoveField(model_name="websiteresource", name="date_published"), + migrations.RemoveField(model_name="pdfresource", name="date_published_precision"), + migrations.RemoveField(model_name="websiteresource", name="date_published_precision"), + migrations.RenameField( + model_name="pdfresource", + old_name="date_published_iso", + new_name="date_published", + ), + migrations.RenameField( + model_name="websiteresource", + old_name="date_published_iso", + new_name="date_published", + ), + ] diff --git a/hospexplorer/ask/models.py b/hospexplorer/ask/models.py index f482720..be07fc2 100644 --- a/hospexplorer/ask/models.py +++ b/hospexplorer/ask/models.py @@ -43,11 +43,6 @@ class Status(models.TextChoices): ERROR = "error", "Error" WARNING = 
"warning", "Warning" - class DatePrecision(models.TextChoices): - YEAR = "year", "Year" - MONTH = "month", "Month" - DAY = "day", "Day" - title = models.CharField(max_length=255) description = models.TextField(blank=True, default="") creator = models.ForeignKey( @@ -71,13 +66,9 @@ class DatePrecision(models.TextChoices): ) status_message = models.TextField(blank=True, default="") - date_published = models.DateField(null=True, blank=True) - date_published_precision = models.CharField( - max_length=10, - choices=DatePrecision.choices, - blank=True, - default="", - ) + # ISO 8601 partial date: YYYY, YYYY-MM, or YYYY-MM-DD. Lexicographic + # ordering of the string also orders chronologically. + date_published = models.CharField(max_length=10, blank=True, default="") document_type = models.ForeignKey( "DocumentType", on_delete=models.SET_NULL, diff --git a/hospexplorer/ask/tasks.py b/hospexplorer/ask/tasks.py index 6eded79..5d05b4e 100644 --- a/hospexplorer/ask/tasks.py +++ b/hospexplorer/ask/tasks.py @@ -151,8 +151,7 @@ def _build_resource_metadata(obj): self-describing and doesn't depend on hosp-explorer's local IDs. 
""" return { - "date_published": obj.date_published.isoformat() if obj.date_published else None, - "date_published_precision": obj.date_published_precision or None, + "date_published": obj.date_published or None, "document_type": obj.document_type.name if obj.document_type_id else None, "document_author_institution": ( obj.document_author_institution.name if obj.document_author_institution_id else None diff --git a/hospexplorer/ask/tests.py b/hospexplorer/ask/tests.py index f14ebe3..16fbd0e 100644 --- a/hospexplorer/ask/tests.py +++ b/hospexplorer/ask/tests.py @@ -1,4 +1,3 @@ -import datetime import io import shutil import tempfile @@ -11,7 +10,7 @@ from django.urls import reverse from ask.admin import _apply_zip_csv_metadata -from ask.admin_csv import parse_partial_date +from ask.admin_csv import normalize_partial_date from ask.models import ( DocumentAuthorInstitution, DocumentType, @@ -61,34 +60,32 @@ def test_successful_file_removal_is_not_flagged(self): self.assertFalse(pdf.file_deletion_failed) -class ParsePartialDateTests(TestCase): +class NormalizePartialDateTests(TestCase): def test_full_date(self): - self.assertEqual( - parse_partial_date("2024-03-15"), (datetime.date(2024, 3, 15), "day") - ) + self.assertEqual(normalize_partial_date("2024-03-15"), "2024-03-15") def test_year_month(self): - self.assertEqual( - parse_partial_date("2024-03"), (datetime.date(2024, 3, 1), "month") - ) + self.assertEqual(normalize_partial_date("2024-03"), "2024-03") def test_year_only(self): - self.assertEqual( - parse_partial_date("2024"), (datetime.date(2024, 1, 1), "year") - ) + self.assertEqual(normalize_partial_date("2024"), "2024") def test_blank_or_none_returns_empty(self): - self.assertEqual(parse_partial_date(""), (None, "")) - self.assertEqual(parse_partial_date(" "), (None, "")) - self.assertEqual(parse_partial_date(None), (None, "")) + self.assertEqual(normalize_partial_date(""), "") + self.assertEqual(normalize_partial_date(" "), "") + 
self.assertEqual(normalize_partial_date(None), "") def test_impossible_calendar_dates_rejected(self): - self.assertEqual(parse_partial_date("2024-13"), (None, "")) - self.assertEqual(parse_partial_date("2024-02-30"), (None, "")) + with self.assertRaises(ValueError): + normalize_partial_date("2024-13") + with self.assertRaises(ValueError): + normalize_partial_date("2024-02-30") def test_non_iso_input_rejected(self): - self.assertEqual(parse_partial_date("March 2024"), (None, "")) - self.assertEqual(parse_partial_date("24-03-15"), (None, "")) + with self.assertRaises(ValueError): + normalize_partial_date("March 2024") + with self.assertRaises(ValueError): + normalize_partial_date("24-03-15") class ApplyZipCsvMetadataTests(TestCase): @@ -101,8 +98,7 @@ def test_creates_lookups_and_sets_fields(self): "institution_type": "NGO", }) self.assertEqual(warnings, []) - self.assertEqual(obj.date_published, datetime.date(2023, 6, 1)) - self.assertEqual(obj.date_published_precision, "month") + self.assertEqual(obj.date_published, "2023-06") self.assertEqual(obj.document_type.name, "Report") self.assertEqual(obj.document_author_institution.name, "WHO") self.assertEqual(obj.institution_type.name, "NGO") @@ -119,7 +115,7 @@ def test_blank_and_missing_columns_are_skipped(self): obj = PDFResource(title="Doc") warnings = _apply_zip_csv_metadata(obj, {"document_type": " ", "date_published": ""}) self.assertEqual(warnings, []) - self.assertIsNone(obj.date_published) + self.assertEqual(obj.date_published, "") self.assertIsNone(obj.document_type_id) self.assertEqual(_apply_zip_csv_metadata(PDFResource(title="Doc"), {}), []) @@ -128,7 +124,7 @@ def test_invalid_date_warns_and_leaves_field_blank(self): warnings = _apply_zip_csv_metadata(obj, {"date_published": "not-a-date"}) self.assertEqual(len(warnings), 1) self.assertIn("date_published", warnings[0]) - self.assertIsNone(obj.date_published) + self.assertEqual(obj.date_published, "") @override_settings(PDF_ZIP_CSV_COLUMNS=("filename", 
"title")) @@ -166,8 +162,7 @@ def test_zip_import_applies_csv_metadata(self): self.assertEqual(response.status_code, 302) pdf = PDFResource.objects.get(title="Annual Report") - self.assertEqual(pdf.date_published, datetime.date(2022, 1, 1)) - self.assertEqual(pdf.date_published_precision, "year") + self.assertEqual(pdf.date_published, "2022") self.assertEqual(pdf.document_type.name, "Report") self.assertEqual(pdf.document_author_institution.name, "WHO") self.assertEqual(pdf.institution_type.name, "NGO") @@ -182,7 +177,7 @@ def test_zip_import_works_without_metadata_columns(self): self.assertEqual(response.status_code, 302) pdf = PDFResource.objects.get(title="Plain Report") - self.assertIsNone(pdf.date_published) + self.assertEqual(pdf.date_published, "") self.assertIsNone(pdf.document_type_id) def test_zip_import_tolerates_whitespace_in_csv_header(self): @@ -199,5 +194,5 @@ def test_zip_import_tolerates_whitespace_in_csv_header(self): self.assertEqual(response.status_code, 302) pdf = PDFResource.objects.get(title="Spaced Report") - self.assertEqual(pdf.date_published, datetime.date(2021, 1, 1)) + self.assertEqual(pdf.date_published, "2021") self.assertEqual(pdf.document_type.name, "Report")