Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 214 additions & 18 deletions hospexplorer/ask/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,18 @@
from django.shortcuts import render
from django.urls import path, reverse

from ask.models import Conversation, TermsAcceptance, QARecord, SimWorkflow, WebsiteResource, PDFResource
from ask.models import (
Conversation,
TermsAcceptance,
QARecord,
SimWorkflow,
WebsiteResource,
PDFResource,
DocumentType,
DocumentAuthorInstitution,
InstitutionType,
)
from ask.admin_csv import import_names_csv, normalize_partial_date
from ask.kb_connector import delete_kb_document
from ask.tasks import run_kb_resource_upload

Expand Down Expand Up @@ -205,16 +216,98 @@ def delete_queryset(self, request, queryset):
return


class LookupCSVImportMixin:
    """Adds an Import CSV button + upload view to a lookup ModelAdmin.

    CSV is single-column name. Duplicates are skipped, header row optional.

    Meant to be mixed in ahead of ``admin.ModelAdmin``: it relies on
    ``super().get_urls()`` and on the ``ModelAdmin`` attributes ``model``,
    ``admin_site``, ``message_user`` and ``has_add_permission``.
    """

    # Changelist template override used for models that include this mixin.
    change_list_template = "admin/ask/lookup_change_list.html"

    def get_urls(self):
        """Return the admin URLs with the ``import-csv/`` route prepended.

        The route is named ``<app_label>_<model_name>_import_csv`` and is
        placed before the inherited URLs so it is matched first.
        """
        urls = super().get_urls()
        info = (self.model._meta.app_label, self.model._meta.model_name)
        return [
            path(
                "import-csv/",
                self.admin_site.admin_view(self.import_csv_view),
                name=f"{info[0]}_{info[1]}_import_csv",
            ),
        ] + urls

    def import_csv_view(self, request):
        """Render the CSV upload form, or import the uploaded file on POST.

        Every POST outcome (missing file, wrong extension, import failure or
        success) is reported through ``message_user`` and followed by a
        redirect to the model's changelist. Any other method renders the
        upload form template.

        Raises:
            PermissionDenied: if the requesting user lacks add permission on
                the model. ``admin_view`` only gates access to the admin
                site itself, while this view creates rows, so the model-level
                permission has to be enforced here.
        """
        # Imported locally so this block does not depend on the module's
        # import list; Django is already a dependency of this file.
        from django.core.exceptions import PermissionDenied

        if not self.has_add_permission(request):
            raise PermissionDenied

        info = (self.model._meta.app_label, self.model._meta.model_name)
        changelist_url = reverse(f"admin:{info[0]}_{info[1]}_changelist")

        if request.method == "POST":
            file_obj = request.FILES.get("csv_file")
            if file_obj is None:
                self.message_user(request, "No file provided.", level="error")
            elif not file_obj.name.lower().endswith(".csv"):
                self.message_user(request, "File must have a .csv extension.", level="error")
            else:
                try:
                    created, skipped = import_names_csv(self.model, file_obj)
                except Exception as e:
                    # Top-level boundary for the upload: log the traceback and
                    # show the admin a message instead of a 500 page.
                    logger.exception("CSV import failed for %s", self.model.__name__)
                    self.message_user(request, f"Import failed: {e}", level="error")
                else:
                    self.message_user(
                        request,
                        f"Imported {created} new {self.model._meta.verbose_name_plural} "
                        f"(skipped {skipped} duplicate or empty rows).",
                    )
            # Redirect after POST in every case so a refresh cannot resubmit.
            return HttpResponseRedirect(changelist_url)

        context = {
            **self.admin_site.each_context(request),
            "title": f"Import {self.model._meta.verbose_name_plural} from CSV",
            "opts": self.model._meta,
            "changelist_url": changelist_url,
        }
        return render(request, "admin/ask/lookup_csv_import.html", context)


@admin.register(DocumentType)
class DocumentTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin):
    """Admin for ``DocumentType`` rows: list and search by name, CSV import."""

    search_fields = ("name",)
    list_display = ("name",)


@admin.register(DocumentAuthorInstitution)
class DocumentAuthorInstitutionAdmin(LookupCSVImportMixin, admin.ModelAdmin):
    """Admin for ``DocumentAuthorInstitution`` rows: list and search by name, CSV import."""

    search_fields = ("name",)
    list_display = ("name",)


@admin.register(InstitutionType)
class InstitutionTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin):
    """Admin for ``InstitutionType`` rows: list and search by name, CSV import."""

    search_fields = ("name",)
    list_display = ("name",)


@admin.register(WebsiteResource)
class WebsiteResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin):
list_display = ("title", "url", "creator", "status", "modified_at")
list_filter = ("status",)
search_fields = ("title", "url")
readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", "status", "status_message")
fieldsets = (
(None, {"fields": ("title", "description", "url")}),
("Metadata", {"fields": (
"date_published",
"document_type", "document_author_institution", "institution_type",
)}),
("Status", {"fields": (
"status", "status_message", "mcp_kb_document_id",
"created_at", "modified_at", "creator", "modifier",
)}),
)
help_texts = {
"title": "A short name to identify this website resource.",
"description": "Optional details about what this website covers.",
"url": "The URL the LLM will use as context when answering questions.",
"date_published": "Partial ISO date: YYYY, YYYY-MM, or YYYY-MM-DD. Leave blank if unknown.",
}

def get_form(self, request, obj=None, **kwargs):
Expand Down Expand Up @@ -248,16 +341,70 @@ def save_model(self, request, obj, form, change):
)


# Maps each optional zip-CSV metadata column to the lookup model that backs it.
# The column name doubles as the attribute set on the imported PDFResource.
# A value not seen before gets its lookup row created on first use, so the
# controlled lists grow from whatever the imports actually contain.
ZIP_CSV_LOOKUP_COLUMNS = {
    "document_type": DocumentType,
    "document_author_institution": DocumentAuthorInstitution,
    "institution_type": InstitutionType,
}


def _apply_zip_csv_metadata(obj, row):
    """Copy the optional metadata columns of one zip-CSV row onto ``obj``.

    ``obj`` is mutated in place and is not saved here. None of the metadata
    columns is required: an absent or blank cell leaves the matching
    attribute untouched. A value that cannot be applied never aborts the
    row; it is reported in the returned list instead.

    Args:
        obj: Resource instance receiving ``date_published`` and the lookup
            attributes named in ``ZIP_CSV_LOOKUP_COLUMNS``.
        row: Mapping of CSV column name to raw cell text for one row.

    Returns:
        A list of human-readable warning strings, empty when every supplied
        value was applied.
    """
    problems = []

    def cell(key):
        # Missing keys and None cells both collapse to the empty string.
        return (row.get(key) or "").strip()

    published = cell("date_published")
    if published:
        try:
            obj.date_published = normalize_partial_date(published)
        except ValueError:
            problems.append(
                f"invalid date_published '{published}' "
                "(use YYYY, YYYY-MM or YYYY-MM-DD); left blank"
            )

    for column, model in ZIP_CSV_LOOKUP_COLUMNS.items():
        text = cell(column)
        if not text:
            continue
        if len(text) <= 255:
            # Reuse the lookup row for this name, creating it on first sight.
            lookup, _created = model.objects.get_or_create(name=text)
            setattr(obj, column, lookup)
        else:
            problems.append(f"{column} value exceeds 255 characters; left blank")

    return problems


@admin.register(PDFResource)
class PDFResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin):
list_display = ("title", "file", "creator", "status", "modified_at")
list_filter = ("status",)
search_fields = ("title",)
readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", "status", "status_message")
fieldsets = (
(None, {"fields": ("title", "description", "file")}),
("Metadata", {"fields": (
"date_published",
"document_type", "document_author_institution", "institution_type",
)}),
("Status", {"fields": (
"status", "status_message", "mcp_kb_document_id",
"created_at", "modified_at", "creator", "modifier",
)}),
)
help_texts = {
"title": "A short name to identify this PDF resource.",
"description": "Optional details about what this PDF covers.",
"file": "The PDF file the LLM will use as context when answering questions.",
"date_published": "Partial ISO date: YYYY, YYYY-MM, or YYYY-MM-DD. Leave blank if unknown.",
}

# Column names the bulk-import CSV must define (first = zip member, second = resource title)
Expand Down Expand Up @@ -329,6 +476,9 @@ def zip_upload_view(self, request):
messages.error(request, "Please select a zip file to upload.")
return HttpResponseRedirect(request.path)

update_file = bool(request.POST.get("update_file"))
update_metadata = bool(request.POST.get("update_metadata"))

try:
archive = zipfile.ZipFile(zip_file)
except zipfile.BadZipFile:
Expand Down Expand Up @@ -356,7 +506,12 @@ def _is_real(name):

csv_text = archive.read(csv_names[0]).decode("utf-8-sig")
reader = csv.DictReader(io.StringIO(csv_text))
csv_columns = {(name or "").strip() for name in (reader.fieldnames or [])}
# strip header names so the column check and per-row lookups use
# the same keys; otherwise a header like "filename, title" leaves
# stray spaces and every row reads as missing its required fields
if reader.fieldnames:
reader.fieldnames = [(name or "").strip() for name in reader.fieldnames]
csv_columns = set(reader.fieldnames or [])
if not required_columns.issubset(csv_columns):
missing = ", ".join(sorted(required_columns - csv_columns))
messages.error(request, f"CSV is missing required columns: {missing}.")
Expand All @@ -375,7 +530,9 @@ def _is_real(name):

total = 0
saved = 0
queued_ids = []
updated = 0
queued_new_ids = []
queued_replace_ids = []
for row in reader:
total += 1
filename = (row.get(filename_col) or "").strip()
Expand All @@ -388,19 +545,45 @@ def _is_real(name):
continue

basename = os.path.basename(filename)
if (basename, title) in existing_pdfs:
messages.warning(request, f"Row {total}: '{filename}' already exists; skipped.")
continue
is_update = (basename, title) in existing_pdfs

member = zip_members.get(filename) or zip_members.get(basename)
if not member:
messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.")
if is_update and not (update_file or update_metadata):
messages.warning(request, f"Row {total}: '{filename}' already exists; skipped.")
continue

try:
pdf_bytes = archive.read(member)
except KeyError:
messages.warning(request, f"Row {total}: could not read '{filename}'; skipped.")
# only read PDF bytes when we'll actually use them: new rows
# always need them; existing rows only when update_file is set
pdf_bytes = None
if (not is_update) or update_file:
member = zip_members.get(filename) or zip_members.get(basename)
if not member:
messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.")
continue
try:
pdf_bytes = archive.read(member)
except KeyError:
messages.warning(request, f"Row {total}: could not read '{filename}'; skipped.")
continue

if is_update:
match = PDFResource.objects.filter(
original_filename=basename, title=title
).first()
if match is None:
messages.warning(request, f"Row {total}: '{filename}' lookup failed; skipped.")
continue
if update_metadata:
for warning in _apply_zip_csv_metadata(match, row):
messages.warning(request, f"Row {total}: {warning}")
if update_file:
match.file.delete(save=False)
match.file.save(basename, ContentFile(pdf_bytes), save=False)
match.modifier = request.user
match.status = PDFResource.Status.PROCESSING
match.status_message = "Queued for Knowledge Base re-upload."
match.save()
updated += 1
queued_replace_ids.append(match.pk)
continue

obj = PDFResource(
Expand All @@ -411,26 +594,39 @@ def _is_real(name):
status=PDFResource.Status.PROCESSING,
status_message="Queued for Knowledge Base upload.",
)
for warning in _apply_zip_csv_metadata(obj, row):
messages.warning(request, f"Row {total}: {warning}")
obj.file.save(basename, ContentFile(pdf_bytes), save=True)
saved += 1
existing_pdfs.add((basename, title))
queued_ids.append(obj.pk)
queued_new_ids.append(obj.pk)

# fire KB uploads after the request transaction commits so background
# threads see the just-saved rows
def _start_uploads(ids=tuple(queued_ids)):
for pk in ids:
def _start_uploads(
new_ids=tuple(queued_new_ids),
replace_ids=tuple(queued_replace_ids),
):
for pk in new_ids:
threading.Thread(
target=run_kb_resource_upload,
args=("pdf", pk),
daemon=True,
).start()
for pk in replace_ids:
threading.Thread(
target=run_kb_resource_upload,
args=("pdf", pk),
kwargs={"replace": True},
daemon=True,
).start()
transaction.on_commit(_start_uploads)

messages.success(
request,
f"Imported {saved} of {total} PDFs. Knowledge Base uploads are running in the "
"background — refresh the list to see each row's final status.",
f"Imported {saved} new and updated {updated} of {total} PDF rows. "
"Knowledge Base uploads are running in the background — "
"refresh the list to see each row's final status.",
)
return HttpResponseRedirect(changelist_url)

Expand Down
49 changes: 49 additions & 0 deletions hospexplorer/ask/admin_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import csv
import datetime
import io


def normalize_partial_date(value):
    """Validate a partial ISO 8601 date string and return its trimmed form.

    Accepts ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` (zero-padded, ASCII
    digits). The shape is checked explicitly before anything else, because
    a length check alone is not enough: on Python 3.11+
    ``datetime.date.fromisoformat`` also parses other ISO 8601 spellings of
    the same length (for example the week date ``2024-W01-1``), which are
    not partial calendar dates. Calendar correctness (month and day ranges,
    leap years) is then delegated to ``fromisoformat`` by padding the
    missing components with ``-01``.

    Args:
        value: Raw input; ``None`` is treated like an empty string.

    Returns:
        The stripped input string, or ``""`` for empty / whitespace input.

    Raises:
        ValueError: if the value is not one of the three accepted shapes or
            does not name a real calendar date.
    """
    s = (value or "").strip()
    if not s:
        return ""

    parts = s.split("-")
    expected_widths = (4, 2, 2)[: len(parts)]
    well_formed = (
        len(parts) <= 3
        and tuple(len(part) for part in parts) == expected_widths
        and all(part.isascii() and part.isdigit() for part in parts)
    )
    if not well_formed:
        raise ValueError(f"not a partial ISO date: {value!r}")

    # Pad to a full date so the standard library checks the calendar values.
    padded = "-".join(parts + ["01"] * (3 - len(parts)))
    datetime.date.fromisoformat(padded)
    return s


def import_names_csv(model, file_obj):
    """Import a one-column CSV into a model with a ``name`` field.

    Only the first column of each row is read, and surrounding whitespace is
    stripped. A header is recognised only as the first non-blank row and only
    when its value is ``name`` (case-insensitive); a later row with that
    value is imported like any other name.

    Args:
        model: Class exposing ``objects.get_or_create(name=...)`` that
            returns an ``(instance, created)`` pair.
        file_obj: Binary file-like object; ``read()`` must return bytes.
            Decoded as UTF-8 with an optional BOM; undecodable bytes are
            replaced rather than rejected.

    Returns:
        ``(created, skipped)``. Blank rows, the leading header row, and rows
        whose name already exists in the table are all counted as skipped.
    """
    text = file_obj.read().decode("utf-8-sig", errors="replace")

    created = 0
    skipped = 0
    header_possible = True
    for row in csv.reader(io.StringIO(text)):
        name = row[0].strip() if row else ""
        if not name:
            skipped += 1
            continue
        if header_possible:
            # Only the first non-blank row can be the header; after this,
            # a cell reading "name" is real data.
            header_possible = False
            if name.lower() == "name":
                skipped += 1
                continue
        _, was_created = model.objects.get_or_create(name=name)
        if was_created:
            created += 1
        else:
            skipped += 1
    return created, skipped
Loading