From d63246b9e5005f40260b877ea74945413b78dcb8 Mon Sep 17 00:00:00 2001 From: alhendrickson <159636032+alhendrickson@users.noreply.github.com.> Date: Wed, 18 Mar 2026 16:33:30 +0000 Subject: [PATCH] feat(helm): Create tutorial for medcat service usage --- .../examples/medcat-service-tutorial.ipynb | 341 ++++++++++++++++++ .../templates/examples-configmap.yaml | 6 + .../charts/jupyterhub/values.yaml | 16 +- .../cogstack-helm-ce/templates/NOTES.txt | 5 + .../cogstack-helm-ce/templates/_helpers.tpl | 4 + .../opensearch-provisioning-post-install.yaml | 2 +- 6 files changed, 370 insertions(+), 4 deletions(-) create mode 100644 helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-service-tutorial.ipynb create mode 100644 helm-charts/cogstack-helm-ce/charts/jupyterhub/templates/examples-configmap.yaml diff --git a/helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-service-tutorial.ipynb b/helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-service-tutorial.ipynb new file mode 100644 index 0000000..d6bc6a5 --- /dev/null +++ b/helm-charts/cogstack-helm-ce/charts/jupyterhub/examples/medcat-service-tutorial.ipynb @@ -0,0 +1,341 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3dd024b0-46cb-4410-8e33-89dbf2557b3f", + "metadata": {}, + "source": [ + "# CogStack Services Tutorial\n", + "\n", + "This is a step-by-step walkthrough that shows how to call two CogStack services over HTTP:\n", + "MedCAT (entity extraction) and AnonCAT (de-identification).\n", + "\n", + "## Who it is for:\n", + "This is for developers, data engineers, and analysts who want a quick, practical example of how\n", + "to integrate MedCAT/AnonCAT into a Python workflow (and later into a notebook-based analysis).\n", + "\n", + "## What it will do:\n", + "1) Define a sample clinical sentence and the service URLs.\n", + "2) Extract Entities, by calling the medcat-service API\n", + "3) Print the extracted entity annotations from the MedCAT response.\n", + "4) Deidentify 
text by calling the anoncat-service API\n", + "5) Print the de-identified text (and show the full JSON response for inspection).\n" + ] + }, + { + "cell_type": "markdown", + "id": "de9fd984-d6bd-4717-a5a0-4a34b233f594", + "metadata": {}, + "source": [ + "## Initialisation: Define the inputs and services\n", + "We pick a single sample sentence and the two HTTP endpoints we will call.\n", + "\n", + "The sample sentence contains concepts that the example demo packs used by medcat service have been trained for. \n", + "\n", + "For the service URLs, if using the cogstack community edition helm chart, these should all be setup for you automatically using kubernetes services and env vars. Otherwise change these accordingly." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a5f15845-5a0f-414e-9db7-f414d12bde48", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "import requests\n", + "\n", + "sample_text = \"John was diagnosed with Kidney Failure\"\n", + "\n", + "medcat_base_url = os.getenv(\n", + " \"MEDCAT_URL\", \"http://cogstack-helm-ce-medcat-service:5000\"\n", + ").rstrip(\"/\")\n", + "\n", + "anoncat_base_url = os.getenv(\n", + " \"ANONCAT_URL\", \"http://cogstack-helm-ce-anoncat-service:5000\"\n", + ").rstrip(\"/\")\n", + "\n", + "medcat_url = medcat_base_url + \"/api/process\"\n", + "anoncat_url = anoncat_base_url + \"/api/process\"" + ] + }, + { + "cell_type": "markdown", + "id": "b9b954ee-66e4-424b-b0f8-5602fe86ffa7", + "metadata": {}, + "source": [ + "## Perform Named Entity Resolution by calling MedCAT service\n", + "We can now use medcat service to extract entities from our note. 
\n", + "\n", + "We will send `sample_text` to MedCAT’s `/api/process` route where the payload is shaped as: `{\"content\": {\"text\": sample_text}}`\n", + "\n", + "We can then parse the JSON response and pull out: `medcat_result.get(\"result\").get(\"annotations\")`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "13a2405f-502d-4fff-9c0c-11fb7341ca3a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== MedCAT: entities ===\n", + "[\n", + " {\n", + " \"0\": {\n", + " \"pretty_name\": \"Kidney Failure\",\n", + " \"cui\": \"1\",\n", + " \"type_ids\": [\n", + " \"T047\"\n", + " ],\n", + " \"source_value\": \"Kidney Failure\",\n", + " \"detected_name\": \"kidney~failure\",\n", + " \"acc\": 1,\n", + " \"context_similarity\": 1,\n", + " \"start\": 24,\n", + " \"end\": 38,\n", + " \"id\": 0,\n", + " \"meta_anns\": {},\n", + " \"context_left\": [],\n", + " \"context_center\": [],\n", + " \"context_right\": []\n", + " }\n", + " }\n", + "]\n" + ] + } + ], + "source": [ + "medcat_payload = {\"content\": {\"text\": sample_text}}\n", + "\n", + "medcat_result = requests.post(medcat_url, json=medcat_payload).json()\n", + "medcat_annotations = medcat_result.get(\"result\").get(\"annotations\")\n", + "\n", + "print(\"=== MedCAT: entities ===\")\n", + "print(json.dumps(medcat_annotations, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "95a2f603-f107-4266-9bd8-ae64fd6e7692", + "metadata": {}, + "source": [ + "From the above results, we can see that the service has detected \"Kidney Failure\" in the text with a cui of \"1\".\n", + "\n", + "We can see the raw JSON response from medcat by printing it" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "aad682db-df27-4520-b18d-58007594053f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== MedCAT Service: Raw results ===\n", + "{\n", + " \"medcat_info\": {\n", + " 
\"service_app_name\": \"MedCAT\",\n", + " \"service_language\": \"en\",\n", + " \"service_version\": \"2.4.0.dev0\",\n", + " \"service_model\": \"unknown\",\n", + " \"model_card_info\": {\n", + " \"ontologies\": \"None\",\n", + " \"meta_cat_model_names\": [],\n", + " \"rel_cat_model_names\": [],\n", + " \"model_last_modified_on\": \"2025-07-14T12:36:10.286051\"\n", + " }\n", + " },\n", + " \"result\": {\n", + " \"text\": \"John was diagnosed with Kidney Failure\",\n", + " \"annotations\": [\n", + " {\n", + " \"0\": {\n", + " \"pretty_name\": \"Kidney Failure\",\n", + " \"cui\": \"1\",\n", + " \"type_ids\": [\n", + " \"T047\"\n", + " ],\n", + " \"source_value\": \"Kidney Failure\",\n", + " \"detected_name\": \"kidney~failure\",\n", + " \"acc\": 1,\n", + " \"context_similarity\": 1,\n", + " \"start\": 24,\n", + " \"end\": 38,\n", + " \"id\": 0,\n", + " \"meta_anns\": {},\n", + " \"context_left\": [],\n", + " \"context_center\": [],\n", + " \"context_right\": []\n", + " }\n", + " }\n", + " ],\n", + " \"success\": true,\n", + " \"timestamp\": \"2026-03-18T16:08:44.595+00:00\",\n", + " \"elapsed_time\": 0.003085773,\n", + " \"footer\": null\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "print(\"=== MedCAT Service: Raw results ===\")\n", + "print(json.dumps(medcat_result, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "c4a7dfdb-730f-4390-94eb-31055ecae914", + "metadata": {}, + "source": [ + "## Perform deidentification by using AnonCAT service\n", + "\n", + "We can also use AnonCAT service to deidentify our notes.\n", + "\n", + "The process for this is the same as medcat, the only difference is that we will call a different endpoint. 
We will send `sample_text` to AnonCAT’s `/api/process` route where the payload is shaped as: `{\"content\": {\"text\": sample_text}}`.\n", + "\n", + "We can then parse the JSON response and pull out the text which should be anonymised" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e2e0cba5-3c08-4c1f-869f-f8012b0a9282", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== AnonCAT: Deidentification result ===\n", + "The input was 'John was diagnosed with Kidney Failure'. The output was '[PATIENT] diagnosed with Kidney Failure'\n" + ] + } + ], + "source": [ + "anoncat_payload = {\"content\": {\"text\": sample_text}}\n", + "anoncat_result = requests.post(anoncat_url, json=anoncat_payload).json()\n", + "\n", + "deidentified_text = anoncat_result.get(\"result\").get(\"text\")\n", + "\n", + "print(\"=== AnonCAT: Deidentification result ===\")\n", + "print(f\"The input was '{sample_text}'. The output was '{deidentified_text}'\")" + ] + }, + { + "cell_type": "markdown", + "id": "e4a9c7ce-4255-4ddc-a704-34663298a8a0", + "metadata": {}, + "source": [ + "From the above result, we can see that it has found that the note had the name \"John\", which it has replaced with the placeholder `[PATIENT]`. This has anonymised the note. Note we could alternatively change the service to redact the text, and return `[***]`, which we can do by configuring the service values and redeploying. \n", + "\n", + "We can see the raw JSON response from AnonCAT by printing it. Note that it is the same format as the MedCAT response; instead of finding medical concepts, it has found the patient name." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e792e179-fc8a-4e76-aa19-efb0a3e01241", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== AnonCAT: Deidentification result ===\n", + "{\n", + " \"medcat_info\": {\n", + " \"service_app_name\": \"MedCAT\",\n", + " \"service_language\": \"en\",\n", + " \"service_version\": \"2.4.0.dev0\",\n", + " \"service_model\": \"unknown\",\n", + " \"model_card_info\": {\n", + " \"ontologies\": [],\n", + " \"meta_cat_model_names\": [],\n", + " \"rel_cat_model_names\": [],\n", + " \"model_last_modified_on\": \"2025-08-15T15:14:34.047031\"\n", + " }\n", + " },\n", + " \"result\": {\n", + " \"text\": \"[PATIENT] diagnosed with Kidney Failure\",\n", + " \"annotations\": [\n", + " {\n", + " \"0\": {\n", + " \"pretty_name\": \"PATIENT\",\n", + " \"cui\": \"PATIENT\",\n", + " \"type_ids\": [],\n", + " \"source_value\": \"John was\",\n", + " \"detected_name\": \"\",\n", + " \"acc\": 0.9922866225242615,\n", + " \"context_similarity\": 0.9922866225242615,\n", + " \"start\": 0,\n", + " \"end\": 8,\n", + " \"id\": 0,\n", + " \"meta_anns\": {},\n", + " \"context_left\": [],\n", + " \"context_center\": [],\n", + " \"context_right\": []\n", + " }\n", + " }\n", + " ],\n", + " \"success\": true,\n", + " \"timestamp\": \"2026-03-18T16:09:41.266+00:00\",\n", + " \"elapsed_time\": 0.011446122,\n", + " \"footer\": null\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "print(\"=== AnonCAT: Deidentification result ===\")\n", + "print(json.dumps(anoncat_result, indent=2))\n" + ] + }, + { + "cell_type": "markdown", + "id": "aed034c8-7630-48f8-b382-f220f4eddfa8", + "metadata": {}, + "source": [ + "## Summary\n", + "This is the end of this tutorial.\n", + "\n", + "We can see by calling the model services, we are able to get entities and deidentify text just by calling two http APIs. \n", + "\n", + "## What next?\n", + "There's two options of where to go next:\n", + "1. 
Set up a data pipeline that can call these services and write results into OpenSearch\n", + "2. Use MedCAT Trainer and set up an MLOps flow for training a model and redeploying the services with the new model." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/helm-charts/cogstack-helm-ce/charts/jupyterhub/templates/examples-configmap.yaml b/helm-charts/cogstack-helm-ce/charts/jupyterhub/templates/examples-configmap.yaml new file mode 100644 index 0000000..590a7b5 --- /dev/null +++ b/helm-charts/cogstack-helm-ce/charts/jupyterhub/templates/examples-configmap.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: jupyter-examples +data: +{{ (.Files.Glob "examples/*").AsConfig | indent 2 }} diff --git a/helm-charts/cogstack-helm-ce/charts/jupyterhub/values.yaml b/helm-charts/cogstack-helm-ce/charts/jupyterhub/values.yaml index c0740e5..e2f03d1 100644 --- a/helm-charts/cogstack-helm-ce/charts/jupyterhub/values.yaml +++ b/helm-charts/cogstack-helm-ce/charts/jupyterhub/values.yaml @@ -67,7 +67,7 @@ jupyterhub: fi networkPolicy: enabled: false - + startTimeout: 600 # Give 10 minutes to start the container, which includes pulling the image. # User pod configuration uid: 0 fsGid: 0 @@ -81,8 +81,18 @@ jupyterhub: storage: capacity: 5Gi - - + # NOTE: Prefer dictionary-form here to avoid Helm merge issues + # when this subchart is configured by a parent chart. 
+ extraVolumes: + jupyter-examples: + name: jupyter-examples + configMap: + name: jupyter-examples + extraVolumeMounts: + jupyter-examples: + name: jupyter-examples + mountPath: /home/jovyan/work/examples + readOnly: true cull: enabled: false scheduling: diff --git a/helm-charts/cogstack-helm-ce/templates/NOTES.txt b/helm-charts/cogstack-helm-ce/templates/NOTES.txt index ab3903d..ff04f5a 100644 --- a/helm-charts/cogstack-helm-ce/templates/NOTES.txt +++ b/helm-charts/cogstack-helm-ce/templates/NOTES.txt @@ -42,3 +42,8 @@ echo "Visit http://127.0.0.1:8080 to use MedCAT Trainer" {{ if .Values.opensearch.enabled }}echo "Visit https://127.0.0.1:9200 to use OpenSearch"{{- end }} {{ if index .Values "opensearch-dashboards" "enabled" }}echo "Visit http://127.0.0.1:5601 to use OpenSearch Dashboards"{{- end }} {{ if index .Values "cogstack-jupyterhub" "enabled" }}echo "Visit http://127.0.0.1:8000 to use jupyterhub"{{- end }} +{{ if index .Values "cogstack-jupyterhub" "enabled" }}echo "Visit http://127.0.0.1:8000/user/admin/notebooks/medcat-service-tutorial.ipynb to get started with a tutorial"{{- end }} + +# Setup Complete +# Run this command line to setup port-forwarding and access services +# `helm get notes {{ .Release.Name }} | bash` diff --git a/helm-charts/cogstack-helm-ce/templates/_helpers.tpl b/helm-charts/cogstack-helm-ce/templates/_helpers.tpl index 9ec6d37..ad7acb9 100644 --- a/helm-charts/cogstack-helm-ce/templates/_helpers.tpl +++ b/helm-charts/cogstack-helm-ce/templates/_helpers.tpl @@ -66,10 +66,14 @@ Dependency URLs */}} {{- define "opensearch.url" -}} +{{- if .Values.opensearch.enabled }} {{- $serviceName := include "opensearch.serviceName" (index .Subcharts "opensearch") -}} {{- $scheme := default "https" .Values.opensearch.protocol -}} {{- $port := default 9200 .Values.opensearch.httpPort -}} {{- printf "%s://%s:%v" $scheme $serviceName $port -}} +{{- else -}} +"opensearch-disabled" +{{- end}} {{- end }} {{- define "opensearch-dashboards.url" -}} diff 
--git a/helm-charts/cogstack-helm-ce/templates/opensearch-provisioning-post-install.yaml b/helm-charts/cogstack-helm-ce/templates/opensearch-provisioning-post-install.yaml index 3be6a7e..39317e7 100644 --- a/helm-charts/cogstack-helm-ce/templates/opensearch-provisioning-post-install.yaml +++ b/helm-charts/cogstack-helm-ce/templates/opensearch-provisioning-post-install.yaml @@ -1,4 +1,4 @@ -{{- if .Values.provisioning.enabled }} +{{- if and (.Values.provisioning.enabled) (or (.Values.opensearch.enabled) (index .Values "opensearch-dashboards" "enabled")) }} apiVersion: v1 kind: Pod metadata: