From 0d219f8e5c5e43c69c07d02c6891c226839f31f4 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sun, 8 Mar 2026 21:58:33 +0100 Subject: [PATCH 01/12] initial work --- .gitignore | 17 +++ .vscode/launch.json | 15 ++ Makefile | 25 ++++ docker-compose.swarm.yml | 172 +++++++++++++++++++++++ docker/backend/worker-beat-entrypoint.sh | 3 - docker/backend/worker-entrypoint.sh | 3 - 6 files changed, 229 insertions(+), 6 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 docker-compose.swarm.yml diff --git a/.gitignore b/.gitignore index 452309f2..da00de86 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,9 @@ **/__pycache__/ **/.pytest_cache/ **/node_modules +**/build **/.dev +**/.debug **/letsencrypt .DS_Store .env.* @@ -12,6 +14,21 @@ tmp* django_static local +### Local/personal files ### +CLAUDE.md +.analyses/ +plans/ +.python-version +backend/Makefile.local +backend/Makefile_v2 +backend/core/settings_backup_*.py +docker/backend/*_backup.sh +redis-proxy.py +pyproject.toml +uv.lock +backend/pyproject.toml +backend/package-lock.json + ### VisualStudioCode ### .vscode/* !.vscode/settings.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..15416932 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Debug WR31 rule_executor", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/backend/.debug/debug_wr31.py", + "python": "${workspaceFolder}/backend/.dev/venv/bin/python", + "cwd": "${workspaceFolder}/backend", + "console": "integratedTerminal", + "justMyCode": false + } + ] +} diff --git a/Makefile b/Makefile index 0163120f..a15e04b2 100644 --- a/Makefile +++ b/Makefile @@ -26,6 +26,31 @@ start-infra-only: stop: docker compose down +# --- Docker Swarm --- + +REGISTRY ?= localhost:5000 +WORKERS ?= 2 + +start-swarm: + docker stack deploy -c docker-compose.swarm.yml --with-registry-auth validate + +stop-swarm: + docker stack 
rm validate + +scale-workers: + docker service scale validate_worker=$(WORKERS) + +swarm-push: build + docker tag buildingsmart/validationsvc-backend $(REGISTRY)/validationsvc-backend + docker tag buildingsmart/validationsvc-frontend $(REGISTRY)/validationsvc-frontend + docker push $(REGISTRY)/validationsvc-backend + docker push $(REGISTRY)/validationsvc-frontend + +swarm-status: + @docker service ls + @echo "---" + @docker service ps validate_worker + build: docker compose build \ --build-arg GIT_COMMIT_HASH="$$(git rev-parse --short HEAD)" \ diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml new file mode 100644 index 00000000..8975a4f8 --- /dev/null +++ b/docker-compose.swarm.yml @@ -0,0 +1,172 @@ +# Docker Swarm deployment configuration +# +# Usage: +# 1. Build and push images: make swarm-push +# 2. Deploy: make start-swarm +# 3. Scale workers: make scale-workers WORKERS=5 +# 4. Status: make swarm-status +# 5. Tear down: make stop-swarm +# +# Prerequisites: +# - docker swarm init +# - Local registry: docker service create --name registry --publish 5000:5000 registry:2 +# - NFS server configured (see PR description) +# +# NFS: Set NFS_SERVER_IP in .env (default: 10.0.0.1). +# For local testing without NFS, override volumes with plain named volumes. 
+ +services: + + frontend: + image: ${REGISTRY:-localhost:5000}/validationsvc-frontend + ports: + - 80:80 + - 443:443 + environment: + CERTBOT_DOMAIN: ${CERTBOT_DOMAIN} + CERTBOT_EMAIL: ${CERTBOT_EMAIL} + volumes: + - letsencrypt_data:/etc/letsencrypt + - static_data:/app/backend/django_static + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + backend: + image: ${REGISTRY:-localhost:5000}/validationsvc-backend + entrypoint: /app/backend/server-entrypoint.sh + env_file: .env + volumes: + - static_data:/app/backend/django_static + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + expose: + - 8000 + networks: + - validate + deploy: + replicas: 2 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + healthcheck: + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/alive')\""] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + + worker: + image: ${REGISTRY:-localhost:5000}/validationsvc-backend + entrypoint: /app/backend/worker-entrypoint.sh + env_file: .env + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 2 + # No placement constraint - workers run on any node + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + resources: + limits: + cpus: "2.0" + memory: 4G + reservations: + cpus: "1.0" + memory: 2G + + scheduler: + image: ${REGISTRY:-localhost:5000}/validationsvc-backend + entrypoint: /app/backend/worker-beat-entrypoint.sh + env_file: .env + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 1 + placement: + 
constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + redis: + image: redis:8.4-alpine + command: redis-server --protected-mode no --bind 0.0.0.0 + expose: + - 6379 + volumes: + - redis_data:/data + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + db: + image: postgres:16.10-alpine + environment: + POSTGRES_DB: ${POSTGRES_NAME} + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + expose: + - 5432 + volumes: + - postgres_data:/var/lib/postgresql/data/ + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + +networks: + validate: + driver: overlay + +volumes: + static_data: + letsencrypt_data: + postgres_data: + redis_data: + files_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP:-10.0.0.1},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/files_data" + gherkin_rules_log_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP:-10.0.0.1},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/gherkin_logs" diff --git a/docker/backend/worker-beat-entrypoint.sh b/docker/backend/worker-beat-entrypoint.sh index 23c38f6e..1eeb5afa 100644 --- a/docker/backend/worker-beat-entrypoint.sh +++ b/docker/backend/worker-beat-entrypoint.sh @@ -23,9 +23,6 @@ freshclam service clamav-freshclam start service clamav-daemon start -python manage.py makemigrations -python manage.py migrate - CELERY_CONCURRENCY=${CELERY_CONCURRENCY:-6} # default 6 worker processes echo "Celery concurrency: $CELERY_CONCURRENCY" diff --git a/docker/backend/worker-entrypoint.sh b/docker/backend/worker-entrypoint.sh index c08929b8..cbe08319 100644 --- a/docker/backend/worker-entrypoint.sh +++ b/docker/backend/worker-entrypoint.sh @@ -23,9 +23,6 @@ freshclam service 
clamav-freshclam start service clamav-daemon start -python manage.py makemigrations -python manage.py migrate - CELERY_CONCURRENCY=${CELERY_CONCURRENCY:-6} # default 6 worker processes echo "Celery concurrency: $CELERY_CONCURRENCY" From 012776c5a61e765cb3d9ce7e3f5143f1e65f094e Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Tue, 10 Mar 2026 17:14:33 +0000 Subject: [PATCH 02/12] docker swarm deployment support - file upload seek position --- .vscode/launch.json | 15 ----- Makefile | 3 + backend/apps/ifc_validation/api/v1/views.py | 1 + docker-compose.swarm.local.yml | 67 +++++++++++++++++++++ docker-compose.swarm.yml | 10 +-- 5 files changed, 76 insertions(+), 20 deletions(-) delete mode 100644 .vscode/launch.json create mode 100644 docker-compose.swarm.local.yml diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index 15416932..00000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "version": "0.2.0", - "configurations": [ - { - "name": "Debug WR31 rule_executor", - "type": "debugpy", - "request": "launch", - "program": "${workspaceFolder}/backend/.debug/debug_wr31.py", - "python": "${workspaceFolder}/backend/.dev/venv/bin/python", - "cwd": "${workspaceFolder}/backend", - "console": "integratedTerminal", - "justMyCode": false - } - ] -} diff --git a/Makefile b/Makefile index a15e04b2..608e1f25 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,9 @@ WORKERS ?= 2 start-swarm: docker stack deploy -c docker-compose.swarm.yml --with-registry-auth validate +start-swarm-local: + docker stack deploy -c docker-compose.swarm.yml -c docker-compose.swarm.local.yml --with-registry-auth validate + stop-swarm: docker stack rm validate diff --git a/backend/apps/ifc_validation/api/v1/views.py b/backend/apps/ifc_validation/api/v1/views.py index ae7bb5e1..838f09e0 100644 --- a/backend/apps/ifc_validation/api/v1/views.py +++ b/backend/apps/ifc_validation/api/v1/views.py @@ -195,6 +195,7 @@ def post(self, request, *args, **kwargs): #file = 
os.path.join(MEDIA_ROOT, uploaded_file['file_name']) #uploaded_file['size'] = os.path.getsize(file) uploaded_file['size'] = file_length + f.seek(0) instance = serializer.save() # submit task for background execution diff --git a/docker-compose.swarm.local.yml b/docker-compose.swarm.local.yml new file mode 100644 index 00000000..c05b5c9c --- /dev/null +++ b/docker-compose.swarm.local.yml @@ -0,0 +1,67 @@ +# Override: single-node local testing (no NFS, no ClamAV, reduced replicas) +# +# Usage: +# docker stack deploy -c docker-compose.swarm.yml -c docker-compose.swarm.local.yml validate +# +# For production/NFS testing, use docker-compose.swarm.yml directly. + +services: + + frontend: + environment: + CERTBOT_DOMAIN: _ + CERTBOT_EMAIL: x + + backend: + deploy: + replicas: 1 + + worker: + entrypoint: /bin/sh + command: + - -c + - | + set -e + until cd /files_storage; do echo "Waiting for files_storage..."; done + until cd /app/backend; do echo "Waiting for server volume..."; done + while ! pg_isready -h "$$POSTGRES_HOST" -p "$$POSTGRES_PORT" -d "$$POSTGRES_NAME" -U "$$POSTGRES_USER" 2>/dev/null; do + echo "Waiting for DB..." + sleep 5 + done + echo "DB is ready. Starting worker (no ClamAV)." + rm -f /usr/bin/clamdscan /usr/bin/clamscan 2>/dev/null || true + CELERY_CONCURRENCY=$${CELERY_CONCURRENCY:-4} + echo "Celery concurrency: $$CELERY_CONCURRENCY" + celery --app=core worker --loglevel=info --concurrency $$CELERY_CONCURRENCY --task-events --hostname=worker@%n + deploy: + replicas: 1 + resources: + limits: + cpus: "2.0" + memory: 2G + reservations: + cpus: "0.5" + memory: 512M + + scheduler: + entrypoint: /bin/sh + command: + - -c + - | + set -e + until cd /files_storage; do echo "Waiting for files_storage..."; done + until cd /app/backend; do echo "Waiting for server volume..."; done + while ! pg_isready -h "$$POSTGRES_HOST" -p "$$POSTGRES_PORT" -d "$$POSTGRES_NAME" -U "$$POSTGRES_USER" 2>/dev/null; do + echo "Waiting for DB..." + sleep 5 + done + echo "DB is ready. 
Starting scheduler (no ClamAV)." + rm -f /usr/bin/clamdscan /usr/bin/clamscan 2>/dev/null || true + CELERY_CONCURRENCY=$${CELERY_CONCURRENCY:-4} + celery --app=core worker --beat --loglevel=info --concurrency $$CELERY_CONCURRENCY --task-events --hostname=worker-beat@%n + +volumes: + files_data: + driver: local + gherkin_rules_log_data: + driver: local diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml index 8975a4f8..d21ae5fc 100644 --- a/docker-compose.swarm.yml +++ b/docker-compose.swarm.yml @@ -62,7 +62,7 @@ services: delay: 30s failure_action: rollback healthcheck: - test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/alive')\""] + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/')\""] interval: 30s timeout: 10s retries: 3 @@ -89,11 +89,11 @@ services: failure_action: rollback resources: limits: - cpus: "2.0" - memory: 4G + cpus: "${WORKER_CPU_LIMIT:-2.0}" + memory: ${WORKER_MEMORY_LIMIT:-2G} reservations: - cpus: "1.0" - memory: 2G + cpus: "${WORKER_CPU_RESERVATION:-1.0}" + memory: ${WORKER_MEMORY_RESERVATION:-1G} scheduler: image: ${REGISTRY:-localhost:5000}/validationsvc-backend From 61a93444cb08a1a9fa2b0fd4f465a5add0f1acbc Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Thu, 12 Mar 2026 11:56:30 +0000 Subject: [PATCH 03/12] whitelisting admin, submodule sync --- backend/apps/ifc_validation/admin.py | 4 ++-- backend/apps/ifc_validation_models | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/apps/ifc_validation/admin.py b/backend/apps/ifc_validation/admin.py index d52ff317..5c226756 100644 --- a/backend/apps/ifc_validation/admin.py +++ b/backend/apps/ifc_validation/admin.py @@ -439,8 +439,8 @@ class ValidationOutcomeAdmin(BaseAdmin, NonAdminAddable): readonly_fields = ["id", "public_id", "created", "updated"] date_hierarchy = "created" - list_filter = ['validation_task__type', 'severity', 
'validation_task__request__model', 'outcome_code', 'feature', ('created', AdvancedDateFilter)] - search_fields = ('validation_task__request__file_name', 'feature', 'feature_version', 'outcome_code', 'severity', 'expected', 'observed') + list_filter = ['validation_task__type', 'severity_in_db', 'validation_task__request__model', 'outcome_code', 'feature', ('created', AdvancedDateFilter)] + search_fields = ('validation_task__request__file_name', 'feature', 'feature_version', 'outcome_code', 'severity_in_db', 'expected', 'observed') paginator = utils.LargeTablePaginator show_full_result_count = False # do not use COUNT(*) twice diff --git a/backend/apps/ifc_validation_models b/backend/apps/ifc_validation_models index 16089c2e..774c7bb8 160000 --- a/backend/apps/ifc_validation_models +++ b/backend/apps/ifc_validation_models @@ -1 +1 @@ -Subproject commit 16089c2ec9c95454604d20ebc024239f3c71cd80 +Subproject commit 774c7bb8dff8be799bd41d648d28c5a4a2789deb From 4eb8f243c83a8f9364ea17dbf1755fb5e33405f9 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sat, 14 Mar 2026 00:06:09 +0000 Subject: [PATCH 04/12] submodule --- backend/apps/ifc_validation_models | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/apps/ifc_validation_models b/backend/apps/ifc_validation_models index 774c7bb8..469d9a48 160000 --- a/backend/apps/ifc_validation_models +++ b/backend/apps/ifc_validation_models @@ -1 +1 @@ -Subproject commit 774c7bb8dff8be799bd41d648d28c5a4a2789deb +Subproject commit 469d9a488856e5cd141a47ffdc37d3fcbc6637e0 From 54632ab11874de8f721543cb6177e949a07f0b13 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sun, 15 Mar 2026 11:52:20 +0000 Subject: [PATCH 05/12] swarm env environments --- Makefile | 5 +++-- docker-compose.swarm.yml | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 608e1f25..8bfc8d07 100644 --- a/Makefile +++ b/Makefile @@ -30,12 +30,13 @@ stop: REGISTRY ?= localhost:5000 WORKERS ?= 2 +ENV_FILE ?= 
.env start-swarm: - docker stack deploy -c docker-compose.swarm.yml --with-registry-auth validate + docker compose -f docker-compose.swarm.yml --env-file $(ENV_FILE) config | docker stack deploy -c - --with-registry-auth validate start-swarm-local: - docker stack deploy -c docker-compose.swarm.yml -c docker-compose.swarm.local.yml --with-registry-auth validate + docker compose -f docker-compose.swarm.yml -f docker-compose.swarm.local.yml --env-file $(ENV_FILE) config | docker stack deploy -c - --with-registry-auth validate stop-swarm: docker stack rm validate diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml index d21ae5fc..d3afa804 100644 --- a/docker-compose.swarm.yml +++ b/docker-compose.swarm.yml @@ -146,7 +146,7 @@ services: placement: constraints: [node.role == manager] restart_policy: - condition: on-failure + condition: any delay: 5s networks: @@ -162,11 +162,11 @@ volumes: driver: local driver_opts: type: nfs - o: "addr=${NFS_SERVER_IP:-10.0.0.1},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + o: "addr=${NFS_SERVER_IP:-10.0.0.3},nfsvers=4.1,rw,hard,timeo=600,retrans=2" device: ":/srv/nfs/files_data" gherkin_rules_log_data: driver: local driver_opts: type: nfs - o: "addr=${NFS_SERVER_IP:-10.0.0.1},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + o: "addr=${NFS_SERVER_IP:-10.0.0.3},nfsvers=4.1,rw,hard,timeo=600,retrans=2" device: ":/srv/nfs/gherkin_logs" From ab223b2ba441bdfbb82714bf73de0296fc73f3a3 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sun, 15 Mar 2026 12:19:49 +0000 Subject: [PATCH 06/12] swarm general env --- .env | 110 ++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 59 insertions(+), 51 deletions(-) diff --git a/.env b/.env index fb79ac74..982cd621 100644 --- a/.env +++ b/.env @@ -1,51 +1,59 @@ -# variables in Docker Compose -DEBUG = True -ENV = Development -PUBLIC_URL = http://localhost - -# Certbot -CERTBOT_DOMAIN = _ -CERTBOT_EMAIL = - -# Django -MEDIA_ROOT = /files_storage -DJANGO_DB = postgresql 
-DJANGO_SECRET_KEY = django-insecure-um7-^+&jbk_=80*xcc9uf4nh$4koida7)ja&6!vb*$8@n288jk -DJANGO_ALLOWED_HOSTS = localhost -DJANGO_TRUSTED_ORIGINS = http://localhost:3000 http://localhost http://localhost:8000 -DJANGO_LOG_LEVEL = INFO -GHERKIN_LOG_FOLDER = /gherkin_logs -DJANGO_GUNICORN_WORKERS = 3 -DJANGO_GUNICORN_THREADS_PER_WORKER = 4 - -# DB -POSTGRES_HOST = db -POSTGRES_NAME = postgres -POSTGRES_USER = postgres -POSTGRES_PASSWORD = postgres -POSTGRES_PORT = 5432 - -# Worker -REDIS_PORT = 6379 -CELERY_BROKER_URL = redis://redis:6379/0 -CELERY_TASK_SOFT_TIME_LIMIT = 3600 -CELERY_TASK_TIME_LIMIT = 4000 -TASK_TIMEOUT_LIMIT = 3600 -DJANGO_DB_USER_CONTEXT = SYSTEM -DJANGO_DB_BULK_CREATE_BATCH_SIZE = 1000 -CELERY_CONCURRENCY = 4 - -# Email -MAILGUN_API_URL = -MAILGUN_API_KEY = -MAILGUN_FROM_NAME = Validation Service -MAILGUN_FROM_EMAIL = noreply@localhost -ADMIN_EMAIL = noreply@localhost -CONTACT_EMAIL = noreply@localhost - -# IAM -B2C_CLIENT_ID = -B2C_CLIENT_SECRET = -B2C_AUTHORITY = -B2C_USER_FLOW = -USE_WHITELIST = False +# variables in Docker Compose +DEBUG = True +ENV = Development +PUBLIC_URL = http://localhost + +# Certbot +CERTBOT_DOMAIN = _ +CERTBOT_EMAIL = + +# Django +MEDIA_ROOT = /files_storage +DJANGO_DB = postgresql +DJANGO_SECRET_KEY = django-insecure-um7-^+&jbk_=80*xcc9uf4nh$4koida7)ja&6!vb*$8@n288jk +DJANGO_ALLOWED_HOSTS = localhost +DJANGO_TRUSTED_ORIGINS = http://localhost:3000 http://localhost http://localhost:8000 +DJANGO_LOG_LEVEL = INFO +GHERKIN_LOG_FOLDER = /gherkin_logs +DJANGO_GUNICORN_WORKERS = 3 +DJANGO_GUNICORN_THREADS_PER_WORKER = 4 + +# DB +POSTGRES_HOST = db +POSTGRES_NAME = postgres +POSTGRES_USER = postgres +POSTGRES_PASSWORD = postgres +POSTGRES_PORT = 5432 + +# Worker +REDIS_PORT = 6379 +CELERY_BROKER_URL = redis://redis:6379/0 +CELERY_TASK_SOFT_TIME_LIMIT = 3600 +CELERY_TASK_TIME_LIMIT = 4000 +TASK_TIMEOUT_LIMIT = 3600 +DJANGO_DB_USER_CONTEXT = SYSTEM +DJANGO_DB_BULK_CREATE_BATCH_SIZE = 1000 +CELERY_CONCURRENCY = 4 + +# Email 
+MAILGUN_API_URL = +MAILGUN_API_KEY = +MAILGUN_FROM_NAME = Validation Service +MAILGUN_FROM_EMAIL = noreply@localhost +ADMIN_EMAIL = noreply@localhost +CONTACT_EMAIL = noreply@localhost + +# IAM +B2C_CLIENT_ID = +B2C_CLIENT_SECRET = +B2C_AUTHORITY = +B2C_USER_FLOW = +USE_WHITELIST = False + +# Swarm (ignored by docker compose) +# REGISTRY=localhost:5000 +# NFS_SERVER_IP=10.0.0.1 +# WORKER_CPU_LIMIT=2.0 +# WORKER_CPU_RESERVATION=1.0 +# WORKER_MEMORY_LIMIT=2G +# WORKER_MEMORY_RESERVATION=1G From 126ca948eed8d90d6facabd36e460ba93179cb6b Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sun, 15 Mar 2026 14:22:54 +0000 Subject: [PATCH 07/12] replace docker compose config with envsubst, fix overlay MTU --- Makefile | 18 +++++++++++++++--- docker-compose.swarm.yml | 31 ++++++++++++++----------------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index 8bfc8d07..84ddac7d 100644 --- a/Makefile +++ b/Makefile @@ -28,15 +28,20 @@ stop: # --- Docker Swarm --- -REGISTRY ?= localhost:5000 WORKERS ?= 2 ENV_FILE ?= .env +# Reads compose-level vars from ENV_FILE, substitutes into YAML via envsubst. +# Container env vars are loaded by docker stack deploy via the env_file: directive. 
+SWARM_VARS = REGISTRY CERTBOT_DOMAIN CERTBOT_EMAIL NFS_SERVER_IP POSTGRES_NAME POSTGRES_USER POSTGRES_PASSWORD +SWARM_ENV = ENV_FILE=$(ENV_FILE) \ + $(foreach v,$(SWARM_VARS),$(v)=$$(grep '^$(v)=' $(ENV_FILE) | cut -d= -f2-)) + start-swarm: - docker compose -f docker-compose.swarm.yml --env-file $(ENV_FILE) config | docker stack deploy -c - --with-registry-auth validate + $(SWARM_ENV) envsubst < docker-compose.swarm.yml | docker stack deploy -c - --with-registry-auth validate start-swarm-local: - docker compose -f docker-compose.swarm.yml -f docker-compose.swarm.local.yml --env-file $(ENV_FILE) config | docker stack deploy -c - --with-registry-auth validate + $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml | docker stack deploy -c - --with-registry-auth validate stop-swarm: docker stack rm validate @@ -44,6 +49,13 @@ stop-swarm: scale-workers: docker service scale validate_worker=$(WORKERS) +CPU ?= 2 +MEM ?= 2G +set-worker-limits: + docker service update --limit-cpu $(CPU) --limit-memory $(MEM) validate_worker + +REGISTRY ?= $$(grep '^REGISTRY=' $(ENV_FILE) | cut -d= -f2- || echo localhost:5000) + swarm-push: build docker tag buildingsmart/validationsvc-backend $(REGISTRY)/validationsvc-backend docker tag buildingsmart/validationsvc-frontend $(REGISTRY)/validationsvc-frontend diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml index d3afa804..ddaac915 100644 --- a/docker-compose.swarm.yml +++ b/docker-compose.swarm.yml @@ -18,7 +18,7 @@ services: frontend: - image: ${REGISTRY:-localhost:5000}/validationsvc-frontend + image: ${REGISTRY}/validationsvc-frontend ports: - 80:80 - 443:443 @@ -39,9 +39,9 @@ services: delay: 5s backend: - image: ${REGISTRY:-localhost:5000}/validationsvc-backend + image: ${REGISTRY}/validationsvc-backend entrypoint: /app/backend/server-entrypoint.sh - env_file: .env + env_file: ${ENV_FILE} volumes: - static_data:/app/backend/django_static - files_data:/files_storage @@ -51,7 +51,7 @@ services: networks: - validate 
deploy: - replicas: 2 + replicas: 1 placement: constraints: [node.role == manager] restart_policy: @@ -69,9 +69,9 @@ services: start_period: 60s worker: - image: ${REGISTRY:-localhost:5000}/validationsvc-backend + image: ${REGISTRY}/validationsvc-backend entrypoint: /app/backend/worker-entrypoint.sh - env_file: .env + env_file: ${ENV_FILE} volumes: - files_data:/files_storage - gherkin_rules_log_data:/gherkin_logs @@ -87,18 +87,13 @@ services: parallelism: 1 delay: 30s failure_action: rollback - resources: - limits: - cpus: "${WORKER_CPU_LIMIT:-2.0}" - memory: ${WORKER_MEMORY_LIMIT:-2G} - reservations: - cpus: "${WORKER_CPU_RESERVATION:-1.0}" - memory: ${WORKER_MEMORY_RESERVATION:-1G} + # Resource limits applied post-deploy via: + # docker service update --limit-cpu 2 --limit-memory 2G validate_worker scheduler: - image: ${REGISTRY:-localhost:5000}/validationsvc-backend + image: ${REGISTRY}/validationsvc-backend entrypoint: /app/backend/worker-beat-entrypoint.sh - env_file: .env + env_file: ${ENV_FILE} volumes: - files_data:/files_storage - gherkin_rules_log_data:/gherkin_logs @@ -152,6 +147,8 @@ services: networks: validate: driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" volumes: static_data: @@ -162,11 +159,11 @@ volumes: driver: local driver_opts: type: nfs - o: "addr=${NFS_SERVER_IP:-10.0.0.3},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" device: ":/srv/nfs/files_data" gherkin_rules_log_data: driver: local driver_opts: type: nfs - o: "addr=${NFS_SERVER_IP:-10.0.0.3},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" device: ":/srv/nfs/gherkin_logs" From 8f9dbeadcc0983735f149a80767fd4fd4a1c0be4 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sun, 15 Mar 2026 18:49:19 +0000 Subject: [PATCH 08/12] fix local override merging, cleanup gitignore and duplicate assignment --- Makefile | 4 +++- 
.../ifc_validation/tasks/processing/instance_completion.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 84ddac7d..6baa69ca 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,9 @@ start-swarm: $(SWARM_ENV) envsubst < docker-compose.swarm.yml | docker stack deploy -c - --with-registry-auth validate start-swarm-local: - $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml | docker stack deploy -c - --with-registry-auth validate + $(SWARM_ENV) envsubst < docker-compose.swarm.yml > /tmp/_swarm.yml + $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml > /tmp/_swarm_local.yml + docker stack deploy -c /tmp/_swarm.yml -c /tmp/_swarm_local.yml --with-registry-auth validate stop-swarm: docker stack rm validate diff --git a/backend/apps/ifc_validation/tasks/processing/instance_completion.py b/backend/apps/ifc_validation/tasks/processing/instance_completion.py index 2fb5b3ba..c2940e56 100644 --- a/backend/apps/ifc_validation/tasks/processing/instance_completion.py +++ b/backend/apps/ifc_validation/tasks/processing/instance_completion.py @@ -18,7 +18,7 @@ import itertools import functools - file_path, step_ids = file_path, step_ids = json.load(sys.stdin) + file_path, step_ids = json.load(sys.stdin) ifc_file = ifcopenshell.open(file_path) def filter_serializable(v): def inner(k, v): From edcdf2bffa2ba8d60f95eb73200640168cdab355 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Mon, 16 Mar 2026 17:13:11 +0000 Subject: [PATCH 09/12] add swarm documentation --- docs/swarm-considerations.md | 487 ++++++++++++++++++++++++++++++++++ docs/swarm-runbook.md | 491 +++++++++++++++++++++++++++++++++++ 2 files changed, 978 insertions(+) create mode 100644 docs/swarm-considerations.md create mode 100644 docs/swarm-runbook.md diff --git a/docs/swarm-considerations.md b/docs/swarm-considerations.md new file mode 100644 index 00000000..40e77bb0 --- /dev/null +++ b/docs/swarm-considerations.md @@ -0,0 +1,487 @@ +# Docker Swarm — Considerations & 
Known Issues + +Compiled during IVS-719 development. Grouped by category. + +## Status + +- **Single-node Swarm**: tested and working (Hetzner, 2026-03-10) +- **Multi-node Swarm**: tested and working with 2 nodes + NFS (Hetzner, 2026-03-15) +- **Single-node Swarm on Azure DEV**: tested and working with external DB + NFS (2026-03-15) +- **CI/CD**: not yet adapted for Swarm — see section 5 +- **SSL/Certbot**: not tested with a real domain yet (using `CERTBOT_DOMAIN=_` to skip) +- **Documentation**: user-facing docs (README, deployment guide) not yet updated for Swarm workflow + +--- + +# Architecture & Design + +## 1. Architecture overview + +Every worker needs access to `/files_storage` (uploaded IFC files) and `/gherkin_logs`. In Docker Compose, these are local volumes on one machine. In Swarm, workers run on **different machines** — so files must be shared via NFS. + +``` + ┌─────────┐ + │ Frontend │ (Nginx + React) + │ :80/443 │ + └────┬─────┘ + │ + ┌────▼─────┐ + │ Backend │ (Django API — manager node) + │ :8000 │ + └────┬─────┘ + │ enqueues tasks + ┌────▼─────┐ + │ Redis │ (Celery broker — manager node) + │ :6379 │ + └────┬─────┘ + │ workers consume via overlay network + ┌─────────┼──────────┐ + │ │ │ + ┌────▼───┐ ┌───▼────┐ ┌──▼─────┐ + │Worker 1│ │Worker 2│ │Worker N│ (any node in swarm) + └────┬───┘ └───┬────┘ └──┬─────┘ + │ │ │ + │ NFS mount │ + └─────────┼──────────┘ + ┌────▼─────┐ + │/srv/nfs/ │ (NFS server on manager node) + │files_data│ + └──────────┘ + │ same machine + ┌────▼─────┐ + │ Postgres │ (manager node) + └──────────┘ + + ┌───────────┐ + │ Scheduler │ (1 replica, manager only) + │ --beat │ file retention: archive@90d, remove@180d + └───────────┘ +``` + +**How it works:** +- The **manager node** runs: frontend, backend, DB, Redis, scheduler, and the NFS server +- **Worker nodes** only run Celery workers — they mount NFS volumes automatically via the Docker volume driver +- The **overlay network** (Docker Swarm native) connects workers to Redis 
and Postgres across machines +- NFS gives workers read/write access to uploaded files as if they were local + +**If NFS goes down, all workers stall** — `hard,timeo=600` mount options mean workers will hang (not error) until NFS recovers. This is intentional: better to wait than to silently fail. + +For Azure: restrict NFS exports to VNet CIDR (e.g. `10.0.0.0/16(rw,sync,...)`), not `*`. + +--- + +## 2. Build and deploy are now separate steps + +Docker Compose: `docker compose build && docker compose up` — build and run in one flow. + +Docker Swarm: worker nodes **cannot build images**. They pull from a registry. + +``` +Developer machine Registry Swarm nodes + build ──push──> localhost:5000 <──pull── worker-1, worker-2 +``` + +Workflow: +```bash +make build # build images locally +make swarm-push ENV_FILE=.env.xxx # tag + push to registry +make start-swarm ENV_FILE=.env.xxx # docker stack deploy (nodes pull from registry) +``` + +For Azure PROD, replace `localhost:5000` with Azure Container Registry (ACR). + +--- + +## 3. Worker scaling and capacity + +There is **no hard cap** on worker replicas. Scaling is manual: + +```bash +make scale-workers WORKERS=4 +``` + +**Capacity math per worker:** +- ~1GB RAM for ClamAV virus signature database +- ~2-3GB RAM for Celery tasks (depends on `CELERY_CONCURRENCY`) +- Total: **~3-4GB RAM per worker** +- Each worker runs `CELERY_CONCURRENCY` parallel tasks (default: 4 in .env.hetzner, 6 in .env) + +| Environment | Workers | Concurrency | Parallel tasks | RAM needed (workers only) | +|---|---|---|---|---| +| Hetzner (8GB) | 2 | 4 | 8 | ~6-8GB | +| DEV | 2 | 4 | 8 | ~6-8GB | +| PROD | 4+ | 6 | 24+ | ~12-16GB | + +To prevent overloading a single node, use `max_replicas_per_node` in the compose file: +```yaml +deploy: + replicas: 4 + placement: + max_replicas_per_node: 2 +``` +This forces Swarm to spread workers across at least 2 nodes. Not currently set — all replicas can land on one node if Swarm decides to. 
+ +**Resource limits** are optional but recommended in production. Apply post-deploy: +```bash +make set-worker-limits CPU=2 MEM=2G # limits only +make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations +``` + +Per-environment suggestions: +| Environment | CPU limit | Memory limit | Notes | +|---|---|---|---| +| Hetzner (8GB) | 2 | 2G | Small server, max ~2 workers | +| DEV | 1 | 1G | | +| PROD | 4 | 4G | Includes ClamAV ~1GB | + +--- + +## 4. `.env` strategy + +`.env` is committed with safe defaults (localhost, no secrets). Environment-specific files are gitignored via `.env.*`: + +| File | Purpose | Committed? | +|---|---|---| +| `.env` | Shared defaults for local dev / forking | Yes | +| `.env.hetzner` | Hetzner dev server (IPs, NFS, registry) | No | +| `.env.DEV` | DEV environment (docker compose, used by CI/CD) | No | +| `.env.DEV_SWARM` | DEV Swarm deployment (external Azure DB, NFS) | No | +| `.env.PROD` | Production (real secrets, domains) | No | + +Deploy with: +```bash +make start-swarm ENV_FILE=.env.hetzner # Hetzner (with DB container) +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM # DEV (external Azure DB) +``` + +The Makefile uses `envsubst` to substitute **only compose-level vars** (REGISTRY, NFS_SERVER_IP, CERTBOT_DOMAIN, etc.) from the env file into the YAML, then pipes the result to `docker stack deploy`. Container env vars are loaded by `docker stack deploy` via the `env_file:` directive directly. + +**Why only compose-level vars?** Earlier approaches that sourced the entire env file broke on values with special characters (`#`, `(`, spaces). The current approach extracts only the vars that `envsubst` needs (REGISTRY, CERTBOT_DOMAIN, CERTBOT_EMAIL, NFS_SERVER_IP, etc.) using `grep` + `cut` in the Makefile. 
+ +**Env file format rules (Swarm env files only — `.env.hetzner`, `.env.DEV_SWARM`, etc.):** +- No spaces around `=` — the Makefile uses `grep '^VAR=' | cut -d= -f2-` +- No quotes around values — Docker passes them literally +- No angle bracket placeholders like `<VALUE>` — they get passed as literal strings + +This avoids three problems with earlier approaches: +1. **Type conversion bugs** — `docker compose config` converted ports to strings and cpus to integers, which `docker stack deploy` rejected +2. **`.env` auto-load conflict** — `docker compose config` always loads `.env` from the project directory, silently overriding values from `--env-file` +3. **Special character breakage** — sourcing the whole env file with `set -a && . ./file` breaks on values containing `#` (comment), `(` (subshell), or unquoted spaces + +--- + +## 5. Local dev and server deploy are now different configs + +You maintain three separate compose files: +- `docker-compose.yml` — local development (single machine, local volumes, `container_name`) +- `docker-compose.swarm.yml` — Swarm deployment (overlay network, NFS volumes, `deploy:` section) +- `docker-compose.swarm.nodb.yml` — Swarm with external DB (no containerized Postgres) + +Risk: they drift apart over time (different env vars, image versions, volume configs). Mitigation: keep changes in sync during PRs. + +--- + +## 6. No `container_name` / `depends_on` in Swarm + +Swarm manages container naming internally (e.g. `validate_worker.1.abc123`). `depends_on` is ignored — services start simultaneously. + +Current impact: minimal — entrypoints use DNS service discovery (`redis`, `db`, `backend`) and `pg_isready` wait loops. No code changes needed. + +--- + +## 7. DNS transition strategy for PROD cutover + +To avoid downtime when switching from Docker Compose to Swarm in production, use a temporary subdomain: + +1. Deploy Swarm stack on a new server (or same server on different ports) +2. Point a temp subdomain to it (e.g. 
`swarm.validate.buildingsmart.org`) +3. Run both setups in parallel — existing Compose on the main domain, Swarm on the temp domain +4. Test via API (bulk uploads, concurrent validations) against the temp domain +5. Once confident, swap DNS: point the main domain to the Swarm deployment +6. Decommission the old Compose setup + +Rollback: if Swarm has issues, DNS points back to the old setup in minutes. + +For DEV: same approach, or direct cutover (lower risk since it's not user-facing). + +--- + +# Known Issues & Gotchas + +## 8. Overlay network MTU must be set to 1400 + +MTU (Maximum Transmission Unit) is the largest packet size a network link can carry — the default is 1500 bytes. Hetzner's private network uses MTU 1450. Docker's VXLAN overlay adds ~50 bytes of encapsulation headers to every packet, so if the underlying MTU is already ≤1500, the oversized packets get silently dropped or fragmented. Without setting the overlay MTU to 1400 (leaving headroom for the VXLAN overhead), worker nodes on different machines **cannot reach services on the manager** (DB, Redis). + +Symptom: workers stuck on `db:5432 - no response` despite DNS resolving correctly. + +Fix is in `docker-compose.swarm.yml`: +```yaml +networks: + validate: + driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" +``` + +This applies to any cloud provider with sub-1500 MTU on internal networks. + +--- + +## 9. ClamAV runs inside every worker (~1GB RAM overhead each) + +Each worker container starts its own ClamAV daemon + freshclam (virus signature updater). This is the **same as before** — not a Swarm change. But when scaling to N workers, you get N independent ClamAV instances. 
+ +Impact: +- ~1GB RAM per worker for virus signature database (observed during Hetzner testing — 5 instances caused OOM on 8GB server) +- Each worker independently downloads signature updates on boot +- The 4GB memory limit per worker (PROD) accounts for this: ~1GB ClamAV + ~2-3GB for Celery tasks +- The local override (`docker-compose.swarm.local.yml`) skips ClamAV entirely for testing on small servers + +4 workers with ClamAV = ~4GB just for virus DBs. + +--- + +## 10. Insecure registry required on ALL nodes + +When using `REGISTRY=10.0.0.3:5000` (private IP) instead of `localhost:5000`, **every node** — including the manager — needs the insecure registry configured: + +```bash +echo '{ "insecure-registries": ["10.0.0.3:5000"] }' | sudo tee /etc/docker/daemon.json +sudo systemctl restart docker +``` + +Without this, services get `No such image` errors and stay at 0/N replicas. + +--- + +## 11. DB `postmaster.pid` disappears in Swarm (containerized DB only) + +PostgreSQL starts, recovers, becomes ready — then shuts itself down because its PID file vanished: + +``` +could not open file "postmaster.pid": No such file or directory +performing immediate shutdown because data directory lock file is invalid +``` + +This is a Docker Swarm volume mount timing issue. Fix: set `restart_policy.condition: any` (not `on-failure`) on the db service so Swarm keeps restarting it until it sticks. Already applied in `docker-compose.swarm.yml`. + +--- + +## 12. Docker caches NFS volume options + +When `docker stack deploy` creates an NFS volume, the driver options (including `addr=`) are cached. If the first deploy has the wrong NFS IP (e.g. the default `10.0.0.1`), **all subsequent deploys reuse that wrong IP** — even after fixing the env file. + +Symptoms: containers stuck in "Created" state, never starting. No logs. NFS mount hangs because the IP doesn't exist. 
+ +Fix: +```bash +docker stack rm validate +sleep 15 +docker container prune -f +docker volume rm validate_files_data validate_gherkin_rules_log_data +# If containers are stuck on hanging NFS mount: +systemctl restart docker +# Then redeploy +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM +``` + +Verify volume has correct IP after deploy: `docker volume inspect validate_files_data` + +--- + +## 13. File upload: `f.seek(0)` after measuring size + +In `views.py`, the upload handler seeks to the end of the file to measure its size (`f.seek(0, 2)` + `f.tell()`), then must rewind (`f.seek(0)`) before `serializer.save()`. Without the rewind, Django saves a 0-byte file because the file pointer is at the end. + +This may only manifest with NFS-backed storage where buffering behaviour differs from local volumes. Commit: `012776c` + +--- + +## 14. `determine_aggregate_status()` masks silent failures + +When a validation task produces zero outcomes (e.g. subprocess crashed, worker OOM, NFS hang), the status defaults to VALID (`models.py:1297` — `# assume valid if no outcomes - TODO: is this correct?`). This pre-dates Swarm but becomes more visible when workers crash/restart across nodes. + +**Why we can't just return INVALID:** Marking a file as invalid has real consequences — vendors have to investigate and fix it. Returning INVALID for a crashed task would create false negatives. The actual problem is **silent failure** — a task fails completely and nobody notices because it looks like it passed. + +**What should happen instead:** When zero outcomes are produced, the system should alert developers (e.g. log an error, send a notification, or set a distinct status like `ERROR` or `INCONCLUSIVE`) rather than silently defaulting to VALID. The file should be flagged for re-validation, not marked as valid or invalid. + +Not blocking for Swarm, but worth a follow-up fix. + +--- + +## 15. 
DB connection pooling: stale connections on overlay network + +Django's `"pool": True` (psycopg3 connection pool) keeps DB connections open for reuse. The Swarm overlay network drops idle TCP connections after ~13 minutes. When the pool hands out a dead connection, Django raises: + +``` +OperationalError: consuming input failed: server closed the connection unexpectedly +``` + +**Fix** (in `backend/core/settings.py`): +- `"pool": False` — disable psycopg3's built-in connection pool. `CONN_HEALTH_CHECKS` alone is not sufficient because the pool can hand out a stale connection after the health check passes but before it reaches the query. +- `CONN_HEALTH_CHECKS = True` — Django pings the connection before using it; if dead, it reconnects transparently +- `CONN_MAX_AGE = 600` (10 min) — keeps connections open for reuse without the pool layer + +`CONN_MAX_AGE` is configurable via `POSTGRES_CONN_MAX_AGE` env var. The default of 600s works for Swarm; set to `0` to close connections after each request (safest but slower). + +DB logs showing the symptom (every ~13 min): +``` +LOG: could not receive data from client: Connection reset by peer +``` + +--- + +## 16. SSL certs: bind mount vs named volume + +Docker Compose used a bind mount for Let's Encrypt certs: `./docker/frontend/letsencrypt:/etc/letsencrypt`. Swarm uses a named volume (`validate_letsencrypt_data`). + +When migrating, certs must be manually copied into the Swarm volume: +```bash +cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/ +docker service update --force validate_frontend +``` + +Without this, HTTPS won't work and the site is only accessible via HTTP. Certbot renewal should continue to work inside the container since `CERTBOT_DOMAIN` is set. + +--- + +## 17. Overlay network race condition after stack rm + +After `docker stack rm`, the overlay network cleanup is asynchronous. Redeploying too quickly causes `network validate_validate not found` errors. 
+ +Fix: wait ~15 seconds between `docker stack rm` and `docker stack deploy`. If a ghost network persists (`docker network ls` shows it but `docker network rm` says "not found"), restart Docker: `systemctl restart docker`. + +--- + +## 18. No rolling updates for `latest` tags + +Swarm checks if the image tag has changed before pulling. Since all images use `:latest`, Swarm sees "same tag" and skips the pull — even if the image content has changed. + +**Impact:** `docker service update --force` restarts containers but uses the **cached** image. To deploy new code, you must tear down and redeploy: + +```bash +make stop-swarm +make swarm-push ENV_FILE=.env.xxx +make start-swarm ENV_FILE=.env.xxx +``` + +Or force a pull for a single service: +```bash +docker service update --image localhost:5000/validationsvc-backend:latest --force validate_backend +``` + +--- + +## 19. `docker service update --force` does NOT re-read env vars + +`docker service update --force` restarts containers with the **same config** they were deployed with. It does NOT re-read the env file. If you changed `.env.DEV_SWARM` and want the changes to take effect, you must do a full redeploy: + +```bash +make stop-swarm +# wait ~15 seconds +make start-swarm ENV_FILE=.env.DEV_SWARM +``` + +--- + +## 20. VS Code port forwarding conflicts with Swarm ingress + +VS Code's SSH tunnel sometimes conflicts with Swarm's ingress routing (IPv6 issues). Accessing `localhost:80` via VS Code's forwarded port may not work. + +**Workaround:** Use the server's public IP directly instead of localhost. + +--- + +# Maintenance + +## 21. CI/CD not yet adapted for Swarm + +The current GitHub Actions workflow (`.github/workflows/ci_cd.yml`) uses `docker compose up` for DEV and PROD deployments. It does **not** support Swarm. 
+ +What needs to change for Swarm CI/CD: +- `docker compose up` → `make start-swarm ENV_FILE=.env.XXX` (build, push to registry, stack deploy) +- The runner/deploy target needs access to the Swarm manager (SSH or self-hosted runner on the manager node) +- Worker nodes pull images from the registry automatically — no action needed per node +- `ENV_FILE` is already a GitHub Actions variable (`${{ vars.ENV_FILE }}`) — just needs to point to the right file + +Options: +1. **Self-hosted runner on the manager node** — simplest, runner has direct access to Docker and the registry +2. **SSH deploy step** — GitHub-hosted runner SSHes into the manager to run make commands +3. **Separate workflow** — new workflow file for Swarm deployments, triggered manually or on specific branches + +Not blocking for merge to development — Swarm can be deployed manually until CI/CD is adapted. + +--- + +## 22. Periodic cleanup on DEV server + +> **DEV-specific** — the DEV server has a small root disk (29GB). Hetzner/PROD with larger disks are less affected but should still clean up periodically. + +Docker images, build cache, orphaned volumes, and uploaded IFC files accumulate fast. Without periodic cleanup, the disk fills up and deployments fail. + +**What accumulates:** +- Docker build cache (~2GB per full build cycle) +- Old/unused images (previous deployments) +- Orphaned volumes from CI/CD runs (e.g. 
`repo-clone_*` volumes from GitHub Actions) +- Uploaded IFC files in `files_data` volume (4GB+ and growing) + +**Cleanup commands:** +```bash +# Check disk usage +df -h / + +# Docker overview +docker system df + +# Remove unused images and build cache +docker builder prune -af +docker image prune -af + +# Remove orphaned volumes (CAREFUL: only removes volumes not attached to any container) +docker volume prune -f + +# List volume sizes to find large orphans +docker system df -v | grep -A 50 "Local Volumes" +``` + +**Recommendation:** Run `docker system prune -af` and `docker volume prune -f` after each major deployment cycle. Consider adding this to the CI/CD pipeline or a cron job. The `/mnt` disk (74GB ephemeral Azure temp disk) can be used for temporary storage but **data is lost on VM deallocation/resize**. + +--- + +## 23. `makemigrations` runs on every backend startup + +The `server-entrypoint.sh` runs `python manage.py makemigrations` and `python manage.py migrate` on every container start. This works because: +- Backend is constrained to **1 replica** on the manager node — no migration race conditions +- The generated migration files live inside the container (ephemeral) — they're not persisted + +**Risk:** If model changes exist that haven't been committed as migration files, `makemigrations` will generate them at runtime inside the container. These migrations disappear when the container restarts, potentially causing inconsistency. In production, migrations should be baked into the image at build time. + +**Decision:** Kept as-is for now. Backend is always 1 replica, and in practice all migrations are committed to git before deployment. But worth revisiting for PROD hardening. + +--- + +## 24. Historical Swarm instability + +> "unexplained crashes/corrupt state (5+ years ago) — hopefully they are gone now" + +Modern Docker Engine (24+) should be stable. 
Mitigations already in place: +- `CELERY_TASK_ACKS_LATE = True` — tasks stay in queue until completed +- `CELERY_TASK_REJECT_ON_WORKER_LOST = True` — crashed tasks are re-queued +- `restart_policy: condition: any` on DB (see section 11), `on-failure` on other services +- `update_config: failure_action: rollback` — bad deploys roll back + +--- + +# Local Dev Only + +## 25. Lima-specific: virtiofs + Celery prefork = errno 35 + +Celery's `prefork` pool + Lima's virtiofs read-only mounts cause `EDEADLK` deadlocks. Workaround: `--pool=solo`. + +**Not a production issue** — only affects local development on macOS with Lima. Docker containers on Linux use proper ext4/overlay2 filesystems. + +--- + +## 26. macOS NFS gotcha: `/tmp` vs `/private/tmp` + +On macOS, `/tmp` is a symlink to `/private/tmp`. NFS exports must use the real path (`/private/tmp/...`). Not relevant for Linux servers (Hetzner/Azure), but relevant for local development on macOS. diff --git a/docs/swarm-runbook.md b/docs/swarm-runbook.md new file mode 100644 index 00000000..94843db8 --- /dev/null +++ b/docs/swarm-runbook.md @@ -0,0 +1,491 @@ +# Swarm Operations Runbook + +Copy-paste-ready commands for every Swarm operation. Refer to [swarm-considerations.md](swarm-considerations.md) for architecture, known issues, and design decisions. + +Last updated: 2026-03-16 + +--- + +## Table of Contents + +1. [First-Time Setup (Manager Node)](#1-first-time-setup-manager-node) +2. [Build, Push and Deploy](#2-build-push-and-deploy) +3. [Set Up NFS (Multi-Node)](#3-set-up-nfs-multi-node) +4. [Add a Worker Node to the Swarm](#4-add-a-worker-node-to-the-swarm) +5. [Scale Workers](#5-scale-workers) +6. [Redeploy After Code Changes](#6-redeploy-after-code-changes) +7. [Monitoring and Logs](#7-monitoring-and-logs) +8. [Shut Down the Swarm](#8-shut-down-the-swarm) +9. [Remove a Worker Node](#9-remove-a-worker-node) +10. [Full Reset (Nuclear Option)](#10-full-reset-nuclear-option) +11. 
[Environment File Strategy](#11-environment-file-strategy) +12. [Quick Reference Card](#12-quick-reference-card) + +--- + +## 1. First-Time Setup (Manager Node) + +Run once per machine that will act as a Swarm manager. This covers everything: Swarm init, NFS, registry, env, build, deploy. + +```bash +# 1a. Initialize Swarm +docker swarm init --advertise-addr <MANAGER_PRIVATE_IP> + +# 1b. Create .VERSION (gitignored, required by make build) +echo "1.0.0" > .VERSION + +# 1c. Prepare the .env file +# Copy .env (committed defaults) and customize for this server: +cp .env .env.myserver # name it after the environment: .env.hetzner, .env.DEV_SWARM, .env.PROD +# Edit manually — no spaces around '='. Variables you MUST change: +# PUBLIC_URL — server URL (e.g. http://10.0.0.3 or https://validate.example.org) +# DJANGO_ALLOWED_HOSTS — space-separated hostnames/IPs that Django accepts +# DJANGO_TRUSTED_ORIGINS — space-separated origins for CSRF +# DJANGO_SECRET_KEY — generate a random key for non-dev environments +# POSTGRES_PASSWORD — use a strong password for non-dev environments +# Variables to ADD (not in the base .env, Swarm-only): +# NFS_SERVER_IP — private IP of the NFS server (e.g. 10.0.0.3) +# REGISTRY — Docker registry address (e.g. localhost:5000) +# Optional (uncomment to set): +# CERTBOT_DOMAIN — real domain for SSL (leave as _ to skip) +# CERTBOT_EMAIL — email for Let's Encrypt +# WORKER_CPU_LIMIT, WORKER_MEMORY_LIMIT, etc. — resource limits + +# 1d. Start local registry (as plain container, NOT Swarm service) +docker run -d --name registry -p 5000:5000 --restart always registry:2 +# Verify: +curl -s http://localhost:5000/v2/ # should return {} + +# 1e. 
Set up NFS on the host +apt install -y nfs-kernel-server +mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs +chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs +chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs + +cat >> /etc/exports << 'EOF' +/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +EOF + +exportfs -ra +systemctl restart nfs-kernel-server +showmount -e localhost + +# 1f. (If migrating from docker-compose) Stop the old stack first +docker compose -f docker-compose.load_balanced.nodb.yml --env-file .env.DEV down +# Volume names differ: compose uses "validation-service_files_data", swarm uses "validate_files_data" +# Check which volumes have data: +docker system df -v | grep -A 50 "Local Volumes" + +# 1g. Copy existing data from Docker volumes to NFS +# Use the COMPOSE volume name (validation-service_*), not the swarm name (validate_*): +docker run --rm -v validation-service_files_data:/src -v /srv/nfs/files_data:/dst alpine sh -c "cp -a /src/. /dst/" +docker run --rm -v validation-service_gherkin_rules_log_data:/src -v /srv/nfs/gherkin_logs:/dst alpine sh -c "cp -a /src/. /dst/" +# Verify: +du -sh /srv/nfs/files_data /srv/nfs/gherkin_logs + +# 1h. (If migrating) Copy SSL certs to Swarm volume +# Old compose used a bind mount (docker/frontend/letsencrypt/), Swarm uses a named volume. +# Deploy first (step 1j), then copy certs into the volume and restart frontend: +# cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/ +# docker service update --force validate_frontend + +# 1i. Fetch submodules +make fetch-modules + +# 1j. Build, push, deploy +make swarm-push +# For external DB (Azure): make start-swarm-nodb ENV_FILE=.env.DEV_SWARM +# For containerized DB: make start-swarm ENV_FILE=.env.hetzner +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM + +# 1k. 
Verify +watch docker service ls +``` + +Adjust NFS exports CIDR to match the network (Azure VNet: `10.0.0.0/16`, Hetzner: `10.0.0.0/24` or `*`). + +See [swarm-considerations.md](swarm-considerations.md) for known issues and gotchas that can trip you up during setup (NFS volume caching, network race conditions, env file format, registry config, SSL cert migration, etc.). + +--- + +## 2. Build, Push and Deploy + +```bash +# Build, tag and push to registry (swarm-push includes build) +make swarm-push ENV_FILE= + +# Deploy — pick the right target: +# Full stack with DB container + NFS: +make start-swarm ENV_FILE= + +# External DB (e.g. Azure PostgreSQL) + NFS: +make start-swarm-nodb ENV_FILE= + +# Single-node / local testing (no NFS, no ClamAV, 1 replica each): +make start-swarm-local ENV_FILE= + +# 2b. Watch services come up (all should reach 1/1 within ~60s) +watch docker service ls + +# Verify endpoints +curl -s -o /dev/null -w "%{http_code}" http://localhost/ # 200 +curl -s -o /dev/null -w "%{http_code}" http://localhost/api/ # 302 +curl -s -o /dev/null -w "%{http_code}" http://localhost/admin/ # 302 + +# 2c. (Optional) Set resource limits on workers +make set-worker-limits CPU=2 MEM=2G # limits only +make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations +``` + +**What `start-swarm` vs `start-swarm-nodb` vs `start-swarm-local` does:** + +| | `start-swarm` | `start-swarm-nodb` | `start-swarm-local` | +|---|---|---|---| +| Compose file | `swarm.yml` | `swarm.nodb.yml` | `swarm.yml` + `swarm.local.yml` | +| Database | Containerized PostgreSQL | External (e.g. Azure) | Containerized PostgreSQL | +| Volumes | NFS | NFS | Plain local volumes | +| ClamAV | Runs | Runs | Skipped | +| Replicas | backend: 2, worker: 2 | backend: 2, worker: 2 | All 1 | +| Use case | Hetzner, self-hosted | DEV/PROD (Azure DB) | Quick local testing | + +--- + +## 3. Set Up NFS (Multi-Node) + +Required before adding worker nodes. 
Workers need shared access to uploaded IFC files and gherkin logs. + +### 3a. On the NFS server (typically the manager node) + +```bash +# Install NFS +apt install -y nfs-kernel-server + +# Create export directories +mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs +chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs +chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs + +# Configure exports +cat >> /etc/exports << 'EOF' +/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +EOF + +exportfs -ra +systemctl restart nfs-kernel-server + +# Verify +showmount -e localhost +``` + +### 3b. Copy existing data to NFS (if migrating from local volumes) + +```bash +# Copy files_data +docker run --rm \ + -v validate_files_data:/src \ + -v /srv/nfs/files_data:/dst \ + alpine sh -c "cp -a /src/. /dst/" + +# Copy gherkin_logs +docker run --rm \ + -v validate_gherkin_rules_log_data:/src \ + -v /srv/nfs/gherkin_logs:/dst \ + alpine sh -c "cp -a /src/. /dst/" + +# Verify +ls -la /srv/nfs/files_data/ +ls -la /srv/nfs/gherkin_logs/ +``` + +**Note:** If migrating from Docker Compose, the volume names may be prefixed differently (e.g. `validation-service_files_data` instead of `validate_files_data`). Check with `docker volume ls`. + +### 3c. Set NFS_SERVER_IP in the env file + +```bash +# In .env.hetzner (or .env.DEV_SWARM / .env.PROD): +NFS_SERVER_IP=10.0.0.3 # private IP of the NFS server +``` + +The `docker-compose.swarm.yml` uses this in the NFS volume driver options. + +### 3d. Redeploy with NFS volumes + +```bash +# Tear down existing stack (uses local volumes) +make stop-swarm + +# Wait ~15 seconds for cleanup, then redeploy with NFS +make start-swarm ENV_FILE= + +# Verify NFS volumes are mounted +docker volume inspect validate_files_data +# Should show Type: nfs in Options +``` + +--- + +## 4. Add a Worker Node to the Swarm + +### 4a. 
On the manager — get join token + +```bash +docker swarm join-token worker +# Outputs: docker swarm join --token SWMTKN-... <MANAGER_IP>:2377 +``` + +### 4b. On the new worker node — prerequisites + +```bash +# Install Docker +curl -fsSL https://get.docker.com | sh + +# Install NFS client (needed for NFS volumes) +apt install -y nfs-common + +# Verify NFS is reachable +mount -t nfs4 <NFS_SERVER_IP>:/srv/nfs/files_data /mnt && ls /mnt && umount /mnt + +# Configure insecure registry (if using private registry over HTTP) +echo '{ "insecure-registries": ["<REGISTRY_IP>:5000"] }' | sudo tee /etc/docker/daemon.json +sudo systemctl restart docker +``` + +### 4c. Join the swarm + +```bash +# Paste the join command from step 4a: +docker swarm join --token SWMTKN-... <MANAGER_IP>:2377 +``` + +### 4d. Verify on manager + +```bash +docker node ls +# Should show both nodes as Ready/Active +``` + +### 4e. Also configure insecure registry on manager (if using private IP for registry) + +```bash +# Only needed if REGISTRY=10.0.0.3:5000 instead of localhost:5000 +echo '{ "insecure-registries": ["10.0.0.3:5000"] }' | sudo tee /etc/docker/daemon.json +sudo systemctl restart docker +``` + +**Important:** When using a private IP registry (`REGISTRY=10.0.0.3:5000`), EVERY node (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`. Otherwise workers get `No such image` errors. + +--- + +## 5. 
Scale Workers + +```bash +# Scale to N workers (Swarm distributes across available nodes) +make scale-workers WORKERS=4 + +# Check placement — see which node each worker is on +docker service ps validate_worker + +# Set resource limits (applied per-container, not total) +make set-worker-limits CPU=2 MEM=2G # limits only +make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations +``` + +**Per-environment resource limits:** + +| Environment | CPU limit | Memory limit | Notes | +|---|---|---|---| +| Hetzner (8GB, no ClamAV) | 2 | 2G | Max ~2 workers | +| DEV | 1 | 1G | | +| PROD | 4 | 4G | Includes ClamAV ~1GB | + +**ClamAV RAM warning:** Each worker with ClamAV loads ~1GB of virus signatures. 4 workers + 1 scheduler = ~5GB just for ClamAV. Use the local override (skips ClamAV) on small servers, or use 16GB+ RAM. + +--- + +## 6. Redeploy After Code Changes + +There is no rolling update for `latest` tags — must tear down and redeploy. + +```bash +# 1. Stop +make stop-swarm + +# 2. Rebuild and push +make swarm-push ENV_FILE= + +# 3. Redeploy +make start-swarm ENV_FILE= + +# 4. Verify +watch docker service ls +``` + +**Faster alternative for single-service changes:** + +```bash +# Force-restart one service (uses existing image, same config — does NOT re-read .env) +docker service update --force validate_backend + +# Or rebuild and push just the backend image, then update (still same env): +make swarm-push ENV_FILE= +docker service update --image localhost:5000/validationsvc-backend:latest --force validate_backend + +# To pick up .env changes, you must redeploy (stop + start-swarm) +``` + +--- + +## 7. 
Monitoring and Logs + +```bash +# Service overview +docker service ls + +# Detailed worker status (shows which node, current state) +make swarm-status + +# Follow logs for a service +docker service logs -f validate_frontend +docker service logs -f validate_backend +docker service logs -f validate_worker +docker service logs -f validate_scheduler +docker service logs -f validate_db + +# Resource usage (CPU/memory per container) +docker stats --no-stream + +# Check for OOM kills +journalctl -k | grep "out of memory" + +# Check node status +docker node ls + +# Inspect a specific service +docker service inspect validate_worker --pretty +``` + +--- + +## 8. Shut Down the Swarm + +### Stop the stack (keeps volumes and swarm membership) + +```bash +make stop-swarm +# Equivalent to: docker stack rm validate +# Volumes are preserved — data survives restarts +``` + +### Restart after shutdown + +```bash +# Just redeploy — volumes are still there +make start-swarm ENV_FILE= +``` + +--- + +## 9. Remove a Worker Node + +```bash +# On manager: drain the node first (moves tasks to other nodes) +docker node update --availability drain + +# Wait for tasks to migrate, then on the worker node: +docker swarm leave + +# On manager: remove the node from the list +docker node rm +``` + +--- + +## 10. Full Reset (Nuclear Option) + +Removes everything — stack, volumes, images, swarm. + +```bash +# 1. Remove the stack +make stop-swarm + +# 2. Remove registry +docker rm -f registry + +# 3. Remove all volumes (WARNING: deletes DB data and uploaded files!) +docker volume prune -f + +# 4. Remove all images +docker system prune -af + +# 5. Leave the swarm +docker swarm leave --force + +# Then start fresh from section 1 +``` + +--- + +## 11. Environment File Strategy + +The `.env` in the repo root is committed with safe defaults (localhost, no secrets). Each environment gets its own gitignored override. + +| File | Purpose | Committed? 
| +|---|---|---| +| `.env` | Shared defaults for docker compose (local dev, forking) | Yes | +| `.env.hetzner` | Hetzner test server (IPs, NFS, registry) | No | +| `.env.DEV` | DEV environment (docker compose, used by CI/CD) | No | +| `.env.DEV_SWARM` | DEV Swarm deployment (external Azure DB, NFS) | No | +| `.env.PROD` | Production (real secrets, domains, SSL) | No | + +**Deploy with:** +```bash +make start-swarm ENV_FILE=.env.hetzner # Hetzner (with DB container) +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM # DEV (external Azure DB) +make start-swarm ENV_FILE=.env.PROD # PROD +``` + +**What changes per environment:** + +| Variable | Hetzner (test) | DEV | PROD | +|---|---|---|---| +| `DEBUG` | `True` | `True` | `False` | +| `ENV` | `Development` | `Development` | `Production` | +| `PUBLIC_URL` | `http://` | `https://dev.validate...` | `https://validate.buildingsmart.org` | +| `DJANGO_ALLOWED_HOSTS` | `localhost ` | `dev.validate...` | `validate.buildingsmart.org` | +| `CERTBOT_DOMAIN` | `_` (skip SSL) | domain | domain | +| `NFS_SERVER_IP` | `10.0.0.3` | `10.0.0.5` | per-setup | +| `REGISTRY` | `localhost:5000` | `localhost:5000` | per-setup | +| `POSTGRES_PASSWORD` | `postgres` | strong | strong | +| `DJANGO_SECRET_KEY` | insecure default | random | random | +| B2C / Mailgun | empty | real creds | real creds | + +**Env file format rules (Swarm env files only — `.env.hetzner`, `.env.DEV_SWARM`, etc.):** +- No spaces around `=` — the Makefile uses `grep '^VAR=' | cut -d= -f2-` +- No quotes around values — Docker passes them literally +- No angle bracket placeholders like `` — they get passed as literal strings + +--- + +## 12. 
Quick Reference Card + +| Task | Command | +|---|---| +| Deploy (local/test) | `make start-swarm-local ENV_FILE=` | +| Deploy (with DB + NFS) | `make start-swarm ENV_FILE=` | +| Deploy (external DB + NFS) | `make start-swarm-nodb ENV_FILE=` | +| Copy SSL certs to Swarm | `cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/` | +| Restart frontend (after cert copy) | `docker service update --force validate_frontend` | +| Stop stack | `make stop-swarm` | +| Scale workers | `make scale-workers WORKERS=4` | +| Set worker limits | `make set-worker-limits CPU=2 MEM=2G` | +| Build + push images | `make swarm-push ENV_FILE=` | +| Service status | `make swarm-status` | +| Follow logs | `docker service logs -f validate_` | +| Force-restart service | `docker service update --force validate_backend` | +| Add worker node | `docker swarm join --token SWMTKN-... :2377` | +| Drain node | `docker node update --availability drain ` | +| Remove node | `docker swarm leave` (on worker) + `docker node rm ` (on manager) | +| Check MTU | `ping -M do -s 1372 ` | + From e650928250b4c799329610581904b371bedd8e5d Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Mon, 16 Mar 2026 17:15:32 +0000 Subject: [PATCH 10/12] runbook -> deploy guide --- docs/{swarm-runbook.md => swarm-deploy-guide.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/{swarm-runbook.md => swarm-deploy-guide.md} (100%) diff --git a/docs/swarm-runbook.md b/docs/swarm-deploy-guide.md similarity index 100% rename from docs/swarm-runbook.md rename to docs/swarm-deploy-guide.md From 4b8d99139ed946003da2efc8b6274f6778f9827b Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Mon, 16 Mar 2026 20:13:49 +0000 Subject: [PATCH 11/12] improve swarm makefile targets, fix db connection pooling --- Makefile | 32 ++++---- backend/core/settings.py | 4 +- docker-compose.swarm.nodb.yml | 137 ++++++++++++++++++++++++++++++++++ docs/swarm-deploy-guide.md | 11 +-- 4 files changed, 163 
insertions(+), 21 deletions(-) create mode 100644 docker-compose.swarm.nodb.yml diff --git a/Makefile b/Makefile index 6baa69ca..86259f97 100644 --- a/Makefile +++ b/Makefile @@ -28,22 +28,23 @@ stop: # --- Docker Swarm --- +REGISTRY ?= localhost:5000 WORKERS ?= 2 ENV_FILE ?= .env - -# Reads compose-level vars from ENV_FILE, substitutes into YAML via envsubst. -# Container env vars are loaded by docker stack deploy via the env_file: directive. -SWARM_VARS = REGISTRY CERTBOT_DOMAIN CERTBOT_EMAIL NFS_SERVER_IP POSTGRES_NAME POSTGRES_USER POSTGRES_PASSWORD -SWARM_ENV = ENV_FILE=$(ENV_FILE) \ - $(foreach v,$(SWARM_VARS),$(v)=$$(grep '^$(v)=' $(ENV_FILE) | cut -d= -f2-)) +SWARM_VARS = REGISTRY CERTBOT_DOMAIN CERTBOT_EMAIL NFS_SERVER_IP WORKER_CPU_LIMIT WORKER_MEMORY_LIMIT WORKER_CPU_RESERVATION WORKER_MEMORY_RESERVATION +SWARM_ENV = ENV_FILE="$(ENV_FILE)" $(foreach v,$(SWARM_VARS),$(v)="$(shell grep '^$(v)=' $(ENV_FILE) | head -1 | cut -d= -f2-)") start-swarm: - $(SWARM_ENV) envsubst < docker-compose.swarm.yml | docker stack deploy -c - --with-registry-auth validate + env $(SWARM_ENV) envsubst < docker-compose.swarm.yml | docker stack deploy -c - --with-registry-auth validate + +start-swarm-nodb: + env $(SWARM_ENV) envsubst < docker-compose.swarm.nodb.yml | docker stack deploy -c - --with-registry-auth validate start-swarm-local: - $(SWARM_ENV) envsubst < docker-compose.swarm.yml > /tmp/_swarm.yml - $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml > /tmp/_swarm_local.yml - docker stack deploy -c /tmp/_swarm.yml -c /tmp/_swarm_local.yml --with-registry-auth validate + env $(SWARM_ENV) envsubst < docker-compose.swarm.yml > /tmp/_swarm.yml && \ + env $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml > /tmp/_swarm.local.yml && \ + docker stack deploy -c /tmp/_swarm.yml -c /tmp/_swarm.local.yml --with-registry-auth validate && \ + rm -f /tmp/_swarm.yml /tmp/_swarm.local.yml stop-swarm: docker stack rm validate @@ -51,12 +52,13 @@ stop-swarm: scale-workers: docker 
service scale validate_worker=$(WORKERS) -CPU ?= 2 -MEM ?= 2G set-worker-limits: - docker service update --limit-cpu $(CPU) --limit-memory $(MEM) validate_worker - -REGISTRY ?= $$(grep '^REGISTRY=' $(ENV_FILE) | cut -d= -f2- || echo localhost:5000) + docker service update \ + $(if $(CPU),--limit-cpu $(CPU)) \ + $(if $(MEM),--limit-memory $(MEM)) \ + $(if $(CPU_RES),--reserve-cpu $(CPU_RES)) \ + $(if $(MEM_RES),--reserve-memory $(MEM_RES)) \ + validate_worker swarm-push: build docker tag buildingsmart/validationsvc-backend $(REGISTRY)/validationsvc-backend diff --git a/backend/core/settings.py b/backend/core/settings.py index ac1bd090..24d261da 100644 --- a/backend/core/settings.py +++ b/backend/core/settings.py @@ -246,8 +246,10 @@ "USER": os.environ.get("POSTGRES_USER", "postgres"), "PASSWORD": os.environ.get("POSTGRES_PASSWORD", "postgres"), "PORT": int(os.environ.get("POSTGRES_PORT", "5432")), + "CONN_MAX_AGE": int(os.environ.get("POSTGRES_CONN_MAX_AGE", 600)), + "CONN_HEALTH_CHECKS": True, "OPTIONS": { - "pool": True, + "pool": False, }, }, } diff --git a/docker-compose.swarm.nodb.yml b/docker-compose.swarm.nodb.yml new file mode 100644 index 00000000..bfe1fb11 --- /dev/null +++ b/docker-compose.swarm.nodb.yml @@ -0,0 +1,137 @@ +# Docker Swarm deployment configuration — external database (no containerized PostgreSQL) +# +# Usage: +# make start-swarm-nodb ENV_FILE=.env.DEV_SWARM +# +# Same as docker-compose.swarm.yml but without the db service. +# Set POSTGRES_HOST, POSTGRES_PORT, etc. in your env file to point to the external DB. 
+ +services: + + frontend: + image: ${REGISTRY}/validationsvc-frontend + ports: + - 80:80 + - 443:443 + environment: + CERTBOT_DOMAIN: ${CERTBOT_DOMAIN} + CERTBOT_EMAIL: ${CERTBOT_EMAIL} + volumes: + - letsencrypt_data:/etc/letsencrypt + - static_data:/app/backend/django_static + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + backend: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/server-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - static_data:/app/backend/django_static + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + expose: + - 8000 + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + healthcheck: + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/')\""] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + + worker: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/worker-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 2 + # No placement constraint - workers run on any node + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + + scheduler: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/worker-beat-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + redis: + image: redis:8.4-alpine + command: redis-server 
--protected-mode no --bind 0.0.0.0 + expose: + - 6379 + volumes: + - redis_data:/data + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + +networks: + validate: + driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" + +volumes: + static_data: + letsencrypt_data: + redis_data: + files_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/files_data" + gherkin_rules_log_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/gherkin_logs" diff --git a/docs/swarm-deploy-guide.md b/docs/swarm-deploy-guide.md index 94843db8..36bce9ba 100644 --- a/docs/swarm-deploy-guide.md +++ b/docs/swarm-deploy-guide.md @@ -204,8 +204,9 @@ ls -la /srv/nfs/gherkin_logs/ ### 3c. Set NFS_SERVER_IP in the env file ```bash -# In .env.hetzner (or .env.DEV_SWARM / .env.PROD): -NFS_SERVER_IP=10.0.0.3 # private IP of the NFS server +# In your env file (.env.hetzner, .env.DEV_SWARM, .env.PROD, etc.): +NFS_SERVER_IP= +# e.g. on Hetzner test server this was 10.0.0.3 — check your actual network with: hostname -I ``` The `docker-compose.swarm.yml` uses this in the NFS volume driver options. @@ -269,12 +270,12 @@ docker node ls ### 4e. 
Also configure insecure registry on manager (if using private IP for registry) ```bash -# Only needed if REGISTRY=10.0.0.3:5000 instead of localhost:5000 -echo '{ "insecure-registries": ["10.0.0.3:5000"] }' | sudo tee /etc/docker/daemon.json +# Only needed if REGISTRY=:5000 instead of localhost:5000 +echo '{ "insecure-registries": [":5000"] }' | sudo tee /etc/docker/daemon.json sudo systemctl restart docker ``` -**Important:** When using a private IP registry (`REGISTRY=10.0.0.3:5000`), EVERY node (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`. Otherwise workers get `No such image` errors. +**Important:** When using a private IP registry (e.g. `REGISTRY=10.0.0.3:5000`), EVERY node (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`. Otherwise workers get `No such image` errors. --- From aabe062e81813ed851fffaa6aa9d3ee20465d4f4 Mon Sep 17 00:00:00 2001 From: bSI Validation Service CI/CD Date: Mon, 16 Mar 2026 23:20:28 +0000 Subject: [PATCH 12/12] add automated add-worker/remove-worker, fix registry to use private IP --- Makefile | 39 ++- docs/swarm-considerations.md | 13 +- docs/swarm-deploy-guide.md | 515 +++++++---------------------------- 3 files changed, 150 insertions(+), 417 deletions(-) diff --git a/Makefile b/Makefile index 86259f97..bceeafb0 100644 --- a/Makefile +++ b/Makefile @@ -71,6 +71,43 @@ swarm-status: @echo "---" @docker service ps validate_worker +# Add a worker node to the Swarm cluster +# Usage: make add-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM +# Reads SWARM_WORKER_N entries and SWARM_SSH_USER from ENV_FILE +add-worker: + @test -n "$(NAME)" || (echo "Usage: make add-worker NAME= ENV_FILE=.env.DEV_SWARM" && exit 1) + $(eval SSH_USER := $(shell grep '^SWARM_SSH_USER=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval MANAGER_IP := $(shell grep '^NFS_SERVER_IP=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval WORKER_IP := $(shell grep '^SWARM_WORKER_' 
$(ENV_FILE) | grep '$(NAME)' | head -1 | cut -d: -f2)) + @test -n "$(WORKER_IP)" || (echo "ERROR: Worker '$(NAME)' not found in $(ENV_FILE). Add it as: SWARM_WORKER_N=$(NAME):" && exit 1) + @test -n "$(MANAGER_IP)" || (echo "ERROR: NFS_SERVER_IP not set in $(ENV_FILE)" && exit 1) + @test -n "$(SSH_USER)" || (echo "ERROR: SWARM_SSH_USER not set in $(ENV_FILE)" && exit 1) + @echo "==> Installing Docker on $(NAME) ($(WORKER_IP))..." + sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "curl -fsSL https://get.docker.com | sh" + @echo "==> Configuring insecure registry ($(MANAGER_IP):5000)..." + sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) 'echo '"'"'{ "insecure-registries": ["$(MANAGER_IP):5000"] }'"'"' | sudo tee /etc/docker/daemon.json && sudo systemctl restart docker' + @echo "==> Joining Swarm..." + sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "sudo docker swarm join --token $$(sudo docker swarm join-token worker -q) $(MANAGER_IP):2377" + @echo "==> Done! Node list:" + sudo docker node ls + +# Remove a worker node from the Swarm cluster +# Usage: make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM +remove-worker: + @test -n "$(NAME)" || (echo "Usage: make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM" && exit 1) + $(eval SSH_USER := $(shell grep '^SWARM_SSH_USER=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval WORKER_IP := $(shell grep '^SWARM_WORKER_' $(ENV_FILE) | grep '$(NAME)' | head -1 | cut -d: -f2)) + @echo "==> Draining $(NAME)..." + sudo docker node update --availability drain $(NAME) + @echo "==> Leaving swarm..." + -sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "sudo docker swarm leave" + @echo "==> Waiting for node to go down..." 
+	@for i in 1 2 3 4 5 6; do sleep 5; sudo docker node ls --format '{{.Hostname}} {{.Status}}' | grep -q '$(NAME) Down' && break; echo "  waiting..."; done
+	@echo "==> Removing node..."
+	sudo docker node rm $(NAME)
+	@echo "==> Done! Don't forget to remove the SWARM_WORKER entry from $(ENV_FILE)"
+	sudo docker node ls
+
 build:
 	docker compose build \
 		--build-arg GIT_COMMIT_HASH="$$(git rev-parse --short HEAD)" \
@@ -128,7 +165,7 @@ e2e-test: start-infra
 	cd e2e && npm install && npm run install-playwright && npm run test
 
 e2e-test-report: start-infra
-	cd e2e && npm install && npm run install-playwright && npm run test:html && npm run test:report
+	cd e2e && npm install && npm run install-playwright && npm run test:html && npm run test:report
 
 BRANCH ?= main
 SUBTREES := \
diff --git a/docs/swarm-considerations.md b/docs/swarm-considerations.md
index 40e77bb0..d25ee790 100644
--- a/docs/swarm-considerations.md
+++ b/docs/swarm-considerations.md
@@ -7,6 +7,7 @@ Compiled during IVS-719 development. Grouped by category.
 - **Single-node Swarm**: tested and working (Hetzner, 2026-03-10)
 - **Multi-node Swarm**: tested and working with 2 nodes + NFS (Hetzner, 2026-03-15)
 - **Single-node Swarm on Azure DEV**: tested and working with external DB + NFS (2026-03-15)
+- **Multi-node Swarm on Azure DEV**: tested and working — manager + worker node, tasks distributed across both (2026-03-16)
 - **CI/CD**: not yet adapted for Swarm — see section 5
 - **SSL/Certbot**: not tested with a real domain yet (using `CERTBOT_DOMAIN=_` to skip)
 - **Documentation**: user-facing docs (README, deployment guide) not yet updated for Swarm workflow
@@ -241,16 +242,20 @@ Impact:
 
 ---
 
-## 10. Insecure registry required on ALL nodes
+## 10. Registry must use private IP, not localhost
 
-When using `REGISTRY=10.0.0.3:5000` (private IP) instead of `localhost:5000`, **every node** — including the manager — needs the insecure registry configured:
+**Always set `REGISTRY=<private-ip>:5000`** (e.g. 
`10.0.0.5:5000`) in the env file, never `localhost:5000`. + +Why: `localhost` resolves to the local machine. On the manager, that works. On worker nodes, `localhost:5000` points to nothing — workers can't pull images and stay at 0/N replicas with `No such image` errors. + +**Every node** (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`: ```bash -echo '{ "insecure-registries": ["10.0.0.3:5000"] }' | sudo tee /etc/docker/daemon.json +echo '{ "insecure-registries": ["10.0.0.5:5000"] }' | sudo tee /etc/docker/daemon.json sudo systemctl restart docker ``` -Without this, services get `No such image` errors and stay at 0/N replicas. +The `make add-worker` target handles this automatically for workers. For the **manager**, add it manually once during initial setup (merge with any existing `daemon.json` settings like log-driver). --- diff --git a/docs/swarm-deploy-guide.md b/docs/swarm-deploy-guide.md index 36bce9ba..f484bcf6 100644 --- a/docs/swarm-deploy-guide.md +++ b/docs/swarm-deploy-guide.md @@ -1,492 +1,183 @@ -# Swarm Operations Runbook +# Swarm Deploy Guide -Copy-paste-ready commands for every Swarm operation. Refer to [swarm-considerations.md](swarm-considerations.md) for architecture, known issues, and design decisions. +Copy-paste commands for deploying and operating the Validation Service on Docker Swarm. -Last updated: 2026-03-16 +For architecture decisions, known issues, env file strategy, and gotchas, see [swarm-considerations.md](swarm-considerations.md). --- -## Table of Contents - -1. [First-Time Setup (Manager Node)](#1-first-time-setup-manager-node) -2. [Build, Push and Deploy](#2-build-push-and-deploy) -3. [Set Up NFS (Multi-Node)](#3-set-up-nfs-multi-node) -4. [Add a Worker Node to the Swarm](#4-add-a-worker-node-to-the-swarm) -5. [Scale Workers](#5-scale-workers) -6. [Redeploy After Code Changes](#6-redeploy-after-code-changes) -7. [Monitoring and Logs](#7-monitoring-and-logs) -8. 
[Shut Down the Swarm](#8-shut-down-the-swarm) -9. [Remove a Worker Node](#9-remove-a-worker-node) -10. [Full Reset (Nuclear Option)](#10-full-reset-nuclear-option) -11. [Environment File Strategy](#11-environment-file-strategy) -12. [Quick Reference Card](#12-quick-reference-card) - ---- - -## 1. First-Time Setup (Manager Node) - -Run once per machine that will act as a Swarm manager. This covers everything: Swarm init, NFS, registry, env, build, deploy. +## Deploy ```bash -# 1a. Initialize Swarm -docker swarm init --advertise-addr - -# 1b. Create .VERSION (gitignored, required by make build) -echo "1.0.0" > .VERSION - -# 1c. Prepare the .env file -# Copy .env (committed defaults) and customize for this server: -cp .env .env.myserver # name it after the environment: .env.hetzner, .env.DEV_SWARM, .env.PROD -# Edit manually — no spaces around '='. Variables you MUST change: -# PUBLIC_URL — server URL (e.g. http://10.0.0.3 or https://validate.example.org) -# DJANGO_ALLOWED_HOSTS — space-separated hostnames/IPs that Django accepts -# DJANGO_TRUSTED_ORIGINS — space-separated origins for CSRF -# DJANGO_SECRET_KEY — generate a random key for non-dev environments -# POSTGRES_PASSWORD — use a strong password for non-dev environments -# Variables to ADD (not in the base .env, Swarm-only): -# NFS_SERVER_IP — private IP of the NFS server (e.g. 10.0.0.3) -# REGISTRY — Docker registry address (e.g. localhost:5000) -# Optional (uncomment to set): -# CERTBOT_DOMAIN — real domain for SSL (leave as _ to skip) -# CERTBOT_EMAIL — email for Let's Encrypt -# WORKER_CPU_LIMIT, WORKER_MEMORY_LIMIT, etc. — resource limits - -# 1d. Start local registry (as plain container, NOT Swarm service) -docker run -d --name registry -p 5000:5000 --restart always registry:2 -# Verify: -curl -s http://localhost:5000/v2/ # should return {} - -# 1e. 
Set up NFS on the host -apt install -y nfs-kernel-server -mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs -chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs -chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs - -cat >> /etc/exports << 'EOF' -/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) -/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) -EOF - -exportfs -ra -systemctl restart nfs-kernel-server -showmount -e localhost +# Build, push images to registry, and deploy +make swarm-push ENV_FILE= +make start-swarm-nodb ENV_FILE= # external DB (Azure DEV/PROD) +# or: make start-swarm ENV_FILE= # containerized DB (Hetzner) +# or: make start-swarm-local ENV_FILE= # local testing (no NFS, no ClamAV) -# 1f. (If migrating from docker-compose) Stop the old stack first -docker compose -f docker-compose.load_balanced.nodb.yml --env-file .env.DEV down -# Volume names differ: compose uses "validation-service_files_data", swarm uses "validate_files_data" -# Check which volumes have data: -docker system df -v | grep -A 50 "Local Volumes" - -# 1g. Copy existing data from Docker volumes to NFS -# Use the COMPOSE volume name (validation-service_*), not the swarm name (validate_*): -docker run --rm -v validation-service_files_data:/src -v /srv/nfs/files_data:/dst alpine sh -c "cp -a /src/. /dst/" -docker run --rm -v validation-service_gherkin_rules_log_data:/src -v /srv/nfs/gherkin_logs:/dst alpine sh -c "cp -a /src/. /dst/" -# Verify: -du -sh /srv/nfs/files_data /srv/nfs/gherkin_logs - -# 1h. (If migrating) Copy SSL certs to Swarm volume -# Old compose used a bind mount (docker/frontend/letsencrypt/), Swarm uses a named volume. -# Deploy first (step 1j), then copy certs into the volume and restart frontend: -# cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/ -# docker service update --force validate_frontend - -# 1i. Fetch submodules -make fetch-modules - -# 1j. 
Build, push, deploy -make swarm-push -# For external DB (Azure): make start-swarm-nodb ENV_FILE=.env.DEV_SWARM -# For containerized DB: make start-swarm ENV_FILE=.env.hetzner -make start-swarm-nodb ENV_FILE=.env.DEV_SWARM - -# 1k. Verify +# Verify — all services should reach 1/1 within ~60s watch docker service ls ``` -Adjust NFS exports CIDR to match the network (Azure VNet: `10.0.0.0/16`, Hetzner: `10.0.0.0/24` or `*`). - -See [swarm-considerations.md](swarm-considerations.md) for known issues and gotchas that can trip you up during setup (NFS volume caching, network race conditions, env file format, registry config, SSL cert migration, etc.). +## Redeploy (after code changes) ---- - -## 2. Build, Push and Deploy +No rolling updates with `latest` tags — must tear down and redeploy. ```bash -# Build, tag and push to registry (swarm-push includes build) -make swarm-push ENV_FILE= - -# Deploy — pick the right target: -# Full stack with DB container + NFS: -make start-swarm ENV_FILE= - -# External DB (e.g. Azure PostgreSQL) + NFS: -make start-swarm-nodb ENV_FILE= - -# Single-node / local testing (no NFS, no ClamAV, 1 replica each): -make start-swarm-local ENV_FILE= - -# 2b. Watch services come up (all should reach 1/1 within ~60s) +make stop-swarm +# Wait ~15s for network cleanup +make swarm-push ENV_FILE= +make start-swarm-nodb ENV_FILE= watch docker service ls - -# Verify endpoints -curl -s -o /dev/null -w "%{http_code}" http://localhost/ # 200 -curl -s -o /dev/null -w "%{http_code}" http://localhost/api/ # 302 -curl -s -o /dev/null -w "%{http_code}" http://localhost/admin/ # 302 - -# 2c. 
(Optional) Set resource limits on workers -make set-worker-limits CPU=2 MEM=2G # limits only -make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations -``` - -**What `start-swarm` vs `start-swarm-nodb` vs `start-swarm-local` does:** - -| | `start-swarm` | `start-swarm-nodb` | `start-swarm-local` | -|---|---|---|---| -| Compose file | `swarm.yml` | `swarm.nodb.yml` | `swarm.yml` + `swarm.local.yml` | -| Database | Containerized PostgreSQL | External (e.g. Azure) | Containerized PostgreSQL | -| Volumes | NFS | NFS | Plain local volumes | -| ClamAV | Runs | Runs | Skipped | -| Replicas | backend: 2, worker: 2 | backend: 2, worker: 2 | All 1 | -| Use case | Hetzner, self-hosted | DEV/PROD (Azure DB) | Quick local testing | - ---- - -## 3. Set Up NFS (Multi-Node) - -Required before adding worker nodes. Workers need shared access to uploaded IFC files and gherkin logs. - -### 3a. On the NFS server (typically the manager node) - -```bash -# Install NFS -apt install -y nfs-kernel-server - -# Create export directories -mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs -chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs -chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs - -# Configure exports -cat >> /etc/exports << 'EOF' -/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) -/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) -EOF - -exportfs -ra -systemctl restart nfs-kernel-server - -# Verify -showmount -e localhost -``` - -### 3b. Copy existing data to NFS (if migrating from local volumes) - -```bash -# Copy files_data -docker run --rm \ - -v validate_files_data:/src \ - -v /srv/nfs/files_data:/dst \ - alpine sh -c "cp -a /src/. /dst/" - -# Copy gherkin_logs -docker run --rm \ - -v validate_gherkin_rules_log_data:/src \ - -v /srv/nfs/gherkin_logs:/dst \ - alpine sh -c "cp -a /src/. 
/dst/" - -# Verify -ls -la /srv/nfs/files_data/ -ls -la /srv/nfs/gherkin_logs/ -``` - -**Note:** If migrating from Docker Compose, the volume names may be prefixed differently (e.g. `validation-service_files_data` instead of `validate_files_data`). Check with `docker volume ls`. - -### 3c. Set NFS_SERVER_IP in the env file - -```bash -# In your env file (.env.hetzner, .env.DEV_SWARM, .env.PROD, etc.): -NFS_SERVER_IP= -# e.g. on Hetzner test server this was 10.0.0.3 — check your actual network with: hostname -I ``` -The `docker-compose.swarm.yml` uses this in the NFS volume driver options. - -### 3d. Redeploy with NFS volumes - +To force-restart a single service (same image, same env): ```bash -# Tear down existing stack (uses local volumes) -make stop-swarm - -# Wait ~15 seconds for cleanup, then redeploy with NFS -make start-swarm ENV_FILE= - -# Verify NFS volumes are mounted -docker volume inspect validate_files_data -# Should show Type: nfs in Options -``` - ---- - -## 4. Add a Worker Node to the Swarm - -### 4a. On the manager — get join token - -```bash -docker swarm join-token worker -# Outputs: docker swarm join --token SWMTKN-... :2377 +docker service update --force validate_backend ``` -### 4b. On the new worker node — prerequisites - -```bash -# Install Docker -curl -fsSL https://get.docker.com | sh - -# Install NFS client (needed for NFS volumes) -apt install -y nfs-common +## Add / Remove Worker Nodes -# Verify NFS is reachable -mount -t nfs4 :/srv/nfs/files_data /mnt && ls /mnt && umount /mnt +### Prerequisites -# Configure insecure registry (if using private registry over HTTP) -echo '{ "insecure-registries": [":5000"] }' | sudo tee /etc/docker/daemon.json -sudo systemctl restart docker -``` +1. Worker VM must be in the same VNet/subnet as the manager +2. Manager's SSH key must be on the worker (`~/.ssh/authorized_keys`). On Azure, use Portal > "Reset password > Add SSH public key" +3. 
Register the worker in the env file: + ``` + SWARM_WORKER_1=dev-vm-worker-1:10.0.0.4 + ``` -### 4c. Join the swarm +### Add ```bash -# Paste the join command from step 4a: -docker swarm join --token SWMTKN-... :2377 +# Installs Docker, configures registry, joins Swarm — all in one command +make add-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM ``` -### 4d. Verify on manager +### Remove ```bash -docker node ls -# Should show both nodes as Ready/Active -``` +# Drains tasks, leaves Swarm, removes node +make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM -### 4e. Also configure insecure registry on manager (if using private IP for registry) - -```bash -# Only needed if REGISTRY=:5000 instead of localhost:5000 -echo '{ "insecure-registries": [":5000"] }' | sudo tee /etc/docker/daemon.json -sudo systemctl restart docker +# Then: remove SWARM_WORKER_N line from env file, delete VM if temporary ``` -**Important:** When using a private IP registry (e.g. `REGISTRY=10.0.0.3:5000`), EVERY node (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`. Otherwise workers get `No such image` errors. - ---- - -## 5. 
Scale Workers +## Scale Workers ```bash -# Scale to N workers (Swarm distributes across available nodes) +# Scale to N worker containers (distributed across nodes) make scale-workers WORKERS=4 -# Check placement — see which node each worker is on +# Check which node each worker runs on docker service ps validate_worker -# Set resource limits (applied per-container, not total) -make set-worker-limits CPU=2 MEM=2G # limits only -make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations +# Set resource limits per container +make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G ``` -**Per-environment resource limits:** - -| Environment | CPU limit | Memory limit | Notes | -|---|---|---|---| -| Hetzner (8GB, no ClamAV) | 2 | 2G | Max ~2 workers | -| DEV | 1 | 1G | | -| PROD | 4 | 4G | Includes ClamAV ~1GB | - -**ClamAV RAM warning:** Each worker with ClamAV loads ~1GB of virus signatures. 4 workers + 1 scheduler = ~5GB just for ClamAV. Use the local override (skips ClamAV) on small servers, or use 16GB+ RAM. - ---- - -## 6. Redeploy After Code Changes - -There is no rolling update for `latest` tags — must tear down and redeploy. - -```bash -# 1. Stop -make stop-swarm - -# 2. Rebuild and push -make swarm-push ENV_FILE= - -# 3. Redeploy -make start-swarm ENV_FILE= - -# 4. Verify -watch docker service ls -``` +**Terminology:** A worker _node_ is a VM. Each node runs worker _replicas_ (containers). Each replica runs multiple Celery _processes_ (set by `CELERY_CONCURRENCY`, default 4). 
-**Faster alternative for single-service changes:** +## Monitoring ```bash -# Force-restart one service (uses existing image, same config — does NOT re-read .env) -docker service update --force validate_backend - -# Or rebuild and push just the backend image, then update (still same env): -make swarm-push ENV_FILE= -docker service update --image localhost:5000/validationsvc-backend:latest --force validate_backend - -# To pick up .env changes, you must redeploy (stop + start-swarm) +make swarm-status # service overview + worker placement +docker service logs -f validate_worker # follow logs (also: backend, frontend, scheduler) +docker stats --no-stream # CPU/memory per container +docker node ls # node health +journalctl -k | grep "out of memory" # check for OOM kills ``` ---- - -## 7. Monitoring and Logs +## Stop / Start ```bash -# Service overview -docker service ls - -# Detailed worker status (shows which node, current state) -make swarm-status - -# Follow logs for a service -docker service logs -f validate_frontend -docker service logs -f validate_backend -docker service logs -f validate_worker -docker service logs -f validate_scheduler -docker service logs -f validate_db - -# Resource usage (CPU/memory per container) -docker stats --no-stream - -# Check for OOM kills -journalctl -k | grep "out of memory" - -# Check node status -docker node ls - -# Inspect a specific service -docker service inspect validate_worker --pretty +make stop-swarm # removes stack, keeps volumes and Swarm membership +make start-swarm-nodb ENV_FILE= # redeploy — volumes are still there ``` ---- - -## 8. Shut Down the Swarm +## Full Reset -### Stop the stack (keeps volumes and swarm membership) +Removes everything — stack, volumes, images, Swarm. Start fresh from first-time setup. 
```bash make stop-swarm -# Equivalent to: docker stack rm validate -# Volumes are preserved — data survives restarts -``` - -### Restart after shutdown - -```bash -# Just redeploy — volumes are still there -make start-swarm ENV_FILE= +docker rm -f registry +docker volume prune -f # WARNING: deletes DB data and uploaded files +docker system prune -af +docker swarm leave --force ``` --- -## 9. Remove a Worker Node - -```bash -# On manager: drain the node first (moves tasks to other nodes) -docker node update --availability drain - -# Wait for tasks to migrate, then on the worker node: -docker swarm leave +## First-Time Setup (Manager Node) -# On manager: remove the node from the list -docker node rm -``` +One-time setup for a new manager. Once done, use the commands above for daily operations. ---- +```bash +# 1. Init Swarm +docker swarm init --advertise-addr -## 10. Full Reset (Nuclear Option) +# 2. Start local registry +docker run -d --name registry -p 5000:5000 --restart always registry:2 -Removes everything — stack, volumes, images, swarm. +# 3. Configure insecure registry (required for multi-node) +# Add "insecure-registries": [":5000"] to /etc/docker/daemon.json +# Then: sudo systemctl restart docker -```bash -# 1. Remove the stack -make stop-swarm +# 4. Set up NFS +apt install -y nfs-kernel-server +mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs +chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs +chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs -# 2. Remove registry -docker rm -f registry +cat >> /etc/exports << 'EOF' +/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +EOF -# 3. Remove all volumes (WARNING: deletes DB data and uploaded files!) -docker volume prune -f +exportfs -ra && systemctl restart nfs-kernel-server -# 4. Remove all images -docker system prune -af +# 5. Create .VERSION +echo "1.0.0" > .VERSION -# 5. 
Leave the swarm -docker swarm leave --force +# 6. Prepare env file — see swarm-considerations.md for env file strategy +cp .env .env.myserver # customize: PUBLIC_URL, DJANGO_ALLOWED_HOSTS, NFS_SERVER_IP, REGISTRY, etc. -# Then start fresh from section 1 +# 7. Fetch submodules, build, deploy +make fetch-modules +make swarm-push ENV_FILE= +make start-swarm-nodb ENV_FILE= ``` ---- - -## 11. Environment File Strategy +### Migrating from Docker Compose -The `.env` in the repo root is committed with safe defaults (localhost, no secrets). Each environment gets its own gitignored override. +```bash +# Stop old stack +docker compose -f docker-compose.load_balanced.nodb.yml --env-file .env.DEV down -| File | Purpose | Committed? | -|---|---|---| -| `.env` | Shared defaults for docker compose (local dev, forking) | Yes | -| `.env.hetzner` | Hetzner test server (IPs, NFS, registry) | No | -| `.env.DEV` | DEV environment (docker compose, used by CI/CD) | No | -| `.env.DEV_SWARM` | DEV Swarm deployment (external Azure DB, NFS) | No | -| `.env.PROD` | Production (real secrets, domains, SSL) | No | +# Copy data from compose volumes to NFS (volume names differ: validation-service_* vs validate_*) +docker run --rm -v validation-service_files_data:/src -v /srv/nfs/files_data:/dst alpine sh -c "cp -a /src/. /dst/" +docker run --rm -v validation-service_gherkin_rules_log_data:/src -v /srv/nfs/gherkin_logs:/dst alpine sh -c "cp -a /src/. 
/dst/" -**Deploy with:** -```bash -make start-swarm ENV_FILE=.env.hetzner # Hetzner (with DB container) -make start-swarm-nodb ENV_FILE=.env.DEV_SWARM # DEV (external Azure DB) -make start-swarm ENV_FILE=.env.PROD # PROD +# Copy SSL certs (after first deploy) +cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/ +docker service update --force validate_frontend ``` -**What changes per environment:** - -| Variable | Hetzner (test) | DEV | PROD | -|---|---|---|---| -| `DEBUG` | `True` | `True` | `False` | -| `ENV` | `Development` | `Development` | `Production` | -| `PUBLIC_URL` | `http://` | `https://dev.validate...` | `https://validate.buildingsmart.org` | -| `DJANGO_ALLOWED_HOSTS` | `localhost ` | `dev.validate...` | `validate.buildingsmart.org` | -| `CERTBOT_DOMAIN` | `_` (skip SSL) | domain | domain | -| `NFS_SERVER_IP` | `10.0.0.3` | `10.0.0.5` | per-setup | -| `REGISTRY` | `localhost:5000` | `localhost:5000` | per-setup | -| `POSTGRES_PASSWORD` | `postgres` | strong | strong | -| `DJANGO_SECRET_KEY` | insecure default | random | random | -| B2C / Mailgun | empty | real creds | real creds | - -**Env file format rules (Swarm env files only — `.env.hetzner`, `.env.DEV_SWARM`, etc.):** -- No spaces around `=` — the Makefile uses `grep '^VAR=' | cut -d= -f2-` -- No quotes around values — Docker passes them literally -- No angle bracket placeholders like `` — they get passed as literal strings - --- -## 12. 
Quick Reference Card +## Quick Reference | Task | Command | |---|---| -| Deploy (local/test) | `make start-swarm-local ENV_FILE=` | -| Deploy (with DB + NFS) | `make start-swarm ENV_FILE=` | -| Deploy (external DB + NFS) | `make start-swarm-nodb ENV_FILE=` | -| Copy SSL certs to Swarm | `cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/` | -| Restart frontend (after cert copy) | `docker service update --force validate_frontend` | -| Stop stack | `make stop-swarm` | +| Deploy (external DB) | `make start-swarm-nodb ENV_FILE=` | +| Deploy (with DB) | `make start-swarm ENV_FILE=` | +| Stop | `make stop-swarm` | +| Build + push | `make swarm-push ENV_FILE=` | | Scale workers | `make scale-workers WORKERS=4` | -| Set worker limits | `make set-worker-limits CPU=2 MEM=2G` | -| Build + push images | `make swarm-push ENV_FILE=` | -| Service status | `make swarm-status` | -| Follow logs | `docker service logs -f validate_` | -| Force-restart service | `docker service update --force validate_backend` | -| Add worker node | `docker swarm join --token SWMTKN-... :2377` | -| Drain node | `docker node update --availability drain ` | -| Remove node | `docker swarm leave` (on worker) + `docker node rm ` (on manager) | -| Check MTU | `ping -M do -s 1372 ` | - +| Set limits | `make set-worker-limits CPU=2 MEM=2G` | +| Add worker | `make add-worker NAME= ENV_FILE=` | +| Remove worker | `make remove-worker NAME= ENV_FILE=` | +| Status | `make swarm-status` | +| Logs | `docker service logs -f validate_` | +| Force-restart | `docker service update --force validate_` |