diff --git a/.env b/.env index fb79ac74..982cd621 100644 --- a/.env +++ b/.env @@ -1,51 +1,59 @@ -# variables in Docker Compose -DEBUG = True -ENV = Development -PUBLIC_URL = http://localhost - -# Certbot -CERTBOT_DOMAIN = _ -CERTBOT_EMAIL = - -# Django -MEDIA_ROOT = /files_storage -DJANGO_DB = postgresql -DJANGO_SECRET_KEY = django-insecure-um7-^+&jbk_=80*xcc9uf4nh$4koida7)ja&6!vb*$8@n288jk -DJANGO_ALLOWED_HOSTS = localhost -DJANGO_TRUSTED_ORIGINS = http://localhost:3000 http://localhost http://localhost:8000 -DJANGO_LOG_LEVEL = INFO -GHERKIN_LOG_FOLDER = /gherkin_logs -DJANGO_GUNICORN_WORKERS = 3 -DJANGO_GUNICORN_THREADS_PER_WORKER = 4 - -# DB -POSTGRES_HOST = db -POSTGRES_NAME = postgres -POSTGRES_USER = postgres -POSTGRES_PASSWORD = postgres -POSTGRES_PORT = 5432 - -# Worker -REDIS_PORT = 6379 -CELERY_BROKER_URL = redis://redis:6379/0 -CELERY_TASK_SOFT_TIME_LIMIT = 3600 -CELERY_TASK_TIME_LIMIT = 4000 -TASK_TIMEOUT_LIMIT = 3600 -DJANGO_DB_USER_CONTEXT = SYSTEM -DJANGO_DB_BULK_CREATE_BATCH_SIZE = 1000 -CELERY_CONCURRENCY = 4 - -# Email -MAILGUN_API_URL = -MAILGUN_API_KEY = -MAILGUN_FROM_NAME = Validation Service -MAILGUN_FROM_EMAIL = noreply@localhost -ADMIN_EMAIL = noreply@localhost -CONTACT_EMAIL = noreply@localhost - -# IAM -B2C_CLIENT_ID = -B2C_CLIENT_SECRET = -B2C_AUTHORITY = -B2C_USER_FLOW = -USE_WHITELIST = False +# variables in Docker Compose +DEBUG = True +ENV = Development +PUBLIC_URL = http://localhost + +# Certbot +CERTBOT_DOMAIN = _ +CERTBOT_EMAIL = + +# Django +MEDIA_ROOT = /files_storage +DJANGO_DB = postgresql +DJANGO_SECRET_KEY = django-insecure-um7-^+&jbk_=80*xcc9uf4nh$4koida7)ja&6!vb*$8@n288jk +DJANGO_ALLOWED_HOSTS = localhost +DJANGO_TRUSTED_ORIGINS = http://localhost:3000 http://localhost http://localhost:8000 +DJANGO_LOG_LEVEL = INFO +GHERKIN_LOG_FOLDER = /gherkin_logs +DJANGO_GUNICORN_WORKERS = 3 +DJANGO_GUNICORN_THREADS_PER_WORKER = 4 + +# DB +POSTGRES_HOST = db +POSTGRES_NAME = postgres +POSTGRES_USER = postgres +POSTGRES_PASSWORD = 
postgres +POSTGRES_PORT = 5432 + +# Worker +REDIS_PORT = 6379 +CELERY_BROKER_URL = redis://redis:6379/0 +CELERY_TASK_SOFT_TIME_LIMIT = 3600 +CELERY_TASK_TIME_LIMIT = 4000 +TASK_TIMEOUT_LIMIT = 3600 +DJANGO_DB_USER_CONTEXT = SYSTEM +DJANGO_DB_BULK_CREATE_BATCH_SIZE = 1000 +CELERY_CONCURRENCY = 4 + +# Email +MAILGUN_API_URL = +MAILGUN_API_KEY = +MAILGUN_FROM_NAME = Validation Service +MAILGUN_FROM_EMAIL = noreply@localhost +ADMIN_EMAIL = noreply@localhost +CONTACT_EMAIL = noreply@localhost + +# IAM +B2C_CLIENT_ID = +B2C_CLIENT_SECRET = +B2C_AUTHORITY = +B2C_USER_FLOW = +USE_WHITELIST = False + +# Swarm (ignored by docker compose) +# REGISTRY=localhost:5000 +# NFS_SERVER_IP=10.0.0.1 +# WORKER_CPU_LIMIT=2.0 +# WORKER_CPU_RESERVATION=1.0 +# WORKER_MEMORY_LIMIT=2G +# WORKER_MEMORY_RESERVATION=1G diff --git a/Makefile b/Makefile index 0163120f..bceeafb0 100644 --- a/Makefile +++ b/Makefile @@ -26,6 +26,88 @@ start-infra-only: stop: docker compose down +# --- Docker Swarm --- + +REGISTRY ?= localhost:5000 +WORKERS ?= 2 +ENV_FILE ?= .env +SWARM_VARS = REGISTRY CERTBOT_DOMAIN CERTBOT_EMAIL NFS_SERVER_IP WORKER_CPU_LIMIT WORKER_MEMORY_LIMIT WORKER_CPU_RESERVATION WORKER_MEMORY_RESERVATION +SWARM_ENV = ENV_FILE="$(ENV_FILE)" $(foreach v,$(SWARM_VARS),$(v)="$(shell grep '^$(v)=' $(ENV_FILE) | head -1 | cut -d= -f2-)") + +start-swarm: + env $(SWARM_ENV) envsubst < docker-compose.swarm.yml | docker stack deploy -c - --with-registry-auth validate + +start-swarm-nodb: + env $(SWARM_ENV) envsubst < docker-compose.swarm.nodb.yml | docker stack deploy -c - --with-registry-auth validate + +start-swarm-local: + env $(SWARM_ENV) envsubst < docker-compose.swarm.yml > /tmp/_swarm.yml && \ + env $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml > /tmp/_swarm.local.yml && \ + docker stack deploy -c /tmp/_swarm.yml -c /tmp/_swarm.local.yml --with-registry-auth validate && \ + rm -f /tmp/_swarm.yml /tmp/_swarm.local.yml + +stop-swarm: + docker stack rm validate + +scale-workers: + docker 
service scale validate_worker=$(WORKERS) + +set-worker-limits: + docker service update \ + $(if $(CPU),--limit-cpu $(CPU)) \ + $(if $(MEM),--limit-memory $(MEM)) \ + $(if $(CPU_RES),--reserve-cpu $(CPU_RES)) \ + $(if $(MEM_RES),--reserve-memory $(MEM_RES)) \ + validate_worker + +swarm-push: build + docker tag buildingsmart/validationsvc-backend $(REGISTRY)/validationsvc-backend + docker tag buildingsmart/validationsvc-frontend $(REGISTRY)/validationsvc-frontend + docker push $(REGISTRY)/validationsvc-backend + docker push $(REGISTRY)/validationsvc-frontend + +swarm-status: + @docker service ls + @echo "---" + @docker service ps validate_worker + +# Add a worker node to the Swarm cluster +# Usage: make add-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM +# Reads SWARM_WORKER_N entries and SWARM_SSH_USER from ENV_FILE +add-worker: + @test -n "$(NAME)" || (echo "Usage: make add-worker NAME= ENV_FILE=.env.DEV_SWARM" && exit 1) + $(eval SSH_USER := $(shell grep '^SWARM_SSH_USER=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval MANAGER_IP := $(shell grep '^NFS_SERVER_IP=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval WORKER_IP := $(shell grep '^SWARM_WORKER_' $(ENV_FILE) | grep '$(NAME)' | head -1 | cut -d: -f2)) + @test -n "$(WORKER_IP)" || (echo "ERROR: Worker '$(NAME)' not found in $(ENV_FILE). Add it as: SWARM_WORKER_N=$(NAME):" && exit 1) + @test -n "$(MANAGER_IP)" || (echo "ERROR: NFS_SERVER_IP not set in $(ENV_FILE)" && exit 1) + @test -n "$(SSH_USER)" || (echo "ERROR: SWARM_SSH_USER not set in $(ENV_FILE)" && exit 1) + @echo "==> Installing Docker on $(NAME) ($(WORKER_IP))..." + sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "curl -fsSL https://get.docker.com | sh" + @echo "==> Configuring insecure registry ($(MANAGER_IP):5000)..." 
+ sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) 'echo '"'"'{ "insecure-registries": ["$(MANAGER_IP):5000"] }'"'"' | sudo tee /etc/docker/daemon.json && sudo systemctl restart docker' + @echo "==> Joining Swarm..." + sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "sudo docker swarm join --token $$(sudo docker swarm join-token worker -q) $(MANAGER_IP):2377" + @echo "==> Done! Node list:" + sudo docker node ls + +# Remove a worker node from the Swarm cluster +# Usage: make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM +remove-worker: + @test -n "$(NAME)" || (echo "Usage: make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM" && exit 1) + $(eval SSH_USER := $(shell grep '^SWARM_SSH_USER=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval WORKER_IP := $(shell grep '^SWARM_WORKER_' $(ENV_FILE) | grep '$(NAME)' | head -1 | cut -d: -f2)) + @echo "==> Draining $(NAME)..." + sudo docker node update --availability drain $(NAME) + @echo "==> Leaving swarm..." + -sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "sudo docker swarm leave" + @echo "==> Waiting for node to go down..." + @for i in 1 2 3 4 5 6; do sleep 5; sudo docker node ls --format '{{.Hostname}} {{.Status}}' | grep -q '$(NAME) Down' && break; echo " waiting..."; done + @echo "==> Removing node..." + sudo docker node rm $(NAME) + @echo "==> Done! 
Don't forget to remove the SWARM_WORKER entry from $(ENV_FILE)" + sudo docker node ls + build: docker compose build \ --build-arg GIT_COMMIT_HASH="$$(git rev-parse --short HEAD)" \ @@ -83,7 +165,7 @@ e2e-test: start-infra cd e2e && npm install && npm run install-playwright && npm run test e2e-test-report: start-infra - cd e2e && npm install && npm run install-playwright && npm run test:html && npm run test:report + cd e2e && npm install && npm run install-playwright && npm run test:html && npm run test:report BRANCH ?= main SUBTREES := \ diff --git a/backend/apps/ifc_validation/api/v1/views.py b/backend/apps/ifc_validation/api/v1/views.py index ae7bb5e1..838f09e0 100644 --- a/backend/apps/ifc_validation/api/v1/views.py +++ b/backend/apps/ifc_validation/api/v1/views.py @@ -195,6 +195,7 @@ def post(self, request, *args, **kwargs): #file = os.path.join(MEDIA_ROOT, uploaded_file['file_name']) #uploaded_file['size'] = os.path.getsize(file) uploaded_file['size'] = file_length + f.seek(0) instance = serializer.save() # submit task for background execution diff --git a/backend/apps/ifc_validation/tasks/processing/instance_completion.py b/backend/apps/ifc_validation/tasks/processing/instance_completion.py index 2fb5b3ba..c2940e56 100644 --- a/backend/apps/ifc_validation/tasks/processing/instance_completion.py +++ b/backend/apps/ifc_validation/tasks/processing/instance_completion.py @@ -18,7 +18,7 @@ import itertools import functools - file_path, step_ids = file_path, step_ids = json.load(sys.stdin) + file_path, step_ids = json.load(sys.stdin) ifc_file = ifcopenshell.open(file_path) def filter_serializable(v): def inner(k, v): diff --git a/backend/core/settings.py b/backend/core/settings.py index 78f1e63d..90061152 100644 --- a/backend/core/settings.py +++ b/backend/core/settings.py @@ -246,8 +246,10 @@ "USER": os.environ.get("POSTGRES_USER", "postgres"), "PASSWORD": os.environ.get("POSTGRES_PASSWORD", "postgres"), "PORT": int(os.environ.get("POSTGRES_PORT", "5432")), + 
"CONN_MAX_AGE": int(os.environ.get("POSTGRES_CONN_MAX_AGE", 600)), + "CONN_HEALTH_CHECKS": True, "OPTIONS": { - "pool": True, + "pool": False, }, }, } diff --git a/docker-compose.swarm.local.yml b/docker-compose.swarm.local.yml new file mode 100644 index 00000000..c05b5c9c --- /dev/null +++ b/docker-compose.swarm.local.yml @@ -0,0 +1,67 @@ +# Override: single-node local testing (no NFS, no ClamAV, reduced replicas) +# +# Usage: +# docker stack deploy -c docker-compose.swarm.yml -c docker-compose.swarm.local.yml validate +# +# For production/NFS testing, use docker-compose.swarm.yml directly. + +services: + + frontend: + environment: + CERTBOT_DOMAIN: _ + CERTBOT_EMAIL: x + + backend: + deploy: + replicas: 1 + + worker: + entrypoint: /bin/sh + command: + - -c + - | + set -e + until cd /files_storage; do echo "Waiting for files_storage..."; done + until cd /app/backend; do echo "Waiting for server volume..."; done + while ! pg_isready -h "$$POSTGRES_HOST" -p "$$POSTGRES_PORT" -d "$$POSTGRES_NAME" -U "$$POSTGRES_USER" 2>/dev/null; do + echo "Waiting for DB..." + sleep 5 + done + echo "DB is ready. Starting worker (no ClamAV)." + rm -f /usr/bin/clamdscan /usr/bin/clamscan 2>/dev/null || true + CELERY_CONCURRENCY=$${CELERY_CONCURRENCY:-4} + echo "Celery concurrency: $$CELERY_CONCURRENCY" + celery --app=core worker --loglevel=info --concurrency $$CELERY_CONCURRENCY --task-events --hostname=worker@%n + deploy: + replicas: 1 + resources: + limits: + cpus: "2.0" + memory: 2G + reservations: + cpus: "0.5" + memory: 512M + + scheduler: + entrypoint: /bin/sh + command: + - -c + - | + set -e + until cd /files_storage; do echo "Waiting for files_storage..."; done + until cd /app/backend; do echo "Waiting for server volume..."; done + while ! pg_isready -h "$$POSTGRES_HOST" -p "$$POSTGRES_PORT" -d "$$POSTGRES_NAME" -U "$$POSTGRES_USER" 2>/dev/null; do + echo "Waiting for DB..." + sleep 5 + done + echo "DB is ready. Starting scheduler (no ClamAV)." 
+ rm -f /usr/bin/clamdscan /usr/bin/clamscan 2>/dev/null || true + CELERY_CONCURRENCY=$${CELERY_CONCURRENCY:-4} + celery --app=core worker --beat --loglevel=info --concurrency $$CELERY_CONCURRENCY --task-events --hostname=worker-beat@%n + +volumes: + files_data: + driver: local + gherkin_rules_log_data: + driver: local diff --git a/docker-compose.swarm.nodb.yml b/docker-compose.swarm.nodb.yml new file mode 100644 index 00000000..bfe1fb11 --- /dev/null +++ b/docker-compose.swarm.nodb.yml @@ -0,0 +1,137 @@ +# Docker Swarm deployment configuration — external database (no containerized PostgreSQL) +# +# Usage: +# make start-swarm-nodb ENV_FILE=.env.DEV_SWARM +# +# Same as docker-compose.swarm.yml but without the db service. +# Set POSTGRES_HOST, POSTGRES_PORT, etc. in your env file to point to the external DB. + +services: + + frontend: + image: ${REGISTRY}/validationsvc-frontend + ports: + - 80:80 + - 443:443 + environment: + CERTBOT_DOMAIN: ${CERTBOT_DOMAIN} + CERTBOT_EMAIL: ${CERTBOT_EMAIL} + volumes: + - letsencrypt_data:/etc/letsencrypt + - static_data:/app/backend/django_static + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + backend: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/server-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - static_data:/app/backend/django_static + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + expose: + - 8000 + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + healthcheck: + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/')\""] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + + worker: + image: ${REGISTRY}/validationsvc-backend 
+ entrypoint: /app/backend/worker-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 2 + # No placement constraint - workers run on any node + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + + scheduler: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/worker-beat-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + redis: + image: redis:8.4-alpine + command: redis-server --protected-mode no --bind 0.0.0.0 + expose: + - 6379 + volumes: + - redis_data:/data + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + +networks: + validate: + driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" + +volumes: + static_data: + letsencrypt_data: + redis_data: + files_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/files_data" + gherkin_rules_log_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/gherkin_logs" diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml new file mode 100644 index 00000000..ddaac915 --- /dev/null +++ b/docker-compose.swarm.yml @@ -0,0 +1,169 @@ +# Docker Swarm deployment configuration +# +# Usage: +# 1. Build and push images: make swarm-push +# 2. Deploy: make start-swarm +# 3. Scale workers: make scale-workers WORKERS=5 +# 4. Status: make swarm-status +# 5. 
Tear down: make stop-swarm +# +# Prerequisites: +# - docker swarm init +# - Local registry: docker service create --name registry --publish 5000:5000 registry:2 +# - NFS server configured (see PR description) +# +# NFS: Set NFS_SERVER_IP in .env (default: 10.0.0.1). +# For local testing without NFS, override volumes with plain named volumes. + +services: + + frontend: + image: ${REGISTRY}/validationsvc-frontend + ports: + - 80:80 + - 443:443 + environment: + CERTBOT_DOMAIN: ${CERTBOT_DOMAIN} + CERTBOT_EMAIL: ${CERTBOT_EMAIL} + volumes: + - letsencrypt_data:/etc/letsencrypt + - static_data:/app/backend/django_static + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + backend: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/server-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - static_data:/app/backend/django_static + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + expose: + - 8000 + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + healthcheck: + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/')\""] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + + worker: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/worker-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 2 + # No placement constraint - workers run on any node + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + # Resource limits applied post-deploy via: + # docker service update --limit-cpu 2 
--limit-memory 2G validate_worker + + scheduler: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/worker-beat-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + redis: + image: redis:8.4-alpine + command: redis-server --protected-mode no --bind 0.0.0.0 + expose: + - 6379 + volumes: + - redis_data:/data + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + db: + image: postgres:16.10-alpine + environment: + POSTGRES_DB: ${POSTGRES_NAME} + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + expose: + - 5432 + volumes: + - postgres_data:/var/lib/postgresql/data/ + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: any + delay: 5s + +networks: + validate: + driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" + +volumes: + static_data: + letsencrypt_data: + postgres_data: + redis_data: + files_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/files_data" + gherkin_rules_log_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/gherkin_logs" diff --git a/docker/backend/worker-beat-entrypoint.sh b/docker/backend/worker-beat-entrypoint.sh index 23c38f6e..1eeb5afa 100644 --- a/docker/backend/worker-beat-entrypoint.sh +++ b/docker/backend/worker-beat-entrypoint.sh @@ -23,9 +23,6 @@ freshclam service clamav-freshclam start service clamav-daemon start -python manage.py makemigrations -python manage.py migrate - 
CELERY_CONCURRENCY=${CELERY_CONCURRENCY:-6} # default 6 worker processes echo "Celery concurrency: $CELERY_CONCURRENCY" diff --git a/docker/backend/worker-entrypoint.sh b/docker/backend/worker-entrypoint.sh index c08929b8..cbe08319 100644 --- a/docker/backend/worker-entrypoint.sh +++ b/docker/backend/worker-entrypoint.sh @@ -23,9 +23,6 @@ freshclam service clamav-freshclam start service clamav-daemon start -python manage.py makemigrations -python manage.py migrate - CELERY_CONCURRENCY=${CELERY_CONCURRENCY:-6} # default 6 worker processes echo "Celery concurrency: $CELERY_CONCURRENCY" diff --git a/docs/swarm-considerations.md b/docs/swarm-considerations.md new file mode 100644 index 00000000..d25ee790 --- /dev/null +++ b/docs/swarm-considerations.md @@ -0,0 +1,492 @@ +# Docker Swarm — Considerations & Known Issues + +Compiled during IVS-719 development. Grouped by category. + +## Status + +- **Single-node Swarm**: tested and working (Hetzner, 2026-03-10) +- **Multi-node Swarm**: tested and working with 2 nodes + NFS (Hetzner, 2026-03-15) +- **Single-node Swarm on Azure DEV**: tested and working with external DB + NFS (2026-03-15) +- **Multi-node Swarm on Azure DEV**: tested and working — manager + worker node, tasks distributed across both (2026-03-16) +- **CI/CD**: not yet adapted for Swarm — see section 5 +- **SSL/Certbot**: not tested with a real domain yet (using `CERTBOT_DOMAIN=_` to skip) +- **Documentation**: user-facing docs (README, deployment guide) not yet updated for Swarm workflow + +--- + +# Architecture & Design + +## 1. Architecture overview + +Every worker needs access to `/files_storage` (uploaded IFC files) and `/gherkin_logs`. In Docker Compose, these are local volumes on one machine. In Swarm, workers run on **different machines** — so files must be shared via NFS. 
+ +``` + ┌─────────┐ + │ Frontend │ (Nginx + React) + │ :80/443 │ + └────┬─────┘ + │ + ┌────▼─────┐ + │ Backend │ (Django API — manager node) + │ :8000 │ + └────┬─────┘ + │ enqueues tasks + ┌────▼─────┐ + │ Redis │ (Celery broker — manager node) + │ :6379 │ + └────┬─────┘ + │ workers consume via overlay network + ┌─────────┼──────────┐ + │ │ │ + ┌────▼───┐ ┌───▼────┐ ┌──▼─────┐ + │Worker 1│ │Worker 2│ │Worker N│ (any node in swarm) + └────┬───┘ └───┬────┘ └──┬─────┘ + │ │ │ + │ NFS mount │ + └─────────┼──────────┘ + ┌────▼─────┐ + │/srv/nfs/ │ (NFS server on manager node) + │files_data│ + └──────────┘ + │ same machine + ┌────▼─────┐ + │ Postgres │ (manager node) + └──────────┘ + + ┌───────────┐ + │ Scheduler │ (1 replica, manager only) + │ --beat │ file retention: archive@90d, remove@180d + └───────────┘ +``` + +**How it works:** +- The **manager node** runs: frontend, backend, DB, Redis, scheduler, and the NFS server +- **Worker nodes** only run Celery workers — they mount NFS volumes automatically via the Docker volume driver +- The **overlay network** (Docker Swarm native) connects workers to Redis and Postgres across machines +- NFS gives workers read/write access to uploaded files as if they were local + +**If NFS goes down, all workers stall** — `hard,timeo=600` mount options mean workers will hang (not error) until NFS recovers. This is intentional: better to wait than to silently fail. + +For Azure: restrict NFS exports to VNet CIDR (e.g. `10.0.0.0/16(rw,sync,...)`), not `*`. + +--- + +## 2. Build and deploy are now separate steps + +Docker Compose: `docker compose build && docker compose up` — build and run in one flow. + +Docker Swarm: worker nodes **cannot build images**. They pull from a registry. 
+ +``` +Developer machine Registry Swarm nodes + build ──push──> localhost:5000 <──pull── worker-1, worker-2 +``` + +Workflow: +```bash +make build # build images locally +make swarm-push ENV_FILE=.env.xxx # tag + push to registry +make start-swarm ENV_FILE=.env.xxx # docker stack deploy (nodes pull from registry) +``` + +For Azure PROD, replace `localhost:5000` with Azure Container Registry (ACR). + +--- + +## 3. Worker scaling and capacity + +There is **no hard cap** on worker replicas. Scaling is manual: + +```bash +make scale-workers WORKERS=4 +``` + +**Capacity math per worker:** +- ~1GB RAM for ClamAV virus signature database +- ~2-3GB RAM for Celery tasks (depends on `CELERY_CONCURRENCY`) +- Total: **~3-4GB RAM per worker** +- Each worker runs `CELERY_CONCURRENCY` parallel tasks (default: 4 in .env.hetzner, 6 in .env) + +| Environment | Workers | Concurrency | Parallel tasks | RAM needed (workers only) | +|---|---|---|---|---| +| Hetzner (8GB) | 2 | 4 | 8 | ~6-8GB | +| DEV | 2 | 4 | 8 | ~6-8GB | +| PROD | 4+ | 6 | 24+ | ~12-16GB | + +To prevent overloading a single node, use `max_replicas_per_node` in the compose file: +```yaml +deploy: + replicas: 4 + placement: + max_replicas_per_node: 2 +``` +This forces Swarm to spread workers across at least 2 nodes. Not currently set — all replicas can land on one node if Swarm decides to. + +**Resource limits** are optional but recommended in production. Apply post-deploy: +```bash +make set-worker-limits CPU=2 MEM=2G # limits only +make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations +``` + +Per-environment suggestions: +| Environment | CPU limit | Memory limit | Notes | +|---|---|---|---| +| Hetzner (8GB) | 2 | 2G | Small server, max ~2 workers | +| DEV | 1 | 1G | | +| PROD | 4 | 4G | Includes ClamAV ~1GB | + +--- + +## 4. `.env` strategy + +`.env` is committed with safe defaults (localhost, no secrets). 
Environment-specific files are gitignored via `.env.*`: + +| File | Purpose | Committed? | +|---|---|---| +| `.env` | Shared defaults for local dev / forking | Yes | +| `.env.hetzner` | Hetzner dev server (IPs, NFS, registry) | No | +| `.env.DEV` | DEV environment (docker compose, used by CI/CD) | No | +| `.env.DEV_SWARM` | DEV Swarm deployment (external Azure DB, NFS) | No | +| `.env.PROD` | Production (real secrets, domains) | No | + +Deploy with: +```bash +make start-swarm ENV_FILE=.env.hetzner # Hetzner (with DB container) +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM # DEV (external Azure DB) +``` + +The Makefile uses `envsubst` to substitute **only compose-level vars** (REGISTRY, NFS_SERVER_IP, CERTBOT_DOMAIN, etc.) from the env file into the YAML, then pipes the result to `docker stack deploy`. Container env vars are loaded by `docker stack deploy` via the `env_file:` directive directly. + +**Why only compose-level vars?** Earlier approaches that sourced the entire env file broke on values with special characters (`#`, `(`, spaces). The current approach extracts only the vars that `envsubst` needs (REGISTRY, CERTBOT_DOMAIN, CERTBOT_EMAIL, NFS_SERVER_IP, etc.) using `grep` + `cut` in the Makefile. + +**Env file format rules (Swarm env files only — `.env.hetzner`, `.env.DEV_SWARM`, etc.):** +- No spaces around `=` — the Makefile uses `grep '^VAR=' | cut -d= -f2-` +- No quotes around values — Docker passes them literally +- No angle bracket placeholders like `` — they get passed as literal strings + +This avoids three problems with earlier approaches: +1. **Type conversion bugs** — `docker compose config` converted ports to strings and cpus to integers, which `docker stack deploy` rejected +2. **`.env` auto-load conflict** — `docker compose config` always loads `.env` from the project directory, silently overriding values from `--env-file` +3. **Special character breakage** — sourcing the whole env file with `set -a && . 
./file` breaks on values containing `#` (comment), `(` (subshell), or unquoted spaces + +--- + +## 5. Local dev and server deploy are now different configs + +You maintain two separate compose files: +- `docker-compose.yml` — local development (single machine, local volumes, `container_name`) +- `docker-compose.swarm.yml` — Swarm deployment (overlay network, NFS volumes, `deploy:` section) +- `docker-compose.swarm.nodb.yml` — Swarm with external DB (no containerized Postgres) + +Risk: they drift apart over time (different env vars, image versions, volume configs). Mitigation: keep changes in sync during PRs. + +--- + +## 6. No `container_name` / `depends_on` in Swarm + +Swarm manages container naming internally (e.g. `validate_worker.1.abc123`). `depends_on` is ignored — services start simultaneously. + +Current impact: minimal — entrypoints use DNS service discovery (`redis`, `db`, `backend`) and `pg_isready` wait loops. No code changes needed. + +--- + +## 7. DNS transition strategy for PROD cutover + +To avoid downtime when switching from Docker Compose to Swarm in production, use a temporary subdomain: + +1. Deploy Swarm stack on a new server (or same server on different ports) +2. Point a temp subdomain to it (e.g. `swarm.validate.buildingsmart.org`) +3. Run both setups in parallel — existing Compose on the main domain, Swarm on the temp domain +4. Test via API (bulk uploads, concurrent validations) against the temp domain +5. Once confident, swap DNS: point the main domain to the Swarm deployment +6. Decommission the old Compose setup + +Rollback: if Swarm has issues, DNS points back to the old setup in minutes. + +For DEV: same approach, or direct cutover (lower risk since it's not user-facing). + +--- + +# Known Issues & Gotchas + +## 8. Overlay network MTU must be set to 1400 + +MTU (Maximum Transmission Unit) is the largest packet size a network link can carry — the default is 1500 bytes. Hetzner's private network uses MTU 1450. 
Docker's VXLAN overlay adds ~50 bytes of encapsulation headers to every packet, so if the underlying MTU is already ≤1500, the oversized packets get silently dropped or fragmented. Without setting the overlay MTU to 1400 (leaving headroom for the VXLAN overhead), worker nodes on different machines **cannot reach services on the manager** (DB, Redis). + +Symptom: workers stuck on `db:5432 - no response` despite DNS resolving correctly. + +Fix is in `docker-compose.swarm.yml`: +```yaml +networks: + validate: + driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" +``` + +This applies to any cloud provider with sub-1500 MTU on internal networks. + +--- + +## 9. ClamAV runs inside every worker (~1GB RAM overhead each) + +Each worker container starts its own ClamAV daemon + freshclam (virus signature updater). This is the **same as before** — not a Swarm change. But when scaling to N workers, you get N independent ClamAV instances. + +Impact: +- ~1GB RAM per worker for virus signature database (observed during Hetzner testing — 5 instances caused OOM on 8GB server) +- Each worker independently downloads signature updates on boot +- The 4GB memory limit per worker (PROD) accounts for this: ~1GB ClamAV + ~2-3GB for Celery tasks +- The local override (`docker-compose.swarm.local.yml`) skips ClamAV entirely for testing on small servers + +4 workers with ClamAV = ~4GB just for virus DBs. + +--- + +## 10. Registry must use private IP, not localhost + +**Always set `REGISTRY=:5000`** (e.g. `10.0.0.5:5000`) in the env file, never `localhost:5000`. + +Why: `localhost` resolves to the local machine. On the manager, that works. On worker nodes, `localhost:5000` points to nothing — workers can't pull images and stay at 0/N replicas with `No such image` errors. 
+ +**Every node** (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`: + +```bash +echo '{ "insecure-registries": ["10.0.0.5:5000"] }' | sudo tee /etc/docker/daemon.json +sudo systemctl restart docker +``` + +The `make add-worker` target handles this automatically for workers. For the **manager**, add it manually once during initial setup (merge with any existing `daemon.json` settings like log-driver). + +--- + +## 11. DB `postmaster.pid` disappears in Swarm (containerized DB only) + +PostgreSQL starts, recovers, becomes ready — then shuts itself down because its PID file vanished: + +``` +could not open file "postmaster.pid": No such file or directory +performing immediate shutdown because data directory lock file is invalid +``` + +This is a Docker Swarm volume mount timing issue. Fix: set `restart_policy.condition: any` (not `on-failure`) on the db service so Swarm keeps restarting it until it sticks. Already applied in `docker-compose.swarm.yml`. + +--- + +## 12. Docker caches NFS volume options + +When `docker stack deploy` creates an NFS volume, the driver options (including `addr=`) are cached. If the first deploy has the wrong NFS IP (e.g. the default `10.0.0.1`), **all subsequent deploys reuse that wrong IP** — even after fixing the env file. + +Symptoms: containers stuck in "Created" state, never starting. No logs. NFS mount hangs because the IP doesn't exist. + +Fix: +```bash +docker stack rm validate +sleep 15 +docker container prune -f +docker volume rm validate_files_data validate_gherkin_rules_log_data +# If containers are stuck on hanging NFS mount: +systemctl restart docker +# Then redeploy +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM +``` + +Verify volume has correct IP after deploy: `docker volume inspect validate_files_data` + +--- + +## 13. 
File upload: `f.seek(0)` after measuring size + +In `views.py`, the upload handler seeks to the end of the file to measure its size (`f.seek(0, 2)` + `f.tell()`), then must rewind (`f.seek(0)`) before `serializer.save()`. Without the rewind, Django saves a 0-byte file because the file pointer is at the end. + +This may only manifest with NFS-backed storage where buffering behaviour differs from local volumes. Commit: `012776c` + +--- + +## 14. `determine_aggregate_status()` masks silent failures + +When a validation task produces zero outcomes (e.g. subprocess crashed, worker OOM, NFS hang), the status defaults to VALID (`models.py:1297` — `# assume valid if no outcomes - TODO: is this correct?`). This pre-dates Swarm but becomes more visible when workers crash/restart across nodes. + +**Why we can't just return INVALID:** Marking a file as invalid has real consequences — vendors have to investigate and fix it. Returning INVALID for a crashed task would create false negatives. The actual problem is **silent failure** — a task fails completely and nobody notices because it looks like it passed. + +**What should happen instead:** When zero outcomes are produced, the system should alert developers (e.g. log an error, send a notification, or set a distinct status like `ERROR` or `INCONCLUSIVE`) rather than silently defaulting to VALID. The file should be flagged for re-validation, not marked as valid or invalid. + +Not blocking for Swarm, but worth a follow-up fix. + +--- + +## 15. DB connection pooling: stale connections on overlay network + +Django's `"pool": True` (psycopg3 connection pool) keeps DB connections open for reuse. The Swarm overlay network drops idle TCP connections after ~13 minutes. When the pool hands out a dead connection, Django raises: + +``` +OperationalError: consuming input failed: server closed the connection unexpectedly +``` + +**Fix** (in `backend/core/settings.py`): +- `"pool": False` — disable psycopg3's built-in connection pool. 
`CONN_HEALTH_CHECKS` alone is not sufficient because the pool can hand out a stale connection after the health check passes but before it reaches the query. +- `CONN_HEALTH_CHECKS = True` — Django pings the connection before using it; if dead, it reconnects transparently +- `CONN_MAX_AGE = 600` (10 min) — keeps connections open for reuse without the pool layer + +`CONN_MAX_AGE` is configurable via `POSTGRES_CONN_MAX_AGE` env var. The default of 600s works for Swarm; set to `0` to close connections after each request (safest but slower). + +DB logs showing the symptom (every ~13 min): +``` +LOG: could not receive data from client: Connection reset by peer +``` + +--- + +## 16. SSL certs: bind mount vs named volume + +Docker Compose used a bind mount for Let's Encrypt certs: `./docker/frontend/letsencrypt:/etc/letsencrypt`. Swarm uses a named volume (`validate_letsencrypt_data`). + +When migrating, certs must be manually copied into the Swarm volume: +```bash +cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/ +docker service update --force validate_frontend +``` + +Without this, HTTPS won't work and the site is only accessible via HTTP. Certbot renewal should continue to work inside the container since `CERTBOT_DOMAIN` is set. + +--- + +## 17. Overlay network race condition after stack rm + +After `docker stack rm`, the overlay network cleanup is asynchronous. Redeploying too quickly causes `network validate_validate not found` errors. + +Fix: wait ~15 seconds between `docker stack rm` and `docker stack deploy`. If a ghost network persists (`docker network ls` shows it but `docker network rm` says "not found"), restart Docker: `systemctl restart docker`. + +--- + +## 18. No rolling updates for `latest` tags + +Swarm checks if the image tag has changed before pulling. Since all images use `:latest`, Swarm sees "same tag" and skips the pull — even if the image content has changed. 
+ +**Impact:** `docker service update --force` restarts containers but uses the **cached** image. To deploy new code, you must tear down and redeploy: + +```bash +make stop-swarm +make swarm-push ENV_FILE=.env.xxx +make start-swarm ENV_FILE=.env.xxx +``` + +Or force a pull for a single service: +```bash +docker service update --image localhost:5000/validationsvc-backend:latest --force validate_backend +``` + +--- + +## 19. `docker service update --force` does NOT re-read env vars + +`docker service update --force` restarts containers with the **same config** they were deployed with. It does NOT re-read the env file. If you changed `.env.DEV_SWARM` and want the changes to take effect, you must do a full redeploy: + +```bash +make stop-swarm +# wait ~15 seconds +make start-swarm ENV_FILE=.env.DEV_SWARM +``` + +--- + +## 20. VS Code port forwarding conflicts with Swarm ingress + +VS Code's SSH tunnel sometimes conflicts with Swarm's ingress routing (IPv6 issues). Accessing `localhost:80` via VS Code's forwarded port may not work. + +**Workaround:** Use the server's public IP directly instead of localhost. + +--- + +# Maintenance + +## 21. CI/CD not yet adapted for Swarm + +The current GitHub Actions workflow (`.github/workflows/ci_cd.yml`) uses `docker compose up` for DEV and PROD deployments. It does **not** support Swarm. + +What needs to change for Swarm CI/CD: +- `docker compose up` → `make start-swarm ENV_FILE=.env.XXX` (build, push to registry, stack deploy) +- The runner/deploy target needs access to the Swarm manager (SSH or self-hosted runner on the manager node) +- Worker nodes pull images from the registry automatically — no action needed per node +- `ENV_FILE` is already a GitHub Actions variable (`${{ vars.ENV_FILE }}`) — just needs to point to the right file + +Options: +1. **Self-hosted runner on the manager node** — simplest, runner has direct access to Docker and the registry +2. 
**SSH deploy step** — GitHub-hosted runner SSHes into the manager to run make commands +3. **Separate workflow** — new workflow file for Swarm deployments, triggered manually or on specific branches + +Not blocking for merge to development — Swarm can be deployed manually until CI/CD is adapted. + +--- + +## 22. Periodic cleanup on DEV server + +> **DEV-specific** — the DEV server has a small root disk (29GB). Hetzner/PROD with larger disks are less affected but should still clean up periodically. + +Docker images, build cache, orphaned volumes, and uploaded IFC files accumulate fast. Without periodic cleanup, the disk fills up and deployments fail. + +**What accumulates:** +- Docker build cache (~2GB per full build cycle) +- Old/unused images (previous deployments) +- Orphaned volumes from CI/CD runs (e.g. `repo-clone_*` volumes from GitHub Actions) +- Uploaded IFC files in `files_data` volume (4GB+ and growing) + +**Cleanup commands:** +```bash +# Check disk usage +df -h / + +# Docker overview +docker system df + +# Remove unused images and build cache +docker builder prune -af +docker image prune -af + +# Remove orphaned volumes (CAREFUL: only removes volumes not attached to any container) +docker volume prune -f + +# List volume sizes to find large orphans +docker system df -v | grep -A 50 "Local Volumes" +``` + +**Recommendation:** Run `docker system prune -af` and `docker volume prune -f` after each major deployment cycle. Consider adding this to the CI/CD pipeline or a cron job. The `/mnt` disk (74GB ephemeral Azure temp disk) can be used for temporary storage but **data is lost on VM deallocation/resize**. + +--- + +## 23. `makemigrations` runs on every backend startup + +The `server-entrypoint.sh` runs `python manage.py makemigrations` and `python manage.py migrate` on every container start. 
This works because: +- Backend is constrained to **1 replica** on the manager node — no migration race conditions +- The generated migration files live inside the container (ephemeral) — they're not persisted + +**Risk:** If model changes exist that haven't been committed as migration files, `makemigrations` will generate them at runtime inside the container. These migrations disappear when the container restarts, potentially causing inconsistency. In production, migrations should be baked into the image at build time. + +**Decision:** Kept as-is for now. Backend is always 1 replica, and in practice all migrations are committed to git before deployment. But worth revisiting for PROD hardening. + +--- + +## 24. Historical Swarm instability + +> "unexplained crashes/corrupt state (5+ years ago) — hopefully they are gone now" + +Modern Docker Engine (24+) should be stable. Mitigations already in place: +- `CELERY_TASK_ACKS_LATE = True` — tasks stay in queue until completed +- `CELERY_TASK_REJECT_ON_WORKER_LOST = True` — crashed tasks are re-queued +- `restart_policy: condition: any` on DB (see section 11), `on-failure` on other services +- `update_config: failure_action: rollback` — bad deploys roll back + +--- + +# Local Dev Only + +## 25. Lima-specific: virtiofs + Celery prefork = errno 35 + +Celery's `prefork` pool + Lima's virtiofs read-only mounts cause `EDEADLK` deadlocks. Workaround: `--pool=solo`. + +**Not a production issue** — only affects local development on macOS with Lima. Docker containers on Linux use proper ext4/overlay2 filesystems. + +--- + +## 26. macOS NFS gotcha: `/tmp` vs `/private/tmp` + +On macOS, `/tmp` is a symlink to `/private/tmp`. NFS exports must use the real path (`/private/tmp/...`). Not relevant for Linux servers (Hetzner/Azure), but relevant for local development on macOS. 
diff --git a/docs/swarm-deploy-guide.md b/docs/swarm-deploy-guide.md
new file mode 100644
index 00000000..f484bcf6
--- /dev/null
+++ b/docs/swarm-deploy-guide.md
@@ -0,0 +1,183 @@
+# Swarm Deploy Guide
+
+Copy-paste commands for deploying and operating the Validation Service on Docker Swarm.
+
+For architecture decisions, known issues, env file strategy, and gotchas, see [swarm-considerations.md](swarm-considerations.md).
+
+---
+
+## Deploy
+
+```bash
+# Build, push images to registry, and deploy
+make swarm-push ENV_FILE=<env-file>
+make start-swarm-nodb ENV_FILE=<env-file>        # external DB (Azure DEV/PROD)
+# or: make start-swarm ENV_FILE=<env-file>       # containerized DB (Hetzner)
+# or: make start-swarm-local ENV_FILE=<env-file> # local testing (no NFS, no ClamAV)
+
+# Verify — all services should reach 1/1 within ~60s
+watch docker service ls
+```
+
+## Redeploy (after code changes)
+
+No rolling updates with `latest` tags — must tear down and redeploy.
+
+```bash
+make stop-swarm
+# Wait ~15s for network cleanup
+make swarm-push ENV_FILE=<env-file>
+make start-swarm-nodb ENV_FILE=<env-file>
+watch docker service ls
+```
+
+To force-restart a single service (same image, same env):
+```bash
+docker service update --force validate_backend
+```
+
+## Add / Remove Worker Nodes
+
+### Prerequisites
+
+1. Worker VM must be in the same VNet/subnet as the manager
+2. Manager's SSH key must be on the worker (`~/.ssh/authorized_keys`). On Azure, use Portal > "Reset password > Add SSH public key"
+3. Register the worker in the env file:
+   ```
+   SWARM_WORKER_1=dev-vm-worker-1:10.0.0.4
+   ```
+
+### Add
+
+```bash
+# Installs Docker, configures registry, joins Swarm — all in one command
+make add-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM
+```
+
+### Remove
+
+```bash
+# Drains tasks, leaves Swarm, removes node
+make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM
+
+# Then: remove SWARM_WORKER_N line from env file, delete VM if temporary
+```
+
+## Scale Workers
+
+```bash
+# Scale to N worker containers (distributed across nodes)
+make scale-workers WORKERS=4
+
+# Check which node each worker runs on
+docker service ps validate_worker
+
+# Set resource limits per container
+make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G
+```
+
+**Terminology:** A worker _node_ is a VM. Each node runs worker _replicas_ (containers). Each replica runs multiple Celery _processes_ (set by `CELERY_CONCURRENCY`, default 4).
+
+## Monitoring
+
+```bash
+make swarm-status                        # service overview + worker placement
+docker service logs -f validate_worker   # follow logs (also: backend, frontend, scheduler)
+docker stats --no-stream                 # CPU/memory per container
+docker node ls                           # node health
+journalctl -k | grep "out of memory"     # check for OOM kills
+```
+
+## Stop / Start

+```bash
+make stop-swarm                            # removes stack, keeps volumes and Swarm membership
+make start-swarm-nodb ENV_FILE=<env-file>  # redeploy — volumes are still there
+```
+
+## Full Reset
+
+Removes everything — stack, volumes, images, Swarm. Start fresh from first-time setup.
+
+```bash
+make stop-swarm
+docker rm -f registry
+docker volume prune -f   # WARNING: deletes DB data and uploaded files
+docker system prune -af
+docker swarm leave --force
+```
+
+---
+
+## First-Time Setup (Manager Node)
+
+One-time setup for a new manager. Once done, use the commands above for daily operations.
+
+```bash
+# 1. Init Swarm
+docker swarm init --advertise-addr <manager-private-ip>
+
+# 2. Start local registry
+docker run -d --name registry -p 5000:5000 --restart always registry:2
+
+# 3. Configure insecure registry (required for multi-node)
+# Add "insecure-registries": ["<manager-private-ip>:5000"] to /etc/docker/daemon.json
+# Then: sudo systemctl restart docker
+
+# 4. Set up NFS
+apt install -y nfs-kernel-server
+mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs
+chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs
+chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs
+
+cat >> /etc/exports << 'EOF'
+/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash)
+/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash)
+EOF
+
+exportfs -ra && systemctl restart nfs-kernel-server
+
+# 5. Create .VERSION
+echo "1.0.0" > .VERSION
+
+# 6. Prepare env file — see swarm-considerations.md for env file strategy
+cp .env .env.myserver  # customize: PUBLIC_URL, DJANGO_ALLOWED_HOSTS, NFS_SERVER_IP, REGISTRY, etc.
+
+# 7. Fetch submodules, build, deploy
+make fetch-modules
+make swarm-push ENV_FILE=<env-file>
+make start-swarm-nodb ENV_FILE=<env-file>
+```
+
+### Migrating from Docker Compose
+
+```bash
+# Stop old stack
+docker compose -f docker-compose.load_balanced.nodb.yml --env-file .env.DEV down
+
+# Copy data from compose volumes to NFS (volume names differ: validation-service_* vs validate_*)
+docker run --rm -v validation-service_files_data:/src -v /srv/nfs/files_data:/dst alpine sh -c "cp -a /src/. /dst/"
+docker run --rm -v validation-service_gherkin_rules_log_data:/src -v /srv/nfs/gherkin_logs:/dst alpine sh -c "cp -a /src/. /dst/"
+
+# Copy SSL certs (after first deploy)
+cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/
+docker service update --force validate_frontend
+```
+
+---
+
+## Quick Reference
+
+| Task | Command |
+|---|---|
+| Deploy (external DB) | `make start-swarm-nodb ENV_FILE=<env-file>` |
+| Deploy (with DB) | `make start-swarm ENV_FILE=<env-file>` |
| Stop | `make stop-swarm` |
+| Build + push | `make swarm-push ENV_FILE=<env-file>` |
+| Scale workers | `make scale-workers WORKERS=4` |
+| Set limits | `make set-worker-limits CPU=2 MEM=2G` |
+| Add worker | `make add-worker NAME=<node-name> ENV_FILE=<env-file>` |
+| Remove worker | `make remove-worker NAME=<node-name> ENV_FILE=<env-file>` |
+| Status | `make swarm-status` |
+| Logs | `docker service logs -f validate_<service>` |
+| Force-restart | `docker service update --force validate_<service>` |