From 0d219f8e5c5e43c69c07d02c6891c226839f31f4 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sun, 8 Mar 2026 21:58:33 +0100 Subject: [PATCH 01/12] initial work --- .gitignore | 17 +++ .vscode/launch.json | 15 ++ Makefile | 25 ++++ docker-compose.swarm.yml | 172 +++++++++++++++++++++++ docker/backend/worker-beat-entrypoint.sh | 3 - docker/backend/worker-entrypoint.sh | 3 - 6 files changed, 229 insertions(+), 6 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 docker-compose.swarm.yml diff --git a/.gitignore b/.gitignore index 452309f2..da00de86 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,9 @@ **/__pycache__/ **/.pytest_cache/ **/node_modules +**/build **/.dev +**/.debug **/letsencrypt .DS_Store .env.* @@ -12,6 +14,21 @@ tmp* django_static local +### Local/personal files ### +CLAUDE.md +.analyses/ +plans/ +.python-version +backend/Makefile.local +backend/Makefile_v2 +backend/core/settings_backup_*.py +docker/backend/*_backup.sh +redis-proxy.py +pyproject.toml +uv.lock +backend/pyproject.toml +backend/package-lock.json + ### VisualStudioCode ### .vscode/* !.vscode/settings.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..15416932 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Debug WR31 rule_executor", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/backend/.debug/debug_wr31.py", + "python": "${workspaceFolder}/backend/.dev/venv/bin/python", + "cwd": "${workspaceFolder}/backend", + "console": "integratedTerminal", + "justMyCode": false + } + ] +} diff --git a/Makefile b/Makefile index 0163120f..a15e04b2 100644 --- a/Makefile +++ b/Makefile @@ -26,6 +26,31 @@ start-infra-only: stop: docker compose down +# --- Docker Swarm --- + +REGISTRY ?= localhost:5000 +WORKERS ?= 2 + +start-swarm: + docker stack deploy -c docker-compose.swarm.yml --with-registry-auth validate + +stop-swarm: + docker stack 
rm validate + +scale-workers: + docker service scale validate_worker=$(WORKERS) + +swarm-push: build + docker tag buildingsmart/validationsvc-backend $(REGISTRY)/validationsvc-backend + docker tag buildingsmart/validationsvc-frontend $(REGISTRY)/validationsvc-frontend + docker push $(REGISTRY)/validationsvc-backend + docker push $(REGISTRY)/validationsvc-frontend + +swarm-status: + @docker service ls + @echo "---" + @docker service ps validate_worker + build: docker compose build \ --build-arg GIT_COMMIT_HASH="$$(git rev-parse --short HEAD)" \ diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml new file mode 100644 index 00000000..8975a4f8 --- /dev/null +++ b/docker-compose.swarm.yml @@ -0,0 +1,172 @@ +# Docker Swarm deployment configuration +# +# Usage: +# 1. Build and push images: make swarm-push +# 2. Deploy: make start-swarm +# 3. Scale workers: make scale-workers WORKERS=5 +# 4. Status: make swarm-status +# 5. Tear down: make stop-swarm +# +# Prerequisites: +# - docker swarm init +# - Local registry: docker service create --name registry --publish 5000:5000 registry:2 +# - NFS server configured (see PR description) +# +# NFS: Set NFS_SERVER_IP in .env (default: 10.0.0.1). +# For local testing without NFS, override volumes with plain named volumes. 
+ +services: + + frontend: + image: ${REGISTRY:-localhost:5000}/validationsvc-frontend + ports: + - 80:80 + - 443:443 + environment: + CERTBOT_DOMAIN: ${CERTBOT_DOMAIN} + CERTBOT_EMAIL: ${CERTBOT_EMAIL} + volumes: + - letsencrypt_data:/etc/letsencrypt + - static_data:/app/backend/django_static + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + backend: + image: ${REGISTRY:-localhost:5000}/validationsvc-backend + entrypoint: /app/backend/server-entrypoint.sh + env_file: .env + volumes: + - static_data:/app/backend/django_static + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + expose: + - 8000 + networks: + - validate + deploy: + replicas: 2 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + healthcheck: + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/alive')\""] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + + worker: + image: ${REGISTRY:-localhost:5000}/validationsvc-backend + entrypoint: /app/backend/worker-entrypoint.sh + env_file: .env + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 2 + # No placement constraint - workers run on any node + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + resources: + limits: + cpus: "2.0" + memory: 4G + reservations: + cpus: "1.0" + memory: 2G + + scheduler: + image: ${REGISTRY:-localhost:5000}/validationsvc-backend + entrypoint: /app/backend/worker-beat-entrypoint.sh + env_file: .env + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 1 + placement: + 
constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + redis: + image: redis:8.4-alpine + command: redis-server --protected-mode no --bind 0.0.0.0 + expose: + - 6379 + volumes: + - redis_data:/data + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + db: + image: postgres:16.10-alpine + environment: + POSTGRES_DB: ${POSTGRES_NAME} + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + expose: + - 5432 + volumes: + - postgres_data:/var/lib/postgresql/data/ + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + +networks: + validate: + driver: overlay + +volumes: + static_data: + letsencrypt_data: + postgres_data: + redis_data: + files_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP:-10.0.0.1},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/files_data" + gherkin_rules_log_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP:-10.0.0.1},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/gherkin_logs" diff --git a/docker/backend/worker-beat-entrypoint.sh b/docker/backend/worker-beat-entrypoint.sh index 23c38f6e..1eeb5afa 100644 --- a/docker/backend/worker-beat-entrypoint.sh +++ b/docker/backend/worker-beat-entrypoint.sh @@ -23,9 +23,6 @@ freshclam service clamav-freshclam start service clamav-daemon start -python manage.py makemigrations -python manage.py migrate - CELERY_CONCURRENCY=${CELERY_CONCURRENCY:-6} # default 6 worker processes echo "Celery concurrency: $CELERY_CONCURRENCY" diff --git a/docker/backend/worker-entrypoint.sh b/docker/backend/worker-entrypoint.sh index c08929b8..cbe08319 100644 --- a/docker/backend/worker-entrypoint.sh +++ b/docker/backend/worker-entrypoint.sh @@ -23,9 +23,6 @@ freshclam service 
clamav-freshclam start service clamav-daemon start -python manage.py makemigrations -python manage.py migrate - CELERY_CONCURRENCY=${CELERY_CONCURRENCY:-6} # default 6 worker processes echo "Celery concurrency: $CELERY_CONCURRENCY" From 012776c5a61e765cb3d9ce7e3f5143f1e65f094e Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Tue, 10 Mar 2026 17:14:33 +0000 Subject: [PATCH 02/12] docker swarm deployment support - file upload seek position --- .vscode/launch.json | 15 ----- Makefile | 3 + backend/apps/ifc_validation/api/v1/views.py | 1 + docker-compose.swarm.local.yml | 67 +++++++++++++++++++++ docker-compose.swarm.yml | 10 +-- 5 files changed, 76 insertions(+), 20 deletions(-) delete mode 100644 .vscode/launch.json create mode 100644 docker-compose.swarm.local.yml diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index 15416932..00000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "version": "0.2.0", - "configurations": [ - { - "name": "Debug WR31 rule_executor", - "type": "debugpy", - "request": "launch", - "program": "${workspaceFolder}/backend/.debug/debug_wr31.py", - "python": "${workspaceFolder}/backend/.dev/venv/bin/python", - "cwd": "${workspaceFolder}/backend", - "console": "integratedTerminal", - "justMyCode": false - } - ] -} diff --git a/Makefile b/Makefile index a15e04b2..608e1f25 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,9 @@ WORKERS ?= 2 start-swarm: docker stack deploy -c docker-compose.swarm.yml --with-registry-auth validate +start-swarm-local: + docker stack deploy -c docker-compose.swarm.yml -c docker-compose.swarm.local.yml --with-registry-auth validate + stop-swarm: docker stack rm validate diff --git a/backend/apps/ifc_validation/api/v1/views.py b/backend/apps/ifc_validation/api/v1/views.py index ae7bb5e1..838f09e0 100644 --- a/backend/apps/ifc_validation/api/v1/views.py +++ b/backend/apps/ifc_validation/api/v1/views.py @@ -195,6 +195,7 @@ def post(self, request, *args, **kwargs): #file = 
os.path.join(MEDIA_ROOT, uploaded_file['file_name']) #uploaded_file['size'] = os.path.getsize(file) uploaded_file['size'] = file_length + f.seek(0) instance = serializer.save() # submit task for background execution diff --git a/docker-compose.swarm.local.yml b/docker-compose.swarm.local.yml new file mode 100644 index 00000000..c05b5c9c --- /dev/null +++ b/docker-compose.swarm.local.yml @@ -0,0 +1,67 @@ +# Override: single-node local testing (no NFS, no ClamAV, reduced replicas) +# +# Usage: +# docker stack deploy -c docker-compose.swarm.yml -c docker-compose.swarm.local.yml validate +# +# For production/NFS testing, use docker-compose.swarm.yml directly. + +services: + + frontend: + environment: + CERTBOT_DOMAIN: _ + CERTBOT_EMAIL: x + + backend: + deploy: + replicas: 1 + + worker: + entrypoint: /bin/sh + command: + - -c + - | + set -e + until cd /files_storage; do echo "Waiting for files_storage..."; done + until cd /app/backend; do echo "Waiting for server volume..."; done + while ! pg_isready -h "$$POSTGRES_HOST" -p "$$POSTGRES_PORT" -d "$$POSTGRES_NAME" -U "$$POSTGRES_USER" 2>/dev/null; do + echo "Waiting for DB..." + sleep 5 + done + echo "DB is ready. Starting worker (no ClamAV)." + rm -f /usr/bin/clamdscan /usr/bin/clamscan 2>/dev/null || true + CELERY_CONCURRENCY=$${CELERY_CONCURRENCY:-4} + echo "Celery concurrency: $$CELERY_CONCURRENCY" + celery --app=core worker --loglevel=info --concurrency $$CELERY_CONCURRENCY --task-events --hostname=worker@%n + deploy: + replicas: 1 + resources: + limits: + cpus: "2.0" + memory: 2G + reservations: + cpus: "0.5" + memory: 512M + + scheduler: + entrypoint: /bin/sh + command: + - -c + - | + set -e + until cd /files_storage; do echo "Waiting for files_storage..."; done + until cd /app/backend; do echo "Waiting for server volume..."; done + while ! pg_isready -h "$$POSTGRES_HOST" -p "$$POSTGRES_PORT" -d "$$POSTGRES_NAME" -U "$$POSTGRES_USER" 2>/dev/null; do + echo "Waiting for DB..." + sleep 5 + done + echo "DB is ready. 
Starting scheduler (no ClamAV)." + rm -f /usr/bin/clamdscan /usr/bin/clamscan 2>/dev/null || true + CELERY_CONCURRENCY=$${CELERY_CONCURRENCY:-4} + celery --app=core worker --beat --loglevel=info --concurrency $$CELERY_CONCURRENCY --task-events --hostname=worker-beat@%n + +volumes: + files_data: + driver: local + gherkin_rules_log_data: + driver: local diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml index 8975a4f8..d21ae5fc 100644 --- a/docker-compose.swarm.yml +++ b/docker-compose.swarm.yml @@ -62,7 +62,7 @@ services: delay: 30s failure_action: rollback healthcheck: - test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/alive')\""] + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/')\""] interval: 30s timeout: 10s retries: 3 @@ -89,11 +89,11 @@ services: failure_action: rollback resources: limits: - cpus: "2.0" - memory: 4G + cpus: "${WORKER_CPU_LIMIT:-2.0}" + memory: ${WORKER_MEMORY_LIMIT:-2G} reservations: - cpus: "1.0" - memory: 2G + cpus: "${WORKER_CPU_RESERVATION:-1.0}" + memory: ${WORKER_MEMORY_RESERVATION:-1G} scheduler: image: ${REGISTRY:-localhost:5000}/validationsvc-backend From 61a93444cb08a1a9fa2b0fd4f465a5add0f1acbc Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Thu, 12 Mar 2026 11:56:30 +0000 Subject: [PATCH 03/12] whitelisting admin, submodule sync --- backend/apps/ifc_validation/admin.py | 4 ++-- backend/apps/ifc_validation_models | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/apps/ifc_validation/admin.py b/backend/apps/ifc_validation/admin.py index d52ff317..5c226756 100644 --- a/backend/apps/ifc_validation/admin.py +++ b/backend/apps/ifc_validation/admin.py @@ -439,8 +439,8 @@ class ValidationOutcomeAdmin(BaseAdmin, NonAdminAddable): readonly_fields = ["id", "public_id", "created", "updated"] date_hierarchy = "created" - list_filter = ['validation_task__type', 'severity', 
'validation_task__request__model', 'outcome_code', 'feature', ('created', AdvancedDateFilter)] - search_fields = ('validation_task__request__file_name', 'feature', 'feature_version', 'outcome_code', 'severity', 'expected', 'observed') + list_filter = ['validation_task__type', 'severity_in_db', 'validation_task__request__model', 'outcome_code', 'feature', ('created', AdvancedDateFilter)] + search_fields = ('validation_task__request__file_name', 'feature', 'feature_version', 'outcome_code', 'severity_in_db', 'expected', 'observed') paginator = utils.LargeTablePaginator show_full_result_count = False # do not use COUNT(*) twice diff --git a/backend/apps/ifc_validation_models b/backend/apps/ifc_validation_models index 16089c2e..774c7bb8 160000 --- a/backend/apps/ifc_validation_models +++ b/backend/apps/ifc_validation_models @@ -1 +1 @@ -Subproject commit 16089c2ec9c95454604d20ebc024239f3c71cd80 +Subproject commit 774c7bb8dff8be799bd41d648d28c5a4a2789deb From 4eb8f243c83a8f9364ea17dbf1755fb5e33405f9 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sat, 14 Mar 2026 00:06:09 +0000 Subject: [PATCH 04/12] submodule --- backend/apps/ifc_validation_models | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/apps/ifc_validation_models b/backend/apps/ifc_validation_models index 774c7bb8..469d9a48 160000 --- a/backend/apps/ifc_validation_models +++ b/backend/apps/ifc_validation_models @@ -1 +1 @@ -Subproject commit 774c7bb8dff8be799bd41d648d28c5a4a2789deb +Subproject commit 469d9a488856e5cd141a47ffdc37d3fcbc6637e0 From 54632ab11874de8f721543cb6177e949a07f0b13 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sun, 15 Mar 2026 11:52:20 +0000 Subject: [PATCH 05/12] swarm env environments --- Makefile | 5 +++-- docker-compose.swarm.yml | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 608e1f25..8bfc8d07 100644 --- a/Makefile +++ b/Makefile @@ -30,12 +30,13 @@ stop: REGISTRY ?= localhost:5000 WORKERS ?= 2 +ENV_FILE ?= 
.env start-swarm: - docker stack deploy -c docker-compose.swarm.yml --with-registry-auth validate + docker compose -f docker-compose.swarm.yml --env-file $(ENV_FILE) config | docker stack deploy -c - --with-registry-auth validate start-swarm-local: - docker stack deploy -c docker-compose.swarm.yml -c docker-compose.swarm.local.yml --with-registry-auth validate + docker compose -f docker-compose.swarm.yml -f docker-compose.swarm.local.yml --env-file $(ENV_FILE) config | docker stack deploy -c - --with-registry-auth validate stop-swarm: docker stack rm validate diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml index d21ae5fc..d3afa804 100644 --- a/docker-compose.swarm.yml +++ b/docker-compose.swarm.yml @@ -146,7 +146,7 @@ services: placement: constraints: [node.role == manager] restart_policy: - condition: on-failure + condition: any delay: 5s networks: @@ -162,11 +162,11 @@ volumes: driver: local driver_opts: type: nfs - o: "addr=${NFS_SERVER_IP:-10.0.0.1},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + o: "addr=${NFS_SERVER_IP:-10.0.0.3},nfsvers=4.1,rw,hard,timeo=600,retrans=2" device: ":/srv/nfs/files_data" gherkin_rules_log_data: driver: local driver_opts: type: nfs - o: "addr=${NFS_SERVER_IP:-10.0.0.1},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + o: "addr=${NFS_SERVER_IP:-10.0.0.3},nfsvers=4.1,rw,hard,timeo=600,retrans=2" device: ":/srv/nfs/gherkin_logs" From ab223b2ba441bdfbb82714bf73de0296fc73f3a3 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sun, 15 Mar 2026 12:19:49 +0000 Subject: [PATCH 06/12] swarm general env --- .env | 110 ++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 59 insertions(+), 51 deletions(-) diff --git a/.env b/.env index fb79ac74..982cd621 100644 --- a/.env +++ b/.env @@ -1,51 +1,59 @@ -# variables in Docker Compose -DEBUG = True -ENV = Development -PUBLIC_URL = http://localhost - -# Certbot -CERTBOT_DOMAIN = _ -CERTBOT_EMAIL = - -# Django -MEDIA_ROOT = /files_storage -DJANGO_DB = postgresql 
-DJANGO_SECRET_KEY = django-insecure-um7-^+&jbk_=80*xcc9uf4nh$4koida7)ja&6!vb*$8@n288jk -DJANGO_ALLOWED_HOSTS = localhost -DJANGO_TRUSTED_ORIGINS = http://localhost:3000 http://localhost http://localhost:8000 -DJANGO_LOG_LEVEL = INFO -GHERKIN_LOG_FOLDER = /gherkin_logs -DJANGO_GUNICORN_WORKERS = 3 -DJANGO_GUNICORN_THREADS_PER_WORKER = 4 - -# DB -POSTGRES_HOST = db -POSTGRES_NAME = postgres -POSTGRES_USER = postgres -POSTGRES_PASSWORD = postgres -POSTGRES_PORT = 5432 - -# Worker -REDIS_PORT = 6379 -CELERY_BROKER_URL = redis://redis:6379/0 -CELERY_TASK_SOFT_TIME_LIMIT = 3600 -CELERY_TASK_TIME_LIMIT = 4000 -TASK_TIMEOUT_LIMIT = 3600 -DJANGO_DB_USER_CONTEXT = SYSTEM -DJANGO_DB_BULK_CREATE_BATCH_SIZE = 1000 -CELERY_CONCURRENCY = 4 - -# Email -MAILGUN_API_URL = -MAILGUN_API_KEY = -MAILGUN_FROM_NAME = Validation Service -MAILGUN_FROM_EMAIL = noreply@localhost -ADMIN_EMAIL = noreply@localhost -CONTACT_EMAIL = noreply@localhost - -# IAM -B2C_CLIENT_ID = -B2C_CLIENT_SECRET = -B2C_AUTHORITY = -B2C_USER_FLOW = -USE_WHITELIST = False +# variables in Docker Compose +DEBUG = True +ENV = Development +PUBLIC_URL = http://localhost + +# Certbot +CERTBOT_DOMAIN = _ +CERTBOT_EMAIL = + +# Django +MEDIA_ROOT = /files_storage +DJANGO_DB = postgresql +DJANGO_SECRET_KEY = django-insecure-um7-^+&jbk_=80*xcc9uf4nh$4koida7)ja&6!vb*$8@n288jk +DJANGO_ALLOWED_HOSTS = localhost +DJANGO_TRUSTED_ORIGINS = http://localhost:3000 http://localhost http://localhost:8000 +DJANGO_LOG_LEVEL = INFO +GHERKIN_LOG_FOLDER = /gherkin_logs +DJANGO_GUNICORN_WORKERS = 3 +DJANGO_GUNICORN_THREADS_PER_WORKER = 4 + +# DB +POSTGRES_HOST = db +POSTGRES_NAME = postgres +POSTGRES_USER = postgres +POSTGRES_PASSWORD = postgres +POSTGRES_PORT = 5432 + +# Worker +REDIS_PORT = 6379 +CELERY_BROKER_URL = redis://redis:6379/0 +CELERY_TASK_SOFT_TIME_LIMIT = 3600 +CELERY_TASK_TIME_LIMIT = 4000 +TASK_TIMEOUT_LIMIT = 3600 +DJANGO_DB_USER_CONTEXT = SYSTEM +DJANGO_DB_BULK_CREATE_BATCH_SIZE = 1000 +CELERY_CONCURRENCY = 4 + +# Email 
+MAILGUN_API_URL = +MAILGUN_API_KEY = +MAILGUN_FROM_NAME = Validation Service +MAILGUN_FROM_EMAIL = noreply@localhost +ADMIN_EMAIL = noreply@localhost +CONTACT_EMAIL = noreply@localhost + +# IAM +B2C_CLIENT_ID = +B2C_CLIENT_SECRET = +B2C_AUTHORITY = +B2C_USER_FLOW = +USE_WHITELIST = False + +# Swarm (ignored by docker compose) +# REGISTRY=localhost:5000 +# NFS_SERVER_IP=10.0.0.1 +# WORKER_CPU_LIMIT=2.0 +# WORKER_CPU_RESERVATION=1.0 +# WORKER_MEMORY_LIMIT=2G +# WORKER_MEMORY_RESERVATION=1G From 126ca948eed8d90d6facabd36e460ba93179cb6b Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sun, 15 Mar 2026 14:22:54 +0000 Subject: [PATCH 07/12] replace docker compose config with envsubst, fix overlay MTU --- Makefile | 18 +++++++++++++++--- docker-compose.swarm.yml | 31 ++++++++++++++----------------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index 8bfc8d07..84ddac7d 100644 --- a/Makefile +++ b/Makefile @@ -28,15 +28,20 @@ stop: # --- Docker Swarm --- -REGISTRY ?= localhost:5000 WORKERS ?= 2 ENV_FILE ?= .env +# Reads compose-level vars from ENV_FILE, substitutes into YAML via envsubst. +# Container env vars are loaded by docker stack deploy via the env_file: directive. 
+SWARM_VARS = REGISTRY CERTBOT_DOMAIN CERTBOT_EMAIL NFS_SERVER_IP POSTGRES_NAME POSTGRES_USER POSTGRES_PASSWORD +SWARM_ENV = ENV_FILE=$(ENV_FILE) \ + $(foreach v,$(SWARM_VARS),$(v)=$$(grep '^$(v)=' $(ENV_FILE) | cut -d= -f2-)) + start-swarm: - docker compose -f docker-compose.swarm.yml --env-file $(ENV_FILE) config | docker stack deploy -c - --with-registry-auth validate + $(SWARM_ENV) envsubst < docker-compose.swarm.yml | docker stack deploy -c - --with-registry-auth validate start-swarm-local: - docker compose -f docker-compose.swarm.yml -f docker-compose.swarm.local.yml --env-file $(ENV_FILE) config | docker stack deploy -c - --with-registry-auth validate + $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml | docker stack deploy -c - --with-registry-auth validate stop-swarm: docker stack rm validate @@ -44,6 +49,13 @@ stop-swarm: scale-workers: docker service scale validate_worker=$(WORKERS) +CPU ?= 2 +MEM ?= 2G +set-worker-limits: + docker service update --limit-cpu $(CPU) --limit-memory $(MEM) validate_worker + +REGISTRY ?= $$(grep '^REGISTRY=' $(ENV_FILE) | cut -d= -f2- || echo localhost:5000) + swarm-push: build docker tag buildingsmart/validationsvc-backend $(REGISTRY)/validationsvc-backend docker tag buildingsmart/validationsvc-frontend $(REGISTRY)/validationsvc-frontend diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml index d3afa804..ddaac915 100644 --- a/docker-compose.swarm.yml +++ b/docker-compose.swarm.yml @@ -18,7 +18,7 @@ services: frontend: - image: ${REGISTRY:-localhost:5000}/validationsvc-frontend + image: ${REGISTRY}/validationsvc-frontend ports: - 80:80 - 443:443 @@ -39,9 +39,9 @@ services: delay: 5s backend: - image: ${REGISTRY:-localhost:5000}/validationsvc-backend + image: ${REGISTRY}/validationsvc-backend entrypoint: /app/backend/server-entrypoint.sh - env_file: .env + env_file: ${ENV_FILE} volumes: - static_data:/app/backend/django_static - files_data:/files_storage @@ -51,7 +51,7 @@ services: networks: - validate 
deploy: - replicas: 2 + replicas: 1 placement: constraints: [node.role == manager] restart_policy: @@ -69,9 +69,9 @@ services: start_period: 60s worker: - image: ${REGISTRY:-localhost:5000}/validationsvc-backend + image: ${REGISTRY}/validationsvc-backend entrypoint: /app/backend/worker-entrypoint.sh - env_file: .env + env_file: ${ENV_FILE} volumes: - files_data:/files_storage - gherkin_rules_log_data:/gherkin_logs @@ -87,18 +87,13 @@ services: parallelism: 1 delay: 30s failure_action: rollback - resources: - limits: - cpus: "${WORKER_CPU_LIMIT:-2.0}" - memory: ${WORKER_MEMORY_LIMIT:-2G} - reservations: - cpus: "${WORKER_CPU_RESERVATION:-1.0}" - memory: ${WORKER_MEMORY_RESERVATION:-1G} + # Resource limits applied post-deploy via: + # docker service update --limit-cpu 2 --limit-memory 2G validate_worker scheduler: - image: ${REGISTRY:-localhost:5000}/validationsvc-backend + image: ${REGISTRY}/validationsvc-backend entrypoint: /app/backend/worker-beat-entrypoint.sh - env_file: .env + env_file: ${ENV_FILE} volumes: - files_data:/files_storage - gherkin_rules_log_data:/gherkin_logs @@ -152,6 +147,8 @@ services: networks: validate: driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" volumes: static_data: @@ -162,11 +159,11 @@ volumes: driver: local driver_opts: type: nfs - o: "addr=${NFS_SERVER_IP:-10.0.0.3},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" device: ":/srv/nfs/files_data" gherkin_rules_log_data: driver: local driver_opts: type: nfs - o: "addr=${NFS_SERVER_IP:-10.0.0.3},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" device: ":/srv/nfs/gherkin_logs" From 8f9dbeadcc0983735f149a80767fd4fd4a1c0be4 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Sun, 15 Mar 2026 18:49:19 +0000 Subject: [PATCH 08/12] fix local override merging, cleanup gitignore and duplicate assignment --- Makefile | 4 +++- 
.../ifc_validation/tasks/processing/instance_completion.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 84ddac7d..6baa69ca 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,9 @@ start-swarm: $(SWARM_ENV) envsubst < docker-compose.swarm.yml | docker stack deploy -c - --with-registry-auth validate start-swarm-local: - $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml | docker stack deploy -c - --with-registry-auth validate + $(SWARM_ENV) envsubst < docker-compose.swarm.yml > /tmp/_swarm.yml + $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml > /tmp/_swarm_local.yml + docker stack deploy -c /tmp/_swarm.yml -c /tmp/_swarm_local.yml --with-registry-auth validate stop-swarm: docker stack rm validate diff --git a/backend/apps/ifc_validation/tasks/processing/instance_completion.py b/backend/apps/ifc_validation/tasks/processing/instance_completion.py index 2fb5b3ba..c2940e56 100644 --- a/backend/apps/ifc_validation/tasks/processing/instance_completion.py +++ b/backend/apps/ifc_validation/tasks/processing/instance_completion.py @@ -18,7 +18,7 @@ import itertools import functools - file_path, step_ids = file_path, step_ids = json.load(sys.stdin) + file_path, step_ids = json.load(sys.stdin) ifc_file = ifcopenshell.open(file_path) def filter_serializable(v): def inner(k, v): From edcdf2bffa2ba8d60f95eb73200640168cdab355 Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Mon, 16 Mar 2026 17:13:11 +0000 Subject: [PATCH 09/12] add swarm documentation --- docs/swarm-considerations.md | 487 ++++++++++++++++++++++++++++++++++ docs/swarm-runbook.md | 491 +++++++++++++++++++++++++++++++++++ 2 files changed, 978 insertions(+) create mode 100644 docs/swarm-considerations.md create mode 100644 docs/swarm-runbook.md diff --git a/docs/swarm-considerations.md b/docs/swarm-considerations.md new file mode 100644 index 00000000..40e77bb0 --- /dev/null +++ b/docs/swarm-considerations.md @@ -0,0 +1,487 @@ +# Docker Swarm — Considerations & 
Known Issues + +Compiled during IVS-719 development. Grouped by category. + +## Status + +- **Single-node Swarm**: tested and working (Hetzner, 2026-03-10) +- **Multi-node Swarm**: tested and working with 2 nodes + NFS (Hetzner, 2026-03-15) +- **Single-node Swarm on Azure DEV**: tested and working with external DB + NFS (2026-03-15) +- **CI/CD**: not yet adapted for Swarm — see section 5 +- **SSL/Certbot**: not tested with a real domain yet (using `CERTBOT_DOMAIN=_` to skip) +- **Documentation**: user-facing docs (README, deployment guide) not yet updated for Swarm workflow + +--- + +# Architecture & Design + +## 1. Architecture overview + +Every worker needs access to `/files_storage` (uploaded IFC files) and `/gherkin_logs`. In Docker Compose, these are local volumes on one machine. In Swarm, workers run on **different machines** — so files must be shared via NFS. + +``` + ┌─────────┐ + │ Frontend │ (Nginx + React) + │ :80/443 │ + └────┬─────┘ + │ + ┌────▼─────┐ + │ Backend │ (Django API — manager node) + │ :8000 │ + └────┬─────┘ + │ enqueues tasks + ┌────▼─────┐ + │ Redis │ (Celery broker — manager node) + │ :6379 │ + └────┬─────┘ + │ workers consume via overlay network + ┌─────────┼──────────┐ + │ │ │ + ┌────▼───┐ ┌───▼────┐ ┌──▼─────┐ + │Worker 1│ │Worker 2│ │Worker N│ (any node in swarm) + └────┬───┘ └───┬────┘ └──┬─────┘ + │ │ │ + │ NFS mount │ + └─────────┼──────────┘ + ┌────▼─────┐ + │/srv/nfs/ │ (NFS server on manager node) + │files_data│ + └──────────┘ + │ same machine + ┌────▼─────┐ + │ Postgres │ (manager node) + └──────────┘ + + ┌───────────┐ + │ Scheduler │ (1 replica, manager only) + │ --beat │ file retention: archive@90d, remove@180d + └───────────┘ +``` + +**How it works:** +- The **manager node** runs: frontend, backend, DB, Redis, scheduler, and the NFS server +- **Worker nodes** only run Celery workers — they mount NFS volumes automatically via the Docker volume driver +- The **overlay network** (Docker Swarm native) connects workers to Redis 
and Postgres across machines +- NFS gives workers read/write access to uploaded files as if they were local + +**If NFS goes down, all workers stall** — `hard,timeo=600` mount options mean workers will hang (not error) until NFS recovers. This is intentional: better to wait than to silently fail. + +For Azure: restrict NFS exports to VNet CIDR (e.g. `10.0.0.0/16(rw,sync,...)`), not `*`. + +--- + +## 2. Build and deploy are now separate steps + +Docker Compose: `docker compose build && docker compose up` — build and run in one flow. + +Docker Swarm: worker nodes **cannot build images**. They pull from a registry. + +``` +Developer machine Registry Swarm nodes + build ──push──> localhost:5000 <──pull── worker-1, worker-2 +``` + +Workflow: +```bash +make build # build images locally +make swarm-push ENV_FILE=.env.xxx # tag + push to registry +make start-swarm ENV_FILE=.env.xxx # docker stack deploy (nodes pull from registry) +``` + +For Azure PROD, replace `localhost:5000` with Azure Container Registry (ACR). + +--- + +## 3. Worker scaling and capacity + +There is **no hard cap** on worker replicas. Scaling is manual: + +```bash +make scale-workers WORKERS=4 +``` + +**Capacity math per worker:** +- ~1GB RAM for ClamAV virus signature database +- ~2-3GB RAM for Celery tasks (depends on `CELERY_CONCURRENCY`) +- Total: **~3-4GB RAM per worker** +- Each worker runs `CELERY_CONCURRENCY` parallel tasks (default: 4 in .env.hetzner, 6 in .env) + +| Environment | Workers | Concurrency | Parallel tasks | RAM needed (workers only) | +|---|---|---|---|---| +| Hetzner (8GB) | 2 | 4 | 8 | ~6-8GB | +| DEV | 2 | 4 | 8 | ~6-8GB | +| PROD | 4+ | 6 | 24+ | ~12-16GB | + +To prevent overloading a single node, use `max_replicas_per_node` in the compose file: +```yaml +deploy: + replicas: 4 + placement: + max_replicas_per_node: 2 +``` +This forces Swarm to spread workers across at least 2 nodes. Not currently set — all replicas can land on one node if Swarm decides to. 
+ +**Resource limits** are optional but recommended in production. Apply post-deploy: +```bash +make set-worker-limits CPU=2 MEM=2G # limits only +make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations +``` + +Per-environment suggestions: +| Environment | CPU limit | Memory limit | Notes | +|---|---|---|---| +| Hetzner (8GB) | 2 | 2G | Small server, max ~2 workers | +| DEV | 1 | 1G | | +| PROD | 4 | 4G | Includes ClamAV ~1GB | + +--- + +## 4. `.env` strategy + +`.env` is committed with safe defaults (localhost, no secrets). Environment-specific files are gitignored via `.env.*`: + +| File | Purpose | Committed? | +|---|---|---| +| `.env` | Shared defaults for local dev / forking | Yes | +| `.env.hetzner` | Hetzner dev server (IPs, NFS, registry) | No | +| `.env.DEV` | DEV environment (docker compose, used by CI/CD) | No | +| `.env.DEV_SWARM` | DEV Swarm deployment (external Azure DB, NFS) | No | +| `.env.PROD` | Production (real secrets, domains) | No | + +Deploy with: +```bash +make start-swarm ENV_FILE=.env.hetzner # Hetzner (with DB container) +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM # DEV (external Azure DB) +``` + +The Makefile uses `envsubst` to substitute **only compose-level vars** (REGISTRY, NFS_SERVER_IP, CERTBOT_DOMAIN, etc.) from the env file into the YAML, then pipes the result to `docker stack deploy`. Container env vars are loaded by `docker stack deploy` via the `env_file:` directive directly. + +**Why only compose-level vars?** Earlier approaches that sourced the entire env file broke on values with special characters (`#`, `(`, spaces). The current approach extracts only the vars that `envsubst` needs (REGISTRY, CERTBOT_DOMAIN, CERTBOT_EMAIL, NFS_SERVER_IP, etc.) using `grep` + `cut` in the Makefile. 
+ +**Env file format rules (Swarm env files only — `.env.hetzner`, `.env.DEV_SWARM`, etc.):** +- No spaces around `=` — the Makefile uses `grep '^VAR=' | cut -d= -f2-` +- No quotes around values — Docker passes them literally +- No angle bracket placeholders like `<VALUE>` — they get passed as literal strings + +This avoids three problems with earlier approaches: +1. **Type conversion bugs** — `docker compose config` converted ports to strings and cpus to integers, which `docker stack deploy` rejected +2. **`.env` auto-load conflict** — `docker compose config` always loads `.env` from the project directory, silently overriding values from `--env-file` +3. **Special character breakage** — sourcing the whole env file with `set -a && . ./file` breaks on values containing `#` (comment), `(` (subshell), or unquoted spaces + +--- + +## 5. Local dev and server deploy are now different configs + +You maintain three separate compose files: +- `docker-compose.yml` — local development (single machine, local volumes, `container_name`) +- `docker-compose.swarm.yml` — Swarm deployment (overlay network, NFS volumes, `deploy:` section) +- `docker-compose.swarm.nodb.yml` — Swarm with external DB (no containerized Postgres) + +Risk: they drift apart over time (different env vars, image versions, volume configs). Mitigation: keep changes in sync during PRs. + +--- + +## 6. No `container_name` / `depends_on` in Swarm + +Swarm manages container naming internally (e.g. `validate_worker.1.abc123`). `depends_on` is ignored — services start simultaneously. + +Current impact: minimal — entrypoints use DNS service discovery (`redis`, `db`, `backend`) and `pg_isready` wait loops. No code changes needed. + +--- + +## 7. DNS transition strategy for PROD cutover + +To avoid downtime when switching from Docker Compose to Swarm in production, use a temporary subdomain: + +1. Deploy Swarm stack on a new server (or same server on different ports) +2. Point a temp subdomain to it (e.g. 
`swarm.validate.buildingsmart.org`) +3. Run both setups in parallel — existing Compose on the main domain, Swarm on the temp domain +4. Test via API (bulk uploads, concurrent validations) against the temp domain +5. Once confident, swap DNS: point the main domain to the Swarm deployment +6. Decommission the old Compose setup + +Rollback: if Swarm has issues, DNS points back to the old setup in minutes. + +For DEV: same approach, or direct cutover (lower risk since it's not user-facing). + +--- + +# Known Issues & Gotchas + +## 8. Overlay network MTU must be set to 1400 + +MTU (Maximum Transmission Unit) is the largest packet size a network link can carry — the default is 1500 bytes. Hetzner's private network uses MTU 1450. Docker's VXLAN overlay adds ~50 bytes of encapsulation headers to every packet, so if the underlying MTU is already ≤1500, the oversized packets get silently dropped or fragmented. Without setting the overlay MTU to 1400 (leaving headroom for the VXLAN overhead), worker nodes on different machines **cannot reach services on the manager** (DB, Redis). + +Symptom: workers stuck on `db:5432 - no response` despite DNS resolving correctly. + +Fix is in `docker-compose.swarm.yml`: +```yaml +networks: + validate: + driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" +``` + +This applies to any cloud provider with sub-1500 MTU on internal networks. + +--- + +## 9. ClamAV runs inside every worker (~1GB RAM overhead each) + +Each worker container starts its own ClamAV daemon + freshclam (virus signature updater). This is the **same as before** — not a Swarm change. But when scaling to N workers, you get N independent ClamAV instances. 
+ +Impact: +- ~1GB RAM per worker for virus signature database (observed during Hetzner testing — 5 instances caused OOM on 8GB server) +- Each worker independently downloads signature updates on boot +- The 4GB memory limit per worker (PROD) accounts for this: ~1GB ClamAV + ~2-3GB for Celery tasks +- The local override (`docker-compose.swarm.local.yml`) skips ClamAV entirely for testing on small servers + +4 workers with ClamAV = ~4GB just for virus DBs. + +--- + +## 10. Insecure registry required on ALL nodes + +When using `REGISTRY=10.0.0.3:5000` (private IP) instead of `localhost:5000`, **every node** — including the manager — needs the insecure registry configured: + +```bash +echo '{ "insecure-registries": ["10.0.0.3:5000"] }' | sudo tee /etc/docker/daemon.json +sudo systemctl restart docker +``` + +Without this, services get `No such image` errors and stay at 0/N replicas. + +--- + +## 11. DB `postmaster.pid` disappears in Swarm (containerized DB only) + +PostgreSQL starts, recovers, becomes ready — then shuts itself down because its PID file vanished: + +``` +could not open file "postmaster.pid": No such file or directory +performing immediate shutdown because data directory lock file is invalid +``` + +This is a Docker Swarm volume mount timing issue. Fix: set `restart_policy.condition: any` (not `on-failure`) on the db service so Swarm keeps restarting it until it sticks. Already applied in `docker-compose.swarm.yml`. + +--- + +## 12. Docker caches NFS volume options + +When `docker stack deploy` creates an NFS volume, the driver options (including `addr=`) are cached. If the first deploy has the wrong NFS IP (e.g. the default `10.0.0.1`), **all subsequent deploys reuse that wrong IP** — even after fixing the env file. + +Symptoms: containers stuck in "Created" state, never starting. No logs. NFS mount hangs because the IP doesn't exist. 
+ +Fix: +```bash +docker stack rm validate +sleep 15 +docker container prune -f +docker volume rm validate_files_data validate_gherkin_rules_log_data +# If containers are stuck on hanging NFS mount: +systemctl restart docker +# Then redeploy +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM +``` + +Verify volume has correct IP after deploy: `docker volume inspect validate_files_data` + +--- + +## 13. File upload: `f.seek(0)` after measuring size + +In `views.py`, the upload handler seeks to the end of the file to measure its size (`f.seek(0, 2)` + `f.tell()`), then must rewind (`f.seek(0)`) before `serializer.save()`. Without the rewind, Django saves a 0-byte file because the file pointer is at the end. + +This may only manifest with NFS-backed storage where buffering behaviour differs from local volumes. Commit: `012776c` + +--- + +## 14. `determine_aggregate_status()` masks silent failures + +When a validation task produces zero outcomes (e.g. subprocess crashed, worker OOM, NFS hang), the status defaults to VALID (`models.py:1297` — `# assume valid if no outcomes - TODO: is this correct?`). This pre-dates Swarm but becomes more visible when workers crash/restart across nodes. + +**Why we can't just return INVALID:** Marking a file as invalid has real consequences — vendors have to investigate and fix it. Returning INVALID for a crashed task would create false negatives. The actual problem is **silent failure** — a task fails completely and nobody notices because it looks like it passed. + +**What should happen instead:** When zero outcomes are produced, the system should alert developers (e.g. log an error, send a notification, or set a distinct status like `ERROR` or `INCONCLUSIVE`) rather than silently defaulting to VALID. The file should be flagged for re-validation, not marked as valid or invalid. + +Not blocking for Swarm, but worth a follow-up fix. + +--- + +## 15. 
DB connection pooling: stale connections on overlay network + +Django's `"pool": True` (psycopg3 connection pool) keeps DB connections open for reuse. The Swarm overlay network drops idle TCP connections after ~13 minutes. When the pool hands out a dead connection, Django raises: + +``` +OperationalError: consuming input failed: server closed the connection unexpectedly +``` + +**Fix** (in `backend/core/settings.py`): +- `"pool": False` — disable psycopg3's built-in connection pool. `CONN_HEALTH_CHECKS` alone is not sufficient because the pool can hand out a stale connection after the health check passes but before it reaches the query. +- `CONN_HEALTH_CHECKS = True` — Django pings the connection before using it; if dead, it reconnects transparently +- `CONN_MAX_AGE = 600` (10 min) — keeps connections open for reuse without the pool layer + +`CONN_MAX_AGE` is configurable via `POSTGRES_CONN_MAX_AGE` env var. The default of 600s works for Swarm; set to `0` to close connections after each request (safest but slower). + +DB logs showing the symptom (every ~13 min): +``` +LOG: could not receive data from client: Connection reset by peer +``` + +--- + +## 16. SSL certs: bind mount vs named volume + +Docker Compose used a bind mount for Let's Encrypt certs: `./docker/frontend/letsencrypt:/etc/letsencrypt`. Swarm uses a named volume (`validate_letsencrypt_data`). + +When migrating, certs must be manually copied into the Swarm volume: +```bash +cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/ +docker service update --force validate_frontend +``` + +Without this, HTTPS won't work and the site is only accessible via HTTP. Certbot renewal should continue to work inside the container since `CERTBOT_DOMAIN` is set. + +--- + +## 17. Overlay network race condition after stack rm + +After `docker stack rm`, the overlay network cleanup is asynchronous. Redeploying too quickly causes `network validate_validate not found` errors. 
+ +Fix: wait ~15 seconds between `docker stack rm` and `docker stack deploy`. If a ghost network persists (`docker network ls` shows it but `docker network rm` says "not found"), restart Docker: `systemctl restart docker`. + +--- + +## 18. No rolling updates for `latest` tags + +Swarm checks if the image tag has changed before pulling. Since all images use `:latest`, Swarm sees "same tag" and skips the pull — even if the image content has changed. + +**Impact:** `docker service update --force` restarts containers but uses the **cached** image. To deploy new code, you must tear down and redeploy: + +```bash +make stop-swarm +make swarm-push ENV_FILE=.env.xxx +make start-swarm ENV_FILE=.env.xxx +``` + +Or force a pull for a single service: +```bash +docker service update --image localhost:5000/validationsvc-backend:latest --force validate_backend +``` + +--- + +## 19. `docker service update --force` does NOT re-read env vars + +`docker service update --force` restarts containers with the **same config** they were deployed with. It does NOT re-read the env file. If you changed `.env.DEV_SWARM` and want the changes to take effect, you must do a full redeploy: + +```bash +make stop-swarm +# wait ~15 seconds +make start-swarm ENV_FILE=.env.DEV_SWARM +``` + +--- + +## 20. VS Code port forwarding conflicts with Swarm ingress + +VS Code's SSH tunnel sometimes conflicts with Swarm's ingress routing (IPv6 issues). Accessing `localhost:80` via VS Code's forwarded port may not work. + +**Workaround:** Use the server's public IP directly instead of localhost. + +--- + +# Maintenance + +## 21. CI/CD not yet adapted for Swarm + +The current GitHub Actions workflow (`.github/workflows/ci_cd.yml`) uses `docker compose up` for DEV and PROD deployments. It does **not** support Swarm. 
+ +What needs to change for Swarm CI/CD: +- `docker compose up` → `make start-swarm ENV_FILE=.env.XXX` (build, push to registry, stack deploy) +- The runner/deploy target needs access to the Swarm manager (SSH or self-hosted runner on the manager node) +- Worker nodes pull images from the registry automatically — no action needed per node +- `ENV_FILE` is already a GitHub Actions variable (`${{ vars.ENV_FILE }}`) — just needs to point to the right file + +Options: +1. **Self-hosted runner on the manager node** — simplest, runner has direct access to Docker and the registry +2. **SSH deploy step** — GitHub-hosted runner SSHes into the manager to run make commands +3. **Separate workflow** — new workflow file for Swarm deployments, triggered manually or on specific branches + +Not blocking for merge to development — Swarm can be deployed manually until CI/CD is adapted. + +--- + +## 22. Periodic cleanup on DEV server + +> **DEV-specific** — the DEV server has a small root disk (29GB). Hetzner/PROD with larger disks are less affected but should still clean up periodically. + +Docker images, build cache, orphaned volumes, and uploaded IFC files accumulate fast. Without periodic cleanup, the disk fills up and deployments fail. + +**What accumulates:** +- Docker build cache (~2GB per full build cycle) +- Old/unused images (previous deployments) +- Orphaned volumes from CI/CD runs (e.g. 
`repo-clone_*` volumes from GitHub Actions) +- Uploaded IFC files in `files_data` volume (4GB+ and growing) + +**Cleanup commands:** +```bash +# Check disk usage +df -h / + +# Docker overview +docker system df + +# Remove unused images and build cache +docker builder prune -af +docker image prune -af + +# Remove orphaned volumes (CAREFUL: only removes volumes not attached to any container) +docker volume prune -f + +# List volume sizes to find large orphans +docker system df -v | grep -A 50 "Local Volumes" +``` + +**Recommendation:** Run `docker system prune -af` and `docker volume prune -f` after each major deployment cycle. Consider adding this to the CI/CD pipeline or a cron job. The `/mnt` disk (74GB ephemeral Azure temp disk) can be used for temporary storage but **data is lost on VM deallocation/resize**. + +--- + +## 23. `makemigrations` runs on every backend startup + +The `server-entrypoint.sh` runs `python manage.py makemigrations` and `python manage.py migrate` on every container start. This works because: +- Backend is constrained to **1 replica** on the manager node — no migration race conditions +- The generated migration files live inside the container (ephemeral) — they're not persisted + +**Risk:** If model changes exist that haven't been committed as migration files, `makemigrations` will generate them at runtime inside the container. These migrations disappear when the container restarts, potentially causing inconsistency. In production, migrations should be baked into the image at build time. + +**Decision:** Kept as-is for now. Backend is always 1 replica, and in practice all migrations are committed to git before deployment. But worth revisiting for PROD hardening. + +--- + +## 24. Historical Swarm instability + +> "unexplained crashes/corrupt state (5+ years ago) — hopefully they are gone now" + +Modern Docker Engine (24+) should be stable. 
Mitigations already in place: +- `CELERY_TASK_ACKS_LATE = True` — tasks stay in queue until completed +- `CELERY_TASK_REJECT_ON_WORKER_LOST = True` — crashed tasks are re-queued +- `restart_policy: condition: any` on DB (see section 11), `on-failure` on other services +- `update_config: failure_action: rollback` — bad deploys roll back + +--- + +# Local Dev Only + +## 25. Lima-specific: virtiofs + Celery prefork = errno 35 + +Celery's `prefork` pool + Lima's virtiofs read-only mounts cause `EDEADLK` deadlocks. Workaround: `--pool=solo`. + +**Not a production issue** — only affects local development on macOS with Lima. Docker containers on Linux use proper ext4/overlay2 filesystems. + +--- + +## 26. macOS NFS gotcha: `/tmp` vs `/private/tmp` + +On macOS, `/tmp` is a symlink to `/private/tmp`. NFS exports must use the real path (`/private/tmp/...`). Not relevant for Linux servers (Hetzner/Azure), but relevant for local development on macOS. diff --git a/docs/swarm-runbook.md b/docs/swarm-runbook.md new file mode 100644 index 00000000..94843db8 --- /dev/null +++ b/docs/swarm-runbook.md @@ -0,0 +1,491 @@ +# Swarm Operations Runbook + +Copy-paste-ready commands for every Swarm operation. Refer to [swarm-considerations.md](swarm-considerations.md) for architecture, known issues, and design decisions. + +Last updated: 2026-03-16 + +--- + +## Table of Contents + +1. [First-Time Setup (Manager Node)](#1-first-time-setup-manager-node) +2. [Build, Push and Deploy](#2-build-push-and-deploy) +3. [Set Up NFS (Multi-Node)](#3-set-up-nfs-multi-node) +4. [Add a Worker Node to the Swarm](#4-add-a-worker-node-to-the-swarm) +5. [Scale Workers](#5-scale-workers) +6. [Redeploy After Code Changes](#6-redeploy-after-code-changes) +7. [Monitoring and Logs](#7-monitoring-and-logs) +8. [Shut Down the Swarm](#8-shut-down-the-swarm) +9. [Remove a Worker Node](#9-remove-a-worker-node) +10. [Full Reset (Nuclear Option)](#10-full-reset-nuclear-option) +11. 
[Environment File Strategy](#11-environment-file-strategy) +12. [Quick Reference Card](#12-quick-reference-card) + +--- + +## 1. First-Time Setup (Manager Node) + +Run once per machine that will act as a Swarm manager. This covers everything: Swarm init, NFS, registry, env, build, deploy. + +```bash +# 1a. Initialize Swarm +docker swarm init --advertise-addr <MANAGER_PRIVATE_IP> + +# 1b. Create .VERSION (gitignored, required by make build) +echo "1.0.0" > .VERSION + +# 1c. Prepare the .env file +# Copy .env (committed defaults) and customize for this server: +cp .env .env.myserver # name it after the environment: .env.hetzner, .env.DEV_SWARM, .env.PROD +# Edit manually — no spaces around '='. Variables you MUST change: +# PUBLIC_URL — server URL (e.g. http://10.0.0.3 or https://validate.example.org) +# DJANGO_ALLOWED_HOSTS — space-separated hostnames/IPs that Django accepts +# DJANGO_TRUSTED_ORIGINS — space-separated origins for CSRF +# DJANGO_SECRET_KEY — generate a random key for non-dev environments +# POSTGRES_PASSWORD — use a strong password for non-dev environments +# Variables to ADD (not in the base .env, Swarm-only): +# NFS_SERVER_IP — private IP of the NFS server (e.g. 10.0.0.3) +# REGISTRY — Docker registry address (e.g. localhost:5000) +# Optional (uncomment to set): +# CERTBOT_DOMAIN — real domain for SSL (leave as _ to skip) +# CERTBOT_EMAIL — email for Let's Encrypt +# WORKER_CPU_LIMIT, WORKER_MEMORY_LIMIT, etc. — resource limits + +# 1d. Start local registry (as plain container, NOT Swarm service) +docker run -d --name registry -p 5000:5000 --restart always registry:2 +# Verify: +curl -s http://localhost:5000/v2/ # should return {} + +# 1e. 
Set up NFS on the host +apt install -y nfs-kernel-server +mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs +chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs +chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs + +cat >> /etc/exports << 'EOF' +/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +EOF + +exportfs -ra +systemctl restart nfs-kernel-server +showmount -e localhost + +# 1f. (If migrating from docker-compose) Stop the old stack first +docker compose -f docker-compose.load_balanced.nodb.yml --env-file .env.DEV down +# Volume names differ: compose uses "validation-service_files_data", swarm uses "validate_files_data" +# Check which volumes have data: +docker system df -v | grep -A 50 "Local Volumes" + +# 1g. Copy existing data from Docker volumes to NFS +# Use the COMPOSE volume name (validation-service_*), not the swarm name (validate_*): +docker run --rm -v validation-service_files_data:/src -v /srv/nfs/files_data:/dst alpine sh -c "cp -a /src/. /dst/" +docker run --rm -v validation-service_gherkin_rules_log_data:/src -v /srv/nfs/gherkin_logs:/dst alpine sh -c "cp -a /src/. /dst/" +# Verify: +du -sh /srv/nfs/files_data /srv/nfs/gherkin_logs + +# 1h. (If migrating) Copy SSL certs to Swarm volume +# Old compose used a bind mount (docker/frontend/letsencrypt/), Swarm uses a named volume. +# Deploy first (step 1j), then copy certs into the volume and restart frontend: +# cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/ +# docker service update --force validate_frontend + +# 1i. Fetch submodules +make fetch-modules + +# 1j. Build, push, deploy +make swarm-push +# For external DB (Azure): make start-swarm-nodb ENV_FILE=.env.DEV_SWARM +# For containerized DB: make start-swarm ENV_FILE=.env.hetzner +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM + +# 1k. 
Verify +watch docker service ls +``` + +Adjust NFS exports CIDR to match the network (Azure VNet: `10.0.0.0/16`, Hetzner: `10.0.0.0/24` or `*`). + +See [swarm-considerations.md](swarm-considerations.md) for known issues and gotchas that can trip you up during setup (NFS volume caching, network race conditions, env file format, registry config, SSL cert migration, etc.). + +--- + +## 2. Build, Push and Deploy + +```bash +# Build, tag and push to registry (swarm-push includes build) +make swarm-push ENV_FILE= + +# Deploy — pick the right target: +# Full stack with DB container + NFS: +make start-swarm ENV_FILE= + +# External DB (e.g. Azure PostgreSQL) + NFS: +make start-swarm-nodb ENV_FILE= + +# Single-node / local testing (no NFS, no ClamAV, 1 replica each): +make start-swarm-local ENV_FILE= + +# 2b. Watch services come up (all should reach 1/1 within ~60s) +watch docker service ls + +# Verify endpoints +curl -s -o /dev/null -w "%{http_code}" http://localhost/ # 200 +curl -s -o /dev/null -w "%{http_code}" http://localhost/api/ # 302 +curl -s -o /dev/null -w "%{http_code}" http://localhost/admin/ # 302 + +# 2c. (Optional) Set resource limits on workers +make set-worker-limits CPU=2 MEM=2G # limits only +make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations +``` + +**What `start-swarm` vs `start-swarm-nodb` vs `start-swarm-local` does:** + +| | `start-swarm` | `start-swarm-nodb` | `start-swarm-local` | +|---|---|---|---| +| Compose file | `swarm.yml` | `swarm.nodb.yml` | `swarm.yml` + `swarm.local.yml` | +| Database | Containerized PostgreSQL | External (e.g. Azure) | Containerized PostgreSQL | +| Volumes | NFS | NFS | Plain local volumes | +| ClamAV | Runs | Runs | Skipped | +| Replicas | backend: 2, worker: 2 | backend: 2, worker: 2 | All 1 | +| Use case | Hetzner, self-hosted | DEV/PROD (Azure DB) | Quick local testing | + +--- + +## 3. Set Up NFS (Multi-Node) + +Required before adding worker nodes. 
Workers need shared access to uploaded IFC files and gherkin logs. + +### 3a. On the NFS server (typically the manager node) + +```bash +# Install NFS +apt install -y nfs-kernel-server + +# Create export directories +mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs +chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs +chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs + +# Configure exports +cat >> /etc/exports << 'EOF' +/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +EOF + +exportfs -ra +systemctl restart nfs-kernel-server + +# Verify +showmount -e localhost +``` + +### 3b. Copy existing data to NFS (if migrating from local volumes) + +```bash +# Copy files_data +docker run --rm \ + -v validate_files_data:/src \ + -v /srv/nfs/files_data:/dst \ + alpine sh -c "cp -a /src/. /dst/" + +# Copy gherkin_logs +docker run --rm \ + -v validate_gherkin_rules_log_data:/src \ + -v /srv/nfs/gherkin_logs:/dst \ + alpine sh -c "cp -a /src/. /dst/" + +# Verify +ls -la /srv/nfs/files_data/ +ls -la /srv/nfs/gherkin_logs/ +``` + +**Note:** If migrating from Docker Compose, the volume names may be prefixed differently (e.g. `validation-service_files_data` instead of `validate_files_data`). Check with `docker volume ls`. + +### 3c. Set NFS_SERVER_IP in the env file + +```bash +# In .env.hetzner (or .env.DEV_SWARM / .env.PROD): +NFS_SERVER_IP=10.0.0.3 # private IP of the NFS server +``` + +The `docker-compose.swarm.yml` uses this in the NFS volume driver options. + +### 3d. Redeploy with NFS volumes + +```bash +# Tear down existing stack (uses local volumes) +make stop-swarm + +# Wait ~15 seconds for cleanup, then redeploy with NFS +make start-swarm ENV_FILE= + +# Verify NFS volumes are mounted +docker volume inspect validate_files_data +# Should show Type: nfs in Options +``` + +--- + +## 4. Add a Worker Node to the Swarm + +### 4a. 
On the manager — get join token + +```bash +docker swarm join-token worker +# Outputs: docker swarm join --token SWMTKN-... <MANAGER_IP>:2377 +``` + +### 4b. On the new worker node — prerequisites + +```bash +# Install Docker +curl -fsSL https://get.docker.com | sh + +# Install NFS client (needed for NFS volumes) +apt install -y nfs-common + +# Verify NFS is reachable +mount -t nfs4 <NFS_SERVER_IP>:/srv/nfs/files_data /mnt && ls /mnt && umount /mnt + +# Configure insecure registry (if using private registry over HTTP) +echo '{ "insecure-registries": ["<REGISTRY_IP>:5000"] }' | sudo tee /etc/docker/daemon.json +sudo systemctl restart docker +``` + +### 4c. Join the swarm + +```bash +# Paste the join command from step 4a: +docker swarm join --token SWMTKN-... <MANAGER_IP>:2377 +``` + +### 4d. Verify on manager + +```bash +docker node ls +# Should show both nodes as Ready/Active +``` + +### 4e. Also configure insecure registry on manager (if using private IP for registry) + +```bash +# Only needed if REGISTRY=10.0.0.3:5000 instead of localhost:5000 +echo '{ "insecure-registries": ["10.0.0.3:5000"] }' | sudo tee /etc/docker/daemon.json +sudo systemctl restart docker +``` + +**Important:** When using a private IP registry (`REGISTRY=10.0.0.3:5000`), EVERY node (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`. Otherwise workers get `No such image` errors. + +--- + +## 5. 
Scale Workers + +```bash +# Scale to N workers (Swarm distributes across available nodes) +make scale-workers WORKERS=4 + +# Check placement — see which node each worker is on +docker service ps validate_worker + +# Set resource limits (applied per-container, not total) +make set-worker-limits CPU=2 MEM=2G # limits only +make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations +``` + +**Per-environment resource limits:** + +| Environment | CPU limit | Memory limit | Notes | +|---|---|---|---| +| Hetzner (8GB, no ClamAV) | 2 | 2G | Max ~2 workers | +| DEV | 1 | 1G | | +| PROD | 4 | 4G | Includes ClamAV ~1GB | + +**ClamAV RAM warning:** Each worker with ClamAV loads ~1GB of virus signatures. 4 workers + 1 scheduler = ~5GB just for ClamAV. Use the local override (skips ClamAV) on small servers, or use 16GB+ RAM. + +--- + +## 6. Redeploy After Code Changes + +There is no rolling update for `latest` tags — must tear down and redeploy. + +```bash +# 1. Stop +make stop-swarm + +# 2. Rebuild and push +make swarm-push ENV_FILE= + +# 3. Redeploy +make start-swarm ENV_FILE= + +# 4. Verify +watch docker service ls +``` + +**Faster alternative for single-service changes:** + +```bash +# Force-restart one service (uses existing image, same config — does NOT re-read .env) +docker service update --force validate_backend + +# Or rebuild and push just the backend image, then update (still same env): +make swarm-push ENV_FILE= +docker service update --image localhost:5000/validationsvc-backend:latest --force validate_backend + +# To pick up .env changes, you must redeploy (stop + start-swarm) +``` + +--- + +## 7. 
Monitoring and Logs + +```bash +# Service overview +docker service ls + +# Detailed worker status (shows which node, current state) +make swarm-status + +# Follow logs for a service +docker service logs -f validate_frontend +docker service logs -f validate_backend +docker service logs -f validate_worker +docker service logs -f validate_scheduler +docker service logs -f validate_db + +# Resource usage (CPU/memory per container) +docker stats --no-stream + +# Check for OOM kills +journalctl -k | grep "out of memory" + +# Check node status +docker node ls + +# Inspect a specific service +docker service inspect validate_worker --pretty +``` + +--- + +## 8. Shut Down the Swarm + +### Stop the stack (keeps volumes and swarm membership) + +```bash +make stop-swarm +# Equivalent to: docker stack rm validate +# Volumes are preserved — data survives restarts +``` + +### Restart after shutdown + +```bash +# Just redeploy — volumes are still there +make start-swarm ENV_FILE= +``` + +--- + +## 9. Remove a Worker Node + +```bash +# On manager: drain the node first (moves tasks to other nodes) +docker node update --availability drain + +# Wait for tasks to migrate, then on the worker node: +docker swarm leave + +# On manager: remove the node from the list +docker node rm +``` + +--- + +## 10. Full Reset (Nuclear Option) + +Removes everything — stack, volumes, images, swarm. + +```bash +# 1. Remove the stack +make stop-swarm + +# 2. Remove registry +docker rm -f registry + +# 3. Remove all volumes (WARNING: deletes DB data and uploaded files!) +docker volume prune -f + +# 4. Remove all images +docker system prune -af + +# 5. Leave the swarm +docker swarm leave --force + +# Then start fresh from section 1 +``` + +--- + +## 11. Environment File Strategy + +The `.env` in the repo root is committed with safe defaults (localhost, no secrets). Each environment gets its own gitignored override. + +| File | Purpose | Committed? 
| +|---|---|---| +| `.env` | Shared defaults for docker compose (local dev, forking) | Yes | +| `.env.hetzner` | Hetzner test server (IPs, NFS, registry) | No | +| `.env.DEV` | DEV environment (docker compose, used by CI/CD) | No | +| `.env.DEV_SWARM` | DEV Swarm deployment (external Azure DB, NFS) | No | +| `.env.PROD` | Production (real secrets, domains, SSL) | No | + +**Deploy with:** +```bash +make start-swarm ENV_FILE=.env.hetzner # Hetzner (with DB container) +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM # DEV (external Azure DB) +make start-swarm ENV_FILE=.env.PROD # PROD +``` + +**What changes per environment:** + +| Variable | Hetzner (test) | DEV | PROD | +|---|---|---|---| +| `DEBUG` | `True` | `True` | `False` | +| `ENV` | `Development` | `Development` | `Production` | +| `PUBLIC_URL` | `http://` | `https://dev.validate...` | `https://validate.buildingsmart.org` | +| `DJANGO_ALLOWED_HOSTS` | `localhost ` | `dev.validate...` | `validate.buildingsmart.org` | +| `CERTBOT_DOMAIN` | `_` (skip SSL) | domain | domain | +| `NFS_SERVER_IP` | `10.0.0.3` | `10.0.0.5` | per-setup | +| `REGISTRY` | `localhost:5000` | `localhost:5000` | per-setup | +| `POSTGRES_PASSWORD` | `postgres` | strong | strong | +| `DJANGO_SECRET_KEY` | insecure default | random | random | +| B2C / Mailgun | empty | real creds | real creds | + +**Env file format rules (Swarm env files only — `.env.hetzner`, `.env.DEV_SWARM`, etc.):** +- No spaces around `=` — the Makefile uses `grep '^VAR=' | cut -d= -f2-` +- No quotes around values — Docker passes them literally +- No angle bracket placeholders like `` — they get passed as literal strings + +--- + +## 12. 
Quick Reference Card + +| Task | Command | +|---|---| +| Deploy (local/test) | `make start-swarm-local ENV_FILE=` | +| Deploy (with DB + NFS) | `make start-swarm ENV_FILE=` | +| Deploy (external DB + NFS) | `make start-swarm-nodb ENV_FILE=` | +| Copy SSL certs to Swarm | `cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/` | +| Restart frontend (after cert copy) | `docker service update --force validate_frontend` | +| Stop stack | `make stop-swarm` | +| Scale workers | `make scale-workers WORKERS=4` | +| Set worker limits | `make set-worker-limits CPU=2 MEM=2G` | +| Build + push images | `make swarm-push ENV_FILE=` | +| Service status | `make swarm-status` | +| Follow logs | `docker service logs -f validate_` | +| Force-restart service | `docker service update --force validate_backend` | +| Add worker node | `docker swarm join --token SWMTKN-... :2377` | +| Drain node | `docker node update --availability drain ` | +| Remove node | `docker swarm leave` (on worker) + `docker node rm ` (on manager) | +| Check MTU | `ping -M do -s 1372 ` | + From e650928250b4c799329610581904b371bedd8e5d Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Mon, 16 Mar 2026 17:15:32 +0000 Subject: [PATCH 10/12] runbook -> deploy guide --- docs/{swarm-runbook.md => swarm-deploy-guide.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/{swarm-runbook.md => swarm-deploy-guide.md} (100%) diff --git a/docs/swarm-runbook.md b/docs/swarm-deploy-guide.md similarity index 100% rename from docs/swarm-runbook.md rename to docs/swarm-deploy-guide.md From 4b8d99139ed946003da2efc8b6274f6778f9827b Mon Sep 17 00:00:00 2001 From: Ghesselink Date: Mon, 16 Mar 2026 20:13:49 +0000 Subject: [PATCH 11/12] improve swarm makefile targets, fix db connection pooling --- Makefile | 32 ++++---- backend/core/settings.py | 4 +- docker-compose.swarm.nodb.yml | 137 ++++++++++++++++++++++++++++++++++ docs/swarm-deploy-guide.md | 11 +-- 4 files changed, 163 
insertions(+), 21 deletions(-) create mode 100644 docker-compose.swarm.nodb.yml diff --git a/Makefile b/Makefile index 6baa69ca..86259f97 100644 --- a/Makefile +++ b/Makefile @@ -28,22 +28,23 @@ stop: # --- Docker Swarm --- +REGISTRY ?= localhost:5000 WORKERS ?= 2 ENV_FILE ?= .env - -# Reads compose-level vars from ENV_FILE, substitutes into YAML via envsubst. -# Container env vars are loaded by docker stack deploy via the env_file: directive. -SWARM_VARS = REGISTRY CERTBOT_DOMAIN CERTBOT_EMAIL NFS_SERVER_IP POSTGRES_NAME POSTGRES_USER POSTGRES_PASSWORD -SWARM_ENV = ENV_FILE=$(ENV_FILE) \ - $(foreach v,$(SWARM_VARS),$(v)=$$(grep '^$(v)=' $(ENV_FILE) | cut -d= -f2-)) +SWARM_VARS = REGISTRY CERTBOT_DOMAIN CERTBOT_EMAIL NFS_SERVER_IP WORKER_CPU_LIMIT WORKER_MEMORY_LIMIT WORKER_CPU_RESERVATION WORKER_MEMORY_RESERVATION +SWARM_ENV = ENV_FILE="$(ENV_FILE)" $(foreach v,$(SWARM_VARS),$(v)="$(shell grep '^$(v)=' $(ENV_FILE) | head -1 | cut -d= -f2-)") start-swarm: - $(SWARM_ENV) envsubst < docker-compose.swarm.yml | docker stack deploy -c - --with-registry-auth validate + env $(SWARM_ENV) envsubst < docker-compose.swarm.yml | docker stack deploy -c - --with-registry-auth validate + +start-swarm-nodb: + env $(SWARM_ENV) envsubst < docker-compose.swarm.nodb.yml | docker stack deploy -c - --with-registry-auth validate start-swarm-local: - $(SWARM_ENV) envsubst < docker-compose.swarm.yml > /tmp/_swarm.yml - $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml > /tmp/_swarm_local.yml - docker stack deploy -c /tmp/_swarm.yml -c /tmp/_swarm_local.yml --with-registry-auth validate + env $(SWARM_ENV) envsubst < docker-compose.swarm.yml > /tmp/_swarm.yml && \ + env $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml > /tmp/_swarm.local.yml && \ + docker stack deploy -c /tmp/_swarm.yml -c /tmp/_swarm.local.yml --with-registry-auth validate && \ + rm -f /tmp/_swarm.yml /tmp/_swarm.local.yml stop-swarm: docker stack rm validate @@ -51,12 +52,13 @@ stop-swarm: scale-workers: docker 
service scale validate_worker=$(WORKERS) -CPU ?= 2 -MEM ?= 2G set-worker-limits: - docker service update --limit-cpu $(CPU) --limit-memory $(MEM) validate_worker - -REGISTRY ?= $$(grep '^REGISTRY=' $(ENV_FILE) | cut -d= -f2- || echo localhost:5000) + docker service update \ + $(if $(CPU),--limit-cpu $(CPU)) \ + $(if $(MEM),--limit-memory $(MEM)) \ + $(if $(CPU_RES),--reserve-cpu $(CPU_RES)) \ + $(if $(MEM_RES),--reserve-memory $(MEM_RES)) \ + validate_worker swarm-push: build docker tag buildingsmart/validationsvc-backend $(REGISTRY)/validationsvc-backend diff --git a/backend/core/settings.py b/backend/core/settings.py index ac1bd090..24d261da 100644 --- a/backend/core/settings.py +++ b/backend/core/settings.py @@ -246,8 +246,10 @@ "USER": os.environ.get("POSTGRES_USER", "postgres"), "PASSWORD": os.environ.get("POSTGRES_PASSWORD", "postgres"), "PORT": int(os.environ.get("POSTGRES_PORT", "5432")), + "CONN_MAX_AGE": int(os.environ.get("POSTGRES_CONN_MAX_AGE", 600)), + "CONN_HEALTH_CHECKS": True, "OPTIONS": { - "pool": True, + "pool": False, }, }, } diff --git a/docker-compose.swarm.nodb.yml b/docker-compose.swarm.nodb.yml new file mode 100644 index 00000000..bfe1fb11 --- /dev/null +++ b/docker-compose.swarm.nodb.yml @@ -0,0 +1,137 @@ +# Docker Swarm deployment configuration — external database (no containerized PostgreSQL) +# +# Usage: +# make start-swarm-nodb ENV_FILE=.env.DEV_SWARM +# +# Same as docker-compose.swarm.yml but without the db service. +# Set POSTGRES_HOST, POSTGRES_PORT, etc. in your env file to point to the external DB. 
+ +services: + + frontend: + image: ${REGISTRY}/validationsvc-frontend + ports: + - 80:80 + - 443:443 + environment: + CERTBOT_DOMAIN: ${CERTBOT_DOMAIN} + CERTBOT_EMAIL: ${CERTBOT_EMAIL} + volumes: + - letsencrypt_data:/etc/letsencrypt + - static_data:/app/backend/django_static + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + backend: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/server-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - static_data:/app/backend/django_static + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + expose: + - 8000 + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + healthcheck: + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/')\""] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + + worker: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/worker-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 2 + # No placement constraint - workers run on any node + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + + scheduler: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/worker-beat-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + redis: + image: redis:8.4-alpine + command: redis-server 
--protected-mode no --bind 0.0.0.0 + expose: + - 6379 + volumes: + - redis_data:/data + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + +networks: + validate: + driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" + +volumes: + static_data: + letsencrypt_data: + redis_data: + files_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/files_data" + gherkin_rules_log_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/gherkin_logs" diff --git a/docs/swarm-deploy-guide.md b/docs/swarm-deploy-guide.md index 94843db8..36bce9ba 100644 --- a/docs/swarm-deploy-guide.md +++ b/docs/swarm-deploy-guide.md @@ -204,8 +204,9 @@ ls -la /srv/nfs/gherkin_logs/ ### 3c. Set NFS_SERVER_IP in the env file ```bash -# In .env.hetzner (or .env.DEV_SWARM / .env.PROD): -NFS_SERVER_IP=10.0.0.3 # private IP of the NFS server +# In your env file (.env.hetzner, .env.DEV_SWARM, .env.PROD, etc.): +NFS_SERVER_IP= +# e.g. on Hetzner test server this was 10.0.0.3 — check your actual network with: hostname -I ``` The `docker-compose.swarm.yml` uses this in the NFS volume driver options. @@ -269,12 +270,12 @@ docker node ls ### 4e. 
Also configure insecure registry on manager (if using private IP for registry) ```bash -# Only needed if REGISTRY=10.0.0.3:5000 instead of localhost:5000 -echo '{ "insecure-registries": ["10.0.0.3:5000"] }' | sudo tee /etc/docker/daemon.json +# Only needed if REGISTRY=:5000 instead of localhost:5000 +echo '{ "insecure-registries": [":5000"] }' | sudo tee /etc/docker/daemon.json sudo systemctl restart docker ``` -**Important:** When using a private IP registry (`REGISTRY=10.0.0.3:5000`), EVERY node (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`. Otherwise workers get `No such image` errors. +**Important:** When using a private IP registry (e.g. `REGISTRY=10.0.0.3:5000`), EVERY node (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`. Otherwise workers get `No such image` errors. --- From aabe062e81813ed851fffaa6aa9d3ee20465d4f4 Mon Sep 17 00:00:00 2001 From: bSI Validation Service CI/CD Date: Mon, 16 Mar 2026 23:20:28 +0000 Subject: [PATCH 12/12] add automated add-worker/remove-worker, fix registry to use private IP --- Makefile | 39 ++- docs/swarm-considerations.md | 13 +- docs/swarm-deploy-guide.md | 515 +++++++---------------------------- 3 files changed, 150 insertions(+), 417 deletions(-) diff --git a/Makefile b/Makefile index 86259f97..bceeafb0 100644 --- a/Makefile +++ b/Makefile @@ -71,6 +71,43 @@ swarm-status: @echo "---" @docker service ps validate_worker +# Add a worker node to the Swarm cluster +# Usage: make add-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM +# Reads SWARM_WORKER_N entries and SWARM_SSH_USER from ENV_FILE +add-worker: + @test -n "$(NAME)" || (echo "Usage: make add-worker NAME= ENV_FILE=.env.DEV_SWARM" && exit 1) + $(eval SSH_USER := $(shell grep '^SWARM_SSH_USER=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval MANAGER_IP := $(shell grep '^NFS_SERVER_IP=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval WORKER_IP := $(shell grep '^SWARM_WORKER_' 
$(ENV_FILE) | grep '$(NAME)' | head -1 | cut -d: -f2)) + @test -n "$(WORKER_IP)" || (echo "ERROR: Worker '$(NAME)' not found in $(ENV_FILE). Add it as: SWARM_WORKER_N=$(NAME):" && exit 1) + @test -n "$(MANAGER_IP)" || (echo "ERROR: NFS_SERVER_IP not set in $(ENV_FILE)" && exit 1) + @test -n "$(SSH_USER)" || (echo "ERROR: SWARM_SSH_USER not set in $(ENV_FILE)" && exit 1) + @echo "==> Installing Docker on $(NAME) ($(WORKER_IP))..." + sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "curl -fsSL https://get.docker.com | sh" + @echo "==> Configuring insecure registry ($(MANAGER_IP):5000)..." + sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) 'echo '"'"'{ "insecure-registries": ["$(MANAGER_IP):5000"] }'"'"' | sudo tee /etc/docker/daemon.json && sudo systemctl restart docker' + @echo "==> Joining Swarm..." + sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "sudo docker swarm join --token $$(sudo docker swarm join-token worker -q) $(MANAGER_IP):2377" + @echo "==> Done! Node list:" + sudo docker node ls + +# Remove a worker node from the Swarm cluster +# Usage: make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM +remove-worker: + @test -n "$(NAME)" || (echo "Usage: make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM" && exit 1) + $(eval SSH_USER := $(shell grep '^SWARM_SSH_USER=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval WORKER_IP := $(shell grep '^SWARM_WORKER_' $(ENV_FILE) | grep '$(NAME)' | head -1 | cut -d: -f2)) + @echo "==> Draining $(NAME)..." + sudo docker node update --availability drain $(NAME) + @echo "==> Leaving swarm..." + -sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "sudo docker swarm leave" + @echo "==> Waiting for node to go down..." 
+	@for i in 1 2 3 4 5 6; do sleep 5; sudo docker node ls --format '{{.Hostname}} {{.Status}}' | grep -q '$(NAME) Down' && break; echo "  waiting..."; done
+	@echo "==> Removing node..."
+	sudo docker node rm $(NAME)
+	@echo "==> Done! Don't forget to remove the SWARM_WORKER entry from $(ENV_FILE)"
+	sudo docker node ls
+
 build:
 	docker compose build \
 		--build-arg GIT_COMMIT_HASH="$$(git rev-parse --short HEAD)" \
@@ -128,7 +165,7 @@ e2e-test: start-infra
 	cd e2e && npm install && npm run install-playwright && npm run test
 
 e2e-test-report: start-infra
-	cd e2e && npm install && npm run install-playwright && npm run test:html && npm run test:report
+	cd e2e && npm install && npm run install-playwright && npm run test:html && npm run test:report
 
 BRANCH ?= main
 SUBTREES := \
diff --git a/docs/swarm-considerations.md b/docs/swarm-considerations.md
index 40e77bb0..d25ee790 100644
--- a/docs/swarm-considerations.md
+++ b/docs/swarm-considerations.md
@@ -7,6 +7,7 @@ Compiled during IVS-719 development. Grouped by category.
 - **Single-node Swarm**: tested and working (Hetzner, 2026-03-10)
 - **Multi-node Swarm**: tested and working with 2 nodes + NFS (Hetzner, 2026-03-15)
 - **Single-node Swarm on Azure DEV**: tested and working with external DB + NFS (2026-03-15)
+- **Multi-node Swarm on Azure DEV**: tested and working — manager + worker node, tasks distributed across both (2026-03-16)
 - **CI/CD**: not yet adapted for Swarm — see section 5
 - **SSL/Certbot**: not tested with a real domain yet (using `CERTBOT_DOMAIN=_` to skip)
 - **Documentation**: user-facing docs (README, deployment guide) not yet updated for Swarm workflow
@@ -241,16 +242,20 @@ Impact:
 
 ---
 
-## 10. Insecure registry required on ALL nodes
+## 10. Registry must use private IP, not localhost
 
-When using `REGISTRY=10.0.0.3:5000` (private IP) instead of `localhost:5000`, **every node** — including the manager — needs the insecure registry configured:
+**Always set `REGISTRY=<private-ip>:5000`** (e.g. 
`10.0.0.5:5000`) in the env file, never `localhost:5000`. + +Why: `localhost` resolves to the local machine. On the manager, that works. On worker nodes, `localhost:5000` points to nothing — workers can't pull images and stay at 0/N replicas with `No such image` errors. + +**Every node** (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`: ```bash -echo '{ "insecure-registries": ["10.0.0.3:5000"] }' | sudo tee /etc/docker/daemon.json +echo '{ "insecure-registries": ["10.0.0.5:5000"] }' | sudo tee /etc/docker/daemon.json sudo systemctl restart docker ``` -Without this, services get `No such image` errors and stay at 0/N replicas. +The `make add-worker` target handles this automatically for workers. For the **manager**, add it manually once during initial setup (merge with any existing `daemon.json` settings like log-driver). --- diff --git a/docs/swarm-deploy-guide.md b/docs/swarm-deploy-guide.md index 36bce9ba..f484bcf6 100644 --- a/docs/swarm-deploy-guide.md +++ b/docs/swarm-deploy-guide.md @@ -1,492 +1,183 @@ -# Swarm Operations Runbook +# Swarm Deploy Guide -Copy-paste-ready commands for every Swarm operation. Refer to [swarm-considerations.md](swarm-considerations.md) for architecture, known issues, and design decisions. +Copy-paste commands for deploying and operating the Validation Service on Docker Swarm. -Last updated: 2026-03-16 +For architecture decisions, known issues, env file strategy, and gotchas, see [swarm-considerations.md](swarm-considerations.md). --- -## Table of Contents - -1. [First-Time Setup (Manager Node)](#1-first-time-setup-manager-node) -2. [Build, Push and Deploy](#2-build-push-and-deploy) -3. [Set Up NFS (Multi-Node)](#3-set-up-nfs-multi-node) -4. [Add a Worker Node to the Swarm](#4-add-a-worker-node-to-the-swarm) -5. [Scale Workers](#5-scale-workers) -6. [Redeploy After Code Changes](#6-redeploy-after-code-changes) -7. [Monitoring and Logs](#7-monitoring-and-logs) -8. 
[Shut Down the Swarm](#8-shut-down-the-swarm) -9. [Remove a Worker Node](#9-remove-a-worker-node) -10. [Full Reset (Nuclear Option)](#10-full-reset-nuclear-option) -11. [Environment File Strategy](#11-environment-file-strategy) -12. [Quick Reference Card](#12-quick-reference-card) - ---- - -## 1. First-Time Setup (Manager Node) - -Run once per machine that will act as a Swarm manager. This covers everything: Swarm init, NFS, registry, env, build, deploy. +## Deploy ```bash -# 1a. Initialize Swarm -docker swarm init --advertise-addr - -# 1b. Create .VERSION (gitignored, required by make build) -echo "1.0.0" > .VERSION - -# 1c. Prepare the .env file -# Copy .env (committed defaults) and customize for this server: -cp .env .env.myserver # name it after the environment: .env.hetzner, .env.DEV_SWARM, .env.PROD -# Edit manually — no spaces around '='. Variables you MUST change: -# PUBLIC_URL — server URL (e.g. http://10.0.0.3 or https://validate.example.org) -# DJANGO_ALLOWED_HOSTS — space-separated hostnames/IPs that Django accepts -# DJANGO_TRUSTED_ORIGINS — space-separated origins for CSRF -# DJANGO_SECRET_KEY — generate a random key for non-dev environments -# POSTGRES_PASSWORD — use a strong password for non-dev environments -# Variables to ADD (not in the base .env, Swarm-only): -# NFS_SERVER_IP — private IP of the NFS server (e.g. 10.0.0.3) -# REGISTRY — Docker registry address (e.g. localhost:5000) -# Optional (uncomment to set): -# CERTBOT_DOMAIN — real domain for SSL (leave as _ to skip) -# CERTBOT_EMAIL — email for Let's Encrypt -# WORKER_CPU_LIMIT, WORKER_MEMORY_LIMIT, etc. — resource limits - -# 1d. Start local registry (as plain container, NOT Swarm service) -docker run -d --name registry -p 5000:5000 --restart always registry:2 -# Verify: -curl -s http://localhost:5000/v2/ # should return {} - -# 1e. 
Set up NFS on the host -apt install -y nfs-kernel-server -mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs -chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs -chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs - -cat >> /etc/exports << 'EOF' -/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) -/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) -EOF - -exportfs -ra -systemctl restart nfs-kernel-server -showmount -e localhost +# Build, push images to registry, and deploy +make swarm-push ENV_FILE= +make start-swarm-nodb ENV_FILE= # external DB (Azure DEV/PROD) +# or: make start-swarm ENV_FILE= # containerized DB (Hetzner) +# or: make start-swarm-local ENV_FILE= # local testing (no NFS, no ClamAV) -# 1f. (If migrating from docker-compose) Stop the old stack first -docker compose -f docker-compose.load_balanced.nodb.yml --env-file .env.DEV down -# Volume names differ: compose uses "validation-service_files_data", swarm uses "validate_files_data" -# Check which volumes have data: -docker system df -v | grep -A 50 "Local Volumes" - -# 1g. Copy existing data from Docker volumes to NFS -# Use the COMPOSE volume name (validation-service_*), not the swarm name (validate_*): -docker run --rm -v validation-service_files_data:/src -v /srv/nfs/files_data:/dst alpine sh -c "cp -a /src/. /dst/" -docker run --rm -v validation-service_gherkin_rules_log_data:/src -v /srv/nfs/gherkin_logs:/dst alpine sh -c "cp -a /src/. /dst/" -# Verify: -du -sh /srv/nfs/files_data /srv/nfs/gherkin_logs - -# 1h. (If migrating) Copy SSL certs to Swarm volume -# Old compose used a bind mount (docker/frontend/letsencrypt/), Swarm uses a named volume. -# Deploy first (step 1j), then copy certs into the volume and restart frontend: -# cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/ -# docker service update --force validate_frontend - -# 1i. Fetch submodules -make fetch-modules - -# 1j. 
Build, push, deploy -make swarm-push -# For external DB (Azure): make start-swarm-nodb ENV_FILE=.env.DEV_SWARM -# For containerized DB: make start-swarm ENV_FILE=.env.hetzner -make start-swarm-nodb ENV_FILE=.env.DEV_SWARM - -# 1k. Verify +# Verify — all services should reach 1/1 within ~60s watch docker service ls ``` -Adjust NFS exports CIDR to match the network (Azure VNet: `10.0.0.0/16`, Hetzner: `10.0.0.0/24` or `*`). - -See [swarm-considerations.md](swarm-considerations.md) for known issues and gotchas that can trip you up during setup (NFS volume caching, network race conditions, env file format, registry config, SSL cert migration, etc.). +## Redeploy (after code changes) ---- - -## 2. Build, Push and Deploy +No rolling updates with `latest` tags — must tear down and redeploy. ```bash -# Build, tag and push to registry (swarm-push includes build) -make swarm-push ENV_FILE= - -# Deploy — pick the right target: -# Full stack with DB container + NFS: -make start-swarm ENV_FILE= - -# External DB (e.g. Azure PostgreSQL) + NFS: -make start-swarm-nodb ENV_FILE= - -# Single-node / local testing (no NFS, no ClamAV, 1 replica each): -make start-swarm-local ENV_FILE= - -# 2b. Watch services come up (all should reach 1/1 within ~60s) +make stop-swarm +# Wait ~15s for network cleanup +make swarm-push ENV_FILE= +make start-swarm-nodb ENV_FILE= watch docker service ls - -# Verify endpoints -curl -s -o /dev/null -w "%{http_code}" http://localhost/ # 200 -curl -s -o /dev/null -w "%{http_code}" http://localhost/api/ # 302 -curl -s -o /dev/null -w "%{http_code}" http://localhost/admin/ # 302 - -# 2c. 
(Optional) Set resource limits on workers -make set-worker-limits CPU=2 MEM=2G # limits only -make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations -``` - -**What `start-swarm` vs `start-swarm-nodb` vs `start-swarm-local` does:** - -| | `start-swarm` | `start-swarm-nodb` | `start-swarm-local` | -|---|---|---|---| -| Compose file | `swarm.yml` | `swarm.nodb.yml` | `swarm.yml` + `swarm.local.yml` | -| Database | Containerized PostgreSQL | External (e.g. Azure) | Containerized PostgreSQL | -| Volumes | NFS | NFS | Plain local volumes | -| ClamAV | Runs | Runs | Skipped | -| Replicas | backend: 2, worker: 2 | backend: 2, worker: 2 | All 1 | -| Use case | Hetzner, self-hosted | DEV/PROD (Azure DB) | Quick local testing | - ---- - -## 3. Set Up NFS (Multi-Node) - -Required before adding worker nodes. Workers need shared access to uploaded IFC files and gherkin logs. - -### 3a. On the NFS server (typically the manager node) - -```bash -# Install NFS -apt install -y nfs-kernel-server - -# Create export directories -mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs -chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs -chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs - -# Configure exports -cat >> /etc/exports << 'EOF' -/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) -/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) -EOF - -exportfs -ra -systemctl restart nfs-kernel-server - -# Verify -showmount -e localhost -``` - -### 3b. Copy existing data to NFS (if migrating from local volumes) - -```bash -# Copy files_data -docker run --rm \ - -v validate_files_data:/src \ - -v /srv/nfs/files_data:/dst \ - alpine sh -c "cp -a /src/. /dst/" - -# Copy gherkin_logs -docker run --rm \ - -v validate_gherkin_rules_log_data:/src \ - -v /srv/nfs/gherkin_logs:/dst \ - alpine sh -c "cp -a /src/. 
/dst/" - -# Verify -ls -la /srv/nfs/files_data/ -ls -la /srv/nfs/gherkin_logs/ -``` - -**Note:** If migrating from Docker Compose, the volume names may be prefixed differently (e.g. `validation-service_files_data` instead of `validate_files_data`). Check with `docker volume ls`. - -### 3c. Set NFS_SERVER_IP in the env file - -```bash -# In your env file (.env.hetzner, .env.DEV_SWARM, .env.PROD, etc.): -NFS_SERVER_IP= -# e.g. on Hetzner test server this was 10.0.0.3 — check your actual network with: hostname -I ``` -The `docker-compose.swarm.yml` uses this in the NFS volume driver options. - -### 3d. Redeploy with NFS volumes - +To force-restart a single service (same image, same env): ```bash -# Tear down existing stack (uses local volumes) -make stop-swarm - -# Wait ~15 seconds for cleanup, then redeploy with NFS -make start-swarm ENV_FILE= - -# Verify NFS volumes are mounted -docker volume inspect validate_files_data -# Should show Type: nfs in Options -``` - ---- - -## 4. Add a Worker Node to the Swarm - -### 4a. On the manager — get join token - -```bash -docker swarm join-token worker -# Outputs: docker swarm join --token SWMTKN-... :2377 +docker service update --force validate_backend ``` -### 4b. On the new worker node — prerequisites - -```bash -# Install Docker -curl -fsSL https://get.docker.com | sh - -# Install NFS client (needed for NFS volumes) -apt install -y nfs-common +## Add / Remove Worker Nodes -# Verify NFS is reachable -mount -t nfs4 :/srv/nfs/files_data /mnt && ls /mnt && umount /mnt +### Prerequisites -# Configure insecure registry (if using private registry over HTTP) -echo '{ "insecure-registries": [":5000"] }' | sudo tee /etc/docker/daemon.json -sudo systemctl restart docker -``` +1. Worker VM must be in the same VNet/subnet as the manager +2. Manager's SSH key must be on the worker (`~/.ssh/authorized_keys`). On Azure, use Portal > "Reset password > Add SSH public key" +3. 
Register the worker in the env file: + ``` + SWARM_WORKER_1=dev-vm-worker-1:10.0.0.4 + ``` -### 4c. Join the swarm +### Add ```bash -# Paste the join command from step 4a: -docker swarm join --token SWMTKN-... :2377 +# Installs Docker, configures registry, joins Swarm — all in one command +make add-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM ``` -### 4d. Verify on manager +### Remove ```bash -docker node ls -# Should show both nodes as Ready/Active -``` +# Drains tasks, leaves Swarm, removes node +make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM -### 4e. Also configure insecure registry on manager (if using private IP for registry) - -```bash -# Only needed if REGISTRY=:5000 instead of localhost:5000 -echo '{ "insecure-registries": [":5000"] }' | sudo tee /etc/docker/daemon.json -sudo systemctl restart docker +# Then: remove SWARM_WORKER_N line from env file, delete VM if temporary ``` -**Important:** When using a private IP registry (e.g. `REGISTRY=10.0.0.3:5000`), EVERY node (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`. Otherwise workers get `No such image` errors. - ---- - -## 5. 
Scale Workers +## Scale Workers ```bash -# Scale to N workers (Swarm distributes across available nodes) +# Scale to N worker containers (distributed across nodes) make scale-workers WORKERS=4 -# Check placement — see which node each worker is on +# Check which node each worker runs on docker service ps validate_worker -# Set resource limits (applied per-container, not total) -make set-worker-limits CPU=2 MEM=2G # limits only -make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations +# Set resource limits per container +make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G ``` -**Per-environment resource limits:** - -| Environment | CPU limit | Memory limit | Notes | -|---|---|---|---| -| Hetzner (8GB, no ClamAV) | 2 | 2G | Max ~2 workers | -| DEV | 1 | 1G | | -| PROD | 4 | 4G | Includes ClamAV ~1GB | - -**ClamAV RAM warning:** Each worker with ClamAV loads ~1GB of virus signatures. 4 workers + 1 scheduler = ~5GB just for ClamAV. Use the local override (skips ClamAV) on small servers, or use 16GB+ RAM. - ---- - -## 6. Redeploy After Code Changes - -There is no rolling update for `latest` tags — must tear down and redeploy. - -```bash -# 1. Stop -make stop-swarm - -# 2. Rebuild and push -make swarm-push ENV_FILE= - -# 3. Redeploy -make start-swarm ENV_FILE= - -# 4. Verify -watch docker service ls -``` +**Terminology:** A worker _node_ is a VM. Each node runs worker _replicas_ (containers). Each replica runs multiple Celery _processes_ (set by `CELERY_CONCURRENCY`, default 4). 
-**Faster alternative for single-service changes:** +## Monitoring ```bash -# Force-restart one service (uses existing image, same config — does NOT re-read .env) -docker service update --force validate_backend - -# Or rebuild and push just the backend image, then update (still same env): -make swarm-push ENV_FILE= -docker service update --image localhost:5000/validationsvc-backend:latest --force validate_backend - -# To pick up .env changes, you must redeploy (stop + start-swarm) +make swarm-status # service overview + worker placement +docker service logs -f validate_worker # follow logs (also: backend, frontend, scheduler) +docker stats --no-stream # CPU/memory per container +docker node ls # node health +journalctl -k | grep "out of memory" # check for OOM kills ``` ---- - -## 7. Monitoring and Logs +## Stop / Start ```bash -# Service overview -docker service ls - -# Detailed worker status (shows which node, current state) -make swarm-status - -# Follow logs for a service -docker service logs -f validate_frontend -docker service logs -f validate_backend -docker service logs -f validate_worker -docker service logs -f validate_scheduler -docker service logs -f validate_db - -# Resource usage (CPU/memory per container) -docker stats --no-stream - -# Check for OOM kills -journalctl -k | grep "out of memory" - -# Check node status -docker node ls - -# Inspect a specific service -docker service inspect validate_worker --pretty +make stop-swarm # removes stack, keeps volumes and Swarm membership +make start-swarm-nodb ENV_FILE= # redeploy — volumes are still there ``` ---- - -## 8. Shut Down the Swarm +## Full Reset -### Stop the stack (keeps volumes and swarm membership) +Removes everything — stack, volumes, images, Swarm. Start fresh from first-time setup. 
```bash make stop-swarm -# Equivalent to: docker stack rm validate -# Volumes are preserved — data survives restarts -``` - -### Restart after shutdown - -```bash -# Just redeploy — volumes are still there -make start-swarm ENV_FILE= +docker rm -f registry +docker volume prune -f # WARNING: deletes DB data and uploaded files +docker system prune -af +docker swarm leave --force ``` --- -## 9. Remove a Worker Node - -```bash -# On manager: drain the node first (moves tasks to other nodes) -docker node update --availability drain - -# Wait for tasks to migrate, then on the worker node: -docker swarm leave +## First-Time Setup (Manager Node) -# On manager: remove the node from the list -docker node rm -``` +One-time setup for a new manager. Once done, use the commands above for daily operations. ---- +```bash +# 1. Init Swarm +docker swarm init --advertise-addr -## 10. Full Reset (Nuclear Option) +# 2. Start local registry +docker run -d --name registry -p 5000:5000 --restart always registry:2 -Removes everything — stack, volumes, images, swarm. +# 3. Configure insecure registry (required for multi-node) +# Add "insecure-registries": [":5000"] to /etc/docker/daemon.json +# Then: sudo systemctl restart docker -```bash -# 1. Remove the stack -make stop-swarm +# 4. Set up NFS +apt install -y nfs-kernel-server +mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs +chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs +chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs -# 2. Remove registry -docker rm -f registry +cat >> /etc/exports << 'EOF' +/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash) +EOF -# 3. Remove all volumes (WARNING: deletes DB data and uploaded files!) -docker volume prune -f +exportfs -ra && systemctl restart nfs-kernel-server -# 4. Remove all images -docker system prune -af +# 5. Create .VERSION +echo "1.0.0" > .VERSION -# 5. 
Leave the swarm -docker swarm leave --force +# 6. Prepare env file — see swarm-considerations.md for env file strategy +cp .env .env.myserver # customize: PUBLIC_URL, DJANGO_ALLOWED_HOSTS, NFS_SERVER_IP, REGISTRY, etc. -# Then start fresh from section 1 +# 7. Fetch submodules, build, deploy +make fetch-modules +make swarm-push ENV_FILE= +make start-swarm-nodb ENV_FILE= ``` ---- - -## 11. Environment File Strategy +### Migrating from Docker Compose -The `.env` in the repo root is committed with safe defaults (localhost, no secrets). Each environment gets its own gitignored override. +```bash +# Stop old stack +docker compose -f docker-compose.load_balanced.nodb.yml --env-file .env.DEV down -| File | Purpose | Committed? | -|---|---|---| -| `.env` | Shared defaults for docker compose (local dev, forking) | Yes | -| `.env.hetzner` | Hetzner test server (IPs, NFS, registry) | No | -| `.env.DEV` | DEV environment (docker compose, used by CI/CD) | No | -| `.env.DEV_SWARM` | DEV Swarm deployment (external Azure DB, NFS) | No | -| `.env.PROD` | Production (real secrets, domains, SSL) | No | +# Copy data from compose volumes to NFS (volume names differ: validation-service_* vs validate_*) +docker run --rm -v validation-service_files_data:/src -v /srv/nfs/files_data:/dst alpine sh -c "cp -a /src/. /dst/" +docker run --rm -v validation-service_gherkin_rules_log_data:/src -v /srv/nfs/gherkin_logs:/dst alpine sh -c "cp -a /src/. 
/dst/" -**Deploy with:** -```bash -make start-swarm ENV_FILE=.env.hetzner # Hetzner (with DB container) -make start-swarm-nodb ENV_FILE=.env.DEV_SWARM # DEV (external Azure DB) -make start-swarm ENV_FILE=.env.PROD # PROD +# Copy SSL certs (after first deploy) +cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/ +docker service update --force validate_frontend ``` -**What changes per environment:** - -| Variable | Hetzner (test) | DEV | PROD | -|---|---|---|---| -| `DEBUG` | `True` | `True` | `False` | -| `ENV` | `Development` | `Development` | `Production` | -| `PUBLIC_URL` | `http://` | `https://dev.validate...` | `https://validate.buildingsmart.org` | -| `DJANGO_ALLOWED_HOSTS` | `localhost ` | `dev.validate...` | `validate.buildingsmart.org` | -| `CERTBOT_DOMAIN` | `_` (skip SSL) | domain | domain | -| `NFS_SERVER_IP` | `10.0.0.3` | `10.0.0.5` | per-setup | -| `REGISTRY` | `localhost:5000` | `localhost:5000` | per-setup | -| `POSTGRES_PASSWORD` | `postgres` | strong | strong | -| `DJANGO_SECRET_KEY` | insecure default | random | random | -| B2C / Mailgun | empty | real creds | real creds | - -**Env file format rules (Swarm env files only — `.env.hetzner`, `.env.DEV_SWARM`, etc.):** -- No spaces around `=` — the Makefile uses `grep '^VAR=' | cut -d= -f2-` -- No quotes around values — Docker passes them literally -- No angle bracket placeholders like `` — they get passed as literal strings - --- -## 12. 
Quick Reference Card +## Quick Reference | Task | Command | |---|---| -| Deploy (local/test) | `make start-swarm-local ENV_FILE=` | -| Deploy (with DB + NFS) | `make start-swarm ENV_FILE=` | -| Deploy (external DB + NFS) | `make start-swarm-nodb ENV_FILE=` | -| Copy SSL certs to Swarm | `cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/` | -| Restart frontend (after cert copy) | `docker service update --force validate_frontend` | -| Stop stack | `make stop-swarm` | +| Deploy (external DB) | `make start-swarm-nodb ENV_FILE=` | +| Deploy (with DB) | `make start-swarm ENV_FILE=` | +| Stop | `make stop-swarm` | +| Build + push | `make swarm-push ENV_FILE=` | | Scale workers | `make scale-workers WORKERS=4` | -| Set worker limits | `make set-worker-limits CPU=2 MEM=2G` | -| Build + push images | `make swarm-push ENV_FILE=` | -| Service status | `make swarm-status` | -| Follow logs | `docker service logs -f validate_` | -| Force-restart service | `docker service update --force validate_backend` | -| Add worker node | `docker swarm join --token SWMTKN-... :2377` | -| Drain node | `docker node update --availability drain ` | -| Remove node | `docker swarm leave` (on worker) + `docker node rm ` (on manager) | -| Check MTU | `ping -M do -s 1372 ` | - +| Set limits | `make set-worker-limits CPU=2 MEM=2G` | +| Add worker | `make add-worker NAME= ENV_FILE=` | +| Remove worker | `make remove-worker NAME= ENV_FILE=` | +| Status | `make swarm-status` | +| Logs | `docker service logs -f validate_` | +| Force-restart | `docker service update --force validate_` |