diff --git a/.env b/.env index fb79ac74..982cd621 100644 --- a/.env +++ b/.env @@ -1,51 +1,59 @@ -# variables in Docker Compose -DEBUG = True -ENV = Development -PUBLIC_URL = http://localhost - -# Certbot -CERTBOT_DOMAIN = _ -CERTBOT_EMAIL = - -# Django -MEDIA_ROOT = /files_storage -DJANGO_DB = postgresql -DJANGO_SECRET_KEY = django-insecure-um7-^+&jbk_=80*xcc9uf4nh$4koida7)ja&6!vb*$8@n288jk -DJANGO_ALLOWED_HOSTS = localhost -DJANGO_TRUSTED_ORIGINS = http://localhost:3000 http://localhost http://localhost:8000 -DJANGO_LOG_LEVEL = INFO -GHERKIN_LOG_FOLDER = /gherkin_logs -DJANGO_GUNICORN_WORKERS = 3 -DJANGO_GUNICORN_THREADS_PER_WORKER = 4 - -# DB -POSTGRES_HOST = db -POSTGRES_NAME = postgres -POSTGRES_USER = postgres -POSTGRES_PASSWORD = postgres -POSTGRES_PORT = 5432 - -# Worker -REDIS_PORT = 6379 -CELERY_BROKER_URL = redis://redis:6379/0 -CELERY_TASK_SOFT_TIME_LIMIT = 3600 -CELERY_TASK_TIME_LIMIT = 4000 -TASK_TIMEOUT_LIMIT = 3600 -DJANGO_DB_USER_CONTEXT = SYSTEM -DJANGO_DB_BULK_CREATE_BATCH_SIZE = 1000 -CELERY_CONCURRENCY = 4 - -# Email -MAILGUN_API_URL = -MAILGUN_API_KEY = -MAILGUN_FROM_NAME = Validation Service -MAILGUN_FROM_EMAIL = noreply@localhost -ADMIN_EMAIL = noreply@localhost -CONTACT_EMAIL = noreply@localhost - -# IAM -B2C_CLIENT_ID = -B2C_CLIENT_SECRET = -B2C_AUTHORITY = -B2C_USER_FLOW = -USE_WHITELIST = False +# variables in Docker Compose +DEBUG = True +ENV = Development +PUBLIC_URL = http://localhost + +# Certbot +CERTBOT_DOMAIN = _ +CERTBOT_EMAIL = + +# Django +MEDIA_ROOT = /files_storage +DJANGO_DB = postgresql +DJANGO_SECRET_KEY = django-insecure-um7-^+&jbk_=80*xcc9uf4nh$4koida7)ja&6!vb*$8@n288jk +DJANGO_ALLOWED_HOSTS = localhost +DJANGO_TRUSTED_ORIGINS = http://localhost:3000 http://localhost http://localhost:8000 +DJANGO_LOG_LEVEL = INFO +GHERKIN_LOG_FOLDER = /gherkin_logs +DJANGO_GUNICORN_WORKERS = 3 +DJANGO_GUNICORN_THREADS_PER_WORKER = 4 + +# DB +POSTGRES_HOST = db +POSTGRES_NAME = postgres +POSTGRES_USER = postgres +POSTGRES_PASSWORD = 
postgres +POSTGRES_PORT = 5432 + +# Worker +REDIS_PORT = 6379 +CELERY_BROKER_URL = redis://redis:6379/0 +CELERY_TASK_SOFT_TIME_LIMIT = 3600 +CELERY_TASK_TIME_LIMIT = 4000 +TASK_TIMEOUT_LIMIT = 3600 +DJANGO_DB_USER_CONTEXT = SYSTEM +DJANGO_DB_BULK_CREATE_BATCH_SIZE = 1000 +CELERY_CONCURRENCY = 4 + +# Email +MAILGUN_API_URL = +MAILGUN_API_KEY = +MAILGUN_FROM_NAME = Validation Service +MAILGUN_FROM_EMAIL = noreply@localhost +ADMIN_EMAIL = noreply@localhost +CONTACT_EMAIL = noreply@localhost + +# IAM +B2C_CLIENT_ID = +B2C_CLIENT_SECRET = +B2C_AUTHORITY = +B2C_USER_FLOW = +USE_WHITELIST = False + +# Swarm (ignored by docker compose) +# REGISTRY=localhost:5000 +# NFS_SERVER_IP=10.0.0.1 +# WORKER_CPU_LIMIT=2.0 +# WORKER_CPU_RESERVATION=1.0 +# WORKER_MEMORY_LIMIT=2G +# WORKER_MEMORY_RESERVATION=1G diff --git a/Makefile b/Makefile index 0163120f..bceeafb0 100644 --- a/Makefile +++ b/Makefile @@ -26,6 +26,88 @@ start-infra-only: stop: docker compose down +# --- Docker Swarm --- + +REGISTRY ?= localhost:5000 +WORKERS ?= 2 +ENV_FILE ?= .env +SWARM_VARS = REGISTRY CERTBOT_DOMAIN CERTBOT_EMAIL NFS_SERVER_IP WORKER_CPU_LIMIT WORKER_MEMORY_LIMIT WORKER_CPU_RESERVATION WORKER_MEMORY_RESERVATION +SWARM_ENV = ENV_FILE="$(ENV_FILE)" $(foreach v,$(SWARM_VARS),$(v)="$(shell grep '^$(v)=' $(ENV_FILE) | head -1 | cut -d= -f2-)") + +start-swarm: + env $(SWARM_ENV) envsubst < docker-compose.swarm.yml | docker stack deploy -c - --with-registry-auth validate + +start-swarm-nodb: + env $(SWARM_ENV) envsubst < docker-compose.swarm.nodb.yml | docker stack deploy -c - --with-registry-auth validate + +start-swarm-local: + env $(SWARM_ENV) envsubst < docker-compose.swarm.yml > /tmp/_swarm.yml && \ + env $(SWARM_ENV) envsubst < docker-compose.swarm.local.yml > /tmp/_swarm.local.yml && \ + docker stack deploy -c /tmp/_swarm.yml -c /tmp/_swarm.local.yml --with-registry-auth validate && \ + rm -f /tmp/_swarm.yml /tmp/_swarm.local.yml + +stop-swarm: + docker stack rm validate + +scale-workers: + docker 
service scale validate_worker=$(WORKERS) + +set-worker-limits: + docker service update \ + $(if $(CPU),--limit-cpu $(CPU)) \ + $(if $(MEM),--limit-memory $(MEM)) \ + $(if $(CPU_RES),--reserve-cpu $(CPU_RES)) \ + $(if $(MEM_RES),--reserve-memory $(MEM_RES)) \ + validate_worker + +swarm-push: build + docker tag buildingsmart/validationsvc-backend $(REGISTRY)/validationsvc-backend + docker tag buildingsmart/validationsvc-frontend $(REGISTRY)/validationsvc-frontend + docker push $(REGISTRY)/validationsvc-backend + docker push $(REGISTRY)/validationsvc-frontend + +swarm-status: + @docker service ls + @echo "---" + @docker service ps validate_worker + +# Add a worker node to the Swarm cluster +# Usage: make add-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM +# Reads SWARM_WORKER_N entries and SWARM_SSH_USER from ENV_FILE +add-worker: + @test -n "$(NAME)" || (echo "Usage: make add-worker NAME= ENV_FILE=.env.DEV_SWARM" && exit 1) + $(eval SSH_USER := $(shell grep '^SWARM_SSH_USER=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval MANAGER_IP := $(shell grep '^NFS_SERVER_IP=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval WORKER_IP := $(shell grep '^SWARM_WORKER_' $(ENV_FILE) | grep '$(NAME)' | head -1 | cut -d: -f2)) + @test -n "$(WORKER_IP)" || (echo "ERROR: Worker '$(NAME)' not found in $(ENV_FILE). Add it as: SWARM_WORKER_N=$(NAME):" && exit 1) + @test -n "$(MANAGER_IP)" || (echo "ERROR: NFS_SERVER_IP not set in $(ENV_FILE)" && exit 1) + @test -n "$(SSH_USER)" || (echo "ERROR: SWARM_SSH_USER not set in $(ENV_FILE)" && exit 1) + @echo "==> Installing Docker on $(NAME) ($(WORKER_IP))..." + sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "curl -fsSL https://get.docker.com | sh" + @echo "==> Configuring insecure registry ($(MANAGER_IP):5000)..." 
+ sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) 'echo '"'"'{ "insecure-registries": ["$(MANAGER_IP):5000"] }'"'"' | sudo tee /etc/docker/daemon.json && sudo systemctl restart docker' + @echo "==> Joining Swarm..." + sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "sudo docker swarm join --token $$(sudo docker swarm join-token worker -q) $(MANAGER_IP):2377" + @echo "==> Done! Node list:" + sudo docker node ls + +# Remove a worker node from the Swarm cluster +# Usage: make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM +remove-worker: + @test -n "$(NAME)" || (echo "Usage: make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM" && exit 1) + $(eval SSH_USER := $(shell grep '^SWARM_SSH_USER=' $(ENV_FILE) | head -1 | cut -d= -f2-)) + $(eval WORKER_IP := $(shell grep '^SWARM_WORKER_' $(ENV_FILE) | grep '$(NAME)' | head -1 | cut -d: -f2)) + @echo "==> Draining $(NAME)..." + sudo docker node update --availability drain $(NAME) + @echo "==> Leaving swarm..." + -sudo -u $(SSH_USER) ssh -o StrictHostKeyChecking=no $(SSH_USER)@$(WORKER_IP) "sudo docker swarm leave" + @echo "==> Waiting for node to go down..." + @for i in 1 2 3 4 5 6; do sleep 5; sudo docker node ls --format '{{.Hostname}} {{.Status}}' | grep -q '$(NAME) Down' && break; echo " waiting..."; done + @echo "==> Removing node..." + sudo docker node rm $(NAME) + @echo "==> Done! 
Don't forget to remove the SWARM_WORKER entry from $(ENV_FILE)" + sudo docker node ls + build: docker compose build \ --build-arg GIT_COMMIT_HASH="$$(git rev-parse --short HEAD)" \ @@ -83,7 +165,7 @@ e2e-test: start-infra cd e2e && npm install && npm run install-playwright && npm run test e2e-test-report: start-infra - cd e2e && npm install && npm run install-playwright && npm run test:html && npm run test:report + cd e2e && npm install && npm run install-playwright && npm run test:html && npm run test:report BRANCH ?= main SUBTREES := \ diff --git a/backend/apps/ifc_validation/api/v1/views.py b/backend/apps/ifc_validation/api/v1/views.py index ae7bb5e1..838f09e0 100644 --- a/backend/apps/ifc_validation/api/v1/views.py +++ b/backend/apps/ifc_validation/api/v1/views.py @@ -195,6 +195,7 @@ def post(self, request, *args, **kwargs): #file = os.path.join(MEDIA_ROOT, uploaded_file['file_name']) #uploaded_file['size'] = os.path.getsize(file) uploaded_file['size'] = file_length + f.seek(0) instance = serializer.save() # submit task for background execution diff --git a/backend/apps/ifc_validation/tasks/processing/instance_completion.py b/backend/apps/ifc_validation/tasks/processing/instance_completion.py index 2fb5b3ba..c2940e56 100644 --- a/backend/apps/ifc_validation/tasks/processing/instance_completion.py +++ b/backend/apps/ifc_validation/tasks/processing/instance_completion.py @@ -18,7 +18,7 @@ import itertools import functools - file_path, step_ids = file_path, step_ids = json.load(sys.stdin) + file_path, step_ids = json.load(sys.stdin) ifc_file = ifcopenshell.open(file_path) def filter_serializable(v): def inner(k, v): diff --git a/backend/core/settings.py b/backend/core/settings.py index 78f1e63d..90061152 100644 --- a/backend/core/settings.py +++ b/backend/core/settings.py @@ -246,8 +246,10 @@ "USER": os.environ.get("POSTGRES_USER", "postgres"), "PASSWORD": os.environ.get("POSTGRES_PASSWORD", "postgres"), "PORT": int(os.environ.get("POSTGRES_PORT", "5432")), + 
"CONN_MAX_AGE": int(os.environ.get("POSTGRES_CONN_MAX_AGE", 600)), + "CONN_HEALTH_CHECKS": True, "OPTIONS": { - "pool": True, + "pool": False, }, }, } diff --git a/docker-compose.swarm.local.yml b/docker-compose.swarm.local.yml new file mode 100644 index 00000000..c05b5c9c --- /dev/null +++ b/docker-compose.swarm.local.yml @@ -0,0 +1,67 @@ +# Override: single-node local testing (no NFS, no ClamAV, reduced replicas) +# +# Usage: +# docker stack deploy -c docker-compose.swarm.yml -c docker-compose.swarm.local.yml validate +# +# For production/NFS testing, use docker-compose.swarm.yml directly. + +services: + + frontend: + environment: + CERTBOT_DOMAIN: _ + CERTBOT_EMAIL: x + + backend: + deploy: + replicas: 1 + + worker: + entrypoint: /bin/sh + command: + - -c + - | + set -e + until cd /files_storage; do echo "Waiting for files_storage..."; done + until cd /app/backend; do echo "Waiting for server volume..."; done + while ! pg_isready -h "$$POSTGRES_HOST" -p "$$POSTGRES_PORT" -d "$$POSTGRES_NAME" -U "$$POSTGRES_USER" 2>/dev/null; do + echo "Waiting for DB..." + sleep 5 + done + echo "DB is ready. Starting worker (no ClamAV)." + rm -f /usr/bin/clamdscan /usr/bin/clamscan 2>/dev/null || true + CELERY_CONCURRENCY=$${CELERY_CONCURRENCY:-4} + echo "Celery concurrency: $$CELERY_CONCURRENCY" + celery --app=core worker --loglevel=info --concurrency $$CELERY_CONCURRENCY --task-events --hostname=worker@%n + deploy: + replicas: 1 + resources: + limits: + cpus: "2.0" + memory: 2G + reservations: + cpus: "0.5" + memory: 512M + + scheduler: + entrypoint: /bin/sh + command: + - -c + - | + set -e + until cd /files_storage; do echo "Waiting for files_storage..."; done + until cd /app/backend; do echo "Waiting for server volume..."; done + while ! pg_isready -h "$$POSTGRES_HOST" -p "$$POSTGRES_PORT" -d "$$POSTGRES_NAME" -U "$$POSTGRES_USER" 2>/dev/null; do + echo "Waiting for DB..." + sleep 5 + done + echo "DB is ready. Starting scheduler (no ClamAV)." 
+ rm -f /usr/bin/clamdscan /usr/bin/clamscan 2>/dev/null || true + CELERY_CONCURRENCY=$${CELERY_CONCURRENCY:-4} + celery --app=core worker --beat --loglevel=info --concurrency $$CELERY_CONCURRENCY --task-events --hostname=worker-beat@%n + +volumes: + files_data: + driver: local + gherkin_rules_log_data: + driver: local diff --git a/docker-compose.swarm.nodb.yml b/docker-compose.swarm.nodb.yml new file mode 100644 index 00000000..bfe1fb11 --- /dev/null +++ b/docker-compose.swarm.nodb.yml @@ -0,0 +1,137 @@ +# Docker Swarm deployment configuration — external database (no containerized PostgreSQL) +# +# Usage: +# make start-swarm-nodb ENV_FILE=.env.DEV_SWARM +# +# Same as docker-compose.swarm.yml but without the db service. +# Set POSTGRES_HOST, POSTGRES_PORT, etc. in your env file to point to the external DB. + +services: + + frontend: + image: ${REGISTRY}/validationsvc-frontend + ports: + - 80:80 + - 443:443 + environment: + CERTBOT_DOMAIN: ${CERTBOT_DOMAIN} + CERTBOT_EMAIL: ${CERTBOT_EMAIL} + volumes: + - letsencrypt_data:/etc/letsencrypt + - static_data:/app/backend/django_static + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + backend: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/server-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - static_data:/app/backend/django_static + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + expose: + - 8000 + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + healthcheck: + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/')\""] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + + worker: + image: ${REGISTRY}/validationsvc-backend 
+ entrypoint: /app/backend/worker-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 2 + # No placement constraint - workers run on any node + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + + scheduler: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/worker-beat-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + redis: + image: redis:8.4-alpine + command: redis-server --protected-mode no --bind 0.0.0.0 + expose: + - 6379 + volumes: + - redis_data:/data + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + +networks: + validate: + driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" + +volumes: + static_data: + letsencrypt_data: + redis_data: + files_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/files_data" + gherkin_rules_log_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/gherkin_logs" diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml new file mode 100644 index 00000000..ddaac915 --- /dev/null +++ b/docker-compose.swarm.yml @@ -0,0 +1,169 @@ +# Docker Swarm deployment configuration +# +# Usage: +# 1. Build and push images: make swarm-push +# 2. Deploy: make start-swarm +# 3. Scale workers: make scale-workers WORKERS=5 +# 4. Status: make swarm-status +# 5. 
Tear down: make stop-swarm +# +# Prerequisites: +# - docker swarm init +# - Local registry: docker service create --name registry --publish 5000:5000 registry:2 +# - NFS server configured (see PR description) +# +# NFS: Set NFS_SERVER_IP in .env (default: 10.0.0.1). +# For local testing without NFS, override volumes with plain named volumes. + +services: + + frontend: + image: ${REGISTRY}/validationsvc-frontend + ports: + - 80:80 + - 443:443 + environment: + CERTBOT_DOMAIN: ${CERTBOT_DOMAIN} + CERTBOT_EMAIL: ${CERTBOT_EMAIL} + volumes: + - letsencrypt_data:/etc/letsencrypt + - static_data:/app/backend/django_static + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + backend: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/server-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - static_data:/app/backend/django_static + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + expose: + - 8000 + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + healthcheck: + test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8000/api/')\""] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + + worker: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/worker-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 2 + # No placement constraint - workers run on any node + restart_policy: + condition: on-failure + delay: 5s + update_config: + parallelism: 1 + delay: 30s + failure_action: rollback + # Resource limits applied post-deploy via: + # docker service update --limit-cpu 2 
--limit-memory 2G validate_worker + + scheduler: + image: ${REGISTRY}/validationsvc-backend + entrypoint: /app/backend/worker-beat-entrypoint.sh + env_file: ${ENV_FILE} + volumes: + - files_data:/files_storage + - gherkin_rules_log_data:/gherkin_logs + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + redis: + image: redis:8.4-alpine + command: redis-server --protected-mode no --bind 0.0.0.0 + expose: + - 6379 + volumes: + - redis_data:/data + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: on-failure + delay: 5s + + db: + image: postgres:16.10-alpine + environment: + POSTGRES_DB: ${POSTGRES_NAME} + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + expose: + - 5432 + volumes: + - postgres_data:/var/lib/postgresql/data/ + networks: + - validate + deploy: + replicas: 1 + placement: + constraints: [node.role == manager] + restart_policy: + condition: any + delay: 5s + +networks: + validate: + driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" + +volumes: + static_data: + letsencrypt_data: + postgres_data: + redis_data: + files_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/files_data" + gherkin_rules_log_data: + driver: local + driver_opts: + type: nfs + o: "addr=${NFS_SERVER_IP},nfsvers=4.1,rw,hard,timeo=600,retrans=2" + device: ":/srv/nfs/gherkin_logs" diff --git a/docker/backend/worker-beat-entrypoint.sh b/docker/backend/worker-beat-entrypoint.sh index 23c38f6e..1eeb5afa 100644 --- a/docker/backend/worker-beat-entrypoint.sh +++ b/docker/backend/worker-beat-entrypoint.sh @@ -23,9 +23,6 @@ freshclam service clamav-freshclam start service clamav-daemon start -python manage.py makemigrations -python manage.py migrate - 
CELERY_CONCURRENCY=${CELERY_CONCURRENCY:-6} # default 6 worker processes echo "Celery concurrency: $CELERY_CONCURRENCY" diff --git a/docker/backend/worker-entrypoint.sh b/docker/backend/worker-entrypoint.sh index c08929b8..cbe08319 100644 --- a/docker/backend/worker-entrypoint.sh +++ b/docker/backend/worker-entrypoint.sh @@ -23,9 +23,6 @@ freshclam service clamav-freshclam start service clamav-daemon start -python manage.py makemigrations -python manage.py migrate - CELERY_CONCURRENCY=${CELERY_CONCURRENCY:-6} # default 6 worker processes echo "Celery concurrency: $CELERY_CONCURRENCY" diff --git a/docs/swarm-considerations.md b/docs/swarm-considerations.md new file mode 100644 index 00000000..d25ee790 --- /dev/null +++ b/docs/swarm-considerations.md @@ -0,0 +1,492 @@ +# Docker Swarm — Considerations & Known Issues + +Compiled during IVS-719 development. Grouped by category. + +## Status + +- **Single-node Swarm**: tested and working (Hetzner, 2026-03-10) +- **Multi-node Swarm**: tested and working with 2 nodes + NFS (Hetzner, 2026-03-15) +- **Single-node Swarm on Azure DEV**: tested and working with external DB + NFS (2026-03-15) +- **Multi-node Swarm on Azure DEV**: tested and working — manager + worker node, tasks distributed across both (2026-03-16) +- **CI/CD**: not yet adapted for Swarm — see section 5 +- **SSL/Certbot**: not tested with a real domain yet (using `CERTBOT_DOMAIN=_` to skip) +- **Documentation**: user-facing docs (README, deployment guide) not yet updated for Swarm workflow + +--- + +# Architecture & Design + +## 1. Architecture overview + +Every worker needs access to `/files_storage` (uploaded IFC files) and `/gherkin_logs`. In Docker Compose, these are local volumes on one machine. In Swarm, workers run on **different machines** — so files must be shared via NFS. 
+ +``` + ┌─────────┐ + │ Frontend │ (Nginx + React) + │ :80/443 │ + └────┬─────┘ + │ + ┌────▼─────┐ + │ Backend │ (Django API — manager node) + │ :8000 │ + └────┬─────┘ + │ enqueues tasks + ┌────▼─────┐ + │ Redis │ (Celery broker — manager node) + │ :6379 │ + └────┬─────┘ + │ workers consume via overlay network + ┌─────────┼──────────┐ + │ │ │ + ┌────▼───┐ ┌───▼────┐ ┌──▼─────┐ + │Worker 1│ │Worker 2│ │Worker N│ (any node in swarm) + └────┬───┘ └───┬────┘ └──┬─────┘ + │ │ │ + │ NFS mount │ + └─────────┼──────────┘ + ┌────▼─────┐ + │/srv/nfs/ │ (NFS server on manager node) + │files_data│ + └──────────┘ + │ same machine + ┌────▼─────┐ + │ Postgres │ (manager node) + └──────────┘ + + ┌───────────┐ + │ Scheduler │ (1 replica, manager only) + │ --beat │ file retention: archive@90d, remove@180d + └───────────┘ +``` + +**How it works:** +- The **manager node** runs: frontend, backend, DB, Redis, scheduler, and the NFS server +- **Worker nodes** only run Celery workers — they mount NFS volumes automatically via the Docker volume driver +- The **overlay network** (Docker Swarm native) connects workers to Redis and Postgres across machines +- NFS gives workers read/write access to uploaded files as if they were local + +**If NFS goes down, all workers stall** — `hard,timeo=600` mount options mean workers will hang (not error) until NFS recovers. This is intentional: better to wait than to silently fail. + +For Azure: restrict NFS exports to VNet CIDR (e.g. `10.0.0.0/16(rw,sync,...)`), not `*`. + +--- + +## 2. Build and deploy are now separate steps + +Docker Compose: `docker compose build && docker compose up` — build and run in one flow. + +Docker Swarm: worker nodes **cannot build images**. They pull from a registry. 
+ +``` +Developer machine Registry Swarm nodes + build ──push──> localhost:5000 <──pull── worker-1, worker-2 +``` + +Workflow: +```bash +make build # build images locally +make swarm-push ENV_FILE=.env.xxx # tag + push to registry +make start-swarm ENV_FILE=.env.xxx # docker stack deploy (nodes pull from registry) +``` + +For Azure PROD, replace `localhost:5000` with Azure Container Registry (ACR). + +--- + +## 3. Worker scaling and capacity + +There is **no hard cap** on worker replicas. Scaling is manual: + +```bash +make scale-workers WORKERS=4 +``` + +**Capacity math per worker:** +- ~1GB RAM for ClamAV virus signature database +- ~2-3GB RAM for Celery tasks (depends on `CELERY_CONCURRENCY`) +- Total: **~3-4GB RAM per worker** +- Each worker runs `CELERY_CONCURRENCY` parallel tasks (default: 4 in .env.hetzner, 6 in .env) + +| Environment | Workers | Concurrency | Parallel tasks | RAM needed (workers only) | +|---|---|---|---|---| +| Hetzner (8GB) | 2 | 4 | 8 | ~6-8GB | +| DEV | 2 | 4 | 8 | ~6-8GB | +| PROD | 4+ | 6 | 24+ | ~12-16GB | + +To prevent overloading a single node, use `max_replicas_per_node` in the compose file: +```yaml +deploy: + replicas: 4 + placement: + max_replicas_per_node: 2 +``` +This forces Swarm to spread workers across at least 2 nodes. Not currently set — all replicas can land on one node if Swarm decides to. + +**Resource limits** are optional but recommended in production. Apply post-deploy: +```bash +make set-worker-limits CPU=2 MEM=2G # limits only +make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G # limits + reservations +``` + +Per-environment suggestions: +| Environment | CPU limit | Memory limit | Notes | +|---|---|---|---| +| Hetzner (8GB) | 2 | 2G | Small server, max ~2 workers | +| DEV | 1 | 1G | | +| PROD | 4 | 4G | Includes ClamAV ~1GB | + +--- + +## 4. `.env` strategy + +`.env` is committed with safe defaults (localhost, no secrets). 
Environment-specific files are gitignored via `.env.*`: + +| File | Purpose | Committed? | +|---|---|---| +| `.env` | Shared defaults for local dev / forking | Yes | +| `.env.hetzner` | Hetzner dev server (IPs, NFS, registry) | No | +| `.env.DEV` | DEV environment (docker compose, used by CI/CD) | No | +| `.env.DEV_SWARM` | DEV Swarm deployment (external Azure DB, NFS) | No | +| `.env.PROD` | Production (real secrets, domains) | No | + +Deploy with: +```bash +make start-swarm ENV_FILE=.env.hetzner # Hetzner (with DB container) +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM # DEV (external Azure DB) +``` + +The Makefile uses `envsubst` to substitute **only compose-level vars** (REGISTRY, NFS_SERVER_IP, CERTBOT_DOMAIN, etc.) from the env file into the YAML, then pipes the result to `docker stack deploy`. Container env vars are loaded by `docker stack deploy` via the `env_file:` directive directly. + +**Why only compose-level vars?** Earlier approaches that sourced the entire env file broke on values with special characters (`#`, `(`, spaces). The current approach extracts only the vars that `envsubst` needs (REGISTRY, CERTBOT_DOMAIN, CERTBOT_EMAIL, NFS_SERVER_IP, etc.) using `grep` + `cut` in the Makefile. + +**Env file format rules (Swarm env files only — `.env.hetzner`, `.env.DEV_SWARM`, etc.):** +- No spaces around `=` — the Makefile uses `grep '^VAR=' | cut -d= -f2-` +- No quotes around values — Docker passes them literally +- No angle bracket placeholders like `` — they get passed as literal strings + +This avoids three problems with earlier approaches: +1. **Type conversion bugs** — `docker compose config` converted ports to strings and cpus to integers, which `docker stack deploy` rejected +2. **`.env` auto-load conflict** — `docker compose config` always loads `.env` from the project directory, silently overriding values from `--env-file` +3. **Special character breakage** — sourcing the whole env file with `set -a && . 
./file` breaks on values containing `#` (comment), `(` (subshell), or unquoted spaces + +--- + +## 5. Local dev and server deploy are now different configs + +You maintain two separate compose files: +- `docker-compose.yml` — local development (single machine, local volumes, `container_name`) +- `docker-compose.swarm.yml` — Swarm deployment (overlay network, NFS volumes, `deploy:` section) +- `docker-compose.swarm.nodb.yml` — Swarm with external DB (no containerized Postgres) + +Risk: they drift apart over time (different env vars, image versions, volume configs). Mitigation: keep changes in sync during PRs. + +--- + +## 6. No `container_name` / `depends_on` in Swarm + +Swarm manages container naming internally (e.g. `validate_worker.1.abc123`). `depends_on` is ignored — services start simultaneously. + +Current impact: minimal — entrypoints use DNS service discovery (`redis`, `db`, `backend`) and `pg_isready` wait loops. No code changes needed. + +--- + +## 7. DNS transition strategy for PROD cutover + +To avoid downtime when switching from Docker Compose to Swarm in production, use a temporary subdomain: + +1. Deploy Swarm stack on a new server (or same server on different ports) +2. Point a temp subdomain to it (e.g. `swarm.validate.buildingsmart.org`) +3. Run both setups in parallel — existing Compose on the main domain, Swarm on the temp domain +4. Test via API (bulk uploads, concurrent validations) against the temp domain +5. Once confident, swap DNS: point the main domain to the Swarm deployment +6. Decommission the old Compose setup + +Rollback: if Swarm has issues, DNS points back to the old setup in minutes. + +For DEV: same approach, or direct cutover (lower risk since it's not user-facing). + +--- + +# Known Issues & Gotchas + +## 8. Overlay network MTU must be set to 1400 + +MTU (Maximum Transmission Unit) is the largest packet size a network link can carry — the default is 1500 bytes. Hetzner's private network uses MTU 1450. 
Docker's VXLAN overlay adds ~50 bytes of encapsulation headers to every packet, so if the underlying MTU is already ≤1500, the oversized packets get silently dropped or fragmented. Without setting the overlay MTU to 1400 (leaving headroom for the VXLAN overhead), worker nodes on different machines **cannot reach services on the manager** (DB, Redis). + +Symptom: workers stuck on `db:5432 - no response` despite DNS resolving correctly. + +Fix is in `docker-compose.swarm.yml`: +```yaml +networks: + validate: + driver: overlay + driver_opts: + com.docker.network.driver.mtu: "1400" +``` + +This applies to any cloud provider with sub-1500 MTU on internal networks. + +--- + +## 9. ClamAV runs inside every worker (~1GB RAM overhead each) + +Each worker container starts its own ClamAV daemon + freshclam (virus signature updater). This is the **same as before** — not a Swarm change. But when scaling to N workers, you get N independent ClamAV instances. + +Impact: +- ~1GB RAM per worker for virus signature database (observed during Hetzner testing — 5 instances caused OOM on 8GB server) +- Each worker independently downloads signature updates on boot +- The 4GB memory limit per worker (PROD) accounts for this: ~1GB ClamAV + ~2-3GB for Celery tasks +- The local override (`docker-compose.swarm.local.yml`) skips ClamAV entirely for testing on small servers + +4 workers with ClamAV = ~4GB just for virus DBs. + +--- + +## 10. Registry must use private IP, not localhost + +**Always set `REGISTRY=:5000`** (e.g. `10.0.0.5:5000`) in the env file, never `localhost:5000`. + +Why: `localhost` resolves to the local machine. On the manager, that works. On worker nodes, `localhost:5000` points to nothing — workers can't pull images and stay at 0/N replicas with `No such image` errors. 
+ +**Every node** (manager AND workers) needs the insecure registry configured in `/etc/docker/daemon.json`: + +```bash +echo '{ "insecure-registries": ["10.0.0.5:5000"] }' | sudo tee /etc/docker/daemon.json +sudo systemctl restart docker +``` + +The `make add-worker` target handles this automatically for workers. For the **manager**, add it manually once during initial setup (merge with any existing `daemon.json` settings like log-driver). + +--- + +## 11. DB `postmaster.pid` disappears in Swarm (containerized DB only) + +PostgreSQL starts, recovers, becomes ready — then shuts itself down because its PID file vanished: + +``` +could not open file "postmaster.pid": No such file or directory +performing immediate shutdown because data directory lock file is invalid +``` + +This is a Docker Swarm volume mount timing issue. Fix: set `restart_policy.condition: any` (not `on-failure`) on the db service so Swarm keeps restarting it until it sticks. Already applied in `docker-compose.swarm.yml`. + +--- + +## 12. Docker caches NFS volume options + +When `docker stack deploy` creates an NFS volume, the driver options (including `addr=`) are cached. If the first deploy has the wrong NFS IP (e.g. the default `10.0.0.1`), **all subsequent deploys reuse that wrong IP** — even after fixing the env file. + +Symptoms: containers stuck in "Created" state, never starting. No logs. NFS mount hangs because the IP doesn't exist. + +Fix: +```bash +docker stack rm validate +sleep 15 +docker container prune -f +docker volume rm validate_files_data validate_gherkin_rules_log_data +# If containers are stuck on hanging NFS mount: +systemctl restart docker +# Then redeploy +make start-swarm-nodb ENV_FILE=.env.DEV_SWARM +``` + +Verify volume has correct IP after deploy: `docker volume inspect validate_files_data` + +--- + +## 13. 
File upload: `f.seek(0)` after measuring size + +In `views.py`, the upload handler seeks to the end of the file to measure its size (`f.seek(0, 2)` + `f.tell()`), then must rewind (`f.seek(0)`) before `serializer.save()`. Without the rewind, Django saves a 0-byte file because the file pointer is at the end. + +This may only manifest with NFS-backed storage where buffering behaviour differs from local volumes. Commit: `012776c` + +--- + +## 14. `determine_aggregate_status()` masks silent failures + +When a validation task produces zero outcomes (e.g. subprocess crashed, worker OOM, NFS hang), the status defaults to VALID (`models.py:1297` — `# assume valid if no outcomes - TODO: is this correct?`). This pre-dates Swarm but becomes more visible when workers crash/restart across nodes. + +**Why we can't just return INVALID:** Marking a file as invalid has real consequences — vendors have to investigate and fix it. Returning INVALID for a crashed task would create false negatives. The actual problem is **silent failure** — a task fails completely and nobody notices because it looks like it passed. + +**What should happen instead:** When zero outcomes are produced, the system should alert developers (e.g. log an error, send a notification, or set a distinct status like `ERROR` or `INCONCLUSIVE`) rather than silently defaulting to VALID. The file should be flagged for re-validation, not marked as valid or invalid. + +Not blocking for Swarm, but worth a follow-up fix. + +--- + +## 15. DB connection pooling: stale connections on overlay network + +Django's `"pool": True` (psycopg3 connection pool) keeps DB connections open for reuse. The Swarm overlay network drops idle TCP connections after ~13 minutes. When the pool hands out a dead connection, Django raises: + +``` +OperationalError: consuming input failed: server closed the connection unexpectedly +``` + +**Fix** (in `backend/core/settings.py`): +- `"pool": False` — disable psycopg3's built-in connection pool. 
`CONN_HEALTH_CHECKS` alone is not sufficient because the pool can hand out a stale connection after the health check passes but before it reaches the query. +- `CONN_HEALTH_CHECKS = True` — Django pings the connection before using it; if dead, it reconnects transparently +- `CONN_MAX_AGE = 600` (10 min) — keeps connections open for reuse without the pool layer + +`CONN_MAX_AGE` is configurable via `POSTGRES_CONN_MAX_AGE` env var. The default of 600s works for Swarm; set to `0` to close connections after each request (safest but slower). + +DB logs showing the symptom (every ~13 min): +``` +LOG: could not receive data from client: Connection reset by peer +``` + +--- + +## 16. SSL certs: bind mount vs named volume + +Docker Compose used a bind mount for Let's Encrypt certs: `./docker/frontend/letsencrypt:/etc/letsencrypt`. Swarm uses a named volume (`validate_letsencrypt_data`). + +When migrating, certs must be manually copied into the Swarm volume: +```bash +cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/ +docker service update --force validate_frontend +``` + +Without this, HTTPS won't work and the site is only accessible via HTTP. Certbot renewal should continue to work inside the container since `CERTBOT_DOMAIN` is set. + +--- + +## 17. Overlay network race condition after stack rm + +After `docker stack rm`, the overlay network cleanup is asynchronous. Redeploying too quickly causes `network validate_validate not found` errors. + +Fix: wait ~15 seconds between `docker stack rm` and `docker stack deploy`. If a ghost network persists (`docker network ls` shows it but `docker network rm` says "not found"), restart Docker: `systemctl restart docker`. + +--- + +## 18. No rolling updates for `latest` tags + +Swarm checks if the image tag has changed before pulling. Since all images use `:latest`, Swarm sees "same tag" and skips the pull — even if the image content has changed. 
+ +**Impact:** `docker service update --force` restarts containers but uses the **cached** image. To deploy new code, you must tear down and redeploy: + +```bash +make stop-swarm +make swarm-push ENV_FILE=.env.xxx +make start-swarm ENV_FILE=.env.xxx +``` + +Or force a pull for a single service: +```bash +docker service update --image localhost:5000/validationsvc-backend:latest --force validate_backend +``` + +--- + +## 19. `docker service update --force` does NOT re-read env vars + +`docker service update --force` restarts containers with the **same config** they were deployed with. It does NOT re-read the env file. If you changed `.env.DEV_SWARM` and want the changes to take effect, you must do a full redeploy: + +```bash +make stop-swarm +# wait ~15 seconds +make start-swarm ENV_FILE=.env.DEV_SWARM +``` + +--- + +## 20. VS Code port forwarding conflicts with Swarm ingress + +VS Code's SSH tunnel sometimes conflicts with Swarm's ingress routing (IPv6 issues). Accessing `localhost:80` via VS Code's forwarded port may not work. + +**Workaround:** Use the server's public IP directly instead of localhost. + +--- + +# Maintenance + +## 21. CI/CD not yet adapted for Swarm + +The current GitHub Actions workflow (`.github/workflows/ci_cd.yml`) uses `docker compose up` for DEV and PROD deployments. It does **not** support Swarm. + +What needs to change for Swarm CI/CD: +- `docker compose up` → `make start-swarm ENV_FILE=.env.XXX` (build, push to registry, stack deploy) +- The runner/deploy target needs access to the Swarm manager (SSH or self-hosted runner on the manager node) +- Worker nodes pull images from the registry automatically — no action needed per node +- `ENV_FILE` is already a GitHub Actions variable (`${{ vars.ENV_FILE }}`) — just needs to point to the right file + +Options: +1. **Self-hosted runner on the manager node** — simplest, runner has direct access to Docker and the registry +2. 
**SSH deploy step** — GitHub-hosted runner SSHes into the manager to run make commands +3. **Separate workflow** — new workflow file for Swarm deployments, triggered manually or on specific branches + +Not blocking for merge to development — Swarm can be deployed manually until CI/CD is adapted. + +--- + +## 22. Periodic cleanup on DEV server + +> **DEV-specific** — the DEV server has a small root disk (29GB). Hetzner/PROD with larger disks are less affected but should still clean up periodically. + +Docker images, build cache, orphaned volumes, and uploaded IFC files accumulate fast. Without periodic cleanup, the disk fills up and deployments fail. + +**What accumulates:** +- Docker build cache (~2GB per full build cycle) +- Old/unused images (previous deployments) +- Orphaned volumes from CI/CD runs (e.g. `repo-clone_*` volumes from GitHub Actions) +- Uploaded IFC files in `files_data` volume (4GB+ and growing) + +**Cleanup commands:** +```bash +# Check disk usage +df -h / + +# Docker overview +docker system df + +# Remove unused images and build cache +docker builder prune -af +docker image prune -af + +# Remove orphaned volumes (CAREFUL: only removes volumes not attached to any container) +docker volume prune -f + +# List volume sizes to find large orphans +docker system df -v | grep -A 50 "Local Volumes" +``` + +**Recommendation:** Run `docker system prune -af` and `docker volume prune -f` after each major deployment cycle. Consider adding this to the CI/CD pipeline or a cron job. The `/mnt` disk (74GB ephemeral Azure temp disk) can be used for temporary storage but **data is lost on VM deallocation/resize**. + +--- + +## 23. `makemigrations` runs on every backend startup + +The `server-entrypoint.sh` runs `python manage.py makemigrations` and `python manage.py migrate` on every container start. 
This works because: +- Backend is constrained to **1 replica** on the manager node — no migration race conditions +- The generated migration files live inside the container (ephemeral) — they're not persisted + +**Risk:** If model changes exist that haven't been committed as migration files, `makemigrations` will generate them at runtime inside the container. These migrations disappear when the container restarts, potentially causing inconsistency. In production, migrations should be baked into the image at build time. + +**Decision:** Kept as-is for now. Backend is always 1 replica, and in practice all migrations are committed to git before deployment. But worth revisiting for PROD hardening. + +--- + +## 24. Historical Swarm instability + +> "unexplained crashes/corrupt state (5+ years ago) — hopefully they are gone now" + +Modern Docker Engine (24+) should be stable. Mitigations already in place: +- `CELERY_TASK_ACKS_LATE = True` — tasks stay in queue until completed +- `CELERY_TASK_REJECT_ON_WORKER_LOST = True` — crashed tasks are re-queued +- `restart_policy: condition: any` on DB (see section 11), `on-failure` on other services +- `update_config: failure_action: rollback` — bad deploys roll back + +--- + +# Local Dev Only + +## 25. Lima-specific: virtiofs + Celery prefork = errno 35 + +Celery's `prefork` pool + Lima's virtiofs read-only mounts cause `EDEADLK` deadlocks. Workaround: `--pool=solo`. + +**Not a production issue** — only affects local development on macOS with Lima. Docker containers on Linux use proper ext4/overlay2 filesystems. + +--- + +## 26. macOS NFS gotcha: `/tmp` vs `/private/tmp` + +On macOS, `/tmp` is a symlink to `/private/tmp`. NFS exports must use the real path (`/private/tmp/...`). Not relevant for Linux servers (Hetzner/Azure), but relevant for local development on macOS. 
diff --git a/docs/swarm-deploy-guide.md b/docs/swarm-deploy-guide.md
new file mode 100644
index 00000000..f484bcf6
--- /dev/null
+++ b/docs/swarm-deploy-guide.md
@@ -0,0 +1,183 @@
+# Swarm Deploy Guide
+
+Copy-paste commands for deploying and operating the Validation Service on Docker Swarm.
+
+For architecture decisions, known issues, env file strategy, and gotchas, see [swarm-considerations.md](swarm-considerations.md).
+
+---
+
+## Deploy
+
+```bash
+# Build, push images to registry, and deploy
+make swarm-push ENV_FILE=<env-file>
+make start-swarm-nodb ENV_FILE=<env-file>        # external DB (Azure DEV/PROD)
+# or: make start-swarm ENV_FILE=<env-file>       # containerized DB (Hetzner)
+# or: make start-swarm-local ENV_FILE=<env-file> # local testing (no NFS, no ClamAV)
+
+# Verify — all services should reach 1/1 within ~60s
+watch docker service ls
+```
+
+## Redeploy (after code changes)
+
+No rolling updates with `latest` tags — must tear down and redeploy.
+
+```bash
+make stop-swarm
+# Wait ~15s for network cleanup
+make swarm-push ENV_FILE=<env-file>
+make start-swarm-nodb ENV_FILE=<env-file>
+watch docker service ls
+```
+
+To force-restart a single service (same image, same env):
+```bash
+docker service update --force validate_backend
+```
+
+## Add / Remove Worker Nodes
+
+### Prerequisites
+
+1. Worker VM must be in the same VNet/subnet as the manager
+2. Manager's SSH key must be on the worker (`~/.ssh/authorized_keys`). On Azure, use Portal > "Reset password > Add SSH public key"
+3. Register the worker in the env file:
+   ```
+   SWARM_WORKER_1=dev-vm-worker-1:10.0.0.4
+   ```
+
+### Add
+
+```bash
+# Installs Docker, configures registry, joins Swarm — all in one command
+make add-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM
+```
+
+### Remove
+
+```bash
+# Drains tasks, leaves Swarm, removes node
+make remove-worker NAME=dev-vm-worker-1 ENV_FILE=.env.DEV_SWARM
+
+# Then: remove SWARM_WORKER_N line from env file, delete VM if temporary
+```
+
+## Scale Workers
+
+```bash
+# Scale to N worker containers (distributed across nodes)
+make scale-workers WORKERS=4
+
+# Check which node each worker runs on
+docker service ps validate_worker
+
+# Set resource limits per container
+make set-worker-limits CPU=2 MEM=2G CPU_RES=1 MEM_RES=1G
+```
+
+**Terminology:** A worker _node_ is a VM. Each node runs worker _replicas_ (containers). Each replica runs multiple Celery _processes_ (set by `CELERY_CONCURRENCY`, default 4).
+
+## Monitoring
+
+```bash
+make swarm-status                        # service overview + worker placement
+docker service logs -f validate_worker   # follow logs (also: backend, frontend, scheduler)
+docker stats --no-stream                 # CPU/memory per container
+docker node ls                           # node health
+journalctl -k | grep "out of memory"     # check for OOM kills
+```
+
+## Stop / Start

+```bash
+make stop-swarm                            # removes stack, keeps volumes and Swarm membership
+make start-swarm-nodb ENV_FILE=<env-file>  # redeploy — volumes are still there
+```
+
+## Full Reset
+
+Removes everything — stack, volumes, images, Swarm. Start fresh from first-time setup.
+
+```bash
+make stop-swarm
+docker rm -f registry
+docker volume prune -f   # WARNING: deletes DB data and uploaded files
+docker system prune -af
+docker swarm leave --force
+```
+
+---
+
+## First-Time Setup (Manager Node)
+
+One-time setup for a new manager. Once done, use the commands above for daily operations.
+
+```bash
+# 1. Init Swarm
+docker swarm init --advertise-addr <manager-private-ip>
+
+# 2. Start local registry
+docker run -d --name registry -p 5000:5000 --restart always registry:2
+
+# 3. Configure insecure registry (required for multi-node)
+# Add "insecure-registries": ["<manager-private-ip>:5000"] to /etc/docker/daemon.json
+# Then: sudo systemctl restart docker
+
+# 4. Set up NFS
+apt install -y nfs-kernel-server
+mkdir -p /srv/nfs/files_data /srv/nfs/gherkin_logs
+chown nobody:nogroup /srv/nfs/files_data /srv/nfs/gherkin_logs
+chmod 777 /srv/nfs/files_data /srv/nfs/gherkin_logs
+
+cat >> /etc/exports << 'EOF'
+/srv/nfs/files_data 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash)
+/srv/nfs/gherkin_logs 10.0.0.0/16(rw,sync,no_subtree_check,no_root_squash)
+EOF
+
+exportfs -ra && systemctl restart nfs-kernel-server
+
+# 5. Create .VERSION
+echo "1.0.0" > .VERSION
+
+# 6. Prepare env file — see swarm-considerations.md for env file strategy
+cp .env .env.myserver  # customize: PUBLIC_URL, DJANGO_ALLOWED_HOSTS, NFS_SERVER_IP, REGISTRY, etc.
+
+# 7. Fetch submodules, build, deploy
+make fetch-modules
+make swarm-push ENV_FILE=<env-file>
+make start-swarm-nodb ENV_FILE=<env-file>
+```
+
+### Migrating from Docker Compose
+
+```bash
+# Stop old stack
+docker compose -f docker-compose.load_balanced.nodb.yml --env-file .env.DEV down
+
+# Copy data from compose volumes to NFS (volume names differ: validation-service_* vs validate_*)
+docker run --rm -v validation-service_files_data:/src -v /srv/nfs/files_data:/dst alpine sh -c "cp -a /src/. /dst/"
+docker run --rm -v validation-service_gherkin_rules_log_data:/src -v /srv/nfs/gherkin_logs:/dst alpine sh -c "cp -a /src/. /dst/"
+
+# Copy SSL certs (after first deploy)
+cp -a docker/frontend/letsencrypt/* /var/lib/docker/volumes/validate_letsencrypt_data/_data/
+docker service update --force validate_frontend
+```
+
+---
+
+## Quick Reference
+
+| Task | Command |
+|---|---|
+| Deploy (external DB) | `make start-swarm-nodb ENV_FILE=<env-file>` |
+| Deploy (with DB) | `make start-swarm ENV_FILE=<env-file>` |
| Stop | `make stop-swarm` |
+| Build + push | `make swarm-push ENV_FILE=<env-file>` |
+| Scale workers | `make scale-workers WORKERS=4` |
+| Set limits | `make set-worker-limits CPU=2 MEM=2G` |
+| Add worker | `make add-worker NAME=<node-name> ENV_FILE=<env-file>` |
+| Remove worker | `make remove-worker NAME=<node-name> ENV_FILE=<env-file>` |
+| Status | `make swarm-status` |
+| Logs | `docker service logs -f validate_<service>` |
+| Force-restart | `docker service update --force validate_<service>` |