diff --git a/development/capacity-test/.env b/development/capacity-test/.env new file mode 100644 index 00000000000..d63dfe662fd --- /dev/null +++ b/development/capacity-test/.env @@ -0,0 +1,4 @@ +CORTEX_VERSION=latest +GRAFANA_VERSION=10.4.2 +PROMETHEUS_VERSION=v3.2.1 +SEAWEEDFS_VERSION=3.67 diff --git a/development/capacity-test/cortex-config.yaml b/development/capacity-test/cortex-config.yaml new file mode 100644 index 00000000000..cc66b022be0 --- /dev/null +++ b/development/capacity-test/cortex-config.yaml @@ -0,0 +1,68 @@ +# Cortex configuration for capacity testing in microservices mode. +# Runs distributor, ingester, querier, and store-gateway as separate +# containers using memberlist for ring discovery (no Consul needed). + +auth_enabled: true + +server: + grpc_server_max_recv_msg_size: 104857600 + grpc_server_max_send_msg_size: 104857600 + grpc_server_max_concurrent_streams: 1000 + +distributor: + shard_by_all_labels: true + pool: + health_check_ingesters: true + +ingester_client: + grpc_client_config: + max_recv_msg_size: 104857600 + max_send_msg_size: 104857600 + grpc_compression: gzip + +ingester: + lifecycler: + min_ready_duration: 0s + final_sleep: 0s + num_tokens: 512 + ring: + kvstore: + store: memberlist + replication_factor: 1 + +blocks_storage: + backend: s3 + s3: + endpoint: seaweedfs:8333 + region: local + bucket_name: cortex-blocks + access_key_id: any + secret_access_key: any + insecure: true + tsdb: + dir: /data/tsdb + bucket_store: + sync_dir: /data/tsdb-sync + bucket_index: + enabled: true + +compactor: + data_dir: /tmp/cortex/compactor + sharding_ring: + kvstore: + store: memberlist + +store_gateway: + sharding_enabled: true + sharding_ring: + replication_factor: 1 + wait_stability_min_duration: 0s + kvstore: + store: memberlist + +frontend_worker: + match_max_concurrent: true + +memberlist: + join_members: + - ingester:7946 diff --git a/development/capacity-test/docker-compose.yaml b/development/capacity-test/docker-compose.yaml new 
file mode 100644 index 00000000000..0c9cac5f954 --- /dev/null +++ b/development/capacity-test/docker-compose.yaml @@ -0,0 +1,118 @@ +services: + distributor: + image: quay.io/cortexproject/cortex:${CORTEX_VERSION} + command: + - -config.file=/config/cortex-config.yaml + - -target=distributor + - -server.http-listen-port=8001 + - -server.grpc-listen-port=9001 + volumes: + - ./cortex-config.yaml:/config/cortex-config.yaml:ro + depends_on: + ingester: + condition: service_started + ports: + - "8001:8001" + healthcheck: + test: wget -qO- http://127.0.0.1:8001/ready + interval: 10s + timeout: 10s + retries: 3 + restart: on-failure + ingester: + image: quay.io/cortexproject/cortex:${CORTEX_VERSION} + command: + - -config.file=/config/cortex-config.yaml + - -target=ingester + - -server.http-listen-port=8002 + - -server.grpc-listen-port=9002 + volumes: + - ./cortex-config.yaml:/config/cortex-config.yaml:ro + ports: + - "8002:8002" + healthcheck: + test: wget -qO- http://127.0.0.1:8002/ready + interval: 10s + timeout: 10s + retries: 3 + restart: on-failure + querier: + image: quay.io/cortexproject/cortex:${CORTEX_VERSION} + command: + - -config.file=/config/cortex-config.yaml + - -target=querier + - -server.http-listen-port=8003 + - -server.grpc-listen-port=9003 + volumes: + - ./cortex-config.yaml:/config/cortex-config.yaml:ro + depends_on: + ingester: + condition: service_started + ports: + - "8003:8003" + healthcheck: + test: wget -qO- http://127.0.0.1:8003/ready + interval: 10s + timeout: 10s + retries: 3 + restart: on-failure + store-gateway: + image: quay.io/cortexproject/cortex:${CORTEX_VERSION} + command: + - -config.file=/config/cortex-config.yaml + - -target=store-gateway + - -server.http-listen-port=8004 + - -server.grpc-listen-port=9004 + volumes: + - ./cortex-config.yaml:/config/cortex-config.yaml:ro + depends_on: + ingester: + condition: service_started + ports: + - "8004:8004" + healthcheck: + test: wget -qO- http://127.0.0.1:8004/ready + interval: 10s + 
timeout: 10s + retries: 3 + restart: on-failure + grafana: + image: grafana/grafana:${GRAFANA_VERSION} + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_USERS_DEFAULT_THEME=light + - GF_LOG_MODE=console + - GF_LOG_LEVEL=critical + volumes: + - ./grafana-datasource.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro + - ../../docs/getting-started/grafana-dashboard.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:ro + - ../../docs/getting-started/dashboards/:/var/lib/grafana/dashboards/:ro + ports: + - "3000:3000" + prometheus: + image: prom/prometheus:${PROMETHEUS_VERSION} + command: + - --config.file=/config/prometheus-config.yaml + volumes: + - ./prometheus-config.yaml:/config/prometheus-config.yaml:ro + ports: + - "9090:9090" + seaweedfs: + image: chrislusf/seaweedfs:${SEAWEEDFS_VERSION} + command: + - server + - -s3 + - -s3.config=/workspace/seaweedfs-config.json + ports: + - "8333:8333" + post_start: + - command: /seaweedfs-init.sh + volumes: + - ../../docs/getting-started/seaweedfs-config.json:/workspace/seaweedfs-config.json:ro + - ../../docs/getting-started/seaweedfs-init.sh:/seaweedfs-init.sh:ro + healthcheck: + test: wget -qO- http://127.0.0.1:8333/status + interval: 10s + timeout: 10s + retries: 3 diff --git a/development/capacity-test/grafana-datasource.yaml b/development/capacity-test/grafana-datasource.yaml new file mode 100644 index 00000000000..ce7ad4efc93 --- /dev/null +++ b/development/capacity-test/grafana-datasource.yaml @@ -0,0 +1,32 @@ +# Grafana datasource configuration for capacity test (microservices mode). +# Queries go through the querier service. 
+apiVersion: 1 + +datasources: + - name: Cortex (tenant-01) + type: prometheus + access: proxy + orgId: 1 + url: http://querier:8003/api/prom + jsonData: &jsonData + cacheLevel: None + httpHeaderName1: X-Scope-OrgID + httpMethod: POST + prometheusType: Cortex + prometheusVersion: 1.14.0 + timeInterval: 15s + secureJsonData: + httpHeaderValue1: tenant-01 + version: 1 + editable: true + isDefault: true + - name: Cortex (tenant-02) + type: prometheus + access: proxy + orgId: 1 + url: http://querier:8003/api/prom + jsonData: *jsonData + secureJsonData: + httpHeaderValue1: tenant-02 + version: 1 + editable: true diff --git a/development/capacity-test/prometheus-config.yaml b/development/capacity-test/prometheus-config.yaml new file mode 100644 index 00000000000..dee64fc006a --- /dev/null +++ b/development/capacity-test/prometheus-config.yaml @@ -0,0 +1,99 @@ +# Prometheus configuration for capacity testing. +# Scrapes distributor and ingester separately so we can isolate +# ingester memory/CPU metrics from other components. +# Uses 25 tenants to increase series count and reduce the impact +# of fixed overhead on per-series estimates. 
+global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + - job_name: "distributor" + static_configs: + - targets: ["distributor:8001"] + - job_name: "ingester" + static_configs: + - targets: ["ingester:8002"] + - job_name: "querier" + static_configs: + - targets: ["querier:8003"] + +remote_write: + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-01" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-02" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-03" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-04" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-05" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-06" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-07" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-08" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-09" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-10" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-11" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-12" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-13" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-14" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-15" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-16" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-17" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-18" + - url: http://distributor:8001/api/v1/push + headers: + 
X-Scope-OrgID: "tenant-19" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-20" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-21" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-22" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-23" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-24" + - url: http://distributor:8001/api/v1/push + headers: + X-Scope-OrgID: "tenant-25" diff --git a/development/capacity-test/run-test.sh b/development/capacity-test/run-test.sh new file mode 100755 index 00000000000..91b99c3521e --- /dev/null +++ b/development/capacity-test/run-test.sh @@ -0,0 +1,218 @@ +#!/usr/bin/env bash +# Capacity test for Cortex blocks storage (microservices mode). +# +# Starts a local Cortex stack with separate distributor, ingester, and +# querier containers, waits for metrics to accumulate, then queries +# Prometheus for capacity-related measurements and prints per-million-series +# estimates. Because the ingester runs as a dedicated process, memory +# measurements reflect ingester-only resource usage. +# +# Prerequisites: +# - Docker and Docker Compose +# - Build the Cortex image first: make ./cmd/cortex/.uptodate +# +# Usage: +# cd development/capacity-test +# ./run-test.sh +# +# Cleanup: +# docker compose down -v + +set -euo pipefail + +cd "$(dirname "$0")" + +WAIT_SECS=300 # 5 minutes = 20 scrape cycles at 15s + +echo "==> Starting Docker Compose stack (microservices mode)..." +docker compose up -d + +wait_for_ready() { + local name="$1" + local url="$2" + local log_service="$3" + echo "==> Waiting for ${name} to be ready..." + for i in $(seq 1 30); do + if curl -sf "${url}" >/dev/null 2>&1; then + echo "==> ${name} is ready." + return 0 + fi + if [ "$i" -eq 30 ]; then + echo "ERROR: ${name} did not become ready in time." 
+ docker compose logs "${log_service}" + exit 1 + fi + sleep 2 + done +} + +wait_for_ready "Ingester" "http://localhost:8002/ready" "ingester" +wait_for_ready "Distributor" "http://localhost:8001/ready" "distributor" +wait_for_ready "Store-gateway" "http://localhost:8004/ready" "store-gateway" +wait_for_ready "Querier" "http://localhost:8003/ready" "querier" +wait_for_ready "Prometheus" "http://localhost:9090/-/ready" "prometheus" + +echo "==> Waiting ${WAIT_SECS}s for metrics to accumulate..." +sleep "$WAIT_SECS" + +echo "==> Querying Prometheus for capacity metrics..." +echo "" + +query_prometheus() { + local query="$1" + curl -s --fail-with-body "http://localhost:9090/api/v1/query" \ + --data-urlencode "query=${query}" 2>/dev/null +} + +# Collect ingester-specific metrics (job="ingester") +INGESTER_RSS_JSON=$(query_prometheus 'process_resident_memory_bytes{job="ingester"}') +INGESTER_HEAP_JSON=$(query_prometheus 'go_memstats_alloc_bytes{job="ingester"}') +SERIES_JSON=$(query_prometheus 'cortex_ingester_memory_series{job="ingester"}') +SAMPLE_RATE_JSON=$(query_prometheus 'rate(cortex_ingester_ingested_samples_total{job="ingester"}[2m])') +INGESTER_CPU_JSON=$(query_prometheus 'rate(process_cpu_seconds_total{job="ingester"}[2m])') + +# Collect distributor CPU for distributor throughput estimates +DIST_CPU_JSON=$(query_prometheus 'rate(process_cpu_seconds_total{job="distributor"}[2m])') +DIST_SAMPLE_RATE_JSON=$(query_prometheus 'rate(cortex_distributor_received_samples_total{job="distributor"}[2m])') + +python3 - "$INGESTER_RSS_JSON" "$INGESTER_HEAP_JSON" "$SERIES_JSON" "$SAMPLE_RATE_JSON" "$INGESTER_CPU_JSON" "$DIST_CPU_JSON" "$DIST_SAMPLE_RATE_JSON" <<'PYEOF' +import json +import sys + +def extract_value(json_str, label=""): + """Extract the first numeric value from a Prometheus query result.""" + try: + data = json.loads(json_str) + results = data.get("data", {}).get("result", []) + if not results: + print(f" WARNING: No data for {label}") + return None + # Sum 
all values (e.g. multiple tenants for sample rates) + return sum(float(r["value"][1]) for r in results) + except (json.JSONDecodeError, KeyError, IndexError, ValueError) as e: + print(f" WARNING: Failed to parse {label}: {e}") + return None + +def extract_first_value(json_str, label=""): + """Extract only the first value (for process-level metrics like RSS/CPU).""" + try: + data = json.loads(json_str) + results = data.get("data", {}).get("result", []) + if not results: + print(f" WARNING: No data for {label}") + return None + return float(results[0]["value"][1]) + except (json.JSONDecodeError, KeyError, IndexError, ValueError) as e: + print(f" WARNING: Failed to parse {label}: {e}") + return None + +ingester_rss = extract_first_value(sys.argv[1], "ingester RSS") +ingester_heap = extract_first_value(sys.argv[2], "ingester heap") +series = extract_first_value(sys.argv[3], "ingester memory_series") +sample_rate = extract_value(sys.argv[4], "ingester ingested_samples rate") +ingester_cpu = extract_first_value(sys.argv[5], "ingester CPU") +dist_cpu = extract_first_value(sys.argv[6], "distributor CPU") +dist_sample_rate = extract_value(sys.argv[7], "distributor received_samples rate") + +print("=" * 60) +print(" CORTEX BLOCKS STORAGE — CAPACITY MEASUREMENTS") +print(" (microservices mode: ingester-only metrics)") +print("=" * 60) +print() + +# --- Raw measurements --- +print("--- Raw Measurements (Ingester) ---") +if ingester_rss is not None: + print(f" Ingester RSS: {ingester_rss / 1e6:.1f} MB") +if ingester_heap is not None: + print(f" Ingester Go heap: {ingester_heap / 1e6:.1f} MB") +if series is not None: + print(f" Active series: {series:.0f}") +if sample_rate is not None: + print(f" Sample ingest rate: {sample_rate:.1f} samples/sec") +if ingester_cpu is not None: + print(f" Ingester CPU: {ingester_cpu:.3f} cores ({ingester_cpu * 100:.1f}%)") +print() + +print("--- Raw Measurements (Distributor) ---") +if dist_cpu is not None: + print(f" Distributor CPU: 
{dist_cpu:.3f} cores ({dist_cpu * 100:.1f}%)") +if dist_sample_rate is not None: + print(f" Distributor sample rate: {dist_sample_rate:.1f} samples/sec") +print() + +# --- Per-million-series estimates --- +print("--- Per-Million-Series Estimates ---") +print() + +if series is not None and series > 0: + scale = 1_000_000 / series + + if ingester_rss is not None: + rss_per_m = ingester_rss * scale + print(f" Ingester RSS per 1M series: {rss_per_m / 1e9:.1f} GB") + print() + + if ingester_heap is not None: + heap_per_m = ingester_heap * scale + print(f" Ingester heap per 1M series: {heap_per_m / 1e9:.1f} GB") + print() + + # Disk space: 30KB per series with 24h retention and RF=3 (from production-tips.md) + # In this test RF=1, so scale accordingly + disk_per_series_rf3 = 30 * 1024 # 30 KB + disk_per_series_rf1 = disk_per_series_rf3 / 3 # ~10 KB for RF=1 + disk_per_m_rf1 = 1_000_000 * disk_per_series_rf1 + disk_per_m_rf3 = 1_000_000 * disk_per_series_rf3 + print(f" Ingester disk (24h retention, RF=1): {disk_per_m_rf1 / 1e9:.1f} GB per 1M series") + print(f" Ingester disk (24h retention, RF=3): {disk_per_m_rf3 / 1e9:.1f} GB per 1M series") + print(f" (from production-tips.md: ~30KB per series with RF=3)") + print() + + # Object storage: ~1.5 bytes/sample with Gorilla/TSDB compression + # At 15s scrape interval: 5760 samples/series/day + samples_per_day = 86400 / 15 # 5760 + bytes_per_sample = 1.5 + obj_per_series_day = samples_per_day * bytes_per_sample # ~8640 bytes + obj_per_m_day = 1_000_000 * obj_per_series_day + print(f" Object storage per 1M series/day: {obj_per_m_day / 1e9:.1f} GB") + print(f" (~{bytes_per_sample} bytes/sample x {samples_per_day:.0f} samples/series/day)") + print() +else: + print(" WARNING: No series data available, cannot compute per-million estimates.") + print() + +# --- CPU estimates --- +print("--- CPU Estimates ---") +if ingester_cpu is not None and sample_rate is not None and ingester_cpu > 0: + ingester_samples_per_core = sample_rate / 
ingester_cpu + print(f" Ingester samples/sec per core: {ingester_samples_per_core:.0f}") +if dist_cpu is not None and dist_sample_rate is not None and dist_cpu > 0: + dist_samples_per_core = dist_sample_rate / dist_cpu + print(f" Distributor samples/sec per core: {dist_samples_per_core:.0f}") +if (ingester_cpu is None or ingester_cpu == 0) and (dist_cpu is None or dist_cpu == 0): + print(" WARNING: Insufficient data for CPU estimates.") +print() + +# --- Caveats --- +print("--- Caveats ---") +print(" - Microservices mode: ingester RSS reflects ingester-only memory.") +if series is not None: + print(f" - Small series count ({series:.0f}) means fixed overhead dominates;") + print(f" per-series memory estimates will be higher than at scale.") +print(" - Disk and object storage numbers are calculated from production-tips.md") +print(" and TSDB compression ratios, not directly measured in this test.") +print() + +print("=" * 60) +PYEOF + +echo "" +echo "==> Test stack is running." +echo " Grafana: http://localhost:3000" +echo " Distributor: http://localhost:8001" +echo " Ingester: http://localhost:8002" +echo " Querier: http://localhost:8003" +echo " Prometheus: http://localhost:9090" +echo "" +echo " To clean up: docker compose down -v" diff --git a/docs/guides/capacity-planning.md b/docs/guides/capacity-planning.md index c7c5a2b5ed7..f98921363cf 100644 --- a/docs/guides/capacity-planning.md +++ b/docs/guides/capacity-planning.md @@ -5,9 +5,6 @@ weight: 10 slug: capacity-planning --- -_This doc is likely out of date. It should be updated for blocks storage._ - - You will want to estimate how many nodes are required, how many of each component to run, and how much storage space will be required. In practice, these will vary greatly depending on the metrics being @@ -26,11 +23,12 @@ Some key parameters are: thousands of batch jobs lasting a minute or so and capture metrics with a unique ID for each one. 
[Read how to analyse this on Prometheus](https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality). -4. How compressible the time-series data are. If a metric stays at - the same value constantly, then Cortex can compress it very well, so - 12 hours of data sampled every 15 seconds would be around 2KB. On - the other hand, if the value jumps around a lot, it might take 10KB. - There are not currently any tools available to analyse this. +4. How compressible the time-series data are. Blocks storage uses + TSDB with Gorilla compression (XOR encoding for values, delta-of-delta + for timestamps), which typically achieves ~1.5 bytes per sample. A + constant metric sampled every 15 seconds for 2 hours (~480 samples) + compresses to under 1KB, while a highly variable metric may use + ~2 bytes per sample. 5. How long you want to retain data for, e.g. 1 month or 2 years. Other parameters which can become important if you have particularly @@ -42,22 +40,31 @@ high values: Now, some rules of thumb: -1. Each million series in an ingester takes 15GB of RAM. The total number - of series in ingesters is the number of active series times the - replication factor. This is with the default of 12-hour chunks - RAM - required will reduce if you set `-ingester.max-chunk-age` lower - (trading off more back-end database I/O). +1. Each million active series in an ingester takes approximately 4-8GB + of RAM. The total number of series in ingesters is the number of active + series times the replication factor. The exact amount depends on the + number of labels, label value lengths, and whether the series are + actively being queried. There are some additional considerations for planning for ingester memory usage. 1. Memory increases during write-ahead log (WAL) replay, [See Prometheus issue #6934](https://github.com/prometheus/prometheus/issues/6934#issuecomment-726039115). 
If you do not have enough memory for WAL replay, the ingester will not be able to restart successfully without intervention. 2. Memory temporarily increases during resharding since timeseries are temporarily on both the new and old ingesters. This means you should scale up the number of ingesters before memory utilization is too high, otherwise you will not have the headroom to account for the temporary increase. -2. Each million series (including churn) consumes 15GB of chunk - storage and 4GB of index, per day (so multiply by the retention - period). -3. The distributors’ CPU utilization depends on the specific Cortex cluster +2. **Ingester disk**: With `-blocks-storage.tsdb.retention-period=24h` + (recommended), plan for approximately 30KB per in-memory series + (after replication). For example, 20M active series with replication + factor 3 requires ~1.7TB total across all ingesters. See + [blocks storage production tips](../blocks-storage/production-tips.md#ingester-disk-space) + for details. +3. **Object storage**: Each million active series produces approximately + 8-9GB of block data per day in object storage (~1.5 bytes/sample × + 5,760 samples/series/day at a 15-second scrape interval). Multiply + by your retention period for total storage. Actual compression ratios + vary with metric variability — constant metrics compress better, + highly variable metrics use more space. +4. The distributors' CPU utilization depends on the specific Cortex cluster setup, while they don't need much RAM. Typically, distributors are capable of processing between 20,000 and 100,000 samples/sec with 1 CPU core. It's also highly recommended to configure Prometheus `max_samples_per_send` to 1,000 - samples, in order to reduce the distributors’ CPU utilization given the same + samples, in order to reduce the distributors' CPU utilization given the same total samples/sec throughput. 
If you turn on compression between distributors and ingesters (for @@ -65,3 +72,8 @@ example, to save on inter-zone bandwidth charges at AWS/GCP), they will use significantly more CPU (approx. 100% more for distributor and 50% more for ingester). +For detailed blocks storage operational guidance, see +[blocks storage production tips](../blocks-storage/production-tips.md). + +To run a local capacity test and see these numbers for yourself, use the +setup in `development/capacity-test/`.