From cf9ae6aab249a3d9b51a5c313c11e2356ecb5ddd Mon Sep 17 00:00:00 2001
From: as51340 <andi8647@gmail.com>
Date: Thu, 26 Mar 2026 10:38:29 +0100
Subject: [PATCH] feat: Move timeout and health check frequency flags to
 run-time coordinator settings

---
 .gitignore                                    |  3 +
 .../high-availability/best-practices.mdx      | 74 +++++++++++++------
 .../how-high-availability-works.mdx           | 24 +++---
 pages/database-management/configuration.mdx   |  4 +-
 pages/fundamentals/telemetry.mdx              |  2 +-
 5 files changed, 71 insertions(+), 36 deletions(-)
diff --git a/.gitignore b/.gitignore
index ee9c0caf1..1e12526e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,6 @@ pnpm-lock.yaml
 # misc
 .DS_Store
 .vercel
+
+.claude/
+CLAUDE.md
diff --git a/pages/clustering/high-availability/best-practices.mdx b/pages/clustering/high-availability/best-practices.mdx
index 6ea252f75..228bf02b1 100644
--- a/pages/clustering/high-availability/best-practices.mdx
+++ b/pages/clustering/high-availability/best-practices.mdx
@@ -113,39 +113,39 @@ coordinator’s RPC messages.
 **Example:**
 `--management-port=10000`
 
-#### `--instance-health-check-frequency-sec`
+#### `--instance-health-check-frequency-sec` (deprecated)
 
-How often the coordinator pings data instances (default: 1 second).
-Changing is usually unnecessary.
-
-**Example:**
-`--instance-health-check-frequency-sec=1`
-
-#### `--instance-down-timeout-sec`
-
-How long to wait before marking an instance as down (default: 5 seconds).
+<Callout type="warning">
+**Deprecated in Memgraph 3.10.** This startup flag is now ignored. Use the
+`instance_health_check_frequency_sec` [coordinator runtime setting](#coordinator-runtime-settings)
+instead.
+</Callout>
 
-**Example:**
-`--instance-down-timeout-sec=5`
+#### `--instance-down-timeout-sec` (deprecated)
 
+<Callout type="warning">
+**Deprecated in Memgraph 3.10.** This startup flag is now ignored. Use the
+`instance_down_timeout_sec` [coordinator runtime setting](#coordinator-runtime-settings)
+instead.
+</Callout>
 
 ### Health check behavior
 
 Coordinator health checks follow this pattern:
 
-- A ping is sent every `--instance-health-check-frequency-sec`.
+- A ping is sent every `instance_health_check_frequency_sec` seconds.
 - An instance is marked **down** only after
-  `--instance-down-timeout-sec` elapses without a response.
+  `instance_down_timeout_sec` seconds elapse without a response.
 
 Requirements & recommendations:
 
-- `down-timeout >= health-check-frequency`
+- `instance_down_timeout_sec >= instance_health_check_frequency_sec`
 - Prefer using a multiplier:
-  **down-timeout = N × health-check-frequency**, with **N ≥ 2**
+  **instance_down_timeout_sec = N × instance_health_check_frequency_sec**, with **N ≥ 2**
 
-**Example:**
-`--instance-down-timeout-sec=5`
-`--instance-health-check-frequency-sec=1`
+**Example (defaults):**
+`instance_down_timeout_sec=5`
+`instance_health_check_frequency_sec=1`
 
 
 ## Environment variable configuration
@@ -211,9 +211,39 @@ the command line argument.
 
 ## Coordinator runtime settings
 
-There is a configuration option for specifying whether reads from the main are
-enabled. The configuration value is by default false but can be changed in
-run-time using the following query:
+Coordinator runtime settings are Raft-replicated and can be changed on a live
+cluster without downtime. Use `SET COORDINATOR SETTING` to modify a value and
+`SHOW COORDINATOR SETTINGS` to inspect all current values. Changes propagate
+automatically to every coordinator in the cluster.
+
+### `instance_health_check_frequency_sec`
+
+How often the coordinator pings data instances, in seconds.
+
+```
+SET COORDINATOR SETTING 'instance_health_check_frequency_sec' TO '1' ;
+```
+
+**Default:** `1`
+
+### `instance_down_timeout_sec`
+
+How long to wait (in seconds) before marking an instance as down. Must be
+greater than or equal to `instance_health_check_frequency_sec`.
+
+```
+SET COORDINATOR SETTING 'instance_down_timeout_sec' TO '5' ;
+```
+
+**Default:** `5`
+
+<Callout type="warning">
+**Upgrade note (3.10):** Values previously set via the
+`--instance-down-timeout-sec` and `--instance-health-check-frequency-sec`
+startup flags are **not** automatically migrated. After upgrading, the settings
+revert to their defaults (`5` and `1`, respectively). If you had customized these flags, run
+`SET COORDINATOR SETTING` queries to re-apply your values.
+</Callout>
 
 ### `enabled_reads_on_main`
 
diff --git a/pages/clustering/high-availability/how-high-availability-works.mdx b/pages/clustering/high-availability/how-high-availability-works.mdx
index 0e31c61b6..082c3928e 100644
--- a/pages/clustering/high-availability/how-high-availability-works.mdx
+++ b/pages/clustering/high-availability/how-high-availability-works.mdx
@@ -267,15 +267,17 @@ other instances and starts accepting write queries.
 ### Instance health checks
 
 The coordinator performs health checks on each instance at a fixed interval,
-configured with `--instance-health-check-frequency-sec`. An instance is not
-considered down until it has failed to respond for the full duration specified
-by `--instance-down-timeout-sec`.
+configured with the `instance_health_check_frequency_sec` coordinator setting.
+An instance is not considered down until it has failed to respond for the full
+duration specified by the `instance_down_timeout_sec` coordinator setting. Both
+settings can be changed at runtime using
+[`SET COORDINATOR SETTING`](/clustering/high-availability/best-practices#coordinator-runtime-settings).
 
 **Example**
 
-If you set:
-- `--instance-health-check-frequency-sec=1`
-- `--instance-down-timeout-sec=5`
+With the default settings:
+- `instance_health_check_frequency_sec=1`
+- `instance_down_timeout_sec=5`
 
 …the coordinator will send a health check RPC (`StateCheckRpc`) every second. An
 instance is marked as down only after **five consecutive missed responses** (5
@@ -299,9 +301,9 @@ If a **REPLICA** fails to respond:
 2. **Main instance fails to respond**
 
 If the **MAIN** instance fails to respond, two cases apply:
-- **Down for less than** `--instance-down-timeout-sec` The instance is still
+- **Down for less than** `instance_down_timeout_sec`: The instance is still
 considered alive and will rejoin as MAIN when it responds again.
-- **Down for longer than** `--instance-down-timeout-sec` The coordinator
+- **Down for longer than** `instance_down_timeout_sec`: The coordinator
 initiates the failover procedure. What the old MAIN becomes afterward depends on
 the outcome:
   - **Failover succeeds**: the old MAIN rejoins as a **REPLICA**.
@@ -627,9 +629,9 @@ Thus, only STRICT_SYNC replicas can directly impact write availability.
 When the MAIN instance becomes unavailable, the failure is handled by the leader
 coordinator using two user-configured parameters:
 
-- `--instance-health-check-frequency-sec`: how often health checks are sent
-- `--instance-down-timeout-sec`: how long an instance must remain unresponsive
-  before it is considered down
+- `instance_health_check_frequency_sec`: how often health checks are sent (configurable via [`SET COORDINATOR SETTING`](/clustering/high-availability/best-practices#coordinator-runtime-settings))
+- `instance_down_timeout_sec`: how long an instance must remain unresponsive
+  before it is considered down (configurable via [`SET COORDINATOR SETTING`](/clustering/high-availability/best-practices#coordinator-runtime-settings))
 
 Once the coordinator gathers enough evidence that the MAIN is down, it begins a
 failover procedure using a small number of RPC messages. The exact time required
diff --git a/pages/database-management/configuration.mdx b/pages/database-management/configuration.mdx
index 6fe01b68d..eb982f613 100644
--- a/pages/database-management/configuration.mdx
+++ b/pages/database-management/configuration.mdx
@@ -419,8 +419,8 @@ This section contains the list of flags that are used to configure highly availa
 | `--coordinator-id`                         | Raft server id on coordinator instance.                                                                                                                      | `[int32]` |
 | `--coordinator-port`                       | Raft server's port on coordinator instance.                                                                                                                  | `[uint32]` |
 | `--management-port`                        | Port on which replication instances receive messages from coordinator .                                                                                      | `[uint32]` |
-| `--instance-health-check-frequency-sec=1`  | The interval between two health checks that coordinator does on replication instances.                                                                       | `[uint32]` |
-| `--instance-down-timeout-sec=5            | Number of seconds that need to pass before replication instance is considered down. Must be greater or equal to the `--instance-health-check-frequency-sec`. | `[uint32]` |
+| ~~`--instance-health-check-frequency-sec`~~  | **Deprecated in 3.10.** This flag is ignored. Use `SET COORDINATOR SETTING 'instance_health_check_frequency_sec' TO '<value>'` instead. See [Coordinator runtime settings](/clustering/high-availability/best-practices#coordinator-runtime-settings). | `[uint32]` |
+| ~~`--instance-down-timeout-sec`~~            | **Deprecated in 3.10.** This flag is ignored. Use `SET COORDINATOR SETTING 'instance_down_timeout_sec' TO '<value>'` instead. See [Coordinator runtime settings](/clustering/high-availability/best-practices#coordinator-runtime-settings). | `[uint32]` |
 | `--nuraft-log-file`                        | Path to the file where NuRaft logs are saved.                                                                                                                | `[string]` |
 | `--coordinator-hostname`                   | Coordinator's instance hostname. Used only in `SHOW INSTANCES` query.                                                                                        | `[string]` |
 
diff --git a/pages/fundamentals/telemetry.mdx b/pages/fundamentals/telemetry.mdx
index 337ea3a07..86d3dae84 100644
--- a/pages/fundamentals/telemetry.mdx
+++ b/pages/fundamentals/telemetry.mdx
@@ -64,7 +64,7 @@ available, the following data will be sent to and stored on Memgraph's servers.
 **High availability cluster information:**
   - The number of strict sync, sync and asynchronous replicas (retrieved from the current main).
   - The number of coordinators in the cluster.
-  - Configuration options: `instance_down_timeout_sec`, `instance_health_check_frequency_sec`, `enabled_reads_on_main`, `sync_failover_only`.
+  - Coordinator runtime settings: `instance_down_timeout_sec`, `instance_health_check_frequency_sec`, `enabled_reads_on_main`, `sync_failover_only`.
 
 **Running environment:**
   - Whether Memgraph is running in K8s or somewhere else.