From b6c07f4a0fc7b8efe7f989b7fe46e7a0ba601fac Mon Sep 17 00:00:00 2001
From: as51340
Date: Thu, 26 Mar 2026 10:57:43 +0100
Subject: [PATCH] feat: Enhance reconciliation loop

---
 .../ha-commands-reference.mdx           | 36 ++++++++++++++---
 .../how-high-availability-works.mdx     | 40 ++++++++++++++++---
 2 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/pages/clustering/high-availability/ha-commands-reference.mdx b/pages/clustering/high-availability/ha-commands-reference.mdx
index 84fe3e3c2..73489ad6e 100644
--- a/pages/clustering/high-availability/ha-commands-reference.mdx
+++ b/pages/clustering/high-availability/ha-commands-reference.mdx
@@ -139,11 +139,16 @@ REGISTER INSTANCE instanceName ( AS ASYNC | AS STRICT_SYNC ) ? WITH CONFIG {
 {<h4 className="custom-header">
 Behavior
 </h4>}
+- The operation is first committed to the Raft log and acknowledged by a
+  majority of coordinators.
 - Coordinator connects via `management_server` to verify liveness.
 - Coordinator begins periodic health checks.
 - Instance is automatically demoted to REPLICA.
 - Replication server is started on the data instance.
-- Operation is persisted in Raft.
+- If RPCs to the data instance fail (e.g., due to a transient network issue),
+  the registration still succeeds. The [reconciliation
+  loop](/clustering/high-availability/how-high-availability-works#how-the-reconciliation-loop-works)
+  automatically retries the RPCs.
 {<h4 className="custom-header">
 Replication mode rules
 </h4>}
@@ -186,7 +191,10 @@ UNREGISTER INSTANCE instanceName;
 
 - Do **not** unregister the MAIN instance; this may corrupt cluster state.
 - A healthy MAIN must exist during the operation.
-- The instance is also removed from MAIN’s replica set.
+- The instance is removed from the Raft state first. If the RPC to unregister
+  the replica from MAIN fails, the [reconciliation
+  loop](/clustering/high-availability/how-high-availability-works#how-the-reconciliation-loop-works)
+  automatically retries the operation.
 {<h4 className="custom-header">
 Example
 </h4>}
@@ -207,13 +215,17 @@ SET INSTANCE instanceName TO MAIN;
 {<h4 className="custom-header">
 Behavior
 </h4>}
+- The promotion is first committed to the Raft log and acknowledged by a
+  majority of coordinators.
 - All other registered instances become replicas of the new MAIN.
-- Written to Raft log.
+- RPCs (`PromoteToMainRpc`, `SwapAndUpdateUUID`) are sent to data instances on
+  a best-effort basis. If they fail, the [reconciliation
+  loop](/clustering/high-availability/how-high-availability-works#how-the-reconciliation-loop-works)
+  automatically retries them.
 {<h4 className="custom-header">
 Implications
 </h4>}
 - Fails if a MAIN already exists.
-- Fails if any instance is unavailable.
 {<h4 className="custom-header">
 Example
 </h4>}
@@ -232,8 +244,14 @@ DEMOTE INSTANCE instanceName;
 {<h4 className="custom-header">
 Behavior
 </h4>}
+- The role change is first committed to the Raft log and acknowledged by a
+  majority of coordinators.
 - MAIN becomes REPLICA.
-- Written to Raft log.
+- The `DemoteMainToReplicaRpc` is sent on a best-effort basis. If it fails, the
+  [reconciliation
+  loop](/clustering/high-availability/how-high-availability-works#how-the-reconciliation-loop-works)
+  automatically retries it.
+- Returns an error if the instance is already a REPLICA.
 {<h4 className="custom-header">
 Implications
 </h4>}
@@ -307,6 +325,14 @@ SHOW REPLICATION LAG;
 
 - Useful during manual failover to evaluate risk of data loss.
 
+## Error handling
+
+If a Raft log commit fails for any cluster operation (register, unregister,
+promote, demote, add coordinator), the error message will indicate:
+
+> Writing to Raft log failed. Please retry the operation.
+
+
 ## Troubleshooting commands
 
 ### `FORCE RESET CLUSTER STATE`
diff --git a/pages/clustering/high-availability/how-high-availability-works.mdx b/pages/clustering/high-availability/how-high-availability-works.mdx
index 0e31c61b6..932e8a444 100644
--- a/pages/clustering/high-availability/how-high-availability-works.mdx
+++ b/pages/clustering/high-availability/how-high-availability-works.mdx
@@ -148,7 +148,7 @@ All of the following messages were sent by the leader coordinator.
 | `DemoteMainToReplicaRpc` | Demote a Main after failover | Sent to the old MAIN in order to demote it to REPLICA. |
 | `RegisterReplicaOnMainRpc` | Instruct Main to accept replication from a Replica | Sent to the MAIN to register a REPLICA on the MAIN. |
 | `UnregisterReplicaRpc` | Remove Replica from Main | Sent to the MAIN to unregister a REPLICA from the MAIN. |
-| `EnableWritingOnMainRpc` | Re-enable writes after Main restarts | Sent to the MAIN to enable writing on that MAIN. |
+| `EnableWritingOnMainRpc` | Re-enable writes after Main restarts (deprecated) | Kept for backward compatibility (ISSU). No longer sent by coordinators — writing is implicitly enabled on promotion. |
 | `GetDatabaseHistoriesRpc` | Gather committed transaction counts during failover | Sent to all REPLICA instances in order to select a new MAIN during the failover process. |
 | `StateCheckRpc` | Health check ping (liveness) | Sent to all data instances for a liveness check. |
 | `SwapMainUUIDRpc` | Ensure Replica tracks the correct Main | Sent to REPLICA instances to set the UUID of the MAIN they should listen to. |
@@ -225,7 +225,7 @@ in the cluster to ensure high availability, with timeouts.
 | `PromoteToMainReq` | Coordinator | Data instance | |
 | `RegisterReplicaOnMainReq` | Coordinator | Data instance | |
 | `UnregisterReplicaReq` | Coordinator | Data instance | |
-| `EnableWritingOnMainReq` | Coordinator | Data instance | |
+| `EnableWritingOnMainReq` | Coordinator | Data instance | deprecated |
 | `GetDatabaseHistoriesReq` | Coordinator | Data instance | |
 | `StateCheckReq` | Coordinator | Data instance | 5s |
 | `SwapMainUUIDReq` | Coordinator | Data instance | |
@@ -462,6 +462,36 @@ All state-changing operations are disabled on followers, including:
 
 These operations are permitted **only on the leader coordinator**.
 
+## Raft-first operations and the reconciliation loop
+
+The coordinator follows a **Raft-first** pattern for all cluster operations
+(registering, unregistering, promoting, demoting instances). This means every
+state change is first committed to the Raft log and acknowledged by a majority
+of coordinators **before** the operation returns success to the user.
+
+After the Raft commit, the coordinator sends RPCs to data instances (e.g.,
+`PromoteToMainRpc`, `DemoteMainToReplicaRpc`, `RegisterReplicaOnMainRpc`,
+`UnregisterReplicaRpc`) on a **best-effort** basis. If an RPC fails due to a
+transient network issue, the operation still succeeds from the user's
+perspective because the Raft log is the single source of truth.
+
+### How the reconciliation loop works
+
+The coordinator leader runs a periodic **reconciliation loop** that
+automatically detects and corrects discrepancies between the desired state (Raft
+log) and the actual state of data instances. Specifically:
+
+- **Missing replicas on main**: If a replica exists in the Raft state but is not
+  registered on the current main instance, the reconciliation loop sends a
+  `RegisterReplicaOnMainRpc` to the main.
+- **Stale replicas on main**: If the main instance reports a replica that no
+  longer exists in the Raft state, the reconciliation loop sends an
+  `UnregisterReplicaRpc` to remove it.
+
+This self-healing behavior means the cluster automatically recovers from
+transient RPC failures without user intervention. Users only need to retry an
+operation if the Raft commit itself fails.
+
 ## Instance restarts
 
 ### Restarting data instances
@@ -473,9 +503,9 @@ Both MAIN and REPLICA instances may fail and later restart.
   to follow. This synchronization happens automatically once the coordinator’s
   health check (“ping”) succeeds.
 
-- When the **MAIN** instance restarts, it is initially prevented from accepting
-  write operations. Writes become allowed only after the coordinator confirms
-  the instance’s state and sends an `EnableWritingOnMainRpc` message.
+- When the **MAIN** instance restarts, the coordinator confirms the instance’s
+  state through health checks. Writing is enabled once the coordinator verifies
+  the instance is healthy and confirms its role by sending a `PromoteToMainRpc`.
 
 This ensures that instances safely rejoin the cluster without causing
 inconsistencies.
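
Review note: the reconciliation loop this patch documents is, at its core, a set difference between the replica set in the Raft log and the replica set reported by MAIN. The sketch below illustrates that logic only; it is not Memgraph's C++ implementation, and the function name, the `send_rpc` callback, and the string RPC labels are all hypothetical.

```python
def reconcile(raft_replicas, main_replicas, send_rpc):
    """One pass of a reconciliation loop: diff desired state (Raft log)
    against actual state (replicas registered on MAIN) and issue
    corrective RPCs. Returns the list of actions taken this pass."""
    actions = []
    # Missing on MAIN: present in Raft state, not registered on MAIN.
    for name in sorted(raft_replicas - main_replicas):
        actions.append(("RegisterReplicaOnMainRpc", name))
    # Stale on MAIN: registered on MAIN, no longer in Raft state.
    for name in sorted(main_replicas - raft_replicas):
        actions.append(("UnregisterReplicaRpc", name))
    for rpc, name in actions:
        # Best-effort: a failed send is simply retried on the next pass.
        send_rpc(rpc, name)
    return actions
```

Because each pass recomputes the diff from scratch, a transient RPC failure needs no special handling — the discrepancy is still visible on the next pass and the RPC is sent again, which is exactly the self-healing behavior the new documentation describes.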