diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 858320b6ebb7..65167fcb1357 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -353,6 +353,8 @@ config BLKDEV_UBLK_LEGACY_OPCODES
 
 source "drivers/block/rnbd/Kconfig"
 
+source "drivers/block/brmr/Kconfig"
+
 config BLK_DEV_ZONED_LOOP
 	tristate "Zoned loopback device support"
 	depends on BLK_DEV_ZONED
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 2d8096eb8cdf..4793c9b0b383 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
 
 obj-$(CONFIG_ZRAM) += zram/
 obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
+obj-$(CONFIG_BLK_DEV_BRMR) += brmr/
 obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
 obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull/
 
diff --git a/drivers/block/brmr/Kconfig b/drivers/block/brmr/Kconfig
new file mode 100644
index 000000000000..a38d59d2c1d4
--- /dev/null
+++ b/drivers/block/brmr/Kconfig
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+config BLK_DEV_BRMR
+	bool
+
+config BLK_DEV_BRMR_CLIENT
+	tristate "Block device over RMR (BRMR) client"
+	depends on INFINIBAND_RMR_CLIENT
+	select BLK_DEV_BRMR
+	help
+	  BRMR client is a block device driver that sits on top of the
+	  RMR ULP and exposes a standard Linux block device (/dev/brmrX)
+	  backed by an RMR pool. Together with RMR it provides a
+	  single-hop replication and resynchronization solution for
+	  RDMA-connected storage clusters.
+
+	  If unsure, say N.
+
+config BLK_DEV_BRMR_SERVER
+	tristate "Block device over RMR (BRMR) server"
+	depends on INFINIBAND_RMR_SERVER
+	select BLK_DEV_BRMR
+	help
+	  BRMR server exports a local block device as the backing store
+	  for an RMR pool, so that BRMR clients can map it remotely
+	  over RDMA.
+
+	  If unsure, say N.
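A minimal sketch of a .config fragment that builds both sides as modules, assuming the RMR ULP symbols INFINIBAND_RMR_CLIENT and INFINIBAND_RMR_SERVER (which the options above depend on) are tristate; BLK_DEV_BRMR itself is selected automatically:

CONFIG_INFINIBAND_RMR_CLIENT=m
CONFIG_BLK_DEV_BRMR_CLIENT=m
CONFIG_INFINIBAND_RMR_SERVER=m
CONFIG_BLK_DEV_BRMR_SERVER=m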
diff --git a/drivers/block/brmr/Makefile b/drivers/block/brmr/Makefile new file mode 100644 index 000000000000..894ba2720557 --- /dev/null +++ b/drivers/block/brmr/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +ccflags-y := -I$(srctree)/drivers/infiniband/ulp/rtrs \ + -I$(srctree)/drivers/infiniband/ulp/rmr \ + -I$(srctree)/drivers/block/brmr + +brmr-client-y := brmr-clt.o \ + brmr-clt-sysfs.o \ + brmr-clt-reque.o \ + brmr-clt-stats.o + +brmr-server-y := brmr-srv-sysfs.o \ + brmr-srv.o + +obj-$(CONFIG_BLK_DEV_BRMR_CLIENT) += brmr-client.o +obj-$(CONFIG_BLK_DEV_BRMR_SERVER) += brmr-server.o diff --git a/drivers/block/brmr/brmr-clt-reque.c b/drivers/block/brmr/brmr-clt-reque.c new file mode 100644 index 000000000000..252661486a0a --- /dev/null +++ b/drivers/block/brmr/brmr-clt-reque.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "brmr-clt.h" +#include "rmr.h" +#include "rmr-pool.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(BRMR_VER_STRING); +MODULE_DESCRIPTION("BRMR Block Device using RMR cluster"); +MODULE_LICENSE("GPL"); + +static inline void brmr_requeue(struct brmr_queue *q) +{ + if (WARN_ON(!q->hctx)) + return; + + /* We can come here from interrupt, thus async=true */ + blk_mq_run_hw_queue(q->hctx, true); +} + +/** + * requeue implementation as used by ibnbd + */ + +void brmr_init_cpu_qlists(struct brmr_cpu_qlist __percpu *cpu_queues) +{ + unsigned int cpu; + struct brmr_cpu_qlist *cpu_q; + + for_each_possible_cpu(cpu) { + cpu_q = per_cpu_ptr(cpu_queues, cpu); + + cpu_q->cpu = cpu; + INIT_LIST_HEAD(&cpu_q->requeue_list); + spin_lock_init(&cpu_q->requeue_lock); + } +} + +/** + * brmr_get_cpu_qlist() - finds a list with HW queues to be requeued + * + * Description: + * Each CPU has a list of HW queues, which needs to be requeed. If a list + * is not empty - it is marked with a bit. This function finds first + * set bit in a bitmap and returns corresponding CPU list. + */ +static struct brmr_cpu_qlist * +brmr_get_cpu_qlist(struct brmr_clt_pool *pool, int cpu) +{ + int bit; + + /* First half */ + bit = find_next_bit(pool->cpu_queues_bm, nr_cpu_ids, cpu); + if (bit < nr_cpu_ids) { + return per_cpu_ptr(pool->cpu_queues, bit); + } else if (cpu != 0) { + /* Second half */ + bit = find_next_bit(pool->cpu_queues_bm, cpu, 0); + if (bit < cpu) + return per_cpu_ptr(pool->cpu_queues, bit); + } + + return NULL; +} + +static inline int nxt_cpu(int cpu) +{ + return (cpu + 1) % nr_cpu_ids; +} + +/** + * brmr_requeue_if_needed() - requeue if CPU queue is marked as non empty + * + * Description: + * Each CPU has it's own list of HW queues, which should be requeued. + * Function finds such list with HW queues, takes a list lock, picks up + * the first HW queue out of the list and requeues it. + * + * Return: + * True if the queue was requeued, false otherwise. + * + * Context: + * Does not matter. + */ +static inline bool brmr_requeue_if_needed(struct brmr_clt_pool *pool) +{ + struct brmr_queue *q = NULL; + struct brmr_cpu_qlist *cpu_q; + unsigned long flags; + int *cpup; + + /* + * To keep fairness and not to let other queues starve we always + * try to wake up someone else in round-robin manner. That of course + * increases latency but queues always have a chance to be executed. 
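+	 * brmr_get_cpu_qlist() below scans the CPU bitmap from the CPU after
+	 * the one served last up to nr_cpu_ids and then wraps around from 0,
+	 * so every non-empty per-CPU list eventually gets its turn.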
+ */ + cpup = get_cpu_ptr(pool->cpu_rr); + for (cpu_q = brmr_get_cpu_qlist(pool, nxt_cpu(*cpup)); cpu_q; + cpu_q = brmr_get_cpu_qlist(pool, nxt_cpu(cpu_q->cpu))) { + if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags)) + continue; + if (likely(test_bit(cpu_q->cpu, pool->cpu_queues_bm))) { + q = list_first_entry_or_null(&cpu_q->requeue_list, + typeof(*q), requeue_list); + if (WARN_ON(!q)) + goto clear_bit; + list_del_init(&q->requeue_list); + clear_bit_unlock(0, &q->in_list); + + if (list_empty(&cpu_q->requeue_list)) { + /* Clear bit if nothing is left */ +clear_bit: + clear_bit(cpu_q->cpu, pool->cpu_queues_bm); + } + } + spin_unlock_irqrestore(&cpu_q->requeue_lock, flags); + + if (q) + break; + } + + /** + * Saves the CPU that is going to be requeued on the per-cpu var. Just + * incrementing it doesn't work because brmr_get_cpu_qlist() will + * always return the first CPU with something on the queue list when the + * value stored on the var is greater than the last CPU with something + * on the list. + */ + if (cpu_q) + *cpup = cpu_q->cpu; + put_cpu_var(pool->cpu_rr); + + if (q) + brmr_requeue(q); + + return !!q; +} + +/** + * brmr_requeue_requests() - requeue all queues left in the list if + * brmr_clt_pool is idling (there are no requests in-flight). + * + * Description: + * This function tries to rerun all stopped queues if there are no + * requests in-flight anymore. This function tries to solve an obvious + * problem, when number of tags < than number of queues (hctx), which + * are stopped and put to sleep. If last tag, which has been just put, + * does not wake up all left queues (hctxs), IO requests hang forever. + * + * That can happen when all number of tags, say N, have been exhausted + * from one CPU, and we have many block devices per session, say M. + * Each block device has it's own queue (hctx) for each CPU, so eventually + * we can put that number of queues (hctxs) to sleep: M x nr_cpu_ids. + * If number of tags N < M x nr_cpu_ids finally we will get an IO hang. + * + * To avoid this hang last caller of brmr_put_iu() (last caller is the + * one who observes pool->busy == 0) must wake up all remaining queues. + * + * Context: + * Called from msg_io_conf which in turn is a completion handler + * that is called from interupt. + */ +void brmr_requeue_requests(struct brmr_clt_pool *pool) +{ + bool requeued; + + do { + requeued = brmr_requeue_if_needed(pool); + } while (atomic_read(&pool->busy) == 0 && requeued); +} + +bool brmr_add_to_requeue(struct brmr_clt_pool *pool, struct brmr_queue *q) +{ + struct brmr_cpu_qlist *cpu_q; + unsigned long flags; + bool added = true; + bool need_set; + + cpu_q = get_cpu_ptr(pool->cpu_queues); + spin_lock_irqsave(&cpu_q->requeue_lock, flags); + + if (likely(!test_and_set_bit_lock(0, &q->in_list))) { + if (WARN_ON(!list_empty(&q->requeue_list))) + goto unlock; + + need_set = !test_bit(cpu_q->cpu, pool->cpu_queues_bm); + if (need_set) { + set_bit(cpu_q->cpu, pool->cpu_queues_bm); + /* Paired with brmr_put_iu(). Set a bit first + * and then observe the busy counter. + */ + smp_mb__before_atomic(); + } + if (likely(atomic_read(&pool->busy))) { + list_add_tail(&q->requeue_list, &cpu_q->requeue_list); + } else { + /* Very unlikely, but possible: busy counter was + * observed as zero. Drop all bits and return + * false to restart the queue by ourselves. 
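+	 * The caller (brmr_queue_rq()) then restarts the hardware queue itself
+	 * via blk_mq_delay_run_hw_queue().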
+ */ + if (need_set) + clear_bit(cpu_q->cpu, pool->cpu_queues_bm); + clear_bit_unlock(0, &q->in_list); + added = false; + } + } +unlock: + spin_unlock_irqrestore(&cpu_q->requeue_lock, flags); + put_cpu_ptr(pool->cpu_queues); + + return added; +} + diff --git a/drivers/block/brmr/brmr-clt-stats.c b/drivers/block/brmr/brmr-clt-stats.c new file mode 100644 index 000000000000..de080fde779c --- /dev/null +++ b/drivers/block/brmr/brmr-clt-stats.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +//#include +//#include +//#include + +#include "brmr-clt.h" +#include "rmr.h" +#include "rmr-pool.h" + + +int brmr_clt_init_stats(struct brmr_clt_stats *stats) +{ + stats->pcpu_stats = alloc_percpu(typeof(*stats->pcpu_stats)); + if (unlikely(!stats->pcpu_stats)) + return -ENOMEM; + + return 0; +} + +void brmr_clt_free_stats(struct brmr_clt_stats *stats) +{ + free_percpu(stats->pcpu_stats); +} + +int brmr_clt_reset_submitted_req(struct brmr_clt_stats *stats, bool enable) +{ + struct brmr_stats_pcpu *s; + int cpu; + + if (unlikely(!enable)) + return -EINVAL; + + for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + memset(&s->submitted_requests, 0, + sizeof(s->submitted_requests)); + } + + return 0; +} + +int brmr_clt_reset_req_sizes(struct brmr_clt_stats *stats, bool enable) +{ + struct brmr_stats_pcpu *s; + int cpu; + + if (unlikely(!enable)) + return -EINVAL; + + for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + memset(&s->request_sizes, 0, + sizeof(s->request_sizes)); + } + + return 0; +} + +static void brmr_update_submitted_requests(struct brmr_stats_pcpu *s, + size_t size, int split, int d) +{ + s->submitted_requests.dir[d].total_sectors += (size >> SECTOR_SHIFT); + if (split) + s->submitted_requests.dir[d].cnt_split++; + else + s->submitted_requests.dir[d].cnt_whole++; +} + +#define MAX_LEN (128*1024) +#define NUM_CLASSES 16 +#define CLASSIFY_SHIFT (ilog2(MAX_LEN)-ilog2(NUM_CLASSES)) + +/** + classifies length linearly in 16 classes: + + input length in bytes + + < 0x2000 (8K) + >= 0x2000 (8K) + >= 0x4000 (16K) + >= 0x6000 (24K) + >= 0x8000 (32K) + >= 0xa000 (40K) + >= 0xc000 (48K) + >= 0xe000 (56K) + >= 0x10000 (64K) + >= 0x12000 (72K) + >= 0x14000 (80K) + >= 0x16000 (88K) + >= 0x18000 (96K) + >= 0x1a000 (104K) + >= 0x1c000 (112K) + >= 0x1e000 (120K) + + Maximum value is 128K-1. + However everything larger is classified as class 15 as well. +*/ +static inline int classify(long length) +{ + return length < MAX_LEN ? 
(length >> CLASSIFY_SHIFT) : NUM_CLASSES-1; +} + +static void brmr_update_request_sizes(struct brmr_stats_pcpu *s, + size_t size, int split, int d) +{ + int size_class = classify(size); + switch (split) { + case 0: + s->request_sizes.dir[d].cnt_whole[size_class]++; + break; + case 1: + s->request_sizes.dir[d].cnt_left[size_class]++; + break; + case 2: + s->request_sizes.dir[d].cnt_right[size_class]++; + break; + default: + WARN_ONCE(true,"unexpected value for split"); + } +} + +void brmr_update_stats(struct brmr_clt_stats *stats, size_t size, int split, int d) +{ + struct brmr_stats_pcpu *s; + + s = this_cpu_ptr(stats->pcpu_stats); + + brmr_update_submitted_requests(s, size, split, d); + brmr_update_request_sizes(s, size, split, d); +} + +ssize_t brmr_clt_stats_rq_to_str(struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_rq sum; + struct brmr_stats_rq *r; + int cpu; int d; + + memset(&sum, 0, sizeof(sum)); + + for_each_possible_cpu(cpu) { + r = &per_cpu_ptr(stats->pcpu_stats, cpu)->submitted_requests; + + for (d=READ; d<=WRITE; d++) { + sum.dir[d].cnt_whole += r->dir[d].cnt_whole; + sum.dir[d].cnt_split += r->dir[d].cnt_split; + sum.dir[d].total_sectors += r->dir[d].total_sectors; + } + } + + return scnprintf(page, len, "%llu %llu %llu %llu %llu %llu\n", + sum.dir[READ].cnt_whole, sum.dir[READ].cnt_split, + sum.dir[READ].total_sectors, + sum.dir[WRITE].cnt_whole, sum.dir[WRITE].cnt_split, + sum.dir[WRITE].total_sectors); +} + +ssize_t brmr_clt_stats_sizes_to_str(struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_sizes *sum; + struct brmr_stats_sizes *per_cpu; + int cpu; int d; int i; int cnt = 0; + + sum = kzalloc(sizeof(*sum), GFP_KERNEL); + if (unlikely(!sum)) + return -ENOMEM; + + for (i = 0; i < STATS_SIZES_NUM; i++) { + for_each_possible_cpu(cpu) { + per_cpu = &per_cpu_ptr(stats->pcpu_stats, cpu) + ->request_sizes; + + for (d=READ; d<=WRITE; d++) { + sum->dir[d].cnt_whole[i] + += per_cpu->dir[d].cnt_whole[i]; + sum->dir[d].cnt_left[i] + += per_cpu->dir[d].cnt_left[i]; + sum->dir[d].cnt_right[i] + += per_cpu->dir[d].cnt_right[i]; + } + } + } + + cnt += scnprintf(page + cnt, len - cnt, + " READ " + " whole left right " + "\n"); + if (len - cnt <= 0) + goto free_return; + + cnt += scnprintf(page + cnt, len - cnt, + "<= 8 Kbytes: %19llu %19llu %19llu\n", + sum->dir[READ].cnt_whole[0], + sum->dir[READ].cnt_left[0], + sum->dir[READ].cnt_right[0]); + + for (i = 1; i < STATS_SIZES_NUM; i++) { + + cnt += scnprintf(page + cnt, len - cnt, + "> %3d Kbytes: %19llu %19llu %19llu\n", + (i)<<3, + sum->dir[READ].cnt_whole[i], + sum->dir[READ].cnt_left[i], + sum->dir[READ].cnt_right[i]); + + if (len - cnt <= 0) + goto free_return; + } + + cnt += scnprintf(page + cnt, len - cnt, + "\n WRITE " + " whole left right " + "\n"); + if (len - cnt <= 0) + goto free_return; + + cnt += scnprintf(page + cnt, len - cnt, + "<= 8 Kbytes: %19llu %19llu %19llu\n", + sum->dir[WRITE].cnt_whole[0], + sum->dir[WRITE].cnt_left[0], + sum->dir[WRITE].cnt_right[0]); + + for (i = 1; i < STATS_SIZES_NUM; i++) { + + cnt += scnprintf(page + cnt, len - cnt, + "> %3d Kbytes: %19llu %19llu %19llu\n", + (i)<<3, + sum->dir[WRITE].cnt_whole[i], + sum->dir[WRITE].cnt_left[i], + sum->dir[WRITE].cnt_right[i]); + + if (len - cnt <= 0) + goto free_return; + } + +free_return: + kfree(sum); + + return cnt; +} + +int brmr_clt_reset_sts_resource(struct brmr_clt_stats *stats, bool enable) +{ + struct brmr_stats_pcpu *s; + int cpu; + + if (unlikely(!enable)) + return -EINVAL; + + 
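+	/*
+	 * sts_resource counts resource shortages seen on the I/O path: no free
+	 * RMR iu, or rmr_clt_request() reporting a resource error. See
+	 * brmr_clt_update_sts_resource().
+	 */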
for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + memset(&s->sts_resource, 0, + sizeof(s->sts_resource)); + } + + return 0; +} + +void brmr_clt_update_sts_resource(struct brmr_clt_stats *stats, int which) +{ + struct brmr_stats_pcpu *s; + + s = this_cpu_ptr(stats->pcpu_stats); + switch (which) { + case 0: + s->sts_resource.get_iu++; + break; + case 1: + s->sts_resource.get_iu2++; + break; + case 2: + s->sts_resource.clt_request1++; + break; + case 3: + s->sts_resource.clt_request++; + break; + default: + WARN_ONCE(true,"unexpected value for which"); + } +} + +ssize_t brmr_stats_sts_resource_to_str( + struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_sts_resource sum; + struct brmr_stats_sts_resource *r; + int cpu; + + memset(&sum, 0, sizeof(sum)); + + for_each_possible_cpu(cpu) { + r = &per_cpu_ptr(stats->pcpu_stats, cpu)->sts_resource; + + sum.get_iu += r->get_iu; + sum.get_iu2 += r->get_iu2; + sum.clt_request1 += r->clt_request1; + sum.clt_request += r->clt_request; + } + + return scnprintf(page, len, "%llu %llu %llu %llu\n", + sum.get_iu, sum.get_iu2, + sum.clt_request1, sum.clt_request); +} + +ssize_t brmr_stats_sts_resource_per_cpu_to_str( + struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_sts_resource *r; + int cpu; int cnt = 0; + + for_each_possible_cpu(cpu) { + r = &per_cpu_ptr(stats->pcpu_stats, cpu)->sts_resource; + + cnt += scnprintf(page+cnt, len, "%d %llu %llu %llu %llu\n", + cpu, r->get_iu, r->get_iu2, + r->clt_request1, r->clt_request); + if (len - cnt <= 0) + goto return_cnt; + } + +return_cnt: + return cnt; +} + diff --git a/drivers/block/brmr/brmr-clt-sysfs.c b/drivers/block/brmr/brmr-clt-sysfs.c new file mode 100644 index 000000000000..7d2435acac6a --- /dev/null +++ b/drivers/block/brmr/brmr-clt-sysfs.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "brmr-clt.h" + +static struct device *brmr_dev; +static struct class *brmr_dev_class; +static struct kobject *brmr_devs_kobj; + +enum { + BRMR_OPT_ERR = 0, + BRMR_OPT_POOL = 1 << 1, + BRMR_OPT_SIZE = 1 << 2, +}; + +static int brmr_clt_create_dev_sysfs_files(struct brmr_clt_dev *dev); +static int brmr_add_dev_symlink(struct brmr_clt_dev *dev); + +static unsigned int brmr_opt_mandatory[] = { + BRMR_OPT_POOL, +}; + +static const match_table_t brmr_opt_tokens = { + { BRMR_OPT_POOL, "pool=%s" }, + { BRMR_OPT_SIZE, "size=%s" }, + { BRMR_OPT_ERR, NULL }, +}; + +/* remove new line from string */ +static void strip(char *s) +{ + char *p = s; + + while (*s != '\0') { + if (*s != '\n') + *p++ = *s++; + else + ++s; + } + *p = '\0'; +} + +static int brmr_clt_parse_options(const char *buf, + char *pool, + unsigned long *size) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + sep_opt = strstrip(options); + strip(sep_opt); + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, brmr_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case BRMR_OPT_POOL: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("poolname too long\n"); + ret = 
-EINVAL; + kfree(p); + goto out; + } + strscpy(pool, p, NAME_MAX); + kfree(p); + break; + + case BRMR_OPT_SIZE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + /* + * The conventional semantics are that if the number begins with 0x, it will + * be parsed as hexadecimal; if it begins with 0, it will be parsed as + * octal; otherwise, it will be parsed as decimal. + */ + ret = kstrtoul(p, 0, size); + if (ret) { + pr_err("size '%s' isn't an integer: %d\n", p, ret); + kfree(p); + goto out; + } + kfree(p); + break; + + + default: + pr_err("unknown parameter or missing value" + " '%s'\n", p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(brmr_opt_mandatory); i++) { + if ((opt_mask & brmr_opt_mandatory[i])) { + ret = 0; + } else { + pr_err("parameters missing\n"); + ret = -EINVAL; + break; + } + } + +out: + kfree(options); + return ret; +} + +static ssize_t brmr_map_device_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo \"" + "pool= " + "size=\" > %s\n", + attr->attr.name); +} + +static ssize_t brmr_map_device_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_clt_dev *dev; + char pool[NAME_MAX]; + unsigned long size = 0; + int ret; + + ret = brmr_clt_parse_options(buf, pool, &size); + if (ret) + goto err; + + dev = find_and_get_device(pool); + if (dev) { + pr_err("Device exists and opened as %s\n", + dev->gd->disk_name); + brmr_clt_put_dev(dev); + ret = -EEXIST; + goto err; + } + + dev = brmr_clt_map_device(pool, size); + if (IS_ERR(dev)) { + pr_err("Error mapping device to pool %s\n", pool); + ret = PTR_ERR(dev); + goto err; + } + ret = brmr_clt_create_dev_sysfs_files(dev); + if (ret) + goto close_device; + + ret = brmr_add_dev_symlink(dev); + if (ret) + goto destroy_sysfs; + + return count; + +destroy_sysfs: + sysfs_remove_link(&dev->kobj, BRMR_LINK_NAME); + brmr_clt_destroy_dev_sysfs_files(dev, NULL); +close_device: + brmr_clt_close_device(dev, NULL); +err: + return ret; +} + +static struct kobj_attribute brmr_map_device_attr = + __ATTR(map_device, 0644, + brmr_map_device_show, brmr_map_device_store); + +static struct attribute *default_attrs[] = { + &brmr_map_device_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +static ssize_t brmr_unmap_device_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo > %s\n", + attr->attr.name); +} + +static ssize_t brmr_unmap_device_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_clt_dev *dev; + int err; + + dev = container_of(kobj, struct brmr_clt_dev, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + return -EINVAL; + } + + pr_info("Closing device %s.\n", dev->gd->disk_name); + + /* + * We take explicit module reference only for one reason: do not + * race with lockless ibnbd_destroy_sessions(). + */ + if (!try_module_get(THIS_MODULE)) { + return -ENODEV; + } + err = brmr_clt_close_device(dev, &attr->attr); + if (unlikely(err)) { + if (unlikely(err != -EALREADY)) + pr_err("unmap_device %s: %d\n", + dev->gd->disk_name, err); + goto module_put; + } + + /* + * Here device can be vanished! 
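+	 * brmr_clt_close_device() may have dropped the last device reference
+	 * once sysfs_remove_file_self() has removed this attribute, so dev must
+	 * not be dereferenced from here on; only the module reference is
+	 * released below.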
+ */ + err = count; + +module_put: + module_put(THIS_MODULE); + + return err; +} + +static struct kobj_attribute brmr_unmap_device_attr = + __ATTR(unmap_device, 0644, + brmr_unmap_device_show, brmr_unmap_device_store); + +static ssize_t brmr_clt_device_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct brmr_clt_dev *dev; + int cnt; + + dev = container_of(kobj, struct brmr_clt_dev, kobj); + + switch (dev->dev_state) { + case DEV_STATE_INIT: + cnt = sysfs_emit(page, "init\n"); + break; + case DEV_STATE_READY: + cnt = sysfs_emit(page, "ready\n"); + break; + case DEV_STATE_DISCONNECTED: + cnt = sysfs_emit(page, "disconnected\n"); + break; + case DEV_STATE_CLOSING: + cnt = sysfs_emit(page, "closing\n"); + break; + default: + cnt = sysfs_emit(page, "unknown\n"); + break; + } + + if (dev->map_incomplete) + cnt += sysfs_emit_at(page, cnt, "degraded\n"); + + return cnt; +} + +static struct kobj_attribute brmr_clt_device_state = + __ATTR(state, 0444, brmr_clt_device_state_show, NULL); + +static struct attribute *brmr_clt_dev_attrs[] = { + &brmr_unmap_device_attr.attr, + &brmr_clt_device_state.attr, + NULL, +}; +ATTRIBUTE_GROUPS(brmr_clt_dev); + +static struct kobj_type brmr_clt_device_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = brmr_clt_dev_groups, +}; + +static struct kobj_type brmr_clt_stats_ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +static int brmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *kobj_stats); + +static int brmr_clt_create_dev_sysfs_files(struct brmr_clt_dev *dev) +{ + int ret; + + ret = kobject_init_and_add(&dev->kobj, &brmr_clt_device_ktype, + brmr_devs_kobj, + "%s", dev->gd->disk_name); + if (ret) + pr_err("Failed to create sysfs dir for device '%s': %d\n", + dev->gd->disk_name, ret); + + ret = brmr_clt_create_stats_files(&dev->kobj, &dev->kobj_stats); + if (unlikely(ret)) { + pr_err("Failed to create sysfs stats files " + "for device '%s': %d\n", dev->gd->disk_name, ret); + kobject_del(&dev->kobj); + kobject_put(&dev->kobj); + } + return ret; +} + +static int brmr_add_dev_symlink(struct brmr_clt_dev *dev) +{ + struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj; + int ret; + + ret = sysfs_create_link(&dev->kobj, gd_kobj, BRMR_LINK_NAME); + if (ret) { + pr_err("Creating symlink for %s failed, err: %d\n", + dev->gd->disk_name, ret); + } + + return ret; +} + +void brmr_clt_destroy_dev_sysfs_files(struct brmr_clt_dev *dev, + const struct attribute *sysfs_self) +{ + if (dev->kobj.state_in_sysfs) { + + kobject_del(&dev->kobj_stats); + kobject_put(&dev->kobj_stats); + if (sysfs_self) + sysfs_remove_file_self(&dev->kobj, sysfs_self); + kobject_del(&dev->kobj); + kobject_put(&dev->kobj); + } +} + +int brmr_clt_create_sysfs_files(void) +{ + int err; + + brmr_dev_class = class_create("brmr-client"); + if (IS_ERR(brmr_dev_class)) + return PTR_ERR(brmr_dev_class); + + brmr_dev = device_create(brmr_dev_class, NULL, + MKDEV(0, 0), NULL, "ctl"); + if (IS_ERR(brmr_dev)) { + err = PTR_ERR(brmr_dev); + goto cls_destroy; + } + brmr_devs_kobj = kobject_create_and_add("devices", &brmr_dev->kobj); + if (unlikely(!brmr_devs_kobj)) { + err = -ENOMEM; + goto dev_destroy; + } + err = sysfs_create_group(&brmr_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto put_devs_kobj; + + return 0; + +put_devs_kobj: + kobject_del(brmr_devs_kobj); + kobject_put(brmr_devs_kobj); +dev_destroy: + device_unregister(brmr_dev); +cls_destroy: + class_destroy(brmr_dev_class); + + return err; +} + +void brmr_clt_destroy_sysfs_files(void) 
+{ + sysfs_remove_group(&brmr_dev->kobj, &default_attr_group); + kobject_del(brmr_devs_kobj); + kobject_put(brmr_devs_kobj); + device_unregister(brmr_dev); + class_destroy(brmr_dev_class); +} + +STAT_ATTR(struct brmr_clt_dev, requests, + brmr_clt_stats_rq_to_str, brmr_clt_reset_submitted_req); +STAT_ATTR(struct brmr_clt_dev, request_sizes, + brmr_clt_stats_sizes_to_str, brmr_clt_reset_req_sizes); +STAT_ATTR(struct brmr_clt_dev, sts_resource, + brmr_stats_sts_resource_to_str, brmr_clt_reset_sts_resource); +STAT_ATTR(struct brmr_clt_dev, sts_resource_per_cpu, + brmr_stats_sts_resource_per_cpu_to_str, brmr_clt_reset_sts_resource); + +static struct attribute *brmr_stats_attrs[] = { + &requests_attr.attr, + &request_sizes_attr.attr, + &sts_resource_attr.attr, + &sts_resource_per_cpu_attr.attr, + NULL, +}; + +static struct attribute_group brmr_stats_attr_group = { + .attrs = brmr_stats_attrs, +}; + +static int brmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *kobj_stats) +{ + int ret; + + ret = kobject_init_and_add(kobj_stats, &brmr_clt_stats_ktype, kobj, "stats"); + if (ret) { + pr_err("Failed to init and add stats kobject, err: %d\n", + ret); + return ret; + } + + ret = sysfs_create_group(kobj_stats, &brmr_stats_attr_group); + if (ret) { + pr_err("failed to create stats sysfs group, err: %d\n", + ret); + goto put_stats_obj; + } + + return 0; + +put_stats_obj: + kobject_del(kobj_stats); + kobject_put(kobj_stats); + + return ret; +} diff --git a/drivers/block/brmr/brmr-clt.c b/drivers/block/brmr/brmr-clt.c new file mode 100644 index 000000000000..6f3d2dd2a9d9 --- /dev/null +++ b/drivers/block/brmr/brmr-clt.c @@ -0,0 +1,1222 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include + +#include "brmr-clt.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(BRMR_VER_STRING); +MODULE_DESCRIPTION("BRMR Block Device using RMR cluster"); +MODULE_LICENSE("GPL"); + +/* + * Maximum number of partitions an instance can have. 
+ * 6 bits = 64 minors = 63 partitions (one minor is used for the device itself) + */ +#define BRMR_PART_BITS 6 + +static DEFINE_IDA(index_ida); +static DEFINE_MUTEX(ida_lock); +static DEFINE_MUTEX(brmr_device_lock); +static LIST_HEAD(brmr_device_list); +static int brmr_major; + +static int BRMR_DELAY_10ms = 10; + +static int index_to_minor(int index) +{ + return index << BRMR_PART_BITS; +} + +static int minor_to_index(int minor) +{ + return minor >> BRMR_PART_BITS; +} + +static inline const char *rq_op_to_str(struct request *rq) +{ + switch (req_op(rq)) { + case REQ_OP_READ: + return "READ"; + case REQ_OP_WRITE: + return "WRITE"; + case REQ_OP_DISCARD: + return "DISCARD"; + case REQ_OP_WRITE_ZEROES: + return "WRITE_ZEROES"; + case REQ_OP_FLUSH: + return "FLUSH"; + default: + return "UNKNOWN"; + } + return ""; +} + + +/* copy from blk.h */ +static inline bool biovec_phys_mergeable(struct request_queue *q, + struct bio_vec *vec1, struct bio_vec *vec2) +{ + unsigned long mask = queue_segment_boundary(q); + phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; + phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + + if (addr1 + vec1->bv_len != addr2) + return false; + // Comment out xen related code + /* + if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) + return false; + */ + if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask)) + return false; + return true; +} + +/* copy from blk_merge.c */ +static inline unsigned get_max_segment_size(const struct request_queue *q, + struct page *start_page, + unsigned long offset) +{ + unsigned long mask = queue_segment_boundary(q); + + offset = mask & (page_to_phys(start_page) + offset); + + /* + * overflow may be triggered in case of zero page physical address + * on 32bit arch, use queue's max segment size when that happens. + */ + return min_not_zero(mask - offset + 1, + (unsigned long)queue_max_segment_size(q)); +} + +static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, + struct scatterlist *sglist) +{ + if (!*sg) + return sglist; + + /* + * If the driver previously mapped a shorter list, we could see a + * termination bit prematurely unless it fully inits the sg table + * on each mapping. We KNOW that there must be more entries here + * or the driver would be buggy, so force clear the termination bit + * to avoid doing a full sg_init_table() in drivers for each command. + */ + sg_unmark_end(*sg); + return sg_next(*sg); +} + +/* only try to merge bvecs into one sg if they are from two bios */ +static inline bool +__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec, + struct bio_vec *bvprv, struct scatterlist **sg) +{ + + int nbytes = bvec->bv_len; + + if (!*sg) + return false; + + if ((*sg)->length + nbytes > queue_max_segment_size(q)) + return false; + + if (!biovec_phys_mergeable(q, bvprv, bvec)) + return false; + + (*sg)->length += nbytes; + + return true; +} + +/* + * brmr_clt_get_iu() - Get an RMR I/O unit (iu) + * + * Description: + * It gets an RMR I/O unit using rmr_clt_get_iu() and increments + * the pool busy counter. It invokes rmr_clt_get_iu() with NO_WAIT + * as brmr can requeue an I/O request. + * + * Ref. 
brmr_add_to_requeue() + */ +static inline struct rmr_iu *brmr_clt_get_iu(struct brmr_clt_pool *pool, enum rmr_io_flags flag) +{ + struct rmr_iu *iu = rmr_clt_get_iu(pool->rmr, flag, NO_WAIT); + if (IS_ERR_OR_NULL(iu)) + return iu; + + atomic_inc(&pool->busy); + + return iu; +} + +/* + * brmr_clt_put_iu() - Put the RMR I/O unit (iu) + * + * Description: + * It puts the RMR I/O unit using rmr_clt_put_iu() and decrements + * the pool busy counter. It uses memory barrier to reflect the + * busy counter. + * + * Ref. brmr_add_to_requeue() and brmr_requeue_requests() + */ +static inline void brmr_clt_put_iu(struct brmr_clt_pool *pool, struct rmr_iu *iu) +{ + rmr_clt_put_iu(pool->rmr, iu); + + atomic_dec(&pool->busy); + /* + * Paired with brmr_add_to_requeue(). Decrement first + * and then check queue bits. + */ + smp_mb__after_atomic(); + brmr_requeue_requests(pool); +} + +static void brmr_softirq_done_fn(struct request *rq) +{ + struct brmr_clt_iu *iu = blk_mq_rq_to_pdu(rq); + struct brmr_clt_dev *dev = iu->dev; + + if (blk_rq_nr_phys_segments(rq)) + sg_free_table_chained(&iu->sgt, BRMR_INLINE_SG_CNT); + + brmr_clt_put_iu(dev->pool, iu->rmr_iu); + blk_mq_end_request(rq, iu->status); +} + +static void brmr_request_conf(void *priv, int errno) +{ + struct brmr_clt_iu *iu = (struct brmr_clt_iu *)priv; + struct brmr_clt_dev *dev = iu->dev; + struct request *rq = iu->rq; + + iu->status = (errno && errno != -ENOENT) ? BLK_STS_IOERR : BLK_STS_OK; + + blk_mq_complete_request(rq); + + if (errno == -ENOENT) + pr_debug("%s request for %s IGNORED err: %d\n", + rq_op_to_str(rq), dev->gd->disk_name, errno); + else if (errno) + pr_err_ratelimited("%s request for %s failed with err: %d\n", + rq_op_to_str(rq), dev->gd->disk_name, errno); +} + +static blk_status_t brmr_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct brmr_clt_dev *dev = bd->rq->q->disk->private_data; + struct brmr_clt_pool *pool = dev->pool; + struct brmr_clt_iu *iu = blk_mq_rq_to_pdu(bd->rq); + struct request *rq = bd->rq; + struct rmr_iu *rmr_iu; + unsigned int sg_cnt = 0; + size_t offset; size_t length; + enum rmr_io_flags flag; + unsigned short prio, seg; + int data_dir, err; + blk_status_t ret = BLK_STS_IOERR; + + if (unlikely(dev->dev_state != DEV_STATE_READY)) + return ret; + + iu->rq = rq; + iu->dev = dev; + + offset = blk_rq_pos(rq) << SECTOR_SHIFT; + length = blk_rq_bytes(rq); + flag = rq_to_rmr_flags(rq); + prio = req_get_ioprio(rq); + data_dir = rq_data_dir(rq); + + rmr_iu = brmr_clt_get_iu(pool, flag); + if (unlikely(rmr_iu == NULL)) { + pr_debug("Got no tag to send a request to rmr_clt\n"); + + /* Increment statistic counter for it */ + brmr_clt_update_sts_resource(&dev->stats, 0); + + if (!brmr_add_to_requeue(pool, hctx->driver_data)) + /* + * TODO unlikely + * Restarting queue with some delay is a stupid way + * of handling resource contentions + */ + blk_mq_delay_run_hw_queue(hctx, BRMR_DELAY_10ms); + + return BLK_STS_RESOURCE; + } + if (IS_ERR(rmr_iu)) { + pr_err("Error %pe when reserving resources for io in pool %s\n", + rmr_iu, pool->rmr->poolname); + return BLK_STS_IOERR; + } + iu->rmr_iu = rmr_iu; + + iu->sgt.sgl = iu->sgl; + seg = blk_rq_nr_phys_segments(rq); + if (seg) { + err = sg_alloc_table_chained(&iu->sgt, seg, iu->sgt.sgl, BRMR_INLINE_SG_CNT); + if (err) { + pr_err("sg_alloc_table_chained failed, ret=%x\n", err); + blk_mq_delay_run_hw_queue(hctx, BRMR_DELAY_10ms); + brmr_clt_put_iu(pool, rmr_iu); + return BLK_STS_RESOURCE; + } + } + + /* We only support discards with single 
segment and write_zeroes request with no segment. */ + /* See queue limits. */ + if ((req_op(rq) != REQ_OP_DISCARD) && (req_op(rq) != REQ_OP_WRITE_ZEROES)) + sg_cnt = blk_rq_map_sg(rq, iu->sgt.sgl); + + blk_mq_start_request(rq); + brmr_update_stats(&dev->stats, length, 0, data_dir); + + pr_debug("brmr %s request with flag %x offset %lu length %lu sg_cnt: %d\n", + rq_op_to_str(rq), flag, offset, length, sg_cnt); + + err = rmr_clt_request(pool->rmr, rmr_iu, offset, length, flag, prio, + iu, brmr_request_conf, iu->sgt.sgl, sg_cnt); + if (likely(err == 0)) + return BLK_STS_OK; + + pr_err_ratelimited("sending %s request for %s failed with err: %d\n", + rq_op_to_str(rq), dev->gd->disk_name, err); + + if (unlikely(err == -EAGAIN || err == -ENOMEM)) { + pr_debug("Got resource error %d when sending a request to rmr_clt\n", err); + + brmr_clt_update_sts_resource(&dev->stats, 3); + blk_mq_delay_run_hw_queue(hctx, BRMR_DELAY_10ms); + + ret = BLK_STS_RESOURCE; + } else { + ret = BLK_STS_IOERR; + } + + if (seg) + sg_free_table_chained(&iu->sgt, BRMR_INLINE_SG_CNT); + + brmr_clt_put_iu(pool, rmr_iu); + return ret; +} + +static struct blk_mq_ops brmr_mq_ops = { + .queue_rq = brmr_queue_rq, + .complete = brmr_softirq_done_fn, +}; + +static struct brmr_clt_pool *brmr_clt_create_pool(const char *poolname) +{ + struct brmr_clt_pool *pool; + int err; + struct rmr_attrs attrs; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + pool->rmr = rmr_clt_open(pool, NULL, poolname); + if (IS_ERR_OR_NULL(pool->rmr)) { + err = PTR_ERR(pool->rmr); + goto free_pool; + } + err = rmr_clt_query(pool->rmr, &attrs); + if (unlikely(err)) + goto close_rmr; + + pool->queue_depth = attrs.queue_depth; + pool->max_io_size = attrs.max_io_size; + pool->chunk_size = attrs.chunk_size; + pool->max_segments = attrs.max_segments; + + snprintf(pool->poolname, sizeof(pool->poolname), "%s", poolname); + + /* + * When opening a new pool, allocate mq tags for that pool - they are + * going to be shared among all devices opened in that pool + */ + pool->tag_set.ops = &brmr_mq_ops; + pool->tag_set.queue_depth = pool->queue_depth; + pool->tag_set.numa_node = NUMA_NO_NODE; + pool->tag_set.flags = BLK_MQ_F_TAG_QUEUE_SHARED; + pool->tag_set.cmd_size = sizeof(struct brmr_clt_iu) + BRMR_RDMA_SGL_SIZE; + pool->tag_set.nr_hw_queues = num_online_cpus(); + + err = blk_mq_alloc_tag_set(&pool->tag_set); + if (unlikely(err)) + goto close_rmr; + + refcount_set(&pool->refcount, 1); + + atomic_set(&pool->busy, 0); + bitmap_zero(pool->cpu_queues_bm, NR_CPUS); + pool->cpu_rr = alloc_percpu(int); + if (unlikely(!pool->cpu_rr)) { + pr_err("Failed to alloc percpu var (cpu_rr)\n"); + err = -ENOMEM; + goto free_tag_set; + } + pool->cpu_queues = alloc_percpu(struct brmr_cpu_qlist); + if (unlikely(!pool->cpu_queues)) { + pr_err("Failed to alloc percpu var (cpu_queues)\n"); + err = -ENOMEM; + goto free_cpu_rr; + } + brmr_init_cpu_qlists(pool->cpu_queues); + return pool; +free_cpu_rr: + free_percpu(pool->cpu_rr); +free_tag_set: + blk_mq_free_tag_set(&pool->tag_set); +close_rmr: + rmr_clt_close(pool->rmr); +free_pool: + kfree(pool); + + return ERR_PTR(err); +} + +static void brmr_clt_free_pool(struct brmr_clt_pool *pool) +{ + free_percpu(pool->cpu_queues); + pool->cpu_queues = NULL; + free_percpu(pool->cpu_rr); + pool->cpu_rr = NULL; + blk_mq_free_tag_set(&pool->tag_set); + rmr_clt_close(pool->rmr); + kfree(pool); +} + +static void brmr_clt_put_pool(struct brmr_clt_pool *pool) +{ + if (refcount_dec_and_test(&pool->refcount)) + 
brmr_clt_free_pool(pool); + else + rmr_clt_put_pool(pool->rmr); +} + +static inline bool brmr_clt_get_dev(struct brmr_clt_dev *dev) +{ + return refcount_inc_not_zero(&dev->refcount); +} + +void brmr_clt_put_dev(struct brmr_clt_dev *dev) +{ + might_sleep(); + + if (refcount_dec_and_test(&dev->refcount)) { + + mutex_lock(&ida_lock); + ida_free(&index_ida, dev->idx); + mutex_unlock(&ida_lock); + + kfree(dev->hw_queues); + + brmr_clt_put_pool(dev->pool); + + if (!list_empty(&dev->list)) { + mutex_lock(&brmr_device_lock); + list_del(&dev->list); + mutex_unlock(&brmr_device_lock); + } + kfree(dev); + } +} + +static int brmr_open(struct gendisk *disk, blk_mode_t mode) +{ + struct brmr_clt_dev *dev = disk->private_data; + + if (READ_ONCE(dev->dev_state) != DEV_STATE_READY) + return -EIO; + + if (!brmr_clt_get_dev(dev)) + return -EIO; + + return 0; +} + +static void brmr_release(struct gendisk *gen) +{ + struct brmr_clt_dev *dev = gen->private_data; + + brmr_clt_put_dev(dev); +} + +#if 0 +static int brmr_getgeo(struct block_device *block_device, + struct hd_geometry *geo) +{ + struct brmr_clt_dev *dev = block_device->bd_disk->private_data; + + geo->cylinders = (dev->size_sect & ~0x3f) >> 6; /* size/64 */ + geo->heads = 4; + geo->sectors = 16; + geo->start = 0; + + return 0; +} +#endif + +static const struct block_device_operations brmr_ops = { + .owner = THIS_MODULE, + .open = brmr_open, + .release = brmr_release, + /*.getgeo = brmr_getgeo,*/ +}; + +/** + * brmr_clt_init_cmd() - Initialize message command + * + * @msg: command message where to init + */ +static void brmr_clt_init_cmd(struct brmr_msg_cmd *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.type = cpu_to_le16(BRMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->ver = BRMR_PROTO_VER_MAJOR; +} + +/** + * brmr_cmd_conf() - Confirmation function for brmr command message + * + * @priv: priv pointer to brmr command private data + * @errno: error number passed from RMR. + * See description of errno in RMR function. + * + * Description: + * Command response for a map new command can fail on multiple levels. + * If RMR fails to send the message to any or one of the nodes, that would reflect on the + * errno. If the command fails on BRMR level, that would reflect on the rsp struct. + * The error number will be used differently by different commands accordingly. + */ +static void brmr_clt_cmd_conf(void *priv, int errno) +{ + struct brmr_cmd_priv *cmd_priv = (struct brmr_cmd_priv *)priv; + + switch (cmd_priv->cmd_type) { + case BRMR_CMD_MAP: + pr_info("%s: BRMR_CMD_MAP err=%d\n", __func__, errno); + cmd_priv->errno = errno; + break; + case BRMR_CMD_REMAP: + pr_info("%s: BRMR_CMD_REMAP err=%d\n", __func__, errno); + break; + case BRMR_CMD_UNMAP: + pr_info("%s: BRMR_CMD_UNMAP err=%d\n", __func__, errno); + /* + * No processing needed here. 
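+	 * Unmap is best effort: errno is not recorded here, and the caller only
+	 * logs a failure before continuing with local cleanup.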
+ */ + break; + case BRMR_CMD_GET_PARAMS: + pr_info("%s: BRMR_CMD_GET_PARAMS err=%d\n", __func__, errno); + if (errno) + cmd_priv->errno = errno; + break; + + default: + pr_err("%s: Unknown command type %d err=%d\n", __func__, cmd_priv->cmd_type, errno); + } + + complete(&cmd_priv->complete_done); +} + +/** + * brmr_clt_send_msg_cmd() - Sends command message to rmr pool + * + * @dev: pointer to brmr device + * @msg: msg struct to be sent + * @rsp_buf: response buffer where the response of the storage side is stored + * @rsp_buf_len: length of the response buffer + * + * Return: + * Negative if failed to sent command + * As handled by each command in brmr_clt_cmd_conf, if succeeded to send command + * + * Context: + * Would block until response is received + */ +static int brmr_clt_send_msg_cmd(struct brmr_clt_dev *dev, struct brmr_msg_cmd *msg, void *rsp_buf, + size_t rsp_buf_len) +{ + struct brmr_cmd_priv cmd_priv; + struct kvec vec; + int ret; + + vec = (struct kvec) { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + + cmd_priv.dev = dev; + cmd_priv.cmd_type = msg->cmd_type; + cmd_priv.rsp_buf = rsp_buf; + cmd_priv.rsp_buf_len = rsp_buf_len; + cmd_priv.errno = 0; + init_completion(&cmd_priv.complete_done); + + ret = rmr_clt_cmd_with_rsp(dev->pool->rmr, brmr_clt_cmd_conf, &cmd_priv, &vec, 1, rsp_buf, + rsp_buf_len, sizeof(struct brmr_msg_cmd_rsp)); + + if (!ret) { + wait_for_completion(&cmd_priv.complete_done); + ret = cmd_priv.errno; + } + + return ret; +} + +static struct brmr_clt_dev *brmr_alloc_and_init_dev(struct brmr_clt_pool *pool, + u64 size) +{ + struct brmr_clt_dev *dev; + struct brmr_queue *q; + struct blk_mq_hw_ctx *hctx; + int ret; + unsigned long i; + + /* + * alloc device structure + */ + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&dev->list); + dev->size_sect = size; + dev->pool = pool; + dev->dev_state = DEV_STATE_INIT; + dev->map_incomplete = false; + refcount_set(&dev->refcount, 1); + + /* + * Alloc a "queue" per cpu + */ + dev->hw_queues = kcalloc(nr_cpu_ids, + sizeof(*dev->hw_queues), GFP_KERNEL); + if (unlikely(!dev->hw_queues)) { + ret = -ENOMEM; + goto free_dev; + } + + /* + * Get an id to be used in /dev/brmr + */ + mutex_lock(&ida_lock); + ret = ida_alloc_range(&index_ida, 0, minor_to_index(1 << MINORBITS) - 1, + GFP_KERNEL); + mutex_unlock(&ida_lock); + if (ret < 0) { + pr_err("%s: ida_alloc_range() failed for pool %s, err: %d\n", + __func__, pool->poolname, ret); + goto free_queues; + } + dev->idx = ret; + + /* + * Init mq queue + */ + dev->gd = blk_mq_alloc_disk(&pool->tag_set, NULL, dev); + if (IS_ERR(dev->gd)) { + ret = PTR_ERR(dev->gd); + pr_err("Failed to initialize mq: %pe\n", dev->queue); + goto remove_ida; + } + dev->queue = dev->gd->queue; + + /* + * Assign hardware contexts to our queues + */ + queue_for_each_hw_ctx(dev->queue, hctx, i) { + q = &dev->hw_queues[i]; + INIT_LIST_HEAD(&q->requeue_list); + q->hctx = hctx; + hctx->driver_data = q; + } + + return dev; + +remove_ida: + mutex_lock(&ida_lock); + ida_free(&index_ida, dev->idx); + mutex_unlock(&ida_lock); +free_queues: + kfree(dev->hw_queues); +free_dev: + kfree(dev); +out: + return ERR_PTR(ret); +} + +static int brmr_set_dev_params(struct brmr_clt_dev *dev) +{ + struct brmr_clt_pool *pool = dev->pool; + u32 chunk_size = brmr_pool_chunk_size(pool); + struct queue_limits lim; + int ret; + + /* Aligns requests with the chunks in rmr client */ + if (!is_power_of_2(chunk_size >> SECTOR_SHIFT)) { + pr_err("%u not a power of 2!\n", 
chunk_size); + return -EINVAL; + } + + /* + * Set request queue parameters via queue_limits API + */ + lim = queue_limits_start_update(dev->queue); + lim.logical_block_size = dev->logical_block_size; + lim.physical_block_size = dev->physical_block_size; + lim.max_segments = dev->max_segments; + lim.max_hw_sectors = dev->max_hw_sectors; + lim.max_write_zeroes_sectors = dev->max_write_zeroes_sectors; + lim.io_opt = brmr_pool_chunk_size(pool); + lim.chunk_sectors = chunk_size >> SECTOR_SHIFT; + + /* however we don't support discards to */ + /* discontiguous segments in one request */ + lim.max_discard_segments = 1; + lim.max_hw_discard_sectors = dev->max_discard_sectors; + if (dev->secure_discard) + lim.max_secure_erase_sectors = dev->max_discard_sectors; + + lim.discard_granularity = dev->discard_granularity; + lim.discard_alignment = dev->discard_alignment; + + /* needed for ibtrs_map_sg_fr to work */ + lim.virt_boundary_mask = SZ_4K - 1; + + /* non-rotational device */ + lim.features &= ~BLK_FEAT_ROTATIONAL; + + if (dev->wc) + lim.features |= BLK_FEAT_WRITE_CACHE; + if (dev->fua) + lim.features |= BLK_FEAT_FUA; + + ret = queue_limits_commit_update(dev->queue, &lim); + if (ret) + goto err; + + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); + blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); + + ret = brmr_clt_init_stats(&dev->stats); + if (unlikely(ret)) + goto err; + + dev->gd->major = brmr_major; + dev->gd->minors = 1 << BRMR_PART_BITS; + dev->gd->first_minor = index_to_minor(dev->idx); + dev->gd->fops = &brmr_ops; + dev->gd->queue = dev->queue; + dev->gd->private_data = dev; + snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), + "brmr%d", dev->idx); + set_capacity(dev->gd, dev->size_sect); + + return 0; + +err: + return ret; +} + +/** + * brmr_get_remote_dev_params() - Gets device params from storage nodes + * + * @dev: pointer to brmr device + * + * Description: + * Does the following (sanity) checks + * 1) For an unmapped device, param get should succeed on all legs + * 2) There should not be a mixture of mapped and unmapped devices + * + * In addition to above, it also does the following work + * 1) For a mapped device, read from a single leg is enough for success + * 2) For an unmapped device, it does validation checks for params for every leg + * + * Return: + * Negative in case of failure + * 0 for success, and a non-mapped device is found + * 1 for success, and a mapped device is found + * + * Context: + * Would block until response is received + */ +static int brmr_get_remote_dev_params(struct brmr_clt_dev *dev) +{ + struct brmr_clt_pool *pool = dev->pool; + struct brmr_msg_cmd msg; + struct brmr_msg_cmd_rsp *brmr_cmd_rsp; + void *rsp_buf; + size_t rsp_buf_len; + int err = 0, i; + bool partial_fail = false, mapped = false; + + brmr_clt_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_GET_PARAMS; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) + return -ENOMEM; + + err = brmr_clt_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (err < 0) { + pr_err("%s: brmr_clt_send_msg_cmd failed with errno %d\n", __func__, err); + goto free_data; + } else if (err) { + /* + * We cannot directly fail here, since we do not know if this is a map for a + * newly created device, or for one which has gone through mapping before. + * + * For the former, any failure should end in the whole map process failing. 
+ * For the latter, a single read from a device with mapped state set should + * be enough for us to go ahead and map. + */ + partial_fail = true; + } + + /* + * Lets do the sanity check first, because combining it with param checks makes the + * entire loop harder to read + */ + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + struct brmr_cmd_get_params_rsp *get_params_rsp; + + brmr_cmd_rsp = ((struct brmr_msg_cmd_rsp *)rsp_buf) + i; + + /* + * We do not need to worry about not seeing MAGIC. + * This would happen for a non-working sessions, OR + * for extra sessions in the end for which there are no legs in RMR (Don't care) + * + * For non-working sessions, we will be notified by RMR through the return value + */ + if (brmr_cmd_rsp->magic != BRMR_CMD_RSP_MAGIC) + continue; + + /* + * This is error returned by rmr-store. + */ + if (brmr_cmd_rsp->status) + partial_fail = true; + + get_params_rsp = &brmr_cmd_rsp->get_params_rsp; + + /* + * If we find a mapped device, we save that info. + */ + if (get_params_rsp->mapped) + mapped = true; + } + + /* + * If there is no device mapped, it means that this is the first map after device creation + * In such a case, we need all sessions to be up and running. + */ + if (mapped == false && partial_fail) { + pr_err("%s: Mapping first time, but got failure for some sessions\n", __func__); + err = -EINVAL; + goto free_data; + } + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + struct brmr_cmd_get_params_rsp *get_params_rsp; + struct brmr_blk_dev_params *rsp_dev_params; + + brmr_cmd_rsp = ((struct brmr_msg_cmd_rsp *)rsp_buf) + i; + + /* + * We are tracking partial failures through the above loop, so + * ignore it here. + */ + if (brmr_cmd_rsp->magic != BRMR_CMD_RSP_MAGIC || + brmr_cmd_rsp->status) + continue; + + get_params_rsp = &brmr_cmd_rsp->get_params_rsp; + + /* + * We cheat a little, and do this sanity check here. + * + * If even a single device was mapped, and we have sessions with non-mapped + * devices, it will be wrong to go forward with brmr map. + */ + if (mapped && !get_params_rsp->mapped) { + /* + * This can only happen if a node went down and up. + * And instead of re-adding a MAPPED device, a create was called + * We cannot allow map this way, since this means discard could + * have been skipped. + */ + pr_err("%s: Mixed combination of mapped+unmapped metadata found\n", + __func__); + err = -EINVAL; + goto free_data; + } + + /* + * The device size_sect, which is the size provided by the user in the map + * command, should be same as the mapped_size of every storage node's backend + * device, which was provided during create_store. 
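+	 * A mismatch means the store was created with a size different from the
+	 * one requested by this map command, so the map is refused.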
+ */ + if (dev->size_sect != le64_to_cpu(get_params_rsp->mapped_size)) { + pr_err("%s: Mismatched mapped_size: (Provide) %llu != %llu (Remote)\n", + __func__, dev->size_sect, le64_to_cpu(get_params_rsp->mapped_size)); + err = -EINVAL; + goto free_data; + } + + rsp_dev_params = &get_params_rsp->dev_params; + + dev->max_write_zeroes_sectors = min_not_zero( + dev->max_write_zeroes_sectors, + le32_to_cpu( + rsp_dev_params->max_write_zeroes_sectors)); + dev->max_discard_sectors = min_not_zero(brmr_pool_chunk_size(pool) >> SECTOR_SHIFT, + le32_to_cpu(rsp_dev_params->max_discard_sectors)); + dev->physical_block_size = max_t(u16, dev->physical_block_size, + le16_to_cpu(rsp_dev_params->physical_block_size)); + dev->logical_block_size = max_t(u16, dev->logical_block_size, + le16_to_cpu(rsp_dev_params->logical_block_size)); + + dev->discard_granularity = dev->logical_block_size; + dev->discard_alignment = dev->logical_block_size; + + /* secure_discard is actually true or false, but since we used + * __le16 to transfer this value in msg, min_t should work fine here + */ + dev->secure_discard = min_t(u16, dev->secure_discard, + le16_to_cpu(rsp_dev_params->secure_discard)); + + dev->cache_policy = rsp_dev_params->cache_policy; + dev->wc = !!(rsp_dev_params->cache_policy & BRMR_WRITEBACK); + dev->fua = !!(rsp_dev_params->cache_policy & BRMR_FUA); + } + + /* max segments and max_hw_sectors we get from rtrs sessions values + * stored in pool like in RNBD, not from bdev of the store side. + */ + dev->max_segments = pool->max_segments; + dev->max_hw_sectors = pool->max_io_size / SECTOR_SIZE; + + /* + * Return whether its a new map or an old one + */ + err = mapped; + +free_data: + kfree(rsp_buf); + + return err; +} + +/** + * brmr_clt_send_map_cmd() - Sends map command for a brmr device + * + * @dev: pointer to brmr device + * + * Return: + * Negative error value in case of failure + * 0 on success + * + * Context: + * Would block until response is received + */ +static int brmr_clt_send_map_cmd(struct brmr_clt_dev *dev) +{ + struct brmr_clt_pool *pool = dev->pool; + struct brmr_msg_cmd msg; + struct brmr_blk_dev_params *dev_params = &(msg.map_new_cmd.dev_params); + void *rsp_buf; + size_t rsp_buf_len; + int err = 0; + + brmr_clt_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_MAP; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) + return -ENOMEM; + + msg.map_new_cmd.version = BRMR_CURRENT_HEADER_VERSION; + msg.map_new_cmd.mapped_size = dev->size_sect; + + dev_params->max_hw_sectors = cpu_to_le32(dev->max_hw_sectors); + dev_params->max_write_zeroes_sectors = cpu_to_le32(dev->max_write_zeroes_sectors); + dev_params->max_discard_sectors = cpu_to_le32(dev->max_discard_sectors); + dev_params->discard_granularity = cpu_to_le32(dev->discard_granularity); + dev_params->discard_alignment = cpu_to_le32(dev->discard_alignment); + dev_params->physical_block_size = cpu_to_le16(dev->physical_block_size); + dev_params->logical_block_size = cpu_to_le16(dev->logical_block_size); + dev_params->max_segments = cpu_to_le16(dev->max_segments); + dev_params->secure_discard = cpu_to_le16(dev->secure_discard); + dev_params->cache_policy = dev->cache_policy; + + err = brmr_clt_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (err) + pr_err("Failed to send cmd msg BRMR_CMD_MAP in pool %s, err=%d\n", + pool->poolname, err); + + kfree(rsp_buf); + return err; +} + +/* + * brmr_clt_send_unmap_cmd() - Send an unmap command to the server pool + * + * Sending 
may fail (e.g. no sessions connected). The failure is logged but + * not propagated — callers always continue with local cleanup regardless. + */ +static void brmr_clt_send_unmap_cmd(struct brmr_clt_dev *dev) +{ + struct brmr_msg_cmd msg; + void *rsp_buf; + size_t rsp_buf_len; + int ret; + + brmr_clt_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_UNMAP; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) { + pr_err("Failed to alloc rsp_buf for unmap in pool %s\n", + dev->pool->poolname); + return; + } + + /* + * Sending messages could fail. For example, there are no client pool sessions + * connected to this pool. Unmap_dev still progresses and cleans up the device + * states on the client side. + */ + ret = brmr_clt_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (ret) + pr_err("Error %d when unmap device in pool %s\n", + ret, dev->pool->poolname); + + kfree(rsp_buf); +} + +/** + * brmr_clt_map_device() - Maps brmr device through an rmr pool + * + * @id: Id for the device + * @poolname: rmr poolname which is to be used for mapping + * @size: Size of the disk + * + * Description: + * Opens rmr pool with pool name "poolname" + * Allocated brmr device and initializes it + * Maps brmr device using the rmr pool only if its not already mapped + * + * Return: + * Pointer to allocated and mapped brmr device on success + * Error pointer on failure + */ +struct brmr_clt_dev *brmr_clt_map_device(const char *poolname, u64 size) +{ + struct brmr_clt_pool *pool = NULL; + struct brmr_clt_dev *dev; + int ret, mapped; + + /* Create brmr pool */ + pool = brmr_clt_create_pool(poolname); + if (IS_ERR(pool)) { + ret = PTR_ERR(pool); + goto err_out; + } + + /* Alloc device */ + dev = brmr_alloc_and_init_dev(pool, size); + if (IS_ERR(dev)) { + pr_err("Error %pe allocating brmr device in pool %s\n", + dev, pool->poolname); + brmr_clt_put_pool(pool); + ret = PTR_ERR(dev); + goto err_out; + } + + mapped = brmr_get_remote_dev_params(dev); + if (mapped < 0) { + pr_err("Failed to get remote devs block params in pool %s, err=%d\n", + pool->poolname, mapped); + ret = mapped; + goto dest_dev; + } + + /* Set device params */ + ret = brmr_set_dev_params(dev); + if (unlikely(ret)) { + pr_err("Error %d brmr_set_dev_params in pool %s\n", + ret, pool->poolname); + goto dest_dev; + } + + /* + * We send map command only if its a new map. + * This must happen before add_disk() so the server is ready to serve + * I/O by the time the kernel probes the partition table. 
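+	 * If add_disk() fails later, the error path below (unmap_dev label)
+	 * sends an unmap for a freshly mapped device before putting the disk.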
+ */ + if (!mapped) { + pr_info("%s: Sending map command through pool %s\n", __func__, pool->poolname); + ret = brmr_clt_send_map_cmd(dev); + if (ret) { + pr_err("Failed to send map cmd to pool %s, err=%d\n", + pool->poolname, ret); + goto put_disk; + } + } + + dev->dev_state = DEV_STATE_READY; + + /* + * Add gendisk + */ + ret = add_disk(dev->gd); + if (ret) { + pr_err("%s: add_disk failed with err %d\n", __func__, ret); + goto unmap_dev; + } + + mutex_lock(&brmr_device_lock); + list_add(&dev->list, &brmr_device_list); + mutex_unlock(&brmr_device_lock); + + return dev; + +unmap_dev: + dev->dev_state = DEV_STATE_INIT; + if (!mapped) + brmr_clt_send_unmap_cmd(dev); +put_disk: + put_disk(dev->gd); + brmr_clt_free_stats(&dev->stats); +dest_dev: + brmr_clt_put_dev(dev); +err_out: + return ERR_PTR(ret); +} + +static void destroy_gen_disk(struct brmr_clt_dev *dev) +{ + unsigned int memflags; + + del_gendisk(dev->gd); + /* + * Before marking queue as dying (blk_cleanup_queue() does that) + * we have to be sure that everything in-flight has gone. + * Blink with freeze/unfreeze. + */ + memflags = blk_mq_freeze_queue(dev->queue); + blk_mq_unfreeze_queue(dev->queue, memflags); + put_disk(dev->gd); +} + +/** + * brmr_clt_close_device() - Closes a brmr device + * + * @dev: pointer to brmr device to close + * @sysfs_self: pointer to sysfs attribute + * + * Return: + * 0 in case of success + * negative in case of failure + */ +int brmr_clt_close_device(struct brmr_clt_dev *dev, + const struct attribute *sysfs_self) +{ + dev->dev_state = DEV_STATE_CLOSING; + destroy_gen_disk(dev); + brmr_clt_send_unmap_cmd(dev); + sysfs_remove_link(&dev->kobj, BRMR_LINK_NAME); + + if (sysfs_self) + brmr_clt_destroy_dev_sysfs_files(dev, sysfs_self); + + brmr_clt_free_stats(&dev->stats); + brmr_clt_put_dev(dev); + + return 0; +} + +struct brmr_clt_dev *find_and_get_device(const char *name) +{ + struct brmr_clt_dev *dev; + + mutex_lock(&brmr_device_lock); + list_for_each_entry(dev, &brmr_device_list, list) { + if (strncasecmp(dev->pool->poolname, name, NAME_MAX)) + continue; + + if (brmr_clt_get_dev(dev)) { + mutex_unlock(&brmr_device_lock); + return dev; + } + } + mutex_unlock(&brmr_device_lock); + + return NULL; +} + +static int __init brmr_client_init(void) +{ + int err; + + pr_info("Loading module %s, version %s\n", + KBUILD_MODNAME, BRMR_VER_STRING); + + brmr_major = register_blkdev(brmr_major, "brmr"); + if (brmr_major <= 0) { + pr_err("Failed to load module," + " block device registration failed\n"); + err = -EBUSY; + goto out; + } + + err = brmr_clt_create_sysfs_files(); +out: + return err; +} + +static void __exit brmr_client_exit(void) +{ + struct brmr_clt_dev *dev, *tmp; + + pr_info("Unloading module\n"); + + brmr_clt_destroy_sysfs_files(); + unregister_blkdev(brmr_major, "brmr"); + + list_for_each_entry_safe(dev, tmp, &brmr_device_list, list) { + brmr_clt_close_device(dev, NULL); + } + + ida_destroy(&index_ida); + + pr_info("Module %s unloaded\n", KBUILD_MODNAME); +} + +module_init(brmr_client_init); +module_exit(brmr_client_exit); diff --git a/drivers/block/brmr/brmr-clt.h b/drivers/block/brmr/brmr-clt.h new file mode 100644 index 000000000000..1482c7517ee8 --- /dev/null +++ b/drivers/block/brmr/brmr-clt.h @@ -0,0 +1,299 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef BRMR_PRI_H +#define BRMR_PRI_H + +#include +#include +#include "rmr-pool.h" + +#include "brmr-proto.h" + +#define BRMR_VER_MAJOR 0 +#define BRMR_VER_MINOR 
1 + +#ifndef BRMR_VER_STRING +#define BRMR_VER_STRING __stringify(BRMR_VER_MAJOR) "." \ + __stringify(BRMR_VER_MINOR) +#endif + +#define BRMR_LINK_NAME "block" + +#ifdef CONFIG_ARCH_NO_SG_CHAIN +#define BRMR_INLINE_SG_CNT 0 +#else +#define BRMR_INLINE_SG_CNT 2 +#endif +#define BRMR_RDMA_SGL_SIZE (sizeof(struct scatterlist) * BRMR_INLINE_SG_CNT) + +enum brmr_dev_state { + DEV_STATE_INIT, + DEV_STATE_READY, + DEV_STATE_DISCONNECTED, + DEV_STATE_CLOSING, +}; + +struct brmr_clt_iu { + struct request *rq; + struct rmr_iu *rmr_iu; + struct brmr_clt_dev *dev; + blk_status_t status; + struct sg_table sgt; + struct scatterlist sgl[]; +}; + +struct brmr_queue { + struct list_head requeue_list; + unsigned long in_list; + struct blk_mq_hw_ctx *hctx; +}; + +struct brmr_cpu_qlist { + struct list_head requeue_list; + spinlock_t requeue_lock; + unsigned int cpu; +}; + +struct brmr_clt_pool { + struct list_head list; + struct rmr_pool *rmr; + wait_queue_head_t rmr_waitq; + bool rmr_ready; + int queue_depth; + u32 max_io_size; + u32 chunk_size; + u32 max_segments; + struct brmr_cpu_qlist __percpu + *cpu_queues; + DECLARE_BITMAP(cpu_queues_bm, NR_CPUS); + int __percpu *cpu_rr; /* per-cpu var for CPU round-robin */ + atomic_t busy; + struct blk_mq_tag_set tag_set; + struct mutex lock; /* protects state and devs_list */ + struct list_head devs_list; /* list of struct brmr_clt_dev */ + refcount_t refcount; + char poolname[NAME_MAX]; +}; + +/** + * Statistic of requests submitted to the rmr-clt layer. + * This means total number of requests received from blk + * is cnt_whole+(cnt_split/2) + * while total number submitted to rmr-clt is cnt_whole+cnt_split + */ +struct brmr_stats_rq { + struct { + u64 cnt_whole; + u64 cnt_split; + u64 total_sectors; + } dir[2]; +}; + +#define STATS_SIZES_NUM 16 + +struct brmr_stats_sizes { + struct { + u64 cnt_whole[STATS_SIZES_NUM]; + u64 cnt_left[STATS_SIZES_NUM]; + u64 cnt_right[STATS_SIZES_NUM]; + } dir[2]; +}; + +struct brmr_stats_sts_resource { + u64 get_iu; + u64 get_iu2; + u64 clt_request1; + u64 clt_request; +}; + +struct brmr_stats_pcpu { + + struct brmr_stats_rq submitted_requests; + struct brmr_stats_sizes request_sizes; + struct brmr_stats_sts_resource sts_resource; +}; + +struct brmr_clt_stats { + struct brmr_stats_pcpu __percpu *pcpu_stats; +}; + +struct brmr_clt_dev { + struct brmr_clt_pool *pool; + struct request_queue *queue; + struct brmr_queue *hw_queues; + u32 idx; + enum brmr_dev_state dev_state; + bool read_only; + bool map_incomplete; + u64 size_sect; /* device size in sectors */ + struct list_head list; + struct brmr_clt_stats stats; + struct gendisk *gd; + struct kobject kobj; + struct kobject kobj_stats; + char blk_symlink_name[NAME_MAX]; + refcount_t refcount; + struct work_struct unmap_on_rmmod_work; + bool wc; + bool fua; + + /* + * Params holding block device related info + */ + u32 max_hw_sectors; + u32 max_write_zeroes_sectors; + u32 max_discard_sectors; + u32 discard_granularity; + u32 discard_alignment; + u16 physical_block_size; + u16 logical_block_size; + u16 max_segments; + u16 secure_discard; + u8 cache_policy; +}; + +#define BRMR_HEADER_MAGIC_TOKEN 0x312631494f4e4f53 + +#define BRMR_HEADER_VERSION_INITIAL 1 +#define BRMR_CURRENT_HEADER_VERSION BRMR_HEADER_VERSION_INITIAL + +static inline enum rmr_io_flags rq_to_rmr_flags(struct request *rq) +{ + enum rmr_io_flags rmr_flag; + + switch (req_op(rq)) { + case REQ_OP_READ: + rmr_flag = RMR_OP_READ; + break; + case REQ_OP_WRITE: + rmr_flag = RMR_OP_WRITE; + break; + case REQ_OP_DISCARD: + rmr_flag 
= RMR_OP_DISCARD;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		rmr_flag = RMR_OP_WRITE_ZEROES;
+		break;
+	case REQ_OP_FLUSH:
+		rmr_flag = RMR_OP_FLUSH;
+		break;
+/* TODO
+	case REQ_OP_SECURE_ERASE:
+		rmr_flag = RMR_OP_SECURE_ERASE;
+		break;
+*/
+	default:
+		WARN(1, "Unknown request type %d (flags %u)\n",
+		     req_op(rq), rq->cmd_flags);
+		rmr_flag = 0;
+	}
+
+	/* Propagate the sync flag for synchronous requests. */
+	if (op_is_sync(rq->cmd_flags))
+		rmr_flag |= RMR_F_SYNC;
+
+	if (op_is_flush(rq->cmd_flags))
+		rmr_flag |= RMR_F_FUA;
+
+	return rmr_flag;
+}
+
+static inline u32 brmr_pool_chunk_size(struct brmr_clt_pool *pool)
+{
+	return pool->chunk_size;
+}
+
+struct brmr_clt_dev *brmr_clt_map_device(const char *pool, u64 size);
+int brmr_clt_close_device(struct brmr_clt_dev *dev, const struct attribute *sysfs_self);
+
+void brmr_clt_put_dev(struct brmr_clt_dev *dev);
+
+struct brmr_clt_dev *find_and_get_device(const char *name);
+
+/* brmr-sysfs.c */
+
+int brmr_clt_create_sysfs_files(void);
+void brmr_clt_destroy_sysfs_files(void);
+
+void brmr_clt_destroy_dev_sysfs_files(struct brmr_clt_dev *dev,
+				      const struct attribute *sysfs_self);
+
+/* brmr-reque.c */
+
+bool brmr_add_to_requeue(struct brmr_clt_pool *pool, struct brmr_queue *q);
+void brmr_requeue_requests(struct brmr_clt_pool *pool);
+void brmr_init_cpu_qlists(struct brmr_cpu_qlist __percpu *cpu_queues);
+
+/* brmr-stats.c */
+
+int brmr_clt_init_stats(struct brmr_clt_stats *stats);
+void brmr_clt_free_stats(struct brmr_clt_stats *stats);
+
+int brmr_clt_reset_submitted_req(struct brmr_clt_stats *stats, bool enable);
+int brmr_clt_reset_req_sizes(struct brmr_clt_stats *stats, bool enable);
+int brmr_clt_reset_sts_resource(struct brmr_clt_stats *stats, bool enable);
+
+/**
+ * size: size of the submitted request in bytes
+ * split: 0 when the blk request is submitted to rmr-clt as a whole,
+ *        1 when it is one part of a split blk request
+ */
+void brmr_update_stats(struct brmr_clt_stats *stats, size_t size, int split, int d);
+
+/**
+ * which: identifies the place at which BLK_STS_RESOURCE was returned
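+ * The value selects one of the counters in struct brmr_stats_sts_resource
+ * (get_iu, get_iu2, clt_request1, clt_request). A minimal sketch of a call
+ * site, assuming index 0 maps to the first counter and using a hypothetical
+ * allocation helper purely for illustration:
+ *
+ *	iu = brmr_clt_get_iu(dev);		// hypothetical helper
+ *	if (!iu) {
+ *		brmr_clt_update_sts_resource(&dev->stats, 0);
+ *		return BLK_STS_RESOURCE;
+ *	}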
+ */ +void brmr_clt_update_sts_resource(struct brmr_clt_stats *stats, int which); + +ssize_t brmr_clt_stats_sizes_to_str(struct brmr_clt_stats *stats, char *page, size_t len); + +ssize_t brmr_clt_stats_rq_to_str(struct brmr_clt_stats *stats, char *page, size_t len); + +ssize_t brmr_stats_sts_resource_to_str( + struct brmr_clt_stats *stats, char *page, size_t len); + +ssize_t brmr_stats_sts_resource_per_cpu_to_str( + struct brmr_clt_stats *stats, char *page, size_t len); + +#define STAT_STORE_FUNC(type, store, reset) \ +static ssize_t store##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + int ret = -EINVAL; \ + type *dev = container_of(kobj, type, kobj_stats); \ + \ + if (sysfs_streq(buf, "1")) \ + ret = reset(&dev->stats, true); \ + else if (sysfs_streq(buf, "0")) \ + ret = reset(&dev->stats, false); \ + if (ret) \ + return ret; \ + \ + return count; \ +} + +#define STAT_SHOW_FUNC(type, show, print) \ +static ssize_t show##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + type *dev = container_of(kobj, type, kobj_stats); \ + \ + return print(&dev->stats, page, PAGE_SIZE); \ +} + +#define STAT_ATTR(type, stat, print, reset) \ +STAT_STORE_FUNC(type, stat, reset) \ +STAT_SHOW_FUNC(type, stat, print) \ +static struct kobj_attribute stat##_attr = \ + __ATTR(stat, 0644, \ + stat##_show, \ + stat##_store) + +#endif /* BRMR_PRI_H */ diff --git a/drivers/block/brmr/brmr-proto.h b/drivers/block/brmr/brmr-proto.h new file mode 100644 index 000000000000..c5f0f25a5eb7 --- /dev/null +++ b/drivers/block/brmr/brmr-proto.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#define BRMR_PROTO_VER_MAJOR 0 +#define BRMR_PROTO_VER_MINOR 1 + +#define BRMR_CMD_RSP_MAGIC 0xDEADF00D + +struct brmr_blk_dev_params { + /* + * Params holding block device related info + */ + __le32 max_hw_sectors; + __le32 max_write_zeroes_sectors; + __le32 max_discard_sectors; + __le32 discard_granularity; + __le32 discard_alignment; + __le16 physical_block_size; + __le16 logical_block_size; + __le16 max_segments; + __le16 secure_discard; + u8 cache_policy; +}; + +enum brmr_msg_type { + BRMR_MSG_IO, + BRMR_MSG_CMD, +}; + +struct brmr_msg_hdr { + __le16 type; + __le16 __padding; +}; + +enum brmr_msg_cmd_type { + BRMR_CMD_MAP, // 0 + BRMR_CMD_REMAP, + + BRMR_CMD_UNMAP, + BRMR_CMD_GET_PARAMS, + + /* + * Add new command types above this. + */ + BRMR_CMD_RSP, +}; + +struct brmr_msg_map_new_cmd { + struct brmr_blk_dev_params dev_params; + + u32 version; /* version of the header itself */ + u64 mapped_size; /* size in 512 byte blocks of this device */ +}; + +struct brmr_msg_cmd { + struct brmr_msg_hdr hdr; + u8 ver; + u8 cmd_type; + u8 rsvd[2]; + union { + struct brmr_msg_map_new_cmd map_new_cmd; + /* May be other command(s) later */ + }; +}; + +/** + * struct brmr_cmd_get_params_rsp - response message to BRMR_CMD_GET_PARAMS + * @hdr: message header + * @nsectors: number of sectors in the usual 512b unit + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * @max_write_zeroes_sectors: max sectors for WRITE ZEROES in the 512b unit + * @max_discard_sectors: max. sectors that can be discarded at once in 512b + * unit. 
+ * @discard_granularity: size of the internal discard allocation unit in bytes + * @discard_alignment: offset from internal allocation assignment in bytes + * @physical_block_size: physical block size device supports in bytes + * @logical_block_size: logical block size device supports in bytes + * @max_segments: max segments hardware support in one transfer + * @secure_discard: supports secure discard + * @cache_policy: support write-back caching or FUA? + */ +struct brmr_cmd_get_params_rsp { + struct brmr_blk_dev_params dev_params; + + /* + * Params holding brmr device related info + */ + u8 mapped; + __le64 mapped_size; +}; + +struct brmr_msg_cmd_rsp { + struct brmr_msg_hdr hdr; + u64 magic; + u8 ver; + u8 cmd_type; + u8 status; + u8 rsvd[1]; + union { + struct brmr_cmd_get_params_rsp get_params_rsp; + //any other command responces. + }; +}; + +struct brmr_cmd_priv { + void *dev; + u8 cmd_type; + void *rsp_buf; + size_t rsp_buf_len; + int errno; + struct completion complete_done; +}; + +enum brmr_cache_policy { + BRMR_FUA = 1 << 0, + BRMR_WRITEBACK = 1 << 1, +}; diff --git a/drivers/block/brmr/brmr-srv-sysfs.c b/drivers/block/brmr/brmr-srv-sysfs.c new file mode 100644 index 000000000000..7e413eb258bb --- /dev/null +++ b/drivers/block/brmr/brmr-srv-sysfs.c @@ -0,0 +1,707 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "brmr-srv.h" +#include "rmr-srv.h" + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +static struct class *rmr_str_class; +static struct device *rmr_ctl_dev; +static struct device *rmr_strs_dev; + +enum { + BRMR_SRV_STR_OPT_ERR = 0, + BRMR_SRV_STR_OPT_DEVICE = 1 << 0, + BRMR_SRV_STR_OPT_POOL = 1 << 2, + BRMR_SRV_STR_OPT_MAPPED_SIZE = 1 << 3, + BRMR_SRV_STR_OPT_MODE = 1 << 4, +}; + +static const unsigned int rmr_str_opt_mandatory[] = { + BRMR_SRV_STR_OPT_POOL, + BRMR_SRV_STR_OPT_DEVICE, + BRMR_SRV_STR_OPT_MAPPED_SIZE, +}; + +static const match_table_t rmr_str_opt_tokens = { + { BRMR_SRV_STR_OPT_POOL, "pool=%s" }, + { BRMR_SRV_STR_OPT_DEVICE, "device=%s" }, + { BRMR_SRV_STR_OPT_MAPPED_SIZE, "mapped_size=%s" }, + { BRMR_SRV_STR_OPT_MODE, "mode=%s" }, + { BRMR_SRV_STR_OPT_ERR, NULL }, +}; + +struct brmr_srv_str_options { + char *pool; + char *device; + unsigned long mapped_size; +}; + +static void brmr_srv_remove_store(struct brmr_srv_blk_dev *dev, struct kobj_attribute *attr, + bool delete) +{ + mutex_lock(&store_mutex); + + blk_str_destroy_sysfs_files(dev, &attr->attr); + + brmr_srv_blk_close(dev, delete); + + pr_info("put blkdev %s\n", dev->bdev->bd_disk->disk_name); + bdev_fput(dev->bdev_file); + + pr_info("%s store %s, store name %s.\n", (delete ? 
"Delete" : "Remove"), + dev->name, dev->poolname); + brmr_srv_blk_destroy(dev); + mutex_unlock(&store_mutex); +} + +static int brmr_srv_parse_add_opts(const char *buf, struct brmr_srv_str_options *opt, + unsigned int *replace) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + sep_opt = strstrip(options); + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_str_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case BRMR_SRV_STR_OPT_POOL: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("add_store: pool name is too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(opt->pool, p, NAME_MAX); + kfree(p); + break; + + case BRMR_SRV_STR_OPT_DEVICE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("add_store: device name is too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(opt->device, p, NAME_MAX); + kfree(p); + break; + + case BRMR_SRV_STR_OPT_MAPPED_SIZE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + ret = kstrtoul(p, 0, &opt->mapped_size); + if (ret) { + pr_err("mapped_size isn't an integer: %d\n", ret); + kfree(p); + goto out; + } + + if (opt->mapped_size == 0) { + pr_err("mapped_size cannot be 0\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + + kfree(p); + break; + + case BRMR_SRV_STR_OPT_MODE: + if (!replace) { + pr_err("%s: mode option not supported here\n", __func__); + ret = -EINVAL; + goto out; + } + + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + if (!strcmp(p, "replace")) { + *replace = true; + } else { + pr_err("%s: Unknown mode '%s'\n", __func__, p); + ret = -EINVAL; + kfree(p); + goto out; + } + kfree(p); + break; + + default: + pr_err("add_store: Unknown parameter or missing value '%s'\n", + p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(rmr_str_opt_mandatory); i++) { + if ((opt_mask & rmr_str_opt_mandatory[i])) { + ret = 0; + } else { + pr_err("add_store: Parameters missing\n"); + ret = -EINVAL; + break; + } + } + +out: + kfree(options); + return ret; +} + +static ssize_t blk_str_dev_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + return sysfs_emit(page, "%llu\n", dev->dev_size); +} + +static struct kobj_attribute blk_str_dev_size_attr = + __ATTR(dev_size, 0644, blk_str_dev_size_show, NULL); + +static ssize_t blk_str_mapped_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + return sysfs_emit(page, "%llu\n", dev->mapped_size); +} + +static struct kobj_attribute blk_str_mapped_size_attr = + __ATTR(mapped_size, 0644, blk_str_mapped_size_show, NULL); + +static ssize_t blk_str_bdev_name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + return sysfs_emit(page, "%s\n", dev->name); +} + +static struct kobj_attribute blk_str_bdev_name_attr = + __ATTR(bdev_name, 0644, blk_str_bdev_name_show, NULL); + +static ssize_t blk_str_remove_store_show(struct kobject *kobj, + 
struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 to remove the store\n"); +} + +static ssize_t blk_str_remove_store_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + dev->name, attr->attr.name, buf); + return -EINVAL; + } + + brmr_srv_remove_store(dev, attr, false); + + return count; +} + +static struct kobj_attribute blk_str_remove_store_attr = + __ATTR(remove_store, 0644, + blk_str_remove_store_show, blk_str_remove_store_store); + +static ssize_t blk_str_delete_store_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 to delete the store\n"); +} + +static ssize_t blk_str_delete_store_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + dev->name, attr->attr.name, buf); + return -EINVAL; + } + + brmr_srv_remove_store(dev, attr, true); + + return count; +} + +static struct kobj_attribute blk_str_delete_store_attr = + __ATTR(delete_store, 0644, + blk_str_delete_store_show, blk_str_delete_store_store); + +static ssize_t state_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + size_t count = 0; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + if (test_bit(BRMR_SRV_STORE_OPEN, &dev->state)) + count += sysfs_emit_at(page, count, "open\n"); + else + count += sysfs_emit_at(page, count, "closed\n"); + + if (test_bit(BRMR_SRV_STORE_MAPPED, &dev->state)) + count += sysfs_emit_at(page, count, "mapped\n"); + else + count += sysfs_emit_at(page, count, "unmapped\n"); + + return count; +} + +static struct kobj_attribute blk_str_state_attr = + __ATTR_RO(state); + +static struct attribute *blk_str_map_attrs[] = { + &blk_str_dev_size_attr.attr, + &blk_str_mapped_size_attr.attr, + &blk_str_bdev_name_attr.attr, + &blk_str_remove_store_attr.attr, + &blk_str_delete_store_attr.attr, + &blk_str_state_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(blk_str_map); + +static struct kobj_type blk_str_device_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = blk_str_map_groups, +}; + +static int blk_str_create_sysfs_files(struct brmr_srv_blk_dev *dev) +{ + int ret; + + ret = kobject_init_and_add(&dev->kobj, &blk_str_device_ktype, + &rmr_strs_dev->kobj, + "%s", dev->poolname); + if (ret) + pr_err("Failed to create sysfs dir for store %s, name %s, err=%d\n", + dev->name, dev->poolname, ret); + + return ret; +} + +void blk_str_destroy_sysfs_files(struct brmr_srv_blk_dev *dev, + const struct attribute *sysfs_self) +{ + if (sysfs_self) + sysfs_remove_file_self(&dev->kobj, sysfs_self); + + kobject_del(&dev->kobj); + kobject_put(&dev->kobj); +} + +/** + * brmr_srv_blk_dev_exit() - Destroy and put the blkdev + * + * @dev: RMR block device structure to be used. 
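+ * Note: in this file it is only used on the store setup error paths; a
+ * fully set up store is instead torn down via brmr_srv_blk_close() followed
+ * by bdev_fput() and brmr_srv_blk_destroy().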
+ * + * Description: + * This function gives up the blkdev reference, and destroys the rmr block device + */ +static void brmr_srv_blk_dev_exit(struct brmr_srv_blk_dev *dev) +{ + pr_info("%s: put blkdev %s\n", __func__, dev->name); + bdev_fput(dev->bdev_file); + + brmr_srv_blk_destroy(dev); +} + +/** + * brmr_srv_blk_dev_init() - Create and initialize a brmr server store block device + * + * @pool_name: Name to be given to the created rmr block device + * @dev_name: path to the block device + * @mapped_size:mapped size of the block device + * + * Description: + * This function checks whether the rmr pool is available to be registered. + * It then creates the block device, and initializes it. + * + * Return: + * Pointer to the created rmr block device on success + * Error pointer on error + */ +static struct brmr_srv_blk_dev *brmr_srv_blk_dev_init(char *pool_name, char *dev_name, + u64 mapped_size) +{ + struct file *bdev_file; + struct brmr_srv_blk_dev *dev; + + dev = brmr_srv_blk_create(dev_name, pool_name); + if (IS_ERR(dev)) { + pr_err("failed to alloc store for device %s: %pe\n", pool_name, dev); + return dev; + } + + bdev_file = bdev_file_open_by_path(dev_name, DEFAULT_BLK_OPEN_FLAGS, + dev, NULL); + if (IS_ERR(bdev_file)) { + pr_err("%s: bdev_file_open_by_path for device %s failed with err (%pe)\n", + __func__, dev_name, bdev_file); + brmr_srv_blk_destroy(dev); + return ERR_CAST(bdev_file); + } + + dev->bdev_file = bdev_file; + dev->bdev = file_bdev(bdev_file); + dev->dev_size = get_capacity(dev->bdev->bd_disk); + strscpy(dev->name, dev->bdev->bd_disk->disk_name, sizeof(dev->name)); + + if (mapped_size < BLK_STR_MIN_MAPPED_SIZE) { + pr_err("%s: Given mapped size %llu less than minimum default(%lu) for dev %s\n", + __func__, mapped_size, BLK_STR_MIN_MAPPED_SIZE, dev->name); + brmr_srv_blk_dev_exit(dev); + return ERR_PTR(-ENOSPC); + } + + if (mapped_size + BLK_STR_MD_SIZE_SECTORS > dev->dev_size) { + pr_err("can not map %llu, only %llu available %s\n", + mapped_size, dev->dev_size - BLK_STR_MD_SIZE_SECTORS, dev->name); + brmr_srv_blk_dev_exit(dev); + return ERR_PTR(-ENOSPC); + } + + dev->mapped_size = mapped_size; + + pr_info("%s: succeeded\n", __func__); + + return dev; +} + +static ssize_t brmr_srv_create_store_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_str_options opt; + char dev_name[NAME_MAX]; + char pool_name[NAME_MAX]; + struct brmr_srv_blk_dev *dev; + struct brmr_srv_blk_dev_meta *md_page; + int md_state, err; + + opt.pool = pool_name; + opt.device = dev_name; + opt.mapped_size = 0; + + if (brmr_srv_parse_add_opts(buf, &opt, NULL)) + goto out; + + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_err("%s: Failed to allocate page to read md\n", __func__); + goto out; + } + + mutex_lock(&store_mutex); + + dev = brmr_srv_blk_dev_init(pool_name, dev_name, opt.mapped_size); + if (IS_ERR(dev)) { + pr_err("%s: brmr_srv_blk_dev_init failed: %pe\n", __func__, dev); + goto mut_unlock; + } + + md_state = brmr_srv_read_and_check_md(dev, md_page); + if (md_state != -1) { + /* + * read and check md failed. 
It could be read error or that md exists + */ + pr_err("%s: md read and check failed: %d\n", __func__, md_state); + goto dev_exit; + } + + err = brmr_srv_blk_open(dev, dev_name, true, false); + if (err) { + pr_err("failed to open %s, err %d\n", dev_name, err); + goto dev_exit; + } + + err = blk_str_create_sysfs_files(dev); + if (err) { + pr_err("failed to create sysfs files\n"); + goto dev_close; + } + + mutex_unlock(&store_mutex); + pr_info("Created new blk store for %s, with disk %s\n", pool_name, dev_name); + + kfree(md_page); + return count; + +dev_close: + brmr_srv_blk_close(dev, true); +dev_exit: + brmr_srv_blk_dev_exit(dev); +mut_unlock: + mutex_unlock(&store_mutex); + kfree(md_page); +out: + return -EINVAL; +} + +static ssize_t brmr_srv_create_store_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo \"pool= device= mapped_size=\" > %s\n\n", + attr->attr.name); +} + +static struct kobj_attribute brmr_srv_create_store_attr = + __ATTR(create_store, 0644, + brmr_srv_create_store_show, brmr_srv_create_store_store); + +static ssize_t brmr_srv_add_store_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_blk_dev *dev; + char dev_name[NAME_MAX]; + char pool_name[NAME_MAX]; + struct brmr_srv_str_options opt; + struct brmr_srv_blk_dev_meta *md_page; + int md_state, ret; + unsigned int replace = false; + + opt.pool = pool_name; + opt.device = dev_name; + opt.mapped_size = 0; + + if (brmr_srv_parse_add_opts(buf, &opt, &replace)) + goto out; + + /* + * Disable replace mode for now. + * Most of the code for replace mode to work is present, but there are some + * edge cases which needs work, and a info exchange between storage nodes which + * needs to be added. + */ + if (replace) { + pr_err("%s: Replace mode not supported yet\n", __func__); + goto out; + } + + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_err("Failed to allocate page to read md\n"); + goto out; + } + + mutex_lock(&store_mutex); + + dev = brmr_srv_blk_dev_init(pool_name, dev_name, opt.mapped_size); + if (IS_ERR(dev)) { + pr_err("brmr_srv_blk_dev_init failed: %pe\n", dev); + goto mut_unlock; + } + + md_state = brmr_srv_read_and_check_md(dev, md_page); + if (md_state == -1) { + /* + * md doesn't exists. This means the disk is an empty one. + * We have to replace, so check the mode first + */ + if (!replace) { + pr_err("%s: Incorrect mode %d. md doesn't exists\n", __func__, replace); + goto dev_exit; + } + + /* + * we have to do the following, + * + * 1) Check params like mapped size from at least one other storage node + * 2) Do discard + */ + pr_info("%s: No md found. Replacing disk %s for pool %s, size %llu\n", + __func__, dev_name, pool_name, dev->mapped_size); + } else if (md_state == 0) { + /* + * md exists. + * We are restoring an earlier used device. + */ + if (replace) { + pr_err("%s: Incorrect mode %d. md exists\n", __func__, replace); + goto dev_exit; + } + + /* + * Validate the metadata stored with the data provided. + */ + ret = brmr_srv_blk_validate_md(dev, md_page); + if (ret) { + pr_err("Local metadata validation failed\n"); + goto dev_exit; + } + + memcpy(&dev->dev_params, &md_page->dev_params, sizeof(struct rmr_blk_dev_params)); + dev->state = md_page->state; + + pr_info("%s: md found. 
Re-adding disk %s for pool %s, size %llu\n", + __func__, dev_name, pool_name, dev->mapped_size); + } else { + pr_err("%s: md cannot be read for block device %s, Err = %d\n", + __func__, dev->name, md_state); + goto dev_exit; + } + + if (brmr_srv_blk_open(dev, dev_name, false /* create */, replace)) { + pr_err("failed to open %s\n", dev_name); + goto dev_exit; + } + + ret = blk_str_create_sysfs_files(dev); + if (ret) { + pr_err("failed to create sysfs files\n"); + goto dev_close; + } + + mutex_unlock(&store_mutex); + + kfree(md_page); + return count; + +dev_close: + brmr_srv_blk_close(dev, replace); +dev_exit: + brmr_srv_blk_dev_exit(dev); +mut_unlock: + mutex_unlock(&store_mutex); + kfree(md_page); +out: + return -EINVAL; +} + +static ssize_t brmr_srv_add_store_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo \"pool= device= mapped_size=\" > %s\n\n", + attr->attr.name); +} + +static struct kobj_attribute brmr_srv_add_store_attr = + __ATTR(add_store, 0644, + brmr_srv_add_store_show, brmr_srv_add_store_store); + +static struct attribute *default_attrs[] = { + &brmr_srv_create_store_attr.attr, + &brmr_srv_add_store_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +int brmr_srv_create_sysfs_files(void) +{ + int err; + dev_t devt = MKDEV(0, 0); + + rmr_str_class = class_create("brmr-server"); + if (IS_ERR(rmr_str_class)) + return PTR_ERR(rmr_str_class); + + rmr_ctl_dev = device_create(rmr_str_class, NULL, devt, NULL, "ctl"); + if (IS_ERR(rmr_ctl_dev)) { + err = PTR_ERR(rmr_ctl_dev); + goto cls_destroy; + } + + rmr_strs_dev = device_create(rmr_str_class, NULL, devt, NULL, "stores"); + if (IS_ERR(rmr_strs_dev)) { + err = PTR_ERR(rmr_strs_dev); + goto ctl_destroy; + } + + err = sysfs_create_group(&rmr_ctl_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto strs_destroy; + + return 0; + +strs_destroy: + device_unregister(rmr_strs_dev); +ctl_destroy: + device_unregister(rmr_ctl_dev); +cls_destroy: + class_destroy(rmr_str_class); + + return err; +} + +void brmr_srv_destroy_sysfs_files(void) +{ + sysfs_remove_group(&rmr_ctl_dev->kobj, &default_attr_group); + device_unregister(rmr_strs_dev); + device_unregister(rmr_ctl_dev); + class_destroy(rmr_str_class); +} diff --git a/drivers/block/brmr/brmr-srv.c b/drivers/block/brmr/brmr-srv.c new file mode 100644 index 000000000000..cf85a54e4511 --- /dev/null +++ b/drivers/block/brmr/brmr-srv.c @@ -0,0 +1,1402 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include + +#include "brmr-srv.h" +#include "rmr-srv.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(BRMR_SERVER_VER_STRING); +MODULE_DESCRIPTION("BRMR Server"); +MODULE_LICENSE("GPL"); + +LIST_HEAD(store_list); +DEFINE_MUTEX(store_mutex); /* mutex to protect store_list */ + +/** + * brmr_srv_blk_validate_md() - Parse metadata for the given rmr block device and validate it + * + * @dev: RMR block device against which the md is to be validated + * @meta: pointer to metadata to be checked + * + * Return: + * 0: On success + * -Error: On failure + */ +int brmr_srv_blk_validate_md(struct brmr_srv_blk_dev *dev, struct brmr_srv_blk_dev_meta *meta) +{ + if (meta->magic != BRMR_BLK_STORE_MAGIC) { + pr_warn("No md found. 
store %s md magic=%llX does not match %X\n", + dev->poolname, meta->magic, BRMR_BLK_STORE_MAGIC); + return -EINVAL; + } + + // TODO: check version! + + if (dev->dev_size && dev->dev_size != meta->dev_size) { + pr_err("store %s dev_size %llu does not match md value %llu\n", + dev->poolname, dev->dev_size, meta->dev_size); + return -EINVAL; + } + + if (dev->mapped_size != meta->mapped_size) { + pr_err("store %s mapped_size %llu does not match md value %llu\n", + dev->poolname, dev->mapped_size, meta->mapped_size); + return -EINVAL; + } + + if (strncmp(dev->poolname, meta->poolname, NAME_MAX)) { + pr_err("store %s does not match md value %s\n", + dev->poolname, meta->poolname); + return -EINVAL; + } + + pr_debug("store %s md: mapped_size=%llu\n", + dev->poolname, meta->mapped_size); + pr_debug("md parsing is done for store %s\n", dev->poolname); + + return 0; +} + +/** + * brmr_srv_blk_fill_md() - Fill metadata from brmr srv block device + * + * @dev: BRMR server block device from which data is to be taken + * @data: pointer to metadata + * + * Return: + * 0: On success + * -Error: On failure + */ +static int brmr_srv_blk_fill_md(struct brmr_srv_blk_dev *dev, void *data) +{ + struct brmr_srv_blk_dev_meta *meta = data; + + meta->magic = BRMR_BLK_STORE_MAGIC; + meta->version = 0; + meta->dev_size = dev->dev_size; + meta->offset = BLK_STR_MD_SIZE_SECTORS; + meta->ts = jiffies; // or ktime_get_real_seconds(); + meta->mapped_size = dev->mapped_size; + meta->state = dev->state; + + memcpy(&meta->dev_params, &dev->dev_params, sizeof(struct rmr_blk_dev_params)); + + strscpy(meta->poolname, dev->poolname, NAME_MAX); + + pr_debug("md filling pool %s is done for dev %s\n", meta->poolname, dev->name); + + return 0; +} + +static int brmr_srv_blk_md_io_sync(struct block_device *bdev, int rw, void *md_data) +{ + int err = 0; + struct bio *bio; + blk_opf_t bio_flags = REQ_META; + u32 bytes; + + bio = bio_alloc(bdev, 1, bio_flags, GFP_NOIO); + if (!bio) { + pr_err("Failed to allocate metadata bio\n"); + return -ENOMEM; + } + + bytes = bio_add_page(bio, virt_to_page(md_data), PAGE_SIZE, 0); + if (bytes != PAGE_SIZE) { + pr_err("Failed to add page to bio, bytes returned=%u, expected %lu\n", + bytes, PAGE_SIZE); + err = -EINVAL; + goto bio_put; + } + + if (rw == READ) + bio->bi_opf = REQ_OP_READ; + else + bio->bi_opf = REQ_OP_WRITE | REQ_FUA; + + bio->bi_opf |= bio_flags; + bio->bi_iter.bi_sector = 0; + bio_set_dev(bio, bdev); + + pr_debug("submit_bio_wait dev %s, rw %s\n", + bdev->bd_disk->disk_name, rw == WRITE ? "WRITE" : "READ"); + err = submit_bio_wait(bio); + if (err) { + pr_err("Error reading md from %s, err %d\n", + bdev->bd_disk->disk_name, err); + goto bio_put; + } + pr_info("%s: for dev %s md rw %s is completed with code %d\n", + __func__, bdev->bd_disk->disk_name, rw == WRITE ? 
"WRITE" : "READ", err); + +bio_put: + bio_put(bio); + + return err; +} + +/** + * brmr_srv_blk_read_md() - read md from given block device + * + * @bdev: block device from which to read md + * @md_page: buffer to fill with md + */ +static int brmr_srv_blk_bdev_read_md(struct block_device *bdev, char *md_page) +{ + int err = 0; + + err = brmr_srv_blk_md_io_sync(bdev, READ, md_page); + if (err) { + pr_err("error reading md from %s, err %d\n", bdev->bd_disk->disk_name, err); + return err; + } + + pr_debug("read md from dev %s is done\n", bdev->bd_disk->disk_name); + + return err; +} + +static int brmr_srv_blk_write_md(struct brmr_srv_blk_dev *dev) +{ + int err = 0; + void *md_page; + + pr_debug("flush md to dev %s\n", dev->name); + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_err("Failed to allocate page to read md\n"); + err = -ENOMEM; + goto out; + } + + err = brmr_srv_blk_fill_md(dev, md_page); + if (err) { + pr_err("error filling md for dev %s, err %d\n", dev->name, err); + goto free_md_page; + } + + err = brmr_srv_blk_md_io_sync(dev->bdev, WRITE, md_page); + if (err) { + pr_err("error writing md to %s, err %d\n", dev->name, err); + goto free_md_page; + } + pr_debug("flush md to dev is done %s\n", dev->name); + +free_md_page: + kfree(md_page); +out: + return err; +} + +static void brmr_srv_blk_zero_md(struct brmr_srv_blk_dev *dev) +{ + int err = 0; + void *md_page; + + pr_debug("zero md on dev %s\n", dev->name); + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_warn("Failed to allocate page to read md\n"); + return; + } + + err = brmr_srv_blk_md_io_sync(dev->bdev, WRITE, md_page); + if (err) + pr_warn("error writing zero md to %s, err %d\n", dev->name, err); + + pr_debug("zero md on dev is done %s\n", dev->name); + kfree(md_page); +} + +static void brmr_srv_ref_kill(struct brmr_srv_blk_dev *dev) +{ + percpu_ref_kill(&dev->kref); + wait_for_completion(&dev->comp); +} + +static void brmr_srv_blk_release(struct percpu_ref *kref) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kref, struct brmr_srv_blk_dev, kref); + complete(&dev->comp); +} + +/** + * brmr_srv_blk_close() - Close a brmr srv block device + * + * @dev: BRMR server block device to be closed + * + * Description: + * Close an opened brmr srv store block device. + * This function is the opposite of brmr_srv_blk_open. + * This function is supposed to be the check and stop for inflight IOs. + * + * Locks: + * store_mutex should be held while calling this. + */ +void brmr_srv_blk_close(struct brmr_srv_blk_dev *dev, bool delete) +{ + pr_info("rmr store name: %s; dev %s is closing\n", dev->poolname, dev->name); + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_OPEN); + + list_del(&dev->entry); + + pr_info("brmr server store blk dev %s wait for io to complete.\n", dev->name); + brmr_srv_ref_kill(dev); + + /* + * Reinit the ref counter so that RMR can send metadata requests. 
+ */ + reinit_completion(&dev->comp); + percpu_ref_reinit(&dev->kref); + + rmr_srv_unregister(dev->poolname, delete); + dev->pool = NULL; + brmr_srv_ref_kill(dev); + + if (delete) + brmr_srv_blk_zero_md(dev); +} + +static int brmr_srv_blk_do_discard(struct brmr_srv_blk_dev *dev) +{ + struct rmr_pool *pool = dev->pool; + int err; + + pr_info("store id %s has mapped size of %llu, send discarded chunks to rmr pool %s\n", + dev->poolname, dev->mapped_size, dev->pool->poolname); + + err = rmr_srv_discard_id(pool, 0, 0, 0, true); + if (err) + pr_err("store %s failed to discard all data\n", dev->poolname); + + return err; +} + +/** + * brmr_srv_init_cmd() - Initialize message command + * + * @msg: command message where to init + */ +static void brmr_srv_init_cmd(struct brmr_msg_cmd *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.type = cpu_to_le16(BRMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->ver = BRMR_PROTO_VER_MAJOR; +} + +/** + * brmr_srv_cmd_conf() - Confirmation function for brmr srv store internal command message + * + * @priv: priv pointer to brmr command private data + * @errno: error number passed from RMR. + * See description of errno in RMR function. + * + * Description: + * Command response for a map new command can fail on multiple levels. + * If RMR fails to send the message to any or one of the nodes, that would reflect on the + * errno. If the command fails on BRMR level, that would reflect on the rsp struct. + * The error number will be used differently by different commands accordingly. + */ +static void brmr_srv_cmd_conf(void *priv, int errno) +{ + struct brmr_cmd_priv *cmd_priv = (struct brmr_cmd_priv *)priv; + + cmd_priv->errno = errno; + + switch (cmd_priv->cmd_type) { + case BRMR_CMD_GET_PARAMS: + if (cmd_priv->errno) + pr_err("%s: BRMR_CMD_GET_PARAMS failed with err=%pe on sending", + __func__, ERR_PTR(errno)); + + break; + + default: + cmd_priv->errno = -EINVAL; + pr_err("%s: Unknown command type %d err=%d\n", __func__, cmd_priv->cmd_type, errno); + } + + complete(&cmd_priv->complete_done); +} + +/** + * brmr_srv_send_msg_cmd() - Sends command message to internal rmr pool through rmr-srv pool + * + * @dev: pointer to brmr device + * @msg: msg struct to be sent + * @rsp_buf: response buffer where the response of the storage side is stored + * @rsp_buf_len: length of the response buffer + * + * Return: + * Negative if failed to sent command + * As handled by each command in brmr_cmd_conf, if succeeded to send command + * + * Context: + * Would block until response is received + */ +static int brmr_srv_send_msg_cmd(struct brmr_srv_blk_dev *dev, struct brmr_msg_cmd *msg, + void *rsp_buf, size_t rsp_buf_len) +{ + struct brmr_cmd_priv cmd_priv; + struct kvec vec; + int ret; + + vec = (struct kvec) { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + + cmd_priv.dev = dev; + cmd_priv.cmd_type = msg->cmd_type; + cmd_priv.rsp_buf = rsp_buf; + cmd_priv.rsp_buf_len = rsp_buf_len; + cmd_priv.errno = 0; + init_completion(&cmd_priv.complete_done); + + ret = rmr_srv_pool_cmd_with_rsp(dev->pool, brmr_srv_cmd_conf, &cmd_priv, &vec, 1, rsp_buf, + rsp_buf_len, sizeof(struct brmr_msg_cmd_rsp)); + + if (!ret) { + wait_for_completion(&cmd_priv.complete_done); + ret = cmd_priv.errno; + } + + return ret; +} + +/** + * brmr_srv_blk_get_params() - Get parameters from other servers + * + * @dev: Backend device for which to be checked + * + * Description: + * Check whether parameters from other servers are consistent with this server through + * internal network. 
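+ * "Parameters" here means the mapped size plus the block device limits in
+ * struct brmr_blk_dev_params (max_hw_sectors, discard limits, block sizes,
+ * cache policy, ...), which every responding node reports back in its
+ * BRMR_CMD_GET_PARAMS response.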
+ * + * Return: + * 0 on success of checks + * -Negative error value on failure of checks. + * -EAGAIN if no sync sessions are connected to this server. + */ +static int brmr_srv_blk_get_params(void *device) +{ + struct brmr_srv_blk_dev *dev; + struct brmr_msg_cmd msg; + struct brmr_msg_cmd_rsp *brmr_cmd_rsp; + void *rsp_buf; + size_t rsp_buf_len; + int err = 0, i; + bool checked = false; + + dev = (struct brmr_srv_blk_dev *)device; + brmr_srv_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_GET_PARAMS; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) + return -ENOMEM; + + err = brmr_srv_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (err < 0) { + pr_warn("%s: brmr_send_msg_cmd failed with errno %d\n", __func__, err); + /* + * Sending could fail for various reasons. The server may be isolated and has + * no connected sync sessions to other nodes. Or the connected server has no + * store attached. + */ + goto free_data; + } + + /* + * We do not care if the command failed for few storage nodes, as long as we get a good + * response from one of them. + * + * The mapped size of all storage nodes which are connected should be the same, whether + * the backend device of those nodes is mapped or not. + * + * TODO: If the responses of other storage nodes are different, then use values from + * nodes which are mapped. If there are no mapped devices in the pool, then the check + * will fail when the mapped sizes are different. + */ + brmr_cmd_rsp = (struct brmr_msg_cmd_rsp *)rsp_buf; + for (i = 0; i < RMR_POOL_MAX_SESS; i++, brmr_cmd_rsp++) { + struct brmr_cmd_get_params_rsp *get_params_rsp = &brmr_cmd_rsp->get_params_rsp; + struct brmr_blk_dev_params *rsp_dev_params; + + /* + * If there is no magic, or the command failed, + * we do not use that nodes info to perform the check. + */ + if (brmr_cmd_rsp->magic != BRMR_CMD_RSP_MAGIC || + brmr_cmd_rsp->status) + continue; + + if (dev->mapped_size != le64_to_cpu(get_params_rsp->mapped_size)) { + pr_err("%s: Mismatch in mapped_size: %llu != %llu\n", __func__, + dev->mapped_size, le64_to_cpu(get_params_rsp->mapped_size)); + err = -EINVAL; + goto free_data; + } + + rsp_dev_params = &get_params_rsp->dev_params; + + dev->dev_params.max_hw_sectors = le32_to_cpu(rsp_dev_params->max_hw_sectors); + dev->dev_params.max_write_zeroes_sectors = + le32_to_cpu(rsp_dev_params->max_write_zeroes_sectors); + dev->dev_params.max_discard_sectors = + le32_to_cpu(rsp_dev_params->max_discard_sectors); + dev->dev_params.discard_granularity = + le32_to_cpu(rsp_dev_params->discard_granularity); + dev->dev_params.discard_alignment = le32_to_cpu(rsp_dev_params->discard_alignment); + dev->dev_params.physical_block_size = + le16_to_cpu(rsp_dev_params->physical_block_size); + dev->dev_params.logical_block_size = + le16_to_cpu(rsp_dev_params->logical_block_size); + dev->dev_params.max_segments = le16_to_cpu(rsp_dev_params->max_segments); + dev->dev_params.secure_discard = le16_to_cpu(rsp_dev_params->secure_discard); + dev->dev_params.cache_policy = rsp_dev_params->cache_policy; + + /* + * At least check passed with one mapped storage node + * + * We still perform the check for other mapped storage nodes just for sanity. 
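+ * Note that dev_params is simply overwritten with the values from each
+ * responding node; only mapped_size is cross-checked, and any mismatch
+ * there fails the whole operation.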
+ */ + checked = true; + } + + if (checked == false) { + pr_err("%s: Check for mapped_size failed for dev %s.\n", + __func__, dev->poolname); + err = -EINVAL; + } + +free_data: + kfree(rsp_buf); + + return err; +} + +/** + * brmr_srv_blk_add_handle_replace() - Handle check and discard for a store which was replaced + * + * @dev: RMR block device to be closed + * + * Description: + * When an empty disk is added to an already existing brmr server store, it means that the + * empty disk is to replace the disk which was present in the existing brmr srv store. + * Before replacing the disk with the new empty one, there are a number of things to be done. + * This function performs the following task, + * 1) Get some parameters from other storage node through the internal network, and checks + * whether the mapped_size passed for the new empty disk is correct or not. + * 2) If the above check passed, then discard is sent above to rmr-server. + * + * Return: + * 0 on success + * -Error value on error + */ +static int brmr_srv_blk_add_handle_replace(struct brmr_srv_blk_dev *dev) +{ + int err = 0; + + /* + * The check passed. We can now do the discard safely. + */ + err = brmr_srv_blk_do_discard(dev); + if (err) { + pr_err("%s: brmr_srv_blk_do_discard failed for dev %s\n", __func__, dev->poolname); + return err; + } + + /* + * We are done with everything, and we are good. + * We now set the MAPPED state and write metadata again so it is persisted. + * so that IOs can be served. + */ + brmr_srv_blk_set_state(dev, BRMR_SRV_STORE_MAPPED); + err = brmr_srv_blk_write_md(dev); + if (err) { + pr_err("%s: dev %s: write md error %d\n", __func__, dev->name, err); + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_MAPPED); + return err; + } + + /* + * After the discarded entries are sent to rmr-server, set the map version of + * rmr pool to zero. + */ + rmr_srv_replace_store(dev->pool); + return 0; +} + +/** + * brmr_srv_read_and_check_md() - Read and check metadata if it exists + * + * @dev: BRMR server block device for which the metadata is to be checked + * @md_page: pointer to the buf where to read the metadata + * + * Description: + * Read metadata from the given store device, and check whether metadata exists. + * + * Return: + * 0: read was successful and metadata exists + * -1: read was successful but metadata doesn't exists + * -Errno: read failed + */ +int brmr_srv_read_and_check_md(struct brmr_srv_blk_dev *dev, void *md_page) +{ + struct brmr_srv_blk_dev_meta *meta = md_page; + int err; + + err = brmr_srv_blk_bdev_read_md(dev->bdev, md_page); + if (err) { + pr_err("%s: failed to read md, err=%d\n", __func__, err); + return -EINVAL; + } + + if (meta->magic != BRMR_BLK_STORE_MAGIC) { + pr_info("%s: No MD exists for block device %s, md magic=%llX does not match %X\n", + __func__, dev->name, meta->magic, BRMR_BLK_STORE_MAGIC); + return -1; + } + + pr_info("%s: %s MD exists for block device %s\n", __func__, meta->poolname, dev->name); + + return 0; +} + +/** + * brmr_srv_blk_open() - Open an brmr srv block device + * + * @dev: BRMR server block device structure to be used. + * @path: path to the block device. + * @create: Whether to create a new store or open an existing one. + * @replace: Whether the device is being added to replace an empty disk. + * + * Description: + * Open the block device "path", and populate the brmr srv block device "dev" + * with the details. 
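+ * (The backing bdev itself is already opened by brmr_srv_blk_dev_init();
+ * this function registers the store with the RMR server pool, cross-checks
+ * parameters with other storage nodes where applicable and persists the
+ * on-disk metadata.)
+ *
+ * It is normally reached through the server sysfs interface, for example
+ * (illustrative path and values only):
+ *
+ *	echo "pool=pool0 device=/dev/nvme0n1 mapped_size=2097152" > \
+ *		/sys/class/brmr-server/ctl/create_store
+ *
+ * where mapped_size is given in 512-byte sectors.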
+ * To close the device, call brmr_srv_blk_close() + * + * Return: + * 0 on success + * -Error value on error + * + * Locks: + * store_mutex should be held while calling this. + */ +int brmr_srv_blk_open(struct brmr_srv_blk_dev *dev, const char *path, + bool create, bool replace) +{ + struct rmr_attrs attr; + int err; + + err = rmr_srv_query(NULL, dev->mapped_size, &attr); + if (err) { + pr_err("dev %s: rmr srv query failed %d\n", dev->name, err); + return err; + } + + if ((dev->mapped_size + BLK_STR_MD_SIZE_SECTORS + attr.rmr_md_size) > dev->dev_size) { + pr_err("%s: dev %s: No space for rmr metadata %llu(in sectors)\n", + __func__, dev->name, attr.rmr_md_size); + return -ENOSPC; + } + + /* + * After the device registers to the RMR server pool, there will be metadata requests from + * RMR server transmitted to the device which starts reference counting. The reference + * count of the device must be initialized before any in flight requests are sent to BRMR. + */ + err = percpu_ref_init(&dev->kref, brmr_srv_blk_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (err) { + pr_err("%s: percpu ref init failed.\n", __func__); + return -EINVAL; + } + init_completion(&dev->comp); + + dev->pool = rmr_srv_register(dev->poolname, &pstore_blk_ops, dev, + dev->mapped_size, create ? RMR_SRV_DISK_CREATE : + (replace ? RMR_SRV_DISK_REPLACE : + RMR_SRV_DISK_ADD)); + if (!dev->pool) { + pr_err("Failed registering blk store %s, err\n", dev->poolname); + brmr_srv_ref_kill(dev); + return -EINVAL; + } + + brmr_srv_blk_set_state(dev, BRMR_SRV_STORE_OPEN); + + if (!create) { + err = brmr_srv_blk_get_params(dev); + if (replace) { + /* + * Any failure of getting parameters is not allowed when replacing a store. + * Either it failed to send the command or the parameters are different. + */ + if (err) { + pr_err("%s: replace_store: brmr_srv_blk_get_params failed with err %d\n", + __func__, err); + goto close_dev; + } + } else { + /* + * The store creation will fail if the connected servers to this server + * share different parameter values. If sending the command of getting + * parameters failed due to no sync sessions connected to this server + * where no parameters are received, the store will be created, delaying + * checks when this server is connected to some other servers. + */ + if (err && err != -EAGAIN) { + pr_err("%s: create_store: brmr_srv_blk_get_params failed with err %d\n", + __func__, err); + goto close_dev; + } + } + + /* + * TODO: Would we be creating the maps for replace (empty disk) at the + * same time as we create one for create_disk? + */ + if (replace) { + err = brmr_srv_blk_add_handle_replace(dev); + if (err) { + pr_err("%s: replace_store %s: handling replace failed with err %d", + __func__, dev->poolname, err); + goto close_dev; + } + } + } + + /* we write md in both cases (new or old device) just to check if device is ok + * for writing + */ + err = brmr_srv_blk_write_md(dev); + if (err) { + pr_err("dev %s: write md error %d\n", dev->name, err); + goto close_dev; + } + + list_add(&dev->entry, &store_list); + + pr_info("%s: brmr srv blk str %s, dev %s set state to open\n", __func__, dev->poolname, + dev->name); + + return 0; + +close_dev: + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_OPEN); + /* + * TODO: Ideally, the unregister should be called with (create || replace). + * But right now there is no way to RMR to go ahead with the delete, + * even if marked_delete is not set. 
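+ * As written, a store that was only just created is unregistered with
+ * delete semantics, while a store that was being re-added is kept on the
+ * RMR side (the second argument simply follows the 'create' flag).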
+ */ + rmr_srv_unregister(dev->poolname, create); + dev->pool = NULL; + brmr_srv_ref_kill(dev); + + return err; +} + +/** + * brmr_srv_blk_cleanup() - Cleanup all the opened and active brmr srv block devices + * + * Description: + * This function is called when the module brmr server store is getting removed. + * It closes, destroys and frees all the open and active brmr server block devices. + */ +static void brmr_srv_blk_cleanup(void) +{ + struct brmr_srv_blk_dev *dev, *tmp; + + mutex_lock(&store_mutex); + list_for_each_entry_safe(dev, tmp, &store_list, entry) { + blk_str_destroy_sysfs_files(dev, NULL); + brmr_srv_blk_close(dev, false); + + pr_info("put blkdev %s\n", dev->bdev->bd_disk->disk_name); + bdev_fput(dev->bdev_file); + + brmr_srv_blk_destroy(dev); + } + mutex_unlock(&store_mutex); +} + +/** + * brmr_srv_blk_create() - Create an brmr_srv_blk_dev with the given data + * + * @path: path to the block device. + * @poolname: Name to be given to the created block device + * + * Description: + * To destroy a created brmr server block device, call brmr_srv_blk_destroy() + * + * Return: + * Pointer to the allocated brmr srv block device on success + * Error pointer on error + */ +struct brmr_srv_blk_dev *brmr_srv_blk_create(const char *path, char *poolname) +{ + struct brmr_srv_blk_dev *dev; + int err = 0; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + err = -ENOMEM; + goto err; + } + + strscpy(dev->poolname, poolname, NAME_MAX); + + dev->io_priv_cache = kmem_cache_create("brmr_srv_io_priv_cache", + sizeof(struct brmr_srv_io_priv), 0, 0, NULL); + if (!dev->io_priv_cache) { + pr_err("failed to create cache for device %s\n", poolname); + err = -ENOMEM; + goto free_dev; + } + + pr_debug("brmr srv blk store with name %s created\n", poolname); + + return dev; + +free_dev: + kfree(dev); +err: + return ERR_PTR(err); +} + +/** + * brmr_srv_blk_destroy() - Destroy a given brmr_srv_blk_dev + * + * @dev: brmr server block device to be destroyed + * @sysfs_self: Pointer to self attribute + * + * Description: + * This function is the opposite of brmr_srv_blk_create() + * The pointer to the self attribute is used to denote whether the destroy call + * is a result of a sysfs task for its own device. + */ +void brmr_srv_blk_destroy(struct brmr_srv_blk_dev *dev) +{ + kmem_cache_destroy(dev->io_priv_cache); + kfree(dev); +} + +/** + * brmr_srv_blk_map_dev() - Process a map command from the client side + * + * @dev: brmr server block device to be destroyed + * @map_cmd: Pointer to structure holding map command info + * + * Description: + * We save all the data and param sent in the command in out metadata, + * since these are assured to have been validated across all storage nodes. + * + * For future get params requests, we send back these instead of reading them + * from the underlying block device. 
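+ * The handler below rejects the command if the store is already mapped, if
+ * the requested size does not leave room for the on-disk metadata area, or
+ * if it conflicts with a previously recorded mapped_size; only then are the
+ * received parameters copied in and persisted via brmr_srv_blk_write_md().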
+ * + * Return: + * 0 on success + * -Error value on error + */ +static int brmr_srv_blk_map_dev(struct brmr_srv_blk_dev *dev, + const struct brmr_msg_map_new_cmd *map_cmd) +{ + const struct brmr_blk_dev_params *cmd_dev_params = &map_cmd->dev_params; + int err; + u64 recvd_mapped_size = map_cmd->mapped_size; + + pr_info("%s: Mapping device %s with mapped_size %llu, recvd size %llu\n", + __func__, dev->name, dev->mapped_size, recvd_mapped_size); + + if (test_bit(BRMR_SRV_STORE_MAPPED, &dev->state)) { + pr_err("%s: Received map command for already mapped device %s\n", + __func__, dev->name); + return -EINVAL; + } + + if (recvd_mapped_size > dev->dev_size - BLK_STR_MD_SIZE_SECTORS) { + pr_err("can not map %llu, only %llu available %s\n", + recvd_mapped_size, dev->dev_size - BLK_STR_MD_SIZE_SECTORS, dev->name); + return -ENOSPC; + } + + if (dev->mapped_size && dev->mapped_size != recvd_mapped_size) { + pr_err("dev %s is already mapped with size %llu, does not match %llu", + dev->name, dev->mapped_size, recvd_mapped_size); + return -EINVAL; + } + + dev->mapped_size = recvd_mapped_size; + + dev->dev_params.max_hw_sectors = le32_to_cpu(cmd_dev_params->max_hw_sectors); + dev->dev_params.max_write_zeroes_sectors = + le32_to_cpu(cmd_dev_params->max_write_zeroes_sectors); + dev->dev_params.max_discard_sectors = le32_to_cpu(cmd_dev_params->max_discard_sectors); + dev->dev_params.discard_granularity = le32_to_cpu(cmd_dev_params->discard_granularity); + dev->dev_params.discard_alignment = le32_to_cpu(cmd_dev_params->discard_alignment); + dev->dev_params.physical_block_size = le16_to_cpu(cmd_dev_params->physical_block_size); + dev->dev_params.logical_block_size = le16_to_cpu(cmd_dev_params->logical_block_size); + dev->dev_params.max_segments = le16_to_cpu(cmd_dev_params->max_segments); + dev->dev_params.secure_discard = le16_to_cpu(cmd_dev_params->secure_discard); + dev->dev_params.cache_policy = cmd_dev_params->cache_policy; + + brmr_srv_blk_set_state(dev, BRMR_SRV_STORE_MAPPED); + + err = brmr_srv_blk_write_md(dev); + if (err) { + pr_err("failed to write md for %s, err %d\n", dev->name, err); + dev->mapped_size = 0; + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_MAPPED); + return -EINVAL; + } + + return 0; +} + +/* Always succeeds. */ +static int brmr_srv_blk_unmap_dev(struct brmr_srv_blk_dev *dev) +{ + pr_info("unmap device: %s\n", dev->name); + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_MAPPED); + + return 0; +} + +static bool brmr_srv_blk_io_allowed(void *store_priv) +{ + struct brmr_srv_blk_dev *dev = store_priv; + + if (!dev) { + pr_err("no store registered\n"); + return false; + } + + return test_bit(BRMR_SRV_STORE_OPEN, &dev->state) && + test_bit(BRMR_SRV_STORE_MAPPED, &dev->state); +} + +#define bio_disk_name(bio) ((bio)->bi_bdev->bd_disk->disk_name) +#define bio_first_sector(bio) ((bio_end_sector(bio) - bio_sectors(bio))) + +static void brmr_srv_bi_end_io(struct bio *bio) +{ + struct brmr_srv_io_priv *io_priv = bio->bi_private; + struct brmr_srv_blk_dev *dev = io_priv->dev; + int err; + + err = blk_status_to_errno(bio->bi_status); + pr_debug("end io called for dev %s, bio=%p, err=%d\n", dev->poolname, bio, err); + + if (err) { + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_OPEN); + pr_err("Dev %s, Bio %p type %s, err=%d bdev_name=%s\n", dev->poolname, + bio, bio_data_dir(bio) == WRITE ? 
"W" : "R", err, bio_disk_name(bio)); + } + + rmr_srv_req_resp(io_priv->priv, err); + + kmem_cache_free(dev->io_priv_cache, io_priv); + brmr_srv_blk_put_ref(dev); + bio_put(bio); +} + +static int brmr_srv_submit_bi(struct brmr_srv_blk_dev *dev, void *data, u64 offset, u32 length, + unsigned long flags, u16 prio, void *priv) +{ + struct bio *bio; + struct brmr_srv_io_priv *io_priv; + blk_opf_t bio_flags; + int ret = 0; + bool is_md_op = false; + + switch (rmr_op(flags)) { + case RMR_OP_READ: + bio_flags = REQ_OP_READ; + break; + case RMR_OP_WRITE: + case RMR_OP_SYNCREQ: + bio_flags = REQ_OP_WRITE; + break; + case RMR_OP_DISCARD: + bio_flags = REQ_OP_DISCARD; + break; + case RMR_OP_WRITE_ZEROES: + bio_flags = REQ_OP_WRITE_ZEROES; + break; + case RMR_OP_FLUSH: + bio_flags = REQ_OP_WRITE | REQ_PREFLUSH; + break; + case RMR_OP_MD_READ: + bio_flags = REQ_OP_READ; + is_md_op = true; + break; + case RMR_OP_MD_WRITE: + bio_flags = REQ_OP_WRITE; + is_md_op = true; + break; + default: + pr_err("Wrong flags=%lu\n", flags); + return -EINVAL; + } + + /* + * Most md IO are created on rmr-srv and does not get priority value passed on from rmr-clt + */ + if (is_md_op) { + bio_flags |= REQ_META; + if (rmr_op(flags) == RMR_OP_MD_WRITE) + bio_flags |= REQ_FUA; + } + + if (flags & RMR_F_SYNC) + bio_flags |= REQ_SYNC; + + if (flags & RMR_F_FUA) + bio_flags |= REQ_FUA; + + bio = bio_alloc(dev->bdev, 1, bio_flags, GFP_KERNEL); + if (bio_add_page(bio, virt_to_page(data), length, + offset_in_page(data)) != length) { + pr_err("Failed to map data to bio\n"); + ret = -EINVAL; + goto put_bio; + } + + io_priv = kmem_cache_zalloc(dev->io_priv_cache, GFP_KERNEL); + if (!io_priv) { + pr_err("Failed to alloc io_priv for op %lx dev %s\n", flags, dev->poolname); + ret = -ENOMEM; + goto put_bio; + } + + io_priv->dev = dev; + io_priv->priv = priv; + + bio->bi_private = io_priv; + bio->bi_end_io = brmr_srv_bi_end_io; + bio->bi_iter.bi_sector = offset; + bio->bi_iter.bi_size = length; + bio_set_dev(bio, dev->bdev); + + pr_debug("Submit %s bio=%p, disk=%s, flag=[%lx], bio_flag=[%x], op=[%x]" + "first_sect=%llu, sectors=%d\n", + is_md_op ? 
"md req" : "req", bio, bio_disk_name(bio), + flags, bio_flags, rmr_op(flags), + (u64)bio_first_sector(bio), bio_sectors(bio)); + + if (is_md_op) { + ret = submit_bio_wait(bio); + if (ret) { + pr_err("Error waiting md from %s, err %d\n", + dev->bdev->bd_disk->disk_name, ret); + } + goto end_bio; + } else { + /* + * Most md IO are created on rmr-srv and does not get priority value passed on from + * rmr-clt + */ + bio->bi_ioprio = prio; + submit_bio(bio); + } + + return 0; +end_bio: + rmr_srv_req_resp(io_priv->priv, ret); + kmem_cache_free(dev->io_priv_cache, io_priv); +put_bio: + bio_put(bio); + return ret; +} + +/** + * brmr_srv_process_blk_req() - Processes brmr srv store IO messages + * + * @dev: pointer to rmr block device + * @data: pointer to data + * @data_offset: offset on disk (represented in bytes) + * @length: length of data in bytes + * @flags: IO flags + * @prio: prio from block layer + * @priv: pointer to priv data for rmr + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_process_blk_req(void *device, void *data, u32 data_offset, + u32 length, unsigned long flags, u16 prio, void *priv) +{ + struct brmr_srv_blk_dev *dev = (struct brmr_srv_blk_dev *)device; + u64 offset = 0; /* in sectors */ + int ret = 0; + + if (!brmr_srv_blk_get_ref(dev)) { + pr_err("for dev %s, name %s, failed to get_ref\n", + dev->name, dev->poolname); + return -EIO; + } + + if (!brmr_srv_blk_io_allowed(dev)) { + pr_err("Store name %s, offset %u, length %u, io is not allowed!\n", + dev->poolname, data_offset, length); + ret = -EINVAL; + goto err; + } + + offset = BLK_STR_MD_SIZE_SECTORS; + offset += (data_offset) >> SECTOR_SHIFT; //bytes to sectors; + + pr_debug("Submitted req to %s, flag %lu offset %llu length %u\n", + dev->name, flags, offset, length); + ret = brmr_srv_submit_bi(dev, data, offset, length, flags, prio, priv); + if (ret) { + pr_err("%s: bio submission failed for data IO\n", __func__); + goto err; + } + + return 0; + +err: + brmr_srv_blk_put_ref(dev); + return ret; +} + +/** + * brmr_srv_process_blk_md_req() - Process the requests for rmr metadata + * + * Return: + * 0 on success + * + * Description: + * The rmr metadata will be stored at the end of the device. + */ +static int brmr_srv_process_blk_md_req(void *device, void *data, u32 data_offset, + u32 length, unsigned long flags, void *priv) +{ + struct brmr_srv_blk_dev *dev = device; + int err; + u64 offset = 0; /* in sectors */ + + if (!brmr_srv_blk_get_ref(dev)) { + pr_err("for dev %s, name %s, failed to get_ref\n", + dev->name, dev->poolname); + return -EIO; + } + + /* The mapped_size is in sectors. */ + offset = BLK_STR_MD_SIZE_SECTORS + dev->mapped_size; + offset += (data_offset) >> SECTOR_SHIFT; //bytes to sectors; + pr_debug("Submitted md req to %s, flag %lu offset %llu length %u\n", + dev->name, flags, offset, length); + /* + * It's no need to return err to upper layer here. If the submission of md request fails, + * it will go through the endreq path after the server req finishes processing. 
+ */ + err = brmr_srv_submit_bi(dev, data, offset, length, flags, 0, priv); + if (err) + pr_err("%s: bio submission failed for metadata IO\n", __func__); + brmr_srv_blk_put_ref(dev); + return 0; +} + +/** + * brmr_srv_init_cmd_rsp() - Initialize command response + * + * @msg: command response to initialize + */ +static void brmr_srv_init_cmd_rsp(struct brmr_msg_cmd_rsp *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.type = cpu_to_le16(BRMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->magic = BRMR_CMD_RSP_MAGIC; + msg->ver = BRMR_PROTO_VER_MAJOR; + msg->cmd_type = BRMR_CMD_RSP; +} + +/** + * brmr_srv_fill_dev_param_dev() - Fill dev params from the saved params in brmr srv block device + * + * @dev: pointer to brmr server block device + * @rsp: Pointer to command response structure holding params + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_fill_dev_param_dev(struct brmr_srv_blk_dev *dev, + struct brmr_cmd_get_params_rsp *rsp) +{ + struct brmr_srv_blk_dev_meta *md_page; + struct brmr_blk_dev_params *rsp_dev_params = &rsp->dev_params; + int ret; + + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + ret = -ENOMEM; + goto out; + } + + /* + * We have to read the metadata from the device. + */ + ret = brmr_srv_blk_bdev_read_md(dev->bdev, (void *)md_page); + if (ret) { + pr_err("%s: failed to read md, err=%d\n", __func__, ret); + goto out; + } + + if (md_page->magic != BRMR_BLK_STORE_MAGIC) { + pr_warn("%s: No md found. store %s md magic=%llX does not match %X\n", + __func__, dev->poolname, md_page->magic, BRMR_BLK_STORE_MAGIC); + ret = -EINVAL; + goto out; + } + + rsp_dev_params->max_hw_sectors = cpu_to_le32(md_page->dev_params.max_hw_sectors); + rsp_dev_params->max_write_zeroes_sectors = + cpu_to_le32(md_page->dev_params.max_write_zeroes_sectors); + rsp_dev_params->max_discard_sectors = cpu_to_le32(md_page->dev_params.max_discard_sectors); + rsp_dev_params->discard_granularity = cpu_to_le32(md_page->dev_params.discard_granularity); + rsp_dev_params->discard_alignment = cpu_to_le32(md_page->dev_params.discard_alignment); + rsp_dev_params->physical_block_size = cpu_to_le16(md_page->dev_params.physical_block_size); + rsp_dev_params->logical_block_size = cpu_to_le16(md_page->dev_params.logical_block_size); + rsp_dev_params->max_segments = cpu_to_le16(md_page->dev_params.max_segments); + rsp_dev_params->secure_discard = cpu_to_le16(md_page->dev_params.secure_discard); + rsp_dev_params->cache_policy = md_page->dev_params.cache_policy; + +out: + kfree(md_page); + return ret; +} + +/** + * brmr_srv_fill_dev_param_bdev() - Fill dev params from the underlying block device + * + * @dev: pointer to brmr server block device + * @rsp: Pointer to command response structure holding params + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_fill_dev_param_bdev(struct brmr_srv_blk_dev *dev, + struct brmr_cmd_get_params_rsp *rsp) +{ + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct brmr_blk_dev_params *rsp_dev_params = &rsp->dev_params; + + if (!q) { + pr_err("%s: no queue for dev %s\n", __func__, dev->name); + return -EINVAL; + } + + rsp_dev_params->logical_block_size = + cpu_to_le16(bdev_logical_block_size(bdev)); + rsp_dev_params->physical_block_size = + cpu_to_le16(bdev_physical_block_size(bdev)); + rsp_dev_params->max_segments = + cpu_to_le16(queue_max_segments(q)); + rsp_dev_params->max_hw_sectors = + cpu_to_le32(queue_max_hw_sectors(q)); + 
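+	/*
+	 * The remaining limits below are read straight from the backing bdev
+	 * and its queue limits; secure_discard carries the maximum
+	 * secure-erase sectors, and cache_policy encodes write-back/FUA
+	 * support as BRMR_WRITEBACK/BRMR_FUA flag bits.
+	 */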
rsp_dev_params->max_write_zeroes_sectors = + cpu_to_le32(bdev_write_zeroes_sectors(bdev)); + rsp_dev_params->max_discard_sectors = cpu_to_le32(bdev_max_discard_sectors(bdev)); + rsp_dev_params->discard_granularity = + cpu_to_le32(bdev_get_queue(bdev)->limits.discard_granularity); + rsp_dev_params->discard_alignment = + cpu_to_le32(bdev_get_queue(bdev)->limits.discard_alignment); + rsp_dev_params->secure_discard = cpu_to_le16(bdev_max_secure_erase_sectors(bdev)); + rsp_dev_params->cache_policy = 0; + + if (blk_queue_write_cache(q)) + rsp_dev_params->cache_policy |= BRMR_WRITEBACK; + if (bdev_fua(bdev)) + rsp_dev_params->cache_policy |= BRMR_FUA; + + return 0; +} + +/** + * brmr_srv_fill_get_params_rsp() - Fill dev params into the command response structure + * + * @dev: pointer to brmr server block device + * @brmr_cmd_rsp: Pointer to command response structure + * + * Description: + * For mapped devices, we need to pick up the params from the brmr server block device itself + * These are the same ones which are saved in the metadata of the device. + * + * For unmapped devices, we need to extract this info from the underlying block device + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_fill_get_params_rsp(struct brmr_srv_blk_dev *dev, + struct brmr_msg_cmd_rsp *brmr_cmd_rsp) +{ + struct brmr_cmd_get_params_rsp *rsp; + int ret; + + if (!dev) { + pr_err("%s: no brmr srv blk dev to get params\n", __func__); + return -ENODEV; + } + + if (!dev->bdev) { + pr_err("%s: no bdev opened for dev %s\n", __func__, dev->name); + return -EINVAL; + } + + rsp = &brmr_cmd_rsp->get_params_rsp; + + /* + * For a mapped device, we get the saved params in the device structure (read from md) + * since those are the ones which would have gone through validation, + * when the map happened. + * + * For unmapped device, we get params from the underlying bdev. 
+ */ + if (test_bit(BRMR_SRV_STORE_MAPPED, &dev->state)) + ret = brmr_srv_fill_dev_param_dev(dev, rsp); + else + ret = brmr_srv_fill_dev_param_bdev(dev, rsp); + + if (ret) { + pr_err("%s: Fill dev params failed for dev %s\n", __func__, dev->name); + return -EINVAL; + } + + rsp->mapped = test_bit(BRMR_SRV_STORE_MAPPED, &dev->state); + rsp->mapped_size = cpu_to_le64(dev->mapped_size); + pr_info("%s: dev %s, mapped_size %llu\n", __func__, + dev->name, le64_to_cpu(rsp->mapped_size)); + + return 0; +} + +/** + * brmr_srv_blk_cmd() - Processes brmr srv store command messages + * + * @device: brmr server store device + * @usr_buf: user buffer containing the command message struct (ones sent as kvec to rmr) + * @usr_len: length of the usr_buf + * @data: data buffer where the response can be sent back for brmr client to read + * @datalen: length of data buffer + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_blk_cmd(void *device, const void *usr_buf, int usr_len, void *data, + int datalen) +{ + struct brmr_srv_blk_dev *dev = device; + const struct brmr_msg_cmd *msg = (const struct brmr_msg_cmd *)usr_buf; + struct brmr_msg_cmd_rsp *brmr_cmd_rsp = (struct brmr_msg_cmd_rsp *)data; + int ret = 0; + + if (datalen < sizeof(*brmr_cmd_rsp)) { + WARN_ON(1); + return -EINVAL; + } + + if (!brmr_srv_blk_get_ref(dev)) { + pr_err("for dev %s, name %s, failed to get_ref to process command %d\n", + dev->name, dev->poolname, msg->cmd_type); + return -EIO; + } + + brmr_srv_init_cmd_rsp(brmr_cmd_rsp); + + switch (msg->cmd_type) { + case BRMR_CMD_MAP: + pr_info("%s: BRMR_CMD_MAP\n", __func__); + + brmr_cmd_rsp->status = brmr_srv_blk_map_dev(dev, &msg->map_new_cmd); + if (brmr_cmd_rsp->status) { + pr_err("Failed to map new dev to %s, err %d\n", + dev->name, brmr_cmd_rsp->status); + } + break; + case BRMR_CMD_REMAP: + pr_info("%s: BRMR_CMD_REMAP\n", __func__); + break; + case BRMR_CMD_UNMAP: + pr_info("%s: BRMR_CMD_UNMAP\n", __func__); + + brmr_cmd_rsp->status = brmr_srv_blk_unmap_dev(dev); + break; + case BRMR_CMD_GET_PARAMS: + pr_info("%s: BRMR_CMD_GET_PARAMS\n", __func__); + + brmr_cmd_rsp->status = brmr_srv_fill_get_params_rsp(dev, brmr_cmd_rsp); + break; + + default: + pr_err("%s: Unknown command type %d\n", __func__, msg->cmd_type); + } + + brmr_srv_blk_put_ref(dev); + + return ret; +} + +struct rmr_srv_store_ops pstore_blk_ops = { + .submit_req = brmr_srv_process_blk_req, + .submit_md_req = brmr_srv_process_blk_md_req, + .submit_cmd = brmr_srv_blk_cmd, + .io_allowed = brmr_srv_blk_io_allowed, + .get_params = brmr_srv_blk_get_params, +}; + +static int __init brmr_srv_init_module(void) +{ + int err = 0; + + pr_info("Loading module %s, version %s\n", + KBUILD_MODNAME, BRMR_SERVER_VER_STRING); + + err = brmr_srv_create_sysfs_files(); + if (err) { + pr_err("rmr_store_create_sysfs_files(), err: %d\n", err); + goto out; + } + + return 0; +out: + return err; +} + +static void __exit brmr_srv_cleanup_module(void) +{ + brmr_srv_blk_cleanup(); + brmr_srv_destroy_sysfs_files(); + + pr_info("Module %s unloaded\n", KBUILD_MODNAME); +} + +module_init(brmr_srv_init_module); +module_exit(brmr_srv_cleanup_module); diff --git a/drivers/block/brmr/brmr-srv.h b/drivers/block/brmr/brmr-srv.h new file mode 100644 index 000000000000..4180ee600e65 --- /dev/null +++ b/drivers/block/brmr/brmr-srv.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef BRMR_SRV_H +#define BRMR_SRV_H + 
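+/*
+ * Shared definitions for the BRMR server side: the on-disk metadata layout
+ * (struct brmr_srv_blk_dev_meta), per-store state bits and the percpu_ref
+ * based get/put helpers used by the IO paths.
+ */
+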
+#include +#include +#include +#include + +#include "brmr-proto.h" +#include "rmr-req.h" + +#define BRMR_SERVER_VER_MAJOR 0 +#define BRMR_SERVER_VER_MINOR 1 + +#ifndef BRMR_SERVER_VER_STRING +#define BRMR_SERVER_VER_STRING __stringify(BRMR_SERVER_VER_MAJOR) "." \ + __stringify(BRMR_SERVER_VER_MINOR) +#endif + +#define DEFAULT_BLK_OPEN_FLAGS (BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_EXCL) + +#define BRMR_BLK_STORE_MAGIC 0xC0FFEE +#define BLK_STR_MD_SIZE PAGE_SIZE +#define BLK_STR_MD_SIZE_SECTORS (PAGE_SIZE / SECTOR_SIZE) +#define BLK_STR_MIN_MAPPED_SIZE (PAGE_SIZE + BLK_STR_MD_SIZE) + +extern struct list_head store_list; +extern struct mutex store_mutex; + +extern struct rmr_srv_store_ops pstore_blk_ops; +extern struct kobject *rmr_strs_kobj; + +/* brmr server */ + +enum brmr_srv_store_state { + BRMR_SRV_STORE_OPEN, + BRMR_SRV_STORE_MAPPED, + BRMR_SRV_STORE_NEED_SYNC, +}; + +struct brmr_srv_io_priv { + struct brmr_srv_blk_dev *dev; + void *priv; +}; + +struct rmr_blk_dev_params { + u32 max_hw_sectors; + u32 max_write_zeroes_sectors; + u32 max_discard_sectors; + u32 discard_granularity; + u32 discard_alignment; + u16 physical_block_size; + u16 logical_block_size; + u16 max_segments; + u16 secure_discard; + u8 cache_policy; +}; + +struct brmr_srv_blk_dev { + char poolname[NAME_MAX]; + struct block_device *bdev; + struct file *bdev_file; + struct list_head entry; + char name[BDEVNAME_SIZE]; + struct rmr_pool *pool; + u64 mapped_size; /* in sectors */ + u64 dev_size; /* in sectors */ + struct rmr_blk_dev_params dev_params; + struct kmem_cache *io_priv_cache; + struct kobject kobj; + unsigned long state; + struct completion comp; + struct percpu_ref kref; +}; + +struct brmr_srv_blk_dev_meta { + char poolname[NAME_MAX]; + struct rmr_blk_dev_params dev_params; + u64 magic; /* magic token to identify a header */ + u32 version; /* version of the header itself */ + u64 dev_size; + u64 mapped_size; + u64 state; + u64 offset; + u64 ts; +} __packed; + +int brmr_srv_blk_validate_md(struct brmr_srv_blk_dev *dev, struct brmr_srv_blk_dev_meta *meta); +struct brmr_srv_blk_dev *brmr_srv_blk_create(const char *path, char *name); +void brmr_srv_blk_destroy(struct brmr_srv_blk_dev *dev); +int brmr_srv_blk_open(struct brmr_srv_blk_dev *dev, const char *path, bool create, bool replace); +void brmr_srv_blk_close(struct brmr_srv_blk_dev *dev, bool delete); + +int brmr_srv_read_and_check_md(struct brmr_srv_blk_dev *dev, void *md_page); + +static inline void brmr_srv_blk_set_state(struct brmr_srv_blk_dev *dev, + enum brmr_srv_store_state state) +{ + set_bit(state, &dev->state); +} + +static inline void brmr_srv_blk_clear_state(struct brmr_srv_blk_dev *dev, + enum brmr_srv_store_state state) +{ + clear_bit(state, &dev->state); +} + +static inline int brmr_srv_blk_get_ref(struct brmr_srv_blk_dev *dev) +{ + return percpu_ref_tryget(&dev->kref); +} + +static inline void brmr_srv_blk_put_ref(struct brmr_srv_blk_dev *dev) +{ + percpu_ref_put(&dev->kref); +} + + +/* brmr-server-sysfs.c */ + +int brmr_srv_create_sysfs_files(void); +void brmr_srv_destroy_sysfs_files(void); +void blk_str_destroy_sysfs_files(struct brmr_srv_blk_dev *dev, + const struct attribute *sysfs_self); + +#endif /* BRMR_SRV_H */ diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a7e3f29dc037..4b2470b5a592 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -110,5 +110,6 @@ source "drivers/infiniband/ulp/srpt/Kconfig" source "drivers/infiniband/ulp/iser/Kconfig" source "drivers/infiniband/ulp/isert/Kconfig" 
source "drivers/infiniband/ulp/rtrs/Kconfig" +source "drivers/infiniband/ulp/rmr/Kconfig" endif # INFINIBAND diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile index 51b0d41699b8..24c8e4b00065 100644 --- a/drivers/infiniband/ulp/Makefile +++ b/drivers/infiniband/ulp/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ obj-$(CONFIG_INFINIBAND_ISER) += iser/ obj-$(CONFIG_INFINIBAND_ISERT) += isert/ obj-$(CONFIG_INFINIBAND_RTRS) += rtrs/ +obj-$(CONFIG_INFINIBAND_RMR) += rmr/ diff --git a/drivers/infiniband/ulp/rmr/Kconfig b/drivers/infiniband/ulp/rmr/Kconfig new file mode 100644 index 000000000000..1d62322a02be --- /dev/null +++ b/drivers/infiniband/ulp/rmr/Kconfig @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config INFINIBAND_RMR + tristate + depends on INFINIBAND_ADDR_TRANS + +config INFINIBAND_RMR_CLIENT + tristate "RMR client module" + depends on INFINIBAND_ADDR_TRANS + depends on INFINIBAND_RTRS_CLIENT + select INFINIBAND_RMR + help + Reliable Multicast over RTRS (RMR) client module. + + RMR is an RDMA ULP that provides active-active block-level + replication on top of the RTRS transport. It guarantees + delivery of an I/O to a group of storage nodes and handles + resynchronization of data between storage nodes without + involving the compute client. This option builds the client + side, intended to be used by an upper-layer initiator such + as BRMR. + + If unsure, say N. + +config INFINIBAND_RMR_SERVER + tristate "RMR server module" + depends on INFINIBAND_ADDR_TRANS + depends on INFINIBAND_RTRS_SERVER + select INFINIBAND_RMR + help + RMR server module processing connection, IO and replication + requests from RMR clients on top of RTRS. It will pass IO + requests to its consumer, e.g. BRMR_server. + + If unsure, say N. 
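+
+# Example: a compute (client) node would typically build these as modules
+# (illustrative .config fragment only; storage nodes enable the *_SERVER
+# symbols instead, and INFINIBAND_RMR itself is selected automatically):
+#
+#   CONFIG_INFINIBAND_ADDR_TRANS=y
+#   CONFIG_INFINIBAND_RTRS_CLIENT=m
+#   CONFIG_INFINIBAND_RMR_CLIENT=m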
diff --git a/drivers/infiniband/ulp/rmr/Makefile b/drivers/infiniband/ulp/rmr/Makefile new file mode 100644 index 000000000000..c173092f4cf2 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +ccflags-y := -I$(srctree)/drivers/infiniband/ulp/rtrs + +CFLAGS_rmr-clt-trace.o = -I$(src) + +rmr-client-y := rmr-pool.o \ + rmr-clt.o \ + rmr-map-mgmt.o \ + rmr-clt-stats.o \ + rmr-clt-sysfs.o \ + rmr-map.o \ + rmr-clt-trace.o + +rmr-server-y := rmr-pool.o \ + rmr-srv.o \ + rmr-srv-md.o \ + rmr-srv-sysfs.o \ + rmr-req.o \ + rmr-map.o + +obj-$(CONFIG_INFINIBAND_RMR_CLIENT) += rmr-client.o +obj-$(CONFIG_INFINIBAND_RMR_SERVER) += rmr-server.o diff --git a/drivers/infiniband/ulp/rmr/rmr-clt-stats.c b/drivers/infiniband/ulp/rmr/rmr-clt-stats.c new file mode 100644 index 000000000000..83a4089defc0 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-stats.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include "rmr-clt.h" + +int rmr_clt_reset_read_retries(struct rmr_clt_stats *stats, bool enable) +{ + if (unlikely(!enable)) + return -EINVAL; + + atomic_set(&stats->read_retries, 0); + + return 0; +} + +ssize_t rmr_clt_stats_read_retries_to_str( + struct rmr_clt_stats *stats, char *page) +{ + return sysfs_emit(page, "%u\n", + atomic_read(&stats->read_retries)); +} + diff --git a/drivers/infiniband/ulp/rmr/rmr-clt-sysfs.c b/drivers/infiniband/ulp/rmr/rmr-clt-sysfs.c new file mode 100644 index 000000000000..7e12c526f0c9 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-sysfs.c @@ -0,0 +1,1496 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include /* for NAME_MAX */ +#include +#include +#include +#include +#include +#include + +#include "rmr-clt.h" + +/* + * Wait a bit before trying to reconnect after a failure + * in order to give server time to finish clean up which + * leads to "false positives" failed reconnect attempts + */ +#define RTRS_RECONNECT_BACKOFF 1000 + +#define RMR_DEFAULT_CHUNK_SIZE 131072 /* 128 KB */ + +static struct class *rmr_dev_class; +static struct device *rmr_ctl_dev; +static struct device *rmr_pool_dev; +static struct device *rmr_sess_dev; + +enum { + RMR_OPT_ERR = 0, + RMR_ADD_OPT_PATH = 1 << 0, + RMR_ADD_OPT_SESSNAME = 1 << 1, + RMR_ADD_OPT_MODE = 1 << 2, + RMR_DEL_OPT_MODE = 1 << 3, +}; + +static unsigned int rmr_opt_add_mandatory[] = { + RMR_ADD_OPT_PATH, + RMR_ADD_OPT_SESSNAME, + RMR_ADD_OPT_MODE, +}; + +/* For sync pools mode is not meaningful; only path and sessname are required. 
*/ +static unsigned int rmr_opt_add_sync_mandatory[] = { + RMR_ADD_OPT_PATH, + RMR_ADD_OPT_SESSNAME, +}; + +static const match_table_t rmr_opt_add_tokens = { + { RMR_ADD_OPT_PATH, "path=%s" }, + { RMR_ADD_OPT_SESSNAME, "sessname=%s" }, + { RMR_ADD_OPT_MODE, "mode=%s" }, + { RMR_OPT_ERR, NULL }, +}; + +enum rmr_opt_join { + RMR_JOIN_OPT_POOLNAME, + RMR_JOIN_OPT_Mandatory_count, + RMR_JOIN_OPT_SYNC, + RMR_JOIN_OPT_CHUNK_SIZE, + RMR_JOIN_OPT_ERR, +}; + +static const char * const rmr_srv_opts_mandatory_names[] = { + [RMR_JOIN_OPT_POOLNAME] = "poolname", +}; + +static const match_table_t rmr_opt_join_tokens = { + { RMR_JOIN_OPT_POOLNAME, "poolname=%s" }, + { RMR_JOIN_OPT_SYNC, "sync=%s" }, + { RMR_JOIN_OPT_CHUNK_SIZE, "chunk_size=%s" }, + { RMR_JOIN_OPT_ERR, NULL }, +}; + +static unsigned int rmr_opt_del_mandatory[] = { + RMR_DEL_OPT_MODE, +}; + +static const match_table_t rmr_opt_del_tokens = { + { RMR_DEL_OPT_MODE, "mode=%s" }, + { RMR_OPT_ERR, NULL }, +}; + +enum { + RMR_RECONNECT_OPT_ERR = 0, + RMR_RECONNECT_OPT_PATH = 1 << 0, +}; + +static unsigned int rmr_opt_reconnect_mandatory[] = { + RMR_RECONNECT_OPT_PATH, +}; + +static const match_table_t rmr_opt_reconnect_tokens = { + { RMR_RECONNECT_OPT_PATH, "path=%s" }, + { RMR_RECONNECT_OPT_ERR, NULL }, +}; + +/* remove new line from string */ +static void strip(char *s) +{ + char *p = s; + + while (*s != '\0') { + if (*s != '\n') + *p++ = *s++; + else + ++s; + } + *p = '\0'; +} + +static int rmr_clt_parse_add_sess_opts(const char *buf, char *sessname, int *create, + struct rtrs_addr *paths, size_t *path_cnt, + size_t max_path_cnt, const char *er_msg, + const match_table_t rmr_opt_tokens, + unsigned int *rmr_opt_mandatory, + size_t num_rmr_opt_mandatory) +{ + char *options, *options_orig, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + int p_cnt = 0; + + options_orig = kstrdup(buf, GFP_KERNEL); + if (!options_orig) + return -ENOMEM; + + options = strstrip(options_orig); + strip(options); + sep_opt = options; + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case RMR_ADD_OPT_SESSNAME: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("%s: sessname too long\n", er_msg); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(sessname, p, NAME_MAX); + kfree(p); + break; + + case RMR_ADD_OPT_PATH: + p = match_strdup(args); + if (!p || p_cnt >= max_path_cnt) { + ret = -ENOMEM; + goto out; + } + + ret = rtrs_addr_to_sockaddr(p, strlen(p), RTRS_PORT, + &paths[p_cnt]); + if (ret) { + pr_err("Can't parse path %s: %d\n", p, ret); + kfree(p); + goto out; + } + + p_cnt++; + + kfree(p); + break; + + case RMR_ADD_OPT_MODE: + if (!create) { + pr_err("%s: mode option not supported here\n", er_msg); + ret = -EINVAL; + goto out; + } + + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + if (!strcmp(p, "create")) { + *create = true; + } else if (!strcmp(p, "assemble")) { + *create = false; + } else { + pr_err("%s: Unknown mode '%s' (valid: create, assemble)\n", er_msg, p); + ret = -EINVAL; + kfree(p); + goto out; + } + kfree(p); + break; + + default: + pr_err("%s: Unknown parameter or missing value" + " '%s'\n", er_msg, p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < num_rmr_opt_mandatory; i++) { + if ((opt_mask & rmr_opt_mandatory[i])) { + ret = 0; + } else { + pr_err("%s: Parameters 
missing\n", er_msg); + ret = -EINVAL; + break; + } + } + +out: + if (path_cnt) + *path_cnt = p_cnt; + kfree(options_orig); + return ret; +} + +static void rmr_clt_destroy_session_sysfs_files(struct rmr_clt_pool_sess *pool_sess, + const struct attribute *sysfs_self) +{ + if (pool_sess->kobj.state_in_sysfs) { + sysfs_remove_link(&pool_sess->kobj, "clt_sess"); + + if (sysfs_self) + sysfs_remove_file_self(&pool_sess->kobj, sysfs_self); + kobject_del(&pool_sess->kobj); + kobject_put(&pool_sess->kobj); + } +} + +static int rmr_clt_parse_del_sess_opts(const char *buf, bool *delete) +{ + char *options, *options_orig, *sep_opt, *p; + substring_t args[MAX_OPT_ARGS]; + int i, token, opt_mask = 0, ret = -EINVAL; + + options_orig = kstrdup(buf, GFP_KERNEL); + if (!options_orig) + return -ENOMEM; + + options = strstrip(options_orig); + strip(options); + sep_opt = options; + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_opt_del_tokens, args); + opt_mask |= token; + + switch (token) { + case RMR_DEL_OPT_MODE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + if (!strcmp(p, "delete")) { + *delete = true; + } else if (!strcmp(p, "disassemble")) { + *delete = false; + } else { + pr_err("%s: Unknown mode '%s' (valid: delete, disassemble)\n", "del_sess", p); + ret = -EINVAL; + kfree(p); + goto out; + } + kfree(p); + break; + + default: + pr_err("%s: Unknown parameter or missing value '%s'\n", "del_sess", p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(rmr_opt_del_mandatory); i++) { + if ((opt_mask & rmr_opt_del_mandatory[i])) { + ret = 0; + } else { + pr_err("%s: Parameters missing\n", "del_sess"); + ret = -EINVAL; + break; + } + } + +out: + kfree(options_orig); + return ret; +} + +static ssize_t rmr_clt_del_sess_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_clt_del_sess_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess *clt_sess; + int err, i, idx; + bool delete = false; + u8 srv_sess_member_id; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + clt_sess = pool_sess->clt_sess; + srv_sess_member_id = pool_sess->member_id; + pool = pool_sess->pool; + clt_pool = (struct rmr_clt_pool *)pool->priv; + + err = rmr_clt_parse_del_sess_opts(buf, &delete); + if (err) + return err; + + if (pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_REMOVING)) { + /* + * Freeze + */ + rmr_clt_pool_io_freeze(clt_pool); + + /* + * Wait for all completion + */ + rmr_clt_pool_io_wait_complete(clt_pool); + + /* + * Remove the storage node from the pool members list. + */ + xa_erase(&pool->stg_members, srv_sess_member_id); + + /* + * We simply wait for all inflights to get over to make sure + * that they are not affected with the delete session messages + * we are going to send after this. + * Once the inflights are done, we can restart the IOs immediately, + * since the session state has been changed to "removing". + * + * Unfreeze and wake up. + */ + rmr_clt_pool_io_unfreeze(clt_pool); + + send_msg_leave_pool(pool_sess, delete, WAIT); + } + + pr_info("Closing session %s in pool %s\n", + pool_sess->sessname, pool->poolname); + + if (!pool->sync) { + if (delete) { + /* + * Delete map for this session if it exists. 
+ * For disassemble, keep the map so the piggyback loop + * continues to accumulate dirty entries for the member. + */ + rmr_pool_remove_map(pool, srv_sess_member_id); + + /* + * Clear the srv_md entry so the piggyback loop does + * not keep referencing a gone member. + * For disassemble, leave it intact — it is needed to + * identify the member during piggyback until reassembly. + */ + idx = rmr_pool_find_md(&pool->pool_md, srv_sess_member_id, false); + + if (idx >= 0) + memset(&pool->pool_md.srv_md[idx], 0, + sizeof(struct rmr_srv_md)); + /* + * TODO: Push the srv_md change to persistence disk on remaining storages. + */ + } else { + /* + * Disassemble: if this was the last non-sync session, no IOs + * will occur and the dirty maps serve no purpose. Delete them + * all; they will be recreated for all members on the first + * assemble via rmr_clt_process_non_sync_sess. + */ + if (xa_empty(&pool->stg_members)) { + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + u8 mid = pool->pool_md.srv_md[i].member_id; + + if (!mid) + continue; + rmr_pool_remove_map(pool, mid); + } + } + } + + /* + * Send messages to all other sessions, + * Informing them that a particular stor is getting deleted + */ + err = rmr_clt_del_stor_from_pool(pool_sess, delete); + if (err) { + pr_err("pool %s, del_stor failed for sess with member_id %u, err %d\n", + pool->poolname, srv_sess_member_id, err); + return err; + } + } + + /* + * Remove the session from the list. + */ + mutex_lock(&pool->sess_lock); + rmr_clt_del_pool_sess(pool_sess); + mutex_unlock(&pool->sess_lock); + + rmr_clt_destroy_session_sysfs_files(pool_sess, &attr->attr); + + rmr_clt_free_pool_sess(pool_sess); + rmr_clt_sess_put(clt_sess); + + if (list_empty(&pool->sess_list)) + rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_JOINED, false); + + return count; +} + +static struct kobj_attribute rmr_clt_del_pool_sess_attr = + __ATTR(del_sess, 0644, rmr_clt_del_sess_show, + rmr_clt_del_sess_store); + +static ssize_t rmr_clt_pool_sess_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_clt_pool_sess *pool_sess; + ssize_t written = 0; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + + written += scnprintf(page, PAGE_SIZE, "%s\n", + rmr_clt_sess_state_str(atomic_read(&pool_sess->state))); + + written += scnprintf(page + written, PAGE_SIZE - written, + "Maintenance mode: %d\n", pool_sess->maintenance_mode); + + return written; +} + +static struct kobj_attribute rmr_clt_pool_sess_state_attr = + __ATTR(state, 0444, rmr_clt_pool_sess_state_show, NULL); + +static ssize_t rmr_clt_sess_member_id_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_clt_pool_sess *pool_sess; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + + return scnprintf(page, PAGE_SIZE, "%u\n", + pool_sess->member_id); +} + +static struct kobj_attribute rmr_clt_pool_sess_member_id_attr = + __ATTR(member_id, 0644, rmr_clt_sess_member_id_show, + NULL); + +static ssize_t rmr_clt_sess_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "echo '1|0' > this_sysfs\n"); +} + +static ssize_t rmr_clt_sess_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_pool *pool; + int pool_sess_state, err; + bool enable; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + pool = pool_sess->pool; + + if (sysfs_streq(buf, 
"1")) + enable = true; + else if (sysfs_streq(buf, "0")) + enable = false; + else { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + goto err; + } + + pool_sess_state = atomic_read(&pool_sess->state); + + /* + * Manual disable is interpreted as switching to maintenance mode + * And it is only allowed for sessions NOT in "created" and "removing" state + * And non-sync sessions + */ + if (!enable && ((pool_sess_state == RMR_CLT_POOL_SESS_CREATED) || + (pool_sess_state == RMR_CLT_POOL_SESS_REMOVING) || + (pool_sess->pool->sync))) { + pr_err("Cannot put pool_sess in maintenance mode: state %d, sync %d\n", + pool_sess_state, pool_sess->pool->sync); + goto print_state_err; + } + + if (enable) + err = rmr_clt_enable_sess(pool_sess); + else + err = rmr_clt_set_pool_sess_mm(pool_sess); + if (err) { + pr_err("%s failed with err %d\n", __func__, err); + goto err; + } + + return count; + +print_state_err: + pr_err("Current state: %d\n", atomic_read(&pool_sess->state)); +err: + return -EINVAL; +} + +static struct kobj_attribute rmr_clt_pool_sess_enable_attr = + __ATTR(enable, 0644, rmr_clt_sess_enable_show, + rmr_clt_sess_enable_store); + +static ssize_t rmr_clt_sess_check_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "echo '1' > this_sysfs\n"); +} + +static ssize_t rmr_clt_sess_check_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + int err; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + goto err; + } + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_MAP_CHECK; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s failed with err %d\n", __func__, err); + goto err; + } + return count; + +err: + return -EINVAL; +} + +static struct kobj_attribute rmr_clt_pool_sess_check_map_attr = + __ATTR(check_map, 0644, rmr_clt_sess_check_map_show, + rmr_clt_sess_check_map_store); + +static struct attribute *rmr_clt_pool_sess_attrs[] = { + &rmr_clt_del_pool_sess_attr.attr, + &rmr_clt_pool_sess_state_attr.attr, + &rmr_clt_pool_sess_member_id_attr.attr, + &rmr_clt_pool_sess_enable_attr.attr, + &rmr_clt_pool_sess_check_map_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rmr_clt_pool_sess); + +static struct kobj_type rmr_clt_pool_sess_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_clt_pool_sess_groups, +}; + +static int rmr_clt_create_session_sysfs_files(struct rmr_clt_pool_sess *pool_sess) +{ + int ret; + + ret = kobject_init_and_add(&pool_sess->kobj, &rmr_clt_pool_sess_ktype, + &pool_sess->pool->sessions_kobj, + "%s", pool_sess->sessname); + if (ret) + pr_err("Failed to create sysfs dir for session '%s': %d\n", + pool_sess->sessname, ret); + + return ret; +} + +static ssize_t rmr_clt_pool_add_sess_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo \"" + "sessname=" + " path=<[srcaddr,]dstaddr>" + " [path=<[srcaddr,]dstaddr>]\" > %s\n\n" + "addr ::= [ ip: | ip: | gid: ]\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_add_sess_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rtrs_addr paths[3]; + struct sockaddr_storage saddr[ARRAY_SIZE(paths)]; + struct sockaddr_storage daddr[ARRAY_SIZE(paths)]; + struct 
rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_sess *clt_sess; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_pool_md *clt_md; + char *sessname; + size_t path_cnt; + int ret, index, create = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + clt_pool = (struct rmr_clt_pool *)pool->priv; + + sessname = kzalloc(NAME_MAX, GFP_KERNEL); + if (unlikely(!sessname)) + return -ENOMEM; + + for (path_cnt = 0; path_cnt < ARRAY_SIZE(paths); path_cnt++) { + paths[path_cnt].src = &saddr[path_cnt]; + paths[path_cnt].dst = &daddr[path_cnt]; + } + + ret = rmr_clt_parse_add_sess_opts(buf, sessname, + pool->sync ? NULL : &create, + paths, &path_cnt, ARRAY_SIZE(paths), + "add_sess", rmr_opt_add_tokens, + pool->sync ? rmr_opt_add_sync_mandatory + : rmr_opt_add_mandatory, + pool->sync ? ARRAY_SIZE(rmr_opt_add_sync_mandatory) + : ARRAY_SIZE(rmr_opt_add_mandatory)); + if (ret) + goto free_name; + + pr_info("%s: Creating rmr client session %s in pool %s\n", __func__, sessname, + pool->poolname); + + clt_sess = find_and_get_or_create_clt_sess(sessname, paths, path_cnt); + if (IS_ERR(clt_sess)) { + pr_err("failed to find and get or create clt sess %s\n", sessname); + ret = PTR_ERR(clt_sess); + goto free_name; + } + + pool_sess = rmr_clt_add_pool_sess(pool, clt_sess, create); + if (IS_ERR(pool_sess)) { + pr_err("failed to add pool sess %s to the pool %s\n", + sessname, pool->poolname); + ret = PTR_ERR(pool_sess); + goto put_clt_sess; + } + ret = rmr_clt_create_session_sysfs_files(pool_sess); + if (ret) { + pr_err("Creating sysfs files for %s in %s failed: %d\n", + pool_sess->sessname, pool->poolname, ret); + goto destroy_sess; + } + + ret = sysfs_create_link(&pool_sess->kobj, &clt_sess->kobj, "clt_sess"); + if (ret) { + pr_err("Creating symlink for %s failed, err: %d\n", + pool_sess->sessname, ret); + rmr_clt_destroy_session_sysfs_files(pool_sess, NULL); + goto destroy_sess; + } + // ret = sysfs_create_link(&sess->kobj, sess->sess_kobj, + // RTRS_LINK_NAME); + // if (ret) { + // pr_err("Creating rtrs symlink for %s in %s failed: %d\n", + // sess->sessname, pool->poolname, ret); + // rmr_clt_destroy_session_sysfs_files(sess, NULL); + // goto destroy_sess; + // } + rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_JOINED, true); + + clt_md = &pool->pool_md; + index = rmr_pool_find_md(clt_md, pool_sess->member_id, true); + if (index < 0) { + pr_err("No space for member %u in the clt_md\n", pool_sess->member_id); + goto destroy_sess; + } + clt_md->srv_md[index].member_id = pool_sess->member_id; + clt_md->srv_md[index].mapped_size = pool->mapped_size; + + kfree(sessname); + return count; + +destroy_sess: + rmr_clt_destroy_pool_sess(pool_sess, create); +put_clt_sess: + rmr_clt_sess_put(clt_sess); +free_name: + kfree(sessname); + return ret; +} + +static struct kobj_attribute rmr_clt_pool_add_sess_attr = + __ATTR(add_sess, 0644, rmr_clt_pool_add_sess_show, + rmr_clt_pool_add_sess_store); + +static ssize_t rmr_clt_pool_leave_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_leave_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + int ret; + + pool = container_of(kobj, struct rmr_pool, kobj); + clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, 
attr->attr.name, buf); + return -EINVAL; + } + + if (refcount_read(&clt_pool->refcount) > 1) { + pr_err("%s: Pool %s is in use.\n", __func__, pool->poolname); + return -EINVAL; + } + + pr_info("clt: Deleting pool '%s'\n", pool->poolname); + + ret = rmr_clt_remove_pool_from_sysfs(pool, &attr->attr); + if (unlikely(ret)) + return ret; + + return count; +} + +static struct kobj_attribute rmr_clt_pool_leave_pool_attr = + __ATTR(leave_pool, 0644, rmr_clt_pool_leave_pool_show, + rmr_clt_pool_leave_pool_store); + +static ssize_t rmr_clt_pool_chunk_size_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (pool->chunk_size == UINT_MAX) + return scnprintf(page, PAGE_SIZE, "undefined\n"); + + return scnprintf(page, PAGE_SIZE, "%u\n", pool->chunk_size); +} + +static struct kobj_attribute rmr_clt_pool_chunk_size_attr = + __ATTR(chunk_size, 0644, rmr_clt_pool_chunk_size_show, NULL); + +static ssize_t rmr_clt_pool_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool = container_of(kobj, struct rmr_pool, kobj); + struct rmr_dirty_id_map *map; + int i, lock_idx; + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + rmr_map_dump_bitmap(map); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return 0; +} + +static ssize_t rmr_clt_pool_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + int err; + rmr_id_t id = { 0, 0 }; + int srv_id; + + pool = container_of(kobj, struct rmr_pool, kobj); + if (sscanf(buf, "%llu %llu %d\n", &id.a, &id.b, &srv_id) != 3) { + pr_err("cannot parse id.a %s\n", buf); + return -EINVAL; + } + pr_debug("add id (%llu, %llu), srv_id %d\n", id.a, id.b, srv_id); + + /* + * If given chunk number exceeds total chunks for us, ignore! 
+ */ + if (id.b > pool->no_of_chunks) + return count; + + err = rmr_clt_map_add_id(pool, srv_id, id); + if (err == -ENOMEM) { + pr_err("failed insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + } else { + pr_debug("insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + } + + return count; +} + +static struct kobj_attribute rmr_clt_pool_map_attr = + __ATTR(map, 0644, rmr_clt_pool_map_show, + rmr_clt_pool_map_store); + +static ssize_t rmr_clt_pool_map_ver_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + ssize_t written; + + pool = container_of(kobj, struct rmr_pool, kobj); + + written = scnprintf(page, PAGE_SIZE, "Map ver: %llu\n", pool->map_ver); + + return written; +} + +static struct kobj_attribute rmr_clt_pool_map_ver_attr = + __ATTR(map_version, 0444, rmr_clt_pool_map_ver_show, NULL); + +static ssize_t rmr_clt_pool_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + int ret; + + pool = container_of(kobj, struct rmr_pool, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + return -EINVAL; + } + + ret = rmr_clt_pool_try_enable(pool); + if (ret) { + pr_err("%s: pool %s rmr_clt_pool_try_enable failed with err %d\n", + attr->attr.name, pool->poolname, ret); + return ret; + } + + return count; +} + +static struct kobj_attribute rmr_clt_pool_enable_attr = + __ATTR(pool_enable, 0644, rmr_clt_pool_enable_show, + rmr_clt_pool_enable_store); + +static ssize_t rmr_clt_pool_test_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_test_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + int err; + + pool = container_of(kobj, struct rmr_pool, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, buf); + return -EINVAL; + } + + pr_info("pool %s start test map...\n", pool->poolname); + err = rmr_clt_test_map(pool, pool); + if (err) { + pr_err("pool %s, test map failed, err %d\n", + pool->poolname, err); + return err; + } + pr_info("pool %s test map done.", pool->poolname); + + return count; +} + +static struct kobj_attribute rmr_clt_pool_test_map_attr = + __ATTR(test_map, 0644, rmr_clt_pool_test_map_show, + rmr_clt_pool_test_map_store); + +static struct attribute *rmr_clt_pool_attrs[] = { + &rmr_clt_pool_add_sess_attr.attr, + &rmr_clt_pool_leave_pool_attr.attr, + &rmr_clt_pool_chunk_size_attr.attr, + &rmr_clt_pool_map_attr.attr, + &rmr_clt_pool_map_ver_attr.attr, + &rmr_clt_pool_enable_attr.attr, + &rmr_clt_pool_test_map_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rmr_clt_pool); + +static struct kobj_type rmr_clt_pool_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_clt_pool_groups, +}; + +static struct kobj_type ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +static ssize_t rmr_clt_join_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo \"" + "poolname= sync=y|Y|0|1 [chunk_size=]\" " + "> %s\n", + attr->attr.name); +} + +static int 
rmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *stats_kobj); + +static int rmr_clt_create_pool_sysfs_files(struct rmr_pool *pool) +{ + int ret; + struct rmr_clt_pool *clt_pool; + + ret = kobject_init_and_add(&pool->kobj, &rmr_clt_pool_ktype, + &rmr_pool_dev->kobj, "%s", pool->poolname); + if (ret) { + pr_err("Failed to create sysfs dir for pool '%s': %d\n", + pool->poolname, ret); + return ret; + } + + ret = kobject_init_and_add(&pool->sessions_kobj, &ktype, &pool->kobj, + "sessions"); + if (unlikely(ret)) { + pr_err("Failed to create sessions dir for pool '%s': %d\n", + pool->poolname, ret); + goto put_pool_kobj; + } + clt_pool = (struct rmr_clt_pool *)pool->priv; + ret = rmr_clt_create_stats_files(&pool->kobj, &clt_pool->stats_kobj); + if (unlikely(ret)) { + pr_err("Failed to create sysfs stats files " + "for pool '%s': %d\n", + pool->poolname, ret); + goto put_sessions_kobj; + } + + return 0; + +put_sessions_kobj: + kobject_del(&pool->sessions_kobj); + kobject_put(&pool->sessions_kobj); +put_pool_kobj: + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + + return ret; +} + +void rmr_clt_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self) +{ + struct rmr_clt_pool *clt_pool; + + if (pool->kobj.state_in_sysfs) { + clt_pool = (struct rmr_clt_pool *)pool->priv; + kobject_del(&clt_pool->stats_kobj); + kobject_put(&clt_pool->stats_kobj); + + kobject_del(&pool->sessions_kobj); + kobject_put(&pool->sessions_kobj); + if (sysfs_self) + sysfs_remove_file_self(&pool->kobj, sysfs_self); + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + } +} + +static ssize_t rmr_clt_sess_reconnect_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "echo 'path=ip:' > this_sysfs\n"); +} + +static ssize_t rmr_clt_sess_reconnect_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_clt_sess *clt_sess; + struct rmr_clt_pool_sess *pool_sess; + struct rtrs_addr paths[3]; + struct sockaddr_storage saddr[ARRAY_SIZE(paths)]; + struct sockaddr_storage daddr[ARRAY_SIZE(paths)]; + size_t path_cnt; + int err; + + + clt_sess = container_of(kobj, struct rmr_clt_sess, kobj); + + pr_info("%s: Starting manual reconnect for clt_sess %s\n", __func__, clt_sess->sessname); + + /* + * The IP of the server has changed. + * Close the old rtrs connection, parse the path IP, + * and reconnect the session + */ + for (path_cnt = 0; path_cnt < ARRAY_SIZE(paths); path_cnt++) { + paths[path_cnt].src = &saddr[path_cnt]; + paths[path_cnt].dst = &daddr[path_cnt]; + } + + err = rmr_clt_parse_add_sess_opts(buf, NULL, NULL, paths, &path_cnt, ARRAY_SIZE(paths), + "reconnect_sess", rmr_opt_reconnect_tokens, + rmr_opt_reconnect_mandatory, + ARRAY_SIZE(rmr_opt_reconnect_mandatory)); + if (err) { + pr_err("%s: failed to parse options, err=%d\n", __func__, err); + return err; + } + + if (!IS_ERR_OR_NULL(clt_sess->rtrs)) { + pr_info("close rtrs clt for session %s\n", clt_sess->sessname); + + clt_sess->state = RMR_CLT_SESS_DISCONNECTED; + + /* + * Wait for the state to be seen by rmr client + * + * The ones which are already in the rcu read section (see rmr_get_sess_iu) + * would complete its get_permit for rtrs. + * After that, rtrs_clt_close would wait for all the inflight permits to be + * returned. 
+ */ + mutex_lock(&clt_sess->lock); + list_for_each_entry(pool_sess, &clt_sess->pool_sess_list, clt_sess_entry) + synchronize_srcu(&pool_sess->pool->sess_list_srcu); + mutex_unlock(&clt_sess->lock); + + rtrs_clt_close(clt_sess->rtrs); + clt_sess->rtrs = NULL; + + msleep(RTRS_RECONNECT_BACKOFF); + } + + err = rmr_clt_reconnect_sess(clt_sess, paths, path_cnt); + if (err) { + pr_err("rmr_clt_reconnect_sess Failed\n"); + return err; + } + + pr_info("%s: Manual reconnect for clt_sess %s succeeded\n", __func__, clt_sess->sessname); + return count; +} + +static struct kobj_attribute rmr_clt_sess_reconnect_attr = + __ATTR(reconnect, 0644, rmr_clt_sess_reconnect_show, + rmr_clt_sess_reconnect_store); + +static const char *rmr_clt_sess_state_names[] = { + [0] = "invalid state", + [RMR_CLT_SESS_DISCONNECTED] = "disconnected", + [RMR_CLT_SESS_CONNECTED] = "connected" +}; + +static ssize_t rmr_clt_sess_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_clt_sess *clt_sess; + + clt_sess = container_of(kobj, struct rmr_clt_sess, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", + rmr_clt_sess_state_names[clt_sess->state]); +} + +static struct kobj_attribute rmr_clt_sess_state_attr = + __ATTR(state, 0444, rmr_clt_sess_state_show, NULL); + +static struct attribute *rmr_clt_sess_attrs[] = { + &rmr_clt_sess_reconnect_attr.attr, + &rmr_clt_sess_state_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rmr_clt_sess); + +static struct kobj_type rmr_clt_sess_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_clt_sess_groups, +}; + +int rmr_clt_create_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess) +{ + int ret; + + ret = kobject_init_and_add(&clt_sess->kobj, &rmr_clt_sess_ktype, + &rmr_sess_dev->kobj, "%s", clt_sess->sessname); + if (ret) { + pr_err("Failed to create sysfs dir for sess '%s': %d\n", + clt_sess->sessname, ret); + return ret; + } + + return 0; +} + +void rmr_clt_destroy_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess) +{ + if (clt_sess->kobj.state_in_sysfs) { + kobject_del(&clt_sess->kobj); + kobject_put(&clt_sess->kobj); + } +} + +static int rmr_clt_parse_join_opts(const char *buf, char *poolname, + bool *sync, u32 *chunk_size) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + options = strstrip(options); + strip(options); + sep_opt = options; + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_opt_join_tokens, args); + opt_mask |= (1 << token); + + switch (token) { + case RMR_JOIN_OPT_POOLNAME: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("join_pool: poolname too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(poolname, p, NAME_MAX); + kfree(p); + break; + + case RMR_JOIN_OPT_SYNC: + p = match_strdup(args); + + ret = kstrtobool(p, sync); + if (ret) { + pr_err("sync isn't a boolean: %d\n", ret); + kfree(p); + goto out; + } + + kfree(p); + break; + + case RMR_JOIN_OPT_CHUNK_SIZE: + /* + * Min supported chunk_size is PAGE_SIZE. + * The value must be power-of-2 and multiples + * of SECTOR_SIZE. 
+	 */
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			ret = kstrtou32(p, 0, chunk_size);
+			if (ret) {
+				pr_err("chunk_size isn't an integer: %d\n", ret);
+				kfree(p);
+				goto out;
+			} else if (*chunk_size < PAGE_SIZE) {
+				pr_err("Min supported chunk_size is %lu\n", PAGE_SIZE);
+				ret = -EINVAL;
+				kfree(p);
+				goto out;
+			} else if (!is_power_of_2(*chunk_size)) {
+				pr_err("chunk_size must be a power of 2\n");
+				ret = -EINVAL;
+				kfree(p);
+				goto out;
+			}
+
+			kfree(p);
+			break;
+		default:
+			pr_err("join_pool: Unknown parameter or missing value '%s'\n", p);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	for (i = 0; i < RMR_JOIN_OPT_Mandatory_count; i++) {
+		if ((opt_mask & (1 << rmr_opt_join_tokens[i].token))) {
+			ret = 0;
+		} else {
+			pr_err("join_pool: Mandatory parameter missing: %s\n",
+			       rmr_srv_opts_mandatory_names[i]);
+			ret = -EINVAL;
+			break;
+		}
+	}
+
+out:
+	kfree(options);
+	return ret;
+}
+
+static struct rmr_clt_pool *rmr_create_clt_pool(char *poolname, bool sync)
+{
+	struct rmr_clt_pool *clt_pool;
+	int ret;
+
+	clt_pool = kzalloc(sizeof(struct rmr_clt_pool), GFP_KERNEL);
+	if (unlikely(!clt_pool))
+		return ERR_PTR(-ENOMEM);
+
+	refcount_set(&clt_pool->refcount, 1);
+
+	init_waitqueue_head(&clt_pool->map_update_wq);
+	atomic_set(&clt_pool->io_freeze, 0);
+	mutex_init(&clt_pool->io_freeze_lock);
+	mutex_init(&clt_pool->clt_pool_lock);
+
+	clt_pool->recover_wq = alloc_workqueue("%s_recover_wq", 0, 0, poolname);
+	if (!clt_pool->recover_wq) {
+		ret = -ENOMEM;
+		goto free_clt_pool;
+	}
+
+	if (!sync) {
+		INIT_DELAYED_WORK(&clt_pool->recover_dwork, recover_work);
+		queue_delayed_work(clt_pool->recover_wq, &clt_pool->recover_dwork,
+				   msecs_to_jiffies(RMR_RECOVER_INTERVAL_MS));
+	}
+
+	return clt_pool;
+
+free_clt_pool:
+	kfree(clt_pool);
+	return ERR_PTR(ret);
+}
+
+static ssize_t rmr_clt_join_pool_store(struct kobject *kobj,
+				       struct kobj_attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct rmr_pool *pool;
+	struct rmr_clt_pool *clt_pool;
+	struct rmr_pool_md *clt_md;
+	char *poolname;
+	u32 chunk_size = RMR_DEFAULT_CHUNK_SIZE;
+	bool sync = false;
+	int err;
+
+	poolname = kzalloc(NAME_MAX, GFP_KERNEL);
+	if (unlikely(!poolname))
+		return -ENOMEM;
+
+	err = rmr_clt_parse_join_opts(buf, poolname, &sync, &chunk_size);
+	if (unlikely(err))
+		goto out;
+
+	strip(poolname);
+
+	pr_info("%s: Creating client pool with poolname %s, sync %d\n",
+		__func__, poolname, sync);
+
+	clt_pool = rmr_create_clt_pool(poolname, sync);
+	if (IS_ERR(clt_pool)) {
+		pr_err("%s: Clt pool creation failed\n", __func__);
+		err = PTR_ERR(clt_pool);
+		goto out;
+	}
+
+	pool = rmr_create_pool(poolname, clt_pool);
+	if (IS_ERR(pool)) {
+		err = PTR_ERR(pool);
+		goto put_clt_pool;
+	}
+
+	pool->is_clt = true;
+	pool->sync = sync;
+	clt_pool->pool = pool;
+
+	pr_debug("pool %p, clt_pool %p\n", pool, pool->priv);
+
+	err = rmr_clt_create_pool_sysfs_files(pool);
+	if (err)
+		goto put_clt_pool;
+
+	if (!sync) {
+		clt_md = &clt_pool->pool->pool_md;
+		strscpy(clt_md->poolname, poolname, NAME_MAX);
+		clt_md->group_id = pool->group_id;
+		clt_md->map_ver = 1;
+	}
+
+	kfree(poolname);
+
+	return count;
+
+put_clt_pool:
+	if (!sync)
+		cancel_delayed_work_sync(&clt_pool->recover_dwork);
+
+	rmr_put_clt_pool(clt_pool);
+out:
+	kfree(poolname);
+	return err;
+}
+
+static struct kobj_attribute rmr_clt_join_pool_attr =
+	__ATTR(join_pool, 0644,
+	       rmr_clt_join_pool_show, rmr_clt_join_pool_store);
+
+static struct attribute *default_attrs[] = {
+	&rmr_clt_join_pool_attr.attr,
+	NULL,
+};
+
+static struct attribute_group default_attr_group = {
.attrs = default_attrs, +}; + +void rmr_clt_destroy_sysfs_files(void) +{ + sysfs_remove_group(&rmr_ctl_dev->kobj, &default_attr_group); + + device_unregister(rmr_sess_dev); + device_unregister(rmr_pool_dev); + device_unregister(rmr_ctl_dev); + + class_destroy(rmr_dev_class); +} + +int rmr_clt_create_sysfs_files(void) +{ + int err; + dev_t devt = MKDEV(0, 0); + + rmr_dev_class = class_create("rmr-client"); + if (IS_ERR(rmr_dev_class)) + return PTR_ERR(rmr_dev_class); + + rmr_ctl_dev = device_create(rmr_dev_class, NULL, devt, NULL, "ctl"); + if (IS_ERR(rmr_ctl_dev)) { + err = PTR_ERR(rmr_ctl_dev); + goto cls_destroy; + } + + rmr_pool_dev = device_create(rmr_dev_class, NULL, devt, NULL, "pools"); + if (IS_ERR(rmr_pool_dev)) { + err = PTR_ERR(rmr_pool_dev); + goto ctl_destroy; + } + + rmr_sess_dev = device_create(rmr_dev_class, NULL, devt, NULL, "sessions"); + if (IS_ERR(rmr_sess_dev)) { + err = PTR_ERR(rmr_sess_dev); + goto pool_destroy; + } + + err = sysfs_create_group(&rmr_ctl_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto sess_destroy; + + return 0; + +sess_destroy: + device_unregister(rmr_sess_dev); +pool_destroy: + device_unregister(rmr_pool_dev); +ctl_destroy: + device_unregister(rmr_ctl_dev); +cls_destroy: + class_destroy(rmr_dev_class); + + return err; +} + +STAT_ATTR(struct rmr_clt_stats, read_retries, + rmr_clt_stats_read_retries_to_str, rmr_clt_reset_read_retries); + +static struct attribute *rmr_clt_stats_attrs[] = { + &read_retries_attr.attr, + NULL, +}; + +static struct attribute_group rmr_clt_stats_attr_group = { + .attrs = rmr_clt_stats_attrs, +}; + +static int rmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *stats_kobj) +{ + int ret; + + ret = kobject_init_and_add(stats_kobj, &ktype, kobj, "stats"); + if (ret) { + pr_err("Failed to init and add stats kobject, err: %d\n", + ret); + return ret; + } + + ret = sysfs_create_group(stats_kobj, &rmr_clt_stats_attr_group); + if (ret) { + pr_err("failed to create stats sysfs group, err: %d\n", + ret); + goto put_stats_obj; + } + + return 0; + +put_stats_obj: + kobject_del(stats_kobj); + kobject_put(stats_kobj); + + return ret; +} diff --git a/drivers/infiniband/ulp/rmr/rmr-clt-trace.c b/drivers/infiniband/ulp/rmr/rmr-clt-trace.c new file mode 100644 index 000000000000..2e6d9adee7c8 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-trace.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ +#include "rmr-clt.h" + +#define CREATE_TRACE_POINTS +#include "rmr-clt-trace.h" + diff --git a/drivers/infiniband/ulp/rmr/rmr-clt-trace.h b/drivers/infiniband/ulp/rmr/rmr-clt-trace.h new file mode 100644 index 000000000000..1d9a511dc763 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-trace.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rmr_clt + +#if !defined(_TRACE_RMR_CLT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RMR_CLT_H + +#include + +struct rmr_clt_pool_sess; + +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_CREATED); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_NORMAL); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_FAILED); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_RECONNECTING); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_REMOVING); + +#define show_pool_sess_state(x) \ + __print_symbolic(x, \ + { RMR_CLT_POOL_SESS_CREATED, "CREATED" }, \ + { RMR_CLT_POOL_SESS_NORMAL, "NORMAL" }, \ + { 
RMR_CLT_POOL_SESS_FAILED, "FAILED" }, \ + { RMR_CLT_POOL_SESS_RECONNECTING, "RECONNECTING" }, \ + { RMR_CLT_POOL_SESS_REMOVING, "REMOVING" }) + +TRACE_EVENT(pool_sess_change_state, + TP_PROTO(struct rmr_clt_pool_sess *pool_sess, + int newstate, + int oldstate, + int changed), + + TP_ARGS(pool_sess, newstate, oldstate, changed), + + TP_STRUCT__entry( + __string(sessname, pool_sess->sessname) + __field(int, newstate) + __field(int, oldstate) + __field(int, changed) + ), + + TP_fast_assign( + __assign_str(sessname); + __entry->newstate = newstate; + __entry->oldstate = oldstate; + __entry->changed = changed; + ), + + TP_printk("RMR-CLT: sessname=%s newstate='%s' oldstate='%s' state-changed='%d'", + __get_str(sessname), + show_pool_sess_state(__entry->newstate), + show_pool_sess_state(__entry->oldstate), + __entry->changed + ) +); + +DECLARE_EVENT_CLASS(rtrs_clt_request_class, + TP_PROTO(int dir, struct rmr_clt_sess_iu *sess_iu), + + TP_ARGS(dir, sess_iu), + + TP_STRUCT__entry( + __field(int, dir) + __array(char, sessname, NAME_MAX) + __field(void *, rtrs) + __field(void *, clt_sess) + ), + + TP_fast_assign( + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + __entry->dir = dir; + memcpy(__entry->sessname, pool_sess->sessname, NAME_MAX); + __entry->rtrs = clt_sess->rtrs; + __entry->clt_sess = clt_sess; + ), + + TP_printk("rtrs clt request: sessname=%s dir=%s rtrs=%p clt_sess=%p", + __entry->sessname, + __print_symbolic(__entry->dir, + { READ, "READ" }, + { WRITE, "WRITE" }), + __entry->rtrs, + __entry->clt_sess + ) +); + +#define DEFINE_RTRS_CLT_EVENT(name) \ +DEFINE_EVENT(rtrs_clt_request_class, name, \ + TP_PROTO(int dir, struct rmr_clt_sess_iu *sess_iu), \ + TP_ARGS(dir, sess_iu)) + +DEFINE_RTRS_CLT_EVENT(send_usr_msg); +DEFINE_RTRS_CLT_EVENT(retry_failed_read); +DEFINE_RTRS_CLT_EVENT(rmr_clt_request); +DEFINE_RTRS_CLT_EVENT(rmr_clt_cmd_with_rsp); +DEFINE_RTRS_CLT_EVENT(send_map_update); + +#endif /* _TRACE_RMR_CLT_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
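+
+/*
+ * TRACE_INCLUDE_PATH/TRACE_INCLUDE_FILE tell define_trace.h where to re-read
+ * this header when CREATE_TRACE_POINTS is defined in rmr-clt-trace.c; the "."
+ * path works because the Makefile adds -I$(src) for rmr-clt-trace.o.
+ */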
+#define TRACE_INCLUDE_FILE rmr-clt-trace +#include + diff --git a/drivers/infiniband/ulp/rmr/rmr-clt.c b/drivers/infiniband/ulp/rmr/rmr-clt.c new file mode 100644 index 000000000000..33e4b6d84b0b --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt.c @@ -0,0 +1,3866 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include + +#include "rmr-clt.h" +#include "rmr-clt-trace.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_DESCRIPTION("RMR Client"); +MODULE_VERSION(RMR_VER_STRING); +MODULE_LICENSE("GPL"); + +#define RMR_CLT_SEND_MSG_TIMEOUT_MS 30000 + +//static int send_msg_leave_pool(struct rmr_clt_pool_sess *pool_sess, bool wait); +static void retry_failed_read(struct work_struct *work); +static DEFINE_MUTEX(g_sess_lock); +static LIST_HEAD(g_sess_list); + +static bool rmr_get_clt_pool(struct rmr_clt_pool *clt_pool) +{ + pr_debug("pool %s, before inc refcount %d\n", + clt_pool->pool->poolname, refcount_read(&clt_pool->refcount)); + return refcount_inc_not_zero(&clt_pool->refcount); +} + +static struct rmr_clt_pool *rmr_find_and_get_clt_pool(const char *poolname) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + + mutex_lock(&pool_mutex); + pool = rmr_find_pool(poolname); + if (!pool) { + clt_pool = ERR_PTR(-ENOENT); + goto out; + } + + clt_pool = (struct rmr_clt_pool *)pool->priv; + if (!rmr_get_clt_pool(clt_pool)) + clt_pool = ERR_PTR(-EINVAL); + +out: + mutex_unlock(&pool_mutex); + return clt_pool; +} + +void rmr_put_clt_pool(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + might_sleep(); + + pr_debug("clt pool %s, before dec refcnt %d\n", + (pool ? 
pool->poolname : "(empty)"), refcount_read(&clt_pool->refcount)); + if (refcount_dec_and_test(&clt_pool->refcount)) { + + destroy_workqueue(clt_pool->recover_wq); + mutex_destroy(&clt_pool->io_freeze_lock); + mutex_destroy(&clt_pool->clt_pool_lock); + + if (pool) { + pr_info("clt: destroy pool %s\n", pool->poolname); + free_pool(pool); + } + + kfree(clt_pool); + } +} + +static inline int rmr_clt_sess_get(struct rmr_clt_sess *sess) +{ + return kref_get_unless_zero(&sess->kref); +} + +static void rmr_clt_sess_release(struct kref *kref) +{ + struct rmr_clt_sess *clt_sess; + + clt_sess = container_of(kref, struct rmr_clt_sess, kref); + + mutex_lock(&g_sess_lock); + + rmr_clt_destroy_clt_sess_sysfs_files(clt_sess); + + pr_info("close rtrs for session %s\n", clt_sess->sessname); + rtrs_clt_close(clt_sess->rtrs); + list_del(&clt_sess->g_list); + kfree(clt_sess); + + mutex_unlock(&g_sess_lock); +} + +void rmr_clt_sess_put(struct rmr_clt_sess *sess) +{ + kref_put(&sess->kref, rmr_clt_sess_release); +} + +static const char *rmr_get_clt_pool_state_name(enum rmr_clt_pool_state state) +{ + switch (state) { + case RMR_CLT_POOL_STATE_JOINED: return "RMR_CLT_POOL_STATE_JOINED"; + case RMR_CLT_POOL_STATE_IN_USE: return "RMR_CLT_POOL_STATE_IN_USE"; + + default: return "Unknown state"; + } +} + +static void rmr_clt_dump_state(struct rmr_clt_pool *rmr_clt_pool) +{ + char current_state[1024] = {0}; + int i, n = 0, len = sizeof(current_state); + + for (i = 0; i < RMR_CLT_POOL_STATE_MAX; i++) { + enum rmr_clt_pool_state state = (enum rmr_clt_pool_state)i; + + if (test_bit(state, &rmr_clt_pool->state)) + n += scnprintf(current_state + n, len - n, "%s, ", + rmr_get_clt_pool_state_name(state)); + } + + pr_info("%s: RMR client pool current state: %s\n", __func__, current_state); +} + +/** + * rmr_clt_change_pool_state() - Change clt pool state + * + * @clt_pool: Client pool whose state is to be changed + * @new_state: New state to set + * @set: Informs whether to set/unset the given new+state + */ +void rmr_clt_change_pool_state(struct rmr_clt_pool *rmr_clt_pool, + enum rmr_clt_pool_state new_state, bool set) +{ + if (set) { + set_bit(new_state, &rmr_clt_pool->state); + pr_info("%s: state %s set\n", + __func__, rmr_get_clt_pool_state_name(new_state)); + } else { + clear_bit(new_state, &rmr_clt_pool->state); + pr_info("%s: state %s cleared\n", + __func__, rmr_get_clt_pool_state_name(new_state)); + } + + rmr_clt_dump_state(rmr_clt_pool); +} + +/** + * send_map_get_version() - Send a map get version command + * + * @pool_sess: pool session where to send the message + * + * Description: + * Ask the storage node to send back its map_version. 
+ * + * Return: + * 0 on success + * Negative error in case of failure + */ + +/** + * rmr_clt_md_update() - Update the client (non-sync) pool metadata + */ +static void rmr_clt_md_update(struct rmr_pool *pool) +{ + struct rmr_pool_md *clt_md = &pool->pool_md; + + if (pool->sync) + return; + + clt_md->map_ver = pool->map_ver; +} + +#if 0 +static int send_map_set_version(struct rmr_clt_pool_sess *pool_sess, u64 ver) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MAP_SET_VER; + msg.set_map_ver_cmd.map_ver = ver; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + } + return err; +} + +/** + * rmr_clt_coordinate_discard() - Coordinate the discard_entries flag + * + * @pool: the client pool + * @member_id: member id of the source node + * + * Description: + * This function sends discard request to all normal pool sessions of the pool. + * It is to solve the case where network is partitioned between the server nodes + * and only the client connects those partitions. Any request that failed on a session + * would fail this call. + * + * TODO: To address the network partitions (including the client), wait for consistency + * protocols. + * + * Return: + * 0 on success + * Negative error in case of failure + * + * Pre-requisite: rcu read lock should be held by caller + */ +static int rmr_clt_coordinate_discard(struct rmr_pool *pool, u8 cmd_type, u8 member_id) +{ + struct rmr_clt_pool_sess *pool_sess; + int err = 0; + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + /* + * If the pool session state is not normal, the dirty maps of the that pool is + * likely corrupted. Don't bother to send the discards. + */ + if (atomic_read(&pool_sess->state) != RMR_CLT_POOL_SESS_NORMAL) + continue; + + pr_info("%s: send discards to (pool_sess %s: %d) with member_id %u\n", + __func__, pool_sess->sessname, pool_sess->member_id, member_id); + + /* Send discard request to the pool session. */ + err = send_discard(pool_sess, cmd_type, member_id); + if (err) { + pr_err("%s: Failed discard request on sess %s for member_id %u\n", + __func__, pool_sess->sessname, member_id); + return err; + } + } + + return err; +} + +static int rmr_clt_handle_discard(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_dirty_id_map *map; + int idx, ret, err = 0; + u64 map_ver; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + /* Find out if there is pending discard requests on the server side */ + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + ret = send_map_get_version(pool_sess, &map_ver); + if (ret) + continue; + + /* + * When disk replacement appears at the storage node, pserver will set the all + * map entries of that server to dirty. 
+		 */
+		if (RMR_STORE_IS_REPLACE(map_ver)) {
+			map = rmr_pool_find_map(pool, pool_sess->member_id);
+			if (!map) {
+				pr_err("The clt pool %s cannot find map for member_id %u\n",
+				       pool->poolname, pool_sess->member_id);
+				err = -EINVAL;
+				goto out;
+			}
+
+			rmr_map_set_dirty_all(map, MAP_NO_FILTER);
+
+			/* Check any normal pool session failed to receive discards */
+			err = rmr_clt_coordinate_discard(pool, RMR_CMD_SEND_DISCARD,
+							 pool_sess->member_id);
+			if (err) {
+				pr_err("%s: Failed to coordinate discard state for member_id %u\n",
+				       __func__, pool_sess->member_id);
+				goto out;
+			}
+
+			/* update the map version */
+			err = send_map_set_version(pool_sess, RMR_STORE_UNSET_REPLACE(map_ver));
+			if (err) {
+				pr_err("%s: Failed to reset map version for %s\n",
+				       __func__, pool_sess->sessname);
+				goto out;
+			}
+
+			/* Everyone knows about the discarded entries now. */
+			err = rmr_clt_coordinate_discard(pool, RMR_CMD_DISCARD_CLEAR_FLAG,
+							 pool_sess->member_id);
+			if (err) {
+				pr_err("%s: Failed to clear discard flag for S%u\n",
+				       __func__, pool_sess->member_id);
+				goto out;
+			}
+		}
+	}
+
+out:
+	srcu_read_unlock(&pool->sess_list_srcu, idx);
+	return err;
+}
+#endif
+
+static int rmr_clt_start_send_md(struct rmr_pool *pool);
+
+/**
+ * recover_work() - Recovery work which performs a number of tasks at regular intervals
+ *
+ * @work: The work struct holding the data
+ *
+ * Description:
+ * Every client pool has its own recovery work. It performs the following 3 tasks:
+ * 1) Pool sessions in NORMAL state that have dirty map entries associated with
+ * them are checked; if the entries have been cleared on the particular storage
+ * node, they are deleted on the pserver side as well.
+ * 2) If the pool session state is FAILED, but the network state (clt session) is connected,
+ * then a store check message is sent to the pool session. The storage node will confirm
+ * with the backend whether IOs can be sent or not.
+ * 3) Send the client pool metadata to the servers.
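+ *
+ * The work re-arms itself at the end of every run, so each pool keeps polling
+ * at RMR_RECOVER_INTERVAL_MS intervals for as long as the pool exists. Note
+ * that sessions in maintenance mode skip the map check of task 1 and only go
+ * through the store-check path of task 2.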
+ */ +void recover_work(struct work_struct *work) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_pool_md *clt_md; + int index, lock_idx = 0; + + clt_pool = container_of(to_delayed_work(work), struct rmr_clt_pool, recover_dwork); + pool = clt_pool->pool; + + pr_debug("check map for pool %s started...\n", pool->poolname); + + lock_idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + pr_debug("pool %s sess %s sess->member_id %d sess->state %d\n", + pool->poolname, pool_sess->sessname, + pool_sess->member_id, atomic_read(&pool_sess->state)); + + clt_md = &pool->pool_md; + index = rmr_pool_find_md(clt_md, pool_sess->member_id, false); + if (index < 0) { + pr_debug("%s failed to find pool_sess %u\n", + __func__, pool_sess->member_id); + continue; + } + if (pool_sess->maintenance_mode) + goto pool_sess_state_check; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL) { + struct rmr_dirty_id_map *map; + + map = rmr_pool_find_map(pool, pool_sess->member_id); + if (!map) { + pr_debug("pool %s no map found for member_id %u\n", + pool->poolname, pool_sess->member_id); + continue; + } + if (!rmr_map_empty(map)) { + pr_debug("pool %s sess %s map is not empty, check stg map...\n", + pool->poolname, pool_sess->sessname); + send_map_check(pool_sess); + } + } +pool_sess_state_check: + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED && + clt_sess->state == RMR_CLT_SESS_CONNECTED) { + pr_debug("pool %s sess %s try pool sess recover\n", + pool->poolname, pool_sess->sessname); + send_store_check(pool_sess); + } + } + srcu_read_unlock(&pool->sess_list_srcu, lock_idx); + + rmr_clt_md_update(pool); + /* If the send fails, wait for the next update. */ + rmr_clt_start_send_md(pool); + + pr_debug("check map for pool %s done. 
schedule next one.\n", pool->poolname); + + queue_delayed_work(clt_pool->recover_wq, &clt_pool->recover_dwork, + msecs_to_jiffies(RMR_RECOVER_INTERVAL_MS)); +} + +static int init_clt_pool(struct rmr_clt_pool *clt_pool) +{ + int err; + + clt_pool->pcpu_sess = alloc_percpu(typeof(*clt_pool->pcpu_sess)); + if (unlikely(!clt_pool->pcpu_sess)) { + err = -ENOMEM; + goto out_err; + } + + return 0; + +out_err: + return err; +} + +static void destroy_clt_pool(struct rmr_pool *pool) +{ + int i; + struct rmr_clt_pool *clt_pool; + struct rmr_dirty_id_map *map; + struct rmr_dirty_id_map *maplist = NULL; + + clt_pool = (struct rmr_clt_pool *)pool->priv; + if (clt_pool) { + free_percpu(clt_pool->pcpu_sess); + clt_pool->pcpu_sess = NULL; + } + + mutex_lock(&pool->maps_lock); + for (i = 0; i < pool->maps_cnt; i++) { + map = rcu_dereference_protected(pool->maps[i], + lockdep_is_held(&pool->maps_lock)); + if (WARN_ON(!map)) + continue; + rcu_assign_pointer(pool->maps[i], NULL); + map->next = maplist; + maplist = map; + } + pool->maps_cnt = 0; + + if (maplist) + synchronize_srcu(&pool->map_srcu); + + mutex_unlock(&pool->maps_lock); + + rmr_maplist_destroy(maplist); +} + +static void rmr_put_sess_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu); + +static struct rmr_iu * +rmr_alloc_iu(void) +{ + struct rmr_iu *iu; + + iu = kzalloc(sizeof(*iu), GFP_KERNEL); + if (!iu) + return NULL; + INIT_LIST_HEAD(&iu->sess_list); + iu->num_sessions = 0; + refcount_set(&iu->ref, 1); + return iu; +} + +void rmr_get_iu(struct rmr_iu *iu) +{ + refcount_inc(&iu->ref); +} + +void rmr_put_iu(struct rmr_iu *iu) +{ + struct rmr_clt_sess_iu *sess_iu, *tmp; + + if (refcount_dec_and_test(&iu->ref)) { + list_for_each_entry_safe(sess_iu, tmp, + &iu->sess_list, entry) { + if (!list_empty(&sess_iu->entry)) + list_del_init(&sess_iu->entry); + rmr_put_sess_iu(sess_iu->pool_sess, sess_iu); + } + kfree(iu); + } +} + +void rmr_clt_free_pool_sess(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_clt_pool *clt_pool; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + clt_pool = (struct rmr_clt_pool *)pool_sess->pool->priv; + + if (!list_empty(&pool_sess->clt_sess_entry)) { + mutex_lock(&clt_sess->lock); + list_del(&pool_sess->clt_sess_entry); + mutex_unlock(&clt_sess->lock); + } + + pr_info("before free pool_sess %s, clt_sess refcount=%d\n", + pool_sess->sessname, kref_read(&clt_sess->kref)); + + kfree(pool_sess); +} + +void rmr_clt_put_pool(struct rmr_pool *pool) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + + rmr_put_clt_pool(clt_pool); +} +EXPORT_SYMBOL(rmr_clt_put_pool); + +/** + * rmr_clt_open() - Open a client for use + * + * @priv: private data for the user + * @link_ev: holds the link event callback + * @poolname: name of the pool to open + * + * Description: + * Open an RMR pool for the user to use. The rmr pool must have at least one session. + * A single pool can be opened and used by only a single user. + * + * Return: + * Returns pointer to the rmr pool opened. 
+ */
+struct rmr_pool *rmr_clt_open(void *priv, rmr_clt_ev_fn *link_ev, const char *poolname)
+{
+	struct rmr_clt_pool *clt_pool;
+	int err;
+
+	clt_pool = rmr_find_and_get_clt_pool(poolname);
+	if (IS_ERR(clt_pool)) {
+		pr_err("RMR client pool '%s' is not found\n", poolname);
+		err = PTR_ERR(clt_pool);
+		goto err_out;
+	}
+
+	if (!mutex_trylock(&clt_pool->clt_pool_lock)) {
+		pr_err("RMR client pool '%s' is busy, recovery in progress\n", poolname);
+		err = -EBUSY;
+		/* The lock was not taken on this path, do not unlock it */
+		goto put_ref;
+	}
+	if (test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state)) {
+		pr_err("RMR client pool '%s' is already in use\n", poolname);
+		err = -EBUSY;
+		goto put_err;
+	}
+
+	if (!test_bit(RMR_CLT_POOL_STATE_JOINED, &clt_pool->state)) {
+		pr_err("RMR client pool '%s' has no sessions open\n", poolname);
+		err = -ENOENT;
+		goto put_err;
+	}
+
+	clt_pool->link_ev = link_ev;
+	clt_pool->priv = priv;
+
+	err = init_clt_pool(clt_pool);
+	if (unlikely(err)) {
+		pr_err("RMR client pool '%s' failed to initialize: %d\n", poolname, err);
+		goto put_err;
+	}
+
+	rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_IN_USE, true);
+
+	mutex_unlock(&clt_pool->clt_pool_lock);
+	return clt_pool->pool;
+
+put_err:
+	mutex_unlock(&clt_pool->clt_pool_lock);
+put_ref:
+	rmr_put_clt_pool(clt_pool);
+err_out:
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(rmr_clt_open);
+
+void rmr_clt_close(struct rmr_pool *pool)
+{
+	struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv;
+
+	mutex_lock(&clt_pool->clt_pool_lock);
+	rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_IN_USE, false);
+
+	pr_info("%s: RMR client close called for pool %s\n", __func__, pool->poolname);
+
+	/*
+	 * Freeze I/O: degrade the ref count to the usual model with a single
+	 * shared atomic_t counter.
+	 */
+	rmr_clt_pool_io_freeze(clt_pool);
+	pr_info("pool %s wait for inflight io to complete\n", clt_pool->pool->poolname);
+
+	/* Wait for all in-flight I/O to complete */
+	rmr_clt_pool_io_wait_complete(clt_pool);
+
+	pr_info("pool %s inflight io completed\n", clt_pool->pool->poolname);
+
+	clt_pool->link_ev = NULL;
+	clt_pool->priv = NULL;
+
+	/* Unfreeze and resurrect */
+	rmr_clt_pool_io_unfreeze(clt_pool);
+
+	mutex_unlock(&clt_pool->clt_pool_lock);
+
+	rmr_put_clt_pool(clt_pool);
+}
+EXPORT_SYMBOL(rmr_clt_close);
+
+void *rmr_clt_get_priv(struct rmr_pool *pool)
+{
+	struct rmr_clt_pool *clt_pool;
+
+	clt_pool = (struct rmr_clt_pool *)pool->priv;
+	if (clt_pool)
+		return clt_pool->priv;
+
+	return NULL;
+}
+EXPORT_SYMBOL(rmr_clt_get_priv);
+
+static struct rmr_clt_sess *alloc_clt_sess(const char *sessname)
+{
+	struct rmr_clt_sess *sess;
+
+	sess = kzalloc_node(sizeof(*sess), GFP_KERNEL, NUMA_NO_NODE);
+	if (unlikely(!sess)) {
+		pr_err("Failed to create session %s, allocating session struct failed\n",
+		       sessname);
+		return ERR_PTR(-ENOMEM);
+	}
+	strscpy(sess->sessname, sessname, sizeof(sess->sessname));
+	mutex_init(&sess->lock);
+	INIT_LIST_HEAD(&sess->pool_sess_list);
+	kref_init(&sess->kref);
+	sess->state = RMR_CLT_SESS_DISCONNECTED;
+
+	return sess;
+}
+
+static struct rmr_clt_pool_sess *alloc_pool_sess(struct rmr_pool *pool,
+						 struct rmr_clt_sess *clt_sess)
+{
+	struct rmr_clt_pool_sess *pool_sess;
+
+	pool_sess = kzalloc_node(sizeof(*pool_sess), GFP_KERNEL, NUMA_NO_NODE);
+	if (unlikely(!pool_sess)) {
+		pr_err("Failed to allocate session for pool %s\n", pool->poolname);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	strscpy(pool_sess->sessname, clt_sess->sessname, NAME_MAX);
+	INIT_LIST_HEAD(&pool_sess->entry);
+	INIT_LIST_HEAD(&pool_sess->clt_sess_entry);
+	pool_sess->pool = pool;
+ pool_sess->clt_sess = clt_sess; + pool_sess->maintenance_mode = false; + atomic_set(&pool_sess->state, RMR_CLT_POOL_SESS_CREATED); + + return pool_sess; +} + +/* + * Checks if the session already exists (search by session name) + * Returns TRUE if session found, FALSE otherwise. + */ +static bool __find_sess_by_name(struct rmr_pool *pool, const char *sessname) +{ + struct rmr_clt_pool_sess *pool_sess; + int idx; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (!strcmp(sessname, pool_sess->sessname)) { + srcu_read_unlock(&pool->sess_list_srcu, idx); + return true; + } + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return false; +} + +/** + * __find_sess_by_member_id() - Find and return pool_sess with a given member_id + * + * @pool: RMR pool to search pool_sess in + * @member_id: member ID to search + * + * Return: + * Pointer to rmr_clt_pool_sess on success + * NULL if no pool session exists with the given member_id + * + * Context: + * The caller should hold srcu_read_lock + */ +static struct rmr_clt_pool_sess *__find_sess_by_member_id(struct rmr_pool *pool, u8 member_id) +{ + struct rmr_clt_pool_sess *pool_sess = NULL, *tmp_pool_sess; + + list_for_each_entry_srcu(tmp_pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (member_id == tmp_pool_sess->member_id) { + pool_sess = tmp_pool_sess; + break; + } + } + + return pool_sess; +} + +/** + * pool_sess_change_state() - Change pool session state + * + * @pool_sess: Pool session whose state is to be changed + * @newstate: New state which is to be set + * + * Description: + * Pool session states decide a number of crucial things. + * Where the IOs can be sent, which node has an outdated map, etc. + * As such, transitioning of states are important and is tightly controlled through + * this function. All state transitions should happen through this function. + * + * Return: + * True in case the state was changed + * False in case the state was not changed + */ +bool pool_sess_change_state(struct rmr_clt_pool_sess *pool_sess, + enum rmr_clt_pool_sess_state newstate) +{ + bool changed = false; + int oldstate = atomic_read(&pool_sess->state); + + if (WARN_ON(oldstate == RMR_CLT_POOL_SESS_REMOVING)) + goto out; + + switch (newstate) { + case RMR_CLT_POOL_SESS_NORMAL: + if (pool_sess->maintenance_mode) + break; + /* + * Non-sync sessions must pass through RECONNECTING before + * reaching NORMAL so that a map update can take place first. + * Sync sessions skip RECONNECTING entirely and go FAILED→NORMAL + * directly. + */ + if (!rmr_clt_sess_is_sync(pool_sess)) { + if (WARN_ON(oldstate == RMR_CLT_POOL_SESS_FAILED)) + break; + if (oldstate == RMR_CLT_POOL_SESS_CREATED || + oldstate == RMR_CLT_POOL_SESS_RECONNECTING) + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + } else { + if (oldstate == RMR_CLT_POOL_SESS_CREATED || + oldstate == RMR_CLT_POOL_SESS_FAILED || + oldstate == RMR_CLT_POOL_SESS_RECONNECTING) + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + } + break; + case RMR_CLT_POOL_SESS_RECONNECTING: + /* + * Sync sessions never need a map update and must not enter + * RECONNECTING. 
+ */ + if (WARN_ON(rmr_clt_sess_is_sync(pool_sess) && + !pool_sess->maintenance_mode)) + break; + if (oldstate == RMR_CLT_POOL_SESS_FAILED || + oldstate == RMR_CLT_POOL_SESS_CREATED || + (oldstate == RMR_CLT_POOL_SESS_NORMAL && pool_sess->maintenance_mode)) + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + break; + case RMR_CLT_POOL_SESS_FAILED: + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + /* + * TODO + * We should really be updating map version with the state, + * Or before it. + */ + if (changed && oldstate != RMR_CLT_POOL_SESS_FAILED) + pool_sess->pool->map_ver++; + break; + case RMR_CLT_POOL_SESS_REMOVING: + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + break; + default: + pr_err("%s: Unknown state %d\n", __func__, newstate); + break; + } + + if (changed && !rmr_clt_sess_is_sync(pool_sess)) { + if (newstate == RMR_CLT_POOL_SESS_NORMAL) { + /* + * Entering NORMAL: this session is no longer the last + * authoritative holder of the dirty map. + */ + pool_sess->was_last_authoritative = false; + atomic_inc(&pool_sess->pool->normal_count); + } else if (oldstate == RMR_CLT_POOL_SESS_NORMAL) { + /* + * Leaving NORMAL via FAILED or maintenance-mode + * RECONNECTING: decrement the count of NORMAL sessions. + * If this was the last one, mark it as authoritative so + * that recovery can enable it directly (without a map + * update) when it comes back — its dirty map was the last + * complete one the pool had. + * + * REMOVING is not marked authoritative: a deliberate + * removal (delete or disassemble) is not an uncontrolled + * failure. On reassembly the leg goes through the full + * map update path and does not need the direct-enable + * shortcut. + */ + if (newstate == RMR_CLT_POOL_SESS_FAILED || + (newstate == RMR_CLT_POOL_SESS_RECONNECTING && + pool_sess->maintenance_mode)) { + if (atomic_dec_and_test(&pool_sess->pool->normal_count)) + pool_sess->was_last_authoritative = true; + } else { + /* REMOVING */ + atomic_dec(&pool_sess->pool->normal_count); + } + } + } + +out: + + trace_pool_sess_change_state(pool_sess, newstate, oldstate, changed); + + return changed; +} + +void rmr_clt_pool_io_freeze(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + mutex_lock(&clt_pool->io_freeze_lock); + if (atomic_inc_return(&clt_pool->io_freeze) == 1) + percpu_ref_kill(&pool->ids_inflight_ref); + mutex_unlock(&clt_pool->io_freeze_lock); +} + +void rmr_clt_pool_io_unfreeze(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + mutex_lock(&clt_pool->io_freeze_lock); + if (atomic_dec_return(&clt_pool->io_freeze) == 0) { + reinit_completion(&pool->complete_done); + percpu_ref_reinit(&pool->ids_inflight_ref); + + wake_up_all(&clt_pool->map_update_wq); + } + mutex_unlock(&clt_pool->io_freeze_lock); +} + +void rmr_clt_pool_io_wait_complete(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + wait_for_completion(&pool->complete_done); +} + +//am: what kind of locking is rquired for that ? 
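+/*
+ * Locking note: clt_sess->lock protects clt_sess->pool_sess_list. It is taken
+ * here, in resend_join_pool() and in rmr_clt_free_pool_sess() when a pool
+ * session is unlinked, which appears to be the answer to the question above.
+ */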
+static void set_pool_sess_states_to_failed(struct rmr_clt_sess *clt_sess) +{ + struct rmr_clt_pool_sess *pool_sess; + + mutex_lock(&clt_sess->lock); + + list_for_each_entry(pool_sess, &clt_sess->pool_sess_list, clt_sess_entry) { + if (pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED)) + pr_info("set sess %s to failed due to link_ev\n", pool_sess->sessname); + } + mutex_unlock(&clt_sess->lock); +} + +static void rmr_clt_link_ev(void *priv, enum rtrs_clt_link_ev ev) +{ + struct rmr_clt_sess *clt_sess = priv; + + switch (ev) { + case RTRS_CLT_LINK_EV_DISCONNECTED: + pr_info("Rtrs link ev disconnected: session %s\n", + clt_sess->sessname); + clt_sess->state = RMR_CLT_SESS_DISCONNECTED; + set_pool_sess_states_to_failed(clt_sess); + break; + case RTRS_CLT_LINK_EV_RECONNECTED: + pr_info("Rtrs link ev reconnected: session %s\n", + clt_sess->sessname); + clt_sess->state = RMR_CLT_SESS_CONNECTED; + resend_join_pool(clt_sess); + break; + default: + pr_err("Unknown rtrs link event received (%d), " + "session: %s\n", + ev, clt_sess->sessname); + } +} + +/* + * Gets an iu for I/O operations. + * + * Context: + * The call to this function should be protected with an srcu_read_lock. + */ +static struct rmr_clt_sess_iu *rmr_get_sess_iu(struct rmr_clt_pool_sess *pool_sess, + enum rtrs_clt_con_type con_type, + enum wait_type wait) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct rmr_clt_sess_iu *sess_iu; + struct rtrs_permit *permit; + + WARN_ON(!srcu_read_lock_held(&pool->sess_list_srcu)); + + if (clt_sess->state == RMR_CLT_SESS_DISCONNECTED) { + pr_info("The rmr client session %s state is disconnected\n", clt_sess->sessname); + return NULL; + } + + sess_iu = kzalloc(sizeof(*sess_iu), GFP_KERNEL); + if (!sess_iu) + return NULL; + + permit = rtrs_clt_get_permit(clt_sess->rtrs, con_type, wait); + if (unlikely(!permit)) { + kfree(sess_iu); + return NULL; + } + + INIT_LIST_HEAD(&sess_iu->entry); + sess_iu->permit = permit; + sess_iu->pool_sess = pool_sess; + + return sess_iu; +} + +/* + * Gets the iu for user messages. + * It will be reference counted initialized with refcount + */ +static inline struct rmr_clt_sess_iu *rmr_msg_get_iu(struct rmr_clt_pool_sess *pool_sess, + enum rtrs_clt_con_type con_type, + enum wait_type wait, int refcount) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_sess_iu *sess_iu; + int idx; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + sess_iu = rmr_get_sess_iu(pool_sess, con_type, wait); + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (unlikely(!sess_iu)) + return NULL; + + init_waitqueue_head(&sess_iu->comp.wait); + sess_iu->comp.errno = INT_MAX; + atomic_set(&sess_iu->refcount, refcount); + + return sess_iu; +} + +/* + * reference counted put, refcount has to be initialized. + */ +void rmr_msg_put_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu) +{ + if (atomic_dec_and_test(&sess_iu->refcount)) { + rtrs_clt_put_permit(pool_sess->clt_sess->rtrs, sess_iu->permit); + kfree(sess_iu); + } +} + +/* + * put the sess_iu without reference counting. + * I/O does not need reference counting. 
+ */ +static void rmr_put_sess_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu) +{ + rtrs_clt_put_permit(pool_sess->clt_sess->rtrs, sess_iu->permit); + kfree(sess_iu); +} + +void wake_up_iu_comp(struct rmr_clt_sess_iu *sess_iu) +{ + sess_iu->comp.errno = sess_iu->errno; + wake_up(&sess_iu->comp.wait); +} + +void msg_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + + sess_iu->errno = errno; + /* just schedule the work because kfree must not be done here */ + schedule_work(&sess_iu->work); +} + +static int send_usr_msg(struct rtrs_clt_sess *rtrs, int dir, + struct rmr_clt_sess_iu *sess_iu, + struct kvec *vec, size_t nr, size_t len, + struct scatterlist *sg, unsigned int sg_len, + void (*conf)(struct work_struct *work), + int *errno, enum rmr_wait_type wait) +{ + int err; + struct rtrs_clt_req_ops req_ops; + + INIT_WORK(&sess_iu->work, conf); + req_ops = (struct rtrs_clt_req_ops){ + .priv = sess_iu, + .conf_fn = msg_conf, + }; + + trace_send_usr_msg(dir, sess_iu); + + err = rtrs_clt_request(dir, &req_ops, rtrs, sess_iu->permit, + vec, nr, len, sg, sg_len); + if (!err && wait) { + wait_event_timeout(sess_iu->comp.wait, + sess_iu->comp.errno != INT_MAX, + msecs_to_jiffies(RMR_CLT_SEND_MSG_TIMEOUT_MS)); + *errno = sess_iu->comp.errno; + if (*errno == INT_MAX) + *errno = -ETIMEDOUT; + } else { + *errno = 0; + } + return err; +} + +static int send_msg_rejoin_pool(struct rmr_clt_pool_sess *pool_sess, bool wait) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + int ret; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_REJOIN_POOL; + + msg.join_pool_cmd.rejoin = true; + msg.join_pool_cmd.chunk_size = pool->chunk_size; + msg.join_pool_cmd.queue_depth = clt_sess->queue_depth; + + ret = rmr_clt_pool_send_cmd(pool_sess, &msg, wait); + if (ret) + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + + return ret; +} + +static int send_msg_join_pool(struct rmr_clt_pool_sess *pool_sess, bool create, + bool dirty, bool wait) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool_member_info *mem_info; + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_pool_sess *t_pool_sess; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct rmr_dirty_id_map *map; + int ret, i = 0, idx; + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_JOIN_POOL; + + msg.join_pool_cmd.queue_depth = clt_sess->queue_depth; + msg.join_pool_cmd.chunk_size = pool->chunk_size; + msg.join_pool_cmd.rejoin = false; + + if (!msg.sync) { + msg.join_pool_cmd.create = create; + msg.join_pool_cmd.dirty = dirty; + mem_info = &(msg.join_pool_cmd.mem_info); + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(t_pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (t_pool_sess->member_id == pool_sess->member_id) + continue; + + map = rmr_pool_find_map(pool, t_pool_sess->member_id); + if (!map) { + pr_err("%s: Map with member_id %u does not exist\n", + __func__, t_pool_sess->member_id); + srcu_read_unlock(&pool->sess_list_srcu, idx); + return -ENOENT; + } + + mem_info->p_mem_info[i].member_id = t_pool_sess->member_id; + /* Only relevant for create */ + if (create) + mem_info->p_mem_info[i].c_dirty = !rmr_map_empty(map); + i++; + if (WARN_ON(i >= RMR_POOL_MAX_SESS)) + break; + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + mem_info->no_of_stor = i; + } + + ret = 
rmr_clt_pool_send_cmd(pool_sess, &msg, wait); + if (ret) + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + + return ret; +} + +int send_msg_leave_pool(struct rmr_clt_pool_sess *pool_sess, bool delete, bool wait) +{ + struct rmr_msg_pool_cmd msg = {}; + int ret; + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_LEAVE_POOL; + + msg.leave_pool_cmd.member_id = pool_sess->member_id; + msg.leave_pool_cmd.delete = delete; + + ret = rmr_clt_pool_send_cmd(pool_sess, &msg, wait); + if (ret) + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + + return ret; +} + +bool rmr_clt_sess_is_sync(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_pool *pool = pool_sess->pool; + bool ret = false; + + if (!pool) { + WARN(1, "for sess %s pool is not assigned\n", + pool_sess->clt_sess->sessname); + return false; + } + + if (pool->sync) { + pr_debug("sess %s pool %s is sync (internal) clt sess\n", + pool_sess->clt_sess->sessname, pool->poolname); + ret = true; + } else { + pr_debug("sess %s pool %s is not sync clt sess\n", + pool_sess->clt_sess->sessname, pool->poolname); + ret = false; + } + return ret; +} + +/** + * rmr_clt_send_pool_info() - Notify all other pool members of a membership change + * + * @pool_sess: The pool session of the member whose state is changing. + * @op: Operation: %RMR_POOL_INFO_OP_ADD or %RMR_POOL_INFO_OP_REMOVE. + * @mode: For ADD: %RMR_POOL_INFO_MODE_CREATE or %RMR_POOL_INFO_MODE_ASSEMBLE. + * For REMOVE: %RMR_POOL_INFO_MODE_DELETE or %RMR_POOL_INFO_MODE_DISASSEMBLE. + * @dirty: When op is ADD and mode is CREATE, indicates that @pool_sess + * has outstanding dirty data that the receiving node must track. + * + * Sends a POOL_INFO command to every other non-FAILED, non-REMOVING + * member in the pool so they can update their view of pool membership. + * + * Return: + * 0 on success, negative error code on failure. + * + * Context: + * This function blocks while sending the command. + */ +static int rmr_clt_send_pool_info(struct rmr_clt_pool_sess *pool_sess, + enum rmr_pool_info_op op, enum rmr_pool_info_mode mode, + bool dirty) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_pool_sess *t_pool_sess; + struct rmr_msg_pool_cmd msg = {}; + int idx, ret = 0; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_POOL_INFO; + + msg.pool_info_cmd.member_id = pool_sess->member_id; + msg.pool_info_cmd.operation = op; + msg.pool_info_cmd.mode = mode; + + if (op == RMR_POOL_INFO_OP_ADD && mode == RMR_POOL_INFO_MODE_CREATE && dirty) + msg.pool_info_cmd.dirty = dirty; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(t_pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + enum rmr_clt_pool_sess_state state; + + /* + * No need to send the info message to the member who just joined. + */ + if (t_pool_sess->member_id == pool_sess->member_id) + continue; + + state = atomic_read(&t_pool_sess->state); + /* + * TODO: For FAILED session we have to store the missed + * msgs and send them later when the session recovers. 
+ */ + if (state == RMR_CLT_POOL_SESS_FAILED || + state == RMR_CLT_POOL_SESS_REMOVING) + continue; + + ret = rmr_clt_pool_send_cmd(t_pool_sess, &msg, WAIT); + if (ret) { + pr_err("%s failed with err %d\n", rmr_get_cmd_name(msg.cmd_type), ret); + break; + } + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return ret; +} + +void resend_join_pool(struct rmr_clt_sess *clt_sess) +{ + struct rmr_clt_pool_sess *pool_sess; + + mutex_lock(&clt_sess->lock); + + list_for_each_entry(pool_sess, &clt_sess->pool_sess_list, clt_sess_entry) { + int err; + + err = send_msg_rejoin_pool(pool_sess, WAIT); + if (err) { + pr_err("send_msg_rejoin_pool failed for sess %s error %d\n", + pool_sess->sessname, err); + } + } + mutex_unlock(&clt_sess->lock); + + return; +} + +int send_msg_enable_pool(struct rmr_clt_pool_sess *pool_sess, bool enable) +{ + struct rmr_msg_pool_cmd msg = {}; + int ret; + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_ENABLE_POOL; + + msg.enable_pool_cmd.enable = enable; + + ret = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (ret) { + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + goto err; + } + +err: + return ret; +} + +static const char *rmr_clt_pool_sess_state_names[] = { + [0] = "invalid state", + [RMR_CLT_POOL_SESS_CREATED] = "created", + [RMR_CLT_POOL_SESS_NORMAL] = "normal", + [RMR_CLT_POOL_SESS_FAILED] = "failed", + [RMR_CLT_POOL_SESS_RECONNECTING] = "reconnecting", + [RMR_CLT_POOL_SESS_REMOVING] = "removing" +}; + +const char *rmr_clt_sess_state_str(enum rmr_clt_pool_sess_state state) +{ + return rmr_clt_pool_sess_state_names[state]; +} + +int rmr_clt_reconnect_sess(struct rmr_clt_sess *clt_sess, + const struct rtrs_addr *paths, + size_t path_cnt) +{ + struct rtrs_attrs attrs; + struct rtrs_clt_ops rtrs_ops; + int err = 0; + + rtrs_ops = (struct rtrs_clt_ops){ + .priv = clt_sess, + .link_ev = rmr_clt_link_ev, + }; + + clt_sess->rtrs = rtrs_clt_open(&rtrs_ops, clt_sess->sessname, + paths, path_cnt, RTRS_PORT, + 0, /* Do not use pdu of rtrs */ + RECONNECT_DELAY, + MAX_RECONNECTS, 0); + if (IS_ERR(clt_sess->rtrs)) { + err = PTR_ERR(clt_sess->rtrs); + pr_err("rtrs_clt_open error %d\n", err); + goto err; + } + + err = rtrs_clt_query(clt_sess->rtrs, &attrs); + if (unlikely(err)) { + pr_err("rtrs_clt_query error %d\n", err); + goto close_sess; + } + clt_sess->max_io_size = attrs.max_io_size; + clt_sess->queue_depth = attrs.queue_depth; + clt_sess->max_segments = attrs.max_segments; + + clt_sess->state = RMR_CLT_SESS_CONNECTED; + + resend_join_pool(clt_sess); + + return err; + +close_sess: + rtrs_clt_close(clt_sess->rtrs); +err: + return err; +} + +//TODO: we do not use rsp in this function, do we need it as an argument? +static int rmr_clt_handle_rejoin_rsp(struct rmr_clt_pool_sess *pool_sess, struct rmr_msg_pool_cmd_rsp *rsp) +{ + int err = 0; + + if (rmr_clt_sess_is_sync(pool_sess)) { + /* + * The client on sync side does not need map update + * hence goes to "normal" state directly. + * NB: FAILED => NORMAL + */ + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL); + } else { + /* + * The client on non-sync side needs map update, + * + * A map update is to be triggered, which updates the map, + * and then sets state to "normal" + */ + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING); + + /* + * Send the info about the pool to all the storages. + * Contains IDs of storages connected to this pool. 
+ */ + err = rmr_clt_send_pool_info(pool_sess, RMR_POOL_INFO_OP_ADD, + RMR_POOL_INFO_MODE_ASSEMBLE, false); + if (err) { + pr_err("Rejoin: rmr_clt_send_pool_info failed for session %s", + pool_sess->sessname); + return -EINVAL; + } + + err = rmr_clt_pool_try_enable(pool_sess->pool); + if (err) + pr_err("%s: pool %s try_enable failed for sess %s: %d\n", + __func__, pool_sess->pool->poolname, + pool_sess->sessname, err); + } + + return err; +} + +static void rmr_clt_handle_join_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_pool_md *clt_md; + u64 mapped_size; + + clt_md = &pool->pool_md; + + pool_sess->ver = min_t(u8, rsp->ver, RMR_PROTO_VER_MAJOR); + pool_sess->member_id = rsp->member_id; + xa_store(&pool->stg_members, pool_sess->member_id, pool_sess, GFP_KERNEL); + + pool->chunk_size = rsp->join_pool_cmd_rsp.chunk_size; + pool->chunk_size_shift = ilog2(pool->chunk_size); + clt_md->chunk_size = pool->chunk_size; + + mapped_size = rsp->join_pool_cmd_rsp.mapped_size; + if (mapped_size) { + pool->mapped_size = mapped_size; + pool->pool_md.mapped_size = mapped_size; + rmr_pool_update_no_of_chunk(pool); + pr_info("clt join_pool: mapped size %llu\n", pool->mapped_size); + } +} + +static int cmd_process_rsp(struct rmr_clt_pool_sess *pool_sess, struct rmr_msg_pool_cmd_rsp *rsp) +{ + int err = 0; + + pr_debug("rsp, cmd_type %d, member_id %d, err %d\n", + rsp->cmd_type, rsp->member_id, rsp->err); + + if (rsp->err) + return rsp->err; + + switch (rsp->cmd_type) { + case RMR_CMD_MAP_CHECK: + return rmr_clt_handle_map_check_rsp(pool_sess, rsp); + case RMR_CMD_STORE_CHECK: + return rmr_clt_handle_store_check_rsp(pool_sess, rsp); + case RMR_CMD_MAP_READY: + case RMR_CMD_MAP_SEND: + case RMR_CMD_MAP_BUF_DONE: + case RMR_CMD_MAP_DONE: + case RMR_CMD_MAP_DISABLE: + case RMR_CMD_LEAVE_POOL: + case RMR_CMD_LAST_IO_TO_MAP: + case RMR_CMD_MD_SEND: + case RMR_CMD_MAP_SET_VER: + case RMR_CMD_SEND_DISCARD: + case RMR_CMD_DISCARD_CLEAR_FLAG: + case RMR_CMD_POOL_INFO: + pr_debug("%s: No rsp handling for %s\n", __func__, rmr_get_cmd_name(rsp->cmd_type)); + break; + case RMR_CMD_REJOIN_POOL: + return rmr_clt_handle_rejoin_rsp(pool_sess, rsp); + case RMR_CMD_JOIN_POOL: + rmr_clt_handle_join_rsp(pool_sess, rsp); + break; + case RMR_CMD_ENABLE_POOL: + pool_sess->ver = min_t(u8, rsp->ver, RMR_PROTO_VER_MAJOR); + break; + default: + pr_warn("%s: switch default type: %d\n", __func__, rsp->cmd_type); + + err = -EINVAL; + } + + return err; +} + +static void msg_pool_cmd_conf(struct work_struct *work) +{ + struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work); + struct rmr_msg_pool_cmd_rsp *rsp = sess_iu->buf; + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + + pr_debug("pool cmd for %s session %s member_id %d conf with errno %d\n", + pool_sess->pool->poolname, pool_sess->sessname, + pool_sess->member_id, sess_iu->errno); + + if (!sess_iu->errno) { + /* + * We need to check if there was an error while processing the cmd + * on the server side. If there was, then we fail the command. 
+ */ + sess_iu->errno = cmd_process_rsp(pool_sess, rsp); + } + + kfree(rsp); + wake_up_iu_comp(sess_iu); + rmr_msg_put_iu(pool_sess, sess_iu); +} + +void rmr_clt_init_cmd(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.group_id = cpu_to_le32(pool->group_id); + msg->hdr.type = cpu_to_le16(RMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->ver = RMR_PROTO_VER_MAJOR; + msg->sync = pool->sync; + + strncpy(msg->pool_name, pool->poolname, sizeof(msg->pool_name)); +} +EXPORT_SYMBOL(rmr_clt_init_cmd); + +int rmr_clt_pool_send_cmd(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd *msg, bool wait) +{ + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct rmr_msg_pool_cmd_rsp *rsp; + struct rmr_clt_sess_iu *sess_iu; + struct kvec vec = { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + int err, errno; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (unlikely(!rsp)) + return -ENOMEM; + + sess_iu = rmr_msg_get_iu(pool_sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT, 2); + if (unlikely(!sess_iu)) { + kfree(rsp); + return -ENOMEM; + } + + sess_iu->buf = rsp; + sg_init_one(&sess_iu->sg, rsp, sizeof(*rsp)); + + err = send_usr_msg(clt_sess->rtrs, READ, sess_iu, + &vec, 1, sizeof(*rsp), &sess_iu->sg, 1, + msg_pool_cmd_conf, &errno, wait); + if (unlikely(err)) { + rmr_msg_put_iu(pool_sess, sess_iu); + kfree(rsp); + } else { + err = errno; + } + + rmr_msg_put_iu(pool_sess, sess_iu); + + return err; +} + +/* + * Pre-requisite: rcu read lock should be held by caller + */ +static struct rmr_clt_pool_sess * +rmr_clt_get_first_normal_session(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess; + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL) + return pool_sess; + } + + return NULL; +} + +/** + * rmr_clt_pool_send_all - Send a command to all sessions in the pool + * + * @pool: The client pool which sends the command message + * @msg: The command message of pool + * + * Description: + * When sending messages to all pool sessions, it will continue to send + * regardless of the failure of the previous communication. + * + * Return: + * 0 if at least one successful request + * less than 0 if all requests failed + */ +int rmr_clt_pool_send_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg) +{ + struct rmr_clt_pool_sess *pool_sess; + int idx, err = 0; + u8 member_id = 0; + int ret = 0; + + if (msg->cmd_type == RMR_CMD_SEND_DISCARD) + member_id = msg->send_discard_cmd.member_id; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + /* The node has had discards. */ + if (pool_sess->member_id == member_id) + continue; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED) + continue; + + pr_info("pool %s send cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + + /* The err code reflects the response from this pool_sess. 
*/ + err = rmr_clt_pool_send_cmd(pool_sess, msg, WAIT); + if (err) { + pr_err("pool %s sending cmd to sess %s failed, err=%d\n", + pool->poolname, pool_sess->sessname, err); + continue; + } + + pr_info("pool %s done sending cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + ret++; + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (ret) + return 0; + + return -ENETUNREACH; +} +EXPORT_SYMBOL(rmr_clt_pool_send_all); + +/** + * rmr_clt_send_cmd_with_data_all - Send a command with data to all sessions in the pool + * + * Return: + * 0 on success of all sends + * less than 0 if all sends failed + * positive number of failed sends + */ +int rmr_clt_send_cmd_with_data_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen) +{ + struct rmr_clt_pool_sess *pool_sess; + int idx, err = 0; + bool ret = false; + int errno = 0; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED) { + errno++; + continue; + } + + pr_debug("pool %s send cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + err = rmr_clt_send_cmd_with_data(pool, pool_sess, msg, buf, buflen); + if (err) { + errno++; + pr_debug("pool %s sending cmd to sess %s failed, err=%d\n", + pool->poolname, pool_sess->sessname, err); + continue; + } + + pr_debug("pool %s done sending cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + ret = true; + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (ret) + return errno; + + return -EINVAL; +} +EXPORT_SYMBOL(rmr_clt_send_cmd_with_data_all); + +/** + * rmr_clt_start_last_io_update() - Do the last IO update + * + * @pool: The pool + * + * Description: + * Last IO update is needed in case a pserver went down while connected to a pool. + * A pserver going down while performing IOs could mean that some IOs could have been + * executed in some nodes but not all. This function takes the last 'queue_depth' number of + * IOs on each storage node and makes sure they are synced in between all the nodes. + * Before performing the last IO conversion, it also makes sure that all the storage nodes + * have the lastest map. + * + * Return: + * 0 on success + * Error value on failure + * + * Context: + * srcu_read_lock should be held while calling this function. + */ +int rmr_clt_start_last_io_update(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess_chosen, *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + u64 map_ver, highest_map_ver = 0; + int j, err, idx, ret = 0; + int discard_ids[RMR_POOL_MAX_SESS]; + u8 id, nr_discards = 0; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + for (j = 0; j < RMR_POOL_MAX_SESS; j++) { + struct rmr_clt_pool_sess *ps; + u8 mid = pool->pool_md.srv_md[j].member_id; + + if (!mid) + continue; + + ps = xa_load(&pool->stg_members, mid); + if (!ps) { + pr_err("%s: member_id %u not yet assembled\n", + __func__, mid); + err = -EINVAL; + goto out; + } + if (atomic_read(&ps->state) != RMR_CLT_POOL_SESS_RECONNECTING) { + pr_err("%s: member_id %u not in reconnecting state\n", + __func__, mid); + err = -EINVAL; + goto out; + } + } + + /* + * Before pserver died, it could be that one or more storage nodes were down. + * This would mean there is a possibility that those storage nodes will not have + * the latest map. But that can create problems. 
+ * We need to make sure that every storage node has the latest map. + * Hence, find out which node has the latest map first, + */ + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + err = send_map_get_version(pool_sess, &map_ver); + if (err) { + pr_err("%s: Failed to read map version for sess %s\n", + __func__, pool_sess->sessname); + err = -EINVAL; + goto out; + } + + if (RMR_STORE_IS_REPLACE(map_ver)) { + map_ver = RMR_STORE_GET_VER(map_ver); + discard_ids[nr_discards] = pool_sess->member_id; + nr_discards++; + } + + if (map_ver > highest_map_ver) { + highest_map_ver = map_ver; + pool_sess_chosen = pool_sess; + } + } + + for (j = 0; j < nr_discards; j++) { + id = discard_ids[j]; + pr_info("%s: Send discard req %d to S%d\n", + __func__, id, pool_sess_chosen->member_id); + err = send_discard(pool_sess_chosen, RMR_CMD_SEND_DISCARD, id); + if (err) { + pr_err("%s: Failed to send discard request to %s\n", + __func__, pool_sess_chosen->sessname); + goto out; + } + } + + /* + * We have the storage node with the latest map, + * make sure the latest map is sent to all other storage nodes. + */ + err = rmr_clt_spread_map(pool, pool_sess_chosen, false, false); + if (err) { + pr_err("%s: Failed to spread the latest map\n", __func__); + goto out; + } + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + for (j = 0; j < nr_discards; j++) { + id = discard_ids[j]; + pr_info("%s: Send discard clear req %d to S%d\n", + __func__, id, pool_sess->member_id); + err = send_discard(pool_sess, RMR_CMD_DISCARD_CLEAR_FLAG, id); + if (err) { + pr_err("%s: Failed to clear discard state on %s\n", + __func__, pool_sess->sessname); + } else { + ret++; + } + } + } + + if (nr_discards && !ret) { + pr_err("%s: Failed to clear discard state on any storage node\n", __func__); + err = -EINVAL; + goto out; + } + + /* + * Now that we are done with the dispersing of the latest map, + * we can start last IO update. + */ + rmr_clt_init_cmd(pool, &msg); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + msg.cmd_type = RMR_CMD_LAST_IO_TO_MAP; + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto out; + } + + err = rmr_clt_spread_map(pool, pool_sess, true, false); + if (err) { + pr_err("%s: Failed to spread last_io converted map\n", __func__); + goto out; + } + } + + err = rmr_clt_read_map(pool); + if (err) { + pr_err("%s: rmr_clt_read_map failed with err %d\n", __func__, err); + goto out; + } + +out: + srcu_read_unlock(&pool->sess_list_srcu, idx); + return err; +} + +/** + * rmr_clt_enable_sess() - Enable the rmr clt pool sessions + * + * @pool_sess: The rmr clt pool session to enable + * + * Description: + * This function takes care of enable request, for pool sessions + * not in maintenance mode and in mm. + * + * Return: + * 0 on success + * Error value on failure + */ +int rmr_clt_enable_sess(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_pool *pool = pool_sess->pool; + int pool_sess_state, err = 0; + + pr_info("%s: For session %s of pool %s\n", + __func__, pool_sess->sessname, pool->poolname); + + if (!pool_sess->maintenance_mode) { + /* + * Simple enable, not related to maintenance. 
+		 * Manual enable is only allowed for sessions in "created" state.
+		 */
+		pool_sess_state = atomic_read(&pool_sess->state);
+		if (pool_sess_state != RMR_CLT_POOL_SESS_CREATED) {
+			pr_err("Cannot manually enable session: state %d\n", pool_sess_state);
+			err = -EINVAL;
+			goto out;
+		}
+
+		err = send_msg_enable_pool(pool_sess, 1);
+		if (err) {
+			pr_err("Failed to send enable to pool %s. Err %d\n",
+			       pool->poolname, err);
+			goto out;
+		}
+
+		pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL);
+	} else {
+		/*
+		 * Enable when in maintenance mode.
+		 */
+		err = rmr_clt_unset_pool_sess_mm(pool_sess);
+	}
+
+out:
+	return err;
+}
+
+/**
+ * rmr_clt_create_sess() - Allocate and initialize an RMR client session
+ *
+ * @sessname: Name to be given to the new session being created.
+ * @paths: RTRS paths created for the session.
+ * @path_cnt: Number of paths.
+ *
+ * Return:
+ * Pointer to rmr_clt_sess on success
+ * ERR_PTR on failure
+ *
+ * Description:
+ * Create a new session to the storage node reachable through @paths.
+ * After this function is done, an rmr_clt_pool_sess can use this sess to
+ * submit I/O over the rtrs connection.
+ *
+ * Context:
+ * This function blocks while creating the session
+ */
+static struct rmr_clt_sess *rmr_clt_create_sess(const char *sessname,
+						const struct rtrs_addr *paths,
+						size_t path_cnt)
+{
+	struct rmr_clt_sess *clt_sess;
+	struct rtrs_attrs attrs;
+	struct rtrs_clt_ops rtrs_ops;
+	int err;
+
+	clt_sess = alloc_clt_sess(sessname);
+	if (IS_ERR(clt_sess)) {
+		pr_err("Session '%s' cannot be allocated in pool\n", sessname);
+		/* alloc_clt_sess() already returns an ERR_PTR of this type, no ERR_CAST() needed */
+		return clt_sess;
+	}
+
+	rtrs_ops = (struct rtrs_clt_ops){
+		.priv = clt_sess,
+		.link_ev = rmr_clt_link_ev,
+	};
+	/*
+	 * Establish the rtrs connection and proceed further.
+ */ + clt_sess->rtrs = rtrs_clt_open(&rtrs_ops, sessname, + paths, path_cnt, RTRS_PORT, + 0, /* Do not use pdu of rtrs */ + RECONNECT_DELAY, + MAX_RECONNECTS, 0); + if (IS_ERR(clt_sess->rtrs)) { + err = PTR_ERR(clt_sess->rtrs); + pr_err("rtrs_clt_open error %d\n", err); + goto free_clt_sess; + } + err = rtrs_clt_query(clt_sess->rtrs, &attrs); + if (unlikely(err)) { + pr_err("rtrs_clt_query error %d\n", err); + goto close_sess; + } + clt_sess->max_io_size = attrs.max_io_size; + clt_sess->queue_depth = attrs.queue_depth; + clt_sess->max_segments = attrs.max_segments; + //sess->sess_kobj = &sess->rtrs->dev.dev.kobj; + + err = rmr_clt_create_clt_sess_sysfs_files(clt_sess); + if (err) { + pr_err("failed to crete sysfs files for sess %s, err=%d\n", + clt_sess->sessname, err); + goto close_sess; + } + clt_sess->state = RMR_CLT_SESS_CONNECTED; + + mutex_lock(&g_sess_lock); + list_add(&clt_sess->g_list, &g_sess_list); + mutex_unlock(&g_sess_lock); + + return clt_sess; + +close_sess: + rtrs_clt_close(clt_sess->rtrs); + +free_clt_sess: + kfree(clt_sess); + + return ERR_PTR(err); +} + +/** + * rmr_clt_pool_try_enable() - Trigger pool session recovery if conditions are met + * + * @pool: The pool to check + * + * Scans pool sessions and fires the appropriate recovery action: + * + * Case 1: ≥1 NORMAL session exists → spread its map (with enable=true) to all + * non-NORMAL sessions, then set them to NORMAL on the client side + * Case 2: Exactly one was_last_authoritative RECONNECTING session exists → + * enable it directly (data is complete, no map needed), then spread + * its map to remaining sessions + * Cases 3/4: All pool_md members present and RECONNECTING → last_io_update + * + * Return: 0 on success or when conditions are not yet met, negative error on failure. + */ +int rmr_clt_pool_try_enable(struct rmr_pool *pool) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + struct rmr_clt_pool_sess *pool_sess, *normal_sess, *auth_sess; + bool any_member = false; + int idx, j, err = 0; + + pr_info("%s: Started for pool %s\n", __func__, pool->poolname); + + /* + * clt_pool_lock is held across all RPC round-trips below (MAP_READY, + * MAP_SEND, MAP_DONE, last_io_update exchanges). This serialises + * concurrent try_enable calls and prevents rmr_clt_open/close from + * racing with recovery. The RPC send path (rmr_clt_pool_send_cmd) + * uses per-session permits and does not acquire clt_pool_lock, so + * there is no deadlock. rmr_clt_open and rmr_clt_close use + * mutex_trylock and mutex_lock respectively to handle this. + */ + mutex_lock(&clt_pool->clt_pool_lock); + + normal_sess = NULL; + auth_sess = NULL; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + int state = atomic_read(&pool_sess->state); + + if (state == RMR_CLT_POOL_SESS_NORMAL) { + if (!normal_sess) + normal_sess = pool_sess; + } else if (state == RMR_CLT_POOL_SESS_RECONNECTING && + pool_sess->was_last_authoritative && + !pool_sess->maintenance_mode && + !auth_sess) { + auth_sess = pool_sess; + } + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + /* + * Invariant: at most one was_last_authoritative session can exist + * (guaranteed by atomic_dec_and_test in pool_sess_change_state), and + * it cannot coexist with a NORMAL session (if a NORMAL session exists, + * the pool never fully went to FAILED, so no session gets the flag). 
+ */ + if (WARN_ON(auth_sess && normal_sess)) { + err = -EINVAL; + goto out; + } + + /* Case 2: was_last_authoritative session — enable it directly, then spread */ + if (auth_sess) { + err = send_msg_enable_pool(auth_sess, 1); + if (err) { + pr_err("%s: pool %s failed to enable auth sess %s: %d\n", + __func__, pool->poolname, auth_sess->sessname, err); + goto out; + } + pool_sess_change_state(auth_sess, RMR_CLT_POOL_SESS_NORMAL); + normal_sess = auth_sess; + } + + /* Case 1: ≥1 NORMAL session → spread map to all non-NORMAL sessions */ + if (normal_sess) { + idx = srcu_read_lock(&pool->sess_list_srcu); + err = rmr_clt_spread_map(pool, normal_sess, true, true); + if (err) + pr_err("%s: pool %s spread map from %s failed: %d\n", + __func__, pool->poolname, normal_sess->sessname, err); + else + goto out_normal; + + srcu_read_unlock(&pool->sess_list_srcu, idx); + goto out; + } + + /* Cases 3/4: all pool_md members present and RECONNECTING */ + for (j = 0; j < RMR_POOL_MAX_SESS; j++) { + struct rmr_clt_pool_sess *ps; + u8 mid = pool->pool_md.srv_md[j].member_id; + + if (!mid) + continue; + + any_member = true; + ps = xa_load(&pool->stg_members, mid); + if (!ps || atomic_read(&ps->state) != RMR_CLT_POOL_SESS_RECONNECTING || + ps->maintenance_mode) { + pr_info("%s: pool %s member_id %u not yet in reconnecting/mm, waiting\n", + __func__, pool->poolname, mid); + goto out; + } + } + + if (!any_member) { + pr_info("%s: pool %s has no members in pool_md, nothing to do\n", + __func__, pool->poolname); + goto out; + } + + pr_info("%s: pool %s all members reconnecting, starting last_io_update\n", + __func__, pool->poolname); + + err = rmr_clt_start_last_io_update(pool); + if (err) { + pr_err("%s: pool %s last_io_update failed: %d\n", + __func__, pool->poolname, err); + goto out; + } + + idx = srcu_read_lock(&pool->sess_list_srcu); +out_normal: + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) != RMR_CLT_POOL_SESS_RECONNECTING || + pool_sess->maintenance_mode) + continue; + + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL); + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + +out: + mutex_unlock(&clt_pool->clt_pool_lock); + return err; +} + +/** + * rmr_clt_read_pool_md() - Read the full pool_md from a storage server's disk + * + * @pool_sess: The pool session to read from. + * + * Sends RMR_CMD_MD_SEND with read_full_md=1 to the given session and imports + * the returned srv_md[] entries into pool->pool_md, skipping already-known + * members. Used during add_sess mode=assemble so the client learns all pool + * member IDs from the server's on-disk metadata, not only the one being + * assembled. + * + * Return: + * 0 on success, negative error code on failure. 
+ */ +static int rmr_clt_read_pool_md(struct rmr_clt_pool_sess *pool_sess, bool first) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool_md *remote_md; + int i, err; + + remote_md = kzalloc(sizeof(*remote_md), GFP_KERNEL); + if (!remote_md) + return -ENOMEM; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MD_SEND; + msg.md_send_cmd.src_mapped_size = pool->mapped_size; + msg.md_send_cmd.sender_id = pool_sess->member_id; + msg.md_send_cmd.read_full_md = 1; + + err = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, + remote_md, sizeof(*remote_md)); + if (err) { + pr_err("%s: failed to read pool_md from sess %s: %d\n", + __func__, pool_sess->sessname, err); + goto out; + } + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + u8 mid = remote_md->srv_md[i].member_id; + int idx; + + if (!mid) + continue; + + idx = rmr_pool_find_md(&pool->pool_md, mid, first); + if (idx < 0) + continue; + + if (!pool->pool_md.srv_md[idx].member_id) { + /* New entry — import blindly */ + memcpy(&pool->pool_md.srv_md[idx], &remote_md->srv_md[i], + sizeof(struct rmr_srv_md)); + } else { + /* Already known — verify stable fields are consistent */ + if (pool->pool_md.srv_md[idx].mapped_size != + remote_md->srv_md[i].mapped_size) + pr_warn("%s: member_id %u mapped_size mismatch: " + "expected %llu, got %llu from sess %s\n", + __func__, mid, + pool->pool_md.srv_md[idx].mapped_size, + remote_md->srv_md[i].mapped_size, + pool_sess->sessname); + } + } + +out: + kfree(remote_md); + return err; +} + +/** + * rmr_clt_process_non_sync_sess() - Set up map and notify peers for a new non-sync session + * + * @pool_sess: The newly added pool session. + * @create: True if this is a fresh pool creation; false for an assemble of an + * existing pool. + * @dirty: True if there are already other sessions in the pool; the new member's + * map will be marked fully dirty to trigger a resync. + * + * Creates the dirty map for @pool_sess and informs all existing pool members + * about the new storage node joining. On failure the map is removed. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_clt_process_non_sync_sess(struct rmr_clt_pool_sess *pool_sess, bool create, + bool dirty) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_dirty_id_map *map; + enum rmr_pool_info_mode mode; + u8 created_mids[RMR_POOL_MAX_SESS]; + int created_cnt = 0; + int i, err = 0; + + /* + * The mapped size of the pool is set after a backend device is mapped to the + * client. If a new client pool session is extended to this pool, the map for that + * new server node needs to be created for the client pool as well. + */ + if (!pool->mapped_size) { + pr_err("%s: pool %s mapped_size is 0\n", + __func__, pool->poolname); + err = -EINVAL; + goto out; + } + + pr_info("Through add_sess, pool %s mapped_size %llu\n", + pool->poolname, pool->mapped_size); + + rmr_pool_update_no_of_chunk(pool); + + if (create) { + if (rmr_pool_find_map(pool, pool_sess->member_id)) { + pr_err("%s: pool %s map for member_id %u already exists\n", + __func__, pool->poolname, pool_sess->member_id); + err = -EEXIST; + goto out; + } + + map = rmr_map_create(pool, pool_sess->member_id); + if (IS_ERR(map)) { + err = PTR_ERR(map); + pr_err("%s: pool %s failed to create map for member_id %u\n", + __func__, pool->poolname, pool_sess->member_id); + goto out; + } + + /* + * During pool creation, all storage nodes must start with identical + * data. 
The first node added is taken as the clean reference; any + * subsequent node joining must be fully synced from it. + * Mark the entire map dirty to trigger that initial resync. + */ + if (dirty) + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + + mode = RMR_POOL_INFO_MODE_CREATE; + } else { + /* + * For assemble, read pool_md first so we know all member IDs, + * then create maps for every member in the pool. + */ + mode = RMR_POOL_INFO_MODE_ASSEMBLE; + + err = rmr_clt_read_pool_md(pool_sess, !dirty); + if (err) { + pr_err("%s: failed to read pool_md from sess %s: %d\n", + __func__, pool_sess->sessname, err); + goto out; + } + + if (!dirty) { + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + u8 mid = pool->pool_md.srv_md[i].member_id; + + if (!mid) + continue; + + map = rmr_map_create(pool, mid); + if (IS_ERR(map)) { + err = PTR_ERR(map); + pr_err("%s: pool %s failed to create map for member_id %u\n", + __func__, pool->poolname, mid); + goto del_maps; + } + created_mids[created_cnt++] = mid; + } + } + } + + /* + * We need to send the info about this node joining to other storage nodes. + */ + err = rmr_clt_send_pool_info(pool_sess, RMR_POOL_INFO_OP_ADD, mode, dirty); + if (err) { + pr_err("rmr_clt_send_pool_info failed for session %s\n", + pool_sess->sessname); + if (create) + rmr_pool_remove_map(pool, pool_sess->member_id); + else + goto del_maps; + goto out; + } + + if (!create) { + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING); + err = rmr_clt_pool_try_enable(pool); + if (err) + pr_err("%s: pool %s try_enable failed for sess %s: %d\n", + __func__, pool->poolname, pool_sess->sessname, err); + } + + return err; + +del_maps: + for (i = 0; i < created_cnt; i++) + rmr_pool_remove_map(pool, created_mids[i]); +out: + return err; +} + +/** + * rmr_clt_add_pool_sess() - Add a client session to an RMR pool + * + * @pool: The pool to join. + * @clt_sess: The client transport session to associate. + * @create: True if this is a fresh pool creation; false for an assemble of an + * existing pool. + * + * Sends a join_pool command to the server, allocates a pool session, creates + * the dirty map for this storage node (for non-sync pools), and notifies the + * other pool members via a pool_info message. + * + * Return: + * Pointer to the new pool session on success, ERR_PTR on failure. 
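+ *
+ * Context:
+ * Acquires pool->sess_lock (and clt_sess->lock) internally, so neither
+ * may be held by the caller.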
+ */
+struct rmr_clt_pool_sess *rmr_clt_add_pool_sess(struct rmr_pool *pool,
+ struct rmr_clt_sess *clt_sess, bool create)
+{
+ struct rmr_clt_pool *clt_pool;
+ struct rmr_clt_pool_sess *pool_sess;
+ struct rmr_pool_md *clt_md;
+ int err, idx;
+ bool dirty = false;
+
+ mutex_lock(&pool->sess_lock);
+
+ if (__find_sess_by_name(pool, clt_sess->sessname)) {
+ pr_err("Session '%s' already exists in pool %s\n",
+ clt_sess->sessname, pool->poolname);
+ err = -EEXIST;
+ goto err_out;
+ }
+
+ pool_sess = alloc_pool_sess(pool, clt_sess);
+ if (IS_ERR(pool_sess)) {
+ pr_err("pool session '%s' cannot be allocated in pool %s\n",
+ clt_sess->sessname, pool->poolname);
+ err = PTR_ERR(pool_sess);
+ goto err_out;
+ }
+
+ clt_pool = (struct rmr_clt_pool *)pool->priv;
+
+ /* TODO: handle the case where tags are already initialized */
+ clt_pool->queue_depth = clt_sess->queue_depth;
+ clt_md = &clt_pool->pool->pool_md;
+ clt_md->queue_depth = clt_sess->queue_depth;
+
+ if (!pool->sync)
+ dirty = !list_empty(&pool->sess_list);
+
+ err = send_msg_join_pool(pool_sess, create, dirty, WAIT);
+ if (unlikely(err)) {
+ pr_err("send_msg_join_pool error %d\n", err);
+ goto free_sess;
+ }
+
+ /*
+ * Now that we have the member_id of the new storage node,
+ * check if it is unique.
+ */
+ idx = srcu_read_lock(&pool->sess_list_srcu);
+ if (__find_sess_by_member_id(pool, pool_sess->member_id)) {
+ srcu_read_unlock(&pool->sess_list_srcu, idx);
+ pr_err("%s: Session with member_id %u already exists\n",
+ __func__, pool_sess->member_id);
+ err = -EEXIST;
+ goto err_leave_pool;
+ }
+ srcu_read_unlock(&pool->sess_list_srcu, idx);
+
+ list_add_tail_rcu(&pool_sess->entry, &pool->sess_list);
+
+ if (!pool->sync) {
+ err = rmr_clt_process_non_sync_sess(pool_sess, create, dirty);
+ if (err) {
+ pr_err("%s: rmr_clt_process_non_sync_sess failed for sess %s with err %d\n",
+ __func__, clt_sess->sessname, err);
+ goto rem_from_list;
+ }
+ } else
+ pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL);
+
+ mutex_unlock(&pool->sess_lock);
+
+ mutex_lock(&clt_sess->lock);
+ list_add_tail(&pool_sess->clt_sess_entry, &clt_sess->pool_sess_list);
+ mutex_unlock(&clt_sess->lock);
+
+ return pool_sess;
+
+rem_from_list:
+ rmr_clt_del_pool_sess(pool_sess);
+err_leave_pool:
+ send_msg_leave_pool(pool_sess, create, WAIT);
+free_sess:
+ rmr_clt_free_pool_sess(pool_sess);
+err_out:
+ mutex_unlock(&pool->sess_lock);
+ return ERR_PTR(err);
+}
+
+/* requires g_sess_lock to be held */
+static struct rmr_clt_sess *__find_and_get_clt_sess(const char *sessname)
+{
+ struct rmr_clt_sess *sess, *sn;
+
+again:
+ list_for_each_entry_safe(sess, sn, &g_sess_list, g_list) {
+ if (strcmp(sessname, sess->sessname))
+ continue;
+
+ if (rmr_clt_sess_get(sess))
+ return sess;
+
+ pr_info("failed to get ref for sess %s\n", sessname);
+ goto again; /* sess is being torn down, restart the scan */
+ }
+
+ return NULL;
+}
+
+struct rmr_clt_sess *find_and_get_or_create_clt_sess(char *sessname,
+ struct rtrs_addr *paths,
+ size_t path_cnt)
+{
+ struct rmr_clt_sess *sess;
+
+ mutex_lock(&g_sess_lock);
+ sess = __find_and_get_clt_sess(sessname);
+ mutex_unlock(&g_sess_lock);
+
+ if (!sess) {
+ pr_info("%s: Cannot find rmr_clt_sess with name %s\n", __func__, sessname);
+ sess = rmr_clt_create_sess(sessname, paths, path_cnt);
+ if (IS_ERR(sess))
+ return sess;
+ pr_info("%s: rmr_clt_sess %s created\n", __func__, sessname);
+ }
+
+ return sess;
+}
+
+/**
+ * rmr_clt_del_pool_sess() - Remove a session from the pool session list.
+ * @pool_sess: Pool session to remove.
+ * + * Removes @pool_sess from the pool's session list, waits for any in-progress + * SRCU readers to finish, and clears any per-CPU cached references to it. + * + * Context: Caller must hold pool->sess_lock. + */ +void rmr_clt_del_pool_sess(struct rmr_clt_pool_sess *pool_sess) +{ + int cpu; + bool dosync = false; + struct rmr_clt_pool_sess __rcu **ppcpu_sess; + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + + list_del_rcu(&pool_sess->entry); + synchronize_srcu(&pool->sess_list_srcu); + + for_each_possible_cpu(cpu) { + preempt_disable(); + ppcpu_sess = per_cpu_ptr(clt_pool->pcpu_sess, cpu); + if (pool_sess == rcu_access_pointer(*ppcpu_sess)) { + rcu_assign_pointer(*ppcpu_sess, NULL); + dosync = true; + } + preempt_enable(); + } + + if (dosync) + synchronize_srcu(&pool->sess_list_srcu); +} + +/** + * rmr_clt_destroy_pool_sess() - Send leave_pool and free a pool session + * + * @pool_sess: Pool session to destroy. + * @delete: True for a permanent pool deletion; false for a temporary + * disassembly. This flag is forwarded in the leave_pool message + * so the server can act accordingly. + */ +void rmr_clt_destroy_pool_sess(struct rmr_clt_pool_sess *pool_sess, bool delete) +{ + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + send_msg_leave_pool(pool_sess, delete, WAIT); + rmr_clt_free_pool_sess(pool_sess); + rmr_clt_sess_put(clt_sess); +} + +static void rmr_clt_destroy_pool(struct rmr_pool *pool) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + struct rmr_clt_pool_sess *pool_sess, *tmp; + + destroy_clt_pool(pool); + + list_for_each_entry_safe (pool_sess, tmp, &pool->sess_list, entry) { + mutex_lock(&pool->sess_lock); + list_del_rcu(&pool_sess->entry); + mutex_unlock(&pool->sess_lock); + + rmr_clt_destroy_pool_sess(pool_sess, false /* never delete */); + } + + rmr_put_clt_pool(clt_pool); +} + +int rmr_clt_remove_pool_from_sysfs(struct rmr_pool *pool, + const struct attribute *sysfs_self) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (!pool->sync) + cancel_delayed_work_sync(&clt_pool->recover_dwork); + + rmr_clt_destroy_pool_sysfs_files(pool, sysfs_self); + rmr_clt_destroy_pool(pool); + return 0; +} + +/* + * Pre-requisite: rcu read lock should be held by caller + */ +static struct rmr_clt_pool_sess * +rmr_clt_next_sess(struct rmr_pool *pool, struct rmr_clt_pool_sess *prev) +{ + struct rmr_clt_pool_sess *next; + + next = list_next_or_null_rcu(&pool->sess_list, + &prev->entry, + struct rmr_clt_pool_sess, + entry); + if (next) + return next; + + return list_first_or_null_rcu(&pool->sess_list, + struct rmr_clt_pool_sess, + entry); +} + +static inline bool rmr_clt_pool_sess_in_iu(struct rmr_iu *iu, + struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(iu->sess_list), entry) { + + if (sess_iu->pool_sess == pool_sess) + return true; + } + + return false; +} + +/* + * Pre-requisite: rcu read lock should be held by caller + */ +static struct rmr_clt_pool_sess *rmr_clt_round_robin_sess(struct rmr_pool *pool, + struct rmr_iu *iu) +{ + struct rmr_clt_pool_sess *old, *next, *pool_sess; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess __rcu **ppcpu_sess; + + clt_pool = (struct rmr_clt_pool *)pool->priv; + ppcpu_sess = this_cpu_ptr(clt_pool->pcpu_sess); + + if (iu) { + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) 
{ + if (rmr_clt_pool_sess_in_iu(iu, pool_sess)) + continue; + + rcu_assign_pointer(*ppcpu_sess, pool_sess); + return pool_sess; + } + + return NULL; + } + + old = rcu_dereference(*ppcpu_sess); + if (!old) { + next = rmr_clt_get_first_normal_session(pool); + if (!next) + return NULL; + rcu_assign_pointer(*ppcpu_sess, next); + return next; + } + + for (next = rmr_clt_next_sess(pool, old); + next && next != old; + next = rmr_clt_next_sess(pool, next)) { + /* + * It could happen that the state of pool_sess hasn't been able to + * represent the recent rtrs-clt sess state. + */ + if (next->clt_sess->state == RMR_CLT_SESS_DISCONNECTED) + continue; + + if (atomic_read(&next->state) == RMR_CLT_POOL_SESS_NORMAL) { + rcu_assign_pointer(*ppcpu_sess, next); + return next; + } + } + + /* + * There may be just one session with normal state i.e. old. + * In this case per-cpu sess pointer does not need update. + */ + return rmr_clt_get_first_normal_session(pool); +} + +int rmr_clt_query(struct rmr_pool *pool, struct rmr_attrs *attr) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + int idx; + + if (unlikely(!clt_pool)) + return -EINVAL; + + attr->chunk_size = pool->chunk_size; + attr->sync = pool->sync; + + attr->queue_depth = U32_MAX; + attr->max_io_size = U32_MAX; + attr->max_segments = U32_MAX; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + if (list_empty(&pool->sess_list)) { + srcu_read_unlock(&pool->sess_list_srcu, idx); + return -ENOENT; + } + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + attr->queue_depth = min_t(int, clt_sess->queue_depth, attr->queue_depth); + attr->max_io_size = min_t(u32, clt_sess->max_io_size, attr->max_io_size); + attr->max_segments = min_t(u32, clt_sess->max_segments, attr->max_segments); + } + attr->pool_kobj = &(pool->kobj); + + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return 0; +} +EXPORT_SYMBOL(rmr_clt_query); + +struct rmr_iu *rmr_clt_get_iu(struct rmr_pool *pool, enum rmr_io_flags flag, + enum rmr_wait_type wait) +{ + int err = 0, idx; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_iu *iu; + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + bool reset = false; + + clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state)) { + pr_err("%s: Pool %s not in use state\n", __func__, pool->poolname); + rmr_clt_dump_state(clt_pool); + return NULL; + } + + /* + * We get the inflight ref first. + * If we see that an IO freeze is in progress, we put the ref, and wait for it to unfreeze + * + * The while loop protects us from parallel freeze, like + * A leg deletion, and right after that a call to rmr_clt_close. + * + * We are guranteed to not go on an infinite loop, since rmr_clt_close can be called only + * once, And, there are limited legs to delete + */ + percpu_ref_get(&pool->ids_inflight_ref); + while (atomic_read(&clt_pool->io_freeze) > 0) { + percpu_ref_put(&pool->ids_inflight_ref); + /* + * Coincidentally, the rcu lock might be held when the wait event occurs, + * violating the constraint that no sleeping during general rcu critical section. + * Temporarily release the rcu lock, and re-acquire it after waking up. + * + * TODO: This approach is simple but may need to be revisited. 
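+ *
+ * The "reset" flag below remembers that the RCU read lock was dropped
+ * here so that it can be re-acquired right after wait_event() returns.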
+ */
+ if (rcu_read_lock_held()) {
+ rcu_read_unlock();
+ reset = true;
+ }
+
+ wait_event(clt_pool->map_update_wq, !atomic_read(&clt_pool->io_freeze));
+
+ if (reset)
+ rcu_read_lock();
+
+ /*
+ * Once IO is unfrozen, we check if the state of the pool has changed.
+ * It could be that rmr_clt_close was called, and hence the state is not IN_USE.
+ * Or it could be that the last leg was deleted, and we are not in the JOINED state.
+ *
+ * In both cases we cannot service IOs, hence we fail.
+ */
+ if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state) ||
+ !test_bit(RMR_CLT_POOL_STATE_JOINED, &clt_pool->state)) {
+ pr_err("%s: Failed to get inflight IO ref.\n", __func__);
+ pr_err("%s: Pool %s is not joined or used\n",
+ __func__, pool->poolname);
+ rmr_clt_dump_state(clt_pool);
+ return NULL;
+ }
+
+ percpu_ref_get(&pool->ids_inflight_ref);
+ }
+
+ iu = rmr_alloc_iu();
+ if (unlikely(!iu)) {
+ percpu_ref_put(&pool->ids_inflight_ref);
+ return NULL;
+ }
+
+ idx = srcu_read_lock(&pool->sess_list_srcu);
+ if (rmr_op(flag) == RMR_OP_READ) {
+ /*
+ * Round-robin use of one of the sessions in normal state for READ.
+ *
+ * This call is always from rmr_clt_request, so for READ
+ * this is the first pool_sess we are trying.
+ */
+ pool_sess = rmr_clt_round_robin_sess(pool, NULL);
+ if (unlikely(!pool_sess)) {
+ err = -ENODEV;
+ goto put_iu;
+ }
+
+ sess_iu = rmr_get_sess_iu(pool_sess, RTRS_IO_CON, (enum wait_type) wait);
+ if (unlikely(!sess_iu))
+ goto put_iu;
+
+ sess_iu->rmr_iu = iu;
+ iu->num_sessions = 1;
+ list_add_tail(&(sess_iu->entry), (&iu->sess_list));
+ } else {
+ /*
+ * For WRITE operations we need to submit to all sessions.
+ */
+ list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry,
+ (srcu_read_lock_held(&pool->sess_list_srcu))) {
+ /* Sessions must be in normal state for I/O */
+ if (atomic_read(&pool_sess->state) != RMR_CLT_POOL_SESS_NORMAL)
+ continue;
+
+ sess_iu = rmr_get_sess_iu(pool_sess,
+ RTRS_IO_CON, (enum wait_type) wait);
+ if (unlikely(!sess_iu))
+ goto put_sessions;
+
+ sess_iu->rmr_iu = iu;
+ /*
+ * The mem_id of sess_iu tracks the next free slot in the permit bitmap
+ * of an RTRS-clt session, which is used by the RMR server to store
+ * write IO chunk info.
+ */
+ sess_iu->mem_id = sess_iu->permit->mem_id;
+ iu->num_sessions++;
+ list_add_tail(&(sess_iu->entry), (&iu->sess_list));
+ }
+ }
+
+ refcount_set(&iu->refcount, iu->num_sessions);
+ iu->errno = 0;
+
+ srcu_read_unlock(&pool->sess_list_srcu, idx);
+
+ return iu;
+
+put_sessions:
+ list_for_each_entry_safe(sess_iu, tmp_sess_iu,
+ &(iu->sess_list), entry) {
+ if (!list_empty(&sess_iu->entry))
+ list_del_init(&sess_iu->entry);
+ rmr_put_sess_iu(sess_iu->pool_sess, sess_iu);
+ }
+put_iu:
+ srcu_read_unlock(&pool->sess_list_srcu, idx);
+ rmr_put_iu(iu);
+ percpu_ref_put(&pool->ids_inflight_ref);
+
+ if (err)
+ return ERR_PTR(err);
+
+ return NULL;
+}
+EXPORT_SYMBOL(rmr_clt_get_iu);
+
+void rmr_clt_put_iu(struct rmr_pool *pool, struct rmr_iu *iu)
+{
+ rmr_put_iu(iu);
+ percpu_ref_put(&pool->ids_inflight_ref);
+}
+EXPORT_SYMBOL(rmr_clt_put_iu);
+
+/**
+ * Returns 1 if the errno represents a condition in the
+ * storage server that prevents the operation from being executed.
+ * The opposite is an error with respect to the storage server,
+ * where the operation can be retried on a different one.
+ *
+ * An example is an attempt to read a block that does not exist
+ * versus a server that has crashed.
+ *
+ * Note that when in doubt we have to trigger the retry.
+ */ +/* +static inline int rmr_is_op_error(int errno) +{ + switch (-errno) { + case ENOENT: + case EINVAL: + case EEXIST: + case ENODEV: + return 1; + default: + return 0; + } +} +*/ + +static void msg_read_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + struct rmr_iu *iu = sess_iu->rmr_iu; + rmr_conf_fn *clt_conf = iu->conf; + + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED); + + if (errno) { + if (!iu->errno) + /* only first error is reported */ + iu->errno = errno; + + pr_err_ratelimited("%s got errno: %d for session %d. Schedule retry.\n", + __func__, errno, pool_sess->member_id); + if (!pool_sess->pool->sync) + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED); + + INIT_WORK(&iu->work, retry_failed_read); + schedule_work(&iu->work); + } else { + (*clt_conf)(iu->priv, errno); + } +} + +static void retry_failed_read(struct work_struct *work) +{ + struct rmr_iu *iu = container_of(work, struct rmr_iu, work); + struct rmr_pool *pool = iu->pool; + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + rmr_conf_fn *clt_conf = iu->conf; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu; + struct rtrs_clt_req_ops req_ops; + struct kvec vec; + int err, idx; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + pool_sess = rmr_clt_round_robin_sess(pool, iu); + if (!pool_sess) + goto give_up; + + sess_iu = rmr_get_sess_iu(pool_sess, RTRS_IO_CON, RTRS_PERMIT_WAIT); + if (unlikely(!sess_iu)) + goto give_up; + + pr_debug("%s: Pool %s to session %d, chunk [%llu, %llu]\n", + __func__, pool->poolname, pool_sess->member_id, + le64_to_cpu(iu->msg.id_a), le64_to_cpu(iu->msg.id_b)); + + sess_iu->rmr_iu = iu; + iu->msg.member_id = pool_sess->member_id; + atomic_inc(&clt_pool->stats.read_retries); + + list_add_tail(&(sess_iu->entry), (&iu->sess_list)); + + vec = (struct kvec) { + .iov_base = &iu->msg, + .iov_len = sizeof(iu->msg) + }; + + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_read_conf, + }; + + trace_retry_failed_read(READ, sess_iu); + + err = rtrs_clt_request(RMR_OP_READ, &req_ops, pool_sess->clt_sess->rtrs, sess_iu->permit, + &vec, 1, le32_to_cpu(iu->msg.length), iu->sg, iu->sg_cnt); + + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (err) + /* beware! recursion!! */ + msg_read_conf(sess_iu, err); + + return; +give_up: + srcu_read_unlock(&pool->sess_list_srcu, idx); + /* recursion termination! */ + (*clt_conf)(iu->priv, iu->errno); +} + +/* +static int rmr_clt_map_remove_id(struct rmr_pool *pool, int srv_id, rmr_id_t id) +{ + struct rmr_dirty_id_map *map; + + pr_debug("pool %s, remove id (%llu, %llu) for stg_id %d\n", + pool->poolname, id.a, id.b, srv_id); + + map = rmr_pool_find_map(pool, srv_id); + if (!map) { + pr_err("pool %s no map found for pool_id %u\n", + pool->poolname, srv_id); + return -EINVAL; + //TODO: handle this , probably initialize map, or just throw err? 
+ } + + if (!rmr_map_empty(map)) { + void *val; + + val = rmr_map_find(map, id); + if (!val) { + pr_debug("pool %s value for id (%llu, %llu) is not in the dirty map\n", + pool->poolname, id.a, id.b); + return 0; + } + rmr_map_erase(map, id); + pr_debug("pool %s, id (%llu, %llu) is removed from map for stg_id %d\n", + pool->poolname, id.a, id.b, srv_id); + } + + return 0; +} +*/ + +static void msg_io_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + struct rmr_iu *iu = sess_iu->rmr_iu; + rmr_conf_fn *clt_conf = iu->conf; + void *clt_priv = iu->priv; + + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED); + WARN_ON(pool_sess->pool->sync); + + if (errno) { + pr_err("%s: For sess %s, id (%llu, %llu), got errno: %d\n", + __func__, pool_sess->sessname, iu->msg.id_a, iu->msg.id_b, errno); + sess_iu->errno = errno; + if (!iu->errno) + /* only first error is reported */ + iu->errno = errno; + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED); + pr_debug("iu->errno %d, errno %d, before dec refcnt %d\n", + iu->errno, errno, refcount_read(&iu->refcount)); + } else { + atomic_inc(&iu->succeeded); + // TODO: is it ok to clear it here? + // rmr_clt_map_remove_id(session->pool, session->pool_id, iu->id); + } + + pr_debug("called for id (%llu, %llu), errno %d, sessname %s\n", + iu->msg.id_a, iu->msg.id_b, errno, pool_sess->sessname); + + if (refcount_dec_and_test(&iu->refcount)) { + if (atomic_read(&iu->succeeded) == 0) { + /* + * None of the IOs succeeded. + * Map add is not needed; Just fail the IO. + */ + pr_err("Write IO failed. Passing it up. errno %d\n", iu->errno); + (*clt_conf)(clt_priv, iu->errno); + } else if (iu->errno) { + /* + * Some IOs failed. Send map update (add). + * The clt conf will be called when map update is done. + * + * We are using the same iu to send map update + * So reset the refcount. + */ + refcount_set(&iu->refcount, iu->num_sessions); + + /* + * we are in interrupt here, so sched map update + */ + pr_debug("%s: some IOs failed for %s. Starts map_add\n", __func__, + pool_sess->sessname); + INIT_WORK(&iu->work, sched_map_add); + schedule_work(&iu->work); + } else { + /* + * All good. + */ + errno = 0; + (*clt_conf)(clt_priv, errno); + } + } +} + +static inline void rmr_clt_put_cu(struct rmr_clt_cmd_unit *cmd_unit) +{ + percpu_ref_put(&cmd_unit->clt_pool->pool->ids_inflight_ref); + kfree(cmd_unit); +} + +/** + * msg_cmd_conf() - Confirmation function called for command user commands sent + * + * priv: Pointer to private data passed to rtrs. sess_iu in this case. + * errno: error status passed by rtrs + */ +static void msg_cmd_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + struct rmr_clt_cmd_unit *cmd_unit = sess_iu->rmr_cmd_unit; + rmr_conf_fn *clt_conf = cmd_unit->conf; + void *clt_priv = cmd_unit->priv; + int total_failed; + + pr_debug("%s: sessname:%s, errno=%d\n", __func__, sess_iu->pool_sess->sessname, errno); + if (!errno) + atomic_inc(&cmd_unit->succeeded); + + if (refcount_dec_and_test(&cmd_unit->refcount)) { + if (atomic_read(&cmd_unit->succeeded) == 0) { + /* + * None of the IOs succeeded. + */ + pr_err("CMD failed with err %pe. Passing it up.\n", ERR_PTR(errno)); + (*clt_conf)(clt_priv, errno); + } else { + total_failed = cmd_unit->failed_state + + (cmd_unit->num_sessions - atomic_read(&cmd_unit->succeeded)); + /* + * Pass the number of failures up to the user. 
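+ * total_failed is the number of sessions that were already FAILED when
+ * the command unit was built (failed_state) plus the sessions whose
+ * completion reported an error (num_sessions - succeeded).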
+ */ + (*clt_conf)(clt_priv, total_failed); + } + + rmr_clt_put_cu(cmd_unit); + } + + rmr_put_sess_iu(sess_iu->pool_sess, sess_iu); +} + +/* The amount of data that belongs to an I/O and the amount of data that + * should be read or written to the disk (bi_size) can differ. + * + * E.g. When WRITE_SAME is used, only a small amount of data is + * transferred that is then written repeatedly over a lot of sectors. + * + * Get the size of data to be transferred via RTRS by summing up the size + * of the scather-gather list entries. + */ +static size_t rmr_clt_get_sg_size(struct scatterlist *sglist, u32 len) +{ + struct scatterlist *sg; + size_t tsize = 0; + int i; + + for_each_sg(sglist, sg, len, i) + tsize += sg->length; + return tsize; +} + +/** + * rmr_clt_request() - Request data transfer to/from storage node via given pool + * + * @pool: The Pool + * @iu: Iu allocated by pevious rmr_clt_get_iu call. + * @offset: offset inside the object to read/write: + * @length: length of data starting from offset + * @flag: READ/WRITE/REMOVE + * @prio: priority of IO + * @priv: User provided data, passed back with corresponding + * @(conf) confirmation. + * @conf: callback function to be called as confirmation + * @sg: Pages to be sent/received to/from server. + * @sg_cnt: Number of elements in the @sg + * + * Description: + * Data transfer through the given pool, using the underlying RTRS <-> RDMA + * While sending write IOs, if there are FAILED or RECONNECTING pool sessions, that IO + * would be added as dirty for such sessions. + * This is used by both pserver client, and the rmr server on the storage node to perform + * sync reads. + * + * Return: + * 0 on success. This means IO was sent. Final confirmation would be sent via conf function + * Error value on failure + */ +int rmr_clt_request(struct rmr_pool *pool, struct rmr_iu *iu, + size_t offset, size_t length, enum rmr_io_flags flag, unsigned short prio, + void *priv, rmr_conf_fn *conf, struct scatterlist *sg, unsigned int sg_cnt) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + struct rtrs_clt_req_ops req_ops; + rmr_id_t id; + struct kvec vec; + size_t sg_len; + int dir, err, idx; + u32 rmr_flag; + + rmr_get_iu(iu); + rmr_flag = rmr_op(flag); + dir = (rmr_flag == RMR_OP_READ) ? READ : WRITE; + + sg_len = rmr_clt_get_sg_size(sg, sg_cnt); + if (!(flag & RMR_OP_DISCARD || flag & RMR_OP_WRITE_ZEROES)) + WARN_ON(length != sg_len); + + iu->msg.hdr.group_id = cpu_to_le32(pool->group_id); + iu->msg.hdr.type = cpu_to_le16(RMR_MSG_IO); + iu->msg.hdr.__padding = 0; + + iu->msg.offset = cpu_to_le32(offset); + iu->msg.length = cpu_to_le32(length); + iu->msg.flags = cpu_to_le32(flag); + iu->msg.prio = cpu_to_le16(prio); + + iu->msg.sync = pool->sync; + + iu->priv = priv; + iu->conf = conf; + iu->pool = pool; + + if (rmr_flag != RMR_OP_FLUSH && sg_len) { + rmr_map_calc_chunk(pool, offset, length, &id); + /* + * We are not ready to process IO requests which are across chunk boundary. + * The main area which needs work is triggering sync IO (see rmr-req.c) which + * holding the IO which touches multiple chunks. And then making sure other IOs + * which overlap these chunks are held properly, and restarted once the corresponding + * chunk is synced. 
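+ *
+ * Until that work is done, such cross-chunk requests are treated as a
+ * programming error and are caught by the BUG_ON() below.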
+ */ + BUG_ON(id.a > 1); + iu->msg.id_a = cpu_to_le64(id.a); + iu->msg.id_b = cpu_to_le64(id.b); + } + + if (rmr_flag == RMR_OP_READ) { + iu->sg = sg; + iu->sg_cnt = sg_cnt; + } else if (!pool->sync && rmr_flag == RMR_OP_WRITE) { + /* + * We take this path only for request from client side + * Never from rmr_req_remote_read. + */ + int failed_cnt = 0; + int i; + + atomic_set(&iu->succeeded, 0); + idx = srcu_read_lock(&pool->sess_list_srcu); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + struct rmr_clt_pool_sess *ps; + enum rmr_clt_pool_sess_state state; + u8 mid = pool->pool_md.srv_md[i].member_id; + + if (!mid) + continue; + + ps = xa_load(&pool->stg_members, mid); + if (ps) { + state = atomic_read(&ps->state); + if (state != RMR_CLT_POOL_SESS_FAILED && + state != RMR_CLT_POOL_SESS_RECONNECTING) + continue; + } + /* ps == NULL (disassembled) or FAILED/RECONNECTING */ + if (WARN_ON(failed_cnt >= RMR_POOL_MAX_SESS)) + break; + iu->msg.map_ver = cpu_to_le64(pool->map_ver); + iu->msg.failed_id[failed_cnt] = mid; + failed_cnt++; + rmr_clt_map_add_id(pool, mid, id); + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + iu->msg.failed_cnt = failed_cnt; + } else if (pool->sync) { + pr_err("rmr_clt_request: Sync sessions do not process writes\n"); + return -EPERM; + } + + vec = (struct kvec) { + .iov_base = &iu->msg, + .iov_len = sizeof(iu->msg) + }; + + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(iu->sess_list), entry) { + struct rmr_clt_sess *clt_sess; + + pool_sess = sess_iu->pool_sess; + clt_sess = pool_sess->clt_sess; + iu->msg.member_id = pool_sess->member_id; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_REMOVING || + pool_sess->maintenance_mode) { + /* + * The storage for this session is getting removed from + * the pool, or is in maintenance mode. + * Simply complete this IO with error + */ + err = -EAGAIN; + goto complete_io; + } + + pr_debug("Sending %x request to pool %s session %s " + "chunk (%llu, %llu) offset %lu length %lu)\n", + rmr_flag, + pool->poolname, pool_sess->sessname, + id.a, id.b, offset, length); + + if (rmr_flag == RMR_OP_READ) { + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_read_conf, + }; + } else { + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_io_conf, + }; + + /* + * Update mem_id before transmitting each write IO to the corresponding + * server. + */ + iu->msg.mem_id = cpu_to_le32(sess_iu->mem_id); + } + + trace_rmr_clt_request(dir, sess_iu); + + err = rtrs_clt_request(dir, &req_ops, clt_sess->rtrs, + sess_iu->permit, &vec, 1, sg_len, + sg, sg_cnt); + +complete_io: + if (err) { + if (rmr_flag == RMR_OP_READ) + msg_read_conf(sess_iu, err); + else + msg_io_conf(sess_iu, err); + } + } + rmr_put_iu(iu); + + return 0; +} +EXPORT_SYMBOL(rmr_clt_request); + +/** + * rmr_clt_get_cu() - Allocate and return a command unit. + * + * @pool: rmr pool for which the command unit is to be allocated + * + * Description: + * Allocates and returns a command unit for the rmr pool. The command unit contains a list of + * session units, for each session which is not in the "REMOVING" state. 
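+ * Sessions in the FAILED state are skipped as well, but are counted in
+ * the command unit's failed_state so the caller can report them.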
+ *
+ * Return:
+ * Pointer to the command unit
+ */
+static struct rmr_clt_cmd_unit *rmr_clt_get_cu(struct rmr_pool *pool)
+{
+ struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv;
+ struct rmr_clt_pool_sess *pool_sess;
+ struct rmr_clt_cmd_unit *cmd_unit;
+ struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu;
+ int idx;
+
+ if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state)) {
+ pr_err("%s: Pool %s not in use\n", __func__, pool->poolname);
+ rmr_clt_dump_state(clt_pool);
+ return NULL;
+ }
+
+ /*
+ * We get the inflight ref first.
+ * If we see that an IO freeze is in progress, we put the ref and wait for it to unfreeze.
+ *
+ * The while loop protects us from a parallel freeze, such as
+ * a leg deletion followed right away by a call to rmr_clt_close.
+ *
+ * We are guaranteed not to loop forever, since rmr_clt_close can be called only
+ * once, and there is a limited number of legs to delete.
+ */
+ percpu_ref_get(&pool->ids_inflight_ref);
+ while (atomic_read(&clt_pool->io_freeze) > 0) {
+ percpu_ref_put(&pool->ids_inflight_ref);
+ wait_event(clt_pool->map_update_wq, !atomic_read(&clt_pool->io_freeze));
+
+ /*
+ * Once IO is unfrozen, we check if the state of the pool has changed.
+ * It could be that rmr_clt_close was called, and hence the state is not IN_USE.
+ * Or it could be that the last leg was deleted, and we are not in the JOINED state.
+ *
+ * In both cases we cannot service IOs, hence we fail.
+ */
+ if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state) ||
+ !test_bit(RMR_CLT_POOL_STATE_JOINED, &clt_pool->state)) {
+ pr_err("%s: Failed to get inflight IO ref.\n", __func__);
+ pr_err("%s: Pool %s is not joined or used\n", __func__, pool->poolname);
+ rmr_clt_dump_state(clt_pool);
+ return NULL;
+ }
+
+ percpu_ref_get(&pool->ids_inflight_ref);
+ }
+
+ cmd_unit = kzalloc(sizeof(*cmd_unit), GFP_KERNEL);
+ if (!cmd_unit) {
+ percpu_ref_put(&pool->ids_inflight_ref);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&cmd_unit->sess_list);
+ cmd_unit->pool = pool;
+ cmd_unit->clt_pool = clt_pool;
+ atomic_set(&cmd_unit->succeeded, 0);
+
+ idx = srcu_read_lock(&pool->sess_list_srcu);
+ /*
+ * Acquire the permits for all sessions.
+ * We continue only if we manage to get permits for all "normal" sessions.
+ */ + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_REMOVING) + continue; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED) { + cmd_unit->failed_state++; + continue; + } + + sess_iu = rmr_get_sess_iu(pool_sess, RTRS_ADMIN_CON, RTRS_PERMIT_NOWAIT); + if (unlikely(!sess_iu)) + goto put_sessions; + + sess_iu->rmr_cmd_unit = cmd_unit; + + cmd_unit->num_sessions++; + list_add_tail(&(sess_iu->entry), (&cmd_unit->sess_list)); + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + refcount_set(&cmd_unit->refcount, cmd_unit->num_sessions); + + return cmd_unit; + +put_sessions: + srcu_read_unlock(&pool->sess_list_srcu, idx); + + /* Free sess_ius */ + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(cmd_unit->sess_list), entry) { + if (!list_empty(&sess_iu->entry)) + list_del_init(&sess_iu->entry); + rmr_put_sess_iu(sess_iu->pool_sess, sess_iu); + } + + rmr_clt_put_cu(cmd_unit); + + return NULL; +} + +/** + * rmr_clt_cmd_err_conf() - Calls confirmation function for commands + * + * @work: schedules work + * + * Description: + * In case of error in the user command path, we cannot call the confirmation function + * directly, since it might end up calling confirmation function of the user itself. + * Hence a work is scheduled to call the confirmation function in case the code for sending + * user commands itself fails. + */ +static void rmr_clt_cmd_err_conf(struct work_struct *work) +{ + struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work); + + msg_cmd_conf(sess_iu, sess_iu->errno); +} + +/** + * rmr_clt_cmd_with_rsp() - Sends a user command to all sessions of an rmr pool + * + * @pool: rmr pool to which the command is for + * @conf: confirmation function to be called after completion + * @priv: pointer to priv data, to be returned to user while calling conf function + * @usr_vec: kvec containing user data (mostly command messages?) + * @nr: number of kvecs + * @buf: buf where the response from the user server is to be directed + * The buf must be physically contiguous in memory (kmalloc()'d). + * @buf_len: length of the buffer + * @size: size of the buf to be sent to a single session + * + * Description: + * This function provides an interface for the user to send commands to the server side. + * The command is sent as a read, so that the response from the user srv side can be received + * The buffer sent by the user is meant to receive the response from the user server side. + * The size of the buffer is set during rmr_clt_open. + * + * Return: + * 0 on success + * negative errno in case of error + * + * Context: + * Inflight commands will block map update, until the inflights are completed. + */ +int rmr_clt_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv, + const struct kvec *usr_vec, size_t nr, void *buf, int buf_len, size_t size) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + struct rmr_clt_cmd_unit *cmd_unit; + struct rmr_msg_pool_cmd msg = {}; + struct rtrs_clt_req_ops req_ops; + struct kvec *vec; + int i, j, err = 0; + + /* + * TODO: kvmalloc() memory is yet to be supported for SG I/O. 
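+ *
+ * Until then, vmalloc()/kvmalloc()-backed buffers are rejected with
+ * -EINVAL by the is_vmalloc_addr() check below.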
+ */ + if (is_vmalloc_addr(buf)) + return -EINVAL; + + if (buf_len != (RMR_POOL_MAX_SESS * size)) + return -EINVAL; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_USER; + + /* + * RMR msg struct + user vecs + */ + vec = kzalloc((1 + nr) * sizeof(*vec), GFP_KERNEL); + if (!vec) + return -ENOMEM; + + /* + * RMR msg struct first, + * followed by the user kvecs + */ + vec[0].iov_base = &msg; + vec[0].iov_len = sizeof(msg); + for (i = 1, j = 0; j < nr; i++, j++) { + vec[i].iov_base = usr_vec[j].iov_base; + vec[i].iov_len = usr_vec[j].iov_len; + + msg.user_cmd.usr_len += usr_vec[j].iov_len; + } + + cmd_unit = rmr_clt_get_cu(pool); + if (!cmd_unit) { + err = -ENOMEM; + goto out; + } + + cmd_unit->conf = conf; + cmd_unit->priv = priv; + + i = 0; + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(cmd_unit->sess_list), entry) { + pool_sess = sess_iu->pool_sess; + + req_ops = (struct rtrs_clt_req_ops){ + .priv = sess_iu, + .conf_fn = msg_cmd_conf, + }; + + /* + * The user expects each node to be able to send back data of this "size" as + * response. + * So divide the user buffer into chunks of "size", and send them to each leg. + */ + sg_init_one(&sess_iu->sg, buf + (i * size), size); + + trace_rmr_clt_cmd_with_rsp(READ, sess_iu); + + err = rtrs_clt_request(READ, &req_ops, pool_sess->clt_sess->rtrs, sess_iu->permit, + vec, (1 + nr), size, &sess_iu->sg, 1); + if (err) { + /* + * We want to deal with this error just like we deal with the error + * received from the conf function returned from rtrs. + * This would help us to inform the user the correct number of commands + * which failed on the rmr level (rtrs is also rmr level for user). + */ + pr_warn("rtrs_clt_request Failed with err %d\n", err); + sess_iu->errno = err; + INIT_WORK(&sess_iu->work, rmr_clt_cmd_err_conf); + schedule_work(&sess_iu->work); + err = 0; + } + + i++; + } + + /* + * No session to send command + */ + if (i == 0) { + rmr_clt_put_cu(cmd_unit); + err = -EINVAL; + } + +out: + kfree(vec); + + return err; +} +EXPORT_SYMBOL(rmr_clt_cmd_with_rsp); + +/** + * rmr_clt_send_cmd_with_data() - send command containing data buffer as a payload or response + * + * @pool: rmr pool to send command + * @pool_sess: client pool session used to send + * @msg: initialized command message describing the command + * @buf: pointer to the data buffer for data transfers + * @buflen: size of the buffer in bytes + * + * Description: + * Performs sending the command described by msg with a payload or response + * in the buf. + * + * Return: + * 0 on success, error code otherwise. + * + * Context: + * This function blocks while sending the buffer. + * + * Locks: + * should be called under srcu_read_lock since it uses pool_sess + */ +int rmr_clt_send_cmd_with_data(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen) +{ + struct rmr_clt_sess_iu *sess_iu; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct kvec vec = { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + int errno = 0, err = 0; + int dir; + + switch (msg->cmd_type) { + case RMR_CMD_MAP_CHECK: + case RMR_CMD_READ_MAP_BUF: + case RMR_CMD_MAP_GET_VER: + case RMR_CMD_MD_SEND: + case RMR_CMD_MAP_SET_VER: + dir = READ; + break; + case RMR_CMD_MAP_TEST: + case RMR_CMD_SEND_MAP_BUF: + case RMR_CMD_SEND_MD_BUF: + dir = WRITE; + break; + default: + pr_err("%s: pool %s cmd type %u is not supported\n", + __func__, pool->poolname, msg->cmd_type); + return -EINVAL; + } + + // TODO: why io_con not admin? 
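+ /*
+ * Fail fast if the transport session is already disconnected; otherwise
+ * take a permit-backed sess_iu on the I/O connection and send the
+ * command as a blocking user message.
+ */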
+ if (clt_sess->state == RMR_CLT_SESS_DISCONNECTED) { + pr_debug("The rmr client session %s state is disconnected\n", clt_sess->sessname); + err = -EINVAL; + goto err; + } + + sess_iu = rmr_msg_get_iu(pool_sess, RTRS_IO_CON, RTRS_PERMIT_WAIT, 2); + if (unlikely(!sess_iu)) { + err = -ENOMEM; + goto err; + } + + sess_iu->buf = buf; + sg_init_one(&sess_iu->sg, buf, buflen); + + err = send_usr_msg(clt_sess->rtrs, dir, sess_iu, + &vec, 1, buflen, &sess_iu->sg, 1, + msg_pool_cmd_map_content_conf, &errno, WAIT); + if (unlikely(err)) { + rmr_msg_put_iu(pool_sess, sess_iu); + } else { + err = errno; + } + + rmr_msg_put_iu(pool_sess, sess_iu); + +err: + return err; +} + +/** + * rmr_clt_pool_member_synced() - check if the pool member has no data to sync + * + * @pool: rmr pool in which we perform the check + * @member_id: id of the pool member tto check + * + * Description: + * Send the check map command to the pool member with the specified id. + * Pool member returns whether he has unsynced chunks or not. + * + * Return: + * error code if failed to send, 0 if pool member is not synced completely, + * 1 if pool member is synced (has no dirty chunks in his map). + * + * Context: + * This function blocks while sending the command. + * + * Locks: + * no + */ +int rmr_clt_pool_member_synced(struct rmr_pool *pool, u8 member_id) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd_rsp rsp = {}; + struct rmr_msg_pool_cmd msg = {}; + int ret = 0, idx; + enum rmr_clt_pool_sess_state state; + + pr_debug("start looking for session with member_id=%u\n", member_id); + idx = srcu_read_lock(&pool->sess_list_srcu); + + pool_sess = __find_sess_by_member_id(pool, member_id); + if (!pool_sess) { + pr_err("in pool %s failed to find sess with a member_id=%u\n", + pool->poolname, member_id); + ret = -ENOENT; + goto out; + } + + pr_debug("found session %s with member_id=%u\n", + pool_sess->sessname, member_id); + + state = atomic_read(&pool_sess->state); + if (state == RMR_CLT_POOL_SESS_FAILED || + state == RMR_CLT_POOL_SESS_REMOVING) { + pr_debug("pool %s session %s is in %s state, cannot send cmd %s\n", + pool->poolname, pool_sess->sessname, + rmr_clt_sess_state_str(state), rmr_get_cmd_name(msg.cmd_type)); + ret = -EINVAL; + goto out; + } + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MAP_CHECK; + + pr_debug("send cmd %u to %s\n", msg.cmd_type, pool_sess->sessname); + ret = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, &rsp, sizeof(rsp)); + if (ret) { + pr_err("%s: For pool %s failed to %s, err %d\n", + __func__, pool->poolname, rmr_get_cmd_name(msg.cmd_type), ret); + goto out; + } + + if (rsp.value) + ret = 1; // other side reported map is clear + + pr_debug("send cmd %u to %s is done\n", msg.cmd_type, pool_sess->sessname); +out: + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return ret; +} +EXPORT_SYMBOL(rmr_clt_pool_member_synced); + +/** + * rmr_pool_md_to_buf - Fill the buffer with the metadata + * + * @pool: rmr pool contains the metadata. It must be a non-sync pool, + * either client or server pool. + * @buf: buffer to fill with the metadata. 
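+ * For a client pool the whole struct rmr_pool_md is copied to the start
+ * of @buf; for a server pool only srv_md[0] is copied, at offset
+ * RMR_CLT_MD_SIZE into @buf.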
+ * + */ +static void rmr_clt_md_to_buf(struct rmr_pool *pool, u8 *buf) +{ + struct rmr_pool_md *pool_md; + struct rmr_srv_md *srv_md; + + if (pool->is_clt) { + pool_md = (struct rmr_pool_md *)buf; + /* copy the entire client pool md */ + memcpy(pool_md, &pool->pool_md, sizeof(struct rmr_pool_md)); + return; + } + + srv_md = (struct rmr_srv_md *)(&buf[RMR_CLT_MD_SIZE]); + memcpy(srv_md, &pool->pool_md.srv_md[0], RMR_SRV_MD_SIZE); +} + +/** + * rmr_clt_pool_send_md_all() - Send metadata of rmr pool + * + * Description: + * Send metadata of the src pool to all sessions of the client pool. + * 1) If the client pool is sync pool, it sends the entire server pool + * metadata back after the leader reads the metadata of its connected + * nodes. + * 2) If it is non-sync, send the client pool metadata to storage node + * backups. + */ +int rmr_clt_pool_send_md_all(struct rmr_pool *src_pool, struct rmr_pool *clt_pool) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + void *buf; + u32 buflen; + int err = 0, idx; + + if (!clt_pool) { + pr_err("Cannot send metadata when clt_pool is NULL\n"); + return -EINVAL; + } + + if (src_pool->sync) { + pr_err("Cannot send metadata when src_pool is sync\n"); + return -EINVAL; + } + + buf = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + buflen = RMR_MD_SIZE; + if (!buf) + return -ENOMEM; + + rmr_clt_md_to_buf(src_pool, buf); + + /* + * It will continue to send the md to the next session even if the previous send failed. + */ + idx = srcu_read_lock(&clt_pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &clt_pool->sess_list, entry, + (srcu_read_lock_held(&clt_pool->sess_list_srcu))) { + pr_debug("Start sending md for pool %s; to session %s with member_id %d\n", + src_pool->poolname, pool_sess->sessname, pool_sess->member_id); + + rmr_clt_init_cmd(clt_pool, &msg); + msg.cmd_type = RMR_CMD_SEND_MD_BUF; + msg.send_md_buf_cmd = (struct rmr_msg_send_md_buf_cmd) { + .sync = clt_pool->sync, + /* the receiver of buffer is the leader */ + .receiver_id = pool_sess->member_id, + /* change flags in cmd message */ + .flags = RMR_OP_MD_WRITE, + }; + + err = rmr_clt_send_cmd_with_data(clt_pool, pool_sess, &msg, buf, buflen); + if (err) { + pr_debug("Cannot send the clt/srv_md of entire pool to the pool sess %s\n", + pool_sess->sessname); + continue; + } + } + + pr_debug("send_md done\n"); + + kfree(buf); + + srcu_read_unlock(&clt_pool->sess_list_srcu, idx); + return err; +} +EXPORT_SYMBOL(rmr_clt_pool_send_md_all); + +static int rmr_clt_start_send_md(struct rmr_pool *pool) +{ + return rmr_clt_pool_send_md_all(pool, pool); +} + +/** + * rmr_clt_del_stor_from_pool() - Notify pool members that a storage node is leaving + * + * @pool_sess: The pool session of the departing storage node. + * @delete: True for a permanent deletion (%RMR_POOL_INFO_MODE_DELETE); + * false for a temporary disassembly (%RMR_POOL_INFO_MODE_DISASSEMBLE). + * + * Sends a POOL_INFO REMOVE message to all other active pool members so they + * can update their dirty maps and membership state accordingly. + * + * Return: + * 0 on success, negative error code on failure. 
+ */ +int rmr_clt_del_stor_from_pool(struct rmr_clt_pool_sess *pool_sess, bool delete) +{ + enum rmr_pool_info_mode mode; + int err; + + if (delete) + mode = RMR_POOL_INFO_MODE_DELETE; + else + mode = RMR_POOL_INFO_MODE_DISASSEMBLE; + + err = rmr_clt_send_pool_info(pool_sess, RMR_POOL_INFO_OP_REMOVE, mode, false); + if (err) { + pr_err("rmr_clt_send_pool_info failed for session\n"); + return err; + } + + return 0; +} + +static int __init rmr_client_init(void) +{ + int err; + + pr_info("Loading module %s, version %s, proto %s\n", KBUILD_MODNAME, + RMR_VER_STRING, RMR_PROTO_VER_STRING); + + err = rmr_clt_create_sysfs_files(); + if (err) { + pr_err("Failed to load module," + " creating sysfs device files failed, err: %d\n", + err); + goto out; + } + + return 0; + +out: + return err; +} + +static void __exit rmr_client_exit(void) +{ + struct rmr_pool *pool, *tmp; + + pr_info("Unloading module\n"); + + list_for_each_entry_safe(pool, tmp, &pool_list, entry) + (void) rmr_clt_remove_pool_from_sysfs(pool, NULL); + + rmr_clt_destroy_sysfs_files(); + pr_info("Module unloaded\n"); +} + +module_init(rmr_client_init); +module_exit(rmr_client_exit); diff --git a/drivers/infiniband/ulp/rmr/rmr-clt.h b/drivers/infiniband/ulp/rmr/rmr-clt.h new file mode 100644 index 000000000000..c50651efe4a3 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt.h @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_CLT_H +#define RMR_CLT_H + +#include +#include "rmr-pool.h" + +#define RECONNECT_DELAY 30 +#define MAX_RECONNECTS -1 +#define RTRS_LINK_NAME "rtrs" + +#define RMR_MAP_CLEAN_DELAY_MS 5000 +#define RMR_RECOVER_INTERVAL_MS 3000 + +enum rmr_clt_sess_state { + RMR_CLT_SESS_DISCONNECTED = 1, + RMR_CLT_SESS_CONNECTED, +}; + +struct rmr_clt_sess { + char sessname[NAME_MAX]; + struct kobject kobj; + struct mutex lock; + struct rtrs_clt_sess *rtrs; + bool rtrs_ready; + /* server this session is connected to */ + int queue_depth; + u32 max_io_size; + u32 max_segments; + struct list_head pool_sess_list; + struct list_head g_list; + struct kref kref; + enum rmr_clt_sess_state state; +}; + +/* + * NB: If you change here, make sure the changes are in sync with + * pool_sess state machine routine i.e. pool_sess_change_state(). 
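+ *
+ * The state is stored in rmr_clt_pool_sess::state as an atomic_t and is
+ * expected to change only through pool_sess_change_state().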
+ */ +enum rmr_clt_pool_sess_state { + RMR_CLT_POOL_SESS_CREATED = 1, // No IO, No dirty map addition, Yes cmd msgs + RMR_CLT_POOL_SESS_NORMAL, // Yes IO, No dirty map addition, Yes cmd msgs + RMR_CLT_POOL_SESS_FAILED, // No IO, Yes dirty map addition, No cmd msgs + RMR_CLT_POOL_SESS_RECONNECTING, // No IO, Yes, dirty map addition, Yes cmd msgs + // But not with an updated map + + RMR_CLT_POOL_SESS_REMOVING // No IO, No dirty map addition, Yes cmd msgs + // Getting removed from pool +}; + +struct rmr_clt_pool_sess { + char sessname[NAME_MAX]; + struct rmr_pool *pool; + struct kobject kobj; + u8 member_id; /* refers to the pool id on the */ + struct kobject sess_kobj; + struct list_head entry; /* for pool->sess_list */ + struct list_head clt_sess_entry; /* for clt_sess->pool_sess_list */ + struct rmr_clt_sess *clt_sess; + atomic_t state; /* rmr_clt_pool_sess_state */ + u8 ver; /* protocol version */ + u8 pool_id; /* refers to the pool id on the */ + bool maintenance_mode; /* If the pool is in maintenance mode or not */ + bool was_last_authoritative; /* last NORMAL sess before it went FAILED; + * carries complete dirty maps for all members */ +}; + +struct rmr_clt_stats { + struct kobject kobj_stats; + atomic_t read_retries; +}; + +/* + * State descriptions: + * RMR_CLT_POOL_STATE_JOINED: An rmr_clt_pool which has one or more legs (rmr_clt_pool_sess) + * added to it. This means the pool has joined into pools from + * storage nodes + * + * RMR_CLT_POOL_STATE_IN_USE: An rmr_clt_pool which is in use by an upper layer client. This + * is usually done by calling rmr_clt_open + * + * Note: When adding a new state, + * remember to add an entry in the function rmr_get_clt_pool_state_name() + */ +enum rmr_clt_pool_state { + RMR_CLT_POOL_STATE_JOINED = 0, + RMR_CLT_POOL_STATE_IN_USE, + // RMR_CLT_POOL_STATE_DEGRADED, uncomment and use + // RMR_CLT_POOL_STATE_DIRTY, + RMR_CLT_POOL_STATE_MAX, +}; + +struct rmr_clt_pool { + struct rmr_pool *pool; + refcount_t refcount; + unsigned long state; + struct mutex clt_pool_lock; + + size_t queue_depth; + + struct rmr_clt_stats stats; + struct kobject stats_kobj; + + void *priv; /* provided by user */ + rmr_clt_ev_fn *link_ev; /* deliver events to user */ + + atomic_t io_freeze; + wait_queue_head_t map_update_wq; + struct mutex io_freeze_lock; + + struct workqueue_struct *recover_wq; + struct delayed_work recover_dwork; + + /* use sessions round robbin to read */ + struct rmr_clt_pool_sess __rcu *__percpu *pcpu_sess; +}; + +struct rmr_iu_comp { + wait_queue_head_t wait; + int errno; +}; + +/** + * rmr_iu - reserves resources needed to do an I/O op on pool + */ +struct rmr_iu { + struct rmr_pool *pool; + unsigned int mem_id; + struct list_head sess_list; /* list of per-session tags */ + u8 num_sessions; + refcount_t ref; /* lifetime refcount */ + struct rmr_msg_io msg; + int errno; + atomic_t succeeded; + refcount_t refcount; + rmr_conf_fn *conf; + void *priv; + /* for retry of failed reads */ + struct work_struct work; + struct scatterlist *sg; + unsigned int sg_cnt; +}; + +struct rmr_clt_sess_iu { + void *buf; /* for session messages */ + struct rtrs_permit *permit; + struct rmr_clt_pool_sess *pool_sess; + int errno; + union { + /* for session messages only */ + struct scatterlist sg; + /* for tag->sess_list of io messages*/ + struct list_head entry; + }; + + /* for session messages only */ + struct work_struct work; + + /* for io requests */ + struct rmr_iu *rmr_iu; + unsigned int mem_id; + + /* for command messages */ + struct rmr_clt_cmd_unit *rmr_cmd_unit; + 
+ /* for session messages only */ + struct rmr_iu_comp comp; + atomic_t refcount; +}; + +struct rmr_clt_iu_comp { + wait_queue_head_t wait; + int errno; +}; + +struct rmr_clt_cmd_unit { + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + + struct list_head sess_list; + int num_sessions; + + int failed_state; + int errno; + atomic_t succeeded; + refcount_t refcount; + + rmr_conf_fn *conf; + void *priv; +}; + +/* rmr-clt.c */ +struct rmr_pool *rmr_clt_create_pool(const char *name); +void rmr_put_clt_pool(struct rmr_clt_pool *clt_pool); + +void rmr_clt_change_pool_state(struct rmr_clt_pool *rmr_clt_pool, + enum rmr_clt_pool_state new_state, bool set); +int rmr_clt_remove_pool_from_sysfs(struct rmr_pool *pool, + const struct attribute *sysfs_self); +struct rmr_clt_sess *find_and_get_or_create_clt_sess(char *sessname, + struct rtrs_addr *paths, + size_t path_cnt); +struct rmr_clt_pool_sess *rmr_clt_add_pool_sess(struct rmr_pool *pool, + struct rmr_clt_sess *clt_sess, bool create); +void rmr_clt_sess_put(struct rmr_clt_sess *sess); +void rmr_clt_del_pool_sess(struct rmr_clt_pool_sess *sess); +void rmr_clt_destroy_pool_sess(struct rmr_clt_pool_sess *sess, bool delete); + +const char *rmr_clt_sess_state_str(enum rmr_clt_pool_sess_state state); +void resend_join_pool(struct rmr_clt_sess *sess); +int rmr_clt_reconnect_sess(struct rmr_clt_sess *sess, + const struct rtrs_addr *paths, + size_t path_cnt); +int rmr_clt_start_last_io_update(struct rmr_pool *pool); +int rmr_clt_set_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess); +int rmr_clt_enable_sess(struct rmr_clt_pool_sess *sess); + +int rmr_clt_send_map_update(struct rmr_pool *pool, struct rmr_iu *iu); + +int rmr_clt_pool_send_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg); +int rmr_clt_send_cmd_with_data(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen); +int rmr_clt_map_add_id(struct rmr_pool *pool, int stg_id, rmr_id_t id); +void rmr_clt_init_cmd(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg); +int rmr_clt_pool_send_cmd(struct rmr_clt_pool_sess *sess, struct rmr_msg_pool_cmd *msg, bool wait); +int rmr_clt_del_stor_from_pool(struct rmr_clt_pool_sess *pool_sess, bool delete); +bool rmr_clt_sess_is_sync(struct rmr_clt_pool_sess *sess); +int send_msg_leave_pool(struct rmr_clt_pool_sess *pool_sess, bool delete, bool wait); +void rmr_clt_free_pool_sess(struct rmr_clt_pool_sess *pool_sess); +int rmr_clt_send_map(struct rmr_pool *map_src_pool, struct rmr_pool *clt_pool, + const struct rmr_msg_map_send_cmd *map_send_cmd, rmr_map_filter filter); +int rmr_clt_test_map(struct rmr_pool *src_pool, struct rmr_pool *dst_pool); +int rmr_clt_send_cmd_with_data_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen); +int rmr_clt_pool_send_md_all(struct rmr_pool *src_pool, struct rmr_pool *clt_pool); +int rmr_clt_pool_send_cmd_all(struct rmr_pool *pool, enum rmr_msg_cmd_type cmd_type); +void recover_work(struct work_struct *work); + +int rmr_clt_pool_member_synced(struct rmr_pool *pool, u8 member_id); + +bool pool_sess_change_state(struct rmr_clt_pool_sess *pool_sess, + enum rmr_clt_pool_sess_state newstate); + +void rmr_clt_pool_io_freeze(struct rmr_clt_pool *clt_pool); +void rmr_clt_pool_io_unfreeze(struct rmr_clt_pool *clt_pool); +void rmr_clt_pool_io_wait_complete(struct rmr_clt_pool *clt_pool); +int rmr_clt_pool_try_enable(struct rmr_pool *pool); +int send_msg_enable_pool(struct rmr_clt_pool_sess *pool_sess, bool enable); + 
+void rmr_get_iu(struct rmr_iu *iu); +void rmr_put_iu(struct rmr_iu *iu); +void rmr_msg_put_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu); +void wake_up_iu_comp(struct rmr_clt_sess_iu *sess_iu); +void msg_conf(void *priv, int errno); + +/* rmr-map-mgmt.c */ +void send_map_check(struct rmr_clt_pool_sess *pool_sess); +void send_store_check(struct rmr_clt_pool_sess *pool_sess); +int send_map_get_version(struct rmr_clt_pool_sess *pool_sess, u64 *ver); +int send_discard(struct rmr_clt_pool_sess *pool_sess, u8 cmd_type, u8 member_id); +int rmr_clt_handle_map_check_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp); +int rmr_clt_handle_store_check_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp); +int rmr_clt_read_map(struct rmr_pool *pool); +int rmr_clt_spread_map(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess_chosen, + bool enable, bool skip_normal); +int rmr_clt_unset_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess); +void sched_map_add(struct work_struct *work); +void msg_pool_cmd_map_content_conf(struct work_struct *work); + +/* rmr-clt-sysfs.c */ +int rmr_clt_create_sysfs_files(void); +void rmr_clt_destroy_sysfs_files(void); +void rmr_clt_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self); +int rmr_clt_create_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess); +void rmr_clt_destroy_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess); + +int rmr_clt_reset_read_retries(struct rmr_clt_stats *stats, bool enable); +ssize_t rmr_clt_stats_read_retries_to_str(struct rmr_clt_stats *stats, char *page); + +#endif /* RMR_CLT_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-map-mgmt.c b/drivers/infiniband/ulp/rmr/rmr-map-mgmt.c new file mode 100644 index 000000000000..cade5dbf2e20 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-map-mgmt.c @@ -0,0 +1,933 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) — client MAP-exchange management + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include + +#include "rmr-clt.h" +#include "rmr-clt-trace.h" + +void send_map_check(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MAP_CHECK; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + return; + } +} + +void send_store_check(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_STORE_CHECK; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); //am : why wait ? 
+ if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + pr_err("sess %s failed to send store check with err %d\n", + pool_sess->sessname, err); + } +} + +int send_map_get_version(struct rmr_clt_pool_sess *pool_sess, u64 *ver) +{ + struct rmr_msg_pool_cmd_rsp rsp = {}; + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MAP_GET_VER; + + err = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, &rsp, sizeof(rsp)); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + return -EINVAL; + } + + *ver = rsp.value; + + return 0; +} + +int send_discard(struct rmr_clt_pool_sess *pool_sess, u8 cmd_type, u8 member_id) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = cmd_type; + msg.send_discard_cmd.member_id = member_id; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + } + + return err; +} + +int rmr_clt_handle_map_check_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_dirty_id_map *map; + + pr_debug("pool %s sess %s member_id %u, rsp->value=%llu\n", + pool->poolname, pool_sess->sessname, rsp->member_id, rsp->value); + if (!rsp->value) // map is not empty on stg + return 0; + + pr_debug("pool %s server with id %u has empty dirty map, lets clean it.\n", + pool->poolname, rsp->member_id); + map = rmr_pool_find_map(pool, rsp->member_id); + if (!map) { + pr_err("%s: pool %s no map found for member_id %u\n", + __func__, pool->poolname, rsp->member_id); + return -EINVAL; + //TODO: handle this, how? 
+ } + + if (!rmr_map_empty(map)) { + pr_debug("pool %s dirty map for member_id %d is not empty, map->ts %lu (now %lu)\n", + pool->poolname, rsp->member_id, map->ts, jiffies); + if (time_after(jiffies, map->ts + msecs_to_jiffies(RMR_MAP_CLEAN_DELAY_MS))) { + pr_info("%s: pool %s clear dirty map for member_id %d\n", + __func__, pool->poolname, rsp->member_id); + rmr_map_unset_dirty_all(map); + map->ts = jiffies; + } + } + + pr_debug("pool %s map with member_id %u cleaned\n", + pool->poolname, map->member_id); + return 0; +} + +int rmr_clt_handle_store_check_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp) +{ + struct rmr_pool *pool = pool_sess->pool; + int err = 0; + + pr_debug("pool %s sess %s member_id %u, rsp->value=%llu\n", + pool->poolname, pool_sess->sessname, rsp->member_id, rsp->value); + if (!rsp->value) { + pr_debug("pool %s sess %s (state=%d) reported that store is not available, changing state\n", + pool->poolname, pool_sess->sessname, atomic_read(&pool_sess->state)); + return 0; + } + pr_info("pool %s sess %s (state=%d) reported that store is available, changing state\n", + pool->poolname, pool_sess->sessname, atomic_read(&pool_sess->state)); + + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING); + + if (!pool_sess->maintenance_mode) { + err = rmr_clt_pool_try_enable(pool); + if (err) { + pr_err("%s: pool %s try_enable failed for sess %s: %d\n", + __func__, pool->poolname, pool_sess->sessname, err); + return err; + } + } + + return 0; +} + +/* + * Pre-requisite: rcu read lock should be held by caller + */ +static struct rmr_clt_pool_sess *rmr_clt_get_first_reconnecting_session(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess; + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_RECONNECTING) + return pool_sess; + } + + return NULL; +} + +/** + * rmr_clt_pool_map_xfer() - transfer dirty maps between rmr client and server + * + * @pool: the rmr pool used for map transfers + * @pool_sess: client pool session that is used for map transfer + * @cmd_type: pool command type generated for this transfer, for now only + * RMR_CMD_READ_MAP_BUF, RMR_CMD_SEND_MAP_BUF, RMR_CMD_MAP_TEST are used + * @buf: pointer to the data buffer for data transfers + * @buflen: size of the buffer in bytes + * @map_idx: index of the map in dirty map array from which we start to send or receive + * the data + * @offset: key in the map from which we start to send/receive the data about the maps + * + * Description: + * Performs transfer of the information about the dirty maps starting from the map with + * position map_idx in the array of dirty maps and from the start_key at that map. + * cmd types are handled as follows: + * RMR_CMD_READ_MAP_BUF - read the information about the maps from the pool and fill buf + * RMR_CMD_SEND_MAP_BUF - send buf with filled data to the pull + * RMR_CMD_MAP_TEST - send the buf with data to the pool to perform map comparison + * + * Return: + * 0 on success, error code otherwise. + * + * Context: + * This function blocks while sending the buffer. 
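+ *
+ * Example (illustrative): rmr_clt_read_map() below drives this in a loop with
+ * RMR_CMD_READ_MAP_BUF, feeding the (map_idx, slp_idx) cursor returned in the
+ * reply's rmr_map_buf_hdr back into the next call, until the server reports
+ * member_id == 0, i.e. there is no further map data to fetch.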
+ * + * Locks: + * should be called under srcu_read_lock since it uses pool_sess + */ +static int rmr_clt_pool_map_xfer(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess, + int cmd_type, void *buf, unsigned int buflen, + u8 map_idx, u64 slp_idx) +{ + struct rmr_msg_pool_cmd msg = {}; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = cmd_type; + + msg.map_buf_cmd.map_idx = map_idx; + msg.map_buf_cmd.slp_idx = slp_idx; + + err = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, buf, buflen); + if (err) { + pr_debug("pool %s failed to send map xfer cmd %u, err %d\n", + pool->poolname, cmd_type, err); + return err; + } + + return 0; +} + +int rmr_clt_read_map(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess = NULL; + struct rmr_map_buf_hdr *map_buf_hdr; + u8 map_idx = 0; + u64 slp_idx = 0; + int err = 0, buflen, idx; + void *buf; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + pool_sess = rmr_clt_get_first_reconnecting_session(pool); + if (pool_sess == NULL) { + srcu_read_unlock(&pool->sess_list_srcu, idx); + pr_err("%s: No created session found\n", __func__); + return -EINVAL; + } + + buflen = RTRS_IO_LIMIT; + buf = kzalloc(buflen, GFP_KERNEL); + if (!buf) { + pr_err("%s: Error allocating buffer\n", __func__); + err = -ENOMEM; + goto ret; + } + + while (true) { + err = rmr_clt_pool_map_xfer(pool, pool_sess, RMR_CMD_READ_MAP_BUF, + buf, buflen, map_idx, slp_idx); + if (err) { + pr_debug("rmr_clt_pool_map_xfer failed for pool %s, err %d\n", + pool->poolname, err); + goto ret_free; + } + + map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + if (map_buf_hdr->member_id == 0) + break; + + err = rmr_pool_save_map(pool, buf, buflen, false); + if (err) { + pr_err("rmr_pool_save_map failed\n"); + goto ret_free; + } + + map_idx = map_buf_hdr->map_idx; + slp_idx = map_buf_hdr->slp_idx; + } + +ret_free: + kfree(buf); + +ret: + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return err; +} + +/** + * rmr_clt_spread_map() - Spread the map contained in storage node connected by pool_sess_chosen + * + * @pool: The pool + * @pool_sess_chosen: pool session from where the map is to be updated from + * @enable: Whether the last MAP_DONE command should have the enable param set or not + * @skip_normal: If true, freeze IOs before spreading and silently skip any NORMAL + * sessions encountered in the loop (used in Case 1 recovery where + * pool_sess_chosen is itself a NORMAL session that is still serving IOs). + * If false, encountering a NORMAL session is treated as an error. + * + * Description: + * This function spreads the map contained in the storage node connected by given pool + * session. The param enable denotes whether the map update should result in the storage + * going to NORMAL state or not. This is controlled by the enable param in the last MAP_DONE + * message. + * + * Return: + * 0 on success + * Error value on failure + * + * Context: + * srcu_read_lock should be held while calling this function. + */ +int rmr_clt_spread_map(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess_chosen, + bool enable, bool skip_normal) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + int state, err = 0; + + rmr_clt_init_cmd(pool, &msg); + + /* + * If we expect NORMAL session, then we should expect IOs running. + * Which is why we should freeze IOs before doing map_update. 
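+	 *
+	 * After the optional freeze, each eligible RECONNECTING session goes
+	 * through the three-step handshake below: RMR_CMD_MAP_READY to the
+	 * receiver, RMR_CMD_MAP_SEND to pool_sess_chosen (naming the receiver's
+	 * member_id), and RMR_CMD_MAP_DONE carrying the enable flag; any failure
+	 * jumps to the error path, which broadcasts RMR_CMD_MAP_DISABLE.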
+ */ + if (skip_normal) { + /* Freeze IOs */ + rmr_clt_pool_io_freeze(clt_pool); + + /* Wait for all completion */ + rmr_clt_pool_io_wait_complete(clt_pool); + } + + /* + * TODO: Use rmr_clt_handle_discard to check whether the pool + * session has pending discard request to be sent. + * + * Enable this when we fix replace. + * + err = rmr_clt_handle_discard(pool); + if (err) { + pr_err("%s: discard handling failed\n", __func__); + goto err; + } + */ + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (pool_sess == pool_sess_chosen) + continue; + + state = atomic_read(&pool_sess->state); + if (state == RMR_CLT_POOL_SESS_NORMAL) { + if (skip_normal) + continue; + pr_err("%s: pool %s unexpected NORMAL session %s during spread\n", + __func__, pool->poolname, pool_sess->sessname); + err = -EINVAL; + goto err_out; + } + + if (state != RMR_CLT_POOL_SESS_RECONNECTING || + pool_sess->maintenance_mode) + continue; + + msg.cmd_type = RMR_CMD_MAP_READY; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto err_dis; + } + + msg.cmd_type = RMR_CMD_MAP_SEND; + msg.map_send_cmd.receiver_member_id = pool_sess->member_id; + err = rmr_clt_pool_send_cmd(pool_sess_chosen, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto err_dis; + } + + msg.cmd_type = RMR_CMD_MAP_DONE; + msg.map_done_cmd.enable = enable; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto err_dis; + } + } + + /* Unfreeze IOs and wake up */ + if (skip_normal) + rmr_clt_pool_io_unfreeze(clt_pool); + + return 0; + +err_dis: + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (pool_sess == pool_sess_chosen) + continue; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL) { + if (skip_normal) + continue; + pr_err("%s: pool %s unexpected NORMAL session %s during spread\n", + __func__, pool->poolname, pool_sess->sessname); + } + + msg.cmd_type = RMR_CMD_MAP_DISABLE; + rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + } + +err_out: + /* Unfreeze IOs and wake up */ + if (skip_normal) + rmr_clt_pool_io_unfreeze(clt_pool); + + return err; +} + +/** + * rmr_clt_set_pool_sess_mm() - Set the rmr clt pool session to maintenance mode + * + * @pool_sess: The rmr clt pool session to set in maintenance mode + * + * Description: + * This function does the necessary work required, like setting the pool session to + * maintenance mode and updating the state. + * It then also communicates this state change to the corresponding storage node. + * + * Return: + * 0 on success + * Error value on failure + */ +int rmr_clt_set_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_pool *pool = pool_sess->pool; + int err; + + pr_info("%s: Putting sess %s of pool %s in maintenance mode\n", + __func__, pool_sess->sessname, pool->poolname); + + if (pool_sess->maintenance_mode) + goto send_message; + + /* + * If the pool_sess is to be put in maintenance mode, + * update relevant states and params, Then send message to storage node. + * + * We do not need any kind of locking for this, because of the way IO units (IU) are + * allocated & sent. The mm mode update & the state change can happen at multiple places. 
+ * + * 1) If the state changes before the pool_sess is picked up into the IU, then we are safe + * 2) If the state changes after the pool_sess is picked up into the IU, but before, + * rmr_clt_request, it will be failed in rmr_clt_request. + * 3) If the state changes after rmr_clt_request, the IO would be sent to the storage node + * for that pool_sess. Then we have 2 cases, + * a) The message for maintenance_mode is received by the storage node before the IO, + * then the storage node will fail the IO. Failure would then be handled by the client. + * b) The message for maintenance_mode is received by the storage node after the IO, + * then the storage node will process the IO, and return success to client. In this case + * also we are fine, since the IO got processes successfully. + */ + pool->map_ver++; + pool_sess->maintenance_mode = true; + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING); + +send_message: + err = send_msg_enable_pool(pool_sess, 0); + if (err) { + pr_err("%s: send_msg_enable_pool failed for pool %s. Err %d\n", + __func__, pool->poolname, err); + } + + return err; +} + +/** + * rmr_clt_unset_pool_sess_mm() - Clear the rmr clt pool sessions maintenance mode + * + * @pool_sess: The rmr clt pool session to clear maintenance mode of + * + * Description: + * This function clears the maintenance mode of the given rmr clt pool session. + * It also does the map_update which essentially brings the pool_session and its + * corresponding storage node to NORMAL state. + * + * Return: + * 0 on success + * Error value on failure + */ +int rmr_clt_unset_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_pool *pool = pool_sess->pool; + int err; + + pr_info("%s: Putting to sess %s of pool %s out of maintenance mode\n", + __func__, pool_sess->sessname, pool->poolname); + + /* + * Cannot be in NORMAL and CREATED states while in maintenance mode. + */ + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL); + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED); + + /* + * If this pool_sess is getting removed, we fail unset maintenance mode + */ + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_REMOVING) + return -EINVAL; + + /* + * First unset mm of storage node + */ + err = send_msg_enable_pool(pool_sess, 1); + if (err) { + pr_err("Failed to send enable to pool %s. Err %d\n", + pool->poolname, err); + return -EINVAL; + } + + /* Now do this */ + pool_sess->maintenance_mode = false; + + /* + * For FAILED states, further action would happen when it goes to RECONNECTING state + */ + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED) + return 0; + + /* + * Since we are in RECONNECTING state, we do map update here. 
+ */ + err = rmr_clt_pool_try_enable(pool); + if (err) { + pr_err("%s: pool %s try_enable failed for sess %s: %d\n", + __func__, pool->poolname, pool_sess->sessname, err); + return err; + } + + return 0; +} + +void msg_pool_cmd_map_content_conf(struct work_struct *work) +{ + struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work); + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + + pr_debug("%s: session %s conf with errno %d\n", + __func__, pool_sess->sessname, sess_iu->errno); + + wake_up_iu_comp(sess_iu); + rmr_msg_put_iu(pool_sess, sess_iu); +} + +static void send_map_update_done(struct work_struct *work) +{ + struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work); + struct rmr_iu *iu = sess_iu->rmr_iu; + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + int errno = sess_iu->errno; + + pr_debug("%s: Session %s, err %d, iu %p\n", + __func__, pool_sess->sessname, errno, iu); + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED); + + /* + * We leave "iu->errno" set from the IO failure. + * Even though one map_add succeeds, we clear `iu->errno` + * and the main IO succeeds. And all other map_adds + * simply trigger session state change to FAILURE. + */ + if (!errno) { + iu->errno = 0; + } else { + pr_err_ratelimited("%s: for sess %s got errno: %d\n", + __func__, pool_sess->sessname, errno); + + if (iu->errno) + /* only the last error is reported */ + iu->errno = errno; + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED); + } + + pr_debug("%s: Before dec and test iu %p refcnt=%d\n", + __func__, iu, refcount_read(&iu->refcount)); + + if (refcount_dec_and_test(&iu->refcount)) { + rmr_conf_fn *conf = iu->conf; + + pr_debug("all maps updated, call conf %p withh errno %d\n", + conf, errno); + (*conf)(iu->priv, iu->errno); + } +} + +/** + * rmr_clt_send_map_update() - Send map update to all connected storage nodes + * + * @pool: The client pool of whose sessions the update is to be sent + * @iu: The IO unit containing the information for the update + * + * Description: + * Send map update, using the underlying RTRS <-> RDMA + * Currently we use the same rmr_iu as IO, since it saves us time. + * When an IO fails, and a MAP_ADD is to be sent, the code reuses the + * same rmr_iu used for IO. This way we do not spend time acquiring + * and initializing another rmr_iu. + * + * A map update currently can either be a MAP_ADD or a MAP_CLEAR. + * The caller must make sure the basic and required information for both + * the above commands is updated in the rmr_iu. + * Basic being the pool group_id, msg hdr type, etc. + * Required being the following, + * MAP_ADD requires the rmr_id_t chunk numbers, failed_id array and failed_cnt + * MAP_CLEAR requires the rmr_id_t and the member_id + * + * Return: + * 0 on success. This means the map_update was sent successfully. + * The subsequent status (err or not) goes to iu->conf call, + * so the caller should check that too. + * + * Error value on failure. When this function returns error, + * be aware that the iu->conf will not be called. + */ +int rmr_clt_send_map_update(struct rmr_pool *pool, struct rmr_iu *iu) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + struct rtrs_clt_req_ops req_ops; + struct kvec vec; + int err; + + pr_debug("%s: rmr_id (%llu, %llu), msg %d, refcnt=%d\n", __func__, + iu->msg.id_a, iu->msg.id_b, iu->msg.hdr.type, refcount_read(&iu->refcount)); + + if (!pool) { + pr_err("Cannot send map update. 
pool is NULL\n"); + return -EINVAL; + } + + rmr_get_iu(iu); + + vec = (struct kvec){ + .iov_base = &iu->msg, + .iov_len = sizeof(iu->msg) + }; + + list_for_each_entry_safe(sess_iu, tmp_sess_iu, &(iu->sess_list), entry) { + struct rmr_clt_sess *clt_sess; + enum rmr_clt_pool_sess_state state; + + pool_sess = sess_iu->pool_sess; + clt_sess = pool_sess->clt_sess; + + INIT_WORK(&sess_iu->work, send_map_update_done); + + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_conf, + }; + + state = atomic_read(&pool_sess->state); + if (state == RMR_CLT_POOL_SESS_FAILED || + state == RMR_CLT_POOL_SESS_REMOVING) { + /* + * Sessions in failed state is probably the reason why we sending + * map add in the first place. + * We can skip those sessions, since map update will take care of this. + */ + pr_debug("%s: skipped sess %s\n", __func__, sess_iu->pool_sess->sessname); + sess_iu->errno = -EINVAL; + schedule_work(&sess_iu->work); + continue; + } + + pr_debug("Sending request flags %u to pool %s session %s " + "chunk [%llu, %llu] offset %u length %u)\n", + iu->msg.flags, pool->poolname, pool_sess->sessname, + iu->msg.id_a, iu->msg.id_b, + iu->msg.offset, iu->msg.length); + + trace_send_map_update(WRITE, sess_iu); + + err = rtrs_clt_request(WRITE, &req_ops, clt_sess->rtrs, + sess_iu->permit, &vec, 1, 0, NULL, 0); + + /* we can ignore errno since we called rmr_clt_send_map_update with NO_WAIT */ + if (err) { + sess_iu->errno = err; + + pr_err("%s: Failed with err %d, schedule work\n", + __func__, err); + schedule_work(&sess_iu->work); + } + } + rmr_put_iu(iu); + + /* + * We are handling err through iu->conf + */ + return 0; +} +EXPORT_SYMBOL(rmr_clt_send_map_update); + +int rmr_clt_map_add_id(struct rmr_pool *pool, int stg_id, rmr_id_t id) +{ + struct rmr_dirty_id_map *map; + + map = rmr_pool_find_map(pool, stg_id); + if (!map) { + pr_err("in pool %s cannot find map for member_id %u\n", + pool->poolname, stg_id); + return -EINVAL; + } + + map->ts = jiffies; + rmr_map_set_dirty(map, id, 0); + + pr_debug("pool %s id (%llu, %llu) inserted to the dirty map\n", + pool->poolname, id.a, id.b); + + return 0; +} + +void sched_map_add(struct work_struct *work) +{ + struct rmr_iu *iu = container_of(work, struct rmr_iu, work); + struct rmr_pool *pool = iu->pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu; + rmr_conf_fn *clt_conf = iu->conf; + void *clt_priv = iu->priv; + int failed_cnt = 0, err = 0; + rmr_id_t id; + + pr_debug("scheduled work process for rmr iu %p send map add id (%llu, %llu), poolname %s\n", + iu, iu->msg.id_a, iu->msg.id_b, pool->poolname); + + /* + * For MAP_ADD, we need failed_id, failed_cnt, and rmr_id_t for chunk number. + * + * We reuse the iu which was used for this IO. + * It already has the chunk number, the clt_conf function to be called, + * and other important things. 
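+	 *
+	 * Concretely, the loop below records the member_id of every session
+	 * whose sess_iu->errno is set into msg.failed_id[] (bumping failed_cnt),
+	 * marks the affected chunks dirty locally via rmr_clt_map_add_id(), and
+	 * then hands the reused iu to rmr_clt_send_map_update().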
+ */ + iu->msg.hdr.type = cpu_to_le16(RMR_MSG_MAP_ADD); + + id.a = le64_to_cpu(iu->msg.id_a); + id.b = le64_to_cpu(iu->msg.id_b); + list_for_each_entry(sess_iu, &(iu->sess_list), entry) { + pool_sess = sess_iu->pool_sess; + + if (sess_iu->errno) { + iu->msg.map_ver = cpu_to_le64(pool->map_ver); + iu->msg.failed_id[failed_cnt] = pool_sess->member_id; + failed_cnt++; + + rmr_clt_map_add_id(pool, pool_sess->member_id, id); + } + } + iu->msg.failed_cnt = failed_cnt; + + err = rmr_clt_send_map_update(pool, iu); + if (err) { + pr_err("error sending map add for id (%llu, %llu), err=%d\n", + iu->msg.id_a, iu->msg.id_b, err); + (*clt_conf)(clt_priv, err); + } +} + +/** + * rmr_clt_send_map() - Send dirty map entries + * + * @map_src_pool: Pool whose map is to be sent + * @clt_pool: Client pool through which the dest session is selected + * @map_send_cmd: Command structure containing the member_id of the target session + * where the map is to be sent. If NULL then send to all of the session + * + * Return: + * 0 on success, err code otherwise. + * + * Description: + * Sends all the dirty entries from the map in "map_src_pool" to the session with + * member_id equal to member_id mentioned in the map_send_cmd. + * The session where to send the map is picked from the clt_pool. If + * map_send_cmd is NULL then send cmd to all of the sessions in clt_pool. + * + * Context: + * This function blocks while sending the map. + */ +int rmr_clt_send_map(struct rmr_pool *map_src_pool, struct rmr_pool *clt_pool, + const struct rmr_msg_map_send_cmd *map_send_cmd, rmr_map_filter filter) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + bool sess_found = false; + void *bitmap_buf; + int err = 0, idx; + + if (!clt_pool) { + pr_err("Cannot send map, when clt_pool is NULL\n"); + return -EINVAL; + } + + bitmap_buf = kzalloc(RTRS_IO_LIMIT, GFP_KERNEL); + if (!bitmap_buf) { + pr_err("%s: pool %s error allocating buffer to send map\n", + __func__, map_src_pool->poolname); + return -ENOMEM; + } + + idx = srcu_read_lock(&clt_pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &clt_pool->sess_list, entry, + (srcu_read_lock_held(&clt_pool->sess_list_srcu))) { + int bytes = 0; + u8 map_idx = 0; + u64 slp_idx = 0; + + /* if we have a command then skip all the sessions that are not in command */ + if (map_send_cmd && pool_sess->member_id != map_send_cmd->receiver_member_id) + continue; + + sess_found = true; + pr_info("Start sending dirty map for pool %s; to session %s with member_id %d\n", + map_src_pool->poolname, pool_sess->sessname, pool_sess->member_id); + + while ((bytes = rmr_pool_maps_to_buf(map_src_pool, &map_idx, &slp_idx, + bitmap_buf, RTRS_IO_LIMIT, filter)) > 0) { + pr_debug("mapped %d bytes to bitmap_buf\n", bytes); + + err = rmr_clt_pool_map_xfer(clt_pool, pool_sess, RMR_CMD_SEND_MAP_BUF, + bitmap_buf, bytes, 0, 0); + if (err) { + pr_err("%s: Failed to send bitmap_buf, from %s to %s err %d\n", + __func__, map_src_pool->poolname, clt_pool->poolname, err); + goto err_free; + } + } + + rmr_clt_init_cmd(map_src_pool, &msg); + msg.cmd_type = RMR_CMD_MAP_BUF_DONE; + msg.map_buf_done_cmd.map_version = map_src_pool->map_ver; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For pool %s, %s failed\n", + __func__, map_src_pool->poolname, rmr_get_cmd_name(msg.cmd_type)); + goto err_free; + } + } + + if (map_send_cmd && !sess_found) { + pr_err("pool %s failed to find sess with member_id %u to send map\n", + clt_pool->poolname, 
map_send_cmd->receiver_member_id); + err = -EINVAL; + goto err_free; + } + + pr_info("%s: Sending map done\n", __func__); + +err_free: + kfree(bitmap_buf); + srcu_read_unlock(&clt_pool->sess_list_srcu, idx); + + return err; +} +EXPORT_SYMBOL(rmr_clt_send_map); + +int rmr_clt_test_map(struct rmr_pool *src_pool, struct rmr_pool *dst_pool) +{ + struct rmr_clt_pool_sess *pool_sess; + void *bitmap_buf; + int err, idx; + + pr_info("test maps from src_pool=%s to dst_pool=%s...\n", + src_pool->poolname, dst_pool->poolname); + + bitmap_buf = kzalloc(RTRS_IO_LIMIT, GFP_KERNEL); + if (!bitmap_buf) { + pr_err("%s: Error allocating buffer\n", __func__); + err = -ENOMEM; + goto err; + } + + idx = srcu_read_lock(&dst_pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &dst_pool->sess_list, entry, + (srcu_read_lock_held(&dst_pool->sess_list_srcu))) { + enum rmr_clt_pool_sess_state state; + int bytes = 0; + u8 map_idx = 0; + u64 slp_idx = 0; + + state = atomic_read(&pool_sess->state); + if (state == RMR_CLT_POOL_SESS_CREATED || + state == RMR_CLT_POOL_SESS_FAILED) { + pr_warn("sess %s is in created/failed state, skip map test.\n", + pool_sess->sessname); + continue; + } + pr_info("perform map test for sess %s\n", pool_sess->sessname); + while ((bytes = rmr_pool_maps_to_buf(src_pool, &map_idx, &slp_idx, + bitmap_buf, RTRS_IO_LIMIT, + MAP_NO_FILTER)) > 0) { + pr_debug("mapped %d bytes to bitmap_buf\n", bytes); + + err = rmr_clt_pool_map_xfer(dst_pool, pool_sess, RMR_CMD_MAP_TEST, + bitmap_buf, bytes, 0, 0); + if (err) { + pr_err("%s: For sess %s failed test map, src_pool %s dst_pool %s err %d\n", + __func__, pool_sess->sessname, src_pool->poolname, + dst_pool->poolname, err); + srcu_read_unlock(&dst_pool->sess_list_srcu, idx); + goto err_free; + } + } + pr_info("sess %s map test done\n", pool_sess->sessname); + } + srcu_read_unlock(&dst_pool->sess_list_srcu, idx); + +err_free: + kfree(bitmap_buf); +err: + pr_info("test maps from src_pool=%s to dst_pool=%s done, err %d\n", + src_pool->poolname, dst_pool->poolname, err); + + return err; +} +EXPORT_SYMBOL(rmr_clt_test_map); diff --git a/drivers/infiniband/ulp/rmr/rmr-map.c b/drivers/infiniband/ulp/rmr/rmr-map.c new file mode 100644 index 000000000000..f4b7dd7c3b50 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-map.c @@ -0,0 +1,904 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#include + +#include "rmr-map.h" +#include "rmr-pool.h" + +void rmr_map_update_page_params(struct rmr_dirty_id_map *map) +{ + unsigned long remaining_chunks; + + map->no_of_flp = (map->no_of_chunks >> CHUNKS_PER_FLP_LOG2); + + /* + * If the number of chunks are not completely filling an FLP (CHUNKS_PER_FLP), + * then the remaining would be tracked by the next FLP. Thus the next FLP would + * have unused SLP pointers. We will calculate the number of SLP slots which will + * be used in the last FLP. + */ + remaining_chunks = map->no_of_chunks & (CHUNKS_PER_FLP - 1); + if (!remaining_chunks) { + /* + * If there are no remaining chunks, then the last FLP is completely full. + */ + map->no_of_slp_in_last_flp = NO_OF_SLP_PER_FLP; + map->no_of_chunk_in_last_slp = NO_OF_CHUNKS_PER_PAGE; + } else { + /* + * If there are remaining chunks, then we add another FLP for it. + * This FLP will not be full, hence we calculate the number of SLP slots + * that will be used. + */ + map->no_of_flp += 1; + map->no_of_slp_in_last_flp = (remaining_chunks >> CHUNKS_PER_SLP_LOG2); + + /* + * Same as above. 
It could be that the number of chunks do not fit neatly + * in the last SLP (CHUNKS_PER_SLP), and the remaining ones end up in the + * SLP with remaining chunk slots. + */ + remaining_chunks &= (CHUNKS_PER_SLP - 1); + if (!remaining_chunks) { + /* + * If there are no remaining chunks, then the last SLP is completely full. + */ + map->no_of_chunk_in_last_slp = CHUNKS_PER_SLP; + } else { + /* + * If there are remaining chunks, then we add another SLP. + */ + map->no_of_slp_in_last_flp += 1; + map->no_of_chunk_in_last_slp = remaining_chunks; + } + } + + map->total_slp = ((map->no_of_flp - 1) * NO_OF_SLP_PER_FLP) + map->no_of_slp_in_last_flp; +} + +static void rmr_map_update_map_params(struct rmr_pool *pool, struct rmr_dirty_id_map *map) +{ + map->no_of_chunks = pool->no_of_chunks; + + rmr_map_update_page_params(map); + + pr_info("%s: Chunks info %u, %u, %u, %llu\n", + __func__, pool->chunk_size, ilog2(pool->chunk_size), + pool->chunk_size_shift, map->no_of_chunks); + pr_info("%s: FLPs %llu, SLPs in last FLP %llu, Total SLPs %llu, chunks in last SLP %llu\n", + __func__, map->no_of_flp, map->no_of_slp_in_last_flp, map->total_slp, + map->no_of_chunk_in_last_slp); + pr_info("%s: Dirty map size %lldB\n", __func__, (map->total_slp * PAGE_SIZE)); +} + +static int rmr_map_allocate_pages(struct rmr_pool *pool, struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + u64 no_of_slps; + int i, j; + + for (i = 0; i < map->no_of_flp;) { + map->dirty_bitmap[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!map->dirty_bitmap[i]) + goto err_alloc; + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + /* + * Move the increment to here, so that later in err_alloc: if we have to free, + * the index i, is pointing in the correct position. + */ + i++; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + *flp_ptr = get_zeroed_page(GFP_KERNEL); + if (!*flp_ptr) + goto err_alloc; + } + } + + // TODO remove this + map->bitmap_filter = kcalloc(pool->no_of_chunks, sizeof(*map->bitmap_filter), GFP_KERNEL); + if (!map->bitmap_filter) + goto err_alloc; + + return 0; + +err_alloc: + for (--i; i >= 0; i--) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + for (--j; j >= 0; j--) + free_page((unsigned long)*(flp_ptr + j)); + + j = NO_OF_SLP_PER_FLP; + free_page((unsigned long)map->dirty_bitmap[i]); + } + + return -ENOMEM; +} + +struct rmr_dirty_id_map *rmr_map_create(struct rmr_pool *pool, u8 member_id) +{ + struct rmr_dirty_id_map *map = NULL; + int ret; + + pr_info("%s: Creating map for member_id %u, in pool %s. 
Existing map_cnt %u\n", + __func__, member_id, pool->poolname, pool->maps_cnt); + + if (!pool->no_of_chunks) { + pr_err("%s: dirty map size cannot be zero\n", __func__); + return ERR_PTR(-EINVAL); + } + + mutex_lock(&pool->maps_lock); + + /* + * Don't create if already exists + */ + map = rmr_pool_find_map(pool, member_id); + if (map != NULL) { + pr_err("Map with member_id %u already exists\n", member_id); + ret = -EEXIST; + goto err_unlock; + } + + if (pool->maps_cnt >= RMR_POOL_MAX_SESS) { + pr_err("pool %s can not create new map, max number of sessions %d achieved\n", + pool->poolname, RMR_POOL_MAX_SESS); + ret = -EINVAL; + goto err_unlock; + } + + /* + * Allocate memory and init the structure + */ + map = (struct rmr_dirty_id_map *)get_zeroed_page(GFP_KERNEL); + if (!map) { + pr_err("cannot allocate map for member_id %u\n", member_id); + ret = -ENOMEM; + goto err_unlock; + } + rmr_map_update_map_params(pool, map); + + ret = rmr_map_allocate_pages(pool, map); + if (ret) { + pr_err("cannot allocate memory for member_id %u\n", member_id); + goto err_map; + } + + xa_init_flags(&map->rmr_id_map, XA_FLAGS_ALLOC); + map->member_id = member_id; + map->ts = jiffies; + + rmr_pool_maps_append(pool, map); + + mutex_unlock(&pool->maps_lock); + + return map; + +err_map: + free_page((unsigned long)map); +err_unlock: + mutex_unlock(&pool->maps_lock); + return ERR_PTR(ret); +} + +void rmr_map_destroy(struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + int i, j; + u64 no_of_slps; + + WARN_ON(!xa_empty(&map->rmr_id_map)); + map->ts = jiffies; + + pr_info("%s: member_id %u\n", __func__, map->member_id); + kfree(map->bitmap_filter); + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++) + free_page((unsigned long)*(flp_ptr + j)); + + free_page((unsigned long)map->dirty_bitmap[i]); + } + + free_page((unsigned long)map); +} + +/** + * rmr_map_calc_chunk - Calculate chunk number from offset and length of IO + * + * @pool: The pool + * @offset: Offset of the IO + * @length: Length of the IO + * @id: rmr_id_t where to populate the chunk details + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * + * For example: + * if id.a is 1, only id.b is dirty. + * if id.a is 2, id.b and (id.b+1) is dirty + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +void rmr_map_calc_chunk(struct rmr_pool *pool, size_t offset, size_t length, rmr_id_t *id) +{ + u64 off_len = offset + length; + + id->b = GET_CHUNK_NUMBER(offset, pool->chunk_size_shift); + id->a = GET_FOLLOWING_CHUNKS(off_len, pool->chunk_size_shift, id->b); +} + +/** + * rmr_get_chunk_md_from_id - Get the chunk metadata byte from rmr_id_t + * + * @map: The map to work on + * @id: rmr_id_t to use to get the chunk metadata byte + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline u8 *rmr_get_chunk_md_from_id(struct rmr_dirty_id_map *map, rmr_id_t id) +{ + unsigned long idb_slp, idb_slp_index, idb_chunk; + el_flp *flp_ptr; + u8 *slp, *chunk_md; + + /* + * First get the pointer to first level page (FLP). + * To get that, we need to find which first level page the chunk belongs, and it can + * be found by dividing the chunk number by the maximum number of chunks 1 FLP can track. + * + * After that we need to adjust the id.b to go one level down. 
This is because we just + * moved to the desired FLP, and hence that portion of id.b can be dropped. + * For this we do the modulo with CHUNKS_PER_FLP. + */ + flp_ptr = (el_flp *)(map->dirty_bitmap[id.b >> CHUNKS_PER_FLP_LOG2]); + idb_slp = id.b & (CHUNKS_PER_FLP - 1); + + /* + * Now we need to move to the second level page (SLP). + * The addresses to SLPs are stored in the FLP as a list of addresses. Hence we calculate + * the desired slp index which has the address to the SLP our chunk md resides in. + * + * We then adjust our flp_ptr according to the index. + * Note that flp_ptr is of type el_flp (flp element), which is unsigned long, since + * addresses are of that data type. This lets us move to the slp index easily. + */ + idb_slp_index = idb_slp >> CHUNKS_PER_SLP_LOG2; + flp_ptr += idb_slp_index; + + /* + * The location pointed by flp_ptr is storing the address to the SLP we want to move to. + * So we dereference it first, and then cast it to relevant pointer (to the chunk metadata + * data type, which is u8). + * + * The last step it to move to the correct chunk metadata in the SLP. + * + * Each SLP can store metadata for CHUNKS_PER_SLP chunks. So we adjust the idb_slp + * accordingly. And then move our slp pointer to the correct chunk metadata byte. + */ + slp = (u8 *)(*flp_ptr); + idb_chunk = idb_slp & (CHUNKS_PER_SLP - 1); + chunk_md = slp + idb_chunk; + + return chunk_md; +} + +static bool rmr_chunk_md_check_dirty(u8 *chunk_md) +{ + return (*chunk_md) & (0x1 << CHUNK_DIRTY_BIT); +} + +static void rmr_chunk_md_set_dirty(u8 *chunk_md) +{ + *chunk_md |= (0x1 << CHUNK_DIRTY_BIT); +} + +static void rmr_chunk_md_unset_dirty(u8 *chunk_md) +{ + *chunk_md &= ~(0x1 << CHUNK_DIRTY_BIT); +} + +/** + * rmr_map_set_dirty - Set bits from rmr_id_t + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * @filter: Filter to add to entry + * + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline void rmr_map_set_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, u8 filter) +{ + u8 *chunk_md; + u64 i; + + map->ts = jiffies; + + chunk_md = rmr_get_chunk_md_from_id(map, id); + for (i = 0; i < id.a; i++) { + rmr_chunk_md_set_dirty(chunk_md); + chunk_md++; + } +} + +inline void rmr_map_set_dirty_all(struct rmr_dirty_id_map *map, u8 filter) +{ + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks; + bool is_last_flp; + u8 *slp; + int i, j, k; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + is_last_flp = (i == (map->no_of_flp - 1)); + + if (is_last_flp) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (u8 *)(*flp_ptr); + + if (is_last_flp && j == (no_of_slps - 1)) + no_of_chunks = map->no_of_chunk_in_last_slp; + else + no_of_chunks = NO_OF_CHUNKS_PER_PAGE; + + for (k = 0; k < no_of_chunks; k++, slp++) + rmr_chunk_md_set_dirty(slp); + } + } +} + +/** + * rmr_map_unset_dirty - Clear bits from rmr_id_t, and free entry if any + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * @filter: Filter to add to entry + * + * Description: + * This version can be used by both client and server. + * If entry is found, the function frees it. 
+ * Clears the bit using info from the given rmr_id_t + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline struct rmr_map_entry *rmr_map_unset_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, + u8 filter) +{ + struct rmr_map_entry *entry; + u8 *chunk_md; + u64 i; + + map->ts = jiffies; + + chunk_md = rmr_get_chunk_md_from_id(map, id); + BUG_ON(!chunk_md); + for (i = 0; i < id.a; i++) { + rmr_chunk_md_unset_dirty(chunk_md); + chunk_md++; + } + + entry = xa_erase(&map->rmr_id_map, rmr_id_to_key(id)); + if (!entry) { + pr_debug("in the member_id %d there is no entry for id [%llu, %llu]\n", + map->member_id, id.a, id.b); + } + + return entry; +} + +/* + * rmr_map_check_dirty - Check if the following bits are set or not + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline bool rmr_map_check_dirty(struct rmr_dirty_id_map *map, rmr_id_t id) +{ + u8 *chunk_md; + + chunk_md = rmr_get_chunk_md_from_id(map, id); + return rmr_chunk_md_check_dirty(chunk_md); +} + +/** + * rmr_map_get_dirty_entry - Check and return entry if the following bits are set + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * + * Description: + * Check if a chunk is dirty or not. + * If the particular chunk is dirty, then create an entry for it and return back. + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline struct rmr_map_entry *rmr_map_get_dirty_entry(struct rmr_dirty_id_map *map, rmr_id_t id) +{ + struct rmr_map_entry *entry; + int err; + + if (rmr_map_check_dirty(map, id)) { + entry = xa_load(&map->rmr_id_map, rmr_id_to_key(id)); + if (entry) { + pr_debug("%s: For id [%llu, %llu], entry exists member_id %u\n", + __func__, id.a, id.b, map->member_id); + return entry; + } + + entry = kmem_cache_zalloc(rmr_map_entry_cachep, GFP_KERNEL); + if (!entry) { + pr_err("%s: Cannot allocate entry for member_id %d, id [[%llu, %llu]]\n", + __func__, map->member_id, id.a, id.b); + return ERR_PTR(-ENOMEM); + } + + atomic_set(&entry->sync_cnt, -1); + init_llist_head(&entry->wait_list); + + err = xa_insert(&map->rmr_id_map, rmr_id_to_key(id), entry, GFP_KERNEL); + if (err == 0) + return entry; + + kmem_cache_free(rmr_map_entry_cachep, entry); + + if (err == -EBUSY) + return xa_load(&map->rmr_id_map, rmr_id_to_key(id)); + else + return ERR_PTR(-ENOMEM); + } + + return NULL; +} + +/** + * rmr_map_clear_filter_all - Clear filter for entire bitmap + * + * @map: Map to work on + * @filter: Filter to be cleared + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline void rmr_map_clear_filter_all(struct rmr_dirty_id_map *map, u8 filter) +{ + u64 i; + + for (i = 0; i < map->no_of_chunks; i++) + map->bitmap_filter[i] &= ~filter; +} + +/** + * rmr_map_unset_dirty_all - Clear all chunk bits (the entire map) + * + * @map: Map to work on + * + * Context: + * srcu pool->map_srcu should be held while calling this function. 
+ */ +inline void rmr_map_unset_dirty_all(struct rmr_dirty_id_map *map) +{ + rmr_id_t id; + u64 i; + + /* + * TODO: memcpy zeroes or something faster + */ + + id.a = 1; + for (i = 0; i < map->no_of_chunks; i++) { + id.b = i; + + if (!rmr_map_check_dirty(map, id)) + continue; + + rmr_map_unset_dirty(map, id, MAP_NO_FILTER); + } + + rmr_map_clear_filter_all(map, MAP_ENTRY_UNSYNCED); +} + +/** + * rmr_map_empty - Check if there are any chunks dirty + * + * @map: Map to work on + * + * Return: + * True: If map is empty + * False: Otherwise + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline bool rmr_map_empty(struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks; + bool is_last_flp; + u8 *slp; + int i, j, k; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + is_last_flp = (i == (map->no_of_flp - 1)); + + if (is_last_flp) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (u8 *)(*flp_ptr); + + if (is_last_flp && j == (no_of_slps - 1)) + no_of_chunks = map->no_of_chunk_in_last_slp; + else + no_of_chunks = NO_OF_CHUNKS_PER_PAGE; + + for (k = 0; k < no_of_chunks; k++, slp++) { + if (rmr_chunk_md_check_dirty(slp)) + return false; + } + } + } + + return true; +} + +inline void rmr_map_bitwise_or_buf(void *dst_buf, void *src_buf, u32 buf_size) +{ + u8 *src_byte, *dst_byte; + + src_byte = src_buf; + dst_byte = dst_buf; + + while (buf_size--) + *(dst_byte + buf_size) |= *(src_byte + buf_size); +} + +inline int rmr_map_create_entries(struct rmr_dirty_id_map *map) +{ + struct rmr_map_entry *entry; + rmr_id_t id; + int err; + u64 i; + + id.a = 1; + for (i = 0; i < map->no_of_chunks; i++) { + id.b = i; + + if (!rmr_map_check_dirty(map, id)) + continue; + + if (xa_load(&map->rmr_id_map, rmr_id_to_key(id))) + continue; + + entry = kmem_cache_zalloc(rmr_map_entry_cachep, GFP_KERNEL); + if (!entry) { + pr_err("%s: Cannot allocate entry for member_id %d, chunk %llu\n", + __func__, map->member_id, i); + return -ENOMEM; + } + + atomic_set(&entry->sync_cnt, -1); + init_llist_head(&entry->wait_list); + + pr_debug("%s: Adding entry %p for chunk %llu\n", + __func__, entry, i); + + err = xa_insert(&map->rmr_id_map, rmr_id_to_key(id), entry, GFP_KERNEL); + if (err) { + pr_err("%s: Cannot insert entry for member_id %d, chunk %llu\n", + __func__, map->member_id, i); + return err; + } + } + + return 0; +} + +/** + * rmr_map_slps_to_buf - Copy SLPs to given buf + * + * @map: Map to work on + * @slp_idx: SLP number to start copying from + * @no_of_slp: Number of SLPs to copy + * @buf: Buffer to copy SLPs to + * + * Context: + * srcu pool->map_srcu should be held while calling this function. 
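+ *
+ * Example (assuming 4K pages and 8-byte pointers, so NO_OF_SLP_PER_FLP is
+ * 512): slp_idx = 1000 selects flp_no = 1000 >> 9 = 1 and slp_no =
+ * 1000 & 511 = 488; after 24 copied SLPs slp_no wraps and copying continues
+ * from the first SLP of FLP 2.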
+ */ +void rmr_map_slps_to_buf(struct rmr_dirty_id_map *map, u64 slp_idx, u64 no_of_slp, u8 *buf) +{ + el_flp *flp_ptr; + u64 slp_no, flp_no, i = 0; + void *slp; + + flp_no = slp_idx >> NO_OF_SLP_PER_FLP_LOG2; + slp_no = slp_idx & (NO_OF_SLP_PER_FLP - 1); + + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + while (i < no_of_slp) { + slp = (void *)(*(flp_ptr + slp_no)); + + memcpy(buf, slp, PAGE_SIZE); + buf += PAGE_SIZE; + + slp_no++; + if (slp_no >= NO_OF_SLP_PER_FLP) { + flp_no += 1; + slp_no = 0; + + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + } + + i++; + } + + return; +} + +/** + * rmr_map_buf_to_slps - Copy data from buf to SLPs + * + * @map: Map to work on + * @buf: Buffer from which to copy data + * @buf_size: Buffer size + * @slp_idx: SLP number to start copying to + * @test: Whether to compare data or copy + * + * Return: + * Number of SLPs to which data was copied. + * 0 in case of failure. + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +u64 rmr_map_buf_to_slps(struct rmr_dirty_id_map *map, u8 *buf, u32 buf_size, u64 slp_idx, + bool test) +{ + el_flp *flp_ptr; + u64 slp_no, flp_no, i = 0; + u64 no_of_slp; + void *slp; + + /* + * The buf_size should be a factor of PAGE_SIZE + */ + if (buf_size % PAGE_SIZE) { + pr_info("%s: Failed %u\n", __func__, buf_size); + return 0; + } + + no_of_slp = buf_size >> PAGE_SHIFT; + + flp_no = slp_idx >> NO_OF_SLP_PER_FLP_LOG2; + slp_no = slp_idx & (NO_OF_SLP_PER_FLP - 1); + + pr_info("%s: no_of_slp=%llu, flp_no=%llu, slp_no=%llu, slp_idx=%llu\n", + __func__, no_of_slp, flp_no, slp_no, slp_idx); + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + while (i < no_of_slp) { + slp = (void *)(*(flp_ptr + slp_no)); + + if (test && memcmp(slp, buf, PAGE_SIZE)) { + pr_info("%s: Compare failed\n", __func__); + return 0; + } else if (!test) { + memcpy(slp, buf, PAGE_SIZE); + } + buf += PAGE_SIZE; + + slp_no++; + if (slp_no >= NO_OF_SLP_PER_FLP) { + flp_no += 1; + slp_no = 0; + + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + } + + i++; + } + + return no_of_slp; +} + +void rmr_map_hexdump_bitmap_buf(u8 member_id, void *buf, u32 buf_size) +{ + u8 *buf_byte; + u32 size = 0; + + buf_byte = buf; + + pr_info("%s: Starting bitmap dump for member %u in hex, size %u\n", + __func__, member_id, buf_size); + pr_info("---------------------------------------------------------\n"); + while (size < buf_size) { + pr_cont("%02X", *(buf_byte + size)); + size++; + } + + pr_info("\n"); +} + +void rmr_map_dump_bitmap(struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks; + bool is_last_flp; + u8 *slp; + int i, j; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + is_last_flp = (i == (map->no_of_flp - 1)); + + if (is_last_flp) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (u8 *)(*flp_ptr); + + if (is_last_flp && j == (no_of_slps - 1)) + no_of_chunks = map->no_of_chunk_in_last_slp; + else + no_of_chunks = NO_OF_CHUNKS_PER_PAGE; + + /* Each chunk is represented by a byte */ + rmr_map_hexdump_bitmap_buf(map->member_id, slp, no_of_chunks); + } + } +} + +/** + * rmr_map_summary_format - Format a per-member dirty-chunk summary into buf + * + * @pool: Pool whose maps to summarise + * @buf: Output buffer (must be at least @buf_size bytes) + * @buf_size: Size of @buf in bytes + * + * Description: + * Output format (one line per member that has a map): + * member : [ ...] 
/ dirty + * At most 50 dirty chunk indices are listed per member; if there + * are more, a "..." marker appears before the closing bracket. + * + * Context: caller must hold srcu pool->map_srcu. + * + * Return: number of bytes written (excluding trailing NUL). + */ +int rmr_map_summary_format(struct rmr_pool *pool, char *buf, size_t buf_size) +{ + struct rmr_dirty_id_map *map; + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks_in_slp; + u64 chunk_idx, dirty_count; + bool is_last_flp; + u8 *slp; + int printed_ids; + int pos = 0; + int i, fi, si; + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + pos += scnprintf(buf + pos, buf_size - pos, + "member %u: [", map->member_id); + + dirty_count = 0; + chunk_idx = 0; + printed_ids = 0; + for (fi = 0; fi < map->no_of_flp; fi++) { + flp_ptr = (el_flp *)map->dirty_bitmap[fi]; + is_last_flp = (fi == (map->no_of_flp - 1)); + no_of_slps = is_last_flp ? + map->no_of_slp_in_last_flp : NO_OF_SLP_PER_FLP; + + for (si = 0; si < no_of_slps; si++, flp_ptr++) { + u64 ci; + + slp = (u8 *)(*flp_ptr); + no_of_chunks_in_slp = + (is_last_flp && si == (no_of_slps - 1)) ? + map->no_of_chunk_in_last_slp : + NO_OF_CHUNKS_PER_PAGE; + + for (ci = 0; ci < no_of_chunks_in_slp; + ci++, chunk_idx++) { + if (!(slp[ci] & (1 << CHUNK_DIRTY_BIT))) + continue; + dirty_count++; + /* Cap listed IDs to fit all members in PAGE_SIZE */ + if (printed_ids < 50) { + pos += scnprintf(buf + pos, + buf_size - pos, + "%llu ", chunk_idx); + printed_ids++; + } + } + } + } + + /* Overwrite trailing space before ']' */ + if (pos > 0 && buf[pos - 1] == ' ') + pos--; + if (printed_ids < dirty_count) + pos += scnprintf(buf + pos, buf_size - pos, + "...] %llu/%llu dirty\n", + dirty_count, map->no_of_chunks); + else + pos += scnprintf(buf + pos, buf_size - pos, + "] %llu/%llu dirty\n", + dirty_count, map->no_of_chunks); + } + + return pos; +} + +void rmr_map_bidump_bitmap_buf(void *buf, u8 member_id, u32 buf_long) +{ + char box[65]; + u64 *buf_byte; + u64 the_byte; + int i, j; + u32 count = 0; + + buf_byte = buf; + + pr_info("%s: bitmap for member %d dump in binary, the size in longs %u\n", + __func__, member_id, buf_long); + while (count < buf_long) { + the_byte = *(buf_byte + count); + for (i = 63, j = 0; i >= 0; i--, j++) + box[j] = (the_byte & (1ULL << i)) ? '1' : '0'; + box[j] = '\0'; + pr_cont("[%s]", box); + count++; + } + + pr_info("\n"); + pr_info("---------------------------------------------------------\n"); +} diff --git a/drivers/infiniband/ulp/rmr/rmr-map.h b/drivers/infiniband/ulp/rmr/rmr-map.h new file mode 100644 index 000000000000..76ef6506421f --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-map.h @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_MAP_H +#define RMR_MAP_H + +#include +#include + +#include "rmr.h" + +/** + * The dirty map buffer is used to track dirty chunks through bits. + * The position of the bit denotes the chunk number it tracks. + * + * Bitmap structure + * ---------------- + * The dirty bitmap is stored in a 2 level tree-like structure. + * The main unit of storage are memory pages; They act as nodes of this structure. + * The first level pages (FLP) stores the address of the second level pages. + * There can be a total of 256 first level pages. + * The second level pages (SLP, also the leaf nodes/pages) stores the bitmap. 
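+ *
+ * Roughly (assuming 4K pages and 8-byte pointers):
+ *
+ *	dirty_bitmap[0..255]  ->  FLP: page holding up to 512 SLP addresses
+ *	                              SLP: page of 4096 one-byte chunk entries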
+ *
+ * The first level pages have to store the address of the second level pages.
+ * An address being 8B (default/max) long, the addresses of a maximum of 512 pages can
+ * be stored in a first level page. This then decides the maximum leaf pages a pool can
+ * have, which, for our example, is [(# pages of FLP) * (PAGE_SIZE / address_size)],
+ * (256*512)=131072.
+ * With the above info, the available space for the bitmap is 131072*4KB(PAGE_SIZE)=512MB.
+ *
+ * A chunk is the smallest unit of data which is tracked for being dirty. A chunk is
+ * called dirty/unsynced, even if a single byte in it is dirty/unsynced.
+ * To track a chunk, a single byte (1B) is used. The least significant bit is used to signify
+ * if the chunk is dirty (set) or not. Other bits can be used for other purposes (for example,
+ * filters). The maximum number of chunks RMR can manage is then (512MB)/1B=536870912.
+ * This number is fixed, as one can see from the calculations, and hence the maximum size of
+ * metadata RMR can allocate and use is fixed.
+ *
+ * The user configurable part is the chunk size. Its range is 128KB-1MB, and it has to be a
+ * power of 2.
+ * The chunk size decides the maximum mapped size for an RMR pool.
+ * For example, for chunk size 1MB, and taking the maximum number of chunks RMR can allocate
+ * and handle (536870912, see above), the maximum mapped size would be (536870912*1MB)=512TB.
+ * The table showing the relation between chunk size and maximum mapped size is as follows:
+ * Chunk size	Maximum mapped size
+ * 128KB	64TB
+ * 256KB	128TB
+ * 512KB	256TB
+ * 1MB		512TB
+ *
+ * Calculating chunk number
+ * ------------------------
+ * Some key points:
+ * 1) The Linux kernel has a fixed sector size of 512 bytes (a shift of 9)
+ * 2) The mapped_size provided and stored in the rmr_pool structure is in sectors.
+ * 3) The chunk_size provided and stored in the rmr_pool structure is in bytes.
+ * 4) The code calculates and stores chunk_size_shift in the rmr_pool structure to do fast
+ *    calculation.
+ * 5) The IO offset given to RMR (through the function rmr_clt_request) is in bytes.
+ * + * -- + * With the above points, lets have a sample scenario with mapped_size 1GB and chunk_size 128KB + * The numbers would then be, + * + * no_of_chunks = (mapped_size / chunk_size) + * no_of_chunks = 8192 + * + * chunk_size = 131072 + * chunk_size_shift = 17 + * + * dirty_map buffer size (in BYTES) = (no_of_chunks / bits in a byte) + * dirty_map buffer size (in BYTES) = 1024 + * + * -- + * Lets do a sample calculation of chunk_no from offset and length of an IO + * + * For offset 30801920 and length 4096 + * + * chunk_no = (offset >> chunk_size_shift) + * chunk_no = 235 + * + */ + +#define RMR_KEY_SHIFT 32 + +// Each chunk requires 1B of metadata +#define PER_CHUNK_MD 1 +#define PER_CHUNK_MD_LOG2 ilog2(PER_CHUNK_MD) + +#define GET_CHUNK_NUMBER(offset, shift) (offset >> shift) +#define GET_FOLLOWING_CHUNKS(offset_len, shift, start) (((offset_len - 1) >> shift) - start + 1) + +#define CHUNK_TO_OFFSET(chunk_no, shift) (chunk_no << shift) + +// The element type stored in FLP +typedef unsigned long el_flp; + +enum { + CHUNK_DIRTY_BIT = 0, + CHUNK_FILTER_BIT, +}; + +enum { + MAX_NO_OF_FLP = 256, + NO_OF_SLP_PER_FLP = (PAGE_SIZE >> ilog2(sizeof(void *))), + NO_OF_SLP_PER_FLP_LOG2 = ilog2(NO_OF_SLP_PER_FLP), + MAX_NO_OF_SLP = (MAX_NO_OF_FLP * NO_OF_SLP_PER_FLP), + + NO_OF_CHUNKS_PER_PAGE = (PAGE_SIZE >> PER_CHUNK_MD_LOG2), + // Chunks data is stored only in SLP + MAX_NO_OF_CHUNKS = (MAX_NO_OF_SLP * NO_OF_CHUNKS_PER_PAGE), + + CHUNKS_PER_SLP = (PAGE_SIZE >> PER_CHUNK_MD_LOG2), + CHUNKS_PER_SLP_LOG2 = ilog2(CHUNKS_PER_SLP), + CHUNKS_PER_FLP = (CHUNKS_PER_SLP * NO_OF_SLP_PER_FLP), + CHUNKS_PER_FLP_LOG2 = ilog2(CHUNKS_PER_FLP), +}; + +typedef enum { + MAP_NO_FILTER = 0, + MAP_ENTRY_UNSYNCED +} rmr_map_filter; + +enum rmr_map_state { + RMR_MAP_STATE_NO_CHECK = 0, + RMR_MAP_STATE_CHECKING, + // do we have some other useful states ? +}; + +struct rmr_dirty_id_map { + u8 member_id; + struct xarray rmr_id_map; + unsigned long ts; + atomic_t check_state; + + /* + * The usage of this is restricted to form a linked lised + * during mass deletion. Since this is in an RCU list (maps + * in rmr_pool), we cannot use this or change any data until + * the RCU period completes. So we use this next variable + * during mass deletion so we can have a list and don't have + * to wait and restart the search on every individual deletion + * of a map. Refer destroy_clt_pool(). + */ + struct rmr_dirty_id_map *next; + + u64 no_of_chunks; + u64 no_of_flp; + u64 no_of_slp_in_last_flp; + u64 no_of_chunk_in_last_slp; + u64 total_slp; + u8 *bitmap_filter; + void *dirty_bitmap[MAX_NO_OF_FLP]; +}; + +struct rmr_map_entry { + atomic_t sync_cnt; + struct llist_head wait_list; +}; + +/* + * The header of the bitmap buffer. 
+ */ +struct rmr_map_cbuf_hdr { + u64 version; + u8 member_id; + + u64 no_of_chunks; + u64 no_of_flp; + u64 no_of_slp_in_last_flp; + u64 no_of_chunk_in_last_slp; + u64 total_slp; +} __packed; + +static inline unsigned long rmr_id_to_key(rmr_id_t id) +{ + unsigned long res; + + // highest bits for id.a, the rest are for id.b; + res = ((id.a << RMR_KEY_SHIFT) | id.b); + return res; +} + +static inline u64 key_to_a(unsigned long key) +{ + return key >> RMR_KEY_SHIFT; +} + +static inline u64 key_to_b(unsigned long key) +{ + return key & ((1ULL << RMR_KEY_SHIFT) - 1); +} + +void rmr_map_update_page_params(struct rmr_dirty_id_map *map); +struct rmr_dirty_id_map *rmr_map_create(struct rmr_pool *pool, u8 member_id); +void rmr_map_destroy(struct rmr_dirty_id_map *map); +void rmr_map_calc_chunk(struct rmr_pool *pool, size_t offset, size_t length, rmr_id_t *id); +void rmr_map_set_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, u8 filter); +void rmr_map_set_dirty_all(struct rmr_dirty_id_map *map, u8 filter); +struct rmr_map_entry *rmr_map_unset_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, u8 filter); +bool rmr_map_check_dirty(struct rmr_dirty_id_map *map, rmr_id_t id); +struct rmr_map_entry *rmr_map_get_dirty_entry(struct rmr_dirty_id_map *map, rmr_id_t id); +void rmr_map_clear_filter_all(struct rmr_dirty_id_map *map, u8 filter); +void rmr_map_unset_dirty_all(struct rmr_dirty_id_map *map); +bool rmr_map_empty(struct rmr_dirty_id_map *map); + +void rmr_map_bitwise_or_buf(void *dst_buf, void *src_buf, u32 buf_size); +int rmr_map_create_entries(struct rmr_dirty_id_map *map); + +void rmr_map_hexdump_bitmap_buf(u8 member_id, void *buf, u32 buf_size); +void rmr_map_slps_to_buf(struct rmr_dirty_id_map *map, u64 slp_idx, u64 no_of_slp, u8 *buf); +u64 rmr_map_buf_to_slps(struct rmr_dirty_id_map *map, u8 *buf, u32 buf_size, u64 slp_idx, + bool test); +void rmr_map_dump_bitmap(struct rmr_dirty_id_map *map); +int rmr_map_summary_format(struct rmr_pool *pool, char *buf, size_t buf_size); +void rmr_map_bidump_bitmap_buf(void *buf, u8 member_id, u32 buf_size); + +static inline void map_entry_get_sync(struct rmr_map_entry *entry) +{ + atomic_inc(&entry->sync_cnt); + pr_debug("after get ref for entry %p, sync cnt %d\n", + entry, atomic_read(&entry->sync_cnt)); +} + +static inline int map_entry_put_sync(struct rmr_map_entry *entry) +{ + pr_debug("before dec_and_test for entry %p, sync cnt %d\n", + entry, atomic_read(&entry->sync_cnt)); + return atomic_dec_and_test(&entry->sync_cnt); +} + +static inline void rmr_maplist_destroy(struct rmr_dirty_id_map *maplist) +{ + struct rmr_dirty_id_map *mp; + + while (maplist != NULL) { + mp = maplist; + maplist = maplist->next; + rmr_map_destroy(mp); + } +} +#endif /* RMR_MAP_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-pool.c b/drivers/infiniband/ulp/rmr/rmr-pool.c new file mode 100644 index 000000000000..5e5632d9d701 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-pool.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#include +#include +#include + +#include "rmr-pool.h" + +LIST_HEAD(pool_list); +DEFINE_MUTEX(pool_mutex); /* mutex to protect pool_list */ +struct kmem_cache *rmr_map_entry_cachep; + +const char *rmr_get_cmd_name(enum rmr_msg_cmd_type cmd) +{ + switch (cmd) { + case RMR_CMD_MAP_READY: return "RMR_CMD_MAP_READY"; + case RMR_CMD_MAP_SEND: return "RMR_CMD_MAP_SEND"; + case RMR_CMD_SEND_MAP_BUF: return "RMR_CMD_SEND_MAP_BUF"; + case RMR_CMD_MAP_BUF_DONE: 
return "RMR_CMD_MAP_BUF_DONE"; + case RMR_CMD_MAP_DONE: return "RMR_CMD_MAP_DONE"; + case RMR_CMD_MAP_DISABLE: return "RMR_CMD_MAP_DISABLE"; + case RMR_CMD_READ_MAP_BUF: return "RMR_CMD_READ_MAP_BUF"; + case RMR_CMD_MAP_CHECK: return "RMR_CMD_MAP_CHECK"; + case RMR_CMD_LAST_IO_TO_MAP: return "RMR_CMD_LAST_IO_TO_MAP"; + case RMR_CMD_STORE_CHECK: return "RMR_CMD_STORE_CHECK"; + case RMR_CMD_MAP_TEST: return "RMR_CMD_MAP_TEST"; + case RMR_CMD_SEND_MD_BUF: return "RMR_CMD_SEND_MD_BUF"; + case RMR_CMD_MD_SEND: return "RMR_CMD_MD_SEND"; + + case RMR_CMD_MAP_GET_VER: return "RMR_CMD_MAP_GET_VER"; + case RMR_CMD_MAP_SET_VER: return "RMR_CMD_MAP_SET_VER"; + case RMR_CMD_DISCARD_CLEAR_FLAG: return "RMR_CMD_DISCARD_CLEAR_FLAG"; + case RMR_CMD_SEND_DISCARD: return "RMR_CMD_SEND_DISCARD"; + + case RMR_MAP_CMD_MAX: return "RMR_MAP_CMD_MAX"; + + case RMR_CMD_POOL_INFO: return "RMR_CMD_POOL_INFO"; + case RMR_CMD_JOIN_POOL: return "RMR_CMD_JOIN_POOL"; + + case RMR_CMD_REJOIN_POOL: return "RMR_CMD_REJOIN_POOL"; + + case RMR_CMD_LEAVE_POOL: return "RMR_CMD_LEAVE_POOL"; + case RMR_CMD_ENABLE_POOL: return "RMR_CMD_ENABLE_POOL"; + + case RMR_CMD_USER: return "RMR_CMD_USER"; + + case RMR_POOL_CMD_MAX: return "RMR_POOL_CMD_MAX"; + + default: return "Unknown command"; + } +} + +void free_pool(struct rmr_pool *pool) +{ + WARN_ON(!list_empty(&pool->sess_list)); + + cleanup_srcu_struct(&pool->sess_list_srcu); + cleanup_srcu_struct(&pool->map_srcu); + + if (!list_empty(&pool->entry)) { + mutex_lock(&pool_mutex); + list_del(&pool->entry); + mutex_unlock(&pool_mutex); + } + + percpu_ref_exit(&pool->ids_inflight_ref); + kfree(pool); +} + +/** + * rmr_find_pool_by_group_id - Find a pool with group_id in global pool list + * + * @group_id: Group_id of the pool being searched + * + * Locks: + * Caller should hold global pool_mutex + */ +struct rmr_pool *rmr_find_pool_by_group_id(u32 group_id) +{ + struct rmr_pool *pool; + + list_for_each_entry(pool, &pool_list, entry) + if (pool->group_id == group_id) + return pool; + + return NULL; +} + +/** + * rmr_find_pool - Find a pool named poolname in the global pool list + * + * @poolname: Name of the pool to be searched + * + * Locks: + * Caller must hold global pool_mutex + */ +struct rmr_pool *rmr_find_pool(const char *poolname) +{ + struct rmr_pool *pool; + + lockdep_assert_held(&pool_mutex); + + list_for_each_entry(pool, &pool_list, entry) { + if (!strcmp(poolname, pool->poolname)) + return pool; + } + + return NULL; +} + +static void rmr_pool_inflight_ref_release(struct percpu_ref *ref) +{ + struct rmr_pool *pool = container_of(ref, struct rmr_pool, ids_inflight_ref); + + complete_all(&pool->complete_done); +} + +void rmr_pool_confirm_inflight_ref(struct percpu_ref *ref) +{ + struct rmr_pool *pool = container_of(ref, struct rmr_pool, ids_inflight_ref); + + complete_all(&pool->confirm_done); +} + +static struct rmr_pool *alloc_pool(const char *poolname, u32 group_id) +{ + struct rmr_pool *pool; + int ret; + + pr_debug("%s: allocate pool %s with group_id %u\n", + __func__, poolname, group_id); + + if (strlen(poolname) > NAME_MAX) { + pr_err("%s: Failed to create '%s': name too long\n", __func__, poolname); + return ERR_PTR(-EINVAL); + } + + pool = kzalloc(sizeof(struct rmr_pool), GFP_KERNEL); + if (unlikely(!pool)) + return ERR_PTR(-ENOMEM); + + ret = init_srcu_struct(&pool->sess_list_srcu); + if (ret) { + pr_err("%s: Sess list srcu init failed, err: %d\n", __func__, ret); + pool = ERR_PTR(ret); + goto free_pool; + } + + ret = init_srcu_struct(&pool->map_srcu); + if (ret) { + 
pr_err("%s: Map srcu init failed, err: %d\n", __func__, ret); + pool = ERR_PTR(ret); + goto cleanup_sess_srcu; + } + + ret = percpu_ref_init(&pool->ids_inflight_ref, + rmr_pool_inflight_ref_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (ret) { + pr_err("%s: Percpu reference init failed for pool %s\n", __func__, poolname); + pool = ERR_PTR(ret); + goto cleanup_map_srcu; + } + + pool->group_id = group_id; + pool->map_ver = 1; + pool->mapped_size = 0; + xa_init_flags(&pool->stg_members, XA_FLAGS_ALLOC); + init_completion(&pool->complete_done); + init_completion(&pool->confirm_done); + mutex_init(&pool->sess_lock); + mutex_init(&pool->maps_lock); + INIT_LIST_HEAD(&pool->entry); + INIT_LIST_HEAD(&pool->sess_list); + + init_completion(&pool->discard_done); + atomic_set(&pool->discard_waiting, 0); + atomic_set(&pool->normal_count, 0); + + strscpy(pool->poolname, poolname, sizeof(pool->poolname)); + + return pool; + +cleanup_map_srcu: + cleanup_srcu_struct(&pool->map_srcu); +cleanup_sess_srcu: + cleanup_srcu_struct(&pool->sess_list_srcu); +free_pool: + kfree(pool); + return pool; +} + +struct rmr_pool *rmr_create_pool(const char *poolname, void *priv) +{ + u32 group_id; + struct rmr_pool *pool; + + mutex_lock(&pool_mutex); + + pool = rmr_find_pool(poolname); + if (unlikely(pool)) { + pr_err("Pool '%s' already exists\n", poolname); + pool = ERR_PTR(-EEXIST); + goto out; + } + + /* Calculate the poolname hash */ + group_id = rmr_pool_hash(poolname); + + /* Double ensure there is no hash-clash */ + pool = rmr_find_pool_by_group_id(group_id); + if (unlikely(pool)) { + pr_err("Pool '%s' already exists\n", poolname); + pool = ERR_PTR(-EEXIST); + goto out; + } + + pool = alloc_pool(poolname, group_id); + if (IS_ERR(pool)) { + pr_err("Pool allocation failed for pool %s\n", poolname); + goto out; + } + + list_add(&pool->entry, &pool_list); + pool->priv = priv; + pool->pool_md.magic = RMR_POOL_MD_MAGIC; + +out: + mutex_unlock(&pool_mutex); + return pool; +} + +/** + * rmr_pool_maps_to_buf - Copy dirty_bitmap buffer of pool to buf + * + * @pool: The pool whose map is to be copied + * @map_idx: The map index in the pool's map array + * @offset: The offset to read from in the maps dirty_bitmap buffer + * @buf: Pointer to buf where to copy the dirty_bitmap buffer + * @buflen: Length of the buf available to copy to + * @filter: TODO + * + * Description: + * This function is one half of the (map <-> buf) pair. It is used to save map into a buf. + * The other half is rmr_pool_save_map, which is used to save a buf into the map. + * This function is used while both sending a map and reading a map. + * The process for both of them is largely same. + * + * The relevant params like member_id, offset for the dirty_bitmap buffer + * are stored in the rmr_map_buf_hdr, which is kept at the starting of buf. + * + * The caller has to take care of sending the correct map index and offset to copy from. + * For this, the function provides some help in the form of updating the map_idx and + * offset values (for map send), and storing it those in map_buf_hdr (for map read). 
+ * + * Return value: + * 0 If there is no more data to send + * Total size copied to buf + */ +int rmr_pool_maps_to_buf(struct rmr_pool *pool, u8 *map_idx, u64 *slp_idx, + void *buf, size_t buflen, rmr_map_filter filter) +{ + struct rmr_map_buf_hdr *map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + struct rmr_dirty_id_map *map = NULL; + int lock_idx; + u64 no_of_slp; + + /* Adjust buf and buflen */ + buf += sizeof(struct rmr_map_buf_hdr); + buflen -= sizeof(struct rmr_map_buf_hdr); + + lock_idx = srcu_read_lock(&pool->map_srcu); + for ( ; ; *map_idx += 1) { + + if (*map_idx >= pool->maps_cnt) { + srcu_read_unlock(&pool->map_srcu, lock_idx); + return 0; + } + + map = rcu_dereference(pool->maps[*map_idx]); + if (map) + break; + } + + map_buf_hdr->version = RMR_MAP_FORMAT_VER; + + /* This is for the destination, to inform where to store */ + map_buf_hdr->member_id = map->member_id; + map_buf_hdr->dst_slp_idx = (*slp_idx); + + /* + * SLPs are pages. Duh! + */ + no_of_slp = buflen >> PAGE_SHIFT; + no_of_slp = min(no_of_slp, (map->total_slp - *slp_idx)); + rmr_map_slps_to_buf(map, *slp_idx, no_of_slp, buf); + map_buf_hdr->buf_size = no_of_slp * PAGE_SIZE; + + if ((*slp_idx + no_of_slp) >= map->total_slp) { + /* + * All done for this map. + * Now move on to the next one, and reset the index. + */ + *map_idx += 1; + *slp_idx = 0; + } else { + /* + * Copy the number of SLPs we can, and increment the index. + */ + *slp_idx += no_of_slp; + } + + pr_info("%s: buf_size %u, buflen w/o hdr %lu\n", + __func__, map_buf_hdr->buf_size, buflen); + + /* This is for MAP_READ, to inform where to ask from next */ + map_buf_hdr->map_idx = *map_idx; + map_buf_hdr->slp_idx = *slp_idx; + + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return (map_buf_hdr->buf_size + sizeof(struct rmr_map_buf_hdr)); +} + +/** + * rmr_pool_save_map - Copy given buf to dirty_bitmap buffer of pool + * + * @pool: The pool whose map is the dest for the copy + * @buf: Pointer to buf from where to copy + * @buflen: Length of the buf available to copy + * @test_only: Only test if the buf given matches with dirty_bitmap buf of pool + * @map_clean: TODO + * + * Description: + * This function is the other half of the (map <-> buf) pair. + * It saves buf into the map of pool. The relevant params are read from the + * rmr_map_buf_hdr which lies in the start of the given buf. + * + * Return value: + * 0 on success + * -errno on error + */ +int rmr_pool_save_map(struct rmr_pool *pool, void *buf, size_t buflen, + bool test_only) +{ + struct rmr_map_buf_hdr *map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + struct rmr_dirty_id_map *map = NULL; + int err = 0, lock_idx; + u32 buf_size; + u64 slp_idx; + + if (map_buf_hdr->version != RMR_MAP_FORMAT_VER) { + pr_err("Wrong map format. 
Expected %d but received %llu\n", + RMR_MAP_FORMAT_VER, map_buf_hdr->version); + return -EINVAL; + } + + /* Adjust buf and buflen */ + buf += sizeof(struct rmr_map_buf_hdr); + buflen -= sizeof(struct rmr_map_buf_hdr); + + lock_idx = srcu_read_lock(&pool->map_srcu); + map = rmr_pool_find_map(pool, map_buf_hdr->member_id); + if (!map) { + pr_err("%s: No map found for member_id %llu\n", + __func__, map_buf_hdr->member_id); + err = -ENOENT; + goto out; + } + + slp_idx = map_buf_hdr->dst_slp_idx; + buf_size = map_buf_hdr->buf_size; + + pr_info("%s: For pool %s, received map for %llu, slp_idx %llu, buf_size %u, buflen %lu\n", + __func__, pool->poolname, map_buf_hdr->member_id, slp_idx, buf_size, buflen); + + /* Sanity */ + WARN_ON(buf_size > buflen); + WARN_ON(buf_size % PAGE_SIZE); + + pr_info("%s: buf_size %u, buflen w/o hdr %lu\n", __func__, map_buf_hdr->buf_size, buflen); + + /* + * The buf_size is a multiple of PAGE_SIZE, + * and that's how we know the number of SLPs to save. + */ + if (!rmr_map_buf_to_slps(map, buf, buf_size, slp_idx, test_only)) { + pr_err("%s: rmr_map_buf_to_slps failed\n", __func__); + err = -EINVAL; + goto out; + } + +out: + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return err; +} diff --git a/drivers/infiniband/ulp/rmr/rmr-pool.h b/drivers/infiniband/ulp/rmr/rmr-pool.h new file mode 100644 index 000000000000..3cb7d3ae84b9 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-pool.h @@ -0,0 +1,400 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_POOL_H +#define RMR_POOL_H + +#include /* for NAME_MAX */ +#include +#include +#include /* for jhash() */ +#include /* for round_up */ +#include "rmr.h" +#include "rmr-map.h" + +#define RMR_POOL_MD_MAGIC 0xDEADBEEF +#define XA_TRUE ((void *)1UL) +#define XA_FALSE ((void *)2UL) + +extern struct kmem_cache *rmr_map_entry_cachep; +/* + * enum srv_sync_thread_state + */ +enum srv_sync_thread_state { + SYNC_THREAD_REQ_STOP, /* 0 */ + SYNC_THREAD_STOPPED, + SYNC_THREAD_RUNNING, + SYNC_THREAD_WAIT, +}; + +enum srv_map_update_state { + MAP_UPDATE_STATE_DISABLED, + MAP_UPDATE_STATE_READY, + MAP_UPDATE_STATE_DONE, +}; + +/* The srv pool specific structure */ +struct rmr_srv_md { + u64 map_ver; + u64 mapped_size; /* server store size in sectors */ + u8 member_id; + u8 srv_pool_state; /* server pool state */ + u8 store_state; /* state of io_store */ + u8 map_update_state; + bool discard_entries; +}; + +/* Shared by each pool */ +struct rmr_pool_md { + char poolname[NAME_MAX]; + u64 magic; + u32 group_id; + u32 chunk_size; /* rmr client */ + u64 mapped_size; /* client view of store size */ + u32 queue_depth; + u64 map_ver; + struct rmr_srv_md srv_md[RMR_POOL_MAX_SESS]; +} __packed; + +struct rmr_pool { + char poolname[NAME_MAX]; + u32 group_id; /* jhash() on poolname */ + struct kobject kobj; + struct kobject sessions_kobj; + struct list_head entry; /* for global pool_list */ + + struct list_head sess_list; /* list of sessions */ + struct mutex sess_lock; /* protect list of sessions */ + struct srcu_struct sess_list_srcu; + + void *priv; + u64 mapped_size; + u32 chunk_size; + u8 chunk_size_shift; + u64 no_of_chunks; + + struct percpu_ref ids_inflight_ref; + struct completion complete_done; + struct completion confirm_done; + + struct completion discard_done; /* for sync client pool */ + /* Set when waiting for response of discard request */ + atomic_t discard_waiting; + + u8 maps_cnt; + struct mutex maps_lock; + struct rmr_dirty_id_map __rcu +
*maps[RMR_POOL_MAX_SESS]; + /* All member ids of the storage nodes */ + struct xarray stg_members; + u64 map_ver; + atomic_t normal_count; /* number of pool sessions currently in NORMAL state */ + struct srcu_struct map_srcu; + + struct rmr_pool_md pool_md; + + bool is_clt; + bool sync; +}; + +/** + * rmr_pool_find_md - find the index of the srv_md with the provided key in the pool_md + * + * @pool_md: the pool_md to search + * @key: the member_id of the server pool to search for + * @empty_slot: the empty slot is required by caller or not + * + * Description: + * Find the index of the srv_md with the matched key. If there is no such a key and the empty + * slot is not required, return -1. + * + * Return: + * >= 0, the index of the key in the pool_md. Return the index of an empty slot when the key + * is not found and the empty_slot flag is true + * -1 if the key is not found and empty_slot is false, or the pool_md doesn't exist + */ +static inline int rmr_pool_find_md(struct rmr_pool_md *pool_md, u8 key, bool empty_slot) +{ + int i; + int empty_i = -1; + + if (!pool_md) + return -1; + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + if (!pool_md->srv_md[i].member_id) + empty_i = i; + + if (pool_md->srv_md[i].member_id == key) + return i; + } + + if (empty_slot) + return empty_i; + return -1; +} + +/** + * rmr_pool_md_check_discard - check the discard_entries flag of the srv_md + * + * @pool: the pool to check pool_md + * @member_id: the member_id of the srv_md to check + * + * Description: + * Check if the pool has received the discards from the server pool with the provided + * member_id. + * + * Return: + * 1 (true) if the pool has received the discards, + * 0 (false) if the pool has not received the discards, + * <0 if the pool has no info of the server pool + */ +static inline int rmr_pool_md_check_discard(struct rmr_pool *pool, u8 member_id) +{ + int md_i = rmr_pool_find_md(&pool->pool_md, member_id, false); + + if (md_i < 0) { + pr_err("Failed to find md for member_id %u\n", member_id); + return -EINVAL; + } + + /* If the flag is set, this pool has received the discards. */ + return pool->pool_md.srv_md[md_i].discard_entries; +} + +#define RMR_MAP_FORMAT_VER 1 +/* + * Get the first most significant bit of map_ver. If it is one, then the store of that storage node + * is being replaced. + */ +#define RMR_STORE_IS_REPLACE(map_ver) (map_ver >> 63 & 1ULL) +#define RMR_STORE_GET_VER(map_ver) (map_ver & ~(1ULL << 63)) +#define RMR_STORE_SET_REPLACE(map_ver) (map_ver |= 1ULL << 63) +#define RMR_STORE_UNSET_REPLACE(map_ver) (map_ver &= ~(1ULL << 63)) +#define RTRS_IO_LIMIT 102400 +//#define RTRS_IO_LIMIT 40 //for tests only + +/* + * TODO: + * We currently do not have mapped_size while creating dirty maps, + * which means we cannot calculate no_of_chunks, hence cannot allocate bitmap + * So, as a workaround, we allocate max size bitmap, + * and to reduce that allocation, we cap max mapped_size. + * + * 1GB max mapped size for now. + * (Size mentioned in number of sectors, just like nr_sects) + */ +#define RMR_MAX_MAPPED_SIZE 2097152 + +/* The header structure of rmr pool metadata will not over this limit. 
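The high-bit encoding of map_ver defined by the RMR_STORE_* macros above can be exercised on its own. A small userspace sketch, which simply repeats those macro definitions so it compiles stand-alone:

#include <assert.h>
#include <stdint.h>

#define RMR_STORE_IS_REPLACE(map_ver)		(map_ver >> 63 & 1ULL)
#define RMR_STORE_GET_VER(map_ver)		(map_ver & ~(1ULL << 63))
#define RMR_STORE_SET_REPLACE(map_ver)		(map_ver |= 1ULL << 63)
#define RMR_STORE_UNSET_REPLACE(map_ver)	(map_ver &= ~(1ULL << 63))

int main(void)
{
	uint64_t map_ver = 7;			/* plain version number */

	RMR_STORE_SET_REPLACE(map_ver);		/* mark the store as being replaced */
	assert(RMR_STORE_IS_REPLACE(map_ver));
	assert(RMR_STORE_GET_VER(map_ver) == 7);	/* the version itself is unchanged */

	RMR_STORE_UNSET_REPLACE(map_ver);
	assert(map_ver == 7);
	return 0;
}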
*/ +#define RMR_MD_SIZE PAGE_SIZE +#define RMR_MD_SIZE_SECTORS (PAGE_SIZE / SECTOR_SIZE) +#define RMR_MAP_BUF_HDR_SIZE PAGE_SIZE +#define RMR_SRV_MD_SIZE (sizeof(struct rmr_srv_md) * RMR_POOL_MAX_SESS) +#define RMR_CLT_MD_SIZE (sizeof(struct rmr_pool_md) - RMR_SRV_MD_SIZE) +#define RMR_SECTOR_SIZE 512 +#define RMR_INT_ROUND_UP(x, y) (((x) + (y) - 1) / (y)) +#define RMR_ROUND_UP(x) round_up(x, RMR_SECTOR_SIZE) + +#define RMR_SRV_MAX_QDEPTH 512 + +/* last_io region starts right after the pool_md header page */ +#define RMR_LAST_IO_OFFSET RMR_MD_SIZE + +static inline u64 rmr_last_io_len(u32 queue_depth) +{ + return RMR_ROUND_UP((u64)queue_depth * sizeof(rmr_id_t)); +} + +static inline u64 rmr_bitmap_offset(u32 queue_depth) +{ + return RMR_LAST_IO_OFFSET + rmr_last_io_len(queue_depth); +} + +static inline u64 rmr_per_map_bitmap_size(u64 no_of_chunks) +{ + return DIV_ROUND_UP(no_of_chunks, CHUNKS_PER_SLP) * PAGE_SIZE; +} + +static inline u64 rmr_bitmap_len(u64 no_of_chunks) +{ + return RMR_POOL_MAX_SESS * rmr_per_map_bitmap_size(no_of_chunks); +} + +struct rmr_map_buf_hdr { + u64 version; + u64 member_id; + + /* + * dst_slp_idx: SLP index in the local dirty map buffer, + * from where to write the recved dirty map buffer + */ + u64 dst_slp_idx; + u32 buf_size; + + /* + * slp_idx: Only used for MAP_READ, + * to let client know where to ask from in the next iteration + */ + u64 map_idx; + u64 slp_idx; +} __packed; + +extern struct list_head pool_list; +extern struct mutex pool_mutex; + +const char *rmr_get_cmd_name(enum rmr_msg_cmd_type cmd); + +struct rmr_pool *rmr_create_pool(const char *poolname, void *priv); +void free_pool(struct rmr_pool *pool); + +struct rmr_pool *rmr_find_pool_by_group_id(u32 group_id); +struct rmr_pool *rmr_find_pool(const char *poolname); +int rmr_pool_maps_to_buf(struct rmr_pool *pool, u8 *map_idx, u64 *slp_idx, + void *buf, size_t buflen, rmr_map_filter filter); +int rmr_pool_save_map(struct rmr_pool *pool, void *buf, size_t buflen, + bool test_only); + +static inline void rmr_pool_update_no_of_chunk(struct rmr_pool *pool) +{ + u64 calc_no_of_chunks = 0, old_no_of_chunks = pool->no_of_chunks; + + /* + * In include/linux/types.h + * + * "Linux always considers sectors to be 512 (SECTOR_SHIFT==9) bytes long independently + * of the devices real block size." + * + * mapped_size is saved in sectors. + */ + if (pool->mapped_size) { + calc_no_of_chunks = (pool->mapped_size >> (pool->chunk_size_shift - 9)); + + if (pool->chunk_size && + (pool->mapped_size << 9) % pool->chunk_size) + calc_no_of_chunks += 1; + } + + if (calc_no_of_chunks != pool->no_of_chunks) { + pool->no_of_chunks = calc_no_of_chunks; + pr_info("%s: For %s, no_of_chunks old (%llu), updated %llu\n", + __func__, pool->poolname, old_no_of_chunks, pool->no_of_chunks); + } +} + +/* + * rmr_pool_maps_append - Append a map to the dense maps array + * @pool: pool + * @map: map to add + * + * Context: Caller must hold maps_lock. + */ +static inline void rmr_pool_maps_append(struct rmr_pool *pool, + struct rmr_dirty_id_map *map) +{ + rcu_assign_pointer(pool->maps[pool->maps_cnt], map); + pool->maps_cnt++; +} + +/* + * rmr_pool_maps_swap_remove - Remove map at index @i using swap-with-last + * @pool: pool + * @i: index of the map in the map array to remove + * @map: the map being removed + * + * Description: + * Maintains the dense invariant: pool->maps[0:maps_cnt] has no NULL gaps. + * + * Context: Caller must hold maps_lock. 
+ */ +static inline void rmr_pool_maps_swap_remove(struct rmr_pool *pool, u8 i, + struct rmr_dirty_id_map *map) +{ + u8 last = pool->maps_cnt - 1; + + if (i != last) + rcu_assign_pointer(pool->maps[i], rcu_dereference_protected(pool->maps[last], + lockdep_is_held(&pool->maps_lock))); + + rcu_assign_pointer(pool->maps[last], NULL); + pool->maps_cnt--; +} + +static inline struct rmr_dirty_id_map *rmr_pool_find_map(struct rmr_pool *pool, u8 member_id) +{ + int i; + struct rmr_dirty_id_map *map; + struct rmr_dirty_id_map *res = NULL; + + rcu_read_lock(); + for (i = 0; i < pool->maps_cnt; i++) { + map = rcu_dereference(pool->maps[i]); + + if (WARN_ON(!map) || map->member_id != member_id) + continue; + + res = map; + break; + } + rcu_read_unlock(); + + return res; +} + +static inline int rmr_pool_remove_map(struct rmr_pool *pool, u8 member_id) +{ + int i; + struct rmr_dirty_id_map *mp; + struct rmr_dirty_id_map *map = NULL; + + pr_info("%s: pool %s is removing map for member_id %d\n", + __func__, pool->poolname, member_id); + + mutex_lock(&pool->maps_lock); + for (i = 0; i < pool->maps_cnt; i++) { + mp = rcu_dereference_protected(pool->maps[i], + lockdep_is_held(&pool->maps_lock)); + if (WARN_ON(!mp)) + continue; + if (mp->member_id == member_id) { + map = mp; + break; + } + } + + if (!map) { + mutex_unlock(&pool->maps_lock); + pr_err("%s: pool %s cannot find map for member_id %d\n", + __func__, pool->poolname, member_id); + return -EINVAL; + } + + /* Dirty map entries are also removed since the map no longer exists. */ + rmr_map_unset_dirty_all(map); + + rmr_pool_maps_swap_remove(pool, i, map); + synchronize_srcu(&pool->map_srcu); + + mutex_unlock(&pool->maps_lock); + + /* Free up the memory */ + rmr_map_destroy(map); + + return 0; +} + + +bool rmr_pool_change_state(struct rmr_pool *pool, enum rmr_pool_state new_state); + +void rmr_pool_confirm_inflight_ref(struct percpu_ref *ref); + +static inline u32 rmr_pool_hash(const char *poolname) +{ + return jhash(poolname, strlen(poolname), 0); +} + +#endif /* RMR_POOL_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-proto.h b/drivers/infiniband/ulp/rmr/rmr-proto.h new file mode 100644 index 000000000000..02c20ed76bef --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-proto.h @@ -0,0 +1,273 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_PROTO_H +#define RMR_PROTO_H + +#define RMR_PROTO_VER_MAJOR 0 +#define RMR_PROTO_VER_MINOR 1 + +#define RMR_PROTO_VER_STRING __stringify(RMR_PROTO_VER_MAJOR) "." \ + __stringify(RMR_PROTO_VER_MINOR) + +#ifndef RMR_VER_STRING +#define RMR_VER_STRING __stringify(RMR_PROTO_VER_MAJOR) "." 
\ + __stringify(RMR_PROTO_VER_MINOR) +#endif + +/* TODO: should be configurable */ +#define RTRS_PORT 1234 + +#define RMR_POOL_MAX_SESS 4 + +/** + * enum rmr_msg_types - RMR message types + * @RMR_MSG_JOIN_POOL: Join pool message from client to server + * @RMR_MSG_JOIN_POOL_RSP: Join pool messge response from server to client + * @RMR_MSG_LEAVE_POOL: Leave pool message from client to server + * @RMR_MSG_IO: IO(read/write) request on an object + */ +enum rmr_msg_type { + RMR_MSG_CMD, + RMR_MSG_CMD_RSP, + RMR_MSG_IO, + RMR_MSG_MD, + RMR_MSG_MAP_CLEAR, + RMR_MSG_MAP_ADD, +}; + +/** + * struct rmr_msg_hdr - header of RMR messages + * @type: Message type, valid values see: enum rmr_msg_types + */ +struct rmr_msg_hdr { + __le32 group_id; /* poolname jhash() */ + __le16 type; + __le16 __padding; +}; + +/** + * struct rmr_msg_io - message for object I/O read/write + * @hdr: message header + * @id_a: first 64bit of the object id + * @id_b: second 64bit of the object id + * @offset: offset from where to read/write + * @flags: bitmask, valid values are defined in enum rmr_io_flags + * @length: number of bytes for I/O read/write + * @pool_id: pool id to which the object belongs + */ +struct rmr_msg_io { + struct rmr_msg_hdr hdr; + __le64 id_a; + __le64 id_b; + + __le32 offset; + __le32 length; + __le32 flags; + __le16 prio; + + __le32 mem_id; + __le64 map_ver; + u8 failed_id[RMR_POOL_MAX_SESS]; + u8 failed_cnt; + + u8 member_id; + u8 sync; + u8 __padding[19]; //padding is not correct now i think +}; + +struct rmr_pool_member_info { + u8 no_of_stor; + + struct per_mem_info { + u8 member_id; + u8 c_dirty; + } p_mem_info[RMR_POOL_MAX_SESS]; +}; + +/** + * enum rmr_msg_cmd_types - RMR command types + * @RMR_CMD_MAP_READY: Get ready to receive map + * @RMR_CMD_MAP_SEND: Send map to certain node + * @RMR_CMD_MAP_DONE: Confirm map receipt + * + * When adding a command, + * make sure to add it to the function rmr_get_cmd_name. + */ +enum rmr_msg_cmd_type { + RMR_CMD_MAP_READY, // 0 + RMR_CMD_MAP_SEND, + RMR_CMD_SEND_MAP_BUF, + RMR_CMD_MAP_BUF_DONE, + RMR_CMD_MAP_DONE, + RMR_CMD_MAP_DISABLE, + RMR_CMD_READ_MAP_BUF, + RMR_CMD_MAP_CHECK, + RMR_CMD_LAST_IO_TO_MAP, + RMR_CMD_STORE_CHECK, + RMR_CMD_MAP_TEST, + /* sends the metadata of non-sync rmr-client to server */ + RMR_CMD_SEND_MD_BUF, + /*sends the message of discards to the node */ + RMR_CMD_SEND_DISCARD, + /* sends the message of md_update to the node; the node sends its srv_md back. 
*/ + RMR_CMD_MD_SEND, + + RMR_CMD_MAP_GET_VER, // 14 + RMR_CMD_MAP_SET_VER, + RMR_CMD_DISCARD_CLEAR_FLAG, + + /* + * Add map related commands above this + */ + RMR_MAP_CMD_MAX, + + RMR_CMD_POOL_INFO, // 18 + RMR_CMD_JOIN_POOL, + + RMR_CMD_REJOIN_POOL, + + RMR_CMD_LEAVE_POOL, + RMR_CMD_ENABLE_POOL, // 22 + + RMR_CMD_USER, + + /* + * Add pool related commands above this + */ + RMR_POOL_CMD_MAX, +}; + +struct rmr_msg_map_send_cmd { + u8 receiver_member_id; +}; + +struct rmr_msg_map_buf_cmd { + u64 version; + u8 map_idx; + u64 slp_idx; +}; + +struct rmr_msg_map_buf_done_cmd { + u64 map_version; +}; + +struct rmr_msg_map_done_cmd { + u8 enable; +}; + +struct rmr_msg_send_md_buf_cmd { + u8 sync; /* if the pool is sync or not */ + u8 sender_id; + u8 receiver_id; + u64 flags; +}; + +struct rmr_msg_send_discard_cmd { + u8 member_id; /* the storage node that discards all data */ +}; + +struct rmr_msg_md_send_cmd { + u64 src_mapped_size; /* the pool mapped size on the sending side */ + u8 sender_id; + u8 leader_id; + u8 read_full_md; /* 1 = return full pool_md; 0 = own entry only */ +}; + +struct rmr_msg_pool_info_cmd { + u8 member_id; + u8 operation; /* add/remove */ + u8 mode; /* For add -> create/assemble. For remove -> delete/disassemble */ + u8 dirty; /* Valid only when operation=ADD and mode=CREATE */ +}; + +enum rmr_pool_info_op { + RMR_POOL_INFO_OP_ADD = 0, + RMR_POOL_INFO_OP_REMOVE, +}; + +enum rmr_pool_info_mode { + RMR_POOL_INFO_MODE_CREATE = 0, + RMR_POOL_INFO_MODE_ASSEMBLE, + RMR_POOL_INFO_MODE_DELETE, + RMR_POOL_INFO_MODE_DISASSEMBLE, +}; + +struct rmr_msg_set_map_ver_cmd { + u8 map_ver; /* the map version to set */ +}; + +struct rmr_msg_join_pool_cmd { + u64 queue_depth; + u32 chunk_size; + struct rmr_pool_member_info mem_info; + u8 dirty; + u8 create; + u8 rejoin; +}; + +struct rmr_msg_leave_pool_cmd { + u8 member_id; + u8 delete; +}; + +struct rmr_msg_enable_pool_cmd { + u32 enable; +}; + +struct rmr_msg_user_cmd { + size_t usr_len; +}; + +struct rmr_msg_join_pool_cmd_rsp { + u64 mapped_size; + u32 chunk_size; +}; + +struct rmr_msg_pool_cmd { + struct rmr_msg_hdr hdr; + u8 ver; + u8 cmd_type; + u8 sync; + u8 rsvd[1]; + s8 pool_name[NAME_MAX]; + union { + struct rmr_msg_map_send_cmd map_send_cmd; + struct rmr_msg_map_buf_cmd map_buf_cmd; + struct rmr_msg_map_buf_done_cmd map_buf_done_cmd; + struct rmr_msg_map_done_cmd map_done_cmd; + + struct rmr_msg_send_md_buf_cmd send_md_buf_cmd; + struct rmr_msg_send_discard_cmd send_discard_cmd; + struct rmr_msg_md_send_cmd md_send_cmd; + + struct rmr_msg_pool_info_cmd pool_info_cmd; + + struct rmr_msg_set_map_ver_cmd set_map_ver_cmd; + + struct rmr_msg_join_pool_cmd join_pool_cmd; + + struct rmr_msg_leave_pool_cmd leave_pool_cmd; + struct rmr_msg_enable_pool_cmd enable_pool_cmd; + + struct rmr_msg_user_cmd user_cmd; + }; +}; + +struct rmr_msg_pool_cmd_rsp { + struct rmr_msg_hdr hdr; + enum rmr_msg_cmd_type cmd_type; + u8 err; + u8 ver; + u8 member_id; + union { + struct rmr_msg_join_pool_cmd_rsp join_pool_cmd_rsp; + u64 value; + }; +}; + +#endif /* RMR_PROTO_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-req.c b/drivers/infiniband/ulp/rmr/rmr-req.c new file mode 100644 index 000000000000..d748579c489c --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-req.c @@ -0,0 +1,796 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#include + +#include "rmr-req.h" +#include "rmr-srv.h" +#include "rmr-clt.h" + +extern struct kmem_cache *rmr_req_cachep; +extern struct 
kmem_cache *rmr_map_entry_cachep; +extern struct rmr_store_ops *pstore_ops; + +static void rmr_req_complete(struct rmr_srv_req *req); +static void rmr_req_store_done(struct rmr_srv_req *req); +static void rmr_req_sync_failed(struct rmr_srv_req *req); +static void rmr_req_send_map_clear(struct rmr_srv_req *req); +static void rmr_req_sync_complete(struct rmr_srv_req *req); +static void rmr_req_store(struct rmr_srv_req *req); + +/** + * rmr_srv_req_resp - Response from the lower level module + * + * @req: Request to be processed + * @err: Error value + * + * Description: + * This function is the return point from the below module + * where IO is submitted. + * + * Context: + * In this function the request should always be in state RMR_REQ_STATE_STORE + */ +void rmr_srv_req_resp(struct rmr_srv_req *req, int err) +{ + /* + * Use the error sent from lower layer + */ + req->err = err; + + /* + * For Normal (non-sync) requests we handle both non-error and error cases from one + * place. Since its simple. + */ + if (rmr_op(req->flags) != RMR_OP_SYNCREQ) { + rmr_req_complete(req); + return; + } + + /* + * Sync requests are complicated, since it needs extra post-processing + * once IO is done for us. + * + * 1) In case of no failure, we need to send map clear to other nodes, + * since they think we are still dirty for this chunk. + * + * 2) We need to check for waiting IO in entry->wait_list, and kick them. + */ + if (!req->err) + rmr_req_store_done(req); + else + rmr_req_sync_failed(req); +} +EXPORT_SYMBOL(rmr_srv_req_resp); + +/** + * rmr_srv_req_create - Create an rmr server request + * + * @msg: IO message containing information + * @srv_pool: Server pool creating this request + * @rtrs_op: rtrs IO context + * @data: pointer to data buf + * @datalen: len of data buf + * @endreq: Function to be called at the end of rmr request processing + * + * Description: + * RMR server request are base structures which holds the IO while they are being processed. + * They go through a state machine, while a number of checks are done. IOs which are + * destined for a chunk that is dirty, are paused while that chunk is synced. 
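To see where rmr_srv_req_resp() fits, consider a hypothetical backing-store driver: it keeps the request pointer it received through submit_req() and hands it back once its own IO completes. This is only an illustrative sketch of that contract; the handler name and ctx argument are made up:

/*
 * Hypothetical completion handler in a backing-store implementation.
 * ctx is the struct rmr_srv_req * that was passed to submit_req().
 */
static void sketch_store_io_done(void *ctx, int error)
{
	struct rmr_srv_req *req = ctx;

	/*
	 * rmr_srv_req_resp() completes normal requests directly and runs
	 * the extra post-processing for RMR_OP_SYNCREQ requests.
	 */
	rmr_srv_req_resp(req, error);
}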
+ * + * Return: + * Pointer to the create rmr server request on success + * Error pointer on failure + */ +struct rmr_srv_req *rmr_srv_req_create(const struct rmr_msg_io *msg, struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, void *data, u32 datalen, + void (*endreq)(struct rmr_srv_req *, int)) +{ + struct rmr_srv_req *req; + struct rmr_srv_io_store *store = srv_pool->io_store; + int i; + + if (!store || !atomic_read(&srv_pool->store_state)) { + pr_err("%s: store not set, or srv_pool not in correct state %s\n", + __func__, srv_pool->pool->poolname); + return ERR_PTR(-ENODEV); + } + + req = kmem_cache_zalloc(rmr_req_cachep, GFP_KERNEL); + if (!req) { + pr_err("cannot allocate memory for rmr_req.\n"); + return ERR_PTR(-ENOMEM); + } + + req->id.a = le64_to_cpu(msg->id_a); + req->id.b = le64_to_cpu(msg->id_b); + + req->offset = le32_to_cpu(msg->offset); + req->length = le32_to_cpu(msg->length); + req->flags = le32_to_cpu(msg->flags); + req->prio = le16_to_cpu(msg->prio); + + req->mem_id = le32_to_cpu(msg->mem_id); + for (i = 0; i < msg->failed_cnt; i++) + req->failed_srv_id[i] = msg->failed_id[i]; + + req->failed_cnt = msg->failed_cnt; + req->map_ver = le64_to_cpu(msg->map_ver); + req->sync = msg->sync; + + req->data = data; + req->datalen = datalen; + req->rtrs_op = rtrs_op; + req->srv_pool = srv_pool; + req->store = store; + req->endreq = endreq; + + pr_debug("req %p, chunk_size %u\n", req, req->srv_pool->pool->chunk_size); + + return req; +} + +struct rmr_srv_req *rmr_srv_md_req_create(struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, void *data, + u32 offset, u32 len, unsigned long flags, + void (*endreq)(struct rmr_srv_req *, int)) +{ + struct rmr_srv_req *req; + struct rmr_srv_io_store *store = srv_pool->io_store; + + if (!store) { + pr_err("No store_id registered for srv pool %s\n", srv_pool->pool->poolname); + return ERR_PTR(-ENODEV); + } + + req = kmem_cache_zalloc(rmr_req_cachep, GFP_KERNEL); + if (!req) { + pr_err("cannot allocate memory for rmr_req.\n"); + return ERR_PTR(-ENOMEM); + } + req->offset = offset; + req->length = len; + req->flags = flags; + req->sync = false; /* A md req is always non-sync */ + + req->data = data; + req->rtrs_op = rtrs_op; + req->srv_pool = srv_pool; + req->store = store; + req->endreq = endreq; + + pr_debug("md req %p, len %u\n", req, len); + + return req; +} + +void rmr_req_submit(struct rmr_srv_req *req); +static void rmr_req_sched(struct work_struct *work) +{ + struct rmr_srv_req *req = container_of(work, struct rmr_srv_req, work); + + pr_debug("scheduled work process for req %p\n", req); + if (req->err) + rmr_req_complete(req); + else + rmr_req_submit(req); +} + +void rmr_process_wait_list(struct rmr_map_entry *entry, int err) +{ + struct llist_node *first, *next; + struct rmr_srv_req *req; + + pr_debug("processing wait list for entry %p, sync_cnt=%d\n", + entry, atomic_read(&entry->sync_cnt)); + + WARN_ON(atomic_read(&entry->sync_cnt) > 0); + + while (!llist_empty(&entry->wait_list)) { + first = llist_del_all(&entry->wait_list); + while (first) { + next = first->next; + req = llist_entry(first, struct rmr_srv_req, node); + + pr_debug("process waiting req %p id (%llu, %llu) flags %u\n", + req, req->id.a, req->id.b, req->flags); + if (err) { + pr_err("fail waiting req %p id (%llu, %llu) flags %u err %d\n", + req, req->id.a, req->id.b, req->flags, err); + req->err = -EIO; + } + + pr_debug("schedule processing req %p with err %d\n", req, req->err); + INIT_WORK(&req->work, rmr_req_sched); + schedule_work(&req->work); + + first = 
next; + } + } +} + +void rmr_req_submit(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_map_entry *entry; + struct rmr_dirty_id_map *map; + + if (rmr_op(req->flags) == RMR_OP_FLUSH && !req->length) { + rmr_req_store(req); + return; + } + + pr_debug("check map for req %p flag %u request id [%llu, %llu] offset %u length %u\n", + req, req->flags, + req->id.a, req->id.b, req->offset, req->length); + + map = rmr_pool_find_map(srv_pool->pool, srv_pool->member_id); + if (!map) { + pr_err("no map found for pool_id %u\n", srv_pool->member_id); + req->err = -EINVAL; + goto err; + } + + rcu_read_lock(); + entry = rmr_map_get_dirty_entry(map, req->id); + if (!entry) { + /* + * The chunk containing data for this req is NOT dirty for us + */ + pr_debug("check map for req %p flags %u request id [%llu, %llu], no entry in the map\n", + req, req->flags, req->id.a, req->id.b); + rcu_read_unlock(); + rmr_req_store(req); + return; + } else { + /* + * The chunk for this data is dirty for us. + * + * we have 2 cases. + * + * 1) Its coming from a sync rmr-clt (Its an internal read). + * Then, fail the IO, since we do not want to end up in a deadlock, + * or go through multiple hops for a single read. The sender can try some other + * node itself. + */ + if (req->sync) { + WARN_ON(rmr_op(req->flags) != RMR_OP_READ); + rcu_read_unlock(); + req->err = -EIO; + goto err; + } + + /* + * 2) If its coming from a non-sync rmr-clt, + * simply go ahead with syncing the data first. + */ + llist_add(&req->node, &entry->wait_list); + pr_debug("%s: req %p flags %u id (%llu %llu) added to wait list. sync_cnt %d\n", + __func__, req, req->flags, req->id.a, req->id.b, + atomic_read(&entry->sync_cnt)); + + rcu_read_unlock(); + /* + * If we are the first who grabs the entry then start sync. + * + * Otherwise, the one syncing the data would pick us up from the entry->wait_list + * and kick us. So simply exit for now. 
+ */ + if (atomic_cmpxchg(&entry->sync_cnt, -1, 0) == -1) { + int err; + + req->priv = entry; + err = rmr_srv_sync_chunk_id(srv_pool, entry, req->id, false); + if (err) { + atomic_set(&entry->sync_cnt, -1); + rmr_process_wait_list(entry, err); + } + } + } + + return; + +err: + rmr_req_complete(req); +} + +static void rmr_req_store(struct rmr_srv_req *req) +{ + int err; + + pr_debug("submit to store req %p flags %u request id [%llu, %llu] offset %u length %u\n", + req, req->flags, + req->id.a, req->id.b, req->offset, req->length); + + err = req->store->ops->submit_req(req->store->priv, req->data, req->offset, + req->length, req->flags, req->prio, req); + if (err) { + pr_err("%s: error submitting req %p, err %d\n", __func__, req, err); + req->err = err; + if (rmr_op(req->flags) == RMR_OP_SYNCREQ) + rmr_req_sync_failed(req); + else + rmr_req_complete(req); + } +} + +static void rmr_md_req_store(struct rmr_srv_req *req) +{ + int err; + + err = req->store->ops->submit_md_req(req->store->priv, req->data, req->offset, req->length, + req->flags, req); + if (err) { + req->endreq(req, err); + pr_err("release md req %p, flags %u\n", req, req->flags); + kmem_cache_free(rmr_req_cachep, req); + } +} + +/* md req submission path*/ +void rmr_md_req_submit(struct rmr_srv_req *req) +{ + rmr_md_req_store(req); +} + +static void rmr_req_sched_store(struct work_struct *work) +{ + struct rmr_srv_req *req = container_of(work, struct rmr_srv_req, work); + + pr_debug("scheduled store for req %p\n", req); + rmr_req_store(req); +} + +static void rmr_req_remote_io_done(void *priv, int err) +{ + struct rmr_srv_req *req = priv; + + pr_debug("called for req %p, err code %d\n", req, err); + + rmr_clt_put_iu(req->srv_pool->clt, req->iu); + + if (err) { + req->err = err; + rmr_req_sync_failed(req); + return; + } + + pr_debug("schedule store for req %p with err %d\n", req, req->err); + INIT_WORK(&req->work, rmr_req_sched_store); + schedule_work(&req->work); +} + +static void rmr_req_remote_read(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_pool *clt = srv_pool->clt; + unsigned long flags; + int err; + + pr_debug("redirecting req id (%llu, %llu)\n", + req->id.a, req->id.b); + if (!clt) { + pr_err("No srv pool assigned for redirect for %s\n", srv_pool->pool->poolname); + err = -EINVAL; + goto err; + } + + if (rmr_op(req->flags) == RMR_OP_SYNCREQ) + flags = RMR_OP_READ; + else + flags = req->flags; + + req->iu = rmr_clt_get_iu(clt, flags, WAIT); + if (IS_ERR_OR_NULL(req->iu)) { + pr_err("Failed to get rmr_iu for req id (%llu, %llu)\n", + req->id.a, req->id.b); + err = -EINVAL; + goto err; + } + + sg_init_one(&req->sg, req->data, req->datalen); + + pr_debug("After sg_init_one nents=%d\n", sg_nents(&req->sg)); + + /* look at the flags here! 
*/ + err = rmr_clt_request(clt, req->iu, req->offset, req->length, flags, + req->prio, req, rmr_req_remote_io_done, + &req->sg, sg_nents(&req->sg)); + if (err) { + pr_err("rmr_clt_request error %d\n", err); + rmr_clt_put_iu(clt, req->iu); + err = -EREMOTEIO; + goto err; + } + + pr_debug("remote read submitted\n"); + return; + +err: + req->err = err; + rmr_req_sync_failed(req); +} + +static void rmr_sync_req_sched(struct work_struct *work) +{ + struct rmr_srv_req *req = container_of(work, struct rmr_srv_req, work); + + pr_debug("scheduled work process for req %p\n", req); + if (req->err) + rmr_req_sync_complete(req); + else + rmr_req_send_map_clear(req); +} + +static void rmr_req_complete(struct rmr_srv_req *req) +{ + pr_debug("send completeion for req %p flags %u request id (%llu, %llu) offset %u length %u err %d\n", + req, req->flags, + req->id.a, req->id.b, req->offset, req->length, req->err); + + /* endreq() records the Last IO buffer accordingly. */ + req->endreq(req, req->err); + + pr_debug("release req %p, flags %u\n", req, req->flags); + + kmem_cache_free(rmr_req_cachep, req); +} + +static struct rmr_srv_req *rmr_req_create_sync_req(struct rmr_srv_pool *srv_pool, rmr_id_t id, + u32 offset, u32 len, bool from_sync, + struct rmr_srv_req *parent) +{ + struct rmr_srv_req *req; + struct rmr_srv_io_store *store = srv_pool->io_store; + + if (!store) { + pr_err("No store_id registered for srv pool %s\n", srv_pool->pool->poolname); + return ERR_PTR(-ENODEV); + } + + req = kmem_cache_zalloc(rmr_req_cachep, GFP_KERNEL); + if (!req) { + pr_err("cannot allocate memory for rmr_req.\n"); + return ERR_PTR(-ENOMEM); + } + req->id.a = id.a; + req->id.b = id.b; + req->flags = RMR_OP_SYNCREQ; + req->length = len; + req->offset = offset; + req->srv_pool = srv_pool; + req->store = store; + req->from_sync = from_sync; + + if (parent) { + req->data = parent->data + offset; + } else { + req->data = kmalloc(req->length, GFP_KERNEL); + if (!req->data) { + pr_err("cannot allocate memory for sync req id [%llu, %llu]\n", + req->id.a, req->id.b); + kmem_cache_free(rmr_req_cachep, req); + return ERR_PTR(-ENOMEM); + } + } + req->datalen = len; + req->parent = parent; + + pr_debug("sync req %p created, flags %u request id (%llu, %llu) offset %u length %u parent %p\n", + req, req->flags, req->id.a, req->id.b, req->offset, req->length, parent); + + return req; +} + +//should be called only if corresponding map entry has 0 sync cnt +int rmr_srv_sync_chunk_id(struct rmr_srv_pool *srv_pool, struct rmr_map_entry *entry, + rmr_id_t id, bool from_sync) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_dirty_id_map *map; + struct rmr_srv_req *parent_req; + u32 max_io_size, total_len, offset; + + if (!srv_pool->clt) { + pr_err("For pool %s no sync pool assigned.\n", pool->poolname); + return -EINVAL; + } + max_io_size = srv_pool->max_sync_io_size; + + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("no map found for pool_id %u\n", srv_pool->member_id); + //TODO: handle this , probably initialize map, or just throw err? + return -EINVAL; + } + + offset = CHUNK_TO_OFFSET(id.b, pool->chunk_size_shift); + total_len = pool->chunk_size; + + pr_debug("pool %s sync id (%llu, %llu), total_len %u, max_io_size %u\n", + pool->poolname, id.a, id.b, total_len, max_io_size); + + /* + * The parent_req starts with total_len, then get decremented in loop below. + * The child reqs are filled one by one from end to second. + * + * Maybe refactor this to a simple loop? 
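The tail-first carving performed below is easier to follow with concrete numbers. The following userspace sketch only mirrors the offset/length arithmetic; the chunk offset, chunk size and max_io_size values are made up for illustration:

#include <stdio.h>

int main(void)
{
	unsigned int chunk_offset = 0;			/* disk offset of the chunk */
	unsigned int remaining = 1024 * 1024;		/* chunk_size: the parent starts with it all */
	const unsigned int max_io_size = 256 * 1024;	/* per-IO limit of the sync path */

	/* Children are carved from the tail of the chunk... */
	while (remaining > max_io_size) {
		printf("child : offset %7u len %u\n",
		       chunk_offset + (remaining - max_io_size), max_io_size);
		remaining -= max_io_size;
	}
	/* ...and the parent request keeps whatever is left at the front. */
	printf("parent: offset %7u len %u\n", chunk_offset, remaining);
	return 0;
}

For a 1MB chunk and 256KB max_io_size this yields three child requests at offsets 786432, 524288 and 262144, with the parent keeping the first 256KB at offset 0.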
+ */ + parent_req = rmr_req_create_sync_req(srv_pool, id, offset, total_len, from_sync, NULL); + if (IS_ERR_OR_NULL(parent_req)) { + pr_err("pool %s failed to create main sync req to sync id (%llu, %llu)\n", + pool->poolname, id.a, id.b); + return -ENOMEM; + } + parent_req->priv = entry; + + if (from_sync) { + if (rmr_srv_get_sync_permit(srv_pool)) { + pr_err("rmr_srv_sync_chunk_id failed to acquire permit for parent\n"); + kfree(parent_req->data); + kmem_cache_free(rmr_req_cachep, parent_req); + + return -EINVAL; + } + } + + // inc ref cnt for parent_req + map_entry_get_sync(entry); + while (parent_req->length > max_io_size) { + struct rmr_srv_req *req; + u32 child_offset = offset + (parent_req->length - max_io_size); + + // submit req + req = rmr_req_create_sync_req(srv_pool, id, (parent_req->length - max_io_size), + max_io_size, from_sync, parent_req); + if (IS_ERR_OR_NULL(req)) { + pr_err("%s: Pool %s, id (%llu, %llu), offset %u, len %u, err %ld\n", + __func__, pool->poolname, id.a, id.b, + (parent_req->length - max_io_size), max_io_size, PTR_ERR(req)); + parent_req->err = PTR_ERR(req); + + rmr_req_sync_failed(parent_req); + return -EINVAL; + } + + /* + * The offset sent to rmr_req_create_sync_req for this req is in context of the + * chunk. But the real offset for this req in the disk is this. + */ + req->offset = child_offset; + + if (from_sync) { + if (rmr_srv_get_sync_permit(srv_pool)) { + pr_err("rmr_srv_sync_chunk_id failed to acquire permit for child\n"); + kmem_cache_free(rmr_req_cachep, req); + + parent_req->err = -EBUSY; + rmr_req_sync_failed(parent_req); + return -EINVAL; + } + } + + // inc ref cnt for the child req just created + map_entry_get_sync(entry); + req->priv = entry; + rmr_req_remote_read(req); + + parent_req->length -= max_io_size; + parent_req->datalen -= max_io_size; + } + + //submit parent req + rmr_req_remote_read(parent_req); + + return 0; +} + +static void __release_parent_req(struct rcu_head *head) +{ + struct rmr_srv_req *req = container_of(head, struct rmr_srv_req, rcu); + struct rmr_map_entry *entry = req->priv; + + pr_debug("is called for req=%p id=(%llu,%llu) err=%d, entry=%p\n", + req, req->id.a, req->id.b, req->err, entry); + + kfree(req->data); + + //may be now we can stop saving entry in req->priv, but always rmr_map_find it + if (!req->err) { + pr_debug("req %p, completed all sync req, lets clean map\n", req); + rmr_process_wait_list(entry, 0); + } else { + pr_debug("req %p completed with err %d, process wait list\n", + req, req->err); + + /* sync of this entry failed, we reset the sync_cnt so that the other req + * or sync thread could try again in the future. Without resetting, no one + * could get the ref and start sync again. + */ + atomic_set(&entry->sync_cnt, -1); + rmr_process_wait_list(entry, req->err); + } + + pr_debug("free entry %p for req %p\n", entry, req); + kmem_cache_free(rmr_map_entry_cachep, entry); + + if (req->from_sync) + rmr_srv_put_sync_permit(req->srv_pool); + + kmem_cache_free(rmr_req_cachep, req); +} + +static void rmr_req_sync_complete(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_dirty_id_map *map; + int lock_idx; + + pr_debug("sync_req %p completed for id (%llu, %llu), offset %u, len %u, err %d, from sync %d\n", + req, req->id.a, req->id.b, req->offset, req->length, + req->err, req->from_sync); + + if (req->err) + rmr_srv_sync_req_failed(req->srv_pool); + + pr_debug("release sync req %p, flags %u\n", req, req->flags); + + /* + * Only parent sync req own the allocated data. 
+ */ + if (!req->parent) { + if (!req->err) { + map = rmr_pool_find_map(srv_pool->pool, + srv_pool->member_id); + if (map) { + lock_idx = srcu_read_lock(&srv_pool->pool->map_srcu); + rmr_map_unset_dirty(map, req->id, + MAP_NO_FILTER); + srcu_read_unlock(&srv_pool->pool->map_srcu, lock_idx); + } else { + pr_err("no map found for pool_id %u\n", srv_pool->member_id); + req->err = -EINVAL; + } + } + + pr_debug("req %p, completed all sync req, lets clean map\n", + req); + call_rcu(&req->rcu, __release_parent_req); + } else { + /* + * Child req has nothing to do but put permit and free + */ + if (req->from_sync) + rmr_srv_put_sync_permit(req->srv_pool); + + kmem_cache_free(rmr_req_cachep, req); + } +} + +static void rmr_req_sync_failed(struct rmr_srv_req *req) +{ + rmr_srv_sync_req_failed(req->srv_pool); + + pr_err("pool %s sync req %p failed for id (%llu, %llu), offset %u, len %u, err %d\n", + req->srv_pool->pool->poolname, req, req->id.a, req->id.b, + req->offset, req->length, req->err); + + rmr_req_store_done(req); +} + +// this is actually very like rmr_req_remote_io_done but without rmr_clt_put_iu +// do we want to have one function for both cases? +static void rmr_req_map_clear_done(void *priv, int err) +{ + struct rmr_srv_req *req = priv; + + rmr_clt_put_iu(req->srv_pool->clt, req->iu); + + pr_debug("called for req %p, err code %d\n", req, err); + if (err) + pr_err("pool %s, sync req with id (%llu, %llu) failed to send map clear\n", + req->srv_pool->pool->poolname, req->id.a, req->id.b); + + rmr_req_sync_complete(req); +} + +static void rmr_req_store_done(struct rmr_srv_req *req) +{ + struct rmr_map_entry *entry = req->priv; + struct rmr_srv_req *parent_req = NULL; + + pr_debug("called for req %p id (%llu, %llu ) offset %u len %u with parent req %p\n", + req, req->id.a, req->id.b, req->offset, req->length, req->parent); + + if (req->parent) + parent_req = req->parent; + else + parent_req = req; + + if (req->err) + parent_req->err = req->err; + + if (map_entry_put_sync(entry)) { + pr_debug("%s: for entry %p id (%llu, %llu) all sync req done.\n", __func__, + entry, req->id.a, req->id.b); + + /* We have to schedule the work of parent req from here since we are in the + * interrupt context of either parent req or child req + */ + pr_debug("%s: process parent_req %p\n", __func__, parent_req); + INIT_WORK(&parent_req->work, rmr_sync_req_sched); + schedule_work(&parent_req->work); + } + + if (req != parent_req) { + pr_debug("completing req %p with err %d\n", req, req->err); + rmr_req_sync_complete(req); + } +} + +static void rmr_req_send_map_clear(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_pool *pool = srv_pool->clt; + struct rmr_iu *iu; + int err; + + if (!pool) { + pr_err("Cannot send map clear. No pool client assigend for srv pool %s\n", + req->srv_pool->pool->poolname); + req->err = -EINVAL; + goto err; + } + + /* + * We try to clear map, but if we fail to, we simply ignore the error. + * Such zombie entries will be clear by rmr_srv_check_map_clear. + */ + iu = rmr_clt_get_iu(pool, RMR_OP_WRITE, WAIT); + if (IS_ERR_OR_NULL(iu)) { + pr_err("Failed to get rmr_iu for req id (%llu, %llu)\n", + req->id.a, req->id.b); + goto err; + } + + pr_debug("send map clear req id (%llu, %llu), member_id %u\n", + req->id.a, req->id.b, srv_pool->member_id); + + /* + * For MAP_CLEAR, we only need rmr_id_t for chunk number, + * and our member_id to say to clear the above chunk number for ths storage node. 
+ * + * We also update the minimum members needed for map update. + */ + iu->msg.hdr.group_id = cpu_to_le32(pool->group_id); + iu->msg.hdr.type = cpu_to_le16(RMR_MSG_MAP_CLEAR); + iu->msg.hdr.__padding = 0; + + iu->msg.id_a = cpu_to_le64(req->id.a); + iu->msg.id_b = cpu_to_le64(req->id.b); + iu->msg.member_id = srv_pool->member_id; + + iu->msg.flags = cpu_to_le32(RMR_OP_WRITE); + + iu->conf = rmr_req_map_clear_done; + iu->priv = req; + + req->iu = iu; + + err = rmr_clt_send_map_update(pool, req->iu); + if (err) { + pr_err("%s error %d\n", __func__, err); + rmr_clt_put_iu(pool, req->iu); + goto err; + } + + pr_debug("send map clear submitted\n"); + return; + +err: + rmr_req_sync_complete(req); +} diff --git a/drivers/infiniband/ulp/rmr/rmr-req.h b/drivers/infiniband/ulp/rmr/rmr-req.h new file mode 100644 index 000000000000..8f15b36fe480 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-req.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_REQ_H +#define RMR_REQ_H + +#include "rmr-pool.h" + +struct rmr_srv_req { + struct rmr_srv_pool *srv_pool; + rmr_id_t id; + + u32 offset; + u32 length; + u32 flags; + u16 prio; + + u32 mem_id; + struct rtrs_srv_op *rtrs_op; + struct rmr_srv_io_store *store; + void *data; + u32 datalen; //TODO: what is the difference between lenghth? + void (*endreq)(struct rmr_srv_req *, int err); + struct work_struct work; + int err; + u8 failed_cnt; + u8 failed_srv_id[RMR_POOL_MAX_SESS]; + u64 map_ver; + void *priv; + struct llist_node node; + bool from_sync; + struct scatterlist sg; + struct rmr_iu *iu; + struct rmr_srv_req *parent; + bool sync; + struct rcu_head rcu; +}; + +struct rmr_srv_req *rmr_srv_req_create(const struct rmr_msg_io *msg, + struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, + void *data, u32 datalen, + void (*endreq)(struct rmr_srv_req *, int)); +struct rmr_srv_req *rmr_srv_md_req_create(struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, void *data, + u32 offset, u32 len, unsigned long flags, + void (*endreq)(struct rmr_srv_req *, int)); +void rmr_req_submit(struct rmr_srv_req *req); +void rmr_md_req_submit(struct rmr_srv_req *req); +void rmr_srv_req_resp(struct rmr_srv_req *req, int err); +void rmr_srv_md_req_resp(struct rmr_srv_req *req, int err); +int rmr_srv_sync_chunk_id(struct rmr_srv_pool *srv_pool, struct rmr_map_entry *entry, + rmr_id_t id, bool from_sync); + +void rmr_process_wait_list(struct rmr_map_entry *entry, int err); + +struct rmr_map_entry_info { + rmr_id_t id; + u8 srv_id; +}; +#endif /* RMR_REQ_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-srv-md.c b/drivers/infiniband/ulp/rmr/rmr-srv-md.c new file mode 100644 index 000000000000..9dab71a810b8 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-srv-md.c @@ -0,0 +1,764 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) — server metadata subsystem + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include + +#include "rmr-srv.h" +#include "rmr-req.h" +#include "rmr-clt.h" + +/** + * process_md_io() - Process medata IO message + * + * @pool: the pool where requests go through + * @rtrs_op: rtrs IO context + * @offset: offset in bytes relative to rmr metadata. 
+ * @len: length of the buffer in bytes + * @flags: indicates metadata IO options + * @buf: pointer to metadata buffer + * + * Return: + * 0 on success + * + * Description: + * All metadata IOs go through this function to submit requests to block device. The offset it + * passes on is relative to bytes shifting on rmr medata which is composed of a header + * structure for pool metadata, bitmap and last_io array. + */ +int process_md_io(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, u32 offset, u32 len, + unsigned long flags, void *buf) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_srv_req *req; + int err = 0; + + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!percpu_ref_tryget_live(&pool->ids_inflight_ref)) { + err = -EIO; + goto no_put; + } + + req = rmr_srv_md_req_create(srv_pool, rtrs_op, buf, offset, len, flags, rmr_srv_endreq); + if (IS_ERR(req)) { + pr_err("Failed to create rmr_req %pe\n", req); + err = PTR_ERR(req); + goto put_pool; + } + + rmr_md_req_submit(req); + return 0; + +put_pool: + percpu_ref_put(&pool->ids_inflight_ref); +no_put: + return err; +} + +int rmr_srv_read_md(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, u32 offset, u32 len, + struct rmr_pool_md *pool_md_page) +{ + /* pool_md is pre-allocated */ + return process_md_io(pool, rtrs_op, offset, len, RMR_OP_MD_READ, pool_md_page); +} + +static int rmr_srv_load_last_io(struct rmr_srv_pool *srv_pool) +{ + void *buf; + u64 offset, len; + struct rmr_pool *pool = srv_pool->pool; + struct rmr_pool_md *pool_md = &pool->pool_md; + int err = 0; + + if (!pool_md->queue_depth) { + pr_err("%s: pool %s has zero queue_depth\n", + __func__, pool->poolname); + return -EINVAL; + } + offset = RMR_LAST_IO_OFFSET; + len = rmr_last_io_len(pool_md->queue_depth); + + if (!srv_pool->last_io_idx) { + srv_pool->last_io_idx = kcalloc(pool_md->queue_depth, + sizeof(*srv_pool->last_io_idx), GFP_KERNEL); + if (!srv_pool->last_io_idx) + return -ENOMEM; + } + + buf = kzalloc(len, GFP_KERNEL); + if (!buf) { + err = -ENOMEM; + return err; + } + + err = rmr_srv_read_md(pool, NULL, offset, len, buf); + if (err) { + pr_err("%s: failed to read last_io buffer of len %lld at offset %lld\n", + __func__, len, offset); + goto free_buf; + } + memcpy(srv_pool->last_io_idx, (rmr_id_t *)buf, len); + +free_buf: + kfree(buf); + return err; +} + +/** + * rmr_srv_md_maps_sync - Sync dirty maps to persistent storage + * + * Description: + * Writes maps in two passes to the map-related regions of the on-disk layout: + * + * Pass 1 — hdr_region (single PAGE_SIZE write at RMR_MD_SIZE + last_io_len): + * Fills one rmr_map_cbuf_hdr slot per map_idx in [0:maps_cnt]. + * The buffer is kzalloc'd, so slots beyond maps_cnt are zero. + * The entire PAGE_SIZE region is issued as a single I/O. + * + * Pass 2 — maps_region (slp pages at computed offsets after hdr_region): + * Each map's data offset = map_region_offset + map_idx * per_map_size. + * pool->maps[0:maps_cnt] is always dense (no NULL gaps). 
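The region offsets used by the two passes follow directly from the helpers in rmr-pool.h. A rough sketch of the offset math, with queue_depth, total_slp and the function name chosen purely as examples:

/*
 * Sketch only: where map_idx's data pages start on disk, given the layout
 * described above (pool_md page | last_io array | header region | per-map
 * SLP pages).
 */
static u64 sketch_map_data_offset(u32 queue_depth, u64 total_slp, u8 map_idx)
{
	u64 hdr_region_offset = rmr_bitmap_offset(queue_depth);
	u64 per_map_size = total_slp * PAGE_SIZE;

	return hdr_region_offset + RMR_MAP_BUF_HDR_SIZE + map_idx * per_map_size;
}

For map_idx 0 this reduces to rmr_bitmap_offset(queue_depth) + RMR_MAP_BUF_HDR_SIZE, i.e. the first data page directly follows the header region.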
+ */ +void rmr_srv_md_maps_sync(struct rmr_pool *pool) +{ + struct rmr_map_cbuf_hdr *map_cbuf_hdr; + struct rmr_dirty_id_map *map = NULL; + u32 hdr_region_offset = rmr_bitmap_offset(pool->pool_md.queue_depth); + u32 map_region_offset = hdr_region_offset + RMR_MAP_BUF_HDR_SIZE; + u64 per_map_size = 0; + int err, lock_idx; + void *buf; + u8 map_idx; + + buf = kzalloc(RMR_MAP_BUF_HDR_SIZE, GFP_KERNEL); + if (!buf) + return; + + lock_idx = srcu_read_lock(&pool->map_srcu); + + /* Fill the header region: one slot per active map */ + for (map_idx = 0; map_idx < pool->maps_cnt; map_idx++) { + map = rcu_dereference(pool->maps[map_idx]); + if (WARN_ON(!map)) + goto unlock; + + map_cbuf_hdr = buf + map_idx * sizeof(struct rmr_map_cbuf_hdr); + map_cbuf_hdr->version = RMR_MAP_FORMAT_VER; + map_cbuf_hdr->member_id = map->member_id; + map_cbuf_hdr->no_of_chunks = map->no_of_chunks; + map_cbuf_hdr->no_of_flp = map->no_of_flp; + map_cbuf_hdr->no_of_slp_in_last_flp = map->no_of_slp_in_last_flp; + map_cbuf_hdr->no_of_chunk_in_last_slp = map->no_of_chunk_in_last_slp; + map_cbuf_hdr->total_slp = map->total_slp; + per_map_size = map->total_slp * PAGE_SIZE; + } + + /* Write the entire header region as a single PAGE_SIZE I/O */ + err = process_md_io(pool, NULL, hdr_region_offset, + PAGE_SIZE, RMR_OP_MD_WRITE, buf); + if (err) { + pr_warn("%s: failed to write header region at 0x%x: %d\n", + __func__, hdr_region_offset, err); + goto unlock; + } + + if (WARN_ON(!per_map_size)) + goto unlock; + + /* Write each map's slp pages */ + for (map_idx = 0; map_idx < pool->maps_cnt; map_idx++) { + u32 map_data_offset; + el_flp *flp_ptr; + u64 no_of_slps; + void *slp; + int i, j; + + map = rcu_dereference(pool->maps[map_idx]); + if (WARN_ON(!map)) + break; + + map_data_offset = map_region_offset + map_idx * per_map_size; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (void *)(*flp_ptr); + + err = process_md_io(pool, NULL, map_data_offset, + PAGE_SIZE, RMR_OP_MD_WRITE, slp); + if (err) + pr_warn("%s: failed to write map slp at 0x%x: %d\n", + __func__, map_data_offset, err); + map_data_offset += PAGE_SIZE; + } + } + } + +unlock: + srcu_read_unlock(&pool->map_srcu, lock_idx); + kfree(buf); +} + +/** + * rmr_srv_refresh_md_maps - Restore maps from map buffers on disk + * + * Description: + * Reads back the maps written by rmr_srv_md_maps_sync(). Reads the hdr_region + * in a single I/O to obtain the per-map headers, then loads each present + * map's slp pages from maps_region: + * data offset = map_region_offset + map_idx * per_map_size + * Header slots 0..N-1 are active; remaining are zero (member_id == 0). 
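+ *
+ * Per-header-slot decision, sketched (this mirrors the checks below, it is
+ * not additional behaviour):
+ *
+ *	no in-memory map for hdr->member_id  -> create one, unpack it from disk
+ *	in-memory map exists but is empty    -> unpack it from disk
+ *	in-memory map exists and has entries -> keep it, skip the on-disk copy
+ *
+ * In every case the member is recorded in pool->stg_members.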
+ */ +static int rmr_srv_refresh_md_maps(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_map_cbuf_hdr *map_cbuf_hdr; + struct rmr_dirty_id_map *map = NULL; + u32 hdr_region_offset = rmr_bitmap_offset(pool->pool_md.queue_depth); + u32 map_region_offset = hdr_region_offset + RMR_MAP_BUF_HDR_SIZE; + int err = 0, lock_idx; + void *buf; + u8 map_idx, valid_nr = 0; + bool unpack; + + buf = kzalloc(RMR_MAP_BUF_HDR_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Read the entire header region in a single PAGE_SIZE I/O */ + err = rmr_srv_read_md(pool, NULL, hdr_region_offset, RMR_MAP_BUF_HDR_SIZE, buf); + if (err) { + pr_err("%s: failed to read header region at offset %u\n", + __func__, hdr_region_offset); + kfree(buf); + return err; + } + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (map_idx = 0; map_idx < RMR_POOL_MAX_SESS; map_idx++) { + u64 per_map_size; + u32 map_data_offset; + el_flp *flp_ptr; + u64 no_of_slps; + void *slp; + int i, j; + + map_cbuf_hdr = buf + map_idx * sizeof(struct rmr_map_cbuf_hdr); + pr_debug("%s: %llu %u %llu %llu %llu %llu %llu\n", __func__, + map_cbuf_hdr->version, + map_cbuf_hdr->member_id, + map_cbuf_hdr->no_of_chunks, + map_cbuf_hdr->no_of_flp, + map_cbuf_hdr->no_of_slp_in_last_flp, + map_cbuf_hdr->no_of_chunk_in_last_slp, + map_cbuf_hdr->total_slp); + + /* Empty slot: no more active maps beyond this point */ + if (!map_cbuf_hdr->member_id) + break; + valid_nr++; + + per_map_size = map_cbuf_hdr->total_slp * PAGE_SIZE; + map_data_offset = map_region_offset + map_idx * per_map_size; + + unpack = false; + /* + * The dirty map should be updated only when the one on disk is more updated. + * Such cases are as follows. + * 1) The dirty map does not exist in the pool. The map will be simply restored to + * the last version we have. + * 2) The dirty map of the pool is just created. If it has been updated, the one on + * disk is outdated. + */ + map = rmr_pool_find_map(pool, map_cbuf_hdr->member_id); + if (!map) { + map = rmr_map_create(pool, map_cbuf_hdr->member_id); + if (IS_ERR(map)) { + err = PTR_ERR(map); + pr_err("%s: pool %s, member_id %d failed to create map\n", + __func__, pool->poolname, map_cbuf_hdr->member_id); + goto unlock; + } + unpack = true; + } else if (rmr_map_empty(map)) { + unpack = true; + } + + if (map->no_of_chunks != map_cbuf_hdr->no_of_chunks || + map->no_of_flp != map_cbuf_hdr->no_of_flp || + map->no_of_slp_in_last_flp != map_cbuf_hdr->no_of_slp_in_last_flp || + map->no_of_chunk_in_last_slp != map_cbuf_hdr->no_of_chunk_in_last_slp || + map->total_slp != map_cbuf_hdr->total_slp) { + pr_err("%s: Sanity check failed\n", __func__); + goto unlock; + } + + xa_store(&pool->stg_members, map_cbuf_hdr->member_id, XA_TRUE, GFP_KERNEL); + + if (!unpack) + continue; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (void *)(*flp_ptr); + + err = rmr_srv_read_md(pool, NULL, map_data_offset, + PAGE_SIZE, slp); + if (err) { + pr_err("%s: failed to read bitmap at offset %u\n", + __func__, map_data_offset); + goto unlock; + } + map_data_offset += PAGE_SIZE; + } + } + } + +unlock: + if (!valid_nr) + pr_err("%s: no valid map found in metadata\n", __func__); + + /* + * TODO: We need better error handling logic here. + * Lets suppose after successfully reading few pages for a map, we fail to read next page. 
+ * We then error out and fail the registration, but leave the partially updated map in the pool. + * Later, when another registration arrives and we come here to read the maps, we will + * see a non-empty map and skip reading it from disk. + */ + srcu_read_unlock(&pool->map_srcu, lock_idx); + kfree(buf); + return err; +} +
+/** + * rmr_srv_md_update() - update the metadata of the server pool + * + * Description: + * Copy the current in-memory pool state into the srv_md entry of this pool. + */ +static int rmr_srv_md_update(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool; + struct rmr_srv_md *my_srv_md; + int md_i; + + pool = srv_pool->pool; + md_i = rmr_pool_find_md(&pool->pool_md, srv_pool->member_id, true); + if (md_i < 0) { + pr_warn("No space for new member %d.\n", srv_pool->member_id); + return -EINVAL; + } + my_srv_md = &pool->pool_md.srv_md[md_i]; + my_srv_md->member_id = srv_pool->member_id; + my_srv_md->store_state = atomic_read(&srv_pool->store_state); + my_srv_md->map_ver = srv_pool->pool->map_ver; + my_srv_md->srv_pool_state = atomic_read(&srv_pool->state); + pr_debug("Set srv_md[%d] with member_id %d.\n", md_i, srv_pool->member_id); + return 0; +} +
+/** + * rmr_srv_flush_pool_md() - Write pool_md region to disk immediately + * + * @srv_pool: Server pool whose pool_md is to be flushed + * + * Description: + * Persist pool_md without waiting for the delayed work. + */ +void rmr_srv_flush_pool_md(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + void *buf; + int err; + + if (!atomic_read(&srv_pool->store_state) || !pool->mapped_size) + return; + + err = rmr_srv_md_update(srv_pool); + if (err) { + pr_warn("%s: failed to update pool_md before flush: 0x%x\n", __func__, err); + return; + } + + buf = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + if (!buf) + return; + + memcpy(buf, &pool->pool_md, sizeof(struct rmr_pool_md)); + err = process_md_io(pool, NULL, 0, RMR_MD_SIZE, RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: failed to flush pool_md: 0x%x at offset 0 len %lu\n", + __func__, err, RMR_MD_SIZE); + kfree(buf); +} +
+/** + * rmr_srv_flush_last_io() - Write last_io region to disk + * + * @srv_pool: Server pool whose last_io is to be flushed + */ +static void rmr_srv_flush_last_io(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + u64 last_io_len = rmr_last_io_len(pool->pool_md.queue_depth); + void *buf; + int err; + + if (!last_io_len || !srv_pool->last_io) + return; + + buf = kzalloc(last_io_len, GFP_KERNEL); + if (!buf) + return; + + memcpy(srv_pool->last_io_idx, srv_pool->last_io, last_io_len); + memcpy(buf, srv_pool->last_io_idx, last_io_len); + + err = process_md_io(pool, NULL, RMR_MD_SIZE, last_io_len, + RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: failed to flush last_io: 0x%x at offset %lu len %llu\n", + __func__, err, RMR_MD_SIZE, last_io_len); + kfree(buf); +} +
+/** + * rmr_srv_md_load_buf() - Load the server metadata from buffer to the server pool. + * + * Description: + * This function loads the server-side metadata from buffer to the pool. The buffer must be + * in the format of the rmr pool metadata structure, which may contain updated srv_md of + * multiple servers.
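+ *
+ * Assumed buffer layout (derived from the pointer arithmetic below):
+ *
+ *	0                RMR_CLT_MD_SIZE
+ *	+----------------+-----------+-----------+----
+ *	| client-side md | srv_md[0] | srv_md[1] | ... up to RMR_POOL_MAX_SESS
+ *	+----------------+-----------+-----------+----
+ *
+ * Entries whose member_id is zero, or equal to this server's own member_id,
+ * are skipped.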
+ */ +static int rmr_srv_md_load_buf(struct rmr_pool *pool, void *buf) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_md *srv_md_buf; + u8 member_id = 0; + int err = 0, index, i; + bool ret = false; + + buf += (RMR_CLT_MD_SIZE - sizeof(struct rmr_srv_md)); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + buf += sizeof(struct rmr_srv_md); + srv_md_buf = (struct rmr_srv_md *)buf; + member_id = srv_md_buf->member_id; + /* skip updating the srv_md of this server pool */ + if (!member_id || member_id == srv_pool->member_id) + continue; + + index = rmr_pool_find_md(&pool->pool_md, member_id, true); + if (index < 0) { + pr_debug("%s: No space in the pool_md for new member %d\n", + __func__, member_id); + err = -EINVAL; + continue; + } + + pr_debug("Load srv_md[%d] with member_id %d\n", index, member_id); + memcpy(&pool->pool_md.srv_md[index], srv_md_buf, sizeof(struct rmr_srv_md)); + ret = true; + } + + if (!ret) { + pr_debug("No server metadata found in the buffer\n"); + err = -EINVAL; + } + + return err; +} + +/** + * rmr_srv_md_process_buf() - Load the metadata from buffer to the server pool. + * + * Description: + * This node loads the metadata from buffer to the server pool. + */ +int rmr_srv_md_process_buf(struct rmr_pool *pool, void *buf, bool sync) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_pool_md *buf_pool_md, *dest_md = &pool->pool_md; + int err = 0; + + srv_pool = (struct rmr_srv_pool *)pool->priv; + buf_pool_md = (struct rmr_pool_md *)buf; + if (!sync) { + /* Copy only the client-side header. */ + memcpy(dest_md, buf_pool_md, RMR_CLT_MD_SIZE); + } else { + err = rmr_srv_md_load_buf(pool, buf); + if (err) + pr_err("Failed to load md buf to pool %s\n", pool->poolname); + } + + return err; +} + +int rmr_srv_send_md_update(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_pool *sync_pool = srv_pool->clt; + struct rmr_msg_pool_cmd msg = {}; + int err = 0, buflen; + void *buf; + + /* Only normal-state server pools should send metadata updates. */ + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) + return -EINVAL; + + /* For a stg node A, is A->B alive? */ + if (!sync_pool) { + pr_debug("pool %s has no sync pool assigned. Cannot send md update commands.\n", + pool->poolname); + return -ENXIO; + } + + buf = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + buflen = RMR_MD_SIZE; + + rmr_clt_init_cmd(sync_pool, &msg); + msg.cmd_type = RMR_CMD_MD_SEND; + /* This node sends messages to start md_update. */ + msg.md_send_cmd.leader_id = srv_pool->member_id; + msg.md_send_cmd.src_mapped_size = pool->mapped_size; + + err = rmr_clt_send_cmd_with_data_all(sync_pool, &msg, buf, buflen); + if (err < 0) { + pr_debug("pool %s sends all sess RMR_CMD_MD_SEND failed\n", pool->poolname); + goto free_buf; + } + + /* + * keep the original slice of buffer if the corresponding send req failed. + * + * TODO: + * We need to use the err received from rmr_clt_send_cmd_with_data_all in this function, + * and match the sessions we are skipping. + * + * In general, the sessions_skipped == (RMR_POOL_MAX_SESS - (number_of_legs - 1 - err). + * If the above number does not match, then we abandon the buffers, and try again. + */ + err = rmr_srv_md_load_buf(pool, buf); + if (err) { + pr_debug("Failed to load md buf to pool %s\n", pool->poolname); + goto free_buf; + } + +free_buf: + kfree(buf); + return err; +} + +/** + * rmr_srv_refresh_md() - Refresh the metadata of the rmr pool. 
+ * + * @srv_pool: Server pool whose metadata is to be refreshed + * + * Description: + * Read the metadata of the rmr pool from the backing store. + * + * Return: + * 0 when the metadata was read and imported successfully. + * A negative error code otherwise, including when no valid metadata or no + * srv_md entry for this member is found on the store. + */ +int rmr_srv_refresh_md(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool_md *pool_md_page; + struct rmr_pool *pool = srv_pool->pool; + int index, ret; + u64 md_ver; + + pool_md_page = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + if (!pool_md_page) + return -ENOMEM; + + ret = rmr_srv_read_md(pool, NULL, 0, RMR_MD_SIZE, pool_md_page); + if (ret) { + pr_err("%s: failed reading md of rmr\n", __func__); + goto free_md; + } + + pr_info("%s: Read md of pool %s from store with magic 0x%llx\n", + __func__, pool_md_page->poolname, pool_md_page->magic); + + if (pool_md_page->magic != RMR_POOL_MD_MAGIC) { + pr_info("%s: No valid md found on the store for pool %s\n", + __func__, pool->poolname); + ret = -EINVAL; + goto free_md; + } + + /* + * TODO: Should we sanity check other params also? + */ + if (pool_md_page->chunk_size != pool->chunk_size) { + pr_err("%s: chunk size mismatch. pool chunk size %u, md chunk size %u\n", + __func__, pool->chunk_size, pool_md_page->chunk_size); + ret = -EINVAL; + goto free_md; + } + + /* Import the metadata to the states of the pool. */ + index = rmr_pool_find_md(pool_md_page, srv_pool->member_id, false); + if (index < 0) { + pr_info("%s: No md found for member_id %d\n", __func__, srv_pool->member_id); + ret = index; + goto free_md; + } + + if (pool_md_page->srv_md[index].mapped_size != pool->mapped_size) { + pr_err("%s: Mapped size mismatch. The srv pool %llu, md %llu\n", + __func__, pool->mapped_size, pool_md_page->srv_md[index].mapped_size); + ret = -EINVAL; + goto free_md; + } + + md_ver = pool_md_page->srv_md[index].map_ver; + if (md_ver < pool->map_ver) + pr_err("The current map ver is %lld but the map ver on md is %lld.\n", + pool->map_ver, md_ver); + else + pool->map_ver = md_ver; + + pool->pool_md = *pool_md_page; + + ret = rmr_srv_load_last_io(srv_pool); + if (ret) { + pr_err("%s: failed to load last_io array to memory with err 0x%x\n", + __func__, ret); + goto zero_md; + } + + pr_info("%s: no_of_chunks %lld\n", __func__, pool->no_of_chunks); + ret = rmr_srv_refresh_md_maps(srv_pool); + if (ret) { + pr_err("%s: failed to load dirty bitmap to memory with err %pe\n", + __func__, ERR_PTR(ret)); + goto free_last_io; + } + goto free_md; +
+free_last_io: + kfree(srv_pool->last_io_idx); + srv_pool->last_io_idx = NULL; +zero_md: + memset(&pool->pool_md, 0, sizeof(pool->pool_md)); +free_md: + kfree(pool_md_page); + return ret; +} +
+/** + * rmr_srv_mark_maps_dirty() - Set MD_DIRTY_MAPS and schedule delayed sync + * + * @srv_pool: Server pool with changed maps + */ +void rmr_srv_mark_maps_dirty(struct rmr_srv_pool *srv_pool) +{ + set_bit(MD_DIRTY_MAPS, &srv_pool->md_dirty); + mod_delayed_work(srv_pool->md_sync_wq, &srv_pool->md_sync_dwork, + msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS)); +} +
+/** + * rmr_srv_md_sync - sync dirty metadata regions of pool + * + * Description: + * Dirty-driven consumer: only flushes regions whose dirty bit is set. + * Producers set bits and schedule this work via mod_delayed_work(). + * Does NOT re-queue itself — the next dirty event will schedule it.
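+ *
+ * Typical producer pattern (this is what rmr_srv_mark_maps_dirty() above
+ * does; shown here only as an example):
+ *
+ *	set_bit(MD_DIRTY_MAPS, &srv_pool->md_dirty);
+ *	mod_delayed_work(srv_pool->md_sync_wq, &srv_pool->md_sync_dwork,
+ *			 msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS));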
+ */ +void rmr_srv_md_sync(struct work_struct *work) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_pool *pool; + bool ret, did_work = false; + + srv_pool = container_of(to_delayed_work(work), struct rmr_srv_pool, md_sync_dwork); + if (!srv_pool->pool) + return; + + /* + * It could happen that access the pool while the pool is not there. Use reference counting + * for server pool to avoid the issue. + */ + ret = rmr_get_srv_pool(srv_pool); + if (!ret) { + pr_err("%s: pool is not there\n", __func__); + return; + } + + pool = srv_pool->pool; + + /* + * Update srv_md snapshot and notify peers whenever any region is dirty. + */ + if (!rmr_srv_md_update(srv_pool) && rmr_srv_send_md_update(pool)) + pr_debug("failed to send md update\n"); + + /* + * The io store is ready after the store is registered and the pool metadata is + * updated, if any. + */ + if (!atomic_read(&srv_pool->store_state) || !pool->mapped_size) + goto put_pool; + + /* + * On-disk layout of rmr pool metadata: + * + * 0 RMR_MD_SIZE +last_io_len +PAGE_SIZE + * +-----------+-------------+---------------+--------------------+ + * | pool_md | last_io | hdr_region | maps_region ... | + * +-----------+-------------+---------------+--------------------+ + * <-RMR_MD_SIZE><-last_io_len><--PAGE_SIZE--><-per_map slp pages-> + * + * pool->maps[0:maps_cnt] is always dense (no NULL gaps). + * + * This I/O covers pool_md + last_io. hdr_region and maps_region are + * written separately by rmr_srv_md_maps_sync(). + */ + if (test_and_clear_bit(MD_DIRTY_POOL, &srv_pool->md_dirty)) { + rmr_srv_flush_pool_md(srv_pool); + did_work = true; + } + + if (test_and_clear_bit(MD_DIRTY_LAST_IO, &srv_pool->md_dirty)) { + rmr_srv_flush_last_io(srv_pool); + did_work = true; + } + + if (test_and_clear_bit(MD_DIRTY_MAPS, &srv_pool->md_dirty)) { + rmr_srv_md_maps_sync(pool); + did_work = true; + } + + if (did_work) + pr_debug("%s: flushed dirty regions for server pool %u of %s\n", + __func__, srv_pool->member_id, pool->poolname); + +put_pool: + rmr_put_srv_pool(srv_pool); + /* Do NOT re-queue. Producers schedule us via mod_delayed_work. 
*/ +} diff --git a/drivers/infiniband/ulp/rmr/rmr-srv-sysfs.c b/drivers/infiniband/ulp/rmr/rmr-srv-sysfs.c new file mode 100644 index 000000000000..2aa1e07235b8 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-srv-sysfs.c @@ -0,0 +1,1047 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include + +#include "rmr-srv.h" +#include "rmr-map.h" +#include "rmr-clt.h" + +#define MAX_POOL_ID 255 + +static struct class *rmr_dev_class; +static struct device *rmr_ctl_dev; +static struct device *rmr_pool_dev; + +static struct kobj_type rmr_srv_sess_ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +int rmr_srv_sysfs_add_sess(struct rmr_pool *pool, + struct rmr_srv_pool_sess *pool_sess) +{ + int ret; + + ret = kobject_init_and_add(&pool_sess->kobj, &rmr_srv_sess_ktype, + &pool->sessions_kobj, "%s", + pool_sess->sessname); + if (ret) + pr_err("Failed to add session %s into sysfs\n", + pool_sess->sessname); + + return ret; +} + +void rmr_srv_sysfs_del_sess(struct rmr_srv_pool_sess *pool_sess) +{ + kobject_del(&pool_sess->kobj); + kobject_put(&pool_sess->kobj); +} + +static ssize_t rmr_srv_member_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + return sprintf(page, "%d\n", srv_pool->member_id); +} + +static struct kobj_attribute rmr_srv_member_id_attr = + __ATTR(member_id, 0444, rmr_srv_member_id_show, NULL); + +static ssize_t rmr_srv_pool_blksize_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + /* TODO: introduce blksize for pool */ + return sprintf(page, "128k\n"); +} + +static struct kobj_attribute rmr_srv_pool_blksize_attr = + __ATTR(blksize, 0444, rmr_srv_pool_blksize_show, NULL); + +static ssize_t rmr_srv_leave_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +void rmr_srv_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self) +{ + if (pool->kobj.state_in_sysfs) { + WARN_ON(!list_empty(&pool->sess_list)); + kobject_del(&pool->sessions_kobj); + kobject_put(&pool->sessions_kobj); + if (sysfs_self) + sysfs_remove_file_self(&pool->kobj, sysfs_self); + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + } +} + +static ssize_t rmr_srv_leave_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (READ_ONCE(srv_pool->io_store)) { + pr_err("pool %s has a store registered\n", pool->poolname); + return -EINVAL; + } + + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_EMPTY) { + pr_err("pool %s cannot leave: not in EMPTY state (state=%d)\n", + pool->poolname, atomic_read(&srv_pool->state)); + return -EINVAL; + } + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, buf); + return -EINVAL; + } + + if (srv_pool->clt) { + int err; + + err = rmr_srv_remove_clt_pool(srv_pool); + if (err) { + pr_err("pool %s failed to remove clt_pool\n", pool->poolname); + return -EINVAL; + 
} + } + pr_info("srv: Deleting pool '%s'\n", pool->poolname); + + rmr_srv_destroy_pool(pool); + rmr_srv_destroy_pool_sysfs_files(pool, &attr->attr); + rmr_put_srv_pool(srv_pool); + + return count; +} + +static struct kobj_attribute rmr_srv_leave_pool_attr = + __ATTR(leave_pool, 0644, rmr_srv_leave_pool_show, + rmr_srv_leave_pool_store); + +static ssize_t rmr_srv_pool_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool = container_of(kobj, struct rmr_pool, kobj); + struct rmr_dirty_id_map *map; + int i, lock_idx; + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + rmr_map_dump_bitmap(map); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return 0; +} + +static ssize_t rmr_srv_pool_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + rmr_id_t id = { 0, 0 }; + int srv_id; + struct rmr_dirty_id_map *map; + + pool = container_of(kobj, struct rmr_pool, kobj); + if (sscanf(buf, "%llu %llu %d\n", &id.a, &id.b, &srv_id) != 3) { + pr_err("cannot parse id.a %s\n", buf); + return -EINVAL; + } + pr_debug("Add id (%llu, %llu), srv_id %d\n", id.a, id.b, srv_id); + + /* + * If given chunk number exceeds total chunks for us, ignore! + */ + if (id.b > pool->no_of_chunks) + return count; + + map = rmr_pool_find_map(pool, srv_id); + if (!map) { + pr_err("in pool %s cannot find map for srv_id %u\n", + pool->poolname, srv_id); + return -EINVAL; + } + + rmr_map_set_dirty(map, id, 0); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + pr_debug("insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + + return count; +} + +static struct kobj_attribute rmr_srv_pool_map_attr = + __ATTR(map, 0644, rmr_srv_pool_map_show, + rmr_srv_pool_map_store); + +static ssize_t rmr_srv_pool_map_ver_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + ssize_t written; + + pool = container_of(kobj, struct rmr_pool, kobj); + + written = scnprintf(page, PAGE_SIZE, "Map ver: %llu\n", pool->map_ver); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_map_ver_attr = + __ATTR(map_version, 0444, rmr_srv_pool_map_ver_show, NULL); + +static ssize_t rmr_srv_pool_last_io_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + ssize_t written = 0; + int i; + rmr_id_t *id; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + for (i = 0; i < srv_pool->queue_depth; i++) { + id = &srv_pool->last_io[i]; + + if (id->a == U64_MAX && id->b == U64_MAX) + continue; + + written += scnprintf(page + written, PAGE_SIZE - written, + "[%d]=(%llu,%llu) ", i, id->a, id->b); + } + if (written == 0) + written += scnprintf(page + written, PAGE_SIZE - written, + "(empty)"); + written += scnprintf(page + written, PAGE_SIZE - written, "\n"); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_last_io_attr = + __ATTR(last_io, 0644, rmr_srv_pool_last_io_show, NULL); + +static ssize_t rmr_srv_add_clt_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo poolname > %s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_add_clt_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct 
rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + struct rmr_pool *clt = NULL; + char name[NAME_MAX]; + int err; + struct rmr_attrs attrs; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (sscanf(buf, "%s", name) != 1) { + pr_err("cannot parse %s\n", buf); + return -EINVAL; + } + + clt = rmr_clt_open(NULL, NULL, name); + if (IS_ERR_OR_NULL(clt)) { + pr_err("cannot open pool %s err %ld\n", name, PTR_ERR(clt)); + return -EEXIST; + } + + pr_info("%s: Adding client pool %s, to server pool %s\n", + __func__, pool->poolname, clt->poolname); + + err = rmr_clt_query(clt, &attrs); + if (unlikely(err)) + goto close_rmr; + + if (!attrs.sync) { + pr_err("%s: Add clt called for non-sync rmr client pool %s\n", __func__, name); + err = -EINVAL; + goto close_rmr; + } + + srv_pool->max_sync_io_size = attrs.max_io_size; + + /* The sync client holds a pointer to its parent server pool. */ + srv_pool->clt = clt; + + /* Re-trigger md sync now that the sync path is available. */ + rmr_srv_mark_pool_md_dirty(srv_pool); + + /* + * Check if the device paramters of connected servers share the same values. + */ + err = rmr_srv_check_params(srv_pool); + if (err) + goto close_clt; + + return count; + +close_clt: + srv_pool->clt = NULL; + srv_pool->max_sync_io_size = 0; +close_rmr: + pr_err("%s: Adding client pool failed\n", __func__); + rmr_clt_close(clt); + return err; +} + +static struct kobj_attribute rmr_srv_add_clt_pool_attr = + __ATTR(add_clt, 0644, rmr_srv_add_clt_pool_show, + rmr_srv_add_clt_pool_store); + +static ssize_t rmr_srv_pool_sync_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + return scnprintf(page, PAGE_SIZE, "Usage: echo \"start|stop\" > /%s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_pool_sync_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int err = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!strncasecmp(buf, "start", 5)) { + /* + * Start + */ + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + pr_info("For pool %s, sync thread already running\n", pool->poolname); + goto out; + } + + mutex_lock(&srv_pool->srv_pool_lock); + + if (!atomic_read(&srv_pool->store_state) && + atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_err("Pool %s not in working state. 
Sync thread start failed\n", + pool->poolname); + err = -EINVAL; + goto unlock_mutex; + } + + err = rmr_srv_sync_thread_start(srv_pool); + if (err) { + pr_err("For pool %s, rmr_srv_sync_thread_start Error %d\n", + pool->poolname, err); + goto unlock_mutex; + } + + mutex_unlock(&srv_pool->srv_pool_lock); + + } else if (!strncasecmp(buf, "stop", 4)) { + /* + * Stop + */ + if (atomic_read(&srv_pool->thread_state) == SYNC_THREAD_STOPPED) { + pr_info("For pool %s, sync thread already stopped\n", pool->poolname); + goto out; + } + + err = rmr_srv_sync_thread_stop(srv_pool); + if (err) { + pr_err("For pool %s, rmr_srv_sync_thread_stop Error %d\n", + pool->poolname, err); + goto err; + } + } else { + pr_err("Unknown value\n"); + err = -EINVAL; + goto err; + } + +out: + return count; + +unlock_mutex: + mutex_unlock(&srv_pool->srv_pool_lock); +err: + return err; +} + +static struct kobj_attribute rmr_srv_pool_sync_attr = + __ATTR(sync, 0644, rmr_srv_pool_sync_show, + rmr_srv_pool_sync_store); + +static ssize_t sync_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int state; + ssize_t written = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + state = atomic_read(&srv_pool->thread_state); + switch (state) { + case SYNC_THREAD_RUNNING: + written = sysfs_emit(page, "Running\n"); + break; + case SYNC_THREAD_STOPPED: + written = sysfs_emit(page, "Stopped\n"); + break; + case SYNC_THREAD_REQ_STOP: + written = sysfs_emit(page, "Request_to_stop\n"); + break; + case SYNC_THREAD_WAIT: + written = sysfs_emit(page, "Wait\n"); + break; + default: + written = sysfs_emit(page, "Unknown value %d\n", state); + break; + } + + return written; +} + +static struct kobj_attribute rmr_srv_pool_sync_state_attr = + __ATTR_RO(sync_state); + +static ssize_t rmr_srv_pool_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int state; + ssize_t written = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + state = atomic_read(&srv_pool->state); + + switch (state) { + case RMR_SRV_POOL_STATE_EMPTY: + written = sysfs_emit(page, "empty\n"); + + break; + case RMR_SRV_POOL_STATE_REGISTERED: + written = sysfs_emit(page, "registered\n"); + + break; + case RMR_SRV_POOL_STATE_CREATED: + written = sysfs_emit(page, "created\n"); + + break; + case RMR_SRV_POOL_STATE_NORMAL: + written = sysfs_emit(page, "normal\n"); + + break; + case RMR_SRV_POOL_STATE_NO_IO: + written = sysfs_emit(page, "no_io\n"); + + break; + default: + written = sysfs_emit(page, "Unknown value %d\n", state); + + break; + } + + written += sysfs_emit_at(page, written, "Maintenance mode: %d\n", + srv_pool->maintenance_mode); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_state_attr = + __ATTR(state, 0644, rmr_srv_pool_state_show, NULL); + +static ssize_t rmr_srv_remove_clt_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_remove_clt_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int err; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!sysfs_streq(buf, "1")) 
{ + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, buf); + return -EINVAL; + } + err = rmr_srv_remove_clt_pool(srv_pool); + if (err) { + pr_err("pool %s failed to remove clt_pool\n", pool->poolname); + return -EINVAL; + } + + return count; +} + +static struct kobj_attribute rmr_srv_remove_clt_pool_attr = + __ATTR(remove_clt, 0644, rmr_srv_remove_clt_pool_show, + rmr_srv_remove_clt_pool_store); + +static ssize_t rmr_srv_pool_test_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_pool_test_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int err; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, buf); + return -EINVAL; + } + + if (!srv_pool->clt) { + pr_err("pool %s no clt pool assigned to this rmr pool. cannot do map test.\n", + pool->poolname); + return -EINVAL; + } + + pr_info("pool %s start test map...\n", pool->poolname); + err = rmr_clt_test_map(pool, srv_pool->clt); + if (err) { + pr_err("pool %s, test map failed, err %d\n", + pool->poolname, err); + return err; + } + pr_info("pool %s test map done.", pool->poolname); + + return count; +} + +static struct kobj_attribute rmr_srv_pool_test_map_attr = + __ATTR(test_map, 0644, rmr_srv_pool_test_map_show, + rmr_srv_pool_test_map_store); + +static ssize_t rmr_srv_pool_metadata_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_pool_md *pool_md; + struct rmr_srv_md *srv_md; + int i; + ssize_t written = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + pool_md = &pool->pool_md; + + written += sysfs_emit_at(page, written, + "The metadata of %s is: group_id %u, chunk_size %u, " + "mapped_size %llu, queue_depth %u, " + "bitmap_offset %llu, bitmap_len %llu, " + "last_io_offset %llu, last_io_len %llu\n\n", + pool_md->poolname, pool_md->group_id, pool_md->chunk_size, + pool_md->mapped_size, pool_md->queue_depth, + rmr_bitmap_offset(pool_md->queue_depth), + rmr_bitmap_len(pool->no_of_chunks), + (u64)RMR_LAST_IO_OFFSET, + rmr_last_io_len(pool_md->queue_depth)); + written += sysfs_emit_at(page, written, + "The client pool: map_ver %llu\n\n", pool_md->map_ver); + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + srv_md = &pool_md->srv_md[i]; + if (!srv_md->member_id) + continue; + + written += sysfs_emit_at(page, written, "The server pool with member_id %u: " + "mapped_size %llu, store_state %u, " + "pool_state %u, map_update_state %u, " + "map_ver %llu, discard_entries %x.\n\n", + srv_md->member_id, srv_md->mapped_size, + srv_md->store_state, + srv_md->srv_pool_state, + srv_md->map_update_state, srv_md->map_ver, + srv_md->discard_entries); + } + + return written; +} + +static struct kobj_attribute rmr_srv_pool_metadata_attr = + __ATTR(metadata, 0444, rmr_srv_pool_metadata_show, NULL); + +static const char *map_update_state_str(enum srv_map_update_state state) +{ + switch (state) { + case MAP_UPDATE_STATE_DISABLED: + return "disabled"; + case MAP_UPDATE_STATE_READY: + return "ready"; + case MAP_UPDATE_STATE_DONE: + return "done"; + } + return "unknown"; +} + +static ssize_t rmr_srv_pool_map_update_state_show(struct kobject *kobj, + struct kobj_attribute 
*attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + return sysfs_emit(page, "%s\n", map_update_state_str(srv_pool->map_update_state)); +} + +static struct kobj_attribute rmr_srv_pool_map_update_state_attr = + __ATTR(map_update_state, 0644, rmr_srv_pool_map_update_state_show, NULL); + +static ssize_t rmr_srv_pool_map_unsynced_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + ssize_t written = 0; + struct rmr_pool *pool; + struct rmr_dirty_id_map *map; + rmr_id_t id; + int i, j, lock_idx; + + pool = container_of(kobj, struct rmr_pool, kobj); + + id.a = 1; + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; (i < RMR_POOL_MAX_SESS && written < PAGE_SIZE); i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + written += sysfs_emit_at(page, written, "member_id : %d\n", map->member_id); + for (j = 0; j < map->no_of_chunks; j++) { + size_t len; + + id.b = j; + if (rmr_map_check_dirty(map, id) && + (map->bitmap_filter[id.b] & MAP_ENTRY_UNSYNCED)) { + len = sysfs_emit_at(page, written, "(%llu, %llu) ", + id.a, id.b); + if (!len) // break early if map is too big + break; + written += len; + } + } + written += sysfs_emit_at(page, written, "\n"); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return written; +} + +static ssize_t rmr_srv_pool_map_unsynced_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + rmr_id_t id = { 0, 0 }; + int srv_id; + struct rmr_dirty_id_map *map; + + pool = container_of(kobj, struct rmr_pool, kobj); + if (sscanf(buf, "%llu %llu %d\n", &id.a, &id.b, &srv_id) != 3) { + pr_err("cannot parse id.a %s\n", buf); + return -EINVAL; + } + pr_debug("add id (%llu, %llu), srv_id %d\n", id.a, id.b, srv_id); + + map = rmr_pool_find_map(pool, srv_id); + if (!map) { + pr_err("in pool %s cannot find map for srv_id %u\n", + pool->poolname, srv_id); + return -EINVAL; + } + + rmr_map_set_dirty(map, id, MAP_ENTRY_UNSYNCED); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + pr_debug("insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + + return count; +} +static struct kobj_attribute rmr_srv_pool_map_unsynced_attr = + __ATTR(map_unsynced, 0644, rmr_srv_pool_map_unsynced_show, + rmr_srv_pool_map_unsynced_store); + +static ssize_t map_summary_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + int lock_idx; + int written; + + pool = container_of(kobj, struct rmr_pool, kobj); + + lock_idx = srcu_read_lock(&pool->map_srcu); + written = rmr_map_summary_format(pool, page, PAGE_SIZE); + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_map_summary_attr = + __ATTR_RO(map_summary); + +static struct attribute *rmr_srv_pool_attrs[] = { + &rmr_srv_leave_pool_attr.attr, + &rmr_srv_member_id_attr.attr, + &rmr_srv_pool_blksize_attr.attr, + &rmr_srv_pool_map_attr.attr, + &rmr_srv_pool_map_ver_attr.attr, + &rmr_srv_pool_last_io_attr.attr, + &rmr_srv_add_clt_pool_attr.attr, + &rmr_srv_pool_sync_attr.attr, + &rmr_srv_pool_sync_state_attr.attr, + &rmr_srv_pool_state_attr.attr, + &rmr_srv_remove_clt_pool_attr.attr, + &rmr_srv_pool_test_map_attr.attr, + &rmr_srv_pool_metadata_attr.attr, + &rmr_srv_pool_map_update_state_attr.attr, + &rmr_srv_pool_map_unsynced_attr.attr, + &rmr_srv_pool_map_summary_attr.attr, + NULL, +}; 
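+
+/*
+ * With the "rmr-server" class and the "pools"/"ctl" devices created in
+ * rmr_srv_create_sysfs_files() below, the attributes above are expected to
+ * appear roughly as follows (exact paths depend on the sysfs class layout):
+ *
+ *	/sys/class/rmr-server/pools/<poolname>/{member_id,state,map,last_io,...}
+ *	/sys/class/rmr-server/pools/<poolname>/sessions/<sessname>
+ *	/sys/class/rmr-server/ctl/join_pool
+ */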
+ATTRIBUTE_GROUPS(rmr_srv_pool); + +static struct kobj_type rmr_srv_pool_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_srv_pool_groups, +}; + +static struct kobj_type ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +static int rmr_srv_create_pool_sysfs_files(struct rmr_pool *pool) +{ + int ret; + + ret = kobject_init_and_add(&pool->kobj, &rmr_srv_pool_ktype, + &rmr_pool_dev->kobj, "%s", pool->poolname); + if (ret) { + pr_err("Failed to create sysfs dir for pool '%s': %d\n", + pool->poolname, ret); + return ret; + } + + ret = kobject_init_and_add(&pool->sessions_kobj, &ktype, &pool->kobj, + "sessions"); + if (unlikely(ret)) { + pr_err("Failed to create sessions dir for pool '%s': %d\n", + pool->poolname, ret); + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + } + + return ret; +} + +/* remove new line from string */ +static void strip(char *s) +{ + char *p = s; + + while (*s != '\0') { + if (*s != '\n') + *p++ = *s++; + else + ++s; + } + *p = '\0'; +} + +enum rmr_srv_opts { + RMR_SRV_OPT_POOL_NAME, + RMR_SRV_OPT_MEMBER_ID, + RMR_JOIN_OPT_Mandatory_count, + RMR_SRV_OPT_ERR, +}; + +static const char * const rmr_srv_opts_mandatory_names[] = { + [RMR_SRV_OPT_POOL_NAME] = "poolname", + [RMR_SRV_OPT_MEMBER_ID] = "member_id", +}; + +static const match_table_t rmr_srv_opt_tokens = { + { RMR_SRV_OPT_POOL_NAME, "poolname=%s" }, + { RMR_SRV_OPT_MEMBER_ID, "member_id=%s" }, + { RMR_SRV_OPT_ERR, NULL }, +}; + +static int rmr_srv_parse_options(const char *buf, char *poolname, + u32 *member_id) +{ + char *options, *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token, ret = 0, i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + options = strstrip(options); + strip(options); + while ((p = strsep(&options, " ")) != NULL) { + if (!*p) + continue; + token = match_token(p, rmr_srv_opt_tokens, args); + opt_mask |= (1 << token); + + switch (token) { + case RMR_SRV_OPT_POOL_NAME: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("join_pool: name too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(poolname, p, NAME_MAX); + kfree(p); + break; + + case RMR_SRV_OPT_MEMBER_ID: + p = match_strdup(args); + + ret = kstrtou32(p, 0, member_id); + if (ret) { + pr_err("member_id isn't an integer: %d\n", ret); + kfree(p); + goto out; + } + + kfree(p); + break; + + default: + pr_err("join_pool: Unknown parameter or missing value" + " '%s'\n", p); + ret = -EINVAL; + goto out; + } + }; + + for (i = 0; i < RMR_JOIN_OPT_Mandatory_count; i++) { + if ((opt_mask & (1 << rmr_srv_opt_tokens[i].token))) { + ret = 0; + } else { + pr_err("join_pool: Mandatory parameter missing: %s\n", + rmr_srv_opts_mandatory_names[i]); + ret = -EINVAL; + break; + } + } + +out: + kfree(options); + return ret; +} + + +static ssize_t rmr_srv_join_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + char poolname[NAME_MAX]; + u32 member_id = UINT_MAX; + int err; + + err = rmr_srv_parse_options(buf, poolname, &member_id); + if (unlikely(err)) + return err; + + if (member_id > MAX_POOL_ID) { + pr_err("%s: member_id gt max allowed pools (%u > %u)\n", + __func__, member_id, MAX_POOL_ID); + return -EINVAL; + } + + if (member_id == 0) { + pr_err("%s: member_id is not allowed to be zero\n", __func__); + return -EINVAL; + } + + strip(poolname); + + pr_info("%s: Creating server pool with poolname %s, member_id 
%u\n", + __func__, poolname, member_id); + + srv_pool = rmr_create_srv_pool(poolname, member_id); + if (IS_ERR(srv_pool)) { + pr_err("failed to create srv pool %s\n", poolname); + return PTR_ERR(srv_pool); + } + + pool = rmr_create_pool(poolname, srv_pool); + if (IS_ERR(pool)) { + err = PTR_ERR(pool); + goto destroy_pool; + } + + srv_pool->pool = pool; + pool->is_clt = false; + rmr_srv_pool_update_params(pool); + + err = rmr_srv_create_pool_sysfs_files(pool); + if (err) { + pr_err("%s: pool %s failed to create sysfs files\n", __func__, pool->poolname); + goto destroy_pool; + } + + return count; + +destroy_pool: + rmr_put_srv_pool(srv_pool); + + return err; +} + +static ssize_t rmr_srv_join_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo \"poolname= member_id= > %s\n", + attr->attr.name); +} + +static struct kobj_attribute rmr_srv_join_pool_attr = + __ATTR(join_pool, 0644, rmr_srv_join_pool_show, + rmr_srv_join_pool_store); + +static struct attribute *default_attrs[] = { + &rmr_srv_join_pool_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +int rmr_srv_create_sysfs_files(void) +{ + int err; + dev_t devt = MKDEV(0, 0); + + rmr_dev_class = class_create("rmr-server"); + if (IS_ERR(rmr_dev_class)) + return PTR_ERR(rmr_dev_class); + + rmr_ctl_dev = device_create(rmr_dev_class, NULL, devt, NULL, "ctl"); + if (IS_ERR(rmr_ctl_dev)) { + err = PTR_ERR(rmr_ctl_dev); + goto cls_destroy; + } + + rmr_pool_dev = device_create(rmr_dev_class, NULL, devt, NULL, "pools"); + if (IS_ERR(rmr_pool_dev)) { + err = PTR_ERR(rmr_pool_dev); + goto ctl_destroy; + } + + err = sysfs_create_group(&rmr_ctl_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto pool_destroy; + + return 0; + +pool_destroy: + device_unregister(rmr_pool_dev); +ctl_destroy: + device_unregister(rmr_ctl_dev); +cls_destroy: + class_destroy(rmr_dev_class); + + return err; +} + +void rmr_srv_destroy_sysfs_files(void) +{ + sysfs_remove_group(&rmr_ctl_dev->kobj, &default_attr_group); + device_unregister(rmr_pool_dev); + device_unregister(rmr_ctl_dev); + class_destroy(rmr_dev_class); +} diff --git a/drivers/infiniband/ulp/rmr/rmr-srv.c b/drivers/infiniband/ulp/rmr/rmr-srv.c new file mode 100644 index 000000000000..66af29b90c53 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-srv.c @@ -0,0 +1,3306 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include + +#include "rmr-srv.h" +#include "rmr-req.h" +#include "rmr-clt.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(RMR_VER_STRING); +MODULE_DESCRIPTION("RMR Server"); +MODULE_LICENSE("GPL"); + +static struct rtrs_srv_ctx *rtrs_ctx; +struct kmem_cache *rmr_req_cachep; + +static LIST_HEAD(g_sess_list); +static DEFINE_MUTEX(g_sess_lock); + +#define MIN_CHUNK_SIZE (128 << 10) +#define MAX_CHUNK_SIZE (1024 << 10) +#define DEFAULT_CHUNK_SIZE MIN_CHUNK_SIZE + +static int __read_mostly chunk_size = DEFAULT_CHUNK_SIZE; + +module_param_named(chunk_size, chunk_size, uint, 0444); +MODULE_PARM_DESC(chunk_size, + "Unit size which is tracked for being dirty. 
(default: " + /* cppcheck-suppress unknownMacro */ + __stringify(DEFAULT_CHUNK_SIZE) "KB)"); + +static int __read_mostly sync_queue_depth = DEFAULT_SYNC_QUEUE_DEPTH; + +module_param_named(sync_queue_depth, sync_queue_depth, uint, 0644); +MODULE_PARM_DESC(sync_queue_depth, + "Max in-flight sync requests per pool (default: " + __stringify(DEFAULT_SYNC_QUEUE_DEPTH) ")"); + +bool rmr_get_srv_pool(struct rmr_srv_pool *srv_pool) +{ + pr_debug("pool %s, before inc refcount %d\n", + srv_pool->pool->poolname, refcount_read(&srv_pool->refcount)); + return refcount_inc_not_zero(&srv_pool->refcount); +} + +static struct rmr_srv_pool *rmr_find_and_get_srv_pool(u32 group_id) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + mutex_lock(&pool_mutex); + pool = rmr_find_pool_by_group_id(group_id); + if (!pool) { + mutex_unlock(&pool_mutex); + return ERR_PTR(-ENOENT); + } + + srv_pool = (struct rmr_srv_pool *)pool->priv; + if (!rmr_get_srv_pool(srv_pool)) { + mutex_unlock(&pool_mutex); + return ERR_PTR(-EINVAL); + } + mutex_unlock(&pool_mutex); + + return srv_pool; +} + +void rmr_put_srv_pool(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + + might_sleep(); + + pr_debug("pool %s, before dec refcnt %d\n", + (pool ? pool->poolname : "(empty)"), refcount_read(&srv_pool->refcount)); + if (refcount_dec_and_test(&srv_pool->refcount)) { + mutex_destroy(&srv_pool->srv_pool_lock); + + if (srv_pool->clt) + rmr_clt_close(srv_pool->clt); + + kfree(srv_pool->last_io); + srv_pool->last_io = NULL; + kfree(srv_pool->last_io_idx); + srv_pool->last_io_idx = NULL; + + if (pool) { + pr_info("srv: destroy pool %s\n", pool->poolname); + free_pool(pool); + } + + cancel_delayed_work_sync(&srv_pool->md_sync_dwork); + destroy_workqueue(srv_pool->md_sync_wq); + + cancel_delayed_work_sync(&srv_pool->clean_dwork); + destroy_workqueue(srv_pool->clean_wq); + + kfree(srv_pool); + } +} + +static const char *rmr_get_srv_pool_state_name(enum rmr_srv_pool_state state) +{ + switch (state) { + case RMR_SRV_POOL_STATE_EMPTY: return "RMR_SRV_POOL_STATE_EMPTY"; + case RMR_SRV_POOL_STATE_REGISTERED: return "RMR_SRV_POOL_STATE_REGISTERED"; + case RMR_SRV_POOL_STATE_CREATED: return "RMR_SRV_POOL_STATE_CREATED"; + case RMR_SRV_POOL_STATE_NORMAL: return "RMR_SRV_POOL_STATE_NORMAL"; + case RMR_SRV_POOL_STATE_NO_IO: return "RMR_SRV_POOL_STATE_NO_IO"; + + default: return "Unknown state"; + } +} + +/** + * rmr_srv_change_pool_state() - Change srv pool state + * + * @srv_pool: Server pool whose state is to be changed + * @new_state: State to which the transition is to be made + * + * Return: + * old state on succes + * negative error code on failure + * + * Description: + * This function controls the state transitions for rmr-srv pool state. + * Every state transition is controlled by this except to NORMAL. + * Function rmr_srv_set_pool_state_normal handles transition to state NORMAL. 
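+ *
+ * Allowed transitions (a summary of the checks below and in
+ * rmr_srv_set_pool_state_normal(), not a separate state machine):
+ *
+ *	EMPTY <-> REGISTERED -> CREATED -> NORMAL
+ *	{REGISTERED, CREATED, NORMAL} -> NO_IO -> {EMPTY, REGISTERED, NORMAL}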
+ * "always-invalid" state transitions are checked and prevented here + * Case dependent valid/invalid state transition, should be handled by caller + */ +static inline int rmr_srv_change_pool_state(struct rmr_srv_pool *srv_pool, + enum rmr_srv_pool_state new_state) +{ + enum rmr_srv_pool_state old_state = atomic_read(&srv_pool->state); + int cmp_state; + + WARN_ON(new_state == RMR_SRV_POOL_STATE_NORMAL); + + if (old_state == new_state) + return old_state; + + pr_info("%s: Old state %s, Requested state %s\n", + __func__, rmr_get_srv_pool_state_name(old_state), + rmr_get_srv_pool_state_name(new_state)); + + switch (new_state) { + case RMR_SRV_POOL_STATE_NO_IO: + /* + * NO_IO can be reached from REGISTERED, CREATED, or NORMAL. + * EMPTY -> NO_IO is illegal: a pool with no store cannot have + * active sessions that fail. + */ + if (WARN_ON(old_state == RMR_SRV_POOL_STATE_EMPTY)) + goto err; + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_NO_IO); + break; + case RMR_SRV_POOL_STATE_EMPTY: + /* + * EMPTY is reached from REGISTERED (store unregistered, no + * sessions) or from NO_IO (last session left, no store). A + * direct jump from CREATED or NORMAL is illegal — those states + * must pass through NO_IO first. + */ + if (WARN_ON(old_state == RMR_SRV_POOL_STATE_CREATED || + old_state == RMR_SRV_POOL_STATE_NORMAL)) + goto err; + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_EMPTY); + break; + case RMR_SRV_POOL_STATE_REGISTERED: + /* + * REGISTERED is entered from EMPTY (store just registered, no + * sessions) or from NO_IO (last session left, store still + * present). A direct jump from CREATED or NORMAL is illegal — + * those states must pass through NO_IO first. + */ + if (WARN_ON(old_state == RMR_SRV_POOL_STATE_CREATED || + old_state == RMR_SRV_POOL_STATE_NORMAL)) + goto err; + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_REGISTERED); + + break; + case RMR_SRV_POOL_STATE_CREATED: + /* + * CREATED is entered only from REGISTERED, when the first + * non-sync create-mode join arrives. Any other predecessor + * is illegal. + */ + cmp_state = RMR_SRV_POOL_STATE_REGISTERED; + if (atomic_try_cmpxchg(&srv_pool->state, &cmp_state, RMR_SRV_POOL_STATE_CREATED)) + goto out; + WARN_ON(1); + goto err; + default: + pr_err("%s: Unknown state %d\n", __func__, new_state); + goto err; + } + +out: + rmr_srv_mark_pool_md_dirty(srv_pool); + return old_state; + +err: + pr_err("%s: Failed. Old state %s, Requested state %s\n", + __func__, rmr_get_srv_pool_state_name(old_state), + rmr_get_srv_pool_state_name(new_state)); + return -EINVAL; +} + +/** + * rmr_srv_set_pool_state_normal() - Change srv pool state to NORMAL + * + * @srv_pool: Server pool whose state is to be changed to NORMAL + * + * Return: + * old state on succes + * negative error code on failure + * + * Description: + * This function controls the state transitions for rmr-srv pool state to NORMAL + * "always-invalid" state transitions are checked and prevented here + * Case dependent valid/invalid state transition, should be handled by caller + */ +static int rmr_srv_set_pool_state_normal(struct rmr_srv_pool *srv_pool) +{ + int old_state; + + mutex_lock(&srv_pool->srv_pool_lock); + old_state = atomic_read(&srv_pool->state); + + pr_info("%s: Old state %s\n", __func__, + rmr_get_srv_pool_state_name(old_state)); + + if (old_state == RMR_SRV_POOL_STATE_NORMAL) + goto out; + + /* + * CREATED -> NORMAL: normal enable on a newly created pool. + * NO_IO -> NORMAL: map update completed, pool can serve IOs again. + * Any other predecessor is illegal. 
+ */ + if (WARN_ON(old_state != RMR_SRV_POOL_STATE_CREATED && + old_state != RMR_SRV_POOL_STATE_NO_IO)) { + old_state = -EINVAL; + goto out; + } + + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_NORMAL); + rmr_srv_mark_pool_md_dirty(srv_pool); + pr_info("%s: Server pool state changed to NORMAL\n", __func__); + +out: + mutex_unlock(&srv_pool->srv_pool_lock); + + return old_state; +} + +/** + * rmr_srv_clear_map() - clear the dirty map if other pool member completely synced it + * + * @pool: rmr pool that holds the maps to clean + * @member_id: pool member id for which map is reported as clean + * + * Description: + * If other pool member responded that he finished syncing his data, then we can + * clear his map replicated to this nodes, in case of some clear commands were + * lost or failed. + * + * Return: + * no + * + * Context: + * This function can wait on spin_lock if the deleted entry should be inserted back + * + * Locks: + * no + */ +static void rmr_srv_clear_map(struct rmr_pool *pool, u8 member_id) +{ + // TODO: this looks like rmr_pool_map_remove_entries, can we do something about this? + // I was not able to merge them, but it would be nice. + struct rmr_dirty_id_map *map = NULL; + rmr_id_t id; + int i, lock_idx; + + pr_debug("pool %s clear map entries for member_id=%u\n", + pool->poolname, member_id); + + lock_idx = srcu_read_lock(&pool->map_srcu); + map = rmr_pool_find_map(pool, member_id); + if (!map) { + pr_err("for pool %s cannot find map for member id %u\n", pool->poolname, member_id); + goto unlock; + } + + /* if the map state changed since we send our CHECK_MAP command, it means that + * some entries were added and the map is not clean and we should not wipe it. + * rsp of CHECK_MAP cmd can be outdated a little so we do not trust it then. + */ + if (atomic_read(&map->check_state) != RMR_MAP_STATE_CHECKING) + pr_debug("map for member_id=%u cannot be cleared now, state changed\n", + map->member_id); + + for (i = 0; i < map->no_of_chunks; i++) { + id.a = 1; + id.b = i; + + rmr_map_unset_dirty(map, id, MAP_NO_FILTER); + + /* If the state changed since the last check then it is possible that after + * clear_bit of RMR_MAP_STATE_CHECK_CLEAR in the rmr_req_check_map we called + * rmr_map_insert. There we check that entry is already in the map and leave + * the function. But the following erease here would delete it. So we return + * erased entry back to the table if the state of checking changed. + */ + if (atomic_read(&map->check_state) != RMR_MAP_STATE_CHECKING) { + pr_debug("map for member_id=%u cannot be cleared now, state changed\n", + map->member_id); + + rmr_map_set_dirty(map, id, 0); + goto unlock; + } + } + pr_debug("clear map entries for member_id=%u is done\n", member_id); +unlock: + srcu_read_unlock(&pool->map_srcu, lock_idx); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); +} + +/** + * rmr_srv_check_map_clear() - periodic work that checks if the other node finished sync + * + * @work: delayed work structure to start and repeat the work + * + * Description: + * Check the dirty maps of all of the other pool members. If any of the maps is dirty + * then send check command and if the pool member responds that it has cleared his map, + * then we should clear it locally. When checking is done reschedule itself again. + * + * Return: + * no + * + * Context: + * runs in the process context. 
+ * + * Locks: + * no + */ +static void rmr_srv_check_map_clear(struct work_struct *work) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_pool *pool; + int i, lock_idx; + + srv_pool = container_of(to_delayed_work(work), struct rmr_srv_pool, clean_dwork); + + if (!srv_pool->pool) { + pr_debug("no rmr pool assigend to srv_pool yet.\n"); + goto out; + } + + pool = srv_pool->pool; + pr_debug("check map for srv pool %s started...\n", pool->poolname); + + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_debug("srv pool %s is not in normal state, skip map clear check", + pool->poolname); + goto out; + } + + if (!srv_pool->clt) { + pr_debug("srv pool %s does not have sync pool assigned, skip map clear check\n", + pool->poolname); + goto out; + } + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; i < pool->maps_cnt; i++) { + struct rmr_dirty_id_map *map; + u8 member_id; + int ret; + + map = rcu_dereference(pool->maps[i]); + if (WARN_ON(!map)) + break; + + member_id = map->member_id; + if (member_id == srv_pool->member_id) { + pr_debug("srv pool %s skip checking map with id %u, since it is me.\n", + pool->poolname, member_id); + continue; + } + + if (rmr_map_empty(map)) { + pr_debug("srv pool %s map for member_id=%u is empty, no need to check\n", + pool->poolname, map->member_id); + continue; + } + + atomic_set(&map->check_state, RMR_MAP_STATE_CHECKING); + + ret = rmr_clt_pool_member_synced(srv_pool->clt, member_id); + if (ret < 0) { + pr_debug("pool %s failed to check if member_id=%u synced, ret %d\n", + pool->poolname, member_id, ret); + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + continue; + } + + pr_debug("pool %s check if pool member %u synced, reported %u\n\n", + pool->poolname, member_id, ret); + if (ret) + rmr_srv_clear_map(pool, member_id); + + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + pr_debug("check map for pool %s done. 
schedule next one.\n", pool->poolname);
+
+out:
+	queue_delayed_work(srv_pool->clean_wq, &srv_pool->clean_dwork,
+			   msecs_to_jiffies(RMR_SRV_CHECK_MAPS_INTERVAL_MS));
+}
+
+struct rmr_srv_pool *rmr_create_srv_pool(char *poolname, u32 member_id)
+{
+	struct rmr_srv_pool *srv_pool;
+
+	srv_pool = kzalloc(sizeof(struct rmr_srv_pool), GFP_KERNEL);
+	if (unlikely(!srv_pool))
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_EMPTY);
+	srv_pool->maintenance_mode = false;
+	refcount_set(&srv_pool->refcount, 1);
+	mutex_init(&srv_pool->srv_pool_lock);
+
+	atomic_set(&srv_pool->store_state, false);
+
+	srv_pool->member_id = member_id;
+	srv_pool->max_sync_io_size = U32_MAX;
+
+	/* Sync thread */
+	srv_pool->th_tsk = NULL;
+	atomic_set(&srv_pool->thread_state, SYNC_THREAD_STOPPED);
+	atomic_set(&srv_pool->in_flight_sync_reqs, 0);
+
+	/* Work that cleans outdated entries from the maps */
+	srv_pool->clean_wq = alloc_workqueue("%s_clean_wq", 0, 0, poolname);
+	if (!srv_pool->clean_wq) {
+		pr_err("failed to create clean_wq for pool %s\n", poolname);
+		kfree(srv_pool);
+		return ERR_PTR(-ENOMEM);
+	}
+	INIT_DELAYED_WORK(&srv_pool->clean_dwork, rmr_srv_check_map_clear);
+	queue_delayed_work(srv_pool->clean_wq, &srv_pool->clean_dwork,
+			   msecs_to_jiffies(RMR_SRV_CHECK_MAPS_INTERVAL_MS));
+
+	/* sync metadata of the rmr pool */
+	srv_pool->md_sync_wq = alloc_workqueue("%s_md_sync_wq", 0, 0, poolname);
+	if (!srv_pool->md_sync_wq) {
+		pr_err("failed to create md_sync_wq for pool %s\n", poolname);
+		/* The clean work is already queued; tear it down before freeing. */
+		cancel_delayed_work_sync(&srv_pool->clean_dwork);
+		destroy_workqueue(srv_pool->clean_wq);
+		kfree(srv_pool);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	INIT_DELAYED_WORK(&srv_pool->md_sync_dwork, rmr_srv_md_sync);
+	/* Not queued initially; the first dirty event schedules the work. */
+	return srv_pool;
+}
+
+void rmr_srv_pool_update_params(struct rmr_pool *pool)
+{
+	pr_info("%s: Setting chunk_size for pool %s to %d\n",
+		__func__, pool->poolname, chunk_size);
+	pool->chunk_size = chunk_size;
+	pool->chunk_size_shift = ilog2(chunk_size);
+}
+
+static struct rmr_pool *rmr_srv_sess_get_pool(struct rmr_srv_sess *srv_sess, u32 group_id)
+{
+	struct rmr_pool *pool;
+	struct rmr_srv_pool *srv_pool;
+	bool ret;
+
+	rcu_read_lock();
+	pool = xa_load(&srv_sess->pools, group_id);
+	if (!pool) {
+		pool = ERR_PTR(-ENXIO);
+		goto out;
+	}
+
+	srv_pool = (struct rmr_srv_pool *)pool->priv;
+	ret = rmr_get_srv_pool(srv_pool);
+	if (!ret)
+		pool = ERR_PTR(-ENXIO);
+
+out:
+	rcu_read_unlock();
+	return pool;
+}
+
+static void rmr_srv_sess_put_pool(struct rmr_pool *pool)
+{
+	struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv;
+
+	rmr_put_srv_pool(srv_pool);
+}
+
+/**
+ * rmr_srv_endreq() - Function called when an rmr server request finishes processing
+ *
+ * @req: Pointer to the request ending
+ * @err: Error value.
Would be 0 for a successful request + */ +void rmr_srv_endreq(struct rmr_srv_req *req, int err) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_pool *pool = srv_pool->pool; + struct rtrs_srv_op *rtrs_op = req->rtrs_op; + struct rmr_dirty_id_map *map; + int i; + + if (req->flags == RMR_OP_MD_WRITE || req->flags == RMR_OP_MD_READ) { + if (unlikely(err)) + pr_err("Failed to complete the md req %x\n", req->flags); + goto put_ref; + } else if (unlikely(err) && !req->sync) { + struct rmr_srv_pool *srv_pool = req->srv_pool; + + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + } else if (rmr_op(req->flags) == RMR_OP_WRITE) { + srv_pool->last_io[req->mem_id].a = req->id.a; + srv_pool->last_io[req->mem_id].b = req->id.b; + + if (!test_and_set_bit(MD_DIRTY_LAST_IO, &srv_pool->md_dirty)) { + mod_delayed_work(srv_pool->md_sync_wq, + &srv_pool->md_sync_dwork, + msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS)); + } + + for (i = 0; i < req->failed_cnt; i++) { + int err; + + map = rmr_pool_find_map(srv_pool->pool, req->failed_srv_id[i]); + if (!map) { + pr_err("Cannot find map for srv_id %u\n", req->failed_srv_id[i]); + err = -EINVAL; + goto out; + } + + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + rmr_map_set_dirty(map, req->id, 0); + + if (req->map_ver > srv_pool->pool->map_ver) + srv_pool->pool->map_ver = req->map_ver; + } + if (req->failed_cnt) { + rmr_srv_mark_pool_md_dirty(srv_pool); + rmr_srv_mark_maps_dirty(srv_pool); + } + } + +out: + /* The requests created by rmr-srv don't use rtrs_op. */ + rtrs_srv_resp_rdma(rtrs_op, err); + rmr_srv_sess_put_pool(req->srv_pool->pool); +put_ref: + percpu_ref_put(&pool->ids_inflight_ref); +} + +static void rmr_srv_stop_sync_and_unset_store(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + atomic_set(&srv_pool->store_state, false); + + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_REQ_STOP); + wake_up_process(srv_pool->th_tsk); + + while (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(1000)); + } + } +} + +static void rmr_srv_delete_store_member(struct rmr_pool *pool, unsigned long id) +{ + rmr_pool_remove_map(pool, id); + xa_erase(&pool->stg_members, id); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); +} + +/** + * rmr_srv_add_store_member() - Register a storage member and create its dirty map + * + * @pool: The pool to which the member belongs. + * @id: Member ID of the storage node to add. + * + * Records @id in pool->stg_members and allocates a dirty map for it. + * On failure the stg_members entry is removed before returning. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_add_store_member(struct rmr_pool *pool, unsigned long id) +{ + struct rmr_dirty_id_map *map; + int ret; + + map = rmr_pool_find_map(pool, id); + if (map) { + pr_err("%s: pool %s, member_id %lu map already exists\n", + __func__, pool->poolname, id); + return -EEXIST; + } + + ret = xa_err(xa_store(&pool->stg_members, id, XA_TRUE, GFP_KERNEL)); + if (ret) { + pr_err("%s: Failed to add storage member %lu: %d\n", + __func__, id, ret); + return ret; + } + + /* + * Create the map of the newly added member. 
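+	 * If the map creation fails, the stg_members entry stored above is
+	 * erased again so the xarray and the map list stay consistent.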
+ */ + map = rmr_map_create(pool, id); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + pr_err("%s: pool %s, member_id %lu failed to create map on err %d: %pe\n", + __func__, pool->poolname, id, ret, map); + goto rem_store; + } + return 0; + +rem_store: + xa_erase(&pool->stg_members, id); + return ret; +} + +/** + * rmr_srv_handle_other_member_add() - Handle a POOL_INFO ADD message for a different member + * + * @srv_pool: The server pool receiving the notification. + * @pool_info_cmd: The received POOL_INFO command carrying member_id, mode, and dirty. + * + * For %RMR_POOL_INFO_MODE_ASSEMBLE, verifies that the member and its dirty map + * already exist (the node is rejoining a pool it was previously part of). + * For %RMR_POOL_INFO_MODE_CREATE, adds the member via rmr_srv_add_store_member() + * and optionally marks its map fully dirty if the client reported outstanding data. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_handle_other_member_add(struct rmr_srv_pool *srv_pool, + const struct rmr_msg_pool_info_cmd *pool_info_cmd) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_dirty_id_map *map; + int ret; + + if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_ASSEMBLE) { + pr_info("%s: Member %u got add of member %u with mode assemble\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + + /* + * For assemble, member info should already exist. + */ + if (xa_load(&pool->stg_members, pool_info_cmd->member_id) != XA_TRUE) { + pr_err("%s: pool %s, member_id %u not present\n", + __func__, pool->poolname, pool_info_cmd->member_id); + return -ENOENT; + } + + map = rmr_pool_find_map(pool, pool_info_cmd->member_id); + if (!map) { + pr_err("%s: pool %s, member_id %u, map not present\n", + __func__, pool->poolname, pool_info_cmd->member_id); + return -ENOENT; + } + } else if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_CREATE && + pool_info_cmd->member_id != srv_pool->member_id) { + pr_info("%s: Member %u got add of member %u with mode create\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + + ret = rmr_srv_add_store_member(pool, pool_info_cmd->member_id); + if (ret) { + pr_err("%s: rmr_srv_add_store_member failed %d\n", __func__, ret); + return ret; + } + + if (pool_info_cmd->dirty) { + map = rmr_pool_find_map(pool, pool_info_cmd->member_id); + if (WARN_ON(!map)) { + xa_erase(&pool->stg_members, pool_info_cmd->member_id); + return -EINVAL; + } + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + } + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + } else { + pr_err("%s: pool %s, member_id %u, unexpected mode %u for ADD operation\n", + __func__, pool->poolname, pool_info_cmd->member_id, + pool_info_cmd->mode); + return -EINVAL; + } + + return 0; +} + +int rmr_srv_query(struct rmr_pool *pool, u64 mapped_size, struct rmr_attrs *attr) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_dirty_id_map *map; + size_t queue_depth; + + if (pool) { + srv_pool = (struct rmr_srv_pool *)pool->priv; + queue_depth = srv_pool->queue_depth; + } else { + /* + * If pool is NULL, we are being called to estimate the md size + * before the pool is created. Use max queue depth in that case. + */ + queue_depth = RMR_SRV_MAX_QDEPTH; + } + + /* + * Dummy map structure, so that we can reuse the update map param function. 
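+	 * Only no_of_chunks is filled in; rmr_map_update_page_params() then
+	 * derives the per-map page counts (total_slp) used to size the metadata.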
+ */ + map = (struct rmr_dirty_id_map *)get_zeroed_page(GFP_KERNEL); + if (!map) { + pr_err("%s: Cannot allocate map\n", __func__); + return -ENOMEM; + } + + map->no_of_chunks = (mapped_size >> (ilog2(chunk_size) - 9)); + rmr_map_update_page_params(map); + + attr->rmr_md_size = (map->total_slp * PAGE_SIZE * RMR_POOL_MAX_SESS) + RMR_MD_SIZE; + attr->rmr_md_size += (queue_depth * sizeof(*srv_pool->last_io_idx)); + + attr->rmr_md_size = attr->rmr_md_size / SECTOR_SIZE; + + free_page((unsigned long)map); + return 0; +} +EXPORT_SYMBOL(rmr_srv_query); + +/** + * rmr_srv_set_map() - Create the dirty map for this server's member in the pool + * + * @pool: The pool for which the map is to be created. + * @mode: Registration mode; if %RMR_SRV_DISK_REPLACE, any existing map for + * this member is removed before creating the new one. + * + * Description: + * Invoked after the mapped size of the pool has been validated. Updates + * pool metadata with the mapped size, recalculates the chunk count, and + * calls rmr_srv_add_store_member() to register this node's map. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_set_map(struct rmr_pool *pool, enum rmr_srv_register_disk_mode mode) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + int ret, md_i; + + pr_info("%s: Mapped size of the pool %s is set to %lld\n", + __func__, pool->poolname, pool->mapped_size); + + /* Update mapped_size in the pool metadata. */ + md_i = rmr_pool_find_md(&pool->pool_md, srv_pool->member_id, true); + if (md_i < 0) { + pr_err("No space for new member %d.\n", srv_pool->member_id); + return -ENOMEM; + } + pool->pool_md.srv_md[md_i].mapped_size = pool->mapped_size; + + /* + * The existing map is irrelevant if user asked for store REPLACE. + */ + if (mode == RMR_SRV_DISK_REPLACE) + rmr_pool_remove_map(pool, srv_pool->member_id); + + ret = rmr_srv_add_store_member(pool, srv_pool->member_id); + if (ret) { + pr_err("%s: rmr_srv_add_store_member failed %d\n", __func__, ret); + goto err_out; + } + + return ret; + +err_out: + pool->pool_md.srv_md[md_i].mapped_size = 0; + return ret; +} + +/** + * rmr_srv_register() - Register a backend store with an RMR server pool + * + * @poolname: Name of the pool to which the store is to be registered. + * @ops: Store operations pointer. + * @priv: Private data for the store. + * @mapped_size: Size of the storage device in sectors. + * @mode: Registration mode: %RMR_SRV_DISK_CREATE for a new store, + * %RMR_SRV_DISK_REPLACE to replace an existing one, or + * %RMR_SRV_DISK_ADD to rejoin an existing pool. + * + * Description: + * An RMR server pool requires a backend store to service I/Os. + * This function registers that store, sets up the pool's dirty map for + * this member, and records the marked_create flag for validation when + * the first client joins. + * + * Return: + * Pointer to the rmr_pool on success, NULL on error. 
+ */ +static bool rmr_srv_pool_has_non_sync_sess(struct rmr_pool *pool) +{ + struct rmr_srv_pool_sess *pool_sess; + + list_for_each_entry(pool_sess, &pool->sess_list, pool_entry) { + if (!pool_sess->sync) + return true; + } + return false; +} + +struct rmr_pool *rmr_srv_register(char *poolname, struct rmr_srv_store_ops *ops, void *priv, + u64 mapped_size, enum rmr_srv_register_disk_mode mode) +{ + struct rmr_pool *pool; + struct rmr_srv_io_store *io_store; + struct rmr_srv_pool *srv_pool; + u32 group_id = rmr_pool_hash(poolname); + enum rmr_srv_pool_state state; + int ret; + + srv_pool = rmr_find_and_get_srv_pool(group_id); + if (IS_ERR(srv_pool)) { + pr_err("pool %s does not exists: %pe\n", poolname, srv_pool); + return NULL; + } + pool = srv_pool->pool; + + mutex_lock(&srv_pool->srv_pool_lock); + if (mode == RMR_SRV_DISK_CREATE && + (rmr_srv_pool_has_non_sync_sess(pool) || + rmr_pool_find_map(pool, srv_pool->member_id))) { + pr_err("%s: Cannot register (create) new backend for %s; Sessions/Map exists\n", + __func__, poolname); + ret = -EEXIST; + goto put_err; + } + + if (mode == RMR_SRV_DISK_REPLACE && + (!rmr_srv_pool_has_non_sync_sess(pool))) { + pr_err("%s: Cannot register (replace) new backend for %s; No non-sync session\n", + __func__, poolname); + ret = -EINVAL; + goto put_err; + } + + if (srv_pool->io_store) { + pr_err("Srv pool %s already has store registered\n", poolname); + goto put_err; + } + + if (pool->mapped_size && pool->mapped_size != mapped_size) { + pr_err("Pool %s already has mapped size %lld, cannot register store with %lld\n", + poolname, pool->mapped_size, mapped_size); + ret = -EINVAL; + goto put_err; + } + + io_store = kzalloc(sizeof(*io_store), GFP_KERNEL); + if (!io_store) { + pr_err("Failed to allocate io_store for %s\n", poolname); + goto put_err; + } + + pool->mapped_size = mapped_size; + io_store->ops = ops; + io_store->priv = priv; + srv_pool->io_store = io_store; + + /* The pool updates its number of tracking chunks with the mapped size just provided. */ + rmr_pool_update_no_of_chunk(pool); + + if (mode == RMR_SRV_DISK_CREATE || mode == RMR_SRV_DISK_REPLACE) { + ret = rmr_srv_set_map(pool, mode); + if (ret) { + pr_err("%s: failed to set maps in rmr pool %s, err %d\n", + __func__, poolname, ret); + goto free_io_store; + } + } else if (mode == RMR_SRV_DISK_ADD) { + /* + * Read the pool metadata stored on this device before md_sync writes + * new metadata to the store. 
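+		 * This way a re-added member comes back with the maps and
+		 * last_io information recorded during its previous membership
+		 * instead of overwriting them with fresh state.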
+ */ + ret = rmr_srv_refresh_md(srv_pool); + if (ret) { + pr_err("%s: cannot refresh md of the pool\n", __func__); + goto free_io_store; + } + } else { + pr_err("%s: Wrong register disk mode %d\n", __func__, mode); + ret = -EINVAL; + goto free_io_store; + } + + srv_pool->marked_create = (mode == RMR_SRV_DISK_CREATE); + atomic_set(&srv_pool->store_state, true); + rmr_srv_mark_pool_md_dirty(srv_pool); + state = atomic_read(&srv_pool->state); + if (state != RMR_SRV_POOL_STATE_NORMAL && + state != RMR_SRV_POOL_STATE_NO_IO) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_REGISTERED); + mutex_unlock(&srv_pool->srv_pool_lock); + + __module_get(THIS_MODULE); + pr_info("Registered store with pool %s\n", poolname); + + return srv_pool->pool; + +free_io_store: + kfree(io_store); + srv_pool->io_store = NULL; +put_err: + mutex_unlock(&srv_pool->srv_pool_lock); + rmr_put_srv_pool(srv_pool); + return NULL; +} +EXPORT_SYMBOL(rmr_srv_register); + +static void rmr_srv_delete_md(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map = NULL; + int err, lock_idx; + u32 map_region_offset = rmr_bitmap_offset(pool->pool_md.queue_depth) + RMR_MAP_BUF_HDR_SIZE; + u64 per_map_size = 0; + u64 len; + u8 map_idx; + void *buf; + + /* + * It could happen to access the pool while the pool is not there. Use reference counting + * for server pool to avoid the issue. + */ + err = rmr_get_srv_pool(srv_pool); + if (!err) { + pr_err("%s: pool is not there\n", __func__); + return; + } + + len = rmr_bitmap_offset(pool->pool_md.queue_depth) + PAGE_SIZE; + buf = kzalloc(len, GFP_KERNEL); + if (!buf) + goto put_pool; + + /* + * On-disk layout of rmr pool metadata: + * + * 0 RMR_MD_SIZE +last_io_len +PAGE_SIZE + * +-----------+-------------+---------------+--------------------+ + * | pool_md | last_io | hdr_region | maps_region ... | + * +-----------+-------------+---------------+--------------------+ + * <-RMR_MD_SIZE><-last_io_len><--PAGE_SIZE--> maps_cnt * per_map + */ + err = process_md_io(pool, NULL, 0, len, RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: failed to process md write io with err 0x%x.\n", __func__, err); + + /* + * Zero the bitmap on disk using O(1) offset formula. 
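+	 * Each map occupies per_map_size = total_slp * PAGE_SIZE bytes starting
+	 * at map_region_offset + map_idx * per_map_size, so the zeroed buffer
+	 * can simply be written out page by page over that range.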
+ */ + lock_idx = srcu_read_lock(&pool->map_srcu); + for (map_idx = 0; map_idx < pool->maps_cnt; map_idx++) { + u32 map_data_offset; + el_flp *flp_ptr; + u64 no_of_slps; + int i, j; + + map = rcu_dereference(pool->maps[map_idx]); + if (WARN_ON(!map)) + break; + + per_map_size = map->total_slp * PAGE_SIZE; + map_data_offset = map_region_offset + map_idx * per_map_size; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + err = process_md_io(pool, NULL, map_data_offset, + PAGE_SIZE, RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: bitmap write failed at 0x%x, err 0x%x.\n", + __func__, map_data_offset, err); + map_data_offset += PAGE_SIZE; + } + } + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + rmr_srv_delete_store_member(pool, srv_pool->member_id); + + free_page((unsigned long)buf); +put_pool: + rmr_put_srv_pool(srv_pool); +} + +/** + * rmr_srv_unregister() - Unregister the backend store from rmr server pool + * + * @poolname: Name of the pool from which the store is to be unregistered + * @delete: If true, delete all the metadata associated with this pool + * + * Description: + * rmr server pool needs a backend store which serves the IOs + * This function is used to unregister a backend store from rmr server pool. + * + * Return: + * None + */ +void rmr_srv_unregister(char *poolname, bool delete) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + struct rmr_srv_io_store *io_store; + + mutex_lock(&pool_mutex); + pool = rmr_find_pool(poolname); + mutex_unlock(&pool_mutex); + + if (!pool) { + pr_err("%s, Pool %s does not exists\n", __func__, poolname); + return; + } + + srv_pool = (struct rmr_srv_pool *)pool->priv; + mutex_lock(&srv_pool->srv_pool_lock); + + if (!srv_pool->io_store) { + pr_err("Srv pool %s not registered\n", poolname); + mutex_unlock(&srv_pool->srv_pool_lock); + return; + } + + if (srv_pool->marked_delete) { + if (!delete) { + pr_err("%s: Storage server marked for delete, but delete mode not set\n", + __func__); + pr_err("%s: Continuing with only removal", __func__); + } + } else if (!srv_pool->marked_create && delete) { + pr_err("%s: Storage server not marked for delete, abandoning delete.\n", __func__); + delete = false; + } + + io_store = srv_pool->io_store; + + rmr_srv_stop_sync_and_unset_store(pool); + + percpu_ref_kill_and_confirm(&pool->ids_inflight_ref, rmr_pool_confirm_inflight_ref); + wait_for_completion(&pool->complete_done); + wait_for_completion(&pool->confirm_done); + + /* + * Re-init so metadata IO can go in if needed + */ + reinit_completion(&pool->complete_done); + reinit_completion(&pool->confirm_done); + percpu_ref_reinit(&pool->ids_inflight_ref); + + if (delete) + rmr_srv_delete_md(pool); + + kfree(srv_pool->io_store); + srv_pool->io_store = NULL; + + mutex_lock(&pool->sess_lock); + if (!rmr_srv_pool_has_non_sync_sess(pool)) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_EMPTY); + mutex_unlock(&pool->sess_lock); + + srv_pool->marked_delete = false; + mutex_unlock(&srv_pool->srv_pool_lock); + + pool->mapped_size = 0; + + rmr_put_srv_pool(srv_pool); + + pr_info("Unregistered store with pool %s\n", poolname); + + module_put(THIS_MODULE); +} +EXPORT_SYMBOL(rmr_srv_unregister); + +/** + * rmr_srv_pool_cmd_with_rsp() - Sends a user command to all sessions of the internal (sync) clt + * + * @pool: rmr pool to which the command is for 
+ * @conf: confirmation function to be called after completion + * @priv: pointer to priv data, to be returned to user while calling conf function + * @usr_vec: kvec containing user data (mostly command messages?) + * @nr: number of kvecs + * @buf: buf where the response from the user server is to be directed + * @buf_len: length of the buffer + * @size: size of the buf to be sent to a single session + * + * Description: + * This function provides an interface for the user to send commands to storage nodes connected + * through the internal network of this rmr pool. + * It redirects the command through the rmr-client pool in this storage node, which then sends + * the command to all the storage nodes it is connected to. + * The command is sent as a read, so that the response from the user srv side can be received + * The buffer sent by the user is meant to receive the response from the user server side. + * The size of the buffer is set during rmr_clt_open. + * + * Return: + * 0 on success + * negative errno in case of error + * + * Context: + * Inflight commands will block map update, until the inflights are completed. + */ +int rmr_srv_pool_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv, + const struct kvec *usr_vec, size_t nr, void *buf, int buf_len, + size_t size) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!srv_pool->clt) { + pr_warn("srv pool %s does not have sync pool assigned.\n", + pool->poolname); + return -EAGAIN; + } + + return rmr_clt_cmd_with_rsp(srv_pool->clt, conf, priv, usr_vec, nr, buf, buf_len, size); +} +EXPORT_SYMBOL(rmr_srv_pool_cmd_with_rsp); + +static int rmr_srv_send_discard_all(struct rmr_pool *pool, u8 member_id) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_pool *sync_pool = srv_pool->clt; + struct rmr_msg_pool_cmd msg = {}; + int err; + + /* + * If the member_id is not this server's member_id, it means this server is the receiving + * node of the discard request. + */ + if (srv_pool->member_id != member_id) + return 0; + + pr_info("%s: Send discards across storage nodes for pool %s\n", + __func__, pool->poolname); + + rmr_clt_init_cmd(sync_pool, &msg); + msg.cmd_type = RMR_CMD_SEND_DISCARD; + msg.send_discard_cmd.member_id = member_id; + + err = rmr_clt_pool_send_all(sync_pool, &msg); + if (err) { + pr_err("Failed to send discard cmd for pool %s: %d\n", + pool->poolname, err); + } + return err; +} + +/** + * rmr_srv_discard_id() - discard the data chunks of length from offset on disk + * + * @pool: source pool. + * @offset offset in bytes. + * @length: length in bytes + * @member_id: member id of the storage node to discard the data from. If 0, then the node is + * this server pool. + * @sync: indicates whether to send sync requests to other connected nodes. + * + * Return: + * 0 on success, err code otherwise + * + * Description: + * This function discards the data chunks on the server with member_id. It will mark the + * data chunks as dirty and set the discard_entries flag of the corresponding srv_md true. + * Then it notifies all the connected nodes it has discarded data. 
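+ * A zero @length discards the whole mapped range: the complete map is marked
+ * dirty instead of a single chunk.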
+ */ +int rmr_srv_discard_id(struct rmr_pool *pool, u64 offset, u64 length, u8 member_id, bool sync) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + rmr_id_t id; + int md_i, err; + + if (!member_id) + member_id = srv_pool->member_id; + + map = rmr_pool_find_map(pool, member_id); + if (!map) { + pr_err("for srv pool %s cannot find map for member_id %u\n", + pool->poolname, member_id); + return -EINVAL; + } + + md_i = rmr_pool_find_md(&pool->pool_md, member_id, false); + if (md_i < 0) { + pr_err("%s: for srv pool %s cannot find md for member_id %u\n", + __func__, pool->poolname, member_id); + return -EINVAL; + } + + /* + * If this node has received a response of the discard request from a normal server, + * the node will continue to mark all the data chunks as dirty. + */ + if (member_id == srv_pool->member_id && sync) { + if (!srv_pool->clt) { + pr_err("pool %s has no sync pool assigned. Cannot send discards.\n", + pool->poolname); + return -ENXIO; + } + + /* + * This node tries to send discards to all its connected nodes. The other node + * that has received the discards will start a new round. In the end, all normal + * nodes that are connected to this node should receive the discards. + */ + err = rmr_srv_send_discard_all(pool, member_id); + if (err) { + pr_err("%s: no server receives discards for pool %s: %d\n", + __func__, pool->poolname, err); + return err; + } + } + + /* + * Set the discard_entries flag of the corresponding srv_md true. Be careful that setting + * the wrong srv_md will lead to loops of discards. + */ + pool->pool_md.srv_md[md_i].discard_entries = true; + rmr_srv_mark_pool_md_dirty(srv_pool); + + if (length) { + rmr_map_calc_chunk(pool, offset, length, &id); + rmr_map_set_dirty(map, id, MAP_ENTRY_UNSYNCED); + } else { + /* discard all data chunks */ + rmr_map_set_dirty_all(map, MAP_ENTRY_UNSYNCED); + pr_info("%s: Discard all data chunks for member_id %u in srv_pool %s: %u\n", + __func__, member_id, pool->poolname, srv_pool->member_id); + } + + rmr_map_clear_filter_all(map, MAP_ENTRY_UNSYNCED); + rmr_srv_mark_maps_dirty(srv_pool); + + return 0; +} +EXPORT_SYMBOL(rmr_srv_discard_id); + +void rmr_srv_replace_store(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + RMR_STORE_SET_REPLACE(pool->map_ver); + rmr_srv_flush_pool_md(srv_pool); +} +EXPORT_SYMBOL(rmr_srv_replace_store); + +/** + * rmr_srv_pool_check_store() - Check whether IO is allowed for a pool or not + * + * @pool: pool to check + * + * Return: + * 1 if IO is allowed, 0 therwise + * + * Description: + * For a rmr-srv pool, the store registered provides a way to check whether it can process + * IOs or not. 
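+ * The decision is delegated to the store's io_allowed() callback.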
+ */ +static int rmr_srv_pool_check_store(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_io_store *store = srv_pool->io_store; + void *store_priv; + + if (!store) { + pr_debug("for srv pool %s no store assigned\n", pool->poolname); + return false; + } + + if (!store->ops) { + pr_err("for pool %s store has no ops assigned\n", pool->poolname); + return false; + } + store_priv = store->priv; + + return store->ops->io_allowed(store_priv); +} + +/** + * process_msg_io() - Process IO message + * + * @srv_sess: rmr srv session over which the message was received + * @rtrs_op: rtrs IO context + * @data: pointer to data buf + * @datalen: len of data buf + * @usr: pointer to user buf + * @usrlen: len of user buf + * + * Return: + * 0 on success + * negative error code otherwise + * + * Description: + * Perform some basic checks. + * Create an IO request and start its state machine. + */ +static int process_msg_io(struct rmr_srv_sess *srv_sess, + struct rtrs_srv_op *rtrs_op, void *data, + u32 datalen, const void *usr, size_t usrlen) +{ + const struct rmr_msg_io *msg = usr; + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + struct rmr_srv_req *req; + int err = 0; + u32 group_id = le32_to_cpu(msg->hdr.group_id); + + pool = rmr_srv_sess_get_pool(srv_sess, group_id); + if (IS_ERR(pool)) { + pr_err_ratelimited("Got I/O request on session %s for unknown pool group id %d: %pe\n", + srv_sess->sessname, group_id, pool); + return PTR_ERR(pool); + } + + srv_pool = (struct rmr_srv_pool *)pool->priv; + + /* + * No new references will come in after we have killed the percpu_ref. + * Percpu_ref_tryget_live() returns false when @confirm_kill in + * percpu_ref_kill_and_confirm() is done. + */ + if (!percpu_ref_tryget_live(&pool->ids_inflight_ref)) { + err = -EIO; + goto no_put; + } + + if (!atomic_read(&srv_pool->store_state) || + atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_err_ratelimited("server pool %s is not up for IO (state = %s)\n", + pool->poolname, + rmr_get_srv_pool_state_name(atomic_read(&srv_pool->state))); + err = -EIO; + goto put_pool; + } + + /* + * The IOs coming from internal sync sessions are always READ. + */ + if (msg->sync && rmr_op(le32_to_cpu(msg->flags)) != RMR_OP_READ) { + pr_err_ratelimited("process_msg_io: pool %s write IO from internal connection.\n", + pool->poolname); + err = -EIO; + goto put_pool; + } + + /* + * For non internal IOs, make sure the underlying store is ready for IO + */ + if (!msg->sync && !rmr_srv_pool_check_store(pool)) { + pr_err("process_msg_io: pool %s IO not allowed\n", pool->poolname); + err = -EIO; + goto put_pool; + } + + req = rmr_srv_req_create(msg, srv_pool, rtrs_op, data, datalen, rmr_srv_endreq); + if (IS_ERR(req)) { + pr_err("Failed to create rmr_req %pe\n", req); + + //TODO: do we have to rtrs_srv_resp_rdma here ? 
+ err = PTR_ERR(req); + goto put_pool; + } + + rmr_req_submit(req); + return 0; + +put_pool: + percpu_ref_put(&pool->ids_inflight_ref); + +no_put: + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + rmr_srv_sess_put_pool(pool); + return err; +} + +int rmr_srv_get_sync_permit(struct rmr_srv_pool *srv_pool) +{ + atomic_inc(&srv_pool->in_flight_sync_reqs); + + while (atomic_read(&srv_pool->in_flight_sync_reqs) >= sync_queue_depth) { + /* Permit overslow; sleep */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_RUNNING) { + atomic_dec(&srv_pool->in_flight_sync_reqs); + + return -EINTR; + } + } + + return 0; +} + +void rmr_srv_put_sync_permit(struct rmr_srv_pool *srv_pool) +{ + atomic_dec(&srv_pool->in_flight_sync_reqs); + + wake_up_process(srv_pool->th_tsk); +} + +static int rmr_srv_sync_map(void *arg) +{ + struct rmr_srv_pool *srv_pool = arg; + struct rmr_pool *pool = srv_pool->pool; + struct rmr_dirty_id_map *map; + rmr_id_t rmr_id; + struct rmr_map_entry *entry; + int err = 0; + u64 i; + + pr_info("Sync thread starting!\n"); + + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + /* + * We do not need to error out here. + * Since no session has ever been added to this pool, + * it technically means this pool is in sync state. + */ + pr_info("No map found for pool %s\n", pool->poolname); + goto out; + } + + rmr_id.a = 1; + for (i = 0; i < map->no_of_chunks; i++) { + if (atomic_read(&srv_pool->thread_state) == SYNC_THREAD_REQ_STOP) { + pr_info("Request to stop sync thread\n"); + err = -EINTR; + goto err; + } + + if (!atomic_read(&srv_pool->store_state) || + atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_WAIT); + pr_err("Pool not in desired state\n"); + /* Unsure what error to return here */ + err = -EINVAL; + goto err; + } + + rmr_id.b = i; + entry = rmr_map_get_dirty_entry(map, rmr_id); + if (entry) { + if (atomic_cmpxchg(&entry->sync_cnt, -1, 0) != -1) { + /* someone has already started sync for this id */ + continue; + } + + err = rmr_srv_sync_chunk_id(srv_pool, entry, rmr_id, true); + if (err) { + /* this is to undo the previous cmpxchg if the error in + * rmr_srv_sync_chunk_id happened before any requests were created + */ + atomic_cmpxchg(&entry->sync_cnt, 0, -1); + pr_err("Failed to sync chunk (%llu, %llu)\n", rmr_id.a, rmr_id.b); + goto err; + } + } + } + + /* + * Finished syncing chunks, + * Now change the thread state to wait, + * to wait for the in flight syncs + */ + atomic_set(&srv_pool->thread_state, SYNC_THREAD_WAIT); + +err: + while (atomic_read(&srv_pool->in_flight_sync_reqs) != 0) { + /* + * Wait for all permits to get freed. + * Since the completion path needs this thread to + * be up and running + */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + //TODO: should it be timeout? 
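+		/*
+		 * rmr_srv_put_sync_permit() wakes this thread for every
+		 * completed sync request, so the loop makes progress without
+		 * a timeout as long as completions keep arriving.
+		 */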
+ } + +out: + atomic_set(&srv_pool->thread_state, SYNC_THREAD_STOPPED); + + pr_info("Sync thread exiting with err %d\n", err); + return err; +} + +int rmr_srv_sync_thread_start(struct rmr_srv_pool *srv_pool) +{ + atomic_set(&srv_pool->in_flight_sync_reqs, 0); + srv_pool->th_tsk = kthread_run(rmr_srv_sync_map, srv_pool, + "rmr_srv_sync_thread"); + if (IS_ERR(srv_pool->th_tsk)) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_STOPPED); + return -ENOMEM; + } + + atomic_set(&srv_pool->thread_state, SYNC_THREAD_RUNNING); + return 0; +} + +int rmr_srv_sync_thread_stop(struct rmr_srv_pool *srv_pool) +{ + if (atomic_read(&srv_pool->thread_state) == SYNC_THREAD_RUNNING) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_REQ_STOP); + wake_up_process(srv_pool->th_tsk); + } + + return 0; +} + +void rmr_srv_sync_req_failed(struct rmr_srv_pool *srv_pool) +{ + /* + * TODO: Investigate the necessity to change server state + * to RMR_SRV_POOL_STATE_NO_IO for sync_req failure. + */ + // rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + rmr_srv_sync_thread_stop(srv_pool); +} + +static void rmr_srv_read_map_buf(struct rmr_pool *pool, void *buf, size_t buflen, + const struct rmr_msg_map_buf_cmd *map_buf_cmd) +{ + int size; + u8 map_idx = map_buf_cmd->map_idx; + u64 slp_idx = map_buf_cmd->slp_idx; + + size = rmr_pool_maps_to_buf(pool, &map_idx, &slp_idx, buf, buflen, MAP_NO_FILTER); + if (size == 0) { + // No more dirty map to write + struct rmr_map_buf_hdr *map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + + map_buf_hdr->version = RMR_MAP_FORMAT_VER; + map_buf_hdr->member_id = 0; + } +} + +static void rmr_srv_update_md_buf(struct rmr_srv_pool *srv_pool, void *buf, size_t buflen) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_pool_md *pool_md = &pool->pool_md; + struct rmr_pool_md *buf_md = (struct rmr_pool_md *)buf; + u8 member_id = srv_pool->member_id; + int idx, buf_idx; + + /* Zero out the buffer in case data is corrupted somehow. */ + memset(buf, 0, buflen); + idx = rmr_pool_find_md(pool_md, member_id, false); + if (idx < 0) { + pr_err("The server pool hasn't updated srv_md yet %d\n", member_id); + return; + } + + buf_idx = rmr_pool_find_md(buf_md, member_id, true); + if (buf_idx < 0) { + pr_err("The buffer has no space for the member_id %d\n", member_id); + return; + } + + memcpy(&buf_md->srv_md[buf_idx], &pool_md->srv_md[idx], sizeof(struct rmr_srv_md)); +} + +static int rmr_srv_save_last_io_to_map(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + int i, j, lock_idx; + + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("no map found for member_id %u\n", srv_pool->member_id); + return -EINVAL; + } + + for (i = 0; i < srv_pool->queue_depth; i++) { + rmr_id_t *id; + struct rmr_dirty_id_map *mp; + + id = &srv_pool->last_io[i]; + + if (id->a == U64_MAX && id->b == U64_MAX) + continue; + + if (rmr_map_check_dirty(map, *id)) { + /* + * We already have this id added to our map, and which says + * that its dirty for us. This means that last_io info about + * this id is outdated. 
+ * We honor the info in the map, and skip this entry + */ + continue; + } + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (j = 0; j < pool->maps_cnt; j++) { + mp = rcu_dereference(pool->maps[j]); + if (WARN_ON(!mp) || mp->member_id == srv_pool->member_id) + continue; + + rmr_map_set_dirty(mp, *id, 0); + + // Clean the entry since it has been used up + id->a = U64_MAX; + id->b = U64_MAX; + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + } + + rmr_srv_mark_maps_dirty(srv_pool); + return 0; +} + +/** + * process_msg_user_cmd() - Process user command + * + * @pool: rmr pool + * @cmd_msg: pointer to command message. The user data is right after this struct. + * @data: data buffer to be passed down the user + * @datalen: length of the user buffer + * + * Description: + * Pass down the user command to the user server side. + * The user command data is kept right after the pool command (see arranging of kvec) + * + * Return: + * 0 in case of success + * negative is case of failure + * + * Context: + * The call goes to the user server side. Care must be taken not to block. + */ +static int process_msg_user_cmd(struct rmr_srv_pool *srv_pool, + const struct rmr_msg_pool_cmd *cmd_msg, void *data, int datalen) +{ + struct rmr_srv_io_store *store = srv_pool->io_store; + size_t usr_len = cmd_msg->user_cmd.usr_len; + int ret; + + pr_debug("%s: cmd_len=%zu usr_len=%zu\n", __func__, sizeof(*cmd_msg), usr_len); + + if (!store) { + pr_err("%s: No store registered\n", __func__); + return -EAGAIN; + } + + ret = store->ops->submit_cmd(store->priv, cmd_msg + 1, usr_len, data, datalen); + + return ret; +} + +static void do_sess_leave_srv_sess(struct rmr_srv_pool_sess *pool_sess) +{ + struct rmr_srv_sess *srv_sess = pool_sess->srv_sess; + + mutex_lock(&srv_sess->lock); + list_del(&pool_sess->srv_sess_entry); + mutex_unlock(&srv_sess->lock); +} + +static void sess_leave_pool(struct rmr_pool *pool, + struct rmr_srv_pool_sess *pool_sess) +{ + struct rmr_srv_sess *srv_sess = pool_sess->srv_sess; + + pr_info("pool sesss %s leaves pool %s\n", + pool_sess->sessname, pool->poolname); + + mutex_lock(&pool->sess_lock); + list_del(&pool_sess->pool_entry); + xa_erase(&srv_sess->pools, pool->group_id); + mutex_unlock(&pool->sess_lock); + + rmr_srv_sysfs_del_sess(pool_sess); + + pool_sess->srv_pool = NULL; +} + +static void rmr_srv_free_pool_sess(struct rmr_srv_pool_sess *pool_sess) +{ + kfree(pool_sess); +} + +static void destroy_sess(struct rmr_srv_sess *srv_sess) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_srv_pool_sess *pool_sess, *tmp; + + // why do they do this in rnbd srv ? 
+	// if (list_empty(&srv_sess->pool_sess_list))
+	//	goto out;
+
+	mutex_lock(&srv_sess->lock);
+	list_for_each_entry_safe(pool_sess, tmp, &srv_sess->pool_sess_list, srv_sess_entry) {
+		list_del(&pool_sess->srv_sess_entry);
+		srv_pool = pool_sess->srv_pool;
+
+		// A network disconnect event
+		if (!pool_sess->sync)
+			rmr_srv_change_pool_state(pool_sess->srv_pool, RMR_SRV_POOL_STATE_NO_IO);
+
+		sess_leave_pool(srv_pool->pool, pool_sess);
+		rmr_put_srv_pool(srv_pool);
+		rmr_srv_free_pool_sess(pool_sess);
+	}
+	mutex_unlock(&srv_sess->lock);
+
+	xa_destroy(&srv_sess->pools);
+	might_sleep();
+
+	mutex_lock(&g_sess_lock);
+	list_del(&srv_sess->g_list_entry);
+	mutex_unlock(&g_sess_lock);
+
+	mutex_destroy(&srv_sess->lock);
+	kfree(srv_sess);
+}
+
+void rmr_srv_destroy_pool(struct rmr_pool *pool)
+{
+	struct rmr_srv_pool_sess *pool_sess, *tmp;
+	struct rmr_srv_pool *srv_pool;
+
+	if (!pool) {
+		pr_err("%s: pool is NULL\n", __func__);
+		return;
+	}
+	srv_pool = (struct rmr_srv_pool *)pool->priv;
+
+	list_for_each_entry_safe(pool_sess, tmp, &pool->sess_list, pool_entry) {
+		WARN_ON(!pool_sess->srv_pool);
+
+		do_sess_leave_srv_sess(pool_sess);
+		sess_leave_pool(srv_pool->pool, pool_sess);
+		rmr_put_srv_pool(srv_pool);
+		rmr_srv_free_pool_sess(pool_sess);
+	}
+}
+
+int rmr_srv_remove_clt_pool(struct rmr_srv_pool *srv_pool)
+{
+	struct rmr_pool *clt;
+
+	clt = srv_pool->clt;
+	if (!clt) {
+		pr_info("Srv pool %s has no internal clt pool assigned\n",
+			srv_pool->pool->poolname);
+		return -EINVAL;
+	}
+
+	pr_info("from pool %s remove sync (internal) pool %s\n",
+		srv_pool->pool->poolname, clt->poolname);
+	srv_pool->clt = NULL;
+
+	rmr_clt_close(clt);
+
+	pr_info("pool %s removed\n", clt->poolname);
+
+	return 0;
+}
+
+static int create_srv_sess(struct rtrs_srv_sess *rtrs)
+{
+	struct rmr_srv_sess *srv_sess;
+	char sessname[NAME_MAX];
+	int err;
+
+	err = rtrs_srv_get_path_name(rtrs, sessname, sizeof(sessname));
+	if (unlikely(err)) {
+		pr_err("rtrs_srv_get_path_name failed: %d\n", err);
+		return err;
+	}
+	srv_sess = kzalloc(sizeof(*srv_sess), GFP_KERNEL);
+	if (!srv_sess)
+		return -ENOMEM;
+
+	mutex_init(&srv_sess->lock);
+	srv_sess->rtrs = rtrs;
+	strscpy(srv_sess->sessname, sessname, NAME_MAX);
+	xa_init_flags(&srv_sess->pools, XA_FLAGS_ALLOC);
+	INIT_LIST_HEAD(&srv_sess->pool_sess_list);
+
+	mutex_lock(&g_sess_lock);
+	list_add(&srv_sess->g_list_entry, &g_sess_list);
+	mutex_unlock(&g_sess_lock);
+
+	rtrs_srv_set_sess_priv(rtrs, srv_sess);
+
+	return 0;
+}
+
+static int rmr_srv_link_ev(struct rtrs_srv_sess *rtrs,
+			   enum rtrs_srv_link_ev ev, void *priv)
+{
+	struct rmr_srv_sess *srv_sess = priv;
+
+	switch (ev) {
+	case RTRS_SRV_LINK_EV_CONNECTED:
+		return create_srv_sess(rtrs);
+
+	case RTRS_SRV_LINK_EV_DISCONNECTED:
+		if (WARN_ON(!srv_sess))
+			return -EINVAL;
+
+		destroy_sess(srv_sess);
+		return 0;
+
+	default:
+		pr_warn("Received unknown rtrs session event %d from session %s\n",
+			ev, srv_sess->sessname);
+		return -EINVAL;
+	}
+}
+
+static struct rmr_srv_pool_sess *__find_sess_in_pool(struct rmr_pool *pool,
+						     const char *sessname)
+{
+	struct rmr_srv_pool_sess *pool_sess;
+
+	list_for_each_entry(pool_sess, &pool->sess_list, pool_entry) {
+		if (!strcmp(pool_sess->sessname, sessname))
+			return pool_sess;
+	}
+
+	return NULL;
+}
+
+static int sess_join_pool(struct rmr_pool *pool, struct rmr_srv_pool_sess *pool_sess)
+{
+	struct rmr_srv_pool_sess *find;
+	struct rmr_srv_sess *srv_sess = pool_sess->srv_sess;
+	int ret = 0;
+
+	mutex_lock(&pool->sess_lock);
+	find =
__find_sess_in_pool(pool, pool_sess->sessname); + if (find) { + ret = -EEXIST; + goto unlock; + } + + ret = xa_err(xa_store(&srv_sess->pools, pool->group_id, pool, GFP_KERNEL)); + if (ret) { + pr_err("can not add pool %s err %d\n", pool->poolname, ret); + goto unlock; + } + pr_info("%s: Added pool %s to rmr_srv_sess %s\n", + __func__, pool->poolname, srv_sess->sessname); + + ret = rmr_srv_sysfs_add_sess(pool, pool_sess); + if (ret) { + pr_err("failed to create sysfs for pool sess %s in pool %s\n", + pool_sess->sessname, pool->poolname); + + xa_erase(&srv_sess->pools, pool->group_id); + goto unlock; + } + list_add(&pool_sess->pool_entry, &pool->sess_list); + +unlock: + mutex_unlock(&pool->sess_lock); + + return ret; +} + +static void do_sess_leave_pool(struct rmr_pool *pool, struct rmr_srv_pool_sess *pool_sess) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + do_sess_leave_srv_sess(pool_sess); + sess_leave_pool(pool, pool_sess); + rmr_put_srv_pool(srv_pool); + rmr_srv_free_pool_sess(pool_sess); +} + +/** + * process_msg_pool_info() - Process a POOL_INFO membership change notification + * + * @pool: Pool which received the command. + * @pool_info_cmd: The received POOL_INFO command carrying member_id, + * operation, mode, and dirty flag. + * + * Dispatches on (operation, mode) pairs notified by the client: + * - ADD + CREATE: a new storage node is joining; add it via + * rmr_srv_handle_other_member_add(). + * - ADD + ASSEMBLE: an existing node is reassembling; verify its map and + * stg_members entry already exist. + * - REMOVE + DELETE: a storage node is permanently leaving; remove its map + * and stg_members entry via rmr_srv_delete_store_member(). + * - REMOVE + DISASSEMBLE: temporary leave; no map changes needed (TODO). + * + * Return: + * 0 on success, negative error code on failure. + */ +static int process_msg_pool_info(struct rmr_pool *pool, + const struct rmr_msg_pool_info_cmd *pool_info_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + int ret = 0; + + pr_info("%s: Server pool %s with member_id %u, received pool_info message\n", + __func__, pool->poolname, srv_pool->member_id); + + if (pool_info_cmd->operation == RMR_POOL_INFO_OP_ADD) { + ret = rmr_srv_handle_other_member_add(srv_pool, pool_info_cmd); + if (ret) { + pr_err("%s: Failed to create maps for other pools: %d\n", + __func__, ret); + return ret; + } + } else if (pool_info_cmd->operation == RMR_POOL_INFO_OP_REMOVE) { + if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_DELETE) { + pr_info("%s: Member %u got remove of member %u with mode delete\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + rmr_srv_delete_store_member(pool, pool_info_cmd->member_id); + } else if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_DISASSEMBLE) { + pr_info("%s: Member %u got remove of member %u with mode disassemble, " + "preserving dirty map\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + /* + * Do NOT remove the dirty map or stg_members entry for the + * disassembled member. IOs arriving after this point will + * continue to accumulate dirty entries for that member via + * the piggyback mechanism, so it can resync on reassembly. 
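+			 * A member is removed permanently only in the DELETE
+			 * branch above via rmr_srv_delete_store_member().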
+ */ + } + } + rmr_srv_flush_pool_md(srv_pool); + + return ret; +} + +static struct rmr_srv_pool_sess *alloc_pool_sess(struct rmr_srv_pool *srv_pool, + struct rmr_srv_sess *srv_sess) +{ + struct rmr_srv_pool_sess *pool_sess; + + pool_sess = kzalloc_node(sizeof(*pool_sess), GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!pool_sess)) { + pr_err("Failed to allocate session for srv pool %s\n", srv_pool->pool->poolname); + return ERR_PTR(-ENOMEM); + } + + strscpy(pool_sess->sessname, srv_sess->sessname, NAME_MAX); + INIT_LIST_HEAD(&pool_sess->pool_entry); + INIT_LIST_HEAD(&pool_sess->srv_sess_entry); + pool_sess->srv_sess = srv_sess; + pool_sess->srv_pool = srv_pool; + + return pool_sess; +} + +/** + * rmr_srv_process_join_create() - Handle the CREATE case of a join_pool message + * + * @pool: The pool being created. + * @join_pool_cmd: The received join_pool command carrying dirty flag and + * per-member info for any pre-existing pool members. + * + * If the client reports that this server's existing data is dirty, marks own + * map fully dirty. Then iterates the per-member list in the message and adds + * each member via rmr_srv_add_store_member(), marking its map dirty if the + * client flagged it. On failure, all members added so far are cleaned up. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_process_join_create(struct rmr_pool *pool, + const struct rmr_msg_join_pool_cmd *join_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + int i, ret; + u8 member_id; + + /* + * Mark our maps dirty if client asked us to. + */ + if (join_pool_cmd->dirty) { + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("%s: No map found for %u\n", + __func__, srv_pool->member_id); + return -EINVAL; + } + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + } + + /* + * Add other storage members in case its a create message. + */ + for (i = 0; i < join_pool_cmd->mem_info.no_of_stor; i++) { + member_id = join_pool_cmd->mem_info.p_mem_info[i].member_id; + + ret = rmr_srv_add_store_member(pool, member_id); + if (ret) { + pr_err("%s: rmr_srv_add_store_member failed %d\n", __func__, ret); + goto cleanup; + } + + if (join_pool_cmd->mem_info.p_mem_info[i].c_dirty) { + map = rmr_pool_find_map(pool, member_id); + if (WARN_ON(!map)) { + xa_erase(&pool->stg_members, member_id); + ret = -EINVAL; + goto cleanup; + } + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + } + } + + return 0; + +cleanup: + while (i--) + rmr_srv_delete_store_member(pool, + join_pool_cmd->mem_info.p_mem_info[i].member_id); + return ret; +} + +static void rmr_srv_process_leave_delete(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + void *entry; + unsigned long id; + + /* + * When we are leaving a pool (not disassembly), we have to, + * 1) Delete dirty entries from all the maps of other storage nodes, since we do not + * need them anymore + * 2) Delete all the maps of other storage nodes. + * + * Map for this storage node is created/deleted during register/unregister. 
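+	 * rmr_srv_delete_store_member() below removes both the dirty map and
+	 * the stg_members entry for each of those other members.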
+ */ + xa_for_each(&pool->stg_members, id, entry) { + if (id == srv_pool->member_id) + continue; + + rmr_srv_delete_store_member(pool, id); + } +} + +static int process_msg_join_pool(struct rmr_pool *pool, struct rmr_srv_sess *srv_sess, + struct rtrs_srv_sess *rtrs, bool sync, + const struct rmr_msg_join_pool_cmd *join_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_pool_sess *pool_sess; + int ret = 0, i; + bool alloced_last_io = false; + + pr_info("Client %s requests to join pool %s (state=%d)\n", + srv_sess->sessname, pool->poolname, atomic_read(&srv_pool->state)); + + mutex_lock(&srv_sess->lock); + + /* + * Here we only do chunk size check, + * to make sure different storage nodes do not use different chunk sizes. + */ + if (join_pool_cmd->chunk_size && pool->chunk_size != join_pool_cmd->chunk_size) { + pr_err("pool %s has chunksize %u != msg chunksize %u\n", + pool->poolname, pool->chunk_size, join_pool_cmd->chunk_size); + ret = -EINVAL; + goto unlock; + } + + mutex_lock(&srv_pool->srv_pool_lock); + if (atomic_read(&srv_pool->state) == RMR_SRV_POOL_STATE_EMPTY) { + pr_err("%s: pool %s has no store registered; join rejected\n", + __func__, pool->poolname); + ret = -EINVAL; + goto unlock_srv_pool_lock; + } + + if (!sync) { + if (join_pool_cmd->create) { + if (srv_pool->last_io || srv_pool->last_io_idx) { + pr_err("%s: pool %s already has last_io buffer allocated\n", + __func__, pool->poolname); + ret = -EEXIST; + goto unlock_srv_pool_lock; + } + + if (!srv_pool->marked_create) { + pr_err("%s: pool %s not in create state\n", + __func__, pool->poolname); + ret = -EINVAL; + goto unlock_srv_pool_lock; + } + } else if (srv_pool->marked_create) { + pr_err("%s: pool %s should not be in create state\n", + __func__, pool->poolname); + ret = -EINVAL; + goto unlock_srv_pool_lock; + } + } + + pool_sess = alloc_pool_sess(srv_pool, srv_sess); + if (IS_ERR(pool_sess)) { + pr_err("failed to allc pool_sees for pool %s sev_sess %s: %pe\n", + pool->poolname, srv_sess->sessname, pool_sess); + ret = PTR_ERR(pool_sess); + goto unlock_srv_pool_lock; + } + srv_pool->queue_depth = join_pool_cmd->queue_depth; + + ret = sess_join_pool(pool, pool_sess); + if (ret) { + pr_err("Failed to join pool\n"); + goto free_sess; + } + pool_sess->sync = sync; + + if (!pool_sess->sync && !srv_pool->last_io) { + /* Joining for the first time */ + srv_pool->last_io = kcalloc(srv_pool->queue_depth, sizeof(*srv_pool->last_io), + GFP_KERNEL); + if (!srv_pool->last_io) { + pr_err("Memory allocation failed for srv_pool->last_io\n"); + ret = -ENOMEM; + goto sess_leave; + } + alloced_last_io = true; + + /* The previous last_io buffer exists. */ + if (srv_pool->last_io_idx) { + memcpy(srv_pool->last_io, srv_pool->last_io_idx, + rmr_last_io_len(srv_pool->queue_depth)); + } else { + for (i = 0; i < srv_pool->queue_depth; i++) { + srv_pool->last_io[i].a = U64_MAX; + srv_pool->last_io[i].b = U64_MAX; + } + + srv_pool->last_io_idx = kcalloc(srv_pool->queue_depth, + sizeof(*srv_pool->last_io_idx), GFP_KERNEL); + if (!srv_pool->last_io_idx) { + ret = -ENOMEM; + goto free_last_io; + } + } + pr_info("Allocated %ld B last_io buffer for pool %s\n", + srv_pool->queue_depth * sizeof(*srv_pool->last_io), pool->poolname); + } + + /* + * Join/Rejoin messages from sync sessions do not affect our state. 
+ * + * For non-sync sessions, if our state is NO_IO, pserver can either send a, + * - rejoin message in case our state NO_IO due to network/IO issue + * - join message in case pserver crashed + * hence, no state transition is needed. + */ + if (!pool_sess->sync) { + if (join_pool_cmd->create) { + /* + * First-time pool creation: set up member info and maps, + * then move to CREATED awaiting enable_pool(1). + */ + ret = rmr_srv_process_join_create(pool, join_pool_cmd); + if (ret) { + pr_err("%s: rmr_srv_process_join_create failed %d\n", + __func__, ret); + goto free_last_io; + } + + /* + * In the CREATE path pool_md has only magic set; all other + * header fields are normally populated later by + * RMR_CMD_SEND_MD_BUF. Initialise them now so that + * queue_depth (and the bitmap/last_io offsets derived from + * it) are correct before the first on-demand map flush fires. + */ + pool->pool_md.queue_depth = join_pool_cmd->queue_depth; + pool->pool_md.chunk_size = pool->chunk_size; + pool->pool_md.mapped_size = pool->mapped_size; + pool->pool_md.group_id = pool->group_id; + strscpy(pool->pool_md.poolname, pool->poolname, + sizeof(pool->pool_md.poolname)); + rmr_srv_mark_pool_md_dirty(srv_pool); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + + ret = rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_CREATED); + if (ret < 0) + goto leave_delete; + + srv_pool->marked_create = false; + } else if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NO_IO) { + /* + * Assemble or rejoin: a map update is needed before IOs + * can resume, so move to NO_IO. If we are already in + * NO_IO (e.g. pserver reconnecting after a network event + * that already drove us there), no transition is needed. + */ + ret = rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + if (ret < 0) + goto leave_delete; + } + } + + mutex_unlock(&srv_pool->srv_pool_lock); + + rmr_get_srv_pool(srv_pool); + list_add_tail(&pool_sess->srv_sess_entry, &srv_sess->pool_sess_list); + + mutex_unlock(&srv_sess->lock); + + return 0; + +leave_delete: + if (!pool_sess->sync && join_pool_cmd->create) + rmr_srv_process_leave_delete(pool); +free_last_io: + if (alloced_last_io) { + kfree(srv_pool->last_io); + srv_pool->last_io = NULL; + + kfree(srv_pool->last_io_idx); + srv_pool->last_io_idx = NULL; + } +sess_leave: + sess_leave_pool(pool, pool_sess); +free_sess: + rmr_srv_free_pool_sess(pool_sess); +unlock_srv_pool_lock: + mutex_unlock(&srv_pool->srv_pool_lock); +unlock: + mutex_unlock(&srv_sess->lock); + return ret; +} + +void rmr_srv_stop_sync_and_go_offline(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_REQ_STOP); + wake_up_process(srv_pool->th_tsk); + + while (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(1000)); + } + } +} + +static int process_msg_leave_pool(struct rmr_pool *pool, struct rmr_srv_sess *sess, bool sync, + const struct rmr_msg_leave_pool_cmd *leave_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_pool_sess *pool_sess; + u64 last_io_len; + int ret = 0; + void *buf; + + pr_info("Session %s requests to leave pool %d\n", sess->sessname, + leave_pool_cmd->member_id); + + if (srv_pool->member_id != leave_pool_cmd->member_id) { + 
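+		/*
+		 * The leave request is addressed to a different member; this
+		 * node only acts on requests carrying its own member_id.
+		 */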
pr_err("%s: For sess %s, Srv pool member_id %d, Message member_id %d\n", + __func__, sess->sessname, srv_pool->member_id, leave_pool_cmd->member_id); + return -ENOENT; + } + + mutex_lock(&pool->sess_lock); + pool_sess = __find_sess_in_pool(pool, sess->sessname); + if (!pool_sess) { + mutex_unlock(&pool->sess_lock); + pr_err("Session %s is not in pool %s\n", sess->sessname, + pool->poolname); + return -ENOENT; + } + mutex_unlock(&pool->sess_lock); + + do_sess_leave_pool(pool, pool_sess); + + mutex_lock(&srv_pool->srv_pool_lock); + srv_pool->marked_delete = leave_pool_cmd->delete; + mutex_unlock(&srv_pool->srv_pool_lock); + + if (!sync) { + /* + * Stop the sync thread if its running, and go offline. + */ + rmr_srv_stop_sync_and_go_offline(pool); + + if (leave_pool_cmd->delete) { + rmr_srv_process_leave_delete(pool); + } else { + /* + * Disassemble: flush the dirty map to disk first so that + * the on-disk map reflects all dirty entries accumulated + * up to this point. On reassembly the map is read back + * and used to drive resync of any members that missed IOs. + */ + rmr_srv_md_maps_sync(pool); + + /* + * Clear last_io and persist it to disk so that it is not + * used after reassembly. Note: maps are always flushed + * above regardless of whether last_io is valid; the two + * operations are independent. + */ + last_io_len = rmr_last_io_len(pool->pool_md.queue_depth); + + if (!srv_pool->last_io || !last_io_len) + goto change_state; + + memset(srv_pool->last_io, 0, last_io_len); + if (srv_pool->last_io_idx) + memset(srv_pool->last_io_idx, 0, last_io_len); + + buf = kzalloc(last_io_len, GFP_KERNEL); + if (!buf) + goto change_state; + + ret = process_md_io(pool, NULL, + RMR_LAST_IO_OFFSET, + last_io_len, + RMR_OP_MD_WRITE, buf); + if (ret) { + pr_err("%s: For pool %s process_md_io failed\n", + __func__, pool->poolname); + } + kfree(buf); + } + +change_state: + /* + * All sessions have left. Transition back to REGISTERED if the + * backend store is still present, or to EMPTY if it is not. + */ + mutex_lock(&srv_pool->srv_pool_lock); + if (srv_pool->io_store) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_REGISTERED); + else + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_EMPTY); + mutex_unlock(&srv_pool->srv_pool_lock); + } + + return 0; +} + +static int process_msg_map_clear(struct rmr_srv_sess *srv_sess, + const void *usr) +{ + const struct rmr_msg_io *msg = usr; + struct rmr_pool *pool; + rmr_id_t id; + unsigned long key; + struct rmr_map_entry *entry; + struct rmr_dirty_id_map *map; + u8 member_id; + int err = 0; + u32 group_id = le32_to_cpu(msg->hdr.group_id); + + id.a = le64_to_cpu(msg->id_a); + id.b = le64_to_cpu(msg->id_b); + key = rmr_id_to_key(id); + member_id = msg->member_id; + + pr_debug("received map clear msg, id (%llu, %llu), member_id %u\n", + id.a, id.b, member_id); + + pool = rmr_srv_sess_get_pool(srv_sess, group_id); + if (IS_ERR(pool)) { + pr_err_ratelimited("Got I/O request on session %s for unknown pool: %pe\n", + srv_sess->sessname, pool); + return PTR_ERR(pool); + } + + map = rmr_pool_find_map(pool, member_id); + if (!map) { + pr_err("no map found for member_id %u\n", member_id); + err = -EINVAL; + goto put_pool; + //TODO: handle this , probably initialize map, or just throw err? + } + + entry = rmr_map_unset_dirty(map, id, MAP_NO_FILTER); + if (entry) { + /* We do not need any rcu protection here since it is deleted by the other + * rmr server. And sync can only be done for entries that are + * dirty for this particaular server. 
+ */ + kmem_cache_free(rmr_map_entry_cachep, entry); + } + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + +put_pool: + rmr_srv_sess_put_pool(pool); + return err; +} + +static int process_msg_map_add(struct rmr_srv_sess *srv_sess, + const void *usr) +{ + const struct rmr_msg_io *msg = usr; + struct rmr_pool *pool; + int i, ret = 0; + struct rmr_dirty_id_map *map; + u32 group_id = le32_to_cpu(msg->hdr.group_id); + + pr_debug("received map add member_id %u, id (%llu %llu)\n", + msg->member_id, msg->id_a, msg->id_b); + + pool = rmr_srv_sess_get_pool(srv_sess, group_id); + if (IS_ERR(pool)) { + pr_err_ratelimited("Got I/O request on session %s for unknown pool: %pe\n", + srv_sess->sessname, pool); + return PTR_ERR(pool); + } + + for (i = 0; i < msg->failed_cnt; i++) { + u64 msg_map_ver = le64_to_cpu(msg->map_ver); + rmr_id_t id; + + map = rmr_pool_find_map(pool, msg->failed_id[i]); + if (!map) { + pr_err("no map found for member_id %u\n", msg->failed_id[i]); + ret = -EINVAL; + goto put_pool; + } + + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + id.a = le64_to_cpu(msg->id_a); + id.b = le64_to_cpu(msg->id_b); + rmr_map_set_dirty(map, id, 0); + + if (msg_map_ver > pool->map_ver) + pool->map_ver = msg_map_ver; + } + if (msg->failed_cnt) { + rmr_srv_mark_pool_md_dirty((struct rmr_srv_pool *)pool->priv); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + } + +put_pool: + rmr_srv_sess_put_pool(pool); + + return ret; +} + +/** + * rmr_srv_set_pool_mm() - Set the rmr srv pool to maintenance mode + * + * @srv_pool: The rmr srv pool to set in maintenance mode + * + * Description: + * While in maintenance mode, we do not serve IOs either, so we set state to NO_IO + * + * Return: + * 0 on success + * Error value on failure + */ +static int rmr_srv_set_pool_mm(struct rmr_srv_pool *srv_pool) +{ + srv_pool->maintenance_mode = true; + + return rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); +} + +/** + * rmr_srv_unset_pool_mm() - Clear the rmr srv pool maintenance mode + * + * @srv_pool: The rmr srv pool to clear maintenance mode of + * + * Description: + * While in maintenance mode, we do not serve IOs either, so we set state to NO_IO + * + * Return: + * 0 on success + * Error value on failure + */ +static int rmr_srv_unset_pool_mm(struct rmr_srv_pool *srv_pool) +{ + srv_pool->maintenance_mode = false; + rmr_srv_flush_pool_md(srv_pool); + + return 0; +} + +static int process_msg_enable_pool(struct rmr_pool *pool, struct rmr_srv_sess *sess, bool sync, + const struct rmr_msg_enable_pool_cmd *enable_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + enum rmr_srv_pool_state old_state = atomic_read(&srv_pool->state); + int ret = 0; + + /* + * Enable/Disable messages from sync sessions do not affect us. + */ + if (sync) { + pr_info("%s: From sync sess %s, for pool %s\n", __func__, sess->sessname, + pool->poolname); + return 0; + } + + pr_info("Client %s requests to set enable=%d pool %s current state %s\n", + sess->sessname, enable_pool_cmd->enable, pool->poolname, + rmr_get_srv_pool_state_name(old_state)); + + /* + * Enable when not in maintenance mode, can be handled simply + */ + if (enable_pool_cmd->enable && !srv_pool->maintenance_mode) { + /* + * CREATED -> NORMAL: initial enable after create-mode join. + * NO_IO -> NORMAL: was_last_authoritative recovery (pserver + * enables this node directly without a map update because its + * dirty map is already authoritative). 
+ */ + if (old_state != RMR_SRV_POOL_STATE_CREATED && + old_state != RMR_SRV_POOL_STATE_NO_IO) { + pr_err("%s: pool %s cannot be enabled in state %s\n", + __func__, pool->poolname, + rmr_get_srv_pool_state_name(old_state)); + return -EINVAL; + } + + ret = rmr_srv_set_pool_state_normal(srv_pool); + if (ret < 0) + goto out_err; + + return 0; + } + + /* + * Any other case involves considering maintenance mode settings + */ + if (!enable_pool_cmd->enable) { + if (old_state != RMR_SRV_POOL_STATE_NORMAL && + old_state != RMR_SRV_POOL_STATE_NO_IO) { + pr_err("%s: pool %s can only disable from NORMAL or NO_IO state (current: %s)\n", + __func__, pool->poolname, + rmr_get_srv_pool_state_name(old_state)); + return -EINVAL; + } + ret = rmr_srv_set_pool_mm(srv_pool); + } else { + ret = rmr_srv_unset_pool_mm(srv_pool); + } + + if (ret < 0) + goto out_err; + + return 0; + +out_err: + /* + * Put srv pool state to old one + */ + atomic_set(&srv_pool->state, old_state); + return ret; +} + +/** + * process_msg_map_ready() - Process RMR_CMD_MAP_READY command + * + * @pool: Pool which received the command + * @sync: Whether the command was sent from an internal (sync) rmr-client or not + * + * Return: + * 0 on success + * Negative errno on failure + * + * Description: + * A RMR_CMD_MAP_READY command is the first command that is sent to a storage node which will + * receive a map from another storage node as part of a map update. + * + * It checks whether this storage node is ready and in an expected state to receive a map. + */ +static int process_msg_map_ready(struct rmr_pool *pool, bool sync) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + int i, err = 0, pool_state; + + mutex_lock(&srv_pool->srv_pool_lock); + pool_state = atomic_read(&srv_pool->state); + + /* A map update from another storage node is not allowed. */ + if (sync) { + pr_err("%s: (sync) Cannot receive map from other storage nodes\n", __func__); + err = -EINVAL; + goto out; + } + + /* + * A map update from pserver should start only when in, + * NO_IO - after a network/IO error + * CREATED - For extend (This is not nice. + * Extend should inform the storage node that it is being + * used for an extend leg for an already existing node, and + * the state should be set accordingly. So that we can allow + * this only when in NO_IO state.) + */ + if (pool_state != RMR_SRV_POOL_STATE_NO_IO && pool_state != RMR_SRV_POOL_STATE_CREATED) { + pr_err("(non-sync) pool state not correct %d", pool_state); + err = -EINVAL; + goto out; + } + + /* + * We seem to be in process of another map update. + */ + if (srv_pool->map_update_state != MAP_UPDATE_STATE_DISABLED) { + pr_err("rmr_srv_send_map Map update already in progress\n"); + err = -EINVAL; + goto out; + } + + /* + * If pserver is instructing us to receive a map, then the map we + * hold is meaningless. 
+ */ + mutex_lock(&pool->maps_lock); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference_protected(pool->maps[i], + lockdep_is_held(&pool->maps_lock)); + if (!map) + continue; + + rmr_map_unset_dirty_all(map); + } + mutex_unlock(&pool->maps_lock); + rmr_srv_mark_maps_dirty(srv_pool); + + srv_pool->map_update_state = MAP_UPDATE_STATE_READY; + + pr_info("%s: process_msg_cmd: moved to MAP_UPDATE_STATE_READY\n", __func__); + +out: + mutex_unlock(&srv_pool->srv_pool_lock); + return err; +} + +/** + * process_msg_cmd_handler() - Processes rmr command message + * + * @work: scheduled work structure + * + * Description: + * The command messages being processed here, can be broadly divided into 2 categories. + * Ones which are able to use the rsp buffer to send back status. + * Ones which cannot use the rsp buffer to send back status. These ones use the rsp buffer + * for other purposes; like sending map data, or read user rsp buffer. + * + * Context: + * Execution time depends on the command. It may take a long time for commands which sends + * data (map). + */ +static void process_msg_cmd_handler(struct work_struct *work) +{ + struct rmr_cmd_work_info *work_info = container_of(work, struct rmr_cmd_work_info, cmd_work); + struct rmr_pool *pool = work_info->pool; + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_sess *sess = work_info->sess; + struct rtrs_srv_sess *rtrs = work_info->rtrs; + const struct rmr_msg_pool_cmd *cmd_msg = work_info->cmd_msg; + struct rmr_dirty_id_map *map; + u8 sync, flags; + u64 src_mapped_size; + int md_i, err = 0; + + /* + * The switch cases below are used by either map sending node, + * or the node which is to receive the map, but not both. + */ + switch (cmd_msg->cmd_type) { + case RMR_CMD_REJOIN_POOL: + /* + * For now, we do not have any difference between joinand + * rejoin on the storage server side + */ + case RMR_CMD_JOIN_POOL: + /* + * Server node, received a request for a new session + */ + err = process_msg_join_pool(pool, sess, rtrs, cmd_msg->sync, + &cmd_msg->join_pool_cmd); + if (err) { + pr_err("process_msg_join_pool failed with err %d\n", err); + goto out; + } + work_info->rsp->join_pool_cmd_rsp.chunk_size = pool->chunk_size; + + if (pool->mapped_size) { + work_info->rsp->join_pool_cmd_rsp.mapped_size = pool->mapped_size; + pr_info("srv pool %s sets mapped size %llu\n", + pool->poolname, pool->mapped_size); + } else + work_info->rsp->join_pool_cmd_rsp.mapped_size = 0; + + break; + case RMR_CMD_POOL_INFO: + /* + * Server node, received pool info command + */ + err = process_msg_pool_info(pool, &cmd_msg->pool_info_cmd); + if (err) { + pr_err("process_msg_pool_info failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_LEAVE_POOL: + err = process_msg_leave_pool(pool, sess, cmd_msg->sync, &cmd_msg->leave_pool_cmd); + if (err) { + pr_err("process_msg_leave_pool failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_ENABLE_POOL: + err = process_msg_enable_pool(pool, sess, cmd_msg->sync, &cmd_msg->enable_pool_cmd); + if (err) { + pr_err("process_msg_enable_pool failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_MAP_READY: + /* + * Map receiving node. + * Getting ready to receive dirty map + */ + pr_info("%s: RMR_CMD_MAP_READY\n", __func__); + + err = process_msg_map_ready(pool, cmd_msg->sync); + if (err) { + pr_err("process_msg_map_ready failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_MAP_SEND: + /* + * Map sending node. 
+ * Send map to the node with member_id == map_send_cmd->receiver_member_id + */ + pr_info("%s: RMR_CMD_MAP_SEND\n", __func__); + + err = rmr_clt_send_map(pool, srv_pool->clt, &cmd_msg->map_send_cmd, MAP_NO_FILTER); + if (err) { + pr_err("rmr_clt_send_map failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_SEND_MAP_BUF: + /* + * Map receiving node. + * Received the map from another node. Save it. + */ + pr_info("%s: RMR_CMD_SEND_MAP_BUF\n", __func__); + + if (srv_pool->map_update_state != MAP_UPDATE_STATE_READY) { + pr_err("rmr_srv_send_map Node not ready to receive map\n"); + err = -EINVAL; + goto out; + } + + err = rmr_pool_save_map(pool, work_info->data, work_info->datalen, + false); + if (err) { + if (!cmd_msg->sync) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + + pr_err("rmr_pool_save_map failed\n"); + goto out; + } + break; + case RMR_CMD_MAP_BUF_DONE: + /* + * Map receiving node. + * A confirmation that all map updates have been sent. + */ + pr_info("%s: RMR_CMD_MAP_BUF_DONE\n", __func__); + + if (srv_pool->map_update_state != MAP_UPDATE_STATE_READY) { + pr_err("rmr_srv_send_map Node state not correct\n"); + err = -EINVAL; + goto out; + } + + if (cmd_msg->map_buf_done_cmd.map_version < pool->map_ver) { + pr_err("Map version received (%llu) is older than ours (%llu)\n", + cmd_msg->map_buf_done_cmd.map_version, pool->map_ver); + err = -EINVAL; + goto out; + } + + pool->map_ver = cmd_msg->map_buf_done_cmd.map_version; + rmr_srv_mark_pool_md_dirty(srv_pool); + + srv_pool->map_update_state = MAP_UPDATE_STATE_DONE; + + break; + case RMR_CMD_MAP_DONE: + /* + * Map receiving node. + * A confirmation from the client, that map update was done successfully or not. + */ + pr_info("%s: RMR_CMD_MAP_DONE\n", __func__); + + if (srv_pool->map_update_state != MAP_UPDATE_STATE_DONE) { + pr_err("rmr_srv_send_map Map not updated succesfully\n"); + err = -EINVAL; + } + + /* + * On a successful map update, we go to NORMAL state. + * + * map_done_cmd.enable says whether this map update should make us go to + * NORMAL state or not. This is controlled by the pserver. + */ + if (cmd_msg->map_done_cmd.enable) { + if (rmr_srv_set_pool_state_normal(srv_pool) < 0) + err = -EINVAL; + } + + srv_pool->map_update_state = MAP_UPDATE_STATE_DISABLED; + break; + case RMR_CMD_MAP_DISABLE: + /* + * Something went wrong on the client side; we need to reset everything. + */ + pr_info("%s: RMR_CMD_MAP_DISABLE\n", __func__); + + if (!cmd_msg->sync) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + + srv_pool->map_update_state = MAP_UPDATE_STATE_DISABLED; + break; + case RMR_CMD_READ_MAP_BUF: + /* + * Pserver wants to read our dirty map. So send it. 
+ */ + pr_info("%s: RMR_CMD_READ_MAP_BUF\n", __func__); + + rmr_srv_read_map_buf(pool, work_info->data, work_info->datalen, + &cmd_msg->map_buf_cmd); + + goto out_no_rsp; + case RMR_CMD_MAP_CHECK: + pr_debug("%s: RMR_CMD_MAP_CHECK\n", __func__); + + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_debug("srv pool %s is not in normal state, cannot do map check\n", + pool->poolname); + work_info->rsp->value = false; + break; + } + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("pool %s no map found for member_id %u\n", + pool->poolname, srv_pool->member_id); + err = -EINVAL; + goto out; + } + work_info->rsp->value = rmr_map_empty(map); + pr_debug("pool %s member_id %d rsp with map_empty=%llu\n", + pool->poolname, srv_pool->member_id, + work_info->rsp->value); + + break; + + case RMR_CMD_LAST_IO_TO_MAP: + /* + * Use the last_io list, and add those IOs as dirty IDs to the map + * for every other storage server other than this one. + */ + pr_info("%s: RMR_CMD_LAST_IO_TO_MAP\n", __func__); + err = rmr_srv_save_last_io_to_map(pool); + if (err) { + pr_err("rmr_srv_save_last_io_to_map failed\n"); + goto out; + } + + break; + + case RMR_CMD_MAP_TEST: + /* + * Received the map test from another node. + * Check that we have everything that other node has. + */ + pr_info("%s: RMR_CMD_MAP_TEST\n", __func__); + + err = rmr_pool_save_map(pool, work_info->data, work_info->datalen, true); + if (err) { + pr_err("rmr_srv_save_map failed, test_only, err %d\n", err); + } + goto out_no_rsp; + case RMR_CMD_MD_SEND: + /* + * Received the message to copy metadata of server pool to the sender. + */ + src_mapped_size = cmd_msg->md_send_cmd.src_mapped_size; + pr_debug("stg %u: receives md_update message from pool %u\n", + srv_pool->member_id, cmd_msg->md_send_cmd.leader_id); + + /* Check the pool mapped_sizes are consistent or not */ + if (pool->mapped_size && src_mapped_size && pool->mapped_size != src_mapped_size) { + pr_err_ratelimited("This %s mapped_size %llu != src %d mapped_size %llu\n", + pool->poolname, pool->mapped_size, cmd_msg->md_send_cmd.leader_id, + src_mapped_size); + goto out; + } + + if (cmd_msg->md_send_cmd.read_full_md) { + if (work_info->datalen < sizeof(struct rmr_pool_md)) { + pr_err("%s: buffer too small for full pool_md (%zu < %zu)\n", + __func__, work_info->datalen, + sizeof(struct rmr_pool_md)); + err = -EINVAL; + goto out; + } + memcpy(work_info->data, &pool->pool_md, sizeof(struct rmr_pool_md)); + } else { + /* If updating buf incurs error, it simply waits for next md_update. */ + rmr_srv_update_md_buf(srv_pool, work_info->data, work_info->datalen); + } + + break; + case RMR_CMD_SEND_MD_BUF: + /* + * Received the client pool metadata. Save it. + */ + sync = cmd_msg->send_md_buf_cmd.sync; + flags = cmd_msg->send_md_buf_cmd.flags; + if (flags == RMR_OP_MD_WRITE) { + err = rmr_srv_md_process_buf(pool, work_info->data, sync); + if (err) { + pr_err("rmr_srv_write_md failed\n"); + goto out; + } + + if (atomic_read(&srv_pool->store_state)) { + /* write back to disk */ + err = process_md_io(pool, NULL, 0, work_info->datalen, flags, + &pool->pool_md); + if (err) { + pr_err("Failed to process md io\n"); + goto out; + } + } + } + + if (!sync && flags == RMR_OP_MD_READ) + memcpy(work_info->data, &pool->pool_md, sizeof(struct rmr_pool_md)); + + break; + case RMR_CMD_SEND_DISCARD: + /* Received the message to handle discards. 
*/ + pr_info("%s: RMR_CMD_SEND_DISCARD for srv %u\n", + __func__, cmd_msg->send_discard_cmd.member_id); + if (!cmd_msg->sync) { + err = rmr_pool_md_check_discard(pool, cmd_msg->send_discard_cmd.member_id); + if (err > 0) { + /* This node has received discards. */ + err = 0; + pr_info("pool %s member_id %d has received discards\n", + pool->poolname, srv_pool->member_id); + goto out; + } + } + + /* + * For sync requests, even if the server that is not in normal state has received + * the discard request, its dirty map is still outdated. However, non-sync + * requests can overlook this check and proceed discarding directly. + */ + if (cmd_msg->sync && atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL){ + pr_err("srv pool %s not in normal state for sync discard request\n", + pool->poolname); + err = -EINVAL; + goto out; + } + + err = rmr_srv_discard_id(pool, 0, 0, cmd_msg->send_discard_cmd.member_id, + cmd_msg->sync); + if (err) + pr_err("Failed to discard id\n"); + + break; + case RMR_CMD_STORE_CHECK: + pr_debug("%s: RMR_CMD_STORE_CHECK\n", __func__); + + work_info->rsp->value = rmr_srv_pool_check_store(pool); + pr_debug("pool %s member_id %d rsp with value=%llu\n", + pool->poolname, srv_pool->member_id, + work_info->rsp->value); + + break; + case RMR_CMD_MAP_GET_VER: + pr_debug("%s: RMR_CMD_MAP_GET_VER\n", __func__); + + work_info->rsp->value = pool->map_ver; + pr_debug("pool %s member_id %d rsp with value=%llu\n", + pool->poolname, srv_pool->member_id, + work_info->rsp->value); + + break; + case RMR_CMD_MAP_SET_VER: + pr_debug("%s: RMR_CMD_MAP_SET_VER\n", __func__); + + pool->map_ver = work_info->cmd_msg->set_map_ver_cmd.map_ver; + rmr_srv_mark_pool_md_dirty(srv_pool); + break; + case RMR_CMD_DISCARD_CLEAR_FLAG: + pr_info("%s: RMR_CMD_DISCARD_CLEAR_FLAG\n", __func__); + + md_i = rmr_pool_find_md(&pool->pool_md, cmd_msg->send_discard_cmd.member_id, false); + if (md_i < 0) { + pr_info("Didn't find md for member_id %u\n", + cmd_msg->send_discard_cmd.member_id); + goto out; + } + + pool->pool_md.srv_md[md_i].discard_entries = false; + rmr_srv_flush_pool_md(srv_pool); + break; + case RMR_CMD_USER: + pr_debug("%s: RMR_CMD_USER\n", __func__); + + err = process_msg_user_cmd(srv_pool, cmd_msg, work_info->data, work_info->datalen); + if (err) { + pr_err("process_msg_user_cmd failed with err %d\n", err); + goto out_no_rsp; + } + + goto out_no_rsp; + default: + pr_warn("%s: switch default type: %d\n", __func__, cmd_msg->cmd_type); + + err = -EINVAL; + } + +out: + work_info->rsp->err = err; + work_info->rsp->member_id = srv_pool->member_id; + work_info->rsp->cmd_type = cmd_msg->cmd_type; + +out_no_rsp: + // Should we return err in rdma_resp ? 
+ pr_debug("send rtrs completion from msg_cmd_handler, err:%d\n", err); + rtrs_srv_resp_rdma(work_info->rtrs_op, err); + + rmr_put_srv_pool(srv_pool); + kfree(work_info); +} + +static int schedule_process_msg_cmd(struct rmr_srv_sess *srv_sess, + struct rtrs_srv_op *rtrs_op, + void *data, size_t datalen, + const void *msg, size_t len) +{ + struct rmr_srv_pool *srv_pool; + const struct rmr_msg_pool_cmd *cmd_msg = msg; + const char *poolname = cmd_msg->pool_name; + struct rmr_cmd_work_info *work_info; + u32 group_id = le32_to_cpu(cmd_msg->hdr.group_id); + + pr_debug("pool %s received cmd %d\n", + poolname, cmd_msg->cmd_type); + + srv_pool = rmr_find_and_get_srv_pool(group_id); + if (IS_ERR(srv_pool)) { + pr_err("Cmd %s: pool %s does not exists: %pe\n", + rmr_get_cmd_name(cmd_msg->cmd_type), poolname, srv_pool); + return PTR_ERR(srv_pool); + } + + pr_debug("process_msg_cmd: pool %s found\n", poolname); + + work_info = kzalloc(sizeof(struct rmr_cmd_work_info), GFP_KERNEL); + if (!work_info) { + pr_err("failed to allocate work info to send map\n"); + rmr_put_srv_pool(srv_pool); + return -ENOMEM; + } + work_info->pool = srv_pool->pool; + work_info->sess = srv_sess; + work_info->rtrs = srv_sess->rtrs; + work_info->rtrs_op = rtrs_op; + work_info->cmd_msg = cmd_msg; + work_info->rsp = data; + work_info->data = data; + work_info->datalen = datalen; + + INIT_WORK(&work_info->cmd_work, process_msg_cmd_handler); + schedule_work(&work_info->cmd_work); + + return 0; +} + +static int rmr_srv_rdma_ev(void *priv, struct rtrs_srv_op *id, + void *data, size_t datalen, + const void *usr, size_t usrlen) +{ + struct rmr_srv_sess *srv_sess = priv; + const struct rmr_msg_hdr *hdr = usr; + int ret = 0; + u16 type; + + if (unlikely(WARN_ON(!srv_sess))) + return -ENODEV; + + type = le16_to_cpu(hdr->type); + + switch (type) { + case RMR_MSG_IO: + return process_msg_io(srv_sess, id, data, datalen, + usr, usrlen); + case RMR_MSG_MAP_CLEAR: + ret = process_msg_map_clear(srv_sess, usr); + break; + case RMR_MSG_MAP_ADD: + ret = process_msg_map_add(srv_sess, usr); + break; + case RMR_MSG_CMD: + return schedule_process_msg_cmd(srv_sess, id, data, datalen, + usr, usrlen); + default: + pr_warn("Received unexpected message type %d from session %s\n", + type, srv_sess->sessname); + return -EINVAL; + } + + rtrs_srv_resp_rdma(id, ret); + + return 0; +} + +/** + * rmr_srv_check_params() - Check the parameters of the storage node + * + * @srv_pool: The rmr srv pool to check parameters for + * + * Description: + * Checks the device params with other connected server nodes. + * + * Return: + * 0 on success. + * -Negative error code on failure. + */ +int rmr_srv_check_params(struct rmr_srv_pool *srv_pool) +{ + void *dev; + int err; + + /* If the store has not been added to this server pool, ignore device param checks. */ + if (!srv_pool->io_store) + return 0; + + dev = srv_pool->io_store->priv; + err = srv_pool->io_store->ops->get_params(dev); + if (err) { + pr_err("%s: store get_params failed for pool %s, err %d\n", + __func__, srv_pool->pool->poolname, err); + return err; + } + return 0; +} +EXPORT_SYMBOL(rmr_srv_check_params); + +static struct rtrs_srv_ops rtrs_ops; +static int __init rmr_srv_init_module(void) +{ + int err; + + if (!is_power_of_2(chunk_size) || + chunk_size < MIN_CHUNK_SIZE || chunk_size > MAX_CHUNK_SIZE) { + pr_err("Loading module %s failed. 
Invalid chunk_size %u\n",
+ KBUILD_MODNAME, chunk_size);
+ pr_err("Chunk size should be a power of 2, and between (min %u - max %u)\n",
+ MIN_CHUNK_SIZE, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+
+ pr_info("Loading module %s, version %s, proto %s, chunk_size %u\n",
+ KBUILD_MODNAME, RMR_VER_STRING, RMR_PROTO_VER_STRING, chunk_size);
+
+ rtrs_ops = (struct rtrs_srv_ops){
+ .rdma_ev = rmr_srv_rdma_ev,
+ .link_ev = rmr_srv_link_ev,
+ };
+
+ rmr_req_cachep = kmem_cache_create("rmr_req_cachep", sizeof(struct rmr_srv_req),
+ 0, 0, NULL);
+ if (!rmr_req_cachep) {
+ pr_err("cannot allocate cachep for rmr_req\n");
+ err = -ENOMEM;
+ goto out;
+ }
+ rmr_map_entry_cachep = kmem_cache_create("rmr_map_entry_cachep",
+ sizeof(struct rmr_map_entry),
+ 0, 0, NULL);
+ if (!rmr_map_entry_cachep) {
+ pr_err("cannot allocate cachep for rmr_map_entry\n");
+ err = -ENOMEM;
+ goto req_destroy;
+ }
+
+ BUILD_BUG_ON(PAGE_SIZE / sizeof(struct rmr_map_cbuf_hdr) < RMR_POOL_MAX_SESS);
+
+ rtrs_ctx = rtrs_srv_open(&rtrs_ops, RTRS_PORT);
+ if (IS_ERR(rtrs_ctx)) {
+ err = PTR_ERR(rtrs_ctx);
+ pr_err("rtrs_srv_open(), err: %pe\n", rtrs_ctx);
+ goto map_destroy;
+ }
+
+ err = rmr_srv_create_sysfs_files();
+ if (err) {
+ pr_err("rmr_srv_create_sysfs_files(), err: %d\n", err);
+ goto srv_close;
+ }
+
+ return 0;
+
+srv_close:
+ rtrs_srv_close(rtrs_ctx);
+map_destroy:
+ kmem_cache_destroy(rmr_map_entry_cachep);
+req_destroy:
+ kmem_cache_destroy(rmr_req_cachep);
+out:
+ return err;
+}
+
+static void __exit rmr_srv_cleanup_module(void)
+{
+ struct rmr_pool *pool, *tmp;
+ struct rmr_srv_pool *srv_pool;
+
+ pr_info("Unloading module\n");
+ kmem_cache_destroy(rmr_req_cachep);
+
+ rtrs_srv_close(rtrs_ctx);
+
+ list_for_each_entry_safe(pool, tmp, &pool_list, entry) {
+ srv_pool = (struct rmr_srv_pool *)pool->priv;
+
+ WARN_ON(!list_empty(&pool->sess_list));
+ rmr_srv_destroy_pool(pool);
+ rmr_srv_destroy_pool_sysfs_files(pool, NULL);
+ rmr_put_srv_pool(srv_pool);
+ }
+
+ rmr_srv_destroy_sysfs_files();
+ pr_info("Module unloaded\n");
+}
+
+module_init(rmr_srv_init_module);
+module_exit(rmr_srv_cleanup_module);
diff --git a/drivers/infiniband/ulp/rmr/rmr-srv.h b/drivers/infiniband/ulp/rmr/rmr-srv.h
new file mode 100644
index 000000000000..a84586aa78bd
--- /dev/null
+++ b/drivers/infiniband/ulp/rmr/rmr-srv.h
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Reliable multicast over RTRS (RMR)
+ *
+ * Copyright (c) 2026 IONOS SE
+ */
+
+#ifndef RMR_SRV_H
+#define RMR_SRV_H
+
+/* rmr-srv-sysfs.c */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "rmr-pool.h"
+
+/*
+ * IO store interface implemented by an upper-layer consumer of rmr-server.
+ * All consumer-specific types are passed as void * so RMR remains
+ * independent of any particular client.
+ */ +struct rmr_srv_store_ops { + int (*submit_req)(void *device, void *data, u32 offset, u32 length, + unsigned long flags, u16 prio, void *priv); + int (*submit_md_req)(void *device, void *data, u32 offset, u32 length, + unsigned long flags, void *priv); + int (*submit_cmd)(void *device, const void *usr_buf, int usr_len, + void *data, int datalen); + bool (*io_allowed)(void *store_priv); + int (*get_params)(void *device); +}; + +#define DEFAULT_SYNC_QUEUE_DEPTH 32 +#define RMR_SRV_CHECK_MAPS_INTERVAL_MS 3000 +#define RMR_SRV_MD_SYNC_INTERVAL_MS 500 +#define RMR_SRV_DISCARD_TIMEOUT_MS 500 + +/* Bit indices for srv_pool->md_dirty — used with set_bit / test_and_clear_bit */ +enum rmr_srv_md_dirty_bit { + MD_DIRTY_POOL, /* pool_md fields changed */ + MD_DIRTY_MAPS, /* map bitmap changed */ + MD_DIRTY_LAST_IO, /* last_io updated */ +}; + +extern struct kmem_cache *rmr_req_cachep; +extern struct kmem_cache *rmr_map_entry_cachep; + +enum rmr_srv_register_disk_mode { + RMR_SRV_DISK_CREATE, /* Fresh store, new pool */ + RMR_SRV_DISK_ADD, /* Rejoin an existing pool */ + RMR_SRV_DISK_REPLACE, /* Replace an existing store */ +}; + +/* + * When adding state, remember to add an entry in the function rmr_get_srv_pool_state_name() + */ +enum rmr_srv_pool_state { + RMR_SRV_POOL_STATE_EMPTY, + RMR_SRV_POOL_STATE_REGISTERED, + RMR_SRV_POOL_STATE_CREATED, + RMR_SRV_POOL_STATE_NORMAL, + RMR_SRV_POOL_STATE_NO_IO, +}; + +struct rmr_srv_pool { + u8 member_id; + refcount_t refcount; + atomic_t state; + bool maintenance_mode; + + struct rmr_pool *pool; + + /* Sync thread */ + struct task_struct *th_tsk; + atomic_t thread_state; + atomic_t in_flight_sync_reqs; + + struct rmr_srv_io_store *io_store; + struct mutex srv_pool_lock; + atomic_t store_state; + + bool marked_create; + bool marked_delete; + + unsigned long md_dirty; /* bitmask of dirty regions */ + unsigned long map_update_state; + /* The internal client pool assigned to this server pool. */ + struct rmr_pool *clt; + size_t queue_depth; + rmr_id_t *last_io; + /* + * Each storage node keeps a command array with the length of queue depth to track the IOs + * in the last round. Use an array of chunk indexes as a copy of srv_pool->last_io so that + * it can be written back to/read from backing store as needed. 
+ */ + rmr_id_t *last_io_idx; + + u32 max_sync_io_size; + struct workqueue_struct *clean_wq; + struct delayed_work clean_dwork; + + struct workqueue_struct *md_sync_wq; + struct delayed_work md_sync_dwork; + struct delayed_work last_io_sync_dwork; +}; + +/** + * rmr_srv_mark_pool_md_dirty() - Set MD_DIRTY_POOL and schedule delayed sync + * @srv_pool: Server pool with changed pool_md fields + */ +static inline void rmr_srv_mark_pool_md_dirty(struct rmr_srv_pool *srv_pool) +{ + set_bit(MD_DIRTY_POOL, &srv_pool->md_dirty); + mod_delayed_work(srv_pool->md_sync_wq, &srv_pool->md_sync_dwork, + msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS)); +} + +struct rmr_srv_sess { + struct list_head pool_sess_list; + struct rtrs_srv_sess *rtrs; + struct kobject kobj; + char sessname[NAME_MAX]; + struct mutex lock; + u8 ver; + struct xarray pools; + struct list_head g_list_entry; +}; + +struct rmr_srv_pool_sess { + struct list_head pool_entry; /* for pool->sess_list */ + struct list_head srv_sess_entry; + struct rmr_srv_pool *srv_pool; + struct kobject kobj; + char sessname[NAME_MAX]; + struct rmr_srv_sess *srv_sess; + bool sync; +}; + +struct rmr_srv_io_store { + struct rmr_srv_store_ops *ops; + void *priv; +}; + +struct rmr_cmd_work_info { + struct work_struct cmd_work; + struct rmr_pool *pool; + struct rmr_srv_sess *sess; + struct rtrs_srv_sess *rtrs; + const struct rmr_msg_pool_cmd *cmd_msg; + struct rmr_msg_pool_cmd_rsp *rsp; + struct rtrs_srv_op *rtrs_op; + void *data; + size_t datalen; +}; + +void rmr_put_srv_pool(struct rmr_srv_pool *srv_pool); +struct rmr_srv_pool *rmr_create_srv_pool(char *poolname, u32 member_id); +void rmr_srv_pool_update_params(struct rmr_pool *pool); +int rmr_srv_read_md(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, u32 offset, u32 len, + struct rmr_pool_md *pool_md_page); +int rmr_srv_send_md_update(struct rmr_pool *pool); +int rmr_srv_check_params(struct rmr_srv_pool *srv_pool); +void rmr_srv_mark_maps_dirty(struct rmr_srv_pool *srv_pool); + +/* rmr-srv-md.c */ +struct rmr_srv_req; /* forward decl for endreq prototype */ + +bool rmr_get_srv_pool(struct rmr_srv_pool *srv_pool); +void rmr_srv_endreq(struct rmr_srv_req *req, int err); + +int process_md_io(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, + u32 offset, u32 len, unsigned long flags, void *buf); +void rmr_srv_md_maps_sync(struct rmr_pool *pool); +void rmr_srv_flush_pool_md(struct rmr_srv_pool *srv_pool); +void rmr_srv_md_sync(struct work_struct *work); +int rmr_srv_md_process_buf(struct rmr_pool *pool, void *buf, bool sync); +int rmr_srv_refresh_md(struct rmr_srv_pool *srv_pool); + +/* rmr-srv-sysfs.c */ + +int rmr_srv_create_sysfs_files(void); +void rmr_srv_destroy_sysfs_files(void); +void rmr_srv_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self); +int rmr_srv_sysfs_add_sess(struct rmr_pool *pool, + struct rmr_srv_pool_sess *pool_sess); +void rmr_srv_sysfs_del_sess(struct rmr_srv_pool_sess *pool_sess); + +void rmr_srv_free_sync_permits(struct rmr_pool *pool); +void rmr_srv_destroy_pool(struct rmr_pool *pool); +int rmr_srv_remove_clt_pool(struct rmr_srv_pool *srv_pool); + +void rmr_srv_stop_sync_and_go_offline(struct rmr_pool *pool); + +int rmr_srv_get_sync_permit(struct rmr_srv_pool *srv_pool); +void rmr_srv_put_sync_permit(struct rmr_srv_pool *srv_pool); + +int rmr_srv_sync_thread_start(struct rmr_srv_pool *srv_pool); +int rmr_srv_sync_thread_stop(struct rmr_srv_pool *srv_pool); + +void rmr_srv_sync_req_failed(struct rmr_srv_pool *srv_pool); + +int rmr_srv_query(struct 
rmr_pool *pool, u64 mapped_size, struct rmr_attrs *attr);
+/* register/unregister rmr-srv */
+struct rmr_pool *rmr_srv_register(char *poolname, struct rmr_srv_store_ops *ops, void *priv,
+ u64 mapped_size, enum rmr_srv_register_disk_mode mode);
+void rmr_srv_unregister(char *poolname, bool delete);
+
+int rmr_srv_pool_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv,
+ const struct kvec *usr_vec, size_t nr, void *buf, int buf_len,
+ size_t size);
+int rmr_srv_discard_id(struct rmr_pool *pool, u64 offset, u64 length, u8 member_id, bool sync);
+void rmr_srv_replace_store(struct rmr_pool *pool);
+
+#endif /* RMR_SRV_H */
diff --git a/drivers/infiniband/ulp/rmr/rmr.h b/drivers/infiniband/ulp/rmr/rmr.h
new file mode 100644
index 000000000000..72d591ccc047
--- /dev/null
+++ b/drivers/infiniband/ulp/rmr/rmr.h
@@ -0,0 +1,229 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Reliable multicast over RTRS (RMR)
+ *
+ * Copyright (c) 2026 IONOS SE
+ */
+
+#ifndef RMR_H
+#define RMR_H
+
+#include
+#include
+#include
+
+#include "rmr-proto.h"
+struct rmr_pool;
+
+typedef void (rmr_conf_fn)(void *priv, int errno);
+enum rmr_wait_type {
+ NO_WAIT = RTRS_PERMIT_NOWAIT,
+ WAIT = RTRS_PERMIT_WAIT
+};
+
+/*
+ * RMR client API
+ */
+
+/**
+ * rmr_clt_put_pool() - Decrements the pool refcount and frees the pool
+ * when it reaches 0.
+ * @pool: Pool to put
+ */
+void rmr_clt_put_pool(struct rmr_pool *pool);
+
+/**
+ * enum rmr_clt_link_ev - Events about connectivity state of a client
+ * @RMR_CLT_LINK_EV_RECONNECTED: Client was reconnected.
+ * @RMR_CLT_LINK_EV_DISCONNECTED: Client was disconnected.
+ */
+enum rmr_clt_link_ev {
+ RMR_CLT_LINK_EV_RECONNECTED,
+ RMR_CLT_LINK_EV_DISCONNECTED,
+};
+
+typedef void (rmr_clt_ev_fn)(void *priv, enum rmr_clt_link_ev ev);
+/**
+ * rmr_clt_open() - Opens a pool from the RMR client
+ * @priv: User supplied private data.
+ * @link_ev: Event notification callback for connection state changes;
+ * it is called with @priv and the occurred event.
+ * @poolname: name of the pool
+ *
+ * Only one user can open a pool at the same time.
+ * However administrative operations are possible.
+ *
+ * Return: a valid pointer on success, otherwise an ERR_PTR().
+ */
+struct rmr_pool *rmr_clt_open(void *priv, rmr_clt_ev_fn *link_ev, const char *poolname);
+
+/**
+ * rmr_clt_get_priv() - Returns the priv data that was provided to rmr_clt_open()
+ * @pool: Pool handle
+ */
+void *rmr_clt_get_priv(struct rmr_pool *pool);
+
+/**
+ * rmr_clt_close() - Closes a pool
+ * @pool: Pool handle, freed on return
+ */
+void rmr_clt_close(struct rmr_pool *pool);
+
+#define RMR_OP_BITS 8
+#define RMR_OP_MASK ((1 << RMR_OP_BITS) - 1)
+
+/**
+ * enum rmr_io_flags - RMR request types from rq_flag_bits
+ * @RMR_OP_READ: read object
+ * @RMR_OP_WRITE: write object
+ * @RMR_OP_DISCARD: remove object
+ * @RMR_OP_SYNCREQ: sync request
+ * @RMR_OP_WRITE_ZEROES: write zeroes
+ * @RMR_OP_FLUSH: flush object
+ * @RMR_OP_MD_READ: read metadata of rmr
+ * @RMR_OP_MD_WRITE: write metadata of rmr
+ */
+enum rmr_io_flags {
+ /* Operations */
+ RMR_OP_READ = 0,
+ RMR_OP_WRITE = 1,
+ RMR_OP_DISCARD = 2,
+ RMR_OP_SYNCREQ = 3,
+ RMR_OP_WRITE_ZEROES = 4,
+ RMR_OP_FLUSH = 5,
+ /* Add metadata related operations below this.
 */
+ RMR_OP_MD_READ = 6,
+ RMR_OP_MD_WRITE = 7,
+
+ /* Flags */
+ RMR_F_SYNC = 1 << (RMR_OP_BITS + 0), /* 0x100 */
+ RMR_F_FUA = 1 << (RMR_OP_BITS + 1), /* 0x200 */
+};
+
+static inline u32 rmr_op(u32 flag)
+{
+ return flag & RMR_OP_MASK;
+}
+
+static inline u32 rmr_flags(u32 flag)
+{
+ return flag & ~RMR_OP_MASK;
+}
+
+/*
+ * Holds the 128-bit block id (a.k.a. object id).
+ */
+typedef struct {
+ u64 a;
+ u64 b;
+} rmr_id_t;
+
+struct rmr_iu;
+
+/**
+ * rmr_clt_get_iu() - allocates an iu for a future RDMA operation
+ * @pool: Current pool
+ * @flag: READ/WRITE/REMOVE
+ * @wait: WAIT/NO_WAIT
+ *
+ * Description:
+ * Allocates an iu for the following RDMA operation. The iu is used
+ * to preallocate all resources and to propagate memory pressure
+ * up early.
+ */
+struct rmr_iu *rmr_clt_get_iu(struct rmr_pool *pool,
+ enum rmr_io_flags flag,
+ enum rmr_wait_type wait);
+
+/**
+ * rmr_clt_put_iu() - puts an allocated iu
+ * @pool: Current pool
+ * @iu: Iu to be freed
+
+ *
+ * Context:
+ * Any context.
+ */
+void rmr_clt_put_iu(struct rmr_pool *pool, struct rmr_iu *iu);
+
+/**
+ * rmr_clt_request() - Request data transfer to/from server via RDMA.
+ *
+ * @pool: The pool
+ * @iu: Iu allocated by a previous rmr_clt_get_iu() call.
+ * @offset: offset inside the object to read/write
+ * @length: length of data starting from offset
+ * @flag: READ/WRITE/REMOVE
+ * @prio: priority of IO
+ * @priv: User provided data, passed back with the corresponding
+ * @conf confirmation.
+ * @conf: callback function to be called as confirmation
+ * @sg: Pages to be sent/received to/from server.
+ * @sg_cnt: Number of elements in @sg
+ *
+ * Return:
+ * 0: Success
+ * -EAGAIN: Currently there are no resources to execute the request.
+ * Retry later.
+ * <0: Error
+ *
+ * On flag=READ the rtrs client will request a data transfer from server to
+ * client. The data the server responds with is stored in @sg when the user
+ * confirmation function is called.
+ * On flag=WRITE the rtrs client will RDMA-write the data in @sg to the
+ * server side.
+ */
+int rmr_clt_request(struct rmr_pool *pool, struct rmr_iu *iu,
+ size_t offset, size_t length, enum rmr_io_flags flag, unsigned short prio,
+ void *priv, rmr_conf_fn *conf, struct scatterlist *sg, unsigned int sg_cnt);
+
+int rmr_clt_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv,
+ const struct kvec *usr_vec, size_t nr, void *buf, int buf_len,
+ size_t size);
+
+/**
+ * struct rmr_attrs - RMR pool attributes
+ */
+struct rmr_attrs {
+ u32 queue_depth;
+ u32 max_io_size;
+ u32 chunk_size;
+ u32 max_segments;
+ u64 rmr_md_size; /* in sectors */
+ u8 sync;
+ struct kobject *pool_kobj;
+};
+
+/**
+ * rmr_clt_query() - queries RMR pool attributes
+ * @pool: Pool to query
+ * @attr: Attributes to fill in
+ *
+ * Return:
+ * 0 on success
+ * -EINVAL if there is no session in the pool
+ */
+int rmr_clt_query(struct rmr_pool *pool, struct rmr_attrs *attr);
+
+typedef enum {
+ RMR_MAP_ADD,
+ RMR_MAP_REMOVE,
+} rmr_map_cmd;
+
+#define RMR_STORE_ID_BITS 32
+#define RMR_STORE_ID_OFFSET (64 - RMR_STORE_ID_BITS)
+
+#define RMR_CHUNK_BITS 32
+#define RMR_CHUNK_OFFSET 0
+
+enum rmr_pool_state {
+ RMR_POOL_STATE_CREATED = 0,
+ RMR_POOL_STATE_JOINED,
+ RMR_POOL_STATE_ONLINE,
+ /* maybe we will use this later */
+ RMR_POOL_STATE_DEGRADED,
+ RMR_POOL_STATE_SYNCING,
+};
+
+#endif /* RMR_H */
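
Editor's note (not part of the patch): to illustrate how the client API declared in rmr.h above is meant to be used together, here is a minimal, hypothetical consumer sketch. Only rmr_clt_get_iu(), rmr_clt_request(), rmr_clt_put_iu() and the RMR_OP_*/RMR_F_* flags are taken from the header; the callback, the helper name and the error handling (including whether the iu must be put again after a successful request) are assumptions made for illustration.

#include <linux/completion.h>
#include <linux/printk.h>
#include <linux/scatterlist.h>
#include "rmr.h"

/* Hypothetical confirmation callback, matching the rmr_conf_fn typedef. */
static void example_write_done(void *priv, int errno)
{
	if (errno)
		pr_err("example write failed: %d\n", errno);
	complete(priv);
}

/* Hypothetical helper: synchronously write one page at offset 0 of an object. */
static int example_write_page(struct rmr_pool *pool, struct page *page)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct scatterlist sg;
	struct rmr_iu *iu;
	int err;

	/* Preallocate resources first; WAIT may sleep until a permit is free. */
	iu = rmr_clt_get_iu(pool, RMR_OP_WRITE, WAIT);
	if (!iu)
		return -EAGAIN;

	sg_init_table(&sg, 1);
	sg_set_page(&sg, page, PAGE_SIZE, 0);

	/* RMR_F_SYNC is OR-ed on top of the opcode bits (see RMR_OP_BITS). */
	err = rmr_clt_request(pool, iu, 0, PAGE_SIZE, RMR_OP_WRITE | RMR_F_SYNC,
			      0, &done, example_write_done, &sg, 1);
	if (err) {
		rmr_clt_put_iu(pool, iu);
		return err;
	}

	wait_for_completion(&done);
	rmr_clt_put_iu(pool, iu);
	return 0;
}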
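
Editor's note (not part of the patch): the rmr_srv_store_ops interface in rmr-srv.h is what an upper layer such as the BRMR server implements so that RMR can drive its backing store. The sketch below is hypothetical glue code using only the ops structure, rmr_srv_register()/rmr_srv_unregister() and the disk-mode enum from the header; the function and variable names, and the assumption that rmr_srv_register() returns an ERR_PTR() on failure, are illustrative.

#include <linux/err.h>
#include "rmr-srv.h"

/* Hypothetical data-path callback: issue @data against the local store. */
static int example_submit_req(void *device, void *data, u32 offset, u32 length,
			      unsigned long flags, u16 prio, void *priv)
{
	/* Queue the IO on the backing device; complete it later through the
	 * consumer's own completion path (e.g. rmr_srv_endreq()). */
	return 0;
}

static bool example_io_allowed(void *store_priv)
{
	return true;
}

static struct rmr_srv_store_ops example_store_ops = {
	.submit_req	= example_submit_req,
	.io_allowed	= example_io_allowed,
	/* .submit_md_req, .submit_cmd and .get_params omitted for brevity */
};

/* Hypothetical attach path: export a fresh local store as a new pool. */
static struct rmr_pool *example_attach_store(char *poolname, void *dev, u64 mapped_size)
{
	struct rmr_pool *pool;

	pool = rmr_srv_register(poolname, &example_store_ops, dev,
				mapped_size, RMR_SRV_DISK_CREATE);
	if (IS_ERR(pool))
		return pool;

	/* ... and on teardown: rmr_srv_unregister(poolname, false); */
	return pool;
}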
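
Editor's note (not part of the patch): the md_dirty bit scheme and rmr_srv_mark_pool_md_dirty() in rmr-srv.h imply a delayed-work consumer that flushes whichever metadata regions are marked dirty. The real handler is rmr_srv_md_sync() in rmr-srv-md.c, which is not shown in this excerpt; the sketch below is only a guess at its shape, built from the helpers the header does declare.

#include <linux/workqueue.h>
#include "rmr-srv.h"

/* Illustrative only: a possible shape for the delayed metadata sync worker. */
static void example_md_sync(struct work_struct *work)
{
	struct rmr_srv_pool *srv_pool =
		container_of(to_delayed_work(work), struct rmr_srv_pool, md_sync_dwork);

	if (test_and_clear_bit(MD_DIRTY_POOL, &srv_pool->md_dirty))
		rmr_srv_flush_pool_md(srv_pool);	/* persist pool_md */

	if (test_and_clear_bit(MD_DIRTY_MAPS, &srv_pool->md_dirty))
		rmr_srv_md_maps_sync(srv_pool->pool);	/* persist the dirty maps */

	/* MD_DIRTY_LAST_IO would be handled analogously, likely via
	 * last_io_sync_dwork rather than here. */
}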