diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 858320b6ebb7..65167fcb1357 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -353,6 +353,8 @@ config BLKDEV_UBLK_LEGACY_OPCODES
 
 source "drivers/block/rnbd/Kconfig"
 
+source "drivers/block/brmr/Kconfig"
+
 config BLK_DEV_ZONED_LOOP
 	tristate "Zoned loopback device support"
 	depends on BLK_DEV_ZONED
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 2d8096eb8cdf..4793c9b0b383 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
 
 obj-$(CONFIG_ZRAM) += zram/
 obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
+obj-$(CONFIG_BLK_DEV_BRMR) += brmr/
 obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
 obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull/
 
diff --git a/drivers/block/brmr/Kconfig b/drivers/block/brmr/Kconfig
new file mode 100644
index 000000000000..a38d59d2c1d4
--- /dev/null
+++ b/drivers/block/brmr/Kconfig
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+config BLK_DEV_BRMR
+	bool
+
+config BLK_DEV_BRMR_CLIENT
+	tristate "Block device over RMR (BRMR) client"
+	depends on INFINIBAND_RMR_CLIENT
+	select BLK_DEV_BRMR
+	help
+	  BRMR client is a block device driver that sits on top of the
+	  RMR ULP and exposes a standard Linux block device (/dev/brmrX)
+	  backed by an RMR pool. Together with RMR it provides a
+	  single-hop replication and resynchronization solution for
+	  RDMA-connected storage clusters.
+
+	  If unsure, say N.
+
+config BLK_DEV_BRMR_SERVER
+	tristate "Block device over RMR (BRMR) server"
+	depends on INFINIBAND_RMR_SERVER
+	select BLK_DEV_BRMR
+	help
+	  BRMR server exports a local block device as the backing store
+	  for an RMR pool, so that BRMR clients can map it remotely
+	  over RDMA.
+
+	  If unsure, say N.
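A minimal sketch of a .config fragment that builds both sides as modules, assuming the RMR ULP symbols INFINIBAND_RMR_CLIENT and INFINIBAND_RMR_SERVER (which the options above depend on) are tristate; BLK_DEV_BRMR itself is selected automatically:

CONFIG_INFINIBAND_RMR_CLIENT=m
CONFIG_BLK_DEV_BRMR_CLIENT=m
CONFIG_INFINIBAND_RMR_SERVER=m
CONFIG_BLK_DEV_BRMR_SERVER=m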
diff --git a/drivers/block/brmr/Makefile b/drivers/block/brmr/Makefile new file mode 100644 index 000000000000..894ba2720557 --- /dev/null +++ b/drivers/block/brmr/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +ccflags-y := -I$(srctree)/drivers/infiniband/ulp/rtrs \ + -I$(srctree)/drivers/infiniband/ulp/rmr \ + -I$(srctree)/drivers/block/brmr + +brmr-client-y := brmr-clt.o \ + brmr-clt-sysfs.o \ + brmr-clt-reque.o \ + brmr-clt-stats.o + +brmr-server-y := brmr-srv-sysfs.o \ + brmr-srv.o + +obj-$(CONFIG_BLK_DEV_BRMR_CLIENT) += brmr-client.o +obj-$(CONFIG_BLK_DEV_BRMR_SERVER) += brmr-server.o diff --git a/drivers/block/brmr/brmr-clt-reque.c b/drivers/block/brmr/brmr-clt-reque.c new file mode 100644 index 000000000000..252661486a0a --- /dev/null +++ b/drivers/block/brmr/brmr-clt-reque.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "brmr-clt.h" +#include "rmr.h" +#include "rmr-pool.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(BRMR_VER_STRING); +MODULE_DESCRIPTION("BRMR Block Device using RMR cluster"); +MODULE_LICENSE("GPL"); + +static inline void brmr_requeue(struct brmr_queue *q) +{ + if (WARN_ON(!q->hctx)) + return; + + /* We can come here from interrupt, thus async=true */ + blk_mq_run_hw_queue(q->hctx, true); +} + +/** + * requeue implementation as used by ibnbd + */ + +void brmr_init_cpu_qlists(struct brmr_cpu_qlist __percpu *cpu_queues) +{ + unsigned int cpu; + struct brmr_cpu_qlist *cpu_q; + + for_each_possible_cpu(cpu) { + cpu_q = per_cpu_ptr(cpu_queues, cpu); + + cpu_q->cpu = cpu; + INIT_LIST_HEAD(&cpu_q->requeue_list); + spin_lock_init(&cpu_q->requeue_lock); + } +} + +/** + * brmr_get_cpu_qlist() - finds a list with HW queues to be requeued + * + * Description: + * Each CPU has a list of HW queues, which needs to be requeed. If a list + * is not empty - it is marked with a bit. This function finds first + * set bit in a bitmap and returns corresponding CPU list. + */ +static struct brmr_cpu_qlist * +brmr_get_cpu_qlist(struct brmr_clt_pool *pool, int cpu) +{ + int bit; + + /* First half */ + bit = find_next_bit(pool->cpu_queues_bm, nr_cpu_ids, cpu); + if (bit < nr_cpu_ids) { + return per_cpu_ptr(pool->cpu_queues, bit); + } else if (cpu != 0) { + /* Second half */ + bit = find_next_bit(pool->cpu_queues_bm, cpu, 0); + if (bit < cpu) + return per_cpu_ptr(pool->cpu_queues, bit); + } + + return NULL; +} + +static inline int nxt_cpu(int cpu) +{ + return (cpu + 1) % nr_cpu_ids; +} + +/** + * brmr_requeue_if_needed() - requeue if CPU queue is marked as non empty + * + * Description: + * Each CPU has it's own list of HW queues, which should be requeued. + * Function finds such list with HW queues, takes a list lock, picks up + * the first HW queue out of the list and requeues it. + * + * Return: + * True if the queue was requeued, false otherwise. + * + * Context: + * Does not matter. + */ +static inline bool brmr_requeue_if_needed(struct brmr_clt_pool *pool) +{ + struct brmr_queue *q = NULL; + struct brmr_cpu_qlist *cpu_q; + unsigned long flags; + int *cpup; + + /* + * To keep fairness and not to let other queues starve we always + * try to wake up someone else in round-robin manner. That of course + * increases latency but queues always have a chance to be executed. 
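+	 * brmr_get_cpu_qlist() below scans the CPU bitmap from the CPU after
+	 * the one served last up to nr_cpu_ids and then wraps around from 0,
+	 * so every non-empty per-CPU list eventually gets its turn.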
+ */ + cpup = get_cpu_ptr(pool->cpu_rr); + for (cpu_q = brmr_get_cpu_qlist(pool, nxt_cpu(*cpup)); cpu_q; + cpu_q = brmr_get_cpu_qlist(pool, nxt_cpu(cpu_q->cpu))) { + if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags)) + continue; + if (likely(test_bit(cpu_q->cpu, pool->cpu_queues_bm))) { + q = list_first_entry_or_null(&cpu_q->requeue_list, + typeof(*q), requeue_list); + if (WARN_ON(!q)) + goto clear_bit; + list_del_init(&q->requeue_list); + clear_bit_unlock(0, &q->in_list); + + if (list_empty(&cpu_q->requeue_list)) { + /* Clear bit if nothing is left */ +clear_bit: + clear_bit(cpu_q->cpu, pool->cpu_queues_bm); + } + } + spin_unlock_irqrestore(&cpu_q->requeue_lock, flags); + + if (q) + break; + } + + /** + * Saves the CPU that is going to be requeued on the per-cpu var. Just + * incrementing it doesn't work because brmr_get_cpu_qlist() will + * always return the first CPU with something on the queue list when the + * value stored on the var is greater than the last CPU with something + * on the list. + */ + if (cpu_q) + *cpup = cpu_q->cpu; + put_cpu_var(pool->cpu_rr); + + if (q) + brmr_requeue(q); + + return !!q; +} + +/** + * brmr_requeue_requests() - requeue all queues left in the list if + * brmr_clt_pool is idling (there are no requests in-flight). + * + * Description: + * This function tries to rerun all stopped queues if there are no + * requests in-flight anymore. This function tries to solve an obvious + * problem, when number of tags < than number of queues (hctx), which + * are stopped and put to sleep. If last tag, which has been just put, + * does not wake up all left queues (hctxs), IO requests hang forever. + * + * That can happen when all number of tags, say N, have been exhausted + * from one CPU, and we have many block devices per session, say M. + * Each block device has it's own queue (hctx) for each CPU, so eventually + * we can put that number of queues (hctxs) to sleep: M x nr_cpu_ids. + * If number of tags N < M x nr_cpu_ids finally we will get an IO hang. + * + * To avoid this hang last caller of brmr_put_iu() (last caller is the + * one who observes pool->busy == 0) must wake up all remaining queues. + * + * Context: + * Called from msg_io_conf which in turn is a completion handler + * that is called from interupt. + */ +void brmr_requeue_requests(struct brmr_clt_pool *pool) +{ + bool requeued; + + do { + requeued = brmr_requeue_if_needed(pool); + } while (atomic_read(&pool->busy) == 0 && requeued); +} + +bool brmr_add_to_requeue(struct brmr_clt_pool *pool, struct brmr_queue *q) +{ + struct brmr_cpu_qlist *cpu_q; + unsigned long flags; + bool added = true; + bool need_set; + + cpu_q = get_cpu_ptr(pool->cpu_queues); + spin_lock_irqsave(&cpu_q->requeue_lock, flags); + + if (likely(!test_and_set_bit_lock(0, &q->in_list))) { + if (WARN_ON(!list_empty(&q->requeue_list))) + goto unlock; + + need_set = !test_bit(cpu_q->cpu, pool->cpu_queues_bm); + if (need_set) { + set_bit(cpu_q->cpu, pool->cpu_queues_bm); + /* Paired with brmr_put_iu(). Set a bit first + * and then observe the busy counter. + */ + smp_mb__before_atomic(); + } + if (likely(atomic_read(&pool->busy))) { + list_add_tail(&q->requeue_list, &cpu_q->requeue_list); + } else { + /* Very unlikely, but possible: busy counter was + * observed as zero. Drop all bits and return + * false to restart the queue by ourselves. 
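+	 * The caller (brmr_queue_rq()) then restarts the hardware queue itself
+	 * via blk_mq_delay_run_hw_queue().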
+ */ + if (need_set) + clear_bit(cpu_q->cpu, pool->cpu_queues_bm); + clear_bit_unlock(0, &q->in_list); + added = false; + } + } +unlock: + spin_unlock_irqrestore(&cpu_q->requeue_lock, flags); + put_cpu_ptr(pool->cpu_queues); + + return added; +} + diff --git a/drivers/block/brmr/brmr-clt-stats.c b/drivers/block/brmr/brmr-clt-stats.c new file mode 100644 index 000000000000..de080fde779c --- /dev/null +++ b/drivers/block/brmr/brmr-clt-stats.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +//#include +//#include +//#include + +#include "brmr-clt.h" +#include "rmr.h" +#include "rmr-pool.h" + + +int brmr_clt_init_stats(struct brmr_clt_stats *stats) +{ + stats->pcpu_stats = alloc_percpu(typeof(*stats->pcpu_stats)); + if (unlikely(!stats->pcpu_stats)) + return -ENOMEM; + + return 0; +} + +void brmr_clt_free_stats(struct brmr_clt_stats *stats) +{ + free_percpu(stats->pcpu_stats); +} + +int brmr_clt_reset_submitted_req(struct brmr_clt_stats *stats, bool enable) +{ + struct brmr_stats_pcpu *s; + int cpu; + + if (unlikely(!enable)) + return -EINVAL; + + for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + memset(&s->submitted_requests, 0, + sizeof(s->submitted_requests)); + } + + return 0; +} + +int brmr_clt_reset_req_sizes(struct brmr_clt_stats *stats, bool enable) +{ + struct brmr_stats_pcpu *s; + int cpu; + + if (unlikely(!enable)) + return -EINVAL; + + for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + memset(&s->request_sizes, 0, + sizeof(s->request_sizes)); + } + + return 0; +} + +static void brmr_update_submitted_requests(struct brmr_stats_pcpu *s, + size_t size, int split, int d) +{ + s->submitted_requests.dir[d].total_sectors += (size >> SECTOR_SHIFT); + if (split) + s->submitted_requests.dir[d].cnt_split++; + else + s->submitted_requests.dir[d].cnt_whole++; +} + +#define MAX_LEN (128*1024) +#define NUM_CLASSES 16 +#define CLASSIFY_SHIFT (ilog2(MAX_LEN)-ilog2(NUM_CLASSES)) + +/** + classifies length linearly in 16 classes: + + input length in bytes + + < 0x2000 (8K) + >= 0x2000 (8K) + >= 0x4000 (16K) + >= 0x6000 (24K) + >= 0x8000 (32K) + >= 0xa000 (40K) + >= 0xc000 (48K) + >= 0xe000 (56K) + >= 0x10000 (64K) + >= 0x12000 (72K) + >= 0x14000 (80K) + >= 0x16000 (88K) + >= 0x18000 (96K) + >= 0x1a000 (104K) + >= 0x1c000 (112K) + >= 0x1e000 (120K) + + Maximum value is 128K-1. + However everything larger is classified as class 15 as well. +*/ +static inline int classify(long length) +{ + return length < MAX_LEN ? 
(length >> CLASSIFY_SHIFT) : NUM_CLASSES-1; +} + +static void brmr_update_request_sizes(struct brmr_stats_pcpu *s, + size_t size, int split, int d) +{ + int size_class = classify(size); + switch (split) { + case 0: + s->request_sizes.dir[d].cnt_whole[size_class]++; + break; + case 1: + s->request_sizes.dir[d].cnt_left[size_class]++; + break; + case 2: + s->request_sizes.dir[d].cnt_right[size_class]++; + break; + default: + WARN_ONCE(true,"unexpected value for split"); + } +} + +void brmr_update_stats(struct brmr_clt_stats *stats, size_t size, int split, int d) +{ + struct brmr_stats_pcpu *s; + + s = this_cpu_ptr(stats->pcpu_stats); + + brmr_update_submitted_requests(s, size, split, d); + brmr_update_request_sizes(s, size, split, d); +} + +ssize_t brmr_clt_stats_rq_to_str(struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_rq sum; + struct brmr_stats_rq *r; + int cpu; int d; + + memset(&sum, 0, sizeof(sum)); + + for_each_possible_cpu(cpu) { + r = &per_cpu_ptr(stats->pcpu_stats, cpu)->submitted_requests; + + for (d=READ; d<=WRITE; d++) { + sum.dir[d].cnt_whole += r->dir[d].cnt_whole; + sum.dir[d].cnt_split += r->dir[d].cnt_split; + sum.dir[d].total_sectors += r->dir[d].total_sectors; + } + } + + return scnprintf(page, len, "%llu %llu %llu %llu %llu %llu\n", + sum.dir[READ].cnt_whole, sum.dir[READ].cnt_split, + sum.dir[READ].total_sectors, + sum.dir[WRITE].cnt_whole, sum.dir[WRITE].cnt_split, + sum.dir[WRITE].total_sectors); +} + +ssize_t brmr_clt_stats_sizes_to_str(struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_sizes *sum; + struct brmr_stats_sizes *per_cpu; + int cpu; int d; int i; int cnt = 0; + + sum = kzalloc(sizeof(*sum), GFP_KERNEL); + if (unlikely(!sum)) + return -ENOMEM; + + for (i = 0; i < STATS_SIZES_NUM; i++) { + for_each_possible_cpu(cpu) { + per_cpu = &per_cpu_ptr(stats->pcpu_stats, cpu) + ->request_sizes; + + for (d=READ; d<=WRITE; d++) { + sum->dir[d].cnt_whole[i] + += per_cpu->dir[d].cnt_whole[i]; + sum->dir[d].cnt_left[i] + += per_cpu->dir[d].cnt_left[i]; + sum->dir[d].cnt_right[i] + += per_cpu->dir[d].cnt_right[i]; + } + } + } + + cnt += scnprintf(page + cnt, len - cnt, + " READ " + " whole left right " + "\n"); + if (len - cnt <= 0) + goto free_return; + + cnt += scnprintf(page + cnt, len - cnt, + "<= 8 Kbytes: %19llu %19llu %19llu\n", + sum->dir[READ].cnt_whole[0], + sum->dir[READ].cnt_left[0], + sum->dir[READ].cnt_right[0]); + + for (i = 1; i < STATS_SIZES_NUM; i++) { + + cnt += scnprintf(page + cnt, len - cnt, + "> %3d Kbytes: %19llu %19llu %19llu\n", + (i)<<3, + sum->dir[READ].cnt_whole[i], + sum->dir[READ].cnt_left[i], + sum->dir[READ].cnt_right[i]); + + if (len - cnt <= 0) + goto free_return; + } + + cnt += scnprintf(page + cnt, len - cnt, + "\n WRITE " + " whole left right " + "\n"); + if (len - cnt <= 0) + goto free_return; + + cnt += scnprintf(page + cnt, len - cnt, + "<= 8 Kbytes: %19llu %19llu %19llu\n", + sum->dir[WRITE].cnt_whole[0], + sum->dir[WRITE].cnt_left[0], + sum->dir[WRITE].cnt_right[0]); + + for (i = 1; i < STATS_SIZES_NUM; i++) { + + cnt += scnprintf(page + cnt, len - cnt, + "> %3d Kbytes: %19llu %19llu %19llu\n", + (i)<<3, + sum->dir[WRITE].cnt_whole[i], + sum->dir[WRITE].cnt_left[i], + sum->dir[WRITE].cnt_right[i]); + + if (len - cnt <= 0) + goto free_return; + } + +free_return: + kfree(sum); + + return cnt; +} + +int brmr_clt_reset_sts_resource(struct brmr_clt_stats *stats, bool enable) +{ + struct brmr_stats_pcpu *s; + int cpu; + + if (unlikely(!enable)) + return -EINVAL; + + 
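+	/*
+	 * sts_resource counts resource shortages seen on the I/O path: no free
+	 * RMR iu, or rmr_clt_request() reporting a resource error. See
+	 * brmr_clt_update_sts_resource().
+	 */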
for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + memset(&s->sts_resource, 0, + sizeof(s->sts_resource)); + } + + return 0; +} + +void brmr_clt_update_sts_resource(struct brmr_clt_stats *stats, int which) +{ + struct brmr_stats_pcpu *s; + + s = this_cpu_ptr(stats->pcpu_stats); + switch (which) { + case 0: + s->sts_resource.get_iu++; + break; + case 1: + s->sts_resource.get_iu2++; + break; + case 2: + s->sts_resource.clt_request1++; + break; + case 3: + s->sts_resource.clt_request++; + break; + default: + WARN_ONCE(true,"unexpected value for which"); + } +} + +ssize_t brmr_stats_sts_resource_to_str( + struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_sts_resource sum; + struct brmr_stats_sts_resource *r; + int cpu; + + memset(&sum, 0, sizeof(sum)); + + for_each_possible_cpu(cpu) { + r = &per_cpu_ptr(stats->pcpu_stats, cpu)->sts_resource; + + sum.get_iu += r->get_iu; + sum.get_iu2 += r->get_iu2; + sum.clt_request1 += r->clt_request1; + sum.clt_request += r->clt_request; + } + + return scnprintf(page, len, "%llu %llu %llu %llu\n", + sum.get_iu, sum.get_iu2, + sum.clt_request1, sum.clt_request); +} + +ssize_t brmr_stats_sts_resource_per_cpu_to_str( + struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_sts_resource *r; + int cpu; int cnt = 0; + + for_each_possible_cpu(cpu) { + r = &per_cpu_ptr(stats->pcpu_stats, cpu)->sts_resource; + + cnt += scnprintf(page+cnt, len, "%d %llu %llu %llu %llu\n", + cpu, r->get_iu, r->get_iu2, + r->clt_request1, r->clt_request); + if (len - cnt <= 0) + goto return_cnt; + } + +return_cnt: + return cnt; +} + diff --git a/drivers/block/brmr/brmr-clt-sysfs.c b/drivers/block/brmr/brmr-clt-sysfs.c new file mode 100644 index 000000000000..7d2435acac6a --- /dev/null +++ b/drivers/block/brmr/brmr-clt-sysfs.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "brmr-clt.h" + +static struct device *brmr_dev; +static struct class *brmr_dev_class; +static struct kobject *brmr_devs_kobj; + +enum { + BRMR_OPT_ERR = 0, + BRMR_OPT_POOL = 1 << 1, + BRMR_OPT_SIZE = 1 << 2, +}; + +static int brmr_clt_create_dev_sysfs_files(struct brmr_clt_dev *dev); +static int brmr_add_dev_symlink(struct brmr_clt_dev *dev); + +static unsigned int brmr_opt_mandatory[] = { + BRMR_OPT_POOL, +}; + +static const match_table_t brmr_opt_tokens = { + { BRMR_OPT_POOL, "pool=%s" }, + { BRMR_OPT_SIZE, "size=%s" }, + { BRMR_OPT_ERR, NULL }, +}; + +/* remove new line from string */ +static void strip(char *s) +{ + char *p = s; + + while (*s != '\0') { + if (*s != '\n') + *p++ = *s++; + else + ++s; + } + *p = '\0'; +} + +static int brmr_clt_parse_options(const char *buf, + char *pool, + unsigned long *size) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + sep_opt = strstrip(options); + strip(sep_opt); + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, brmr_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case BRMR_OPT_POOL: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("poolname too long\n"); + ret = 
-EINVAL; + kfree(p); + goto out; + } + strscpy(pool, p, NAME_MAX); + kfree(p); + break; + + case BRMR_OPT_SIZE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + /* + * The conventional semantics are that if the number begins with 0x, it will + * be parsed as hexadecimal; if it begins with 0, it will be parsed as + * octal; otherwise, it will be parsed as decimal. + */ + ret = kstrtoul(p, 0, size); + if (ret) { + pr_err("size '%s' isn't an integer: %d\n", p, ret); + kfree(p); + goto out; + } + kfree(p); + break; + + + default: + pr_err("unknown parameter or missing value" + " '%s'\n", p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(brmr_opt_mandatory); i++) { + if ((opt_mask & brmr_opt_mandatory[i])) { + ret = 0; + } else { + pr_err("parameters missing\n"); + ret = -EINVAL; + break; + } + } + +out: + kfree(options); + return ret; +} + +static ssize_t brmr_map_device_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo \"" + "pool= " + "size=\" > %s\n", + attr->attr.name); +} + +static ssize_t brmr_map_device_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_clt_dev *dev; + char pool[NAME_MAX]; + unsigned long size = 0; + int ret; + + ret = brmr_clt_parse_options(buf, pool, &size); + if (ret) + goto err; + + dev = find_and_get_device(pool); + if (dev) { + pr_err("Device exists and opened as %s\n", + dev->gd->disk_name); + brmr_clt_put_dev(dev); + ret = -EEXIST; + goto err; + } + + dev = brmr_clt_map_device(pool, size); + if (IS_ERR(dev)) { + pr_err("Error mapping device to pool %s\n", pool); + ret = PTR_ERR(dev); + goto err; + } + ret = brmr_clt_create_dev_sysfs_files(dev); + if (ret) + goto close_device; + + ret = brmr_add_dev_symlink(dev); + if (ret) + goto destroy_sysfs; + + return count; + +destroy_sysfs: + sysfs_remove_link(&dev->kobj, BRMR_LINK_NAME); + brmr_clt_destroy_dev_sysfs_files(dev, NULL); +close_device: + brmr_clt_close_device(dev, NULL); +err: + return ret; +} + +static struct kobj_attribute brmr_map_device_attr = + __ATTR(map_device, 0644, + brmr_map_device_show, brmr_map_device_store); + +static struct attribute *default_attrs[] = { + &brmr_map_device_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +static ssize_t brmr_unmap_device_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo > %s\n", + attr->attr.name); +} + +static ssize_t brmr_unmap_device_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_clt_dev *dev; + int err; + + dev = container_of(kobj, struct brmr_clt_dev, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + return -EINVAL; + } + + pr_info("Closing device %s.\n", dev->gd->disk_name); + + /* + * We take explicit module reference only for one reason: do not + * race with lockless ibnbd_destroy_sessions(). + */ + if (!try_module_get(THIS_MODULE)) { + return -ENODEV; + } + err = brmr_clt_close_device(dev, &attr->attr); + if (unlikely(err)) { + if (unlikely(err != -EALREADY)) + pr_err("unmap_device %s: %d\n", + dev->gd->disk_name, err); + goto module_put; + } + + /* + * Here device can be vanished! 
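+	 * brmr_clt_close_device() may have dropped the last device reference
+	 * once sysfs_remove_file_self() has removed this attribute, so dev must
+	 * not be dereferenced from here on; only the module reference is
+	 * released below.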
+ */ + err = count; + +module_put: + module_put(THIS_MODULE); + + return err; +} + +static struct kobj_attribute brmr_unmap_device_attr = + __ATTR(unmap_device, 0644, + brmr_unmap_device_show, brmr_unmap_device_store); + +static ssize_t brmr_clt_device_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct brmr_clt_dev *dev; + int cnt; + + dev = container_of(kobj, struct brmr_clt_dev, kobj); + + switch (dev->dev_state) { + case DEV_STATE_INIT: + cnt = sysfs_emit(page, "init\n"); + break; + case DEV_STATE_READY: + cnt = sysfs_emit(page, "ready\n"); + break; + case DEV_STATE_DISCONNECTED: + cnt = sysfs_emit(page, "disconnected\n"); + break; + case DEV_STATE_CLOSING: + cnt = sysfs_emit(page, "closing\n"); + break; + default: + cnt = sysfs_emit(page, "unknown\n"); + break; + } + + if (dev->map_incomplete) + cnt += sysfs_emit_at(page, cnt, "degraded\n"); + + return cnt; +} + +static struct kobj_attribute brmr_clt_device_state = + __ATTR(state, 0444, brmr_clt_device_state_show, NULL); + +static struct attribute *brmr_clt_dev_attrs[] = { + &brmr_unmap_device_attr.attr, + &brmr_clt_device_state.attr, + NULL, +}; +ATTRIBUTE_GROUPS(brmr_clt_dev); + +static struct kobj_type brmr_clt_device_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = brmr_clt_dev_groups, +}; + +static struct kobj_type brmr_clt_stats_ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +static int brmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *kobj_stats); + +static int brmr_clt_create_dev_sysfs_files(struct brmr_clt_dev *dev) +{ + int ret; + + ret = kobject_init_and_add(&dev->kobj, &brmr_clt_device_ktype, + brmr_devs_kobj, + "%s", dev->gd->disk_name); + if (ret) + pr_err("Failed to create sysfs dir for device '%s': %d\n", + dev->gd->disk_name, ret); + + ret = brmr_clt_create_stats_files(&dev->kobj, &dev->kobj_stats); + if (unlikely(ret)) { + pr_err("Failed to create sysfs stats files " + "for device '%s': %d\n", dev->gd->disk_name, ret); + kobject_del(&dev->kobj); + kobject_put(&dev->kobj); + } + return ret; +} + +static int brmr_add_dev_symlink(struct brmr_clt_dev *dev) +{ + struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj; + int ret; + + ret = sysfs_create_link(&dev->kobj, gd_kobj, BRMR_LINK_NAME); + if (ret) { + pr_err("Creating symlink for %s failed, err: %d\n", + dev->gd->disk_name, ret); + } + + return ret; +} + +void brmr_clt_destroy_dev_sysfs_files(struct brmr_clt_dev *dev, + const struct attribute *sysfs_self) +{ + if (dev->kobj.state_in_sysfs) { + + kobject_del(&dev->kobj_stats); + kobject_put(&dev->kobj_stats); + if (sysfs_self) + sysfs_remove_file_self(&dev->kobj, sysfs_self); + kobject_del(&dev->kobj); + kobject_put(&dev->kobj); + } +} + +int brmr_clt_create_sysfs_files(void) +{ + int err; + + brmr_dev_class = class_create("brmr-client"); + if (IS_ERR(brmr_dev_class)) + return PTR_ERR(brmr_dev_class); + + brmr_dev = device_create(brmr_dev_class, NULL, + MKDEV(0, 0), NULL, "ctl"); + if (IS_ERR(brmr_dev)) { + err = PTR_ERR(brmr_dev); + goto cls_destroy; + } + brmr_devs_kobj = kobject_create_and_add("devices", &brmr_dev->kobj); + if (unlikely(!brmr_devs_kobj)) { + err = -ENOMEM; + goto dev_destroy; + } + err = sysfs_create_group(&brmr_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto put_devs_kobj; + + return 0; + +put_devs_kobj: + kobject_del(brmr_devs_kobj); + kobject_put(brmr_devs_kobj); +dev_destroy: + device_unregister(brmr_dev); +cls_destroy: + class_destroy(brmr_dev_class); + + return err; +} + +void brmr_clt_destroy_sysfs_files(void) 
+{ + sysfs_remove_group(&brmr_dev->kobj, &default_attr_group); + kobject_del(brmr_devs_kobj); + kobject_put(brmr_devs_kobj); + device_unregister(brmr_dev); + class_destroy(brmr_dev_class); +} + +STAT_ATTR(struct brmr_clt_dev, requests, + brmr_clt_stats_rq_to_str, brmr_clt_reset_submitted_req); +STAT_ATTR(struct brmr_clt_dev, request_sizes, + brmr_clt_stats_sizes_to_str, brmr_clt_reset_req_sizes); +STAT_ATTR(struct brmr_clt_dev, sts_resource, + brmr_stats_sts_resource_to_str, brmr_clt_reset_sts_resource); +STAT_ATTR(struct brmr_clt_dev, sts_resource_per_cpu, + brmr_stats_sts_resource_per_cpu_to_str, brmr_clt_reset_sts_resource); + +static struct attribute *brmr_stats_attrs[] = { + &requests_attr.attr, + &request_sizes_attr.attr, + &sts_resource_attr.attr, + &sts_resource_per_cpu_attr.attr, + NULL, +}; + +static struct attribute_group brmr_stats_attr_group = { + .attrs = brmr_stats_attrs, +}; + +static int brmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *kobj_stats) +{ + int ret; + + ret = kobject_init_and_add(kobj_stats, &brmr_clt_stats_ktype, kobj, "stats"); + if (ret) { + pr_err("Failed to init and add stats kobject, err: %d\n", + ret); + return ret; + } + + ret = sysfs_create_group(kobj_stats, &brmr_stats_attr_group); + if (ret) { + pr_err("failed to create stats sysfs group, err: %d\n", + ret); + goto put_stats_obj; + } + + return 0; + +put_stats_obj: + kobject_del(kobj_stats); + kobject_put(kobj_stats); + + return ret; +} diff --git a/drivers/block/brmr/brmr-clt.c b/drivers/block/brmr/brmr-clt.c new file mode 100644 index 000000000000..6f3d2dd2a9d9 --- /dev/null +++ b/drivers/block/brmr/brmr-clt.c @@ -0,0 +1,1222 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include + +#include "brmr-clt.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(BRMR_VER_STRING); +MODULE_DESCRIPTION("BRMR Block Device using RMR cluster"); +MODULE_LICENSE("GPL"); + +/* + * Maximum number of partitions an instance can have. 
+ * 6 bits = 64 minors = 63 partitions (one minor is used for the device itself) + */ +#define BRMR_PART_BITS 6 + +static DEFINE_IDA(index_ida); +static DEFINE_MUTEX(ida_lock); +static DEFINE_MUTEX(brmr_device_lock); +static LIST_HEAD(brmr_device_list); +static int brmr_major; + +static int BRMR_DELAY_10ms = 10; + +static int index_to_minor(int index) +{ + return index << BRMR_PART_BITS; +} + +static int minor_to_index(int minor) +{ + return minor >> BRMR_PART_BITS; +} + +static inline const char *rq_op_to_str(struct request *rq) +{ + switch (req_op(rq)) { + case REQ_OP_READ: + return "READ"; + case REQ_OP_WRITE: + return "WRITE"; + case REQ_OP_DISCARD: + return "DISCARD"; + case REQ_OP_WRITE_ZEROES: + return "WRITE_ZEROES"; + case REQ_OP_FLUSH: + return "FLUSH"; + default: + return "UNKNOWN"; + } + return ""; +} + + +/* copy from blk.h */ +static inline bool biovec_phys_mergeable(struct request_queue *q, + struct bio_vec *vec1, struct bio_vec *vec2) +{ + unsigned long mask = queue_segment_boundary(q); + phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; + phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + + if (addr1 + vec1->bv_len != addr2) + return false; + // Comment out xen related code + /* + if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) + return false; + */ + if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask)) + return false; + return true; +} + +/* copy from blk_merge.c */ +static inline unsigned get_max_segment_size(const struct request_queue *q, + struct page *start_page, + unsigned long offset) +{ + unsigned long mask = queue_segment_boundary(q); + + offset = mask & (page_to_phys(start_page) + offset); + + /* + * overflow may be triggered in case of zero page physical address + * on 32bit arch, use queue's max segment size when that happens. + */ + return min_not_zero(mask - offset + 1, + (unsigned long)queue_max_segment_size(q)); +} + +static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, + struct scatterlist *sglist) +{ + if (!*sg) + return sglist; + + /* + * If the driver previously mapped a shorter list, we could see a + * termination bit prematurely unless it fully inits the sg table + * on each mapping. We KNOW that there must be more entries here + * or the driver would be buggy, so force clear the termination bit + * to avoid doing a full sg_init_table() in drivers for each command. + */ + sg_unmark_end(*sg); + return sg_next(*sg); +} + +/* only try to merge bvecs into one sg if they are from two bios */ +static inline bool +__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec, + struct bio_vec *bvprv, struct scatterlist **sg) +{ + + int nbytes = bvec->bv_len; + + if (!*sg) + return false; + + if ((*sg)->length + nbytes > queue_max_segment_size(q)) + return false; + + if (!biovec_phys_mergeable(q, bvprv, bvec)) + return false; + + (*sg)->length += nbytes; + + return true; +} + +/* + * brmr_clt_get_iu() - Get an RMR I/O unit (iu) + * + * Description: + * It gets an RMR I/O unit using rmr_clt_get_iu() and increments + * the pool busy counter. It invokes rmr_clt_get_iu() with NO_WAIT + * as brmr can requeue an I/O request. + * + * Ref. 
brmr_add_to_requeue() + */ +static inline struct rmr_iu *brmr_clt_get_iu(struct brmr_clt_pool *pool, enum rmr_io_flags flag) +{ + struct rmr_iu *iu = rmr_clt_get_iu(pool->rmr, flag, NO_WAIT); + if (IS_ERR_OR_NULL(iu)) + return iu; + + atomic_inc(&pool->busy); + + return iu; +} + +/* + * brmr_clt_put_iu() - Put the RMR I/O unit (iu) + * + * Description: + * It puts the RMR I/O unit using rmr_clt_put_iu() and decrements + * the pool busy counter. It uses memory barrier to reflect the + * busy counter. + * + * Ref. brmr_add_to_requeue() and brmr_requeue_requests() + */ +static inline void brmr_clt_put_iu(struct brmr_clt_pool *pool, struct rmr_iu *iu) +{ + rmr_clt_put_iu(pool->rmr, iu); + + atomic_dec(&pool->busy); + /* + * Paired with brmr_add_to_requeue(). Decrement first + * and then check queue bits. + */ + smp_mb__after_atomic(); + brmr_requeue_requests(pool); +} + +static void brmr_softirq_done_fn(struct request *rq) +{ + struct brmr_clt_iu *iu = blk_mq_rq_to_pdu(rq); + struct brmr_clt_dev *dev = iu->dev; + + if (blk_rq_nr_phys_segments(rq)) + sg_free_table_chained(&iu->sgt, BRMR_INLINE_SG_CNT); + + brmr_clt_put_iu(dev->pool, iu->rmr_iu); + blk_mq_end_request(rq, iu->status); +} + +static void brmr_request_conf(void *priv, int errno) +{ + struct brmr_clt_iu *iu = (struct brmr_clt_iu *)priv; + struct brmr_clt_dev *dev = iu->dev; + struct request *rq = iu->rq; + + iu->status = (errno && errno != -ENOENT) ? BLK_STS_IOERR : BLK_STS_OK; + + blk_mq_complete_request(rq); + + if (errno == -ENOENT) + pr_debug("%s request for %s IGNORED err: %d\n", + rq_op_to_str(rq), dev->gd->disk_name, errno); + else if (errno) + pr_err_ratelimited("%s request for %s failed with err: %d\n", + rq_op_to_str(rq), dev->gd->disk_name, errno); +} + +static blk_status_t brmr_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct brmr_clt_dev *dev = bd->rq->q->disk->private_data; + struct brmr_clt_pool *pool = dev->pool; + struct brmr_clt_iu *iu = blk_mq_rq_to_pdu(bd->rq); + struct request *rq = bd->rq; + struct rmr_iu *rmr_iu; + unsigned int sg_cnt = 0; + size_t offset; size_t length; + enum rmr_io_flags flag; + unsigned short prio, seg; + int data_dir, err; + blk_status_t ret = BLK_STS_IOERR; + + if (unlikely(dev->dev_state != DEV_STATE_READY)) + return ret; + + iu->rq = rq; + iu->dev = dev; + + offset = blk_rq_pos(rq) << SECTOR_SHIFT; + length = blk_rq_bytes(rq); + flag = rq_to_rmr_flags(rq); + prio = req_get_ioprio(rq); + data_dir = rq_data_dir(rq); + + rmr_iu = brmr_clt_get_iu(pool, flag); + if (unlikely(rmr_iu == NULL)) { + pr_debug("Got no tag to send a request to rmr_clt\n"); + + /* Increment statistic counter for it */ + brmr_clt_update_sts_resource(&dev->stats, 0); + + if (!brmr_add_to_requeue(pool, hctx->driver_data)) + /* + * TODO unlikely + * Restarting queue with some delay is a stupid way + * of handling resource contentions + */ + blk_mq_delay_run_hw_queue(hctx, BRMR_DELAY_10ms); + + return BLK_STS_RESOURCE; + } + if (IS_ERR(rmr_iu)) { + pr_err("Error %pe when reserving resources for io in pool %s\n", + rmr_iu, pool->rmr->poolname); + return BLK_STS_IOERR; + } + iu->rmr_iu = rmr_iu; + + iu->sgt.sgl = iu->sgl; + seg = blk_rq_nr_phys_segments(rq); + if (seg) { + err = sg_alloc_table_chained(&iu->sgt, seg, iu->sgt.sgl, BRMR_INLINE_SG_CNT); + if (err) { + pr_err("sg_alloc_table_chained failed, ret=%x\n", err); + blk_mq_delay_run_hw_queue(hctx, BRMR_DELAY_10ms); + brmr_clt_put_iu(pool, rmr_iu); + return BLK_STS_RESOURCE; + } + } + + /* We only support discards with single 
segment and write_zeroes request with no segment. */ + /* See queue limits. */ + if ((req_op(rq) != REQ_OP_DISCARD) && (req_op(rq) != REQ_OP_WRITE_ZEROES)) + sg_cnt = blk_rq_map_sg(rq, iu->sgt.sgl); + + blk_mq_start_request(rq); + brmr_update_stats(&dev->stats, length, 0, data_dir); + + pr_debug("brmr %s request with flag %x offset %lu length %lu sg_cnt: %d\n", + rq_op_to_str(rq), flag, offset, length, sg_cnt); + + err = rmr_clt_request(pool->rmr, rmr_iu, offset, length, flag, prio, + iu, brmr_request_conf, iu->sgt.sgl, sg_cnt); + if (likely(err == 0)) + return BLK_STS_OK; + + pr_err_ratelimited("sending %s request for %s failed with err: %d\n", + rq_op_to_str(rq), dev->gd->disk_name, err); + + if (unlikely(err == -EAGAIN || err == -ENOMEM)) { + pr_debug("Got resource error %d when sending a request to rmr_clt\n", err); + + brmr_clt_update_sts_resource(&dev->stats, 3); + blk_mq_delay_run_hw_queue(hctx, BRMR_DELAY_10ms); + + ret = BLK_STS_RESOURCE; + } else { + ret = BLK_STS_IOERR; + } + + if (seg) + sg_free_table_chained(&iu->sgt, BRMR_INLINE_SG_CNT); + + brmr_clt_put_iu(pool, rmr_iu); + return ret; +} + +static struct blk_mq_ops brmr_mq_ops = { + .queue_rq = brmr_queue_rq, + .complete = brmr_softirq_done_fn, +}; + +static struct brmr_clt_pool *brmr_clt_create_pool(const char *poolname) +{ + struct brmr_clt_pool *pool; + int err; + struct rmr_attrs attrs; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + pool->rmr = rmr_clt_open(pool, NULL, poolname); + if (IS_ERR_OR_NULL(pool->rmr)) { + err = PTR_ERR(pool->rmr); + goto free_pool; + } + err = rmr_clt_query(pool->rmr, &attrs); + if (unlikely(err)) + goto close_rmr; + + pool->queue_depth = attrs.queue_depth; + pool->max_io_size = attrs.max_io_size; + pool->chunk_size = attrs.chunk_size; + pool->max_segments = attrs.max_segments; + + snprintf(pool->poolname, sizeof(pool->poolname), "%s", poolname); + + /* + * When opening a new pool, allocate mq tags for that pool - they are + * going to be shared among all devices opened in that pool + */ + pool->tag_set.ops = &brmr_mq_ops; + pool->tag_set.queue_depth = pool->queue_depth; + pool->tag_set.numa_node = NUMA_NO_NODE; + pool->tag_set.flags = BLK_MQ_F_TAG_QUEUE_SHARED; + pool->tag_set.cmd_size = sizeof(struct brmr_clt_iu) + BRMR_RDMA_SGL_SIZE; + pool->tag_set.nr_hw_queues = num_online_cpus(); + + err = blk_mq_alloc_tag_set(&pool->tag_set); + if (unlikely(err)) + goto close_rmr; + + refcount_set(&pool->refcount, 1); + + atomic_set(&pool->busy, 0); + bitmap_zero(pool->cpu_queues_bm, NR_CPUS); + pool->cpu_rr = alloc_percpu(int); + if (unlikely(!pool->cpu_rr)) { + pr_err("Failed to alloc percpu var (cpu_rr)\n"); + err = -ENOMEM; + goto free_tag_set; + } + pool->cpu_queues = alloc_percpu(struct brmr_cpu_qlist); + if (unlikely(!pool->cpu_queues)) { + pr_err("Failed to alloc percpu var (cpu_queues)\n"); + err = -ENOMEM; + goto free_cpu_rr; + } + brmr_init_cpu_qlists(pool->cpu_queues); + return pool; +free_cpu_rr: + free_percpu(pool->cpu_rr); +free_tag_set: + blk_mq_free_tag_set(&pool->tag_set); +close_rmr: + rmr_clt_close(pool->rmr); +free_pool: + kfree(pool); + + return ERR_PTR(err); +} + +static void brmr_clt_free_pool(struct brmr_clt_pool *pool) +{ + free_percpu(pool->cpu_queues); + pool->cpu_queues = NULL; + free_percpu(pool->cpu_rr); + pool->cpu_rr = NULL; + blk_mq_free_tag_set(&pool->tag_set); + rmr_clt_close(pool->rmr); + kfree(pool); +} + +static void brmr_clt_put_pool(struct brmr_clt_pool *pool) +{ + if (refcount_dec_and_test(&pool->refcount)) + 
brmr_clt_free_pool(pool); + else + rmr_clt_put_pool(pool->rmr); +} + +static inline bool brmr_clt_get_dev(struct brmr_clt_dev *dev) +{ + return refcount_inc_not_zero(&dev->refcount); +} + +void brmr_clt_put_dev(struct brmr_clt_dev *dev) +{ + might_sleep(); + + if (refcount_dec_and_test(&dev->refcount)) { + + mutex_lock(&ida_lock); + ida_free(&index_ida, dev->idx); + mutex_unlock(&ida_lock); + + kfree(dev->hw_queues); + + brmr_clt_put_pool(dev->pool); + + if (!list_empty(&dev->list)) { + mutex_lock(&brmr_device_lock); + list_del(&dev->list); + mutex_unlock(&brmr_device_lock); + } + kfree(dev); + } +} + +static int brmr_open(struct gendisk *disk, blk_mode_t mode) +{ + struct brmr_clt_dev *dev = disk->private_data; + + if (READ_ONCE(dev->dev_state) != DEV_STATE_READY) + return -EIO; + + if (!brmr_clt_get_dev(dev)) + return -EIO; + + return 0; +} + +static void brmr_release(struct gendisk *gen) +{ + struct brmr_clt_dev *dev = gen->private_data; + + brmr_clt_put_dev(dev); +} + +#if 0 +static int brmr_getgeo(struct block_device *block_device, + struct hd_geometry *geo) +{ + struct brmr_clt_dev *dev = block_device->bd_disk->private_data; + + geo->cylinders = (dev->size_sect & ~0x3f) >> 6; /* size/64 */ + geo->heads = 4; + geo->sectors = 16; + geo->start = 0; + + return 0; +} +#endif + +static const struct block_device_operations brmr_ops = { + .owner = THIS_MODULE, + .open = brmr_open, + .release = brmr_release, + /*.getgeo = brmr_getgeo,*/ +}; + +/** + * brmr_clt_init_cmd() - Initialize message command + * + * @msg: command message where to init + */ +static void brmr_clt_init_cmd(struct brmr_msg_cmd *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.type = cpu_to_le16(BRMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->ver = BRMR_PROTO_VER_MAJOR; +} + +/** + * brmr_cmd_conf() - Confirmation function for brmr command message + * + * @priv: priv pointer to brmr command private data + * @errno: error number passed from RMR. + * See description of errno in RMR function. + * + * Description: + * Command response for a map new command can fail on multiple levels. + * If RMR fails to send the message to any or one of the nodes, that would reflect on the + * errno. If the command fails on BRMR level, that would reflect on the rsp struct. + * The error number will be used differently by different commands accordingly. + */ +static void brmr_clt_cmd_conf(void *priv, int errno) +{ + struct brmr_cmd_priv *cmd_priv = (struct brmr_cmd_priv *)priv; + + switch (cmd_priv->cmd_type) { + case BRMR_CMD_MAP: + pr_info("%s: BRMR_CMD_MAP err=%d\n", __func__, errno); + cmd_priv->errno = errno; + break; + case BRMR_CMD_REMAP: + pr_info("%s: BRMR_CMD_REMAP err=%d\n", __func__, errno); + break; + case BRMR_CMD_UNMAP: + pr_info("%s: BRMR_CMD_UNMAP err=%d\n", __func__, errno); + /* + * No processing needed here. 
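+	 * Unmap is best effort: errno is not recorded here, and the caller only
+	 * logs a failure before continuing with local cleanup.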
+ */ + break; + case BRMR_CMD_GET_PARAMS: + pr_info("%s: BRMR_CMD_GET_PARAMS err=%d\n", __func__, errno); + if (errno) + cmd_priv->errno = errno; + break; + + default: + pr_err("%s: Unknown command type %d err=%d\n", __func__, cmd_priv->cmd_type, errno); + } + + complete(&cmd_priv->complete_done); +} + +/** + * brmr_clt_send_msg_cmd() - Sends command message to rmr pool + * + * @dev: pointer to brmr device + * @msg: msg struct to be sent + * @rsp_buf: response buffer where the response of the storage side is stored + * @rsp_buf_len: length of the response buffer + * + * Return: + * Negative if failed to sent command + * As handled by each command in brmr_clt_cmd_conf, if succeeded to send command + * + * Context: + * Would block until response is received + */ +static int brmr_clt_send_msg_cmd(struct brmr_clt_dev *dev, struct brmr_msg_cmd *msg, void *rsp_buf, + size_t rsp_buf_len) +{ + struct brmr_cmd_priv cmd_priv; + struct kvec vec; + int ret; + + vec = (struct kvec) { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + + cmd_priv.dev = dev; + cmd_priv.cmd_type = msg->cmd_type; + cmd_priv.rsp_buf = rsp_buf; + cmd_priv.rsp_buf_len = rsp_buf_len; + cmd_priv.errno = 0; + init_completion(&cmd_priv.complete_done); + + ret = rmr_clt_cmd_with_rsp(dev->pool->rmr, brmr_clt_cmd_conf, &cmd_priv, &vec, 1, rsp_buf, + rsp_buf_len, sizeof(struct brmr_msg_cmd_rsp)); + + if (!ret) { + wait_for_completion(&cmd_priv.complete_done); + ret = cmd_priv.errno; + } + + return ret; +} + +static struct brmr_clt_dev *brmr_alloc_and_init_dev(struct brmr_clt_pool *pool, + u64 size) +{ + struct brmr_clt_dev *dev; + struct brmr_queue *q; + struct blk_mq_hw_ctx *hctx; + int ret; + unsigned long i; + + /* + * alloc device structure + */ + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&dev->list); + dev->size_sect = size; + dev->pool = pool; + dev->dev_state = DEV_STATE_INIT; + dev->map_incomplete = false; + refcount_set(&dev->refcount, 1); + + /* + * Alloc a "queue" per cpu + */ + dev->hw_queues = kcalloc(nr_cpu_ids, + sizeof(*dev->hw_queues), GFP_KERNEL); + if (unlikely(!dev->hw_queues)) { + ret = -ENOMEM; + goto free_dev; + } + + /* + * Get an id to be used in /dev/brmr + */ + mutex_lock(&ida_lock); + ret = ida_alloc_range(&index_ida, 0, minor_to_index(1 << MINORBITS) - 1, + GFP_KERNEL); + mutex_unlock(&ida_lock); + if (ret < 0) { + pr_err("%s: ida_alloc_range() failed for pool %s, err: %d\n", + __func__, pool->poolname, ret); + goto free_queues; + } + dev->idx = ret; + + /* + * Init mq queue + */ + dev->gd = blk_mq_alloc_disk(&pool->tag_set, NULL, dev); + if (IS_ERR(dev->gd)) { + ret = PTR_ERR(dev->gd); + pr_err("Failed to initialize mq: %pe\n", dev->queue); + goto remove_ida; + } + dev->queue = dev->gd->queue; + + /* + * Assign hardware contexts to our queues + */ + queue_for_each_hw_ctx(dev->queue, hctx, i) { + q = &dev->hw_queues[i]; + INIT_LIST_HEAD(&q->requeue_list); + q->hctx = hctx; + hctx->driver_data = q; + } + + return dev; + +remove_ida: + mutex_lock(&ida_lock); + ida_free(&index_ida, dev->idx); + mutex_unlock(&ida_lock); +free_queues: + kfree(dev->hw_queues); +free_dev: + kfree(dev); +out: + return ERR_PTR(ret); +} + +static int brmr_set_dev_params(struct brmr_clt_dev *dev) +{ + struct brmr_clt_pool *pool = dev->pool; + u32 chunk_size = brmr_pool_chunk_size(pool); + struct queue_limits lim; + int ret; + + /* Aligns requests with the chunks in rmr client */ + if (!is_power_of_2(chunk_size >> SECTOR_SHIFT)) { + pr_err("%u not a power of 2!\n", 
chunk_size); + return -EINVAL; + } + + /* + * Set request queue parameters via queue_limits API + */ + lim = queue_limits_start_update(dev->queue); + lim.logical_block_size = dev->logical_block_size; + lim.physical_block_size = dev->physical_block_size; + lim.max_segments = dev->max_segments; + lim.max_hw_sectors = dev->max_hw_sectors; + lim.max_write_zeroes_sectors = dev->max_write_zeroes_sectors; + lim.io_opt = brmr_pool_chunk_size(pool); + lim.chunk_sectors = chunk_size >> SECTOR_SHIFT; + + /* however we don't support discards to */ + /* discontiguous segments in one request */ + lim.max_discard_segments = 1; + lim.max_hw_discard_sectors = dev->max_discard_sectors; + if (dev->secure_discard) + lim.max_secure_erase_sectors = dev->max_discard_sectors; + + lim.discard_granularity = dev->discard_granularity; + lim.discard_alignment = dev->discard_alignment; + + /* needed for ibtrs_map_sg_fr to work */ + lim.virt_boundary_mask = SZ_4K - 1; + + /* non-rotational device */ + lim.features &= ~BLK_FEAT_ROTATIONAL; + + if (dev->wc) + lim.features |= BLK_FEAT_WRITE_CACHE; + if (dev->fua) + lim.features |= BLK_FEAT_FUA; + + ret = queue_limits_commit_update(dev->queue, &lim); + if (ret) + goto err; + + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); + blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); + + ret = brmr_clt_init_stats(&dev->stats); + if (unlikely(ret)) + goto err; + + dev->gd->major = brmr_major; + dev->gd->minors = 1 << BRMR_PART_BITS; + dev->gd->first_minor = index_to_minor(dev->idx); + dev->gd->fops = &brmr_ops; + dev->gd->queue = dev->queue; + dev->gd->private_data = dev; + snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), + "brmr%d", dev->idx); + set_capacity(dev->gd, dev->size_sect); + + return 0; + +err: + return ret; +} + +/** + * brmr_get_remote_dev_params() - Gets device params from storage nodes + * + * @dev: pointer to brmr device + * + * Description: + * Does the following (sanity) checks + * 1) For an unmapped device, param get should succeed on all legs + * 2) There should not be a mixture of mapped and unmapped devices + * + * In addition to above, it also does the following work + * 1) For a mapped device, read from a single leg is enough for success + * 2) For an unmapped device, it does validation checks for params for every leg + * + * Return: + * Negative in case of failure + * 0 for success, and a non-mapped device is found + * 1 for success, and a mapped device is found + * + * Context: + * Would block until response is received + */ +static int brmr_get_remote_dev_params(struct brmr_clt_dev *dev) +{ + struct brmr_clt_pool *pool = dev->pool; + struct brmr_msg_cmd msg; + struct brmr_msg_cmd_rsp *brmr_cmd_rsp; + void *rsp_buf; + size_t rsp_buf_len; + int err = 0, i; + bool partial_fail = false, mapped = false; + + brmr_clt_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_GET_PARAMS; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) + return -ENOMEM; + + err = brmr_clt_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (err < 0) { + pr_err("%s: brmr_clt_send_msg_cmd failed with errno %d\n", __func__, err); + goto free_data; + } else if (err) { + /* + * We cannot directly fail here, since we do not know if this is a map for a + * newly created device, or for one which has gone through mapping before. + * + * For the former, any failure should end in the whole map process failing. 
+ * For the latter, a single read from a device with mapped state set should + * be enough for us to go ahead and map. + */ + partial_fail = true; + } + + /* + * Lets do the sanity check first, because combining it with param checks makes the + * entire loop harder to read + */ + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + struct brmr_cmd_get_params_rsp *get_params_rsp; + + brmr_cmd_rsp = ((struct brmr_msg_cmd_rsp *)rsp_buf) + i; + + /* + * We do not need to worry about not seeing MAGIC. + * This would happen for a non-working sessions, OR + * for extra sessions in the end for which there are no legs in RMR (Don't care) + * + * For non-working sessions, we will be notified by RMR through the return value + */ + if (brmr_cmd_rsp->magic != BRMR_CMD_RSP_MAGIC) + continue; + + /* + * This is error returned by rmr-store. + */ + if (brmr_cmd_rsp->status) + partial_fail = true; + + get_params_rsp = &brmr_cmd_rsp->get_params_rsp; + + /* + * If we find a mapped device, we save that info. + */ + if (get_params_rsp->mapped) + mapped = true; + } + + /* + * If there is no device mapped, it means that this is the first map after device creation + * In such a case, we need all sessions to be up and running. + */ + if (mapped == false && partial_fail) { + pr_err("%s: Mapping first time, but got failure for some sessions\n", __func__); + err = -EINVAL; + goto free_data; + } + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + struct brmr_cmd_get_params_rsp *get_params_rsp; + struct brmr_blk_dev_params *rsp_dev_params; + + brmr_cmd_rsp = ((struct brmr_msg_cmd_rsp *)rsp_buf) + i; + + /* + * We are tracking partial failures through the above loop, so + * ignore it here. + */ + if (brmr_cmd_rsp->magic != BRMR_CMD_RSP_MAGIC || + brmr_cmd_rsp->status) + continue; + + get_params_rsp = &brmr_cmd_rsp->get_params_rsp; + + /* + * We cheat a little, and do this sanity check here. + * + * If even a single device was mapped, and we have sessions with non-mapped + * devices, it will be wrong to go forward with brmr map. + */ + if (mapped && !get_params_rsp->mapped) { + /* + * This can only happen if a node went down and up. + * And instead of re-adding a MAPPED device, a create was called + * We cannot allow map this way, since this means discard could + * have been skipped. + */ + pr_err("%s: Mixed combination of mapped+unmapped metadata found\n", + __func__); + err = -EINVAL; + goto free_data; + } + + /* + * The device size_sect, which is the size provided by the user in the map + * command, should be same as the mapped_size of every storage node's backend + * device, which was provided during create_store. 
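+	 * A mismatch means the store was created with a size different from the
+	 * one requested by this map command, so the map is refused.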
+ */ + if (dev->size_sect != le64_to_cpu(get_params_rsp->mapped_size)) { + pr_err("%s: Mismatched mapped_size: (Provide) %llu != %llu (Remote)\n", + __func__, dev->size_sect, le64_to_cpu(get_params_rsp->mapped_size)); + err = -EINVAL; + goto free_data; + } + + rsp_dev_params = &get_params_rsp->dev_params; + + dev->max_write_zeroes_sectors = min_not_zero( + dev->max_write_zeroes_sectors, + le32_to_cpu( + rsp_dev_params->max_write_zeroes_sectors)); + dev->max_discard_sectors = min_not_zero(brmr_pool_chunk_size(pool) >> SECTOR_SHIFT, + le32_to_cpu(rsp_dev_params->max_discard_sectors)); + dev->physical_block_size = max_t(u16, dev->physical_block_size, + le16_to_cpu(rsp_dev_params->physical_block_size)); + dev->logical_block_size = max_t(u16, dev->logical_block_size, + le16_to_cpu(rsp_dev_params->logical_block_size)); + + dev->discard_granularity = dev->logical_block_size; + dev->discard_alignment = dev->logical_block_size; + + /* secure_discard is actually true or false, but since we used + * __le16 to transfer this value in msg, min_t should work fine here + */ + dev->secure_discard = min_t(u16, dev->secure_discard, + le16_to_cpu(rsp_dev_params->secure_discard)); + + dev->cache_policy = rsp_dev_params->cache_policy; + dev->wc = !!(rsp_dev_params->cache_policy & BRMR_WRITEBACK); + dev->fua = !!(rsp_dev_params->cache_policy & BRMR_FUA); + } + + /* max segments and max_hw_sectors we get from rtrs sessions values + * stored in pool like in RNBD, not from bdev of the store side. + */ + dev->max_segments = pool->max_segments; + dev->max_hw_sectors = pool->max_io_size / SECTOR_SIZE; + + /* + * Return whether its a new map or an old one + */ + err = mapped; + +free_data: + kfree(rsp_buf); + + return err; +} + +/** + * brmr_clt_send_map_cmd() - Sends map command for a brmr device + * + * @dev: pointer to brmr device + * + * Return: + * Negative error value in case of failure + * 0 on success + * + * Context: + * Would block until response is received + */ +static int brmr_clt_send_map_cmd(struct brmr_clt_dev *dev) +{ + struct brmr_clt_pool *pool = dev->pool; + struct brmr_msg_cmd msg; + struct brmr_blk_dev_params *dev_params = &(msg.map_new_cmd.dev_params); + void *rsp_buf; + size_t rsp_buf_len; + int err = 0; + + brmr_clt_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_MAP; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) + return -ENOMEM; + + msg.map_new_cmd.version = BRMR_CURRENT_HEADER_VERSION; + msg.map_new_cmd.mapped_size = dev->size_sect; + + dev_params->max_hw_sectors = cpu_to_le32(dev->max_hw_sectors); + dev_params->max_write_zeroes_sectors = cpu_to_le32(dev->max_write_zeroes_sectors); + dev_params->max_discard_sectors = cpu_to_le32(dev->max_discard_sectors); + dev_params->discard_granularity = cpu_to_le32(dev->discard_granularity); + dev_params->discard_alignment = cpu_to_le32(dev->discard_alignment); + dev_params->physical_block_size = cpu_to_le16(dev->physical_block_size); + dev_params->logical_block_size = cpu_to_le16(dev->logical_block_size); + dev_params->max_segments = cpu_to_le16(dev->max_segments); + dev_params->secure_discard = cpu_to_le16(dev->secure_discard); + dev_params->cache_policy = dev->cache_policy; + + err = brmr_clt_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (err) + pr_err("Failed to send cmd msg BRMR_CMD_MAP in pool %s, err=%d\n", + pool->poolname, err); + + kfree(rsp_buf); + return err; +} + +/* + * brmr_clt_send_unmap_cmd() - Send an unmap command to the server pool + * + * Sending 
may fail (e.g. no sessions connected). The failure is logged but + * not propagated — callers always continue with local cleanup regardless. + */ +static void brmr_clt_send_unmap_cmd(struct brmr_clt_dev *dev) +{ + struct brmr_msg_cmd msg; + void *rsp_buf; + size_t rsp_buf_len; + int ret; + + brmr_clt_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_UNMAP; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) { + pr_err("Failed to alloc rsp_buf for unmap in pool %s\n", + dev->pool->poolname); + return; + } + + /* + * Sending messages could fail. For example, there are no client pool sessions + * connected to this pool. Unmap_dev still progresses and cleans up the device + * states on the client side. + */ + ret = brmr_clt_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (ret) + pr_err("Error %d when unmap device in pool %s\n", + ret, dev->pool->poolname); + + kfree(rsp_buf); +} + +/** + * brmr_clt_map_device() - Maps brmr device through an rmr pool + * + * @id: Id for the device + * @poolname: rmr poolname which is to be used for mapping + * @size: Size of the disk + * + * Description: + * Opens rmr pool with pool name "poolname" + * Allocated brmr device and initializes it + * Maps brmr device using the rmr pool only if its not already mapped + * + * Return: + * Pointer to allocated and mapped brmr device on success + * Error pointer on failure + */ +struct brmr_clt_dev *brmr_clt_map_device(const char *poolname, u64 size) +{ + struct brmr_clt_pool *pool = NULL; + struct brmr_clt_dev *dev; + int ret, mapped; + + /* Create brmr pool */ + pool = brmr_clt_create_pool(poolname); + if (IS_ERR(pool)) { + ret = PTR_ERR(pool); + goto err_out; + } + + /* Alloc device */ + dev = brmr_alloc_and_init_dev(pool, size); + if (IS_ERR(dev)) { + pr_err("Error %pe allocating brmr device in pool %s\n", + dev, pool->poolname); + brmr_clt_put_pool(pool); + ret = PTR_ERR(dev); + goto err_out; + } + + mapped = brmr_get_remote_dev_params(dev); + if (mapped < 0) { + pr_err("Failed to get remote devs block params in pool %s, err=%d\n", + pool->poolname, mapped); + ret = mapped; + goto dest_dev; + } + + /* Set device params */ + ret = brmr_set_dev_params(dev); + if (unlikely(ret)) { + pr_err("Error %d brmr_set_dev_params in pool %s\n", + ret, pool->poolname); + goto dest_dev; + } + + /* + * We send map command only if its a new map. + * This must happen before add_disk() so the server is ready to serve + * I/O by the time the kernel probes the partition table. 
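+	 * If add_disk() fails later, the error path below (unmap_dev label)
+	 * sends an unmap for a freshly mapped device before putting the disk.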
+ */ + if (!mapped) { + pr_info("%s: Sending map command through pool %s\n", __func__, pool->poolname); + ret = brmr_clt_send_map_cmd(dev); + if (ret) { + pr_err("Failed to send map cmd to pool %s, err=%d\n", + pool->poolname, ret); + goto put_disk; + } + } + + dev->dev_state = DEV_STATE_READY; + + /* + * Add gendisk + */ + ret = add_disk(dev->gd); + if (ret) { + pr_err("%s: add_disk failed with err %d\n", __func__, ret); + goto unmap_dev; + } + + mutex_lock(&brmr_device_lock); + list_add(&dev->list, &brmr_device_list); + mutex_unlock(&brmr_device_lock); + + return dev; + +unmap_dev: + dev->dev_state = DEV_STATE_INIT; + if (!mapped) + brmr_clt_send_unmap_cmd(dev); +put_disk: + put_disk(dev->gd); + brmr_clt_free_stats(&dev->stats); +dest_dev: + brmr_clt_put_dev(dev); +err_out: + return ERR_PTR(ret); +} + +static void destroy_gen_disk(struct brmr_clt_dev *dev) +{ + unsigned int memflags; + + del_gendisk(dev->gd); + /* + * Before marking queue as dying (blk_cleanup_queue() does that) + * we have to be sure that everything in-flight has gone. + * Blink with freeze/unfreeze. + */ + memflags = blk_mq_freeze_queue(dev->queue); + blk_mq_unfreeze_queue(dev->queue, memflags); + put_disk(dev->gd); +} + +/** + * brmr_clt_close_device() - Closes a brmr device + * + * @dev: pointer to brmr device to close + * @sysfs_self: pointer to sysfs attribute + * + * Return: + * 0 in case of success + * negative in case of failure + */ +int brmr_clt_close_device(struct brmr_clt_dev *dev, + const struct attribute *sysfs_self) +{ + dev->dev_state = DEV_STATE_CLOSING; + destroy_gen_disk(dev); + brmr_clt_send_unmap_cmd(dev); + sysfs_remove_link(&dev->kobj, BRMR_LINK_NAME); + + if (sysfs_self) + brmr_clt_destroy_dev_sysfs_files(dev, sysfs_self); + + brmr_clt_free_stats(&dev->stats); + brmr_clt_put_dev(dev); + + return 0; +} + +struct brmr_clt_dev *find_and_get_device(const char *name) +{ + struct brmr_clt_dev *dev; + + mutex_lock(&brmr_device_lock); + list_for_each_entry(dev, &brmr_device_list, list) { + if (strncasecmp(dev->pool->poolname, name, NAME_MAX)) + continue; + + if (brmr_clt_get_dev(dev)) { + mutex_unlock(&brmr_device_lock); + return dev; + } + } + mutex_unlock(&brmr_device_lock); + + return NULL; +} + +static int __init brmr_client_init(void) +{ + int err; + + pr_info("Loading module %s, version %s\n", + KBUILD_MODNAME, BRMR_VER_STRING); + + brmr_major = register_blkdev(brmr_major, "brmr"); + if (brmr_major <= 0) { + pr_err("Failed to load module," + " block device registration failed\n"); + err = -EBUSY; + goto out; + } + + err = brmr_clt_create_sysfs_files(); +out: + return err; +} + +static void __exit brmr_client_exit(void) +{ + struct brmr_clt_dev *dev, *tmp; + + pr_info("Unloading module\n"); + + brmr_clt_destroy_sysfs_files(); + unregister_blkdev(brmr_major, "brmr"); + + list_for_each_entry_safe(dev, tmp, &brmr_device_list, list) { + brmr_clt_close_device(dev, NULL); + } + + ida_destroy(&index_ida); + + pr_info("Module %s unloaded\n", KBUILD_MODNAME); +} + +module_init(brmr_client_init); +module_exit(brmr_client_exit); diff --git a/drivers/block/brmr/brmr-clt.h b/drivers/block/brmr/brmr-clt.h new file mode 100644 index 000000000000..1482c7517ee8 --- /dev/null +++ b/drivers/block/brmr/brmr-clt.h @@ -0,0 +1,299 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef BRMR_PRI_H +#define BRMR_PRI_H + +#include +#include +#include "rmr-pool.h" + +#include "brmr-proto.h" + +#define BRMR_VER_MAJOR 0 +#define BRMR_VER_MINOR 
1 + +#ifndef BRMR_VER_STRING +#define BRMR_VER_STRING __stringify(BRMR_VER_MAJOR) "." \ + __stringify(BRMR_VER_MINOR) +#endif + +#define BRMR_LINK_NAME "block" + +#ifdef CONFIG_ARCH_NO_SG_CHAIN +#define BRMR_INLINE_SG_CNT 0 +#else +#define BRMR_INLINE_SG_CNT 2 +#endif +#define BRMR_RDMA_SGL_SIZE (sizeof(struct scatterlist) * BRMR_INLINE_SG_CNT) + +enum brmr_dev_state { + DEV_STATE_INIT, + DEV_STATE_READY, + DEV_STATE_DISCONNECTED, + DEV_STATE_CLOSING, +}; + +struct brmr_clt_iu { + struct request *rq; + struct rmr_iu *rmr_iu; + struct brmr_clt_dev *dev; + blk_status_t status; + struct sg_table sgt; + struct scatterlist sgl[]; +}; + +struct brmr_queue { + struct list_head requeue_list; + unsigned long in_list; + struct blk_mq_hw_ctx *hctx; +}; + +struct brmr_cpu_qlist { + struct list_head requeue_list; + spinlock_t requeue_lock; + unsigned int cpu; +}; + +struct brmr_clt_pool { + struct list_head list; + struct rmr_pool *rmr; + wait_queue_head_t rmr_waitq; + bool rmr_ready; + int queue_depth; + u32 max_io_size; + u32 chunk_size; + u32 max_segments; + struct brmr_cpu_qlist __percpu + *cpu_queues; + DECLARE_BITMAP(cpu_queues_bm, NR_CPUS); + int __percpu *cpu_rr; /* per-cpu var for CPU round-robin */ + atomic_t busy; + struct blk_mq_tag_set tag_set; + struct mutex lock; /* protects state and devs_list */ + struct list_head devs_list; /* list of struct brmr_clt_dev */ + refcount_t refcount; + char poolname[NAME_MAX]; +}; + +/** + * Statistic of requests submitted to the rmr-clt layer. + * This means total number of requests received from blk + * is cnt_whole+(cnt_split/2) + * while total number submitted to rmr-clt is cnt_whole+cnt_split + */ +struct brmr_stats_rq { + struct { + u64 cnt_whole; + u64 cnt_split; + u64 total_sectors; + } dir[2]; +}; + +#define STATS_SIZES_NUM 16 + +struct brmr_stats_sizes { + struct { + u64 cnt_whole[STATS_SIZES_NUM]; + u64 cnt_left[STATS_SIZES_NUM]; + u64 cnt_right[STATS_SIZES_NUM]; + } dir[2]; +}; + +struct brmr_stats_sts_resource { + u64 get_iu; + u64 get_iu2; + u64 clt_request1; + u64 clt_request; +}; + +struct brmr_stats_pcpu { + + struct brmr_stats_rq submitted_requests; + struct brmr_stats_sizes request_sizes; + struct brmr_stats_sts_resource sts_resource; +}; + +struct brmr_clt_stats { + struct brmr_stats_pcpu __percpu *pcpu_stats; +}; + +struct brmr_clt_dev { + struct brmr_clt_pool *pool; + struct request_queue *queue; + struct brmr_queue *hw_queues; + u32 idx; + enum brmr_dev_state dev_state; + bool read_only; + bool map_incomplete; + u64 size_sect; /* device size in sectors */ + struct list_head list; + struct brmr_clt_stats stats; + struct gendisk *gd; + struct kobject kobj; + struct kobject kobj_stats; + char blk_symlink_name[NAME_MAX]; + refcount_t refcount; + struct work_struct unmap_on_rmmod_work; + bool wc; + bool fua; + + /* + * Params holding block device related info + */ + u32 max_hw_sectors; + u32 max_write_zeroes_sectors; + u32 max_discard_sectors; + u32 discard_granularity; + u32 discard_alignment; + u16 physical_block_size; + u16 logical_block_size; + u16 max_segments; + u16 secure_discard; + u8 cache_policy; +}; + +#define BRMR_HEADER_MAGIC_TOKEN 0x312631494f4e4f53 + +#define BRMR_HEADER_VERSION_INITIAL 1 +#define BRMR_CURRENT_HEADER_VERSION BRMR_HEADER_VERSION_INITIAL + +static inline enum rmr_io_flags rq_to_rmr_flags(struct request *rq) +{ + enum rmr_io_flags rmr_flag; + + switch (req_op(rq)) { + case REQ_OP_READ: + rmr_flag = RMR_OP_READ; + break; + case REQ_OP_WRITE: + rmr_flag = RMR_OP_WRITE; + break; + case REQ_OP_DISCARD: + rmr_flag 
= RMR_OP_DISCARD;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		rmr_flag = RMR_OP_WRITE_ZEROES;
+		break;
+	case REQ_OP_FLUSH:
+		rmr_flag = RMR_OP_FLUSH;
+		break;
+/* TODO
+	case REQ_OP_SECURE_ERASE:
+		rmr_flag = RMR_OP_SECURE_ERASE;
+		break;
+*/
+	default:
+		WARN(1, "Unknown request type %d (flags %u)\n",
+		     req_op(rq), rq->cmd_flags);
+		rmr_flag = 0;
+	}
+
+	/* Propagate the sync flag for synchronous requests. */
+	if (op_is_sync(rq->cmd_flags))
+		rmr_flag |= RMR_F_SYNC;
+
+	if (op_is_flush(rq->cmd_flags))
+		rmr_flag |= RMR_F_FUA;
+
+	return rmr_flag;
+}
+
+static inline u32 brmr_pool_chunk_size(struct brmr_clt_pool *pool)
+{
+	return pool->chunk_size;
+}
+
+struct brmr_clt_dev *brmr_clt_map_device(const char *pool, u64 size);
+int brmr_clt_close_device(struct brmr_clt_dev *dev, const struct attribute *sysfs_self);
+
+void brmr_clt_put_dev(struct brmr_clt_dev *dev);
+
+struct brmr_clt_dev *find_and_get_device(const char *name);
+
+/* brmr-sysfs.c */
+
+int brmr_clt_create_sysfs_files(void);
+void brmr_clt_destroy_sysfs_files(void);
+
+void brmr_clt_destroy_dev_sysfs_files(struct brmr_clt_dev *dev,
+				      const struct attribute *sysfs_self);
+
+/* brmr-reque.c */
+
+bool brmr_add_to_requeue(struct brmr_clt_pool *pool, struct brmr_queue *q);
+void brmr_requeue_requests(struct brmr_clt_pool *pool);
+void brmr_init_cpu_qlists(struct brmr_cpu_qlist __percpu *cpu_queues);
+
+/* brmr-stats.c */
+
+int brmr_clt_init_stats(struct brmr_clt_stats *stats);
+void brmr_clt_free_stats(struct brmr_clt_stats *stats);
+
+int brmr_clt_reset_submitted_req(struct brmr_clt_stats *stats, bool enable);
+int brmr_clt_reset_req_sizes(struct brmr_clt_stats *stats, bool enable);
+int brmr_clt_reset_sts_resource(struct brmr_clt_stats *stats, bool enable);
+
+/**
+ * size: size of the submitted request in bytes
+ * split: 0 when the blk request is submitted to rmr-clt as a whole,
+ *        1 when it is one part of a split blk request
+ */
+void brmr_update_stats(struct brmr_clt_stats *stats, size_t size, int split, int d);
+
+/**
+ * which: identifies the place at which BLK_STS_RESOURCE was returned
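+ * The value selects one of the counters in struct brmr_stats_sts_resource
+ * (get_iu, get_iu2, clt_request1, clt_request). A minimal sketch of a call
+ * site, assuming index 0 maps to the first counter and using a hypothetical
+ * allocation helper purely for illustration:
+ *
+ *	iu = brmr_clt_get_iu(dev);		// hypothetical helper
+ *	if (!iu) {
+ *		brmr_clt_update_sts_resource(&dev->stats, 0);
+ *		return BLK_STS_RESOURCE;
+ *	}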
+ */ +void brmr_clt_update_sts_resource(struct brmr_clt_stats *stats, int which); + +ssize_t brmr_clt_stats_sizes_to_str(struct brmr_clt_stats *stats, char *page, size_t len); + +ssize_t brmr_clt_stats_rq_to_str(struct brmr_clt_stats *stats, char *page, size_t len); + +ssize_t brmr_stats_sts_resource_to_str( + struct brmr_clt_stats *stats, char *page, size_t len); + +ssize_t brmr_stats_sts_resource_per_cpu_to_str( + struct brmr_clt_stats *stats, char *page, size_t len); + +#define STAT_STORE_FUNC(type, store, reset) \ +static ssize_t store##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + int ret = -EINVAL; \ + type *dev = container_of(kobj, type, kobj_stats); \ + \ + if (sysfs_streq(buf, "1")) \ + ret = reset(&dev->stats, true); \ + else if (sysfs_streq(buf, "0")) \ + ret = reset(&dev->stats, false); \ + if (ret) \ + return ret; \ + \ + return count; \ +} + +#define STAT_SHOW_FUNC(type, show, print) \ +static ssize_t show##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + type *dev = container_of(kobj, type, kobj_stats); \ + \ + return print(&dev->stats, page, PAGE_SIZE); \ +} + +#define STAT_ATTR(type, stat, print, reset) \ +STAT_STORE_FUNC(type, stat, reset) \ +STAT_SHOW_FUNC(type, stat, print) \ +static struct kobj_attribute stat##_attr = \ + __ATTR(stat, 0644, \ + stat##_show, \ + stat##_store) + +#endif /* BRMR_PRI_H */ diff --git a/drivers/block/brmr/brmr-proto.h b/drivers/block/brmr/brmr-proto.h new file mode 100644 index 000000000000..c5f0f25a5eb7 --- /dev/null +++ b/drivers/block/brmr/brmr-proto.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#define BRMR_PROTO_VER_MAJOR 0 +#define BRMR_PROTO_VER_MINOR 1 + +#define BRMR_CMD_RSP_MAGIC 0xDEADF00D + +struct brmr_blk_dev_params { + /* + * Params holding block device related info + */ + __le32 max_hw_sectors; + __le32 max_write_zeroes_sectors; + __le32 max_discard_sectors; + __le32 discard_granularity; + __le32 discard_alignment; + __le16 physical_block_size; + __le16 logical_block_size; + __le16 max_segments; + __le16 secure_discard; + u8 cache_policy; +}; + +enum brmr_msg_type { + BRMR_MSG_IO, + BRMR_MSG_CMD, +}; + +struct brmr_msg_hdr { + __le16 type; + __le16 __padding; +}; + +enum brmr_msg_cmd_type { + BRMR_CMD_MAP, // 0 + BRMR_CMD_REMAP, + + BRMR_CMD_UNMAP, + BRMR_CMD_GET_PARAMS, + + /* + * Add new command types above this. + */ + BRMR_CMD_RSP, +}; + +struct brmr_msg_map_new_cmd { + struct brmr_blk_dev_params dev_params; + + u32 version; /* version of the header itself */ + u64 mapped_size; /* size in 512 byte blocks of this device */ +}; + +struct brmr_msg_cmd { + struct brmr_msg_hdr hdr; + u8 ver; + u8 cmd_type; + u8 rsvd[2]; + union { + struct brmr_msg_map_new_cmd map_new_cmd; + /* May be other command(s) later */ + }; +}; + +/** + * struct brmr_cmd_get_params_rsp - response message to BRMR_CMD_GET_PARAMS + * @hdr: message header + * @nsectors: number of sectors in the usual 512b unit + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * @max_write_zeroes_sectors: max sectors for WRITE ZEROES in the 512b unit + * @max_discard_sectors: max. sectors that can be discarded at once in 512b + * unit. 
+ * @discard_granularity: size of the internal discard allocation unit in bytes + * @discard_alignment: offset from internal allocation assignment in bytes + * @physical_block_size: physical block size device supports in bytes + * @logical_block_size: logical block size device supports in bytes + * @max_segments: max segments hardware support in one transfer + * @secure_discard: supports secure discard + * @cache_policy: support write-back caching or FUA? + */ +struct brmr_cmd_get_params_rsp { + struct brmr_blk_dev_params dev_params; + + /* + * Params holding brmr device related info + */ + u8 mapped; + __le64 mapped_size; +}; + +struct brmr_msg_cmd_rsp { + struct brmr_msg_hdr hdr; + u64 magic; + u8 ver; + u8 cmd_type; + u8 status; + u8 rsvd[1]; + union { + struct brmr_cmd_get_params_rsp get_params_rsp; + //any other command responces. + }; +}; + +struct brmr_cmd_priv { + void *dev; + u8 cmd_type; + void *rsp_buf; + size_t rsp_buf_len; + int errno; + struct completion complete_done; +}; + +enum brmr_cache_policy { + BRMR_FUA = 1 << 0, + BRMR_WRITEBACK = 1 << 1, +}; diff --git a/drivers/block/brmr/brmr-srv-sysfs.c b/drivers/block/brmr/brmr-srv-sysfs.c new file mode 100644 index 000000000000..7e413eb258bb --- /dev/null +++ b/drivers/block/brmr/brmr-srv-sysfs.c @@ -0,0 +1,707 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "brmr-srv.h" +#include "rmr-srv.h" + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +static struct class *rmr_str_class; +static struct device *rmr_ctl_dev; +static struct device *rmr_strs_dev; + +enum { + BRMR_SRV_STR_OPT_ERR = 0, + BRMR_SRV_STR_OPT_DEVICE = 1 << 0, + BRMR_SRV_STR_OPT_POOL = 1 << 2, + BRMR_SRV_STR_OPT_MAPPED_SIZE = 1 << 3, + BRMR_SRV_STR_OPT_MODE = 1 << 4, +}; + +static const unsigned int rmr_str_opt_mandatory[] = { + BRMR_SRV_STR_OPT_POOL, + BRMR_SRV_STR_OPT_DEVICE, + BRMR_SRV_STR_OPT_MAPPED_SIZE, +}; + +static const match_table_t rmr_str_opt_tokens = { + { BRMR_SRV_STR_OPT_POOL, "pool=%s" }, + { BRMR_SRV_STR_OPT_DEVICE, "device=%s" }, + { BRMR_SRV_STR_OPT_MAPPED_SIZE, "mapped_size=%s" }, + { BRMR_SRV_STR_OPT_MODE, "mode=%s" }, + { BRMR_SRV_STR_OPT_ERR, NULL }, +}; + +struct brmr_srv_str_options { + char *pool; + char *device; + unsigned long mapped_size; +}; + +static void brmr_srv_remove_store(struct brmr_srv_blk_dev *dev, struct kobj_attribute *attr, + bool delete) +{ + mutex_lock(&store_mutex); + + blk_str_destroy_sysfs_files(dev, &attr->attr); + + brmr_srv_blk_close(dev, delete); + + pr_info("put blkdev %s\n", dev->bdev->bd_disk->disk_name); + bdev_fput(dev->bdev_file); + + pr_info("%s store %s, store name %s.\n", (delete ? 
"Delete" : "Remove"), + dev->name, dev->poolname); + brmr_srv_blk_destroy(dev); + mutex_unlock(&store_mutex); +} + +static int brmr_srv_parse_add_opts(const char *buf, struct brmr_srv_str_options *opt, + unsigned int *replace) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + sep_opt = strstrip(options); + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_str_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case BRMR_SRV_STR_OPT_POOL: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("add_store: pool name is too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(opt->pool, p, NAME_MAX); + kfree(p); + break; + + case BRMR_SRV_STR_OPT_DEVICE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("add_store: device name is too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(opt->device, p, NAME_MAX); + kfree(p); + break; + + case BRMR_SRV_STR_OPT_MAPPED_SIZE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + ret = kstrtoul(p, 0, &opt->mapped_size); + if (ret) { + pr_err("mapped_size isn't an integer: %d\n", ret); + kfree(p); + goto out; + } + + if (opt->mapped_size == 0) { + pr_err("mapped_size cannot be 0\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + + kfree(p); + break; + + case BRMR_SRV_STR_OPT_MODE: + if (!replace) { + pr_err("%s: mode option not supported here\n", __func__); + ret = -EINVAL; + goto out; + } + + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + if (!strcmp(p, "replace")) { + *replace = true; + } else { + pr_err("%s: Unknown mode '%s'\n", __func__, p); + ret = -EINVAL; + kfree(p); + goto out; + } + kfree(p); + break; + + default: + pr_err("add_store: Unknown parameter or missing value '%s'\n", + p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(rmr_str_opt_mandatory); i++) { + if ((opt_mask & rmr_str_opt_mandatory[i])) { + ret = 0; + } else { + pr_err("add_store: Parameters missing\n"); + ret = -EINVAL; + break; + } + } + +out: + kfree(options); + return ret; +} + +static ssize_t blk_str_dev_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + return sysfs_emit(page, "%llu\n", dev->dev_size); +} + +static struct kobj_attribute blk_str_dev_size_attr = + __ATTR(dev_size, 0644, blk_str_dev_size_show, NULL); + +static ssize_t blk_str_mapped_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + return sysfs_emit(page, "%llu\n", dev->mapped_size); +} + +static struct kobj_attribute blk_str_mapped_size_attr = + __ATTR(mapped_size, 0644, blk_str_mapped_size_show, NULL); + +static ssize_t blk_str_bdev_name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + return sysfs_emit(page, "%s\n", dev->name); +} + +static struct kobj_attribute blk_str_bdev_name_attr = + __ATTR(bdev_name, 0644, blk_str_bdev_name_show, NULL); + +static ssize_t blk_str_remove_store_show(struct kobject *kobj, + 
struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 to remove the store\n"); +} + +static ssize_t blk_str_remove_store_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + dev->name, attr->attr.name, buf); + return -EINVAL; + } + + brmr_srv_remove_store(dev, attr, false); + + return count; +} + +static struct kobj_attribute blk_str_remove_store_attr = + __ATTR(remove_store, 0644, + blk_str_remove_store_show, blk_str_remove_store_store); + +static ssize_t blk_str_delete_store_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 to delete the store\n"); +} + +static ssize_t blk_str_delete_store_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + dev->name, attr->attr.name, buf); + return -EINVAL; + } + + brmr_srv_remove_store(dev, attr, true); + + return count; +} + +static struct kobj_attribute blk_str_delete_store_attr = + __ATTR(delete_store, 0644, + blk_str_delete_store_show, blk_str_delete_store_store); + +static ssize_t state_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + size_t count = 0; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + if (test_bit(BRMR_SRV_STORE_OPEN, &dev->state)) + count += sysfs_emit_at(page, count, "open\n"); + else + count += sysfs_emit_at(page, count, "closed\n"); + + if (test_bit(BRMR_SRV_STORE_MAPPED, &dev->state)) + count += sysfs_emit_at(page, count, "mapped\n"); + else + count += sysfs_emit_at(page, count, "unmapped\n"); + + return count; +} + +static struct kobj_attribute blk_str_state_attr = + __ATTR_RO(state); + +static struct attribute *blk_str_map_attrs[] = { + &blk_str_dev_size_attr.attr, + &blk_str_mapped_size_attr.attr, + &blk_str_bdev_name_attr.attr, + &blk_str_remove_store_attr.attr, + &blk_str_delete_store_attr.attr, + &blk_str_state_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(blk_str_map); + +static struct kobj_type blk_str_device_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = blk_str_map_groups, +}; + +static int blk_str_create_sysfs_files(struct brmr_srv_blk_dev *dev) +{ + int ret; + + ret = kobject_init_and_add(&dev->kobj, &blk_str_device_ktype, + &rmr_strs_dev->kobj, + "%s", dev->poolname); + if (ret) + pr_err("Failed to create sysfs dir for store %s, name %s, err=%d\n", + dev->name, dev->poolname, ret); + + return ret; +} + +void blk_str_destroy_sysfs_files(struct brmr_srv_blk_dev *dev, + const struct attribute *sysfs_self) +{ + if (sysfs_self) + sysfs_remove_file_self(&dev->kobj, sysfs_self); + + kobject_del(&dev->kobj); + kobject_put(&dev->kobj); +} + +/** + * brmr_srv_blk_dev_exit() - Destroy and put the blkdev + * + * @dev: RMR block device structure to be used. 
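+ * Note: in this file it is only used on the store setup error paths; a
+ * fully set up store is instead torn down via brmr_srv_blk_close() followed
+ * by bdev_fput() and brmr_srv_blk_destroy().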
+ * + * Description: + * This function gives up the blkdev reference, and destroys the rmr block device + */ +static void brmr_srv_blk_dev_exit(struct brmr_srv_blk_dev *dev) +{ + pr_info("%s: put blkdev %s\n", __func__, dev->name); + bdev_fput(dev->bdev_file); + + brmr_srv_blk_destroy(dev); +} + +/** + * brmr_srv_blk_dev_init() - Create and initialize a brmr server store block device + * + * @pool_name: Name to be given to the created rmr block device + * @dev_name: path to the block device + * @mapped_size:mapped size of the block device + * + * Description: + * This function checks whether the rmr pool is available to be registered. + * It then creates the block device, and initializes it. + * + * Return: + * Pointer to the created rmr block device on success + * Error pointer on error + */ +static struct brmr_srv_blk_dev *brmr_srv_blk_dev_init(char *pool_name, char *dev_name, + u64 mapped_size) +{ + struct file *bdev_file; + struct brmr_srv_blk_dev *dev; + + dev = brmr_srv_blk_create(dev_name, pool_name); + if (IS_ERR(dev)) { + pr_err("failed to alloc store for device %s: %pe\n", pool_name, dev); + return dev; + } + + bdev_file = bdev_file_open_by_path(dev_name, DEFAULT_BLK_OPEN_FLAGS, + dev, NULL); + if (IS_ERR(bdev_file)) { + pr_err("%s: bdev_file_open_by_path for device %s failed with err (%pe)\n", + __func__, dev_name, bdev_file); + brmr_srv_blk_destroy(dev); + return ERR_CAST(bdev_file); + } + + dev->bdev_file = bdev_file; + dev->bdev = file_bdev(bdev_file); + dev->dev_size = get_capacity(dev->bdev->bd_disk); + strscpy(dev->name, dev->bdev->bd_disk->disk_name, sizeof(dev->name)); + + if (mapped_size < BLK_STR_MIN_MAPPED_SIZE) { + pr_err("%s: Given mapped size %llu less than minimum default(%lu) for dev %s\n", + __func__, mapped_size, BLK_STR_MIN_MAPPED_SIZE, dev->name); + brmr_srv_blk_dev_exit(dev); + return ERR_PTR(-ENOSPC); + } + + if (mapped_size + BLK_STR_MD_SIZE_SECTORS > dev->dev_size) { + pr_err("can not map %llu, only %llu available %s\n", + mapped_size, dev->dev_size - BLK_STR_MD_SIZE_SECTORS, dev->name); + brmr_srv_blk_dev_exit(dev); + return ERR_PTR(-ENOSPC); + } + + dev->mapped_size = mapped_size; + + pr_info("%s: succeeded\n", __func__); + + return dev; +} + +static ssize_t brmr_srv_create_store_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_str_options opt; + char dev_name[NAME_MAX]; + char pool_name[NAME_MAX]; + struct brmr_srv_blk_dev *dev; + struct brmr_srv_blk_dev_meta *md_page; + int md_state, err; + + opt.pool = pool_name; + opt.device = dev_name; + opt.mapped_size = 0; + + if (brmr_srv_parse_add_opts(buf, &opt, NULL)) + goto out; + + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_err("%s: Failed to allocate page to read md\n", __func__); + goto out; + } + + mutex_lock(&store_mutex); + + dev = brmr_srv_blk_dev_init(pool_name, dev_name, opt.mapped_size); + if (IS_ERR(dev)) { + pr_err("%s: brmr_srv_blk_dev_init failed: %pe\n", __func__, dev); + goto mut_unlock; + } + + md_state = brmr_srv_read_and_check_md(dev, md_page); + if (md_state != -1) { + /* + * read and check md failed. 
It could be read error or that md exists + */ + pr_err("%s: md read and check failed: %d\n", __func__, md_state); + goto dev_exit; + } + + err = brmr_srv_blk_open(dev, dev_name, true, false); + if (err) { + pr_err("failed to open %s, err %d\n", dev_name, err); + goto dev_exit; + } + + err = blk_str_create_sysfs_files(dev); + if (err) { + pr_err("failed to create sysfs files\n"); + goto dev_close; + } + + mutex_unlock(&store_mutex); + pr_info("Created new blk store for %s, with disk %s\n", pool_name, dev_name); + + kfree(md_page); + return count; + +dev_close: + brmr_srv_blk_close(dev, true); +dev_exit: + brmr_srv_blk_dev_exit(dev); +mut_unlock: + mutex_unlock(&store_mutex); + kfree(md_page); +out: + return -EINVAL; +} + +static ssize_t brmr_srv_create_store_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo \"pool= device= mapped_size=\" > %s\n\n", + attr->attr.name); +} + +static struct kobj_attribute brmr_srv_create_store_attr = + __ATTR(create_store, 0644, + brmr_srv_create_store_show, brmr_srv_create_store_store); + +static ssize_t brmr_srv_add_store_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_blk_dev *dev; + char dev_name[NAME_MAX]; + char pool_name[NAME_MAX]; + struct brmr_srv_str_options opt; + struct brmr_srv_blk_dev_meta *md_page; + int md_state, ret; + unsigned int replace = false; + + opt.pool = pool_name; + opt.device = dev_name; + opt.mapped_size = 0; + + if (brmr_srv_parse_add_opts(buf, &opt, &replace)) + goto out; + + /* + * Disable replace mode for now. + * Most of the code for replace mode to work is present, but there are some + * edge cases which needs work, and a info exchange between storage nodes which + * needs to be added. + */ + if (replace) { + pr_err("%s: Replace mode not supported yet\n", __func__); + goto out; + } + + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_err("Failed to allocate page to read md\n"); + goto out; + } + + mutex_lock(&store_mutex); + + dev = brmr_srv_blk_dev_init(pool_name, dev_name, opt.mapped_size); + if (IS_ERR(dev)) { + pr_err("brmr_srv_blk_dev_init failed: %pe\n", dev); + goto mut_unlock; + } + + md_state = brmr_srv_read_and_check_md(dev, md_page); + if (md_state == -1) { + /* + * md doesn't exists. This means the disk is an empty one. + * We have to replace, so check the mode first + */ + if (!replace) { + pr_err("%s: Incorrect mode %d. md doesn't exists\n", __func__, replace); + goto dev_exit; + } + + /* + * we have to do the following, + * + * 1) Check params like mapped size from at least one other storage node + * 2) Do discard + */ + pr_info("%s: No md found. Replacing disk %s for pool %s, size %llu\n", + __func__, dev_name, pool_name, dev->mapped_size); + } else if (md_state == 0) { + /* + * md exists. + * We are restoring an earlier used device. + */ + if (replace) { + pr_err("%s: Incorrect mode %d. md exists\n", __func__, replace); + goto dev_exit; + } + + /* + * Validate the metadata stored with the data provided. + */ + ret = brmr_srv_blk_validate_md(dev, md_page); + if (ret) { + pr_err("Local metadata validation failed\n"); + goto dev_exit; + } + + memcpy(&dev->dev_params, &md_page->dev_params, sizeof(struct rmr_blk_dev_params)); + dev->state = md_page->state; + + pr_info("%s: md found. 
Re-adding disk %s for pool %s, size %llu\n", + __func__, dev_name, pool_name, dev->mapped_size); + } else { + pr_err("%s: md cannot be read for block device %s, Err = %d\n", + __func__, dev->name, md_state); + goto dev_exit; + } + + if (brmr_srv_blk_open(dev, dev_name, false /* create */, replace)) { + pr_err("failed to open %s\n", dev_name); + goto dev_exit; + } + + ret = blk_str_create_sysfs_files(dev); + if (ret) { + pr_err("failed to create sysfs files\n"); + goto dev_close; + } + + mutex_unlock(&store_mutex); + + kfree(md_page); + return count; + +dev_close: + brmr_srv_blk_close(dev, replace); +dev_exit: + brmr_srv_blk_dev_exit(dev); +mut_unlock: + mutex_unlock(&store_mutex); + kfree(md_page); +out: + return -EINVAL; +} + +static ssize_t brmr_srv_add_store_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo \"pool= device= mapped_size=\" > %s\n\n", + attr->attr.name); +} + +static struct kobj_attribute brmr_srv_add_store_attr = + __ATTR(add_store, 0644, + brmr_srv_add_store_show, brmr_srv_add_store_store); + +static struct attribute *default_attrs[] = { + &brmr_srv_create_store_attr.attr, + &brmr_srv_add_store_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +int brmr_srv_create_sysfs_files(void) +{ + int err; + dev_t devt = MKDEV(0, 0); + + rmr_str_class = class_create("brmr-server"); + if (IS_ERR(rmr_str_class)) + return PTR_ERR(rmr_str_class); + + rmr_ctl_dev = device_create(rmr_str_class, NULL, devt, NULL, "ctl"); + if (IS_ERR(rmr_ctl_dev)) { + err = PTR_ERR(rmr_ctl_dev); + goto cls_destroy; + } + + rmr_strs_dev = device_create(rmr_str_class, NULL, devt, NULL, "stores"); + if (IS_ERR(rmr_strs_dev)) { + err = PTR_ERR(rmr_strs_dev); + goto ctl_destroy; + } + + err = sysfs_create_group(&rmr_ctl_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto strs_destroy; + + return 0; + +strs_destroy: + device_unregister(rmr_strs_dev); +ctl_destroy: + device_unregister(rmr_ctl_dev); +cls_destroy: + class_destroy(rmr_str_class); + + return err; +} + +void brmr_srv_destroy_sysfs_files(void) +{ + sysfs_remove_group(&rmr_ctl_dev->kobj, &default_attr_group); + device_unregister(rmr_strs_dev); + device_unregister(rmr_ctl_dev); + class_destroy(rmr_str_class); +} diff --git a/drivers/block/brmr/brmr-srv.c b/drivers/block/brmr/brmr-srv.c new file mode 100644 index 000000000000..cf85a54e4511 --- /dev/null +++ b/drivers/block/brmr/brmr-srv.c @@ -0,0 +1,1402 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include + +#include "brmr-srv.h" +#include "rmr-srv.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(BRMR_SERVER_VER_STRING); +MODULE_DESCRIPTION("BRMR Server"); +MODULE_LICENSE("GPL"); + +LIST_HEAD(store_list); +DEFINE_MUTEX(store_mutex); /* mutex to protect store_list */ + +/** + * brmr_srv_blk_validate_md() - Parse metadata for the given rmr block device and validate it + * + * @dev: RMR block device against which the md is to be validated + * @meta: pointer to metadata to be checked + * + * Return: + * 0: On success + * -Error: On failure + */ +int brmr_srv_blk_validate_md(struct brmr_srv_blk_dev *dev, struct brmr_srv_blk_dev_meta *meta) +{ + if (meta->magic != BRMR_BLK_STORE_MAGIC) { + pr_warn("No md found. 
store %s md magic=%llX does not match %X\n", + dev->poolname, meta->magic, BRMR_BLK_STORE_MAGIC); + return -EINVAL; + } + + // TODO: check version! + + if (dev->dev_size && dev->dev_size != meta->dev_size) { + pr_err("store %s dev_size %llu does not match md value %llu\n", + dev->poolname, dev->dev_size, meta->dev_size); + return -EINVAL; + } + + if (dev->mapped_size != meta->mapped_size) { + pr_err("store %s mapped_size %llu does not match md value %llu\n", + dev->poolname, dev->mapped_size, meta->mapped_size); + return -EINVAL; + } + + if (strncmp(dev->poolname, meta->poolname, NAME_MAX)) { + pr_err("store %s does not match md value %s\n", + dev->poolname, meta->poolname); + return -EINVAL; + } + + pr_debug("store %s md: mapped_size=%llu\n", + dev->poolname, meta->mapped_size); + pr_debug("md parsing is done for store %s\n", dev->poolname); + + return 0; +} + +/** + * brmr_srv_blk_fill_md() - Fill metadata from brmr srv block device + * + * @dev: BRMR server block device from which data is to be taken + * @data: pointer to metadata + * + * Return: + * 0: On success + * -Error: On failure + */ +static int brmr_srv_blk_fill_md(struct brmr_srv_blk_dev *dev, void *data) +{ + struct brmr_srv_blk_dev_meta *meta = data; + + meta->magic = BRMR_BLK_STORE_MAGIC; + meta->version = 0; + meta->dev_size = dev->dev_size; + meta->offset = BLK_STR_MD_SIZE_SECTORS; + meta->ts = jiffies; // or ktime_get_real_seconds(); + meta->mapped_size = dev->mapped_size; + meta->state = dev->state; + + memcpy(&meta->dev_params, &dev->dev_params, sizeof(struct rmr_blk_dev_params)); + + strscpy(meta->poolname, dev->poolname, NAME_MAX); + + pr_debug("md filling pool %s is done for dev %s\n", meta->poolname, dev->name); + + return 0; +} + +static int brmr_srv_blk_md_io_sync(struct block_device *bdev, int rw, void *md_data) +{ + int err = 0; + struct bio *bio; + blk_opf_t bio_flags = REQ_META; + u32 bytes; + + bio = bio_alloc(bdev, 1, bio_flags, GFP_NOIO); + if (!bio) { + pr_err("Failed to allocate metadata bio\n"); + return -ENOMEM; + } + + bytes = bio_add_page(bio, virt_to_page(md_data), PAGE_SIZE, 0); + if (bytes != PAGE_SIZE) { + pr_err("Failed to add page to bio, bytes returned=%u, expected %lu\n", + bytes, PAGE_SIZE); + err = -EINVAL; + goto bio_put; + } + + if (rw == READ) + bio->bi_opf = REQ_OP_READ; + else + bio->bi_opf = REQ_OP_WRITE | REQ_FUA; + + bio->bi_opf |= bio_flags; + bio->bi_iter.bi_sector = 0; + bio_set_dev(bio, bdev); + + pr_debug("submit_bio_wait dev %s, rw %s\n", + bdev->bd_disk->disk_name, rw == WRITE ? "WRITE" : "READ"); + err = submit_bio_wait(bio); + if (err) { + pr_err("Error reading md from %s, err %d\n", + bdev->bd_disk->disk_name, err); + goto bio_put; + } + pr_info("%s: for dev %s md rw %s is completed with code %d\n", + __func__, bdev->bd_disk->disk_name, rw == WRITE ? 
"WRITE" : "READ", err); + +bio_put: + bio_put(bio); + + return err; +} + +/** + * brmr_srv_blk_read_md() - read md from given block device + * + * @bdev: block device from which to read md + * @md_page: buffer to fill with md + */ +static int brmr_srv_blk_bdev_read_md(struct block_device *bdev, char *md_page) +{ + int err = 0; + + err = brmr_srv_blk_md_io_sync(bdev, READ, md_page); + if (err) { + pr_err("error reading md from %s, err %d\n", bdev->bd_disk->disk_name, err); + return err; + } + + pr_debug("read md from dev %s is done\n", bdev->bd_disk->disk_name); + + return err; +} + +static int brmr_srv_blk_write_md(struct brmr_srv_blk_dev *dev) +{ + int err = 0; + void *md_page; + + pr_debug("flush md to dev %s\n", dev->name); + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_err("Failed to allocate page to read md\n"); + err = -ENOMEM; + goto out; + } + + err = brmr_srv_blk_fill_md(dev, md_page); + if (err) { + pr_err("error filling md for dev %s, err %d\n", dev->name, err); + goto free_md_page; + } + + err = brmr_srv_blk_md_io_sync(dev->bdev, WRITE, md_page); + if (err) { + pr_err("error writing md to %s, err %d\n", dev->name, err); + goto free_md_page; + } + pr_debug("flush md to dev is done %s\n", dev->name); + +free_md_page: + kfree(md_page); +out: + return err; +} + +static void brmr_srv_blk_zero_md(struct brmr_srv_blk_dev *dev) +{ + int err = 0; + void *md_page; + + pr_debug("zero md on dev %s\n", dev->name); + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_warn("Failed to allocate page to read md\n"); + return; + } + + err = brmr_srv_blk_md_io_sync(dev->bdev, WRITE, md_page); + if (err) + pr_warn("error writing zero md to %s, err %d\n", dev->name, err); + + pr_debug("zero md on dev is done %s\n", dev->name); + kfree(md_page); +} + +static void brmr_srv_ref_kill(struct brmr_srv_blk_dev *dev) +{ + percpu_ref_kill(&dev->kref); + wait_for_completion(&dev->comp); +} + +static void brmr_srv_blk_release(struct percpu_ref *kref) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kref, struct brmr_srv_blk_dev, kref); + complete(&dev->comp); +} + +/** + * brmr_srv_blk_close() - Close a brmr srv block device + * + * @dev: BRMR server block device to be closed + * + * Description: + * Close an opened brmr srv store block device. + * This function is the opposite of brmr_srv_blk_open. + * This function is supposed to be the check and stop for inflight IOs. + * + * Locks: + * store_mutex should be held while calling this. + */ +void brmr_srv_blk_close(struct brmr_srv_blk_dev *dev, bool delete) +{ + pr_info("rmr store name: %s; dev %s is closing\n", dev->poolname, dev->name); + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_OPEN); + + list_del(&dev->entry); + + pr_info("brmr server store blk dev %s wait for io to complete.\n", dev->name); + brmr_srv_ref_kill(dev); + + /* + * Reinit the ref counter so that RMR can send metadata requests. 
+ */ + reinit_completion(&dev->comp); + percpu_ref_reinit(&dev->kref); + + rmr_srv_unregister(dev->poolname, delete); + dev->pool = NULL; + brmr_srv_ref_kill(dev); + + if (delete) + brmr_srv_blk_zero_md(dev); +} + +static int brmr_srv_blk_do_discard(struct brmr_srv_blk_dev *dev) +{ + struct rmr_pool *pool = dev->pool; + int err; + + pr_info("store id %s has mapped size of %llu, send discarded chunks to rmr pool %s\n", + dev->poolname, dev->mapped_size, dev->pool->poolname); + + err = rmr_srv_discard_id(pool, 0, 0, 0, true); + if (err) + pr_err("store %s failed to discard all data\n", dev->poolname); + + return err; +} + +/** + * brmr_srv_init_cmd() - Initialize message command + * + * @msg: command message where to init + */ +static void brmr_srv_init_cmd(struct brmr_msg_cmd *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.type = cpu_to_le16(BRMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->ver = BRMR_PROTO_VER_MAJOR; +} + +/** + * brmr_srv_cmd_conf() - Confirmation function for brmr srv store internal command message + * + * @priv: priv pointer to brmr command private data + * @errno: error number passed from RMR. + * See description of errno in RMR function. + * + * Description: + * Command response for a map new command can fail on multiple levels. + * If RMR fails to send the message to any or one of the nodes, that would reflect on the + * errno. If the command fails on BRMR level, that would reflect on the rsp struct. + * The error number will be used differently by different commands accordingly. + */ +static void brmr_srv_cmd_conf(void *priv, int errno) +{ + struct brmr_cmd_priv *cmd_priv = (struct brmr_cmd_priv *)priv; + + cmd_priv->errno = errno; + + switch (cmd_priv->cmd_type) { + case BRMR_CMD_GET_PARAMS: + if (cmd_priv->errno) + pr_err("%s: BRMR_CMD_GET_PARAMS failed with err=%pe on sending", + __func__, ERR_PTR(errno)); + + break; + + default: + cmd_priv->errno = -EINVAL; + pr_err("%s: Unknown command type %d err=%d\n", __func__, cmd_priv->cmd_type, errno); + } + + complete(&cmd_priv->complete_done); +} + +/** + * brmr_srv_send_msg_cmd() - Sends command message to internal rmr pool through rmr-srv pool + * + * @dev: pointer to brmr device + * @msg: msg struct to be sent + * @rsp_buf: response buffer where the response of the storage side is stored + * @rsp_buf_len: length of the response buffer + * + * Return: + * Negative if failed to sent command + * As handled by each command in brmr_cmd_conf, if succeeded to send command + * + * Context: + * Would block until response is received + */ +static int brmr_srv_send_msg_cmd(struct brmr_srv_blk_dev *dev, struct brmr_msg_cmd *msg, + void *rsp_buf, size_t rsp_buf_len) +{ + struct brmr_cmd_priv cmd_priv; + struct kvec vec; + int ret; + + vec = (struct kvec) { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + + cmd_priv.dev = dev; + cmd_priv.cmd_type = msg->cmd_type; + cmd_priv.rsp_buf = rsp_buf; + cmd_priv.rsp_buf_len = rsp_buf_len; + cmd_priv.errno = 0; + init_completion(&cmd_priv.complete_done); + + ret = rmr_srv_pool_cmd_with_rsp(dev->pool, brmr_srv_cmd_conf, &cmd_priv, &vec, 1, rsp_buf, + rsp_buf_len, sizeof(struct brmr_msg_cmd_rsp)); + + if (!ret) { + wait_for_completion(&cmd_priv.complete_done); + ret = cmd_priv.errno; + } + + return ret; +} + +/** + * brmr_srv_blk_get_params() - Get parameters from other servers + * + * @dev: Backend device for which to be checked + * + * Description: + * Check whether parameters from other servers are consistent with this server through + * internal network. 
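+ * "Parameters" here means the mapped size plus the block device limits in
+ * struct brmr_blk_dev_params (max_hw_sectors, discard limits, block sizes,
+ * cache policy, ...), which every responding node reports back in its
+ * BRMR_CMD_GET_PARAMS response.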
+ * + * Return: + * 0 on success of checks + * -Negative error value on failure of checks. + * -EAGAIN if no sync sessions are connected to this server. + */ +static int brmr_srv_blk_get_params(void *device) +{ + struct brmr_srv_blk_dev *dev; + struct brmr_msg_cmd msg; + struct brmr_msg_cmd_rsp *brmr_cmd_rsp; + void *rsp_buf; + size_t rsp_buf_len; + int err = 0, i; + bool checked = false; + + dev = (struct brmr_srv_blk_dev *)device; + brmr_srv_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_GET_PARAMS; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) + return -ENOMEM; + + err = brmr_srv_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (err < 0) { + pr_warn("%s: brmr_send_msg_cmd failed with errno %d\n", __func__, err); + /* + * Sending could fail for various reasons. The server may be isolated and has + * no connected sync sessions to other nodes. Or the connected server has no + * store attached. + */ + goto free_data; + } + + /* + * We do not care if the command failed for few storage nodes, as long as we get a good + * response from one of them. + * + * The mapped size of all storage nodes which are connected should be the same, whether + * the backend device of those nodes is mapped or not. + * + * TODO: If the responses of other storage nodes are different, then use values from + * nodes which are mapped. If there are no mapped devices in the pool, then the check + * will fail when the mapped sizes are different. + */ + brmr_cmd_rsp = (struct brmr_msg_cmd_rsp *)rsp_buf; + for (i = 0; i < RMR_POOL_MAX_SESS; i++, brmr_cmd_rsp++) { + struct brmr_cmd_get_params_rsp *get_params_rsp = &brmr_cmd_rsp->get_params_rsp; + struct brmr_blk_dev_params *rsp_dev_params; + + /* + * If there is no magic, or the command failed, + * we do not use that nodes info to perform the check. + */ + if (brmr_cmd_rsp->magic != BRMR_CMD_RSP_MAGIC || + brmr_cmd_rsp->status) + continue; + + if (dev->mapped_size != le64_to_cpu(get_params_rsp->mapped_size)) { + pr_err("%s: Mismatch in mapped_size: %llu != %llu\n", __func__, + dev->mapped_size, le64_to_cpu(get_params_rsp->mapped_size)); + err = -EINVAL; + goto free_data; + } + + rsp_dev_params = &get_params_rsp->dev_params; + + dev->dev_params.max_hw_sectors = le32_to_cpu(rsp_dev_params->max_hw_sectors); + dev->dev_params.max_write_zeroes_sectors = + le32_to_cpu(rsp_dev_params->max_write_zeroes_sectors); + dev->dev_params.max_discard_sectors = + le32_to_cpu(rsp_dev_params->max_discard_sectors); + dev->dev_params.discard_granularity = + le32_to_cpu(rsp_dev_params->discard_granularity); + dev->dev_params.discard_alignment = le32_to_cpu(rsp_dev_params->discard_alignment); + dev->dev_params.physical_block_size = + le16_to_cpu(rsp_dev_params->physical_block_size); + dev->dev_params.logical_block_size = + le16_to_cpu(rsp_dev_params->logical_block_size); + dev->dev_params.max_segments = le16_to_cpu(rsp_dev_params->max_segments); + dev->dev_params.secure_discard = le16_to_cpu(rsp_dev_params->secure_discard); + dev->dev_params.cache_policy = rsp_dev_params->cache_policy; + + /* + * At least check passed with one mapped storage node + * + * We still perform the check for other mapped storage nodes just for sanity. 
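+ * Note that dev_params is simply overwritten with the values from each
+ * responding node; only mapped_size is cross-checked, and any mismatch
+ * there fails the whole operation.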
+ */ + checked = true; + } + + if (checked == false) { + pr_err("%s: Check for mapped_size failed for dev %s.\n", + __func__, dev->poolname); + err = -EINVAL; + } + +free_data: + kfree(rsp_buf); + + return err; +} + +/** + * brmr_srv_blk_add_handle_replace() - Handle check and discard for a store which was replaced + * + * @dev: RMR block device to be closed + * + * Description: + * When an empty disk is added to an already existing brmr server store, it means that the + * empty disk is to replace the disk which was present in the existing brmr srv store. + * Before replacing the disk with the new empty one, there are a number of things to be done. + * This function performs the following task, + * 1) Get some parameters from other storage node through the internal network, and checks + * whether the mapped_size passed for the new empty disk is correct or not. + * 2) If the above check passed, then discard is sent above to rmr-server. + * + * Return: + * 0 on success + * -Error value on error + */ +static int brmr_srv_blk_add_handle_replace(struct brmr_srv_blk_dev *dev) +{ + int err = 0; + + /* + * The check passed. We can now do the discard safely. + */ + err = brmr_srv_blk_do_discard(dev); + if (err) { + pr_err("%s: brmr_srv_blk_do_discard failed for dev %s\n", __func__, dev->poolname); + return err; + } + + /* + * We are done with everything, and we are good. + * We now set the MAPPED state and write metadata again so it is persisted. + * so that IOs can be served. + */ + brmr_srv_blk_set_state(dev, BRMR_SRV_STORE_MAPPED); + err = brmr_srv_blk_write_md(dev); + if (err) { + pr_err("%s: dev %s: write md error %d\n", __func__, dev->name, err); + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_MAPPED); + return err; + } + + /* + * After the discarded entries are sent to rmr-server, set the map version of + * rmr pool to zero. + */ + rmr_srv_replace_store(dev->pool); + return 0; +} + +/** + * brmr_srv_read_and_check_md() - Read and check metadata if it exists + * + * @dev: BRMR server block device for which the metadata is to be checked + * @md_page: pointer to the buf where to read the metadata + * + * Description: + * Read metadata from the given store device, and check whether metadata exists. + * + * Return: + * 0: read was successful and metadata exists + * -1: read was successful but metadata doesn't exists + * -Errno: read failed + */ +int brmr_srv_read_and_check_md(struct brmr_srv_blk_dev *dev, void *md_page) +{ + struct brmr_srv_blk_dev_meta *meta = md_page; + int err; + + err = brmr_srv_blk_bdev_read_md(dev->bdev, md_page); + if (err) { + pr_err("%s: failed to read md, err=%d\n", __func__, err); + return -EINVAL; + } + + if (meta->magic != BRMR_BLK_STORE_MAGIC) { + pr_info("%s: No MD exists for block device %s, md magic=%llX does not match %X\n", + __func__, dev->name, meta->magic, BRMR_BLK_STORE_MAGIC); + return -1; + } + + pr_info("%s: %s MD exists for block device %s\n", __func__, meta->poolname, dev->name); + + return 0; +} + +/** + * brmr_srv_blk_open() - Open an brmr srv block device + * + * @dev: BRMR server block device structure to be used. + * @path: path to the block device. + * @create: Whether to create a new store or open an existing one. + * @replace: Whether the device is being added to replace an empty disk. + * + * Description: + * Open the block device "path", and populate the brmr srv block device "dev" + * with the details. 
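+ * (The backing bdev itself is already opened by brmr_srv_blk_dev_init();
+ * this function registers the store with the RMR server pool, cross-checks
+ * parameters with other storage nodes where applicable and persists the
+ * on-disk metadata.)
+ *
+ * It is normally reached through the server sysfs interface, for example
+ * (illustrative path and values only):
+ *
+ *	echo "pool=pool0 device=/dev/nvme0n1 mapped_size=2097152" > \
+ *		/sys/class/brmr-server/ctl/create_store
+ *
+ * where mapped_size is given in 512-byte sectors.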
+ * To close the device, call brmr_srv_blk_close() + * + * Return: + * 0 on success + * -Error value on error + * + * Locks: + * store_mutex should be held while calling this. + */ +int brmr_srv_blk_open(struct brmr_srv_blk_dev *dev, const char *path, + bool create, bool replace) +{ + struct rmr_attrs attr; + int err; + + err = rmr_srv_query(NULL, dev->mapped_size, &attr); + if (err) { + pr_err("dev %s: rmr srv query failed %d\n", dev->name, err); + return err; + } + + if ((dev->mapped_size + BLK_STR_MD_SIZE_SECTORS + attr.rmr_md_size) > dev->dev_size) { + pr_err("%s: dev %s: No space for rmr metadata %llu(in sectors)\n", + __func__, dev->name, attr.rmr_md_size); + return -ENOSPC; + } + + /* + * After the device registers to the RMR server pool, there will be metadata requests from + * RMR server transmitted to the device which starts reference counting. The reference + * count of the device must be initialized before any in flight requests are sent to BRMR. + */ + err = percpu_ref_init(&dev->kref, brmr_srv_blk_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (err) { + pr_err("%s: percpu ref init failed.\n", __func__); + return -EINVAL; + } + init_completion(&dev->comp); + + dev->pool = rmr_srv_register(dev->poolname, &pstore_blk_ops, dev, + dev->mapped_size, create ? RMR_SRV_DISK_CREATE : + (replace ? RMR_SRV_DISK_REPLACE : + RMR_SRV_DISK_ADD)); + if (!dev->pool) { + pr_err("Failed registering blk store %s, err\n", dev->poolname); + brmr_srv_ref_kill(dev); + return -EINVAL; + } + + brmr_srv_blk_set_state(dev, BRMR_SRV_STORE_OPEN); + + if (!create) { + err = brmr_srv_blk_get_params(dev); + if (replace) { + /* + * Any failure of getting parameters is not allowed when replacing a store. + * Either it failed to send the command or the parameters are different. + */ + if (err) { + pr_err("%s: replace_store: brmr_srv_blk_get_params failed with err %d\n", + __func__, err); + goto close_dev; + } + } else { + /* + * The store creation will fail if the connected servers to this server + * share different parameter values. If sending the command of getting + * parameters failed due to no sync sessions connected to this server + * where no parameters are received, the store will be created, delaying + * checks when this server is connected to some other servers. + */ + if (err && err != -EAGAIN) { + pr_err("%s: create_store: brmr_srv_blk_get_params failed with err %d\n", + __func__, err); + goto close_dev; + } + } + + /* + * TODO: Would we be creating the maps for replace (empty disk) at the + * same time as we create one for create_disk? + */ + if (replace) { + err = brmr_srv_blk_add_handle_replace(dev); + if (err) { + pr_err("%s: replace_store %s: handling replace failed with err %d", + __func__, dev->poolname, err); + goto close_dev; + } + } + } + + /* we write md in both cases (new or old device) just to check if device is ok + * for writing + */ + err = brmr_srv_blk_write_md(dev); + if (err) { + pr_err("dev %s: write md error %d\n", dev->name, err); + goto close_dev; + } + + list_add(&dev->entry, &store_list); + + pr_info("%s: brmr srv blk str %s, dev %s set state to open\n", __func__, dev->poolname, + dev->name); + + return 0; + +close_dev: + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_OPEN); + /* + * TODO: Ideally, the unregister should be called with (create || replace). + * But right now there is no way to RMR to go ahead with the delete, + * even if marked_delete is not set. 
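+ * As written, a store that was only just created is unregistered with
+ * delete semantics, while a store that was being re-added is kept on the
+ * RMR side (the second argument simply follows the 'create' flag).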
+ */ + rmr_srv_unregister(dev->poolname, create); + dev->pool = NULL; + brmr_srv_ref_kill(dev); + + return err; +} + +/** + * brmr_srv_blk_cleanup() - Cleanup all the opened and active brmr srv block devices + * + * Description: + * This function is called when the module brmr server store is getting removed. + * It closes, destroys and frees all the open and active brmr server block devices. + */ +static void brmr_srv_blk_cleanup(void) +{ + struct brmr_srv_blk_dev *dev, *tmp; + + mutex_lock(&store_mutex); + list_for_each_entry_safe(dev, tmp, &store_list, entry) { + blk_str_destroy_sysfs_files(dev, NULL); + brmr_srv_blk_close(dev, false); + + pr_info("put blkdev %s\n", dev->bdev->bd_disk->disk_name); + bdev_fput(dev->bdev_file); + + brmr_srv_blk_destroy(dev); + } + mutex_unlock(&store_mutex); +} + +/** + * brmr_srv_blk_create() - Create an brmr_srv_blk_dev with the given data + * + * @path: path to the block device. + * @poolname: Name to be given to the created block device + * + * Description: + * To destroy a created brmr server block device, call brmr_srv_blk_destroy() + * + * Return: + * Pointer to the allocated brmr srv block device on success + * Error pointer on error + */ +struct brmr_srv_blk_dev *brmr_srv_blk_create(const char *path, char *poolname) +{ + struct brmr_srv_blk_dev *dev; + int err = 0; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + err = -ENOMEM; + goto err; + } + + strscpy(dev->poolname, poolname, NAME_MAX); + + dev->io_priv_cache = kmem_cache_create("brmr_srv_io_priv_cache", + sizeof(struct brmr_srv_io_priv), 0, 0, NULL); + if (!dev->io_priv_cache) { + pr_err("failed to create cache for device %s\n", poolname); + err = -ENOMEM; + goto free_dev; + } + + pr_debug("brmr srv blk store with name %s created\n", poolname); + + return dev; + +free_dev: + kfree(dev); +err: + return ERR_PTR(err); +} + +/** + * brmr_srv_blk_destroy() - Destroy a given brmr_srv_blk_dev + * + * @dev: brmr server block device to be destroyed + * @sysfs_self: Pointer to self attribute + * + * Description: + * This function is the opposite of brmr_srv_blk_create() + * The pointer to the self attribute is used to denote whether the destroy call + * is a result of a sysfs task for its own device. + */ +void brmr_srv_blk_destroy(struct brmr_srv_blk_dev *dev) +{ + kmem_cache_destroy(dev->io_priv_cache); + kfree(dev); +} + +/** + * brmr_srv_blk_map_dev() - Process a map command from the client side + * + * @dev: brmr server block device to be destroyed + * @map_cmd: Pointer to structure holding map command info + * + * Description: + * We save all the data and param sent in the command in out metadata, + * since these are assured to have been validated across all storage nodes. + * + * For future get params requests, we send back these instead of reading them + * from the underlying block device. 
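+ * The handler below rejects the command if the store is already mapped, if
+ * the requested size does not leave room for the on-disk metadata area, or
+ * if it conflicts with a previously recorded mapped_size; only then are the
+ * received parameters copied in and persisted via brmr_srv_blk_write_md().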
+ * + * Return: + * 0 on success + * -Error value on error + */ +static int brmr_srv_blk_map_dev(struct brmr_srv_blk_dev *dev, + const struct brmr_msg_map_new_cmd *map_cmd) +{ + const struct brmr_blk_dev_params *cmd_dev_params = &map_cmd->dev_params; + int err; + u64 recvd_mapped_size = map_cmd->mapped_size; + + pr_info("%s: Mapping device %s with mapped_size %llu, recvd size %llu\n", + __func__, dev->name, dev->mapped_size, recvd_mapped_size); + + if (test_bit(BRMR_SRV_STORE_MAPPED, &dev->state)) { + pr_err("%s: Received map command for already mapped device %s\n", + __func__, dev->name); + return -EINVAL; + } + + if (recvd_mapped_size > dev->dev_size - BLK_STR_MD_SIZE_SECTORS) { + pr_err("can not map %llu, only %llu available %s\n", + recvd_mapped_size, dev->dev_size - BLK_STR_MD_SIZE_SECTORS, dev->name); + return -ENOSPC; + } + + if (dev->mapped_size && dev->mapped_size != recvd_mapped_size) { + pr_err("dev %s is already mapped with size %llu, does not match %llu", + dev->name, dev->mapped_size, recvd_mapped_size); + return -EINVAL; + } + + dev->mapped_size = recvd_mapped_size; + + dev->dev_params.max_hw_sectors = le32_to_cpu(cmd_dev_params->max_hw_sectors); + dev->dev_params.max_write_zeroes_sectors = + le32_to_cpu(cmd_dev_params->max_write_zeroes_sectors); + dev->dev_params.max_discard_sectors = le32_to_cpu(cmd_dev_params->max_discard_sectors); + dev->dev_params.discard_granularity = le32_to_cpu(cmd_dev_params->discard_granularity); + dev->dev_params.discard_alignment = le32_to_cpu(cmd_dev_params->discard_alignment); + dev->dev_params.physical_block_size = le16_to_cpu(cmd_dev_params->physical_block_size); + dev->dev_params.logical_block_size = le16_to_cpu(cmd_dev_params->logical_block_size); + dev->dev_params.max_segments = le16_to_cpu(cmd_dev_params->max_segments); + dev->dev_params.secure_discard = le16_to_cpu(cmd_dev_params->secure_discard); + dev->dev_params.cache_policy = cmd_dev_params->cache_policy; + + brmr_srv_blk_set_state(dev, BRMR_SRV_STORE_MAPPED); + + err = brmr_srv_blk_write_md(dev); + if (err) { + pr_err("failed to write md for %s, err %d\n", dev->name, err); + dev->mapped_size = 0; + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_MAPPED); + return -EINVAL; + } + + return 0; +} + +/* Always succeeds. */ +static int brmr_srv_blk_unmap_dev(struct brmr_srv_blk_dev *dev) +{ + pr_info("unmap device: %s\n", dev->name); + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_MAPPED); + + return 0; +} + +static bool brmr_srv_blk_io_allowed(void *store_priv) +{ + struct brmr_srv_blk_dev *dev = store_priv; + + if (!dev) { + pr_err("no store registered\n"); + return false; + } + + return test_bit(BRMR_SRV_STORE_OPEN, &dev->state) && + test_bit(BRMR_SRV_STORE_MAPPED, &dev->state); +} + +#define bio_disk_name(bio) ((bio)->bi_bdev->bd_disk->disk_name) +#define bio_first_sector(bio) ((bio_end_sector(bio) - bio_sectors(bio))) + +static void brmr_srv_bi_end_io(struct bio *bio) +{ + struct brmr_srv_io_priv *io_priv = bio->bi_private; + struct brmr_srv_blk_dev *dev = io_priv->dev; + int err; + + err = blk_status_to_errno(bio->bi_status); + pr_debug("end io called for dev %s, bio=%p, err=%d\n", dev->poolname, bio, err); + + if (err) { + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_OPEN); + pr_err("Dev %s, Bio %p type %s, err=%d bdev_name=%s\n", dev->poolname, + bio, bio_data_dir(bio) == WRITE ? 
"W" : "R", err, bio_disk_name(bio)); + } + + rmr_srv_req_resp(io_priv->priv, err); + + kmem_cache_free(dev->io_priv_cache, io_priv); + brmr_srv_blk_put_ref(dev); + bio_put(bio); +} + +static int brmr_srv_submit_bi(struct brmr_srv_blk_dev *dev, void *data, u64 offset, u32 length, + unsigned long flags, u16 prio, void *priv) +{ + struct bio *bio; + struct brmr_srv_io_priv *io_priv; + blk_opf_t bio_flags; + int ret = 0; + bool is_md_op = false; + + switch (rmr_op(flags)) { + case RMR_OP_READ: + bio_flags = REQ_OP_READ; + break; + case RMR_OP_WRITE: + case RMR_OP_SYNCREQ: + bio_flags = REQ_OP_WRITE; + break; + case RMR_OP_DISCARD: + bio_flags = REQ_OP_DISCARD; + break; + case RMR_OP_WRITE_ZEROES: + bio_flags = REQ_OP_WRITE_ZEROES; + break; + case RMR_OP_FLUSH: + bio_flags = REQ_OP_WRITE | REQ_PREFLUSH; + break; + case RMR_OP_MD_READ: + bio_flags = REQ_OP_READ; + is_md_op = true; + break; + case RMR_OP_MD_WRITE: + bio_flags = REQ_OP_WRITE; + is_md_op = true; + break; + default: + pr_err("Wrong flags=%lu\n", flags); + return -EINVAL; + } + + /* + * Most md IO are created on rmr-srv and does not get priority value passed on from rmr-clt + */ + if (is_md_op) { + bio_flags |= REQ_META; + if (rmr_op(flags) == RMR_OP_MD_WRITE) + bio_flags |= REQ_FUA; + } + + if (flags & RMR_F_SYNC) + bio_flags |= REQ_SYNC; + + if (flags & RMR_F_FUA) + bio_flags |= REQ_FUA; + + bio = bio_alloc(dev->bdev, 1, bio_flags, GFP_KERNEL); + if (bio_add_page(bio, virt_to_page(data), length, + offset_in_page(data)) != length) { + pr_err("Failed to map data to bio\n"); + ret = -EINVAL; + goto put_bio; + } + + io_priv = kmem_cache_zalloc(dev->io_priv_cache, GFP_KERNEL); + if (!io_priv) { + pr_err("Failed to alloc io_priv for op %lx dev %s\n", flags, dev->poolname); + ret = -ENOMEM; + goto put_bio; + } + + io_priv->dev = dev; + io_priv->priv = priv; + + bio->bi_private = io_priv; + bio->bi_end_io = brmr_srv_bi_end_io; + bio->bi_iter.bi_sector = offset; + bio->bi_iter.bi_size = length; + bio_set_dev(bio, dev->bdev); + + pr_debug("Submit %s bio=%p, disk=%s, flag=[%lx], bio_flag=[%x], op=[%x]" + "first_sect=%llu, sectors=%d\n", + is_md_op ? 
"md req" : "req", bio, bio_disk_name(bio), + flags, bio_flags, rmr_op(flags), + (u64)bio_first_sector(bio), bio_sectors(bio)); + + if (is_md_op) { + ret = submit_bio_wait(bio); + if (ret) { + pr_err("Error waiting md from %s, err %d\n", + dev->bdev->bd_disk->disk_name, ret); + } + goto end_bio; + } else { + /* + * Most md IO are created on rmr-srv and does not get priority value passed on from + * rmr-clt + */ + bio->bi_ioprio = prio; + submit_bio(bio); + } + + return 0; +end_bio: + rmr_srv_req_resp(io_priv->priv, ret); + kmem_cache_free(dev->io_priv_cache, io_priv); +put_bio: + bio_put(bio); + return ret; +} + +/** + * brmr_srv_process_blk_req() - Processes brmr srv store IO messages + * + * @dev: pointer to rmr block device + * @data: pointer to data + * @data_offset: offset on disk (represented in bytes) + * @length: length of data in bytes + * @flags: IO flags + * @prio: prio from block layer + * @priv: pointer to priv data for rmr + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_process_blk_req(void *device, void *data, u32 data_offset, + u32 length, unsigned long flags, u16 prio, void *priv) +{ + struct brmr_srv_blk_dev *dev = (struct brmr_srv_blk_dev *)device; + u64 offset = 0; /* in sectors */ + int ret = 0; + + if (!brmr_srv_blk_get_ref(dev)) { + pr_err("for dev %s, name %s, failed to get_ref\n", + dev->name, dev->poolname); + return -EIO; + } + + if (!brmr_srv_blk_io_allowed(dev)) { + pr_err("Store name %s, offset %u, length %u, io is not allowed!\n", + dev->poolname, data_offset, length); + ret = -EINVAL; + goto err; + } + + offset = BLK_STR_MD_SIZE_SECTORS; + offset += (data_offset) >> SECTOR_SHIFT; //bytes to sectors; + + pr_debug("Submitted req to %s, flag %lu offset %llu length %u\n", + dev->name, flags, offset, length); + ret = brmr_srv_submit_bi(dev, data, offset, length, flags, prio, priv); + if (ret) { + pr_err("%s: bio submission failed for data IO\n", __func__); + goto err; + } + + return 0; + +err: + brmr_srv_blk_put_ref(dev); + return ret; +} + +/** + * brmr_srv_process_blk_md_req() - Process the requests for rmr metadata + * + * Return: + * 0 on success + * + * Description: + * The rmr metadata will be stored at the end of the device. + */ +static int brmr_srv_process_blk_md_req(void *device, void *data, u32 data_offset, + u32 length, unsigned long flags, void *priv) +{ + struct brmr_srv_blk_dev *dev = device; + int err; + u64 offset = 0; /* in sectors */ + + if (!brmr_srv_blk_get_ref(dev)) { + pr_err("for dev %s, name %s, failed to get_ref\n", + dev->name, dev->poolname); + return -EIO; + } + + /* The mapped_size is in sectors. */ + offset = BLK_STR_MD_SIZE_SECTORS + dev->mapped_size; + offset += (data_offset) >> SECTOR_SHIFT; //bytes to sectors; + pr_debug("Submitted md req to %s, flag %lu offset %llu length %u\n", + dev->name, flags, offset, length); + /* + * It's no need to return err to upper layer here. If the submission of md request fails, + * it will go through the endreq path after the server req finishes processing. 
+ */ + err = brmr_srv_submit_bi(dev, data, offset, length, flags, 0, priv); + if (err) + pr_err("%s: bio submission failed for metadata IO\n", __func__); + brmr_srv_blk_put_ref(dev); + return 0; +} + +/** + * brmr_srv_init_cmd_rsp() - Initialize command response + * + * @msg: command response to initialize + */ +static void brmr_srv_init_cmd_rsp(struct brmr_msg_cmd_rsp *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.type = cpu_to_le16(BRMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->magic = BRMR_CMD_RSP_MAGIC; + msg->ver = BRMR_PROTO_VER_MAJOR; + msg->cmd_type = BRMR_CMD_RSP; +} + +/** + * brmr_srv_fill_dev_param_dev() - Fill dev params from the saved params in brmr srv block device + * + * @dev: pointer to brmr server block device + * @rsp: Pointer to command response structure holding params + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_fill_dev_param_dev(struct brmr_srv_blk_dev *dev, + struct brmr_cmd_get_params_rsp *rsp) +{ + struct brmr_srv_blk_dev_meta *md_page; + struct brmr_blk_dev_params *rsp_dev_params = &rsp->dev_params; + int ret; + + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + ret = -ENOMEM; + goto out; + } + + /* + * We have to read the metadata from the device. + */ + ret = brmr_srv_blk_bdev_read_md(dev->bdev, (void *)md_page); + if (ret) { + pr_err("%s: failed to read md, err=%d\n", __func__, ret); + goto out; + } + + if (md_page->magic != BRMR_BLK_STORE_MAGIC) { + pr_warn("%s: No md found. store %s md magic=%llX does not match %X\n", + __func__, dev->poolname, md_page->magic, BRMR_BLK_STORE_MAGIC); + ret = -EINVAL; + goto out; + } + + rsp_dev_params->max_hw_sectors = cpu_to_le32(md_page->dev_params.max_hw_sectors); + rsp_dev_params->max_write_zeroes_sectors = + cpu_to_le32(md_page->dev_params.max_write_zeroes_sectors); + rsp_dev_params->max_discard_sectors = cpu_to_le32(md_page->dev_params.max_discard_sectors); + rsp_dev_params->discard_granularity = cpu_to_le32(md_page->dev_params.discard_granularity); + rsp_dev_params->discard_alignment = cpu_to_le32(md_page->dev_params.discard_alignment); + rsp_dev_params->physical_block_size = cpu_to_le16(md_page->dev_params.physical_block_size); + rsp_dev_params->logical_block_size = cpu_to_le16(md_page->dev_params.logical_block_size); + rsp_dev_params->max_segments = cpu_to_le16(md_page->dev_params.max_segments); + rsp_dev_params->secure_discard = cpu_to_le16(md_page->dev_params.secure_discard); + rsp_dev_params->cache_policy = md_page->dev_params.cache_policy; + +out: + kfree(md_page); + return ret; +} + +/** + * brmr_srv_fill_dev_param_bdev() - Fill dev params from the underlying block device + * + * @dev: pointer to brmr server block device + * @rsp: Pointer to command response structure holding params + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_fill_dev_param_bdev(struct brmr_srv_blk_dev *dev, + struct brmr_cmd_get_params_rsp *rsp) +{ + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct brmr_blk_dev_params *rsp_dev_params = &rsp->dev_params; + + if (!q) { + pr_err("%s: no queue for dev %s\n", __func__, dev->name); + return -EINVAL; + } + + rsp_dev_params->logical_block_size = + cpu_to_le16(bdev_logical_block_size(bdev)); + rsp_dev_params->physical_block_size = + cpu_to_le16(bdev_physical_block_size(bdev)); + rsp_dev_params->max_segments = + cpu_to_le16(queue_max_segments(q)); + rsp_dev_params->max_hw_sectors = + cpu_to_le32(queue_max_hw_sectors(q)); + 
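+	/*
+	 * The remaining limits below are read straight from the backing bdev
+	 * and its queue limits; secure_discard carries the maximum
+	 * secure-erase sectors, and cache_policy encodes write-back/FUA
+	 * support as BRMR_WRITEBACK/BRMR_FUA flag bits.
+	 */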
rsp_dev_params->max_write_zeroes_sectors = + cpu_to_le32(bdev_write_zeroes_sectors(bdev)); + rsp_dev_params->max_discard_sectors = cpu_to_le32(bdev_max_discard_sectors(bdev)); + rsp_dev_params->discard_granularity = + cpu_to_le32(bdev_get_queue(bdev)->limits.discard_granularity); + rsp_dev_params->discard_alignment = + cpu_to_le32(bdev_get_queue(bdev)->limits.discard_alignment); + rsp_dev_params->secure_discard = cpu_to_le16(bdev_max_secure_erase_sectors(bdev)); + rsp_dev_params->cache_policy = 0; + + if (blk_queue_write_cache(q)) + rsp_dev_params->cache_policy |= BRMR_WRITEBACK; + if (bdev_fua(bdev)) + rsp_dev_params->cache_policy |= BRMR_FUA; + + return 0; +} + +/** + * brmr_srv_fill_get_params_rsp() - Fill dev params into the command response structure + * + * @dev: pointer to brmr server block device + * @brmr_cmd_rsp: Pointer to command response structure + * + * Description: + * For mapped devices, we need to pick up the params from the brmr server block device itself + * These are the same ones which are saved in the metadata of the device. + * + * For unmapped devices, we need to extract this info from the underlying block device + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_fill_get_params_rsp(struct brmr_srv_blk_dev *dev, + struct brmr_msg_cmd_rsp *brmr_cmd_rsp) +{ + struct brmr_cmd_get_params_rsp *rsp; + int ret; + + if (!dev) { + pr_err("%s: no brmr srv blk dev to get params\n", __func__); + return -ENODEV; + } + + if (!dev->bdev) { + pr_err("%s: no bdev opened for dev %s\n", __func__, dev->name); + return -EINVAL; + } + + rsp = &brmr_cmd_rsp->get_params_rsp; + + /* + * For a mapped device, we get the saved params in the device structure (read from md) + * since those are the ones which would have gone through validation, + * when the map happened. + * + * For unmapped device, we get params from the underlying bdev. 
+ */ + if (test_bit(BRMR_SRV_STORE_MAPPED, &dev->state)) + ret = brmr_srv_fill_dev_param_dev(dev, rsp); + else + ret = brmr_srv_fill_dev_param_bdev(dev, rsp); + + if (ret) { + pr_err("%s: Fill dev params failed for dev %s\n", __func__, dev->name); + return -EINVAL; + } + + rsp->mapped = test_bit(BRMR_SRV_STORE_MAPPED, &dev->state); + rsp->mapped_size = cpu_to_le64(dev->mapped_size); + pr_info("%s: dev %s, mapped_size %llu\n", __func__, + dev->name, le64_to_cpu(rsp->mapped_size)); + + return 0; +} + +/** + * brmr_srv_blk_cmd() - Processes brmr srv store command messages + * + * @device: brmr server store device + * @usr_buf: user buffer containing the command message struct (ones sent as kvec to rmr) + * @usr_len: length of the usr_buf + * @data: data buffer where the response can be sent back for brmr client to read + * @datalen: length of data buffer + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_blk_cmd(void *device, const void *usr_buf, int usr_len, void *data, + int datalen) +{ + struct brmr_srv_blk_dev *dev = device; + const struct brmr_msg_cmd *msg = (const struct brmr_msg_cmd *)usr_buf; + struct brmr_msg_cmd_rsp *brmr_cmd_rsp = (struct brmr_msg_cmd_rsp *)data; + int ret = 0; + + if (datalen < sizeof(*brmr_cmd_rsp)) { + WARN_ON(1); + return -EINVAL; + } + + if (!brmr_srv_blk_get_ref(dev)) { + pr_err("for dev %s, name %s, failed to get_ref to process command %d\n", + dev->name, dev->poolname, msg->cmd_type); + return -EIO; + } + + brmr_srv_init_cmd_rsp(brmr_cmd_rsp); + + switch (msg->cmd_type) { + case BRMR_CMD_MAP: + pr_info("%s: BRMR_CMD_MAP\n", __func__); + + brmr_cmd_rsp->status = brmr_srv_blk_map_dev(dev, &msg->map_new_cmd); + if (brmr_cmd_rsp->status) { + pr_err("Failed to map new dev to %s, err %d\n", + dev->name, brmr_cmd_rsp->status); + } + break; + case BRMR_CMD_REMAP: + pr_info("%s: BRMR_CMD_REMAP\n", __func__); + break; + case BRMR_CMD_UNMAP: + pr_info("%s: BRMR_CMD_UNMAP\n", __func__); + + brmr_cmd_rsp->status = brmr_srv_blk_unmap_dev(dev); + break; + case BRMR_CMD_GET_PARAMS: + pr_info("%s: BRMR_CMD_GET_PARAMS\n", __func__); + + brmr_cmd_rsp->status = brmr_srv_fill_get_params_rsp(dev, brmr_cmd_rsp); + break; + + default: + pr_err("%s: Unknown command type %d\n", __func__, msg->cmd_type); + } + + brmr_srv_blk_put_ref(dev); + + return ret; +} + +struct rmr_srv_store_ops pstore_blk_ops = { + .submit_req = brmr_srv_process_blk_req, + .submit_md_req = brmr_srv_process_blk_md_req, + .submit_cmd = brmr_srv_blk_cmd, + .io_allowed = brmr_srv_blk_io_allowed, + .get_params = brmr_srv_blk_get_params, +}; + +static int __init brmr_srv_init_module(void) +{ + int err = 0; + + pr_info("Loading module %s, version %s\n", + KBUILD_MODNAME, BRMR_SERVER_VER_STRING); + + err = brmr_srv_create_sysfs_files(); + if (err) { + pr_err("rmr_store_create_sysfs_files(), err: %d\n", err); + goto out; + } + + return 0; +out: + return err; +} + +static void __exit brmr_srv_cleanup_module(void) +{ + brmr_srv_blk_cleanup(); + brmr_srv_destroy_sysfs_files(); + + pr_info("Module %s unloaded\n", KBUILD_MODNAME); +} + +module_init(brmr_srv_init_module); +module_exit(brmr_srv_cleanup_module); diff --git a/drivers/block/brmr/brmr-srv.h b/drivers/block/brmr/brmr-srv.h new file mode 100644 index 000000000000..4180ee600e65 --- /dev/null +++ b/drivers/block/brmr/brmr-srv.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef BRMR_SRV_H +#define BRMR_SRV_H + 
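+/*
+ * Shared definitions for the BRMR server side: the on-disk metadata layout
+ * (struct brmr_srv_blk_dev_meta), per-store state bits and the percpu_ref
+ * based get/put helpers used by the IO paths.
+ */
+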
+#include +#include +#include +#include + +#include "brmr-proto.h" +#include "rmr-req.h" + +#define BRMR_SERVER_VER_MAJOR 0 +#define BRMR_SERVER_VER_MINOR 1 + +#ifndef BRMR_SERVER_VER_STRING +#define BRMR_SERVER_VER_STRING __stringify(BRMR_SERVER_VER_MAJOR) "." \ + __stringify(BRMR_SERVER_VER_MINOR) +#endif + +#define DEFAULT_BLK_OPEN_FLAGS (BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_EXCL) + +#define BRMR_BLK_STORE_MAGIC 0xC0FFEE +#define BLK_STR_MD_SIZE PAGE_SIZE +#define BLK_STR_MD_SIZE_SECTORS (PAGE_SIZE / SECTOR_SIZE) +#define BLK_STR_MIN_MAPPED_SIZE (PAGE_SIZE + BLK_STR_MD_SIZE) + +extern struct list_head store_list; +extern struct mutex store_mutex; + +extern struct rmr_srv_store_ops pstore_blk_ops; +extern struct kobject *rmr_strs_kobj; + +/* brmr server */ + +enum brmr_srv_store_state { + BRMR_SRV_STORE_OPEN, + BRMR_SRV_STORE_MAPPED, + BRMR_SRV_STORE_NEED_SYNC, +}; + +struct brmr_srv_io_priv { + struct brmr_srv_blk_dev *dev; + void *priv; +}; + +struct rmr_blk_dev_params { + u32 max_hw_sectors; + u32 max_write_zeroes_sectors; + u32 max_discard_sectors; + u32 discard_granularity; + u32 discard_alignment; + u16 physical_block_size; + u16 logical_block_size; + u16 max_segments; + u16 secure_discard; + u8 cache_policy; +}; + +struct brmr_srv_blk_dev { + char poolname[NAME_MAX]; + struct block_device *bdev; + struct file *bdev_file; + struct list_head entry; + char name[BDEVNAME_SIZE]; + struct rmr_pool *pool; + u64 mapped_size; /* in sectors */ + u64 dev_size; /* in sectors */ + struct rmr_blk_dev_params dev_params; + struct kmem_cache *io_priv_cache; + struct kobject kobj; + unsigned long state; + struct completion comp; + struct percpu_ref kref; +}; + +struct brmr_srv_blk_dev_meta { + char poolname[NAME_MAX]; + struct rmr_blk_dev_params dev_params; + u64 magic; /* magic token to identify a header */ + u32 version; /* version of the header itself */ + u64 dev_size; + u64 mapped_size; + u64 state; + u64 offset; + u64 ts; +} __packed; + +int brmr_srv_blk_validate_md(struct brmr_srv_blk_dev *dev, struct brmr_srv_blk_dev_meta *meta); +struct brmr_srv_blk_dev *brmr_srv_blk_create(const char *path, char *name); +void brmr_srv_blk_destroy(struct brmr_srv_blk_dev *dev); +int brmr_srv_blk_open(struct brmr_srv_blk_dev *dev, const char *path, bool create, bool replace); +void brmr_srv_blk_close(struct brmr_srv_blk_dev *dev, bool delete); + +int brmr_srv_read_and_check_md(struct brmr_srv_blk_dev *dev, void *md_page); + +static inline void brmr_srv_blk_set_state(struct brmr_srv_blk_dev *dev, + enum brmr_srv_store_state state) +{ + set_bit(state, &dev->state); +} + +static inline void brmr_srv_blk_clear_state(struct brmr_srv_blk_dev *dev, + enum brmr_srv_store_state state) +{ + clear_bit(state, &dev->state); +} + +static inline int brmr_srv_blk_get_ref(struct brmr_srv_blk_dev *dev) +{ + return percpu_ref_tryget(&dev->kref); +} + +static inline void brmr_srv_blk_put_ref(struct brmr_srv_blk_dev *dev) +{ + percpu_ref_put(&dev->kref); +} + + +/* brmr-server-sysfs.c */ + +int brmr_srv_create_sysfs_files(void); +void brmr_srv_destroy_sysfs_files(void); +void blk_str_destroy_sysfs_files(struct brmr_srv_blk_dev *dev, + const struct attribute *sysfs_self); + +#endif /* BRMR_SRV_H */ diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a7e3f29dc037..4b2470b5a592 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -110,5 +110,6 @@ source "drivers/infiniband/ulp/srpt/Kconfig" source "drivers/infiniband/ulp/iser/Kconfig" source "drivers/infiniband/ulp/isert/Kconfig" 
source "drivers/infiniband/ulp/rtrs/Kconfig" +source "drivers/infiniband/ulp/rmr/Kconfig" endif # INFINIBAND diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile index 51b0d41699b8..24c8e4b00065 100644 --- a/drivers/infiniband/ulp/Makefile +++ b/drivers/infiniband/ulp/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ obj-$(CONFIG_INFINIBAND_ISER) += iser/ obj-$(CONFIG_INFINIBAND_ISERT) += isert/ obj-$(CONFIG_INFINIBAND_RTRS) += rtrs/ +obj-$(CONFIG_INFINIBAND_RMR) += rmr/ diff --git a/drivers/infiniband/ulp/rmr/Kconfig b/drivers/infiniband/ulp/rmr/Kconfig new file mode 100644 index 000000000000..1d62322a02be --- /dev/null +++ b/drivers/infiniband/ulp/rmr/Kconfig @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config INFINIBAND_RMR + tristate + depends on INFINIBAND_ADDR_TRANS + +config INFINIBAND_RMR_CLIENT + tristate "RMR client module" + depends on INFINIBAND_ADDR_TRANS + depends on INFINIBAND_RTRS_CLIENT + select INFINIBAND_RMR + help + Reliable Multicast over RTRS (RMR) client module. + + RMR is an RDMA ULP that provides active-active block-level + replication on top of the RTRS transport. It guarantees + delivery of an I/O to a group of storage nodes and handles + resynchronization of data between storage nodes without + involving the compute client. This option builds the client + side, intended to be used by an upper-layer initiator such + as BRMR. + + If unsure, say N. + +config INFINIBAND_RMR_SERVER + tristate "RMR server module" + depends on INFINIBAND_ADDR_TRANS + depends on INFINIBAND_RTRS_SERVER + select INFINIBAND_RMR + help + RMR server module processing connection, IO and replication + requests from RMR clients on top of RTRS. It will pass IO + requests to its consumer, e.g. BRMR_server. + + If unsure, say N. 
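+
+# Example: a compute (client) node would typically build these as modules
+# (illustrative .config fragment only; storage nodes enable the *_SERVER
+# symbols instead, and INFINIBAND_RMR itself is selected automatically):
+#
+#   CONFIG_INFINIBAND_ADDR_TRANS=y
+#   CONFIG_INFINIBAND_RTRS_CLIENT=m
+#   CONFIG_INFINIBAND_RMR_CLIENT=m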
diff --git a/drivers/infiniband/ulp/rmr/Makefile b/drivers/infiniband/ulp/rmr/Makefile new file mode 100644 index 000000000000..c173092f4cf2 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +ccflags-y := -I$(srctree)/drivers/infiniband/ulp/rtrs + +CFLAGS_rmr-clt-trace.o = -I$(src) + +rmr-client-y := rmr-pool.o \ + rmr-clt.o \ + rmr-map-mgmt.o \ + rmr-clt-stats.o \ + rmr-clt-sysfs.o \ + rmr-map.o \ + rmr-clt-trace.o + +rmr-server-y := rmr-pool.o \ + rmr-srv.o \ + rmr-srv-md.o \ + rmr-srv-sysfs.o \ + rmr-req.o \ + rmr-map.o + +obj-$(CONFIG_INFINIBAND_RMR_CLIENT) += rmr-client.o +obj-$(CONFIG_INFINIBAND_RMR_SERVER) += rmr-server.o diff --git a/drivers/infiniband/ulp/rmr/rmr-clt-stats.c b/drivers/infiniband/ulp/rmr/rmr-clt-stats.c new file mode 100644 index 000000000000..83a4089defc0 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-stats.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include "rmr-clt.h" + +int rmr_clt_reset_read_retries(struct rmr_clt_stats *stats, bool enable) +{ + if (unlikely(!enable)) + return -EINVAL; + + atomic_set(&stats->read_retries, 0); + + return 0; +} + +ssize_t rmr_clt_stats_read_retries_to_str( + struct rmr_clt_stats *stats, char *page) +{ + return sysfs_emit(page, "%u\n", + atomic_read(&stats->read_retries)); +} + diff --git a/drivers/infiniband/ulp/rmr/rmr-clt-sysfs.c b/drivers/infiniband/ulp/rmr/rmr-clt-sysfs.c new file mode 100644 index 000000000000..7e12c526f0c9 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-sysfs.c @@ -0,0 +1,1496 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include /* for NAME_MAX */ +#include +#include +#include +#include +#include +#include + +#include "rmr-clt.h" + +/* + * Wait a bit before trying to reconnect after a failure + * in order to give server time to finish clean up which + * leads to "false positives" failed reconnect attempts + */ +#define RTRS_RECONNECT_BACKOFF 1000 + +#define RMR_DEFAULT_CHUNK_SIZE 131072 /* 128 KB */ + +static struct class *rmr_dev_class; +static struct device *rmr_ctl_dev; +static struct device *rmr_pool_dev; +static struct device *rmr_sess_dev; + +enum { + RMR_OPT_ERR = 0, + RMR_ADD_OPT_PATH = 1 << 0, + RMR_ADD_OPT_SESSNAME = 1 << 1, + RMR_ADD_OPT_MODE = 1 << 2, + RMR_DEL_OPT_MODE = 1 << 3, +}; + +static unsigned int rmr_opt_add_mandatory[] = { + RMR_ADD_OPT_PATH, + RMR_ADD_OPT_SESSNAME, + RMR_ADD_OPT_MODE, +}; + +/* For sync pools mode is not meaningful; only path and sessname are required. 
*/ +static unsigned int rmr_opt_add_sync_mandatory[] = { + RMR_ADD_OPT_PATH, + RMR_ADD_OPT_SESSNAME, +}; + +static const match_table_t rmr_opt_add_tokens = { + { RMR_ADD_OPT_PATH, "path=%s" }, + { RMR_ADD_OPT_SESSNAME, "sessname=%s" }, + { RMR_ADD_OPT_MODE, "mode=%s" }, + { RMR_OPT_ERR, NULL }, +}; + +enum rmr_opt_join { + RMR_JOIN_OPT_POOLNAME, + RMR_JOIN_OPT_Mandatory_count, + RMR_JOIN_OPT_SYNC, + RMR_JOIN_OPT_CHUNK_SIZE, + RMR_JOIN_OPT_ERR, +}; + +static const char * const rmr_srv_opts_mandatory_names[] = { + [RMR_JOIN_OPT_POOLNAME] = "poolname", +}; + +static const match_table_t rmr_opt_join_tokens = { + { RMR_JOIN_OPT_POOLNAME, "poolname=%s" }, + { RMR_JOIN_OPT_SYNC, "sync=%s" }, + { RMR_JOIN_OPT_CHUNK_SIZE, "chunk_size=%s" }, + { RMR_JOIN_OPT_ERR, NULL }, +}; + +static unsigned int rmr_opt_del_mandatory[] = { + RMR_DEL_OPT_MODE, +}; + +static const match_table_t rmr_opt_del_tokens = { + { RMR_DEL_OPT_MODE, "mode=%s" }, + { RMR_OPT_ERR, NULL }, +}; + +enum { + RMR_RECONNECT_OPT_ERR = 0, + RMR_RECONNECT_OPT_PATH = 1 << 0, +}; + +static unsigned int rmr_opt_reconnect_mandatory[] = { + RMR_RECONNECT_OPT_PATH, +}; + +static const match_table_t rmr_opt_reconnect_tokens = { + { RMR_RECONNECT_OPT_PATH, "path=%s" }, + { RMR_RECONNECT_OPT_ERR, NULL }, +}; + +/* remove new line from string */ +static void strip(char *s) +{ + char *p = s; + + while (*s != '\0') { + if (*s != '\n') + *p++ = *s++; + else + ++s; + } + *p = '\0'; +} + +static int rmr_clt_parse_add_sess_opts(const char *buf, char *sessname, int *create, + struct rtrs_addr *paths, size_t *path_cnt, + size_t max_path_cnt, const char *er_msg, + const match_table_t rmr_opt_tokens, + unsigned int *rmr_opt_mandatory, + size_t num_rmr_opt_mandatory) +{ + char *options, *options_orig, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + int p_cnt = 0; + + options_orig = kstrdup(buf, GFP_KERNEL); + if (!options_orig) + return -ENOMEM; + + options = strstrip(options_orig); + strip(options); + sep_opt = options; + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case RMR_ADD_OPT_SESSNAME: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("%s: sessname too long\n", er_msg); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(sessname, p, NAME_MAX); + kfree(p); + break; + + case RMR_ADD_OPT_PATH: + p = match_strdup(args); + if (!p || p_cnt >= max_path_cnt) { + ret = -ENOMEM; + goto out; + } + + ret = rtrs_addr_to_sockaddr(p, strlen(p), RTRS_PORT, + &paths[p_cnt]); + if (ret) { + pr_err("Can't parse path %s: %d\n", p, ret); + kfree(p); + goto out; + } + + p_cnt++; + + kfree(p); + break; + + case RMR_ADD_OPT_MODE: + if (!create) { + pr_err("%s: mode option not supported here\n", er_msg); + ret = -EINVAL; + goto out; + } + + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + if (!strcmp(p, "create")) { + *create = true; + } else if (!strcmp(p, "assemble")) { + *create = false; + } else { + pr_err("%s: Unknown mode '%s' (valid: create, assemble)\n", er_msg, p); + ret = -EINVAL; + kfree(p); + goto out; + } + kfree(p); + break; + + default: + pr_err("%s: Unknown parameter or missing value" + " '%s'\n", er_msg, p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < num_rmr_opt_mandatory; i++) { + if ((opt_mask & rmr_opt_mandatory[i])) { + ret = 0; + } else { + pr_err("%s: Parameters 
missing\n", er_msg); + ret = -EINVAL; + break; + } + } + +out: + if (path_cnt) + *path_cnt = p_cnt; + kfree(options_orig); + return ret; +} + +static void rmr_clt_destroy_session_sysfs_files(struct rmr_clt_pool_sess *pool_sess, + const struct attribute *sysfs_self) +{ + if (pool_sess->kobj.state_in_sysfs) { + sysfs_remove_link(&pool_sess->kobj, "clt_sess"); + + if (sysfs_self) + sysfs_remove_file_self(&pool_sess->kobj, sysfs_self); + kobject_del(&pool_sess->kobj); + kobject_put(&pool_sess->kobj); + } +} + +static int rmr_clt_parse_del_sess_opts(const char *buf, bool *delete) +{ + char *options, *options_orig, *sep_opt, *p; + substring_t args[MAX_OPT_ARGS]; + int i, token, opt_mask = 0, ret = -EINVAL; + + options_orig = kstrdup(buf, GFP_KERNEL); + if (!options_orig) + return -ENOMEM; + + options = strstrip(options_orig); + strip(options); + sep_opt = options; + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_opt_del_tokens, args); + opt_mask |= token; + + switch (token) { + case RMR_DEL_OPT_MODE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + if (!strcmp(p, "delete")) { + *delete = true; + } else if (!strcmp(p, "disassemble")) { + *delete = false; + } else { + pr_err("%s: Unknown mode '%s' (valid: delete, disassemble)\n", "del_sess", p); + ret = -EINVAL; + kfree(p); + goto out; + } + kfree(p); + break; + + default: + pr_err("%s: Unknown parameter or missing value '%s'\n", "del_sess", p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(rmr_opt_del_mandatory); i++) { + if ((opt_mask & rmr_opt_del_mandatory[i])) { + ret = 0; + } else { + pr_err("%s: Parameters missing\n", "del_sess"); + ret = -EINVAL; + break; + } + } + +out: + kfree(options_orig); + return ret; +} + +static ssize_t rmr_clt_del_sess_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_clt_del_sess_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess *clt_sess; + int err, i, idx; + bool delete = false; + u8 srv_sess_member_id; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + clt_sess = pool_sess->clt_sess; + srv_sess_member_id = pool_sess->member_id; + pool = pool_sess->pool; + clt_pool = (struct rmr_clt_pool *)pool->priv; + + err = rmr_clt_parse_del_sess_opts(buf, &delete); + if (err) + return err; + + if (pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_REMOVING)) { + /* + * Freeze + */ + rmr_clt_pool_io_freeze(clt_pool); + + /* + * Wait for all completion + */ + rmr_clt_pool_io_wait_complete(clt_pool); + + /* + * Remove the storage node from the pool members list. + */ + xa_erase(&pool->stg_members, srv_sess_member_id); + + /* + * We simply wait for all inflights to get over to make sure + * that they are not affected with the delete session messages + * we are going to send after this. + * Once the inflights are done, we can restart the IOs immediately, + * since the session state has been changed to "removing". + * + * Unfreeze and wake up. + */ + rmr_clt_pool_io_unfreeze(clt_pool); + + send_msg_leave_pool(pool_sess, delete, WAIT); + } + + pr_info("Closing session %s in pool %s\n", + pool_sess->sessname, pool->poolname); + + if (!pool->sync) { + if (delete) { + /* + * Delete map for this session if it exists. 
+ * For disassemble, keep the map so the piggyback loop + * continues to accumulate dirty entries for the member. + */ + rmr_pool_remove_map(pool, srv_sess_member_id); + + /* + * Clear the srv_md entry so the piggyback loop does + * not keep referencing a gone member. + * For disassemble, leave it intact — it is needed to + * identify the member during piggyback until reassembly. + */ + idx = rmr_pool_find_md(&pool->pool_md, srv_sess_member_id, false); + + if (idx >= 0) + memset(&pool->pool_md.srv_md[idx], 0, + sizeof(struct rmr_srv_md)); + /* + * TODO: Push the srv_md change to persistence disk on remaining storages. + */ + } else { + /* + * Disassemble: if this was the last non-sync session, no IOs + * will occur and the dirty maps serve no purpose. Delete them + * all; they will be recreated for all members on the first + * assemble via rmr_clt_process_non_sync_sess. + */ + if (xa_empty(&pool->stg_members)) { + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + u8 mid = pool->pool_md.srv_md[i].member_id; + + if (!mid) + continue; + rmr_pool_remove_map(pool, mid); + } + } + } + + /* + * Send messages to all other sessions, + * Informing them that a particular stor is getting deleted + */ + err = rmr_clt_del_stor_from_pool(pool_sess, delete); + if (err) { + pr_err("pool %s, del_stor failed for sess with member_id %u, err %d\n", + pool->poolname, srv_sess_member_id, err); + return err; + } + } + + /* + * Remove the session from the list. + */ + mutex_lock(&pool->sess_lock); + rmr_clt_del_pool_sess(pool_sess); + mutex_unlock(&pool->sess_lock); + + rmr_clt_destroy_session_sysfs_files(pool_sess, &attr->attr); + + rmr_clt_free_pool_sess(pool_sess); + rmr_clt_sess_put(clt_sess); + + if (list_empty(&pool->sess_list)) + rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_JOINED, false); + + return count; +} + +static struct kobj_attribute rmr_clt_del_pool_sess_attr = + __ATTR(del_sess, 0644, rmr_clt_del_sess_show, + rmr_clt_del_sess_store); + +static ssize_t rmr_clt_pool_sess_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_clt_pool_sess *pool_sess; + ssize_t written = 0; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + + written += scnprintf(page, PAGE_SIZE, "%s\n", + rmr_clt_sess_state_str(atomic_read(&pool_sess->state))); + + written += scnprintf(page + written, PAGE_SIZE - written, + "Maintenance mode: %d\n", pool_sess->maintenance_mode); + + return written; +} + +static struct kobj_attribute rmr_clt_pool_sess_state_attr = + __ATTR(state, 0444, rmr_clt_pool_sess_state_show, NULL); + +static ssize_t rmr_clt_sess_member_id_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_clt_pool_sess *pool_sess; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + + return scnprintf(page, PAGE_SIZE, "%u\n", + pool_sess->member_id); +} + +static struct kobj_attribute rmr_clt_pool_sess_member_id_attr = + __ATTR(member_id, 0644, rmr_clt_sess_member_id_show, + NULL); + +static ssize_t rmr_clt_sess_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "echo '1|0' > this_sysfs\n"); +} + +static ssize_t rmr_clt_sess_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_pool *pool; + int pool_sess_state, err; + bool enable; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + pool = pool_sess->pool; + + if (sysfs_streq(buf, 
"1")) + enable = true; + else if (sysfs_streq(buf, "0")) + enable = false; + else { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + goto err; + } + + pool_sess_state = atomic_read(&pool_sess->state); + + /* + * Manual disable is interpreted as switching to maintenance mode + * And it is only allowed for sessions NOT in "created" and "removing" state + * And non-sync sessions + */ + if (!enable && ((pool_sess_state == RMR_CLT_POOL_SESS_CREATED) || + (pool_sess_state == RMR_CLT_POOL_SESS_REMOVING) || + (pool_sess->pool->sync))) { + pr_err("Cannot put pool_sess in maintenance mode: state %d, sync %d\n", + pool_sess_state, pool_sess->pool->sync); + goto print_state_err; + } + + if (enable) + err = rmr_clt_enable_sess(pool_sess); + else + err = rmr_clt_set_pool_sess_mm(pool_sess); + if (err) { + pr_err("%s failed with err %d\n", __func__, err); + goto err; + } + + return count; + +print_state_err: + pr_err("Current state: %d\n", atomic_read(&pool_sess->state)); +err: + return -EINVAL; +} + +static struct kobj_attribute rmr_clt_pool_sess_enable_attr = + __ATTR(enable, 0644, rmr_clt_sess_enable_show, + rmr_clt_sess_enable_store); + +static ssize_t rmr_clt_sess_check_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "echo '1' > this_sysfs\n"); +} + +static ssize_t rmr_clt_sess_check_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + int err; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + goto err; + } + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_MAP_CHECK; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s failed with err %d\n", __func__, err); + goto err; + } + return count; + +err: + return -EINVAL; +} + +static struct kobj_attribute rmr_clt_pool_sess_check_map_attr = + __ATTR(check_map, 0644, rmr_clt_sess_check_map_show, + rmr_clt_sess_check_map_store); + +static struct attribute *rmr_clt_pool_sess_attrs[] = { + &rmr_clt_del_pool_sess_attr.attr, + &rmr_clt_pool_sess_state_attr.attr, + &rmr_clt_pool_sess_member_id_attr.attr, + &rmr_clt_pool_sess_enable_attr.attr, + &rmr_clt_pool_sess_check_map_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rmr_clt_pool_sess); + +static struct kobj_type rmr_clt_pool_sess_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_clt_pool_sess_groups, +}; + +static int rmr_clt_create_session_sysfs_files(struct rmr_clt_pool_sess *pool_sess) +{ + int ret; + + ret = kobject_init_and_add(&pool_sess->kobj, &rmr_clt_pool_sess_ktype, + &pool_sess->pool->sessions_kobj, + "%s", pool_sess->sessname); + if (ret) + pr_err("Failed to create sysfs dir for session '%s': %d\n", + pool_sess->sessname, ret); + + return ret; +} + +static ssize_t rmr_clt_pool_add_sess_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo \"" + "sessname=" + " path=<[srcaddr,]dstaddr>" + " [path=<[srcaddr,]dstaddr>]\" > %s\n\n" + "addr ::= [ ip: | ip: | gid: ]\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_add_sess_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rtrs_addr paths[3]; + struct sockaddr_storage saddr[ARRAY_SIZE(paths)]; + struct sockaddr_storage daddr[ARRAY_SIZE(paths)]; + struct 
rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_sess *clt_sess; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_pool_md *clt_md; + char *sessname; + size_t path_cnt; + int ret, index, create = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + clt_pool = (struct rmr_clt_pool *)pool->priv; + + sessname = kzalloc(NAME_MAX, GFP_KERNEL); + if (unlikely(!sessname)) + return -ENOMEM; + + for (path_cnt = 0; path_cnt < ARRAY_SIZE(paths); path_cnt++) { + paths[path_cnt].src = &saddr[path_cnt]; + paths[path_cnt].dst = &daddr[path_cnt]; + } + + ret = rmr_clt_parse_add_sess_opts(buf, sessname, + pool->sync ? NULL : &create, + paths, &path_cnt, ARRAY_SIZE(paths), + "add_sess", rmr_opt_add_tokens, + pool->sync ? rmr_opt_add_sync_mandatory + : rmr_opt_add_mandatory, + pool->sync ? ARRAY_SIZE(rmr_opt_add_sync_mandatory) + : ARRAY_SIZE(rmr_opt_add_mandatory)); + if (ret) + goto free_name; + + pr_info("%s: Creating rmr client session %s in pool %s\n", __func__, sessname, + pool->poolname); + + clt_sess = find_and_get_or_create_clt_sess(sessname, paths, path_cnt); + if (IS_ERR(clt_sess)) { + pr_err("failed to find and get or create clt sess %s\n", sessname); + ret = PTR_ERR(clt_sess); + goto free_name; + } + + pool_sess = rmr_clt_add_pool_sess(pool, clt_sess, create); + if (IS_ERR(pool_sess)) { + pr_err("failed to add pool sess %s to the pool %s\n", + sessname, pool->poolname); + ret = PTR_ERR(pool_sess); + goto put_clt_sess; + } + ret = rmr_clt_create_session_sysfs_files(pool_sess); + if (ret) { + pr_err("Creating sysfs files for %s in %s failed: %d\n", + pool_sess->sessname, pool->poolname, ret); + goto destroy_sess; + } + + ret = sysfs_create_link(&pool_sess->kobj, &clt_sess->kobj, "clt_sess"); + if (ret) { + pr_err("Creating symlink for %s failed, err: %d\n", + pool_sess->sessname, ret); + rmr_clt_destroy_session_sysfs_files(pool_sess, NULL); + goto destroy_sess; + } + // ret = sysfs_create_link(&sess->kobj, sess->sess_kobj, + // RTRS_LINK_NAME); + // if (ret) { + // pr_err("Creating rtrs symlink for %s in %s failed: %d\n", + // sess->sessname, pool->poolname, ret); + // rmr_clt_destroy_session_sysfs_files(sess, NULL); + // goto destroy_sess; + // } + rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_JOINED, true); + + clt_md = &pool->pool_md; + index = rmr_pool_find_md(clt_md, pool_sess->member_id, true); + if (index < 0) { + pr_err("No space for member %u in the clt_md\n", pool_sess->member_id); + goto destroy_sess; + } + clt_md->srv_md[index].member_id = pool_sess->member_id; + clt_md->srv_md[index].mapped_size = pool->mapped_size; + + kfree(sessname); + return count; + +destroy_sess: + rmr_clt_destroy_pool_sess(pool_sess, create); +put_clt_sess: + rmr_clt_sess_put(clt_sess); +free_name: + kfree(sessname); + return ret; +} + +static struct kobj_attribute rmr_clt_pool_add_sess_attr = + __ATTR(add_sess, 0644, rmr_clt_pool_add_sess_show, + rmr_clt_pool_add_sess_store); + +static ssize_t rmr_clt_pool_leave_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_leave_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + int ret; + + pool = container_of(kobj, struct rmr_pool, kobj); + clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, 
attr->attr.name, buf); + return -EINVAL; + } + + if (refcount_read(&clt_pool->refcount) > 1) { + pr_err("%s: Pool %s is in use.\n", __func__, pool->poolname); + return -EINVAL; + } + + pr_info("clt: Deleting pool '%s'\n", pool->poolname); + + ret = rmr_clt_remove_pool_from_sysfs(pool, &attr->attr); + if (unlikely(ret)) + return ret; + + return count; +} + +static struct kobj_attribute rmr_clt_pool_leave_pool_attr = + __ATTR(leave_pool, 0644, rmr_clt_pool_leave_pool_show, + rmr_clt_pool_leave_pool_store); + +static ssize_t rmr_clt_pool_chunk_size_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (pool->chunk_size == UINT_MAX) + return scnprintf(page, PAGE_SIZE, "undefined\n"); + + return scnprintf(page, PAGE_SIZE, "%u\n", pool->chunk_size); +} + +static struct kobj_attribute rmr_clt_pool_chunk_size_attr = + __ATTR(chunk_size, 0644, rmr_clt_pool_chunk_size_show, NULL); + +static ssize_t rmr_clt_pool_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool = container_of(kobj, struct rmr_pool, kobj); + struct rmr_dirty_id_map *map; + int i, lock_idx; + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + rmr_map_dump_bitmap(map); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return 0; +} + +static ssize_t rmr_clt_pool_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + int err; + rmr_id_t id = { 0, 0 }; + int srv_id; + + pool = container_of(kobj, struct rmr_pool, kobj); + if (sscanf(buf, "%llu %llu %d\n", &id.a, &id.b, &srv_id) != 3) { + pr_err("cannot parse id.a %s\n", buf); + return -EINVAL; + } + pr_debug("add id (%llu, %llu), srv_id %d\n", id.a, id.b, srv_id); + + /* + * If given chunk number exceeds total chunks for us, ignore! 
+ */ + if (id.b > pool->no_of_chunks) + return count; + + err = rmr_clt_map_add_id(pool, srv_id, id); + if (err == -ENOMEM) { + pr_err("failed insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + } else { + pr_debug("insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + } + + return count; +} + +static struct kobj_attribute rmr_clt_pool_map_attr = + __ATTR(map, 0644, rmr_clt_pool_map_show, + rmr_clt_pool_map_store); + +static ssize_t rmr_clt_pool_map_ver_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + ssize_t written; + + pool = container_of(kobj, struct rmr_pool, kobj); + + written = scnprintf(page, PAGE_SIZE, "Map ver: %llu\n", pool->map_ver); + + return written; +} + +static struct kobj_attribute rmr_clt_pool_map_ver_attr = + __ATTR(map_version, 0444, rmr_clt_pool_map_ver_show, NULL); + +static ssize_t rmr_clt_pool_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + int ret; + + pool = container_of(kobj, struct rmr_pool, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + return -EINVAL; + } + + ret = rmr_clt_pool_try_enable(pool); + if (ret) { + pr_err("%s: pool %s rmr_clt_pool_try_enable failed with err %d\n", + attr->attr.name, pool->poolname, ret); + return ret; + } + + return count; +} + +static struct kobj_attribute rmr_clt_pool_enable_attr = + __ATTR(pool_enable, 0644, rmr_clt_pool_enable_show, + rmr_clt_pool_enable_store); + +static ssize_t rmr_clt_pool_test_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_test_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + int err; + + pool = container_of(kobj, struct rmr_pool, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, buf); + return -EINVAL; + } + + pr_info("pool %s start test map...\n", pool->poolname); + err = rmr_clt_test_map(pool, pool); + if (err) { + pr_err("pool %s, test map failed, err %d\n", + pool->poolname, err); + return err; + } + pr_info("pool %s test map done.", pool->poolname); + + return count; +} + +static struct kobj_attribute rmr_clt_pool_test_map_attr = + __ATTR(test_map, 0644, rmr_clt_pool_test_map_show, + rmr_clt_pool_test_map_store); + +static struct attribute *rmr_clt_pool_attrs[] = { + &rmr_clt_pool_add_sess_attr.attr, + &rmr_clt_pool_leave_pool_attr.attr, + &rmr_clt_pool_chunk_size_attr.attr, + &rmr_clt_pool_map_attr.attr, + &rmr_clt_pool_map_ver_attr.attr, + &rmr_clt_pool_enable_attr.attr, + &rmr_clt_pool_test_map_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rmr_clt_pool); + +static struct kobj_type rmr_clt_pool_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_clt_pool_groups, +}; + +static struct kobj_type ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +static ssize_t rmr_clt_join_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo \"" + "poolname= sync=y|Y|0|1 [chunk_size=]\" " + "> %s\n", + attr->attr.name); +} + +static int 
rmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *stats_kobj); + +static int rmr_clt_create_pool_sysfs_files(struct rmr_pool *pool) +{ + int ret; + struct rmr_clt_pool *clt_pool; + + ret = kobject_init_and_add(&pool->kobj, &rmr_clt_pool_ktype, + &rmr_pool_dev->kobj, "%s", pool->poolname); + if (ret) { + pr_err("Failed to create sysfs dir for pool '%s': %d\n", + pool->poolname, ret); + return ret; + } + + ret = kobject_init_and_add(&pool->sessions_kobj, &ktype, &pool->kobj, + "sessions"); + if (unlikely(ret)) { + pr_err("Failed to create sessions dir for pool '%s': %d\n", + pool->poolname, ret); + goto put_pool_kobj; + } + clt_pool = (struct rmr_clt_pool *)pool->priv; + ret = rmr_clt_create_stats_files(&pool->kobj, &clt_pool->stats_kobj); + if (unlikely(ret)) { + pr_err("Failed to create sysfs stats files " + "for pool '%s': %d\n", + pool->poolname, ret); + goto put_sessions_kobj; + } + + return 0; + +put_sessions_kobj: + kobject_del(&pool->sessions_kobj); + kobject_put(&pool->sessions_kobj); +put_pool_kobj: + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + + return ret; +} + +void rmr_clt_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self) +{ + struct rmr_clt_pool *clt_pool; + + if (pool->kobj.state_in_sysfs) { + clt_pool = (struct rmr_clt_pool *)pool->priv; + kobject_del(&clt_pool->stats_kobj); + kobject_put(&clt_pool->stats_kobj); + + kobject_del(&pool->sessions_kobj); + kobject_put(&pool->sessions_kobj); + if (sysfs_self) + sysfs_remove_file_self(&pool->kobj, sysfs_self); + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + } +} + +static ssize_t rmr_clt_sess_reconnect_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "echo 'path=ip:' > this_sysfs\n"); +} + +static ssize_t rmr_clt_sess_reconnect_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_clt_sess *clt_sess; + struct rmr_clt_pool_sess *pool_sess; + struct rtrs_addr paths[3]; + struct sockaddr_storage saddr[ARRAY_SIZE(paths)]; + struct sockaddr_storage daddr[ARRAY_SIZE(paths)]; + size_t path_cnt; + int err; + + + clt_sess = container_of(kobj, struct rmr_clt_sess, kobj); + + pr_info("%s: Starting manual reconnect for clt_sess %s\n", __func__, clt_sess->sessname); + + /* + * The IP of the server has changed. + * Close the old rtrs connection, parse the path IP, + * and reconnect the session + */ + for (path_cnt = 0; path_cnt < ARRAY_SIZE(paths); path_cnt++) { + paths[path_cnt].src = &saddr[path_cnt]; + paths[path_cnt].dst = &daddr[path_cnt]; + } + + err = rmr_clt_parse_add_sess_opts(buf, NULL, NULL, paths, &path_cnt, ARRAY_SIZE(paths), + "reconnect_sess", rmr_opt_reconnect_tokens, + rmr_opt_reconnect_mandatory, + ARRAY_SIZE(rmr_opt_reconnect_mandatory)); + if (err) { + pr_err("%s: failed to parse options, err=%d\n", __func__, err); + return err; + } + + if (!IS_ERR_OR_NULL(clt_sess->rtrs)) { + pr_info("close rtrs clt for session %s\n", clt_sess->sessname); + + clt_sess->state = RMR_CLT_SESS_DISCONNECTED; + + /* + * Wait for the state to be seen by rmr client + * + * The ones which are already in the rcu read section (see rmr_get_sess_iu) + * would complete its get_permit for rtrs. + * After that, rtrs_clt_close would wait for all the inflight permits to be + * returned. 
+ */ + mutex_lock(&clt_sess->lock); + list_for_each_entry(pool_sess, &clt_sess->pool_sess_list, clt_sess_entry) + synchronize_srcu(&pool_sess->pool->sess_list_srcu); + mutex_unlock(&clt_sess->lock); + + rtrs_clt_close(clt_sess->rtrs); + clt_sess->rtrs = NULL; + + msleep(RTRS_RECONNECT_BACKOFF); + } + + err = rmr_clt_reconnect_sess(clt_sess, paths, path_cnt); + if (err) { + pr_err("rmr_clt_reconnect_sess Failed\n"); + return err; + } + + pr_info("%s: Manual reconnect for clt_sess %s succeeded\n", __func__, clt_sess->sessname); + return count; +} + +static struct kobj_attribute rmr_clt_sess_reconnect_attr = + __ATTR(reconnect, 0644, rmr_clt_sess_reconnect_show, + rmr_clt_sess_reconnect_store); + +static const char *rmr_clt_sess_state_names[] = { + [0] = "invalid state", + [RMR_CLT_SESS_DISCONNECTED] = "disconnected", + [RMR_CLT_SESS_CONNECTED] = "connected" +}; + +static ssize_t rmr_clt_sess_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_clt_sess *clt_sess; + + clt_sess = container_of(kobj, struct rmr_clt_sess, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", + rmr_clt_sess_state_names[clt_sess->state]); +} + +static struct kobj_attribute rmr_clt_sess_state_attr = + __ATTR(state, 0444, rmr_clt_sess_state_show, NULL); + +static struct attribute *rmr_clt_sess_attrs[] = { + &rmr_clt_sess_reconnect_attr.attr, + &rmr_clt_sess_state_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rmr_clt_sess); + +static struct kobj_type rmr_clt_sess_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_clt_sess_groups, +}; + +int rmr_clt_create_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess) +{ + int ret; + + ret = kobject_init_and_add(&clt_sess->kobj, &rmr_clt_sess_ktype, + &rmr_sess_dev->kobj, "%s", clt_sess->sessname); + if (ret) { + pr_err("Failed to create sysfs dir for sess '%s': %d\n", + clt_sess->sessname, ret); + return ret; + } + + return 0; +} + +void rmr_clt_destroy_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess) +{ + if (clt_sess->kobj.state_in_sysfs) { + kobject_del(&clt_sess->kobj); + kobject_put(&clt_sess->kobj); + } +} + +static int rmr_clt_parse_join_opts(const char *buf, char *poolname, + bool *sync, u32 *chunk_size) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + options = strstrip(options); + strip(options); + sep_opt = options; + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_opt_join_tokens, args); + opt_mask |= (1 << token); + + switch (token) { + case RMR_JOIN_OPT_POOLNAME: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("join_pool: poolname too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(poolname, p, NAME_MAX); + kfree(p); + break; + + case RMR_JOIN_OPT_SYNC: + p = match_strdup(args); + + ret = kstrtobool(p, sync); + if (ret) { + pr_err("sync isn't a boolean: %d\n", ret); + kfree(p); + goto out; + } + + kfree(p); + break; + + case RMR_JOIN_OPT_CHUNK_SIZE: + /* + * Min supported chunk_size is PAGE_SIZE. + * The value must be power-of-2 and multiples + * of SECTOR_SIZE. 
+	 */
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			ret = kstrtou32(p, 0, chunk_size);
+			if (ret) {
+				pr_err("chunk_size isn't an integer: %d\n", ret);
+				kfree(p);
+				goto out;
+			} else if (*chunk_size < PAGE_SIZE) {
+				pr_err("Min supported chunk_size is %lu\n", PAGE_SIZE);
+				ret = -EINVAL;
+				kfree(p);
+				goto out;
+			} else if (!is_power_of_2(*chunk_size)) {
+				pr_err("chunk_size must be a power of 2\n");
+				ret = -EINVAL;
+				kfree(p);
+				goto out;
+			}
+
+			kfree(p);
+			break;
+		default:
+			pr_err("join_pool: Unknown parameter or missing value '%s'\n", p);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	for (i = 0; i < RMR_JOIN_OPT_Mandatory_count; i++) {
+		if ((opt_mask & (1 << rmr_opt_join_tokens[i].token))) {
+			ret = 0;
+		} else {
+			pr_err("join_pool: Mandatory parameter missing: %s\n",
+			       rmr_srv_opts_mandatory_names[i]);
+			ret = -EINVAL;
+			break;
+		}
+	}
+
+out:
+	kfree(options);
+	return ret;
+}
+
+static struct rmr_clt_pool *rmr_create_clt_pool(char *poolname, bool sync)
+{
+	struct rmr_clt_pool *clt_pool;
+	int ret;
+
+	clt_pool = kzalloc(sizeof(struct rmr_clt_pool), GFP_KERNEL);
+	if (unlikely(!clt_pool))
+		return ERR_PTR(-ENOMEM);
+
+	refcount_set(&clt_pool->refcount, 1);
+
+	init_waitqueue_head(&clt_pool->map_update_wq);
+	atomic_set(&clt_pool->io_freeze, 0);
+	mutex_init(&clt_pool->io_freeze_lock);
+	mutex_init(&clt_pool->clt_pool_lock);
+
+	clt_pool->recover_wq = alloc_workqueue("%s_recover_wq", 0, 0, poolname);
+	if (!clt_pool->recover_wq) {
+		ret = -ENOMEM;
+		goto free_clt_pool;
+	}
+
+	if (!sync) {
+		INIT_DELAYED_WORK(&clt_pool->recover_dwork, recover_work);
+		queue_delayed_work(clt_pool->recover_wq, &clt_pool->recover_dwork,
+				   msecs_to_jiffies(RMR_RECOVER_INTERVAL_MS));
+	}
+
+	return clt_pool;
+
+free_clt_pool:
+	kfree(clt_pool);
+	return ERR_PTR(ret);
+}
+
+static ssize_t rmr_clt_join_pool_store(struct kobject *kobj,
+				       struct kobj_attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct rmr_pool *pool;
+	struct rmr_clt_pool *clt_pool;
+	struct rmr_pool_md *clt_md;
+	char *poolname;
+	u32 chunk_size = RMR_DEFAULT_CHUNK_SIZE;
+	bool sync = false;
+	int err;
+
+	poolname = kzalloc(NAME_MAX, GFP_KERNEL);
+	if (unlikely(!poolname))
+		return -ENOMEM;
+
+	err = rmr_clt_parse_join_opts(buf, poolname, &sync, &chunk_size);
+	if (unlikely(err))
+		goto out;
+
+	strip(poolname);
+
+	pr_info("%s: Creating client pool with poolname %s, sync %d\n",
+		__func__, poolname, sync);
+
+	clt_pool = rmr_create_clt_pool(poolname, sync);
+	if (IS_ERR(clt_pool)) {
+		pr_err("%s: Clt pool creation failed\n", __func__);
+		err = PTR_ERR(clt_pool);
+		goto out;
+	}
+
+	pool = rmr_create_pool(poolname, clt_pool);
+	if (IS_ERR(pool)) {
+		err = PTR_ERR(pool);
+		goto put_clt_pool;
+	}
+
+	pool->is_clt = true;
+	pool->sync = sync;
+	clt_pool->pool = pool;
+
+	pr_debug("pool %p, clt_pool %p\n", pool, pool->priv);
+
+	err = rmr_clt_create_pool_sysfs_files(pool);
+	if (err)
+		goto put_clt_pool;
+
+	if (!sync) {
+		clt_md = &clt_pool->pool->pool_md;
+		strscpy(clt_md->poolname, poolname, NAME_MAX);
+		clt_md->group_id = pool->group_id;
+		clt_md->map_ver = 1;
+	}
+
+	kfree(poolname);
+
+	return count;
+
+put_clt_pool:
+	if (!sync)
+		cancel_delayed_work_sync(&clt_pool->recover_dwork);
+
+	rmr_put_clt_pool(clt_pool);
+out:
+	kfree(poolname);
+	return err;
+}
+
+static struct kobj_attribute rmr_clt_join_pool_attr =
+	__ATTR(join_pool, 0644,
+	       rmr_clt_join_pool_show, rmr_clt_join_pool_store);
+
+static struct attribute *default_attrs[] = {
+	&rmr_clt_join_pool_attr.attr,
+	NULL,
+};
+
+static struct attribute_group default_attr_group = {
.attrs = default_attrs, +}; + +void rmr_clt_destroy_sysfs_files(void) +{ + sysfs_remove_group(&rmr_ctl_dev->kobj, &default_attr_group); + + device_unregister(rmr_sess_dev); + device_unregister(rmr_pool_dev); + device_unregister(rmr_ctl_dev); + + class_destroy(rmr_dev_class); +} + +int rmr_clt_create_sysfs_files(void) +{ + int err; + dev_t devt = MKDEV(0, 0); + + rmr_dev_class = class_create("rmr-client"); + if (IS_ERR(rmr_dev_class)) + return PTR_ERR(rmr_dev_class); + + rmr_ctl_dev = device_create(rmr_dev_class, NULL, devt, NULL, "ctl"); + if (IS_ERR(rmr_ctl_dev)) { + err = PTR_ERR(rmr_ctl_dev); + goto cls_destroy; + } + + rmr_pool_dev = device_create(rmr_dev_class, NULL, devt, NULL, "pools"); + if (IS_ERR(rmr_pool_dev)) { + err = PTR_ERR(rmr_pool_dev); + goto ctl_destroy; + } + + rmr_sess_dev = device_create(rmr_dev_class, NULL, devt, NULL, "sessions"); + if (IS_ERR(rmr_sess_dev)) { + err = PTR_ERR(rmr_sess_dev); + goto pool_destroy; + } + + err = sysfs_create_group(&rmr_ctl_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto sess_destroy; + + return 0; + +sess_destroy: + device_unregister(rmr_sess_dev); +pool_destroy: + device_unregister(rmr_pool_dev); +ctl_destroy: + device_unregister(rmr_ctl_dev); +cls_destroy: + class_destroy(rmr_dev_class); + + return err; +} + +STAT_ATTR(struct rmr_clt_stats, read_retries, + rmr_clt_stats_read_retries_to_str, rmr_clt_reset_read_retries); + +static struct attribute *rmr_clt_stats_attrs[] = { + &read_retries_attr.attr, + NULL, +}; + +static struct attribute_group rmr_clt_stats_attr_group = { + .attrs = rmr_clt_stats_attrs, +}; + +static int rmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *stats_kobj) +{ + int ret; + + ret = kobject_init_and_add(stats_kobj, &ktype, kobj, "stats"); + if (ret) { + pr_err("Failed to init and add stats kobject, err: %d\n", + ret); + return ret; + } + + ret = sysfs_create_group(stats_kobj, &rmr_clt_stats_attr_group); + if (ret) { + pr_err("failed to create stats sysfs group, err: %d\n", + ret); + goto put_stats_obj; + } + + return 0; + +put_stats_obj: + kobject_del(stats_kobj); + kobject_put(stats_kobj); + + return ret; +} diff --git a/drivers/infiniband/ulp/rmr/rmr-clt-trace.c b/drivers/infiniband/ulp/rmr/rmr-clt-trace.c new file mode 100644 index 000000000000..2e6d9adee7c8 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-trace.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ +#include "rmr-clt.h" + +#define CREATE_TRACE_POINTS +#include "rmr-clt-trace.h" + diff --git a/drivers/infiniband/ulp/rmr/rmr-clt-trace.h b/drivers/infiniband/ulp/rmr/rmr-clt-trace.h new file mode 100644 index 000000000000..1d9a511dc763 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-trace.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rmr_clt + +#if !defined(_TRACE_RMR_CLT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RMR_CLT_H + +#include + +struct rmr_clt_pool_sess; + +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_CREATED); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_NORMAL); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_FAILED); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_RECONNECTING); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_REMOVING); + +#define show_pool_sess_state(x) \ + __print_symbolic(x, \ + { RMR_CLT_POOL_SESS_CREATED, "CREATED" }, \ + { RMR_CLT_POOL_SESS_NORMAL, "NORMAL" }, \ + { 
RMR_CLT_POOL_SESS_FAILED, "FAILED" }, \ + { RMR_CLT_POOL_SESS_RECONNECTING, "RECONNECTING" }, \ + { RMR_CLT_POOL_SESS_REMOVING, "REMOVING" }) + +TRACE_EVENT(pool_sess_change_state, + TP_PROTO(struct rmr_clt_pool_sess *pool_sess, + int newstate, + int oldstate, + int changed), + + TP_ARGS(pool_sess, newstate, oldstate, changed), + + TP_STRUCT__entry( + __string(sessname, pool_sess->sessname) + __field(int, newstate) + __field(int, oldstate) + __field(int, changed) + ), + + TP_fast_assign( + __assign_str(sessname); + __entry->newstate = newstate; + __entry->oldstate = oldstate; + __entry->changed = changed; + ), + + TP_printk("RMR-CLT: sessname=%s newstate='%s' oldstate='%s' state-changed='%d'", + __get_str(sessname), + show_pool_sess_state(__entry->newstate), + show_pool_sess_state(__entry->oldstate), + __entry->changed + ) +); + +DECLARE_EVENT_CLASS(rtrs_clt_request_class, + TP_PROTO(int dir, struct rmr_clt_sess_iu *sess_iu), + + TP_ARGS(dir, sess_iu), + + TP_STRUCT__entry( + __field(int, dir) + __array(char, sessname, NAME_MAX) + __field(void *, rtrs) + __field(void *, clt_sess) + ), + + TP_fast_assign( + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + __entry->dir = dir; + memcpy(__entry->sessname, pool_sess->sessname, NAME_MAX); + __entry->rtrs = clt_sess->rtrs; + __entry->clt_sess = clt_sess; + ), + + TP_printk("rtrs clt request: sessname=%s dir=%s rtrs=%p clt_sess=%p", + __entry->sessname, + __print_symbolic(__entry->dir, + { READ, "READ" }, + { WRITE, "WRITE" }), + __entry->rtrs, + __entry->clt_sess + ) +); + +#define DEFINE_RTRS_CLT_EVENT(name) \ +DEFINE_EVENT(rtrs_clt_request_class, name, \ + TP_PROTO(int dir, struct rmr_clt_sess_iu *sess_iu), \ + TP_ARGS(dir, sess_iu)) + +DEFINE_RTRS_CLT_EVENT(send_usr_msg); +DEFINE_RTRS_CLT_EVENT(retry_failed_read); +DEFINE_RTRS_CLT_EVENT(rmr_clt_request); +DEFINE_RTRS_CLT_EVENT(rmr_clt_cmd_with_rsp); +DEFINE_RTRS_CLT_EVENT(send_map_update); + +#endif /* _TRACE_RMR_CLT_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
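+
+/*
+ * TRACE_INCLUDE_PATH/TRACE_INCLUDE_FILE tell define_trace.h where to re-read
+ * this header when CREATE_TRACE_POINTS is defined in rmr-clt-trace.c; the "."
+ * path works because the Makefile adds -I$(src) for rmr-clt-trace.o.
+ */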
+#define TRACE_INCLUDE_FILE rmr-clt-trace +#include + diff --git a/drivers/infiniband/ulp/rmr/rmr-clt.c b/drivers/infiniband/ulp/rmr/rmr-clt.c new file mode 100644 index 000000000000..33e4b6d84b0b --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt.c @@ -0,0 +1,3866 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include + +#include "rmr-clt.h" +#include "rmr-clt-trace.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_DESCRIPTION("RMR Client"); +MODULE_VERSION(RMR_VER_STRING); +MODULE_LICENSE("GPL"); + +#define RMR_CLT_SEND_MSG_TIMEOUT_MS 30000 + +//static int send_msg_leave_pool(struct rmr_clt_pool_sess *pool_sess, bool wait); +static void retry_failed_read(struct work_struct *work); +static DEFINE_MUTEX(g_sess_lock); +static LIST_HEAD(g_sess_list); + +static bool rmr_get_clt_pool(struct rmr_clt_pool *clt_pool) +{ + pr_debug("pool %s, before inc refcount %d\n", + clt_pool->pool->poolname, refcount_read(&clt_pool->refcount)); + return refcount_inc_not_zero(&clt_pool->refcount); +} + +static struct rmr_clt_pool *rmr_find_and_get_clt_pool(const char *poolname) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + + mutex_lock(&pool_mutex); + pool = rmr_find_pool(poolname); + if (!pool) { + clt_pool = ERR_PTR(-ENOENT); + goto out; + } + + clt_pool = (struct rmr_clt_pool *)pool->priv; + if (!rmr_get_clt_pool(clt_pool)) + clt_pool = ERR_PTR(-EINVAL); + +out: + mutex_unlock(&pool_mutex); + return clt_pool; +} + +void rmr_put_clt_pool(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + might_sleep(); + + pr_debug("clt pool %s, before dec refcnt %d\n", + (pool ? 
pool->poolname : "(empty)"), refcount_read(&clt_pool->refcount)); + if (refcount_dec_and_test(&clt_pool->refcount)) { + + destroy_workqueue(clt_pool->recover_wq); + mutex_destroy(&clt_pool->io_freeze_lock); + mutex_destroy(&clt_pool->clt_pool_lock); + + if (pool) { + pr_info("clt: destroy pool %s\n", pool->poolname); + free_pool(pool); + } + + kfree(clt_pool); + } +} + +static inline int rmr_clt_sess_get(struct rmr_clt_sess *sess) +{ + return kref_get_unless_zero(&sess->kref); +} + +static void rmr_clt_sess_release(struct kref *kref) +{ + struct rmr_clt_sess *clt_sess; + + clt_sess = container_of(kref, struct rmr_clt_sess, kref); + + mutex_lock(&g_sess_lock); + + rmr_clt_destroy_clt_sess_sysfs_files(clt_sess); + + pr_info("close rtrs for session %s\n", clt_sess->sessname); + rtrs_clt_close(clt_sess->rtrs); + list_del(&clt_sess->g_list); + kfree(clt_sess); + + mutex_unlock(&g_sess_lock); +} + +void rmr_clt_sess_put(struct rmr_clt_sess *sess) +{ + kref_put(&sess->kref, rmr_clt_sess_release); +} + +static const char *rmr_get_clt_pool_state_name(enum rmr_clt_pool_state state) +{ + switch (state) { + case RMR_CLT_POOL_STATE_JOINED: return "RMR_CLT_POOL_STATE_JOINED"; + case RMR_CLT_POOL_STATE_IN_USE: return "RMR_CLT_POOL_STATE_IN_USE"; + + default: return "Unknown state"; + } +} + +static void rmr_clt_dump_state(struct rmr_clt_pool *rmr_clt_pool) +{ + char current_state[1024] = {0}; + int i, n = 0, len = sizeof(current_state); + + for (i = 0; i < RMR_CLT_POOL_STATE_MAX; i++) { + enum rmr_clt_pool_state state = (enum rmr_clt_pool_state)i; + + if (test_bit(state, &rmr_clt_pool->state)) + n += scnprintf(current_state + n, len - n, "%s, ", + rmr_get_clt_pool_state_name(state)); + } + + pr_info("%s: RMR client pool current state: %s\n", __func__, current_state); +} + +/** + * rmr_clt_change_pool_state() - Change clt pool state + * + * @clt_pool: Client pool whose state is to be changed + * @new_state: New state to set + * @set: Informs whether to set/unset the given new+state + */ +void rmr_clt_change_pool_state(struct rmr_clt_pool *rmr_clt_pool, + enum rmr_clt_pool_state new_state, bool set) +{ + if (set) { + set_bit(new_state, &rmr_clt_pool->state); + pr_info("%s: state %s set\n", + __func__, rmr_get_clt_pool_state_name(new_state)); + } else { + clear_bit(new_state, &rmr_clt_pool->state); + pr_info("%s: state %s cleared\n", + __func__, rmr_get_clt_pool_state_name(new_state)); + } + + rmr_clt_dump_state(rmr_clt_pool); +} + +/** + * send_map_get_version() - Send a map get version command + * + * @pool_sess: pool session where to send the message + * + * Description: + * Ask the storage node to send back its map_version. 
+ * + * Return: + * 0 on success + * Negative error in case of failure + */ + +/** + * rmr_clt_md_update() - Update the client (non-sync) pool metadata + */ +static void rmr_clt_md_update(struct rmr_pool *pool) +{ + struct rmr_pool_md *clt_md = &pool->pool_md; + + if (pool->sync) + return; + + clt_md->map_ver = pool->map_ver; +} + +#if 0 +static int send_map_set_version(struct rmr_clt_pool_sess *pool_sess, u64 ver) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MAP_SET_VER; + msg.set_map_ver_cmd.map_ver = ver; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + } + return err; +} + +/** + * rmr_clt_coordinate_discard() - Coordinate the discard_entries flag + * + * @pool: the client pool + * @member_id: member id of the source node + * + * Description: + * This function sends discard request to all normal pool sessions of the pool. + * It is to solve the case where network is partitioned between the server nodes + * and only the client connects those partitions. Any request that failed on a session + * would fail this call. + * + * TODO: To address the network partitions (including the client), wait for consistency + * protocols. + * + * Return: + * 0 on success + * Negative error in case of failure + * + * Pre-requisite: rcu read lock should be held by caller + */ +static int rmr_clt_coordinate_discard(struct rmr_pool *pool, u8 cmd_type, u8 member_id) +{ + struct rmr_clt_pool_sess *pool_sess; + int err = 0; + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + /* + * If the pool session state is not normal, the dirty maps of the that pool is + * likely corrupted. Don't bother to send the discards. + */ + if (atomic_read(&pool_sess->state) != RMR_CLT_POOL_SESS_NORMAL) + continue; + + pr_info("%s: send discards to (pool_sess %s: %d) with member_id %u\n", + __func__, pool_sess->sessname, pool_sess->member_id, member_id); + + /* Send discard request to the pool session. */ + err = send_discard(pool_sess, cmd_type, member_id); + if (err) { + pr_err("%s: Failed discard request on sess %s for member_id %u\n", + __func__, pool_sess->sessname, member_id); + return err; + } + } + + return err; +} + +static int rmr_clt_handle_discard(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_dirty_id_map *map; + int idx, ret, err = 0; + u64 map_ver; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + /* Find out if there is pending discard requests on the server side */ + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + ret = send_map_get_version(pool_sess, &map_ver); + if (ret) + continue; + + /* + * When disk replacement appears at the storage node, pserver will set the all + * map entries of that server to dirty. 
+		 */
+		if (RMR_STORE_IS_REPLACE(map_ver)) {
+			map = rmr_pool_find_map(pool, pool_sess->member_id);
+			if (!map) {
+				pr_err("The clt pool %s cannot find map for member_id %u\n",
+				       pool->poolname, pool_sess->member_id);
+				err = -EINVAL;
+				goto out;
+			}
+
+			rmr_map_set_dirty_all(map, MAP_NO_FILTER);
+
+			/* Check any normal pool session failed to receive discards */
+			err = rmr_clt_coordinate_discard(pool, RMR_CMD_SEND_DISCARD,
+							 pool_sess->member_id);
+			if (err) {
+				pr_err("%s: Failed to coordinate discard state for member_id %u\n",
+				       __func__, pool_sess->member_id);
+				goto out;
+			}
+
+			/* update the map version */
+			err = send_map_set_version(pool_sess, RMR_STORE_UNSET_REPLACE(map_ver));
+			if (err) {
+				pr_err("%s: Failed to reset map version for %s\n",
+				       __func__, pool_sess->sessname);
+				goto out;
+			}
+
+			/* Everyone knows about the discarded entries now. */
+			err = rmr_clt_coordinate_discard(pool, RMR_CMD_DISCARD_CLEAR_FLAG,
+							 pool_sess->member_id);
+			if (err) {
+				pr_err("%s: Failed to clear discard flag for S%u\n",
+				       __func__, pool_sess->member_id);
+				goto out;
+			}
+		}
+	}
+
+out:
+	srcu_read_unlock(&pool->sess_list_srcu, idx);
+	return err;
+}
+#endif
+
+static int rmr_clt_start_send_md(struct rmr_pool *pool);
+
+/**
+ * recover_work() - Recovery work which performs a number of tasks at regular intervals
+ *
+ * @work: The work struct holding the data
+ *
+ * Description:
+ * Every client pool has its own recovery work. It performs the following 3 tasks:
+ * 1) Pool sessions in NORMAL state that have dirty map entries associated with
+ * them are checked; if the entries have been cleared on the particular storage
+ * node, they are deleted on the pserver side as well.
+ * 2) If the pool session state is FAILED, but the network state (clt session) is connected,
+ * then a store check message is sent to the pool session. The storage node will confirm
+ * with the backend whether IOs can be sent or not.
+ * 3) Send the client pool metadata to the servers.
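+ *
+ * The work re-arms itself at the end of every run, so each pool keeps polling
+ * at RMR_RECOVER_INTERVAL_MS intervals for as long as the pool exists. Note
+ * that sessions in maintenance mode skip the map check of task 1 and only go
+ * through the store-check path of task 2.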
+ */ +void recover_work(struct work_struct *work) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_pool_md *clt_md; + int index, lock_idx = 0; + + clt_pool = container_of(to_delayed_work(work), struct rmr_clt_pool, recover_dwork); + pool = clt_pool->pool; + + pr_debug("check map for pool %s started...\n", pool->poolname); + + lock_idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + pr_debug("pool %s sess %s sess->member_id %d sess->state %d\n", + pool->poolname, pool_sess->sessname, + pool_sess->member_id, atomic_read(&pool_sess->state)); + + clt_md = &pool->pool_md; + index = rmr_pool_find_md(clt_md, pool_sess->member_id, false); + if (index < 0) { + pr_debug("%s failed to find pool_sess %u\n", + __func__, pool_sess->member_id); + continue; + } + if (pool_sess->maintenance_mode) + goto pool_sess_state_check; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL) { + struct rmr_dirty_id_map *map; + + map = rmr_pool_find_map(pool, pool_sess->member_id); + if (!map) { + pr_debug("pool %s no map found for member_id %u\n", + pool->poolname, pool_sess->member_id); + continue; + } + if (!rmr_map_empty(map)) { + pr_debug("pool %s sess %s map is not empty, check stg map...\n", + pool->poolname, pool_sess->sessname); + send_map_check(pool_sess); + } + } +pool_sess_state_check: + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED && + clt_sess->state == RMR_CLT_SESS_CONNECTED) { + pr_debug("pool %s sess %s try pool sess recover\n", + pool->poolname, pool_sess->sessname); + send_store_check(pool_sess); + } + } + srcu_read_unlock(&pool->sess_list_srcu, lock_idx); + + rmr_clt_md_update(pool); + /* If the send fails, wait for the next update. */ + rmr_clt_start_send_md(pool); + + pr_debug("check map for pool %s done. 
schedule next one.\n", pool->poolname); + + queue_delayed_work(clt_pool->recover_wq, &clt_pool->recover_dwork, + msecs_to_jiffies(RMR_RECOVER_INTERVAL_MS)); +} + +static int init_clt_pool(struct rmr_clt_pool *clt_pool) +{ + int err; + + clt_pool->pcpu_sess = alloc_percpu(typeof(*clt_pool->pcpu_sess)); + if (unlikely(!clt_pool->pcpu_sess)) { + err = -ENOMEM; + goto out_err; + } + + return 0; + +out_err: + return err; +} + +static void destroy_clt_pool(struct rmr_pool *pool) +{ + int i; + struct rmr_clt_pool *clt_pool; + struct rmr_dirty_id_map *map; + struct rmr_dirty_id_map *maplist = NULL; + + clt_pool = (struct rmr_clt_pool *)pool->priv; + if (clt_pool) { + free_percpu(clt_pool->pcpu_sess); + clt_pool->pcpu_sess = NULL; + } + + mutex_lock(&pool->maps_lock); + for (i = 0; i < pool->maps_cnt; i++) { + map = rcu_dereference_protected(pool->maps[i], + lockdep_is_held(&pool->maps_lock)); + if (WARN_ON(!map)) + continue; + rcu_assign_pointer(pool->maps[i], NULL); + map->next = maplist; + maplist = map; + } + pool->maps_cnt = 0; + + if (maplist) + synchronize_srcu(&pool->map_srcu); + + mutex_unlock(&pool->maps_lock); + + rmr_maplist_destroy(maplist); +} + +static void rmr_put_sess_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu); + +static struct rmr_iu * +rmr_alloc_iu(void) +{ + struct rmr_iu *iu; + + iu = kzalloc(sizeof(*iu), GFP_KERNEL); + if (!iu) + return NULL; + INIT_LIST_HEAD(&iu->sess_list); + iu->num_sessions = 0; + refcount_set(&iu->ref, 1); + return iu; +} + +void rmr_get_iu(struct rmr_iu *iu) +{ + refcount_inc(&iu->ref); +} + +void rmr_put_iu(struct rmr_iu *iu) +{ + struct rmr_clt_sess_iu *sess_iu, *tmp; + + if (refcount_dec_and_test(&iu->ref)) { + list_for_each_entry_safe(sess_iu, tmp, + &iu->sess_list, entry) { + if (!list_empty(&sess_iu->entry)) + list_del_init(&sess_iu->entry); + rmr_put_sess_iu(sess_iu->pool_sess, sess_iu); + } + kfree(iu); + } +} + +void rmr_clt_free_pool_sess(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_clt_pool *clt_pool; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + clt_pool = (struct rmr_clt_pool *)pool_sess->pool->priv; + + if (!list_empty(&pool_sess->clt_sess_entry)) { + mutex_lock(&clt_sess->lock); + list_del(&pool_sess->clt_sess_entry); + mutex_unlock(&clt_sess->lock); + } + + pr_info("before free pool_sess %s, clt_sess refcount=%d\n", + pool_sess->sessname, kref_read(&clt_sess->kref)); + + kfree(pool_sess); +} + +void rmr_clt_put_pool(struct rmr_pool *pool) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + + rmr_put_clt_pool(clt_pool); +} +EXPORT_SYMBOL(rmr_clt_put_pool); + +/** + * rmr_clt_open() - Open a client for use + * + * @priv: private data for the user + * @link_ev: holds the link event callback + * @poolname: name of the pool to open + * + * Description: + * Open an RMR pool for the user to use. The rmr pool must have at least one session. + * A single pool can be opened and used by only a single user. + * + * Return: + * Returns pointer to the rmr pool opened. 
+ */
+struct rmr_pool *rmr_clt_open(void *priv, rmr_clt_ev_fn *link_ev, const char *poolname)
+{
+	struct rmr_clt_pool *clt_pool;
+	int err;
+
+	clt_pool = rmr_find_and_get_clt_pool(poolname);
+	if (IS_ERR(clt_pool)) {
+		pr_err("RMR client pool '%s' is not found\n", poolname);
+		err = PTR_ERR(clt_pool);
+		goto err_out;
+	}
+
+	if (!mutex_trylock(&clt_pool->clt_pool_lock)) {
+		pr_err("RMR client pool '%s' is busy, recovery in progress\n", poolname);
+		err = -EBUSY;
+		/* The lock was not taken on this path, do not unlock it */
+		goto put_ref;
+	}
+	if (test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state)) {
+		pr_err("RMR client pool '%s' is already in use\n", poolname);
+		err = -EBUSY;
+		goto put_err;
+	}
+
+	if (!test_bit(RMR_CLT_POOL_STATE_JOINED, &clt_pool->state)) {
+		pr_err("RMR client pool '%s' has no sessions open\n", poolname);
+		err = -ENOENT;
+		goto put_err;
+	}
+
+	clt_pool->link_ev = link_ev;
+	clt_pool->priv = priv;
+
+	err = init_clt_pool(clt_pool);
+	if (unlikely(err)) {
+		pr_err("RMR client pool '%s' failed to initialize: %d\n", poolname, err);
+		goto put_err;
+	}
+
+	rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_IN_USE, true);
+
+	mutex_unlock(&clt_pool->clt_pool_lock);
+	return clt_pool->pool;
+
+put_err:
+	mutex_unlock(&clt_pool->clt_pool_lock);
+put_ref:
+	rmr_put_clt_pool(clt_pool);
+err_out:
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(rmr_clt_open);
+
+void rmr_clt_close(struct rmr_pool *pool)
+{
+	struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv;
+
+	mutex_lock(&clt_pool->clt_pool_lock);
+	rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_IN_USE, false);
+
+	pr_info("%s: RMR client close called for pool %s\n", __func__, pool->poolname);
+
+	/*
+	 * Freeze I/O: degrade the ref count to the usual model with a single
+	 * shared atomic_t counter.
+	 */
+	rmr_clt_pool_io_freeze(clt_pool);
+	pr_info("pool %s wait for inflight io to complete\n", clt_pool->pool->poolname);
+
+	/* Wait for all in-flight I/O to complete */
+	rmr_clt_pool_io_wait_complete(clt_pool);
+
+	pr_info("pool %s inflight io completed\n", clt_pool->pool->poolname);
+
+	clt_pool->link_ev = NULL;
+	clt_pool->priv = NULL;
+
+	/* Unfreeze and resurrect */
+	rmr_clt_pool_io_unfreeze(clt_pool);
+
+	mutex_unlock(&clt_pool->clt_pool_lock);
+
+	rmr_put_clt_pool(clt_pool);
+}
+EXPORT_SYMBOL(rmr_clt_close);
+
+void *rmr_clt_get_priv(struct rmr_pool *pool)
+{
+	struct rmr_clt_pool *clt_pool;
+
+	clt_pool = (struct rmr_clt_pool *)pool->priv;
+	if (clt_pool)
+		return clt_pool->priv;
+
+	return NULL;
+}
+EXPORT_SYMBOL(rmr_clt_get_priv);
+
+static struct rmr_clt_sess *alloc_clt_sess(const char *sessname)
+{
+	struct rmr_clt_sess *sess;
+
+	sess = kzalloc_node(sizeof(*sess), GFP_KERNEL, NUMA_NO_NODE);
+	if (unlikely(!sess)) {
+		pr_err("Failed to create session %s, allocating session struct failed\n",
+		       sessname);
+		return ERR_PTR(-ENOMEM);
+	}
+	strscpy(sess->sessname, sessname, sizeof(sess->sessname));
+	mutex_init(&sess->lock);
+	INIT_LIST_HEAD(&sess->pool_sess_list);
+	kref_init(&sess->kref);
+	sess->state = RMR_CLT_SESS_DISCONNECTED;
+
+	return sess;
+}
+
+static struct rmr_clt_pool_sess *alloc_pool_sess(struct rmr_pool *pool,
+						 struct rmr_clt_sess *clt_sess)
+{
+	struct rmr_clt_pool_sess *pool_sess;
+
+	pool_sess = kzalloc_node(sizeof(*pool_sess), GFP_KERNEL, NUMA_NO_NODE);
+	if (unlikely(!pool_sess)) {
+		pr_err("Failed to allocate session for pool %s\n", pool->poolname);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	strscpy(pool_sess->sessname, clt_sess->sessname, NAME_MAX);
+	INIT_LIST_HEAD(&pool_sess->entry);
+	INIT_LIST_HEAD(&pool_sess->clt_sess_entry);
+	pool_sess->pool = pool;
+ pool_sess->clt_sess = clt_sess; + pool_sess->maintenance_mode = false; + atomic_set(&pool_sess->state, RMR_CLT_POOL_SESS_CREATED); + + return pool_sess; +} + +/* + * Checks if the session already exists (search by session name) + * Returns TRUE if session found, FALSE otherwise. + */ +static bool __find_sess_by_name(struct rmr_pool *pool, const char *sessname) +{ + struct rmr_clt_pool_sess *pool_sess; + int idx; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (!strcmp(sessname, pool_sess->sessname)) { + srcu_read_unlock(&pool->sess_list_srcu, idx); + return true; + } + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return false; +} + +/** + * __find_sess_by_member_id() - Find and return pool_sess with a given member_id + * + * @pool: RMR pool to search pool_sess in + * @member_id: member ID to search + * + * Return: + * Pointer to rmr_clt_pool_sess on success + * NULL if no pool session exists with the given member_id + * + * Context: + * The caller should hold srcu_read_lock + */ +static struct rmr_clt_pool_sess *__find_sess_by_member_id(struct rmr_pool *pool, u8 member_id) +{ + struct rmr_clt_pool_sess *pool_sess = NULL, *tmp_pool_sess; + + list_for_each_entry_srcu(tmp_pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (member_id == tmp_pool_sess->member_id) { + pool_sess = tmp_pool_sess; + break; + } + } + + return pool_sess; +} + +/** + * pool_sess_change_state() - Change pool session state + * + * @pool_sess: Pool session whose state is to be changed + * @newstate: New state which is to be set + * + * Description: + * Pool session states decide a number of crucial things. + * Where the IOs can be sent, which node has an outdated map, etc. + * As such, transitioning of states are important and is tightly controlled through + * this function. All state transitions should happen through this function. + * + * Return: + * True in case the state was changed + * False in case the state was not changed + */ +bool pool_sess_change_state(struct rmr_clt_pool_sess *pool_sess, + enum rmr_clt_pool_sess_state newstate) +{ + bool changed = false; + int oldstate = atomic_read(&pool_sess->state); + + if (WARN_ON(oldstate == RMR_CLT_POOL_SESS_REMOVING)) + goto out; + + switch (newstate) { + case RMR_CLT_POOL_SESS_NORMAL: + if (pool_sess->maintenance_mode) + break; + /* + * Non-sync sessions must pass through RECONNECTING before + * reaching NORMAL so that a map update can take place first. + * Sync sessions skip RECONNECTING entirely and go FAILED→NORMAL + * directly. + */ + if (!rmr_clt_sess_is_sync(pool_sess)) { + if (WARN_ON(oldstate == RMR_CLT_POOL_SESS_FAILED)) + break; + if (oldstate == RMR_CLT_POOL_SESS_CREATED || + oldstate == RMR_CLT_POOL_SESS_RECONNECTING) + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + } else { + if (oldstate == RMR_CLT_POOL_SESS_CREATED || + oldstate == RMR_CLT_POOL_SESS_FAILED || + oldstate == RMR_CLT_POOL_SESS_RECONNECTING) + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + } + break; + case RMR_CLT_POOL_SESS_RECONNECTING: + /* + * Sync sessions never need a map update and must not enter + * RECONNECTING. 
+ */ + if (WARN_ON(rmr_clt_sess_is_sync(pool_sess) && + !pool_sess->maintenance_mode)) + break; + if (oldstate == RMR_CLT_POOL_SESS_FAILED || + oldstate == RMR_CLT_POOL_SESS_CREATED || + (oldstate == RMR_CLT_POOL_SESS_NORMAL && pool_sess->maintenance_mode)) + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + break; + case RMR_CLT_POOL_SESS_FAILED: + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + /* + * TODO + * We should really be updating map version with the state, + * Or before it. + */ + if (changed && oldstate != RMR_CLT_POOL_SESS_FAILED) + pool_sess->pool->map_ver++; + break; + case RMR_CLT_POOL_SESS_REMOVING: + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + break; + default: + pr_err("%s: Unknown state %d\n", __func__, newstate); + break; + } + + if (changed && !rmr_clt_sess_is_sync(pool_sess)) { + if (newstate == RMR_CLT_POOL_SESS_NORMAL) { + /* + * Entering NORMAL: this session is no longer the last + * authoritative holder of the dirty map. + */ + pool_sess->was_last_authoritative = false; + atomic_inc(&pool_sess->pool->normal_count); + } else if (oldstate == RMR_CLT_POOL_SESS_NORMAL) { + /* + * Leaving NORMAL via FAILED or maintenance-mode + * RECONNECTING: decrement the count of NORMAL sessions. + * If this was the last one, mark it as authoritative so + * that recovery can enable it directly (without a map + * update) when it comes back — its dirty map was the last + * complete one the pool had. + * + * REMOVING is not marked authoritative: a deliberate + * removal (delete or disassemble) is not an uncontrolled + * failure. On reassembly the leg goes through the full + * map update path and does not need the direct-enable + * shortcut. + */ + if (newstate == RMR_CLT_POOL_SESS_FAILED || + (newstate == RMR_CLT_POOL_SESS_RECONNECTING && + pool_sess->maintenance_mode)) { + if (atomic_dec_and_test(&pool_sess->pool->normal_count)) + pool_sess->was_last_authoritative = true; + } else { + /* REMOVING */ + atomic_dec(&pool_sess->pool->normal_count); + } + } + } + +out: + + trace_pool_sess_change_state(pool_sess, newstate, oldstate, changed); + + return changed; +} + +void rmr_clt_pool_io_freeze(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + mutex_lock(&clt_pool->io_freeze_lock); + if (atomic_inc_return(&clt_pool->io_freeze) == 1) + percpu_ref_kill(&pool->ids_inflight_ref); + mutex_unlock(&clt_pool->io_freeze_lock); +} + +void rmr_clt_pool_io_unfreeze(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + mutex_lock(&clt_pool->io_freeze_lock); + if (atomic_dec_return(&clt_pool->io_freeze) == 0) { + reinit_completion(&pool->complete_done); + percpu_ref_reinit(&pool->ids_inflight_ref); + + wake_up_all(&clt_pool->map_update_wq); + } + mutex_unlock(&clt_pool->io_freeze_lock); +} + +void rmr_clt_pool_io_wait_complete(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + wait_for_completion(&pool->complete_done); +} + +//am: what kind of locking is rquired for that ? 
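+/*
+ * Locking note: clt_sess->lock protects clt_sess->pool_sess_list. It is taken
+ * here, in resend_join_pool() and in rmr_clt_free_pool_sess() when a pool
+ * session is unlinked, which appears to be the answer to the question above.
+ */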
+static void set_pool_sess_states_to_failed(struct rmr_clt_sess *clt_sess) +{ + struct rmr_clt_pool_sess *pool_sess; + + mutex_lock(&clt_sess->lock); + + list_for_each_entry(pool_sess, &clt_sess->pool_sess_list, clt_sess_entry) { + if (pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED)) + pr_info("set sess %s to failed due to link_ev\n", pool_sess->sessname); + } + mutex_unlock(&clt_sess->lock); +} + +static void rmr_clt_link_ev(void *priv, enum rtrs_clt_link_ev ev) +{ + struct rmr_clt_sess *clt_sess = priv; + + switch (ev) { + case RTRS_CLT_LINK_EV_DISCONNECTED: + pr_info("Rtrs link ev disconnected: session %s\n", + clt_sess->sessname); + clt_sess->state = RMR_CLT_SESS_DISCONNECTED; + set_pool_sess_states_to_failed(clt_sess); + break; + case RTRS_CLT_LINK_EV_RECONNECTED: + pr_info("Rtrs link ev reconnected: session %s\n", + clt_sess->sessname); + clt_sess->state = RMR_CLT_SESS_CONNECTED; + resend_join_pool(clt_sess); + break; + default: + pr_err("Unknown rtrs link event received (%d), " + "session: %s\n", + ev, clt_sess->sessname); + } +} + +/* + * Gets an iu for I/O operations. + * + * Context: + * The call to this function should be protected with an srcu_read_lock. + */ +static struct rmr_clt_sess_iu *rmr_get_sess_iu(struct rmr_clt_pool_sess *pool_sess, + enum rtrs_clt_con_type con_type, + enum wait_type wait) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct rmr_clt_sess_iu *sess_iu; + struct rtrs_permit *permit; + + WARN_ON(!srcu_read_lock_held(&pool->sess_list_srcu)); + + if (clt_sess->state == RMR_CLT_SESS_DISCONNECTED) { + pr_info("The rmr client session %s state is disconnected\n", clt_sess->sessname); + return NULL; + } + + sess_iu = kzalloc(sizeof(*sess_iu), GFP_KERNEL); + if (!sess_iu) + return NULL; + + permit = rtrs_clt_get_permit(clt_sess->rtrs, con_type, wait); + if (unlikely(!permit)) { + kfree(sess_iu); + return NULL; + } + + INIT_LIST_HEAD(&sess_iu->entry); + sess_iu->permit = permit; + sess_iu->pool_sess = pool_sess; + + return sess_iu; +} + +/* + * Gets the iu for user messages. + * It will be reference counted initialized with refcount + */ +static inline struct rmr_clt_sess_iu *rmr_msg_get_iu(struct rmr_clt_pool_sess *pool_sess, + enum rtrs_clt_con_type con_type, + enum wait_type wait, int refcount) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_sess_iu *sess_iu; + int idx; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + sess_iu = rmr_get_sess_iu(pool_sess, con_type, wait); + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (unlikely(!sess_iu)) + return NULL; + + init_waitqueue_head(&sess_iu->comp.wait); + sess_iu->comp.errno = INT_MAX; + atomic_set(&sess_iu->refcount, refcount); + + return sess_iu; +} + +/* + * reference counted put, refcount has to be initialized. + */ +void rmr_msg_put_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu) +{ + if (atomic_dec_and_test(&sess_iu->refcount)) { + rtrs_clt_put_permit(pool_sess->clt_sess->rtrs, sess_iu->permit); + kfree(sess_iu); + } +} + +/* + * put the sess_iu without reference counting. + * I/O does not need reference counting. 
+ */ +static void rmr_put_sess_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu) +{ + rtrs_clt_put_permit(pool_sess->clt_sess->rtrs, sess_iu->permit); + kfree(sess_iu); +} + +void wake_up_iu_comp(struct rmr_clt_sess_iu *sess_iu) +{ + sess_iu->comp.errno = sess_iu->errno; + wake_up(&sess_iu->comp.wait); +} + +void msg_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + + sess_iu->errno = errno; + /* just schedule the work because kfree must not be done here */ + schedule_work(&sess_iu->work); +} + +static int send_usr_msg(struct rtrs_clt_sess *rtrs, int dir, + struct rmr_clt_sess_iu *sess_iu, + struct kvec *vec, size_t nr, size_t len, + struct scatterlist *sg, unsigned int sg_len, + void (*conf)(struct work_struct *work), + int *errno, enum rmr_wait_type wait) +{ + int err; + struct rtrs_clt_req_ops req_ops; + + INIT_WORK(&sess_iu->work, conf); + req_ops = (struct rtrs_clt_req_ops){ + .priv = sess_iu, + .conf_fn = msg_conf, + }; + + trace_send_usr_msg(dir, sess_iu); + + err = rtrs_clt_request(dir, &req_ops, rtrs, sess_iu->permit, + vec, nr, len, sg, sg_len); + if (!err && wait) { + wait_event_timeout(sess_iu->comp.wait, + sess_iu->comp.errno != INT_MAX, + msecs_to_jiffies(RMR_CLT_SEND_MSG_TIMEOUT_MS)); + *errno = sess_iu->comp.errno; + if (*errno == INT_MAX) + *errno = -ETIMEDOUT; + } else { + *errno = 0; + } + return err; +} + +static int send_msg_rejoin_pool(struct rmr_clt_pool_sess *pool_sess, bool wait) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + int ret; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_REJOIN_POOL; + + msg.join_pool_cmd.rejoin = true; + msg.join_pool_cmd.chunk_size = pool->chunk_size; + msg.join_pool_cmd.queue_depth = clt_sess->queue_depth; + + ret = rmr_clt_pool_send_cmd(pool_sess, &msg, wait); + if (ret) + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + + return ret; +} + +static int send_msg_join_pool(struct rmr_clt_pool_sess *pool_sess, bool create, + bool dirty, bool wait) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool_member_info *mem_info; + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_pool_sess *t_pool_sess; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct rmr_dirty_id_map *map; + int ret, i = 0, idx; + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_JOIN_POOL; + + msg.join_pool_cmd.queue_depth = clt_sess->queue_depth; + msg.join_pool_cmd.chunk_size = pool->chunk_size; + msg.join_pool_cmd.rejoin = false; + + if (!msg.sync) { + msg.join_pool_cmd.create = create; + msg.join_pool_cmd.dirty = dirty; + mem_info = &(msg.join_pool_cmd.mem_info); + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(t_pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (t_pool_sess->member_id == pool_sess->member_id) + continue; + + map = rmr_pool_find_map(pool, t_pool_sess->member_id); + if (!map) { + pr_err("%s: Map with member_id %u does not exist\n", + __func__, t_pool_sess->member_id); + srcu_read_unlock(&pool->sess_list_srcu, idx); + return -ENOENT; + } + + mem_info->p_mem_info[i].member_id = t_pool_sess->member_id; + /* Only relevant for create */ + if (create) + mem_info->p_mem_info[i].c_dirty = !rmr_map_empty(map); + i++; + if (WARN_ON(i >= RMR_POOL_MAX_SESS)) + break; + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + mem_info->no_of_stor = i; + } + + ret = 
rmr_clt_pool_send_cmd(pool_sess, &msg, wait); + if (ret) + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + + return ret; +} + +int send_msg_leave_pool(struct rmr_clt_pool_sess *pool_sess, bool delete, bool wait) +{ + struct rmr_msg_pool_cmd msg = {}; + int ret; + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_LEAVE_POOL; + + msg.leave_pool_cmd.member_id = pool_sess->member_id; + msg.leave_pool_cmd.delete = delete; + + ret = rmr_clt_pool_send_cmd(pool_sess, &msg, wait); + if (ret) + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + + return ret; +} + +bool rmr_clt_sess_is_sync(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_pool *pool = pool_sess->pool; + bool ret = false; + + if (!pool) { + WARN(1, "for sess %s pool is not assigned\n", + pool_sess->clt_sess->sessname); + return false; + } + + if (pool->sync) { + pr_debug("sess %s pool %s is sync (internal) clt sess\n", + pool_sess->clt_sess->sessname, pool->poolname); + ret = true; + } else { + pr_debug("sess %s pool %s is not sync clt sess\n", + pool_sess->clt_sess->sessname, pool->poolname); + ret = false; + } + return ret; +} + +/** + * rmr_clt_send_pool_info() - Notify all other pool members of a membership change + * + * @pool_sess: The pool session of the member whose state is changing. + * @op: Operation: %RMR_POOL_INFO_OP_ADD or %RMR_POOL_INFO_OP_REMOVE. + * @mode: For ADD: %RMR_POOL_INFO_MODE_CREATE or %RMR_POOL_INFO_MODE_ASSEMBLE. + * For REMOVE: %RMR_POOL_INFO_MODE_DELETE or %RMR_POOL_INFO_MODE_DISASSEMBLE. + * @dirty: When op is ADD and mode is CREATE, indicates that @pool_sess + * has outstanding dirty data that the receiving node must track. + * + * Sends a POOL_INFO command to every other non-FAILED, non-REMOVING + * member in the pool so they can update their view of pool membership. + * + * Return: + * 0 on success, negative error code on failure. + * + * Context: + * This function blocks while sending the command. + */ +static int rmr_clt_send_pool_info(struct rmr_clt_pool_sess *pool_sess, + enum rmr_pool_info_op op, enum rmr_pool_info_mode mode, + bool dirty) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_pool_sess *t_pool_sess; + struct rmr_msg_pool_cmd msg = {}; + int idx, ret = 0; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_POOL_INFO; + + msg.pool_info_cmd.member_id = pool_sess->member_id; + msg.pool_info_cmd.operation = op; + msg.pool_info_cmd.mode = mode; + + if (op == RMR_POOL_INFO_OP_ADD && mode == RMR_POOL_INFO_MODE_CREATE && dirty) + msg.pool_info_cmd.dirty = dirty; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(t_pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + enum rmr_clt_pool_sess_state state; + + /* + * No need to send the info message to the member who just joined. + */ + if (t_pool_sess->member_id == pool_sess->member_id) + continue; + + state = atomic_read(&t_pool_sess->state); + /* + * TODO: For FAILED session we have to store the missed + * msgs and send them later when the session recovers. 
+ */ + if (state == RMR_CLT_POOL_SESS_FAILED || + state == RMR_CLT_POOL_SESS_REMOVING) + continue; + + ret = rmr_clt_pool_send_cmd(t_pool_sess, &msg, WAIT); + if (ret) { + pr_err("%s failed with err %d\n", rmr_get_cmd_name(msg.cmd_type), ret); + break; + } + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return ret; +} + +void resend_join_pool(struct rmr_clt_sess *clt_sess) +{ + struct rmr_clt_pool_sess *pool_sess; + + mutex_lock(&clt_sess->lock); + + list_for_each_entry(pool_sess, &clt_sess->pool_sess_list, clt_sess_entry) { + int err; + + err = send_msg_rejoin_pool(pool_sess, WAIT); + if (err) { + pr_err("send_msg_rejoin_pool failed for sess %s error %d\n", + pool_sess->sessname, err); + } + } + mutex_unlock(&clt_sess->lock); + + return; +} + +int send_msg_enable_pool(struct rmr_clt_pool_sess *pool_sess, bool enable) +{ + struct rmr_msg_pool_cmd msg = {}; + int ret; + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_ENABLE_POOL; + + msg.enable_pool_cmd.enable = enable; + + ret = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (ret) { + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + goto err; + } + +err: + return ret; +} + +static const char *rmr_clt_pool_sess_state_names[] = { + [0] = "invalid state", + [RMR_CLT_POOL_SESS_CREATED] = "created", + [RMR_CLT_POOL_SESS_NORMAL] = "normal", + [RMR_CLT_POOL_SESS_FAILED] = "failed", + [RMR_CLT_POOL_SESS_RECONNECTING] = "reconnecting", + [RMR_CLT_POOL_SESS_REMOVING] = "removing" +}; + +const char *rmr_clt_sess_state_str(enum rmr_clt_pool_sess_state state) +{ + return rmr_clt_pool_sess_state_names[state]; +} + +int rmr_clt_reconnect_sess(struct rmr_clt_sess *clt_sess, + const struct rtrs_addr *paths, + size_t path_cnt) +{ + struct rtrs_attrs attrs; + struct rtrs_clt_ops rtrs_ops; + int err = 0; + + rtrs_ops = (struct rtrs_clt_ops){ + .priv = clt_sess, + .link_ev = rmr_clt_link_ev, + }; + + clt_sess->rtrs = rtrs_clt_open(&rtrs_ops, clt_sess->sessname, + paths, path_cnt, RTRS_PORT, + 0, /* Do not use pdu of rtrs */ + RECONNECT_DELAY, + MAX_RECONNECTS, 0); + if (IS_ERR(clt_sess->rtrs)) { + err = PTR_ERR(clt_sess->rtrs); + pr_err("rtrs_clt_open error %d\n", err); + goto err; + } + + err = rtrs_clt_query(clt_sess->rtrs, &attrs); + if (unlikely(err)) { + pr_err("rtrs_clt_query error %d\n", err); + goto close_sess; + } + clt_sess->max_io_size = attrs.max_io_size; + clt_sess->queue_depth = attrs.queue_depth; + clt_sess->max_segments = attrs.max_segments; + + clt_sess->state = RMR_CLT_SESS_CONNECTED; + + resend_join_pool(clt_sess); + + return err; + +close_sess: + rtrs_clt_close(clt_sess->rtrs); +err: + return err; +} + +//TODO: we do not use rsp in this function, do we need it as an argument? +static int rmr_clt_handle_rejoin_rsp(struct rmr_clt_pool_sess *pool_sess, struct rmr_msg_pool_cmd_rsp *rsp) +{ + int err = 0; + + if (rmr_clt_sess_is_sync(pool_sess)) { + /* + * The client on sync side does not need map update + * hence goes to "normal" state directly. + * NB: FAILED => NORMAL + */ + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL); + } else { + /* + * The client on non-sync side needs map update, + * + * A map update is to be triggered, which updates the map, + * and then sets state to "normal" + */ + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING); + + /* + * Send the info about the pool to all the storages. + * Contains IDs of storages connected to this pool. 
+ */ + err = rmr_clt_send_pool_info(pool_sess, RMR_POOL_INFO_OP_ADD, + RMR_POOL_INFO_MODE_ASSEMBLE, false); + if (err) { + pr_err("Rejoin: rmr_clt_send_pool_info failed for session %s", + pool_sess->sessname); + return -EINVAL; + } + + err = rmr_clt_pool_try_enable(pool_sess->pool); + if (err) + pr_err("%s: pool %s try_enable failed for sess %s: %d\n", + __func__, pool_sess->pool->poolname, + pool_sess->sessname, err); + } + + return err; +} + +static void rmr_clt_handle_join_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_pool_md *clt_md; + u64 mapped_size; + + clt_md = &pool->pool_md; + + pool_sess->ver = min_t(u8, rsp->ver, RMR_PROTO_VER_MAJOR); + pool_sess->member_id = rsp->member_id; + xa_store(&pool->stg_members, pool_sess->member_id, pool_sess, GFP_KERNEL); + + pool->chunk_size = rsp->join_pool_cmd_rsp.chunk_size; + pool->chunk_size_shift = ilog2(pool->chunk_size); + clt_md->chunk_size = pool->chunk_size; + + mapped_size = rsp->join_pool_cmd_rsp.mapped_size; + if (mapped_size) { + pool->mapped_size = mapped_size; + pool->pool_md.mapped_size = mapped_size; + rmr_pool_update_no_of_chunk(pool); + pr_info("clt join_pool: mapped size %llu\n", pool->mapped_size); + } +} + +static int cmd_process_rsp(struct rmr_clt_pool_sess *pool_sess, struct rmr_msg_pool_cmd_rsp *rsp) +{ + int err = 0; + + pr_debug("rsp, cmd_type %d, member_id %d, err %d\n", + rsp->cmd_type, rsp->member_id, rsp->err); + + if (rsp->err) + return rsp->err; + + switch (rsp->cmd_type) { + case RMR_CMD_MAP_CHECK: + return rmr_clt_handle_map_check_rsp(pool_sess, rsp); + case RMR_CMD_STORE_CHECK: + return rmr_clt_handle_store_check_rsp(pool_sess, rsp); + case RMR_CMD_MAP_READY: + case RMR_CMD_MAP_SEND: + case RMR_CMD_MAP_BUF_DONE: + case RMR_CMD_MAP_DONE: + case RMR_CMD_MAP_DISABLE: + case RMR_CMD_LEAVE_POOL: + case RMR_CMD_LAST_IO_TO_MAP: + case RMR_CMD_MD_SEND: + case RMR_CMD_MAP_SET_VER: + case RMR_CMD_SEND_DISCARD: + case RMR_CMD_DISCARD_CLEAR_FLAG: + case RMR_CMD_POOL_INFO: + pr_debug("%s: No rsp handling for %s\n", __func__, rmr_get_cmd_name(rsp->cmd_type)); + break; + case RMR_CMD_REJOIN_POOL: + return rmr_clt_handle_rejoin_rsp(pool_sess, rsp); + case RMR_CMD_JOIN_POOL: + rmr_clt_handle_join_rsp(pool_sess, rsp); + break; + case RMR_CMD_ENABLE_POOL: + pool_sess->ver = min_t(u8, rsp->ver, RMR_PROTO_VER_MAJOR); + break; + default: + pr_warn("%s: switch default type: %d\n", __func__, rsp->cmd_type); + + err = -EINVAL; + } + + return err; +} + +static void msg_pool_cmd_conf(struct work_struct *work) +{ + struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work); + struct rmr_msg_pool_cmd_rsp *rsp = sess_iu->buf; + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + + pr_debug("pool cmd for %s session %s member_id %d conf with errno %d\n", + pool_sess->pool->poolname, pool_sess->sessname, + pool_sess->member_id, sess_iu->errno); + + if (!sess_iu->errno) { + /* + * We need to check if there was an error while processing the cmd + * on the server side. If there was, then we fail the command. 
+ */ + sess_iu->errno = cmd_process_rsp(pool_sess, rsp); + } + + kfree(rsp); + wake_up_iu_comp(sess_iu); + rmr_msg_put_iu(pool_sess, sess_iu); +} + +void rmr_clt_init_cmd(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.group_id = cpu_to_le32(pool->group_id); + msg->hdr.type = cpu_to_le16(RMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->ver = RMR_PROTO_VER_MAJOR; + msg->sync = pool->sync; + + strncpy(msg->pool_name, pool->poolname, sizeof(msg->pool_name)); +} +EXPORT_SYMBOL(rmr_clt_init_cmd); + +int rmr_clt_pool_send_cmd(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd *msg, bool wait) +{ + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct rmr_msg_pool_cmd_rsp *rsp; + struct rmr_clt_sess_iu *sess_iu; + struct kvec vec = { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + int err, errno; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (unlikely(!rsp)) + return -ENOMEM; + + sess_iu = rmr_msg_get_iu(pool_sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT, 2); + if (unlikely(!sess_iu)) { + kfree(rsp); + return -ENOMEM; + } + + sess_iu->buf = rsp; + sg_init_one(&sess_iu->sg, rsp, sizeof(*rsp)); + + err = send_usr_msg(clt_sess->rtrs, READ, sess_iu, + &vec, 1, sizeof(*rsp), &sess_iu->sg, 1, + msg_pool_cmd_conf, &errno, wait); + if (unlikely(err)) { + rmr_msg_put_iu(pool_sess, sess_iu); + kfree(rsp); + } else { + err = errno; + } + + rmr_msg_put_iu(pool_sess, sess_iu); + + return err; +} + +/* + * Pre-requisite: rcu read lock should be held by caller + */ +static struct rmr_clt_pool_sess * +rmr_clt_get_first_normal_session(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess; + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL) + return pool_sess; + } + + return NULL; +} + +/** + * rmr_clt_pool_send_all - Send a command to all sessions in the pool + * + * @pool: The client pool which sends the command message + * @msg: The command message of pool + * + * Description: + * When sending messages to all pool sessions, it will continue to send + * regardless of the failure of the previous communication. + * + * Return: + * 0 if at least one successful request + * less than 0 if all requests failed + */ +int rmr_clt_pool_send_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg) +{ + struct rmr_clt_pool_sess *pool_sess; + int idx, err = 0; + u8 member_id = 0; + int ret = 0; + + if (msg->cmd_type == RMR_CMD_SEND_DISCARD) + member_id = msg->send_discard_cmd.member_id; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + /* The node has had discards. */ + if (pool_sess->member_id == member_id) + continue; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED) + continue; + + pr_info("pool %s send cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + + /* The err code reflects the response from this pool_sess. 
*/ + err = rmr_clt_pool_send_cmd(pool_sess, msg, WAIT); + if (err) { + pr_err("pool %s sending cmd to sess %s failed, err=%d\n", + pool->poolname, pool_sess->sessname, err); + continue; + } + + pr_info("pool %s done sending cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + ret++; + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (ret) + return 0; + + return -ENETUNREACH; +} +EXPORT_SYMBOL(rmr_clt_pool_send_all); + +/** + * rmr_clt_send_cmd_with_data_all - Send a command with data to all sessions in the pool + * + * Return: + * 0 on success of all sends + * less than 0 if all sends failed + * positive number of failed sends + */ +int rmr_clt_send_cmd_with_data_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen) +{ + struct rmr_clt_pool_sess *pool_sess; + int idx, err = 0; + bool ret = false; + int errno = 0; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED) { + errno++; + continue; + } + + pr_debug("pool %s send cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + err = rmr_clt_send_cmd_with_data(pool, pool_sess, msg, buf, buflen); + if (err) { + errno++; + pr_debug("pool %s sending cmd to sess %s failed, err=%d\n", + pool->poolname, pool_sess->sessname, err); + continue; + } + + pr_debug("pool %s done sending cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + ret = true; + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (ret) + return errno; + + return -EINVAL; +} +EXPORT_SYMBOL(rmr_clt_send_cmd_with_data_all); + +/** + * rmr_clt_start_last_io_update() - Do the last IO update + * + * @pool: The pool + * + * Description: + * Last IO update is needed in case a pserver went down while connected to a pool. + * A pserver going down while performing IOs could mean that some IOs could have been + * executed in some nodes but not all. This function takes the last 'queue_depth' number of + * IOs on each storage node and makes sure they are synced in between all the nodes. + * Before performing the last IO conversion, it also makes sure that all the storage nodes + * have the lastest map. + * + * Return: + * 0 on success + * Error value on failure + * + * Context: + * srcu_read_lock should be held while calling this function. + */ +int rmr_clt_start_last_io_update(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess_chosen, *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + u64 map_ver, highest_map_ver = 0; + int j, err, idx, ret = 0; + int discard_ids[RMR_POOL_MAX_SESS]; + u8 id, nr_discards = 0; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + for (j = 0; j < RMR_POOL_MAX_SESS; j++) { + struct rmr_clt_pool_sess *ps; + u8 mid = pool->pool_md.srv_md[j].member_id; + + if (!mid) + continue; + + ps = xa_load(&pool->stg_members, mid); + if (!ps) { + pr_err("%s: member_id %u not yet assembled\n", + __func__, mid); + err = -EINVAL; + goto out; + } + if (atomic_read(&ps->state) != RMR_CLT_POOL_SESS_RECONNECTING) { + pr_err("%s: member_id %u not in reconnecting state\n", + __func__, mid); + err = -EINVAL; + goto out; + } + } + + /* + * Before pserver died, it could be that one or more storage nodes were down. + * This would mean there is a possibility that those storage nodes will not have + * the latest map. But that can create problems. 
+ * We need to make sure that every storage node has the latest map. + * Hence, find out which node has the latest map first, + */ + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + err = send_map_get_version(pool_sess, &map_ver); + if (err) { + pr_err("%s: Failed to read map version for sess %s\n", + __func__, pool_sess->sessname); + err = -EINVAL; + goto out; + } + + if (RMR_STORE_IS_REPLACE(map_ver)) { + map_ver = RMR_STORE_GET_VER(map_ver); + discard_ids[nr_discards] = pool_sess->member_id; + nr_discards++; + } + + if (map_ver > highest_map_ver) { + highest_map_ver = map_ver; + pool_sess_chosen = pool_sess; + } + } + + for (j = 0; j < nr_discards; j++) { + id = discard_ids[j]; + pr_info("%s: Send discard req %d to S%d\n", + __func__, id, pool_sess_chosen->member_id); + err = send_discard(pool_sess_chosen, RMR_CMD_SEND_DISCARD, id); + if (err) { + pr_err("%s: Failed to send discard request to %s\n", + __func__, pool_sess_chosen->sessname); + goto out; + } + } + + /* + * We have the storage node with the latest map, + * make sure the latest map is sent to all other storage nodes. + */ + err = rmr_clt_spread_map(pool, pool_sess_chosen, false, false); + if (err) { + pr_err("%s: Failed to spread the latest map\n", __func__); + goto out; + } + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + for (j = 0; j < nr_discards; j++) { + id = discard_ids[j]; + pr_info("%s: Send discard clear req %d to S%d\n", + __func__, id, pool_sess->member_id); + err = send_discard(pool_sess, RMR_CMD_DISCARD_CLEAR_FLAG, id); + if (err) { + pr_err("%s: Failed to clear discard state on %s\n", + __func__, pool_sess->sessname); + } else { + ret++; + } + } + } + + if (nr_discards && !ret) { + pr_err("%s: Failed to clear discard state on any storage node\n", __func__); + err = -EINVAL; + goto out; + } + + /* + * Now that we are done with the dispersing of the latest map, + * we can start last IO update. + */ + rmr_clt_init_cmd(pool, &msg); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + msg.cmd_type = RMR_CMD_LAST_IO_TO_MAP; + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto out; + } + + err = rmr_clt_spread_map(pool, pool_sess, true, false); + if (err) { + pr_err("%s: Failed to spread last_io converted map\n", __func__); + goto out; + } + } + + err = rmr_clt_read_map(pool); + if (err) { + pr_err("%s: rmr_clt_read_map failed with err %d\n", __func__, err); + goto out; + } + +out: + srcu_read_unlock(&pool->sess_list_srcu, idx); + return err; +} + +/** + * rmr_clt_enable_sess() - Enable the rmr clt pool sessions + * + * @pool_sess: The rmr clt pool session to enable + * + * Description: + * This function takes care of enable request, for pool sessions + * not in maintenance mode and in mm. + * + * Return: + * 0 on success + * Error value on failure + */ +int rmr_clt_enable_sess(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_pool *pool = pool_sess->pool; + int pool_sess_state, err = 0; + + pr_info("%s: For session %s of pool %s\n", + __func__, pool_sess->sessname, pool->poolname); + + if (!pool_sess->maintenance_mode) { + /* + * Simple enable, not related to maintenance. 
+		 * Manual enable is only allowed for sessions in "created" state.
+		 */
+		pool_sess_state = atomic_read(&pool_sess->state);
+		if (pool_sess_state != RMR_CLT_POOL_SESS_CREATED) {
+			pr_err("Cannot manually enable session: state %d\n", pool_sess_state);
+			err = -EINVAL;
+			goto out;
+		}
+
+		err = send_msg_enable_pool(pool_sess, 1);
+		if (err) {
+			pr_err("Failed to send enable to pool %s. Err %d\n",
+			       pool->poolname, err);
+			goto out;
+		}
+
+		pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL);
+	} else {
+		/*
+		 * Enable when in maintenance mode.
+		 */
+		err = rmr_clt_unset_pool_sess_mm(pool_sess);
+	}
+
+out:
+	return err;
+}
+
+/**
+ * rmr_clt_create_sess() - Allocate and initialize an RMR client session
+ *
+ * @sessname: Name to be given to the new session being created.
+ * @paths: RTRS paths created for the session.
+ * @path_cnt: Number of paths.
+ *
+ * Return:
+ * Pointer to rmr_clt_sess on success
+ * ERR_PTR on failure
+ *
+ * Description:
+ * Create a new session to the storage node reachable through @paths.
+ * After this function is done, an rmr_clt_pool_sess can use this sess to
+ * submit I/O over the rtrs connection.
+ *
+ * Context:
+ * This function blocks while creating the session
+ */
+static struct rmr_clt_sess *rmr_clt_create_sess(const char *sessname,
+						const struct rtrs_addr *paths,
+						size_t path_cnt)
+{
+	struct rmr_clt_sess *clt_sess;
+	struct rtrs_attrs attrs;
+	struct rtrs_clt_ops rtrs_ops;
+	int err;
+
+	clt_sess = alloc_clt_sess(sessname);
+	if (IS_ERR(clt_sess)) {
+		pr_err("Session '%s' cannot be allocated in pool\n", sessname);
+		/* alloc_clt_sess() already returns an ERR_PTR of this type, no ERR_CAST() needed */
+		return clt_sess;
+	}
+
+	rtrs_ops = (struct rtrs_clt_ops){
+		.priv = clt_sess,
+		.link_ev = rmr_clt_link_ev,
+	};
+	/*
+	 * Establish the rtrs connection and proceed further.
+ */ + clt_sess->rtrs = rtrs_clt_open(&rtrs_ops, sessname, + paths, path_cnt, RTRS_PORT, + 0, /* Do not use pdu of rtrs */ + RECONNECT_DELAY, + MAX_RECONNECTS, 0); + if (IS_ERR(clt_sess->rtrs)) { + err = PTR_ERR(clt_sess->rtrs); + pr_err("rtrs_clt_open error %d\n", err); + goto free_clt_sess; + } + err = rtrs_clt_query(clt_sess->rtrs, &attrs); + if (unlikely(err)) { + pr_err("rtrs_clt_query error %d\n", err); + goto close_sess; + } + clt_sess->max_io_size = attrs.max_io_size; + clt_sess->queue_depth = attrs.queue_depth; + clt_sess->max_segments = attrs.max_segments; + //sess->sess_kobj = &sess->rtrs->dev.dev.kobj; + + err = rmr_clt_create_clt_sess_sysfs_files(clt_sess); + if (err) { + pr_err("failed to crete sysfs files for sess %s, err=%d\n", + clt_sess->sessname, err); + goto close_sess; + } + clt_sess->state = RMR_CLT_SESS_CONNECTED; + + mutex_lock(&g_sess_lock); + list_add(&clt_sess->g_list, &g_sess_list); + mutex_unlock(&g_sess_lock); + + return clt_sess; + +close_sess: + rtrs_clt_close(clt_sess->rtrs); + +free_clt_sess: + kfree(clt_sess); + + return ERR_PTR(err); +} + +/** + * rmr_clt_pool_try_enable() - Trigger pool session recovery if conditions are met + * + * @pool: The pool to check + * + * Scans pool sessions and fires the appropriate recovery action: + * + * Case 1: ≥1 NORMAL session exists → spread its map (with enable=true) to all + * non-NORMAL sessions, then set them to NORMAL on the client side + * Case 2: Exactly one was_last_authoritative RECONNECTING session exists → + * enable it directly (data is complete, no map needed), then spread + * its map to remaining sessions + * Cases 3/4: All pool_md members present and RECONNECTING → last_io_update + * + * Return: 0 on success or when conditions are not yet met, negative error on failure. + */ +int rmr_clt_pool_try_enable(struct rmr_pool *pool) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + struct rmr_clt_pool_sess *pool_sess, *normal_sess, *auth_sess; + bool any_member = false; + int idx, j, err = 0; + + pr_info("%s: Started for pool %s\n", __func__, pool->poolname); + + /* + * clt_pool_lock is held across all RPC round-trips below (MAP_READY, + * MAP_SEND, MAP_DONE, last_io_update exchanges). This serialises + * concurrent try_enable calls and prevents rmr_clt_open/close from + * racing with recovery. The RPC send path (rmr_clt_pool_send_cmd) + * uses per-session permits and does not acquire clt_pool_lock, so + * there is no deadlock. rmr_clt_open and rmr_clt_close use + * mutex_trylock and mutex_lock respectively to handle this. + */ + mutex_lock(&clt_pool->clt_pool_lock); + + normal_sess = NULL; + auth_sess = NULL; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + int state = atomic_read(&pool_sess->state); + + if (state == RMR_CLT_POOL_SESS_NORMAL) { + if (!normal_sess) + normal_sess = pool_sess; + } else if (state == RMR_CLT_POOL_SESS_RECONNECTING && + pool_sess->was_last_authoritative && + !pool_sess->maintenance_mode && + !auth_sess) { + auth_sess = pool_sess; + } + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + /* + * Invariant: at most one was_last_authoritative session can exist + * (guaranteed by atomic_dec_and_test in pool_sess_change_state), and + * it cannot coexist with a NORMAL session (if a NORMAL session exists, + * the pool never fully went to FAILED, so no session gets the flag). 
+ */ + if (WARN_ON(auth_sess && normal_sess)) { + err = -EINVAL; + goto out; + } + + /* Case 2: was_last_authoritative session — enable it directly, then spread */ + if (auth_sess) { + err = send_msg_enable_pool(auth_sess, 1); + if (err) { + pr_err("%s: pool %s failed to enable auth sess %s: %d\n", + __func__, pool->poolname, auth_sess->sessname, err); + goto out; + } + pool_sess_change_state(auth_sess, RMR_CLT_POOL_SESS_NORMAL); + normal_sess = auth_sess; + } + + /* Case 1: ≥1 NORMAL session → spread map to all non-NORMAL sessions */ + if (normal_sess) { + idx = srcu_read_lock(&pool->sess_list_srcu); + err = rmr_clt_spread_map(pool, normal_sess, true, true); + if (err) + pr_err("%s: pool %s spread map from %s failed: %d\n", + __func__, pool->poolname, normal_sess->sessname, err); + else + goto out_normal; + + srcu_read_unlock(&pool->sess_list_srcu, idx); + goto out; + } + + /* Cases 3/4: all pool_md members present and RECONNECTING */ + for (j = 0; j < RMR_POOL_MAX_SESS; j++) { + struct rmr_clt_pool_sess *ps; + u8 mid = pool->pool_md.srv_md[j].member_id; + + if (!mid) + continue; + + any_member = true; + ps = xa_load(&pool->stg_members, mid); + if (!ps || atomic_read(&ps->state) != RMR_CLT_POOL_SESS_RECONNECTING || + ps->maintenance_mode) { + pr_info("%s: pool %s member_id %u not yet in reconnecting/mm, waiting\n", + __func__, pool->poolname, mid); + goto out; + } + } + + if (!any_member) { + pr_info("%s: pool %s has no members in pool_md, nothing to do\n", + __func__, pool->poolname); + goto out; + } + + pr_info("%s: pool %s all members reconnecting, starting last_io_update\n", + __func__, pool->poolname); + + err = rmr_clt_start_last_io_update(pool); + if (err) { + pr_err("%s: pool %s last_io_update failed: %d\n", + __func__, pool->poolname, err); + goto out; + } + + idx = srcu_read_lock(&pool->sess_list_srcu); +out_normal: + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) != RMR_CLT_POOL_SESS_RECONNECTING || + pool_sess->maintenance_mode) + continue; + + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL); + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + +out: + mutex_unlock(&clt_pool->clt_pool_lock); + return err; +} + +/** + * rmr_clt_read_pool_md() - Read the full pool_md from a storage server's disk + * + * @pool_sess: The pool session to read from. + * + * Sends RMR_CMD_MD_SEND with read_full_md=1 to the given session and imports + * the returned srv_md[] entries into pool->pool_md, skipping already-known + * members. Used during add_sess mode=assemble so the client learns all pool + * member IDs from the server's on-disk metadata, not only the one being + * assembled. + * + * Return: + * 0 on success, negative error code on failure. 
+ */ +static int rmr_clt_read_pool_md(struct rmr_clt_pool_sess *pool_sess, bool first) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool_md *remote_md; + int i, err; + + remote_md = kzalloc(sizeof(*remote_md), GFP_KERNEL); + if (!remote_md) + return -ENOMEM; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MD_SEND; + msg.md_send_cmd.src_mapped_size = pool->mapped_size; + msg.md_send_cmd.sender_id = pool_sess->member_id; + msg.md_send_cmd.read_full_md = 1; + + err = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, + remote_md, sizeof(*remote_md)); + if (err) { + pr_err("%s: failed to read pool_md from sess %s: %d\n", + __func__, pool_sess->sessname, err); + goto out; + } + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + u8 mid = remote_md->srv_md[i].member_id; + int idx; + + if (!mid) + continue; + + idx = rmr_pool_find_md(&pool->pool_md, mid, first); + if (idx < 0) + continue; + + if (!pool->pool_md.srv_md[idx].member_id) { + /* New entry — import blindly */ + memcpy(&pool->pool_md.srv_md[idx], &remote_md->srv_md[i], + sizeof(struct rmr_srv_md)); + } else { + /* Already known — verify stable fields are consistent */ + if (pool->pool_md.srv_md[idx].mapped_size != + remote_md->srv_md[i].mapped_size) + pr_warn("%s: member_id %u mapped_size mismatch: " + "expected %llu, got %llu from sess %s\n", + __func__, mid, + pool->pool_md.srv_md[idx].mapped_size, + remote_md->srv_md[i].mapped_size, + pool_sess->sessname); + } + } + +out: + kfree(remote_md); + return err; +} + +/** + * rmr_clt_process_non_sync_sess() - Set up map and notify peers for a new non-sync session + * + * @pool_sess: The newly added pool session. + * @create: True if this is a fresh pool creation; false for an assemble of an + * existing pool. + * @dirty: True if there are already other sessions in the pool; the new member's + * map will be marked fully dirty to trigger a resync. + * + * Creates the dirty map for @pool_sess and informs all existing pool members + * about the new storage node joining. On failure the map is removed. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_clt_process_non_sync_sess(struct rmr_clt_pool_sess *pool_sess, bool create, + bool dirty) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_dirty_id_map *map; + enum rmr_pool_info_mode mode; + u8 created_mids[RMR_POOL_MAX_SESS]; + int created_cnt = 0; + int i, err = 0; + + /* + * The mapped size of the pool is set after a backend device is mapped to the + * client. If a new client pool session is extended to this pool, the map for that + * new server node needs to be created for the client pool as well. + */ + if (!pool->mapped_size) { + pr_err("%s: pool %s mapped_size is 0\n", + __func__, pool->poolname); + err = -EINVAL; + goto out; + } + + pr_info("Through add_sess, pool %s mapped_size %llu\n", + pool->poolname, pool->mapped_size); + + rmr_pool_update_no_of_chunk(pool); + + if (create) { + if (rmr_pool_find_map(pool, pool_sess->member_id)) { + pr_err("%s: pool %s map for member_id %u already exists\n", + __func__, pool->poolname, pool_sess->member_id); + err = -EEXIST; + goto out; + } + + map = rmr_map_create(pool, pool_sess->member_id); + if (IS_ERR(map)) { + err = PTR_ERR(map); + pr_err("%s: pool %s failed to create map for member_id %u\n", + __func__, pool->poolname, pool_sess->member_id); + goto out; + } + + /* + * During pool creation, all storage nodes must start with identical + * data. 
The first node added is taken as the clean reference; any + * subsequent node joining must be fully synced from it. + * Mark the entire map dirty to trigger that initial resync. + */ + if (dirty) + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + + mode = RMR_POOL_INFO_MODE_CREATE; + } else { + /* + * For assemble, read pool_md first so we know all member IDs, + * then create maps for every member in the pool. + */ + mode = RMR_POOL_INFO_MODE_ASSEMBLE; + + err = rmr_clt_read_pool_md(pool_sess, !dirty); + if (err) { + pr_err("%s: failed to read pool_md from sess %s: %d\n", + __func__, pool_sess->sessname, err); + goto out; + } + + if (!dirty) { + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + u8 mid = pool->pool_md.srv_md[i].member_id; + + if (!mid) + continue; + + map = rmr_map_create(pool, mid); + if (IS_ERR(map)) { + err = PTR_ERR(map); + pr_err("%s: pool %s failed to create map for member_id %u\n", + __func__, pool->poolname, mid); + goto del_maps; + } + created_mids[created_cnt++] = mid; + } + } + } + + /* + * We need to send the info about this node joining to other storage nodes. + */ + err = rmr_clt_send_pool_info(pool_sess, RMR_POOL_INFO_OP_ADD, mode, dirty); + if (err) { + pr_err("rmr_clt_send_pool_info failed for session %s\n", + pool_sess->sessname); + if (create) + rmr_pool_remove_map(pool, pool_sess->member_id); + else + goto del_maps; + goto out; + } + + if (!create) { + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING); + err = rmr_clt_pool_try_enable(pool); + if (err) + pr_err("%s: pool %s try_enable failed for sess %s: %d\n", + __func__, pool->poolname, pool_sess->sessname, err); + } + + return err; + +del_maps: + for (i = 0; i < created_cnt; i++) + rmr_pool_remove_map(pool, created_mids[i]); +out: + return err; +} + +/** + * rmr_clt_add_pool_sess() - Add a client session to an RMR pool + * + * @pool: The pool to join. + * @clt_sess: The client transport session to associate. + * @create: True if this is a fresh pool creation; false for an assemble of an + * existing pool. + * + * Sends a join_pool command to the server, allocates a pool session, creates + * the dirty map for this storage node (for non-sync pools), and notifies the + * other pool members via a pool_info message. + * + * Return: + * Pointer to the new pool session on success, ERR_PTR on failure. 
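+ *
+ * Context:
+ * Acquires pool->sess_lock (and clt_sess->lock) internally, so neither
+ * may be held by the caller.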
+ */
+struct rmr_clt_pool_sess *rmr_clt_add_pool_sess(struct rmr_pool *pool,
+ struct rmr_clt_sess *clt_sess, bool create)
+{
+ struct rmr_clt_pool *clt_pool;
+ struct rmr_clt_pool_sess *pool_sess;
+ struct rmr_pool_md *clt_md;
+ int err, idx;
+ bool dirty = false;
+
+ mutex_lock(&pool->sess_lock);
+
+ if (__find_sess_by_name(pool, clt_sess->sessname)) {
+ pr_err("Session '%s' already exists in pool %s\n",
+ clt_sess->sessname, pool->poolname);
+ err = -EEXIST;
+ goto err_out;
+ }
+
+ pool_sess = alloc_pool_sess(pool, clt_sess);
+ if (IS_ERR(pool_sess)) {
+ pr_err("pool session '%s' cannot be allocated in pool %s\n",
+ clt_sess->sessname, pool->poolname);
+ err = PTR_ERR(pool_sess);
+ goto err_out;
+ }
+
+ clt_pool = (struct rmr_clt_pool *)pool->priv;
+
+ /* TODO: handle the case where tags are already initialized */
+ clt_pool->queue_depth = clt_sess->queue_depth;
+ clt_md = &clt_pool->pool->pool_md;
+ clt_md->queue_depth = clt_sess->queue_depth;
+
+ if (!pool->sync)
+ dirty = !list_empty(&pool->sess_list);
+
+ err = send_msg_join_pool(pool_sess, create, dirty, WAIT);
+ if (unlikely(err)) {
+ pr_err("send_msg_join_pool error %d\n", err);
+ goto free_sess;
+ }
+
+ /*
+ * Now that we have the member_id of the new storage node,
+ * check if it is unique.
+ */
+ idx = srcu_read_lock(&pool->sess_list_srcu);
+ if (__find_sess_by_member_id(pool, pool_sess->member_id)) {
+ srcu_read_unlock(&pool->sess_list_srcu, idx);
+ pr_err("%s: Session with member_id %u already exists\n",
+ __func__, pool_sess->member_id);
+ err = -EEXIST;
+ goto err_leave_pool;
+ }
+ srcu_read_unlock(&pool->sess_list_srcu, idx);
+
+ list_add_tail_rcu(&pool_sess->entry, &pool->sess_list);
+
+ if (!pool->sync) {
+ err = rmr_clt_process_non_sync_sess(pool_sess, create, dirty);
+ if (err) {
+ pr_err("%s: rmr_clt_process_non_sync_sess failed for sess %s with err %d\n",
+ __func__, clt_sess->sessname, err);
+ goto rem_from_list;
+ }
+ } else
+ pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL);
+
+ mutex_unlock(&pool->sess_lock);
+
+ mutex_lock(&clt_sess->lock);
+ list_add_tail(&pool_sess->clt_sess_entry, &clt_sess->pool_sess_list);
+ mutex_unlock(&clt_sess->lock);
+
+ return pool_sess;
+
+rem_from_list:
+ rmr_clt_del_pool_sess(pool_sess);
+err_leave_pool:
+ send_msg_leave_pool(pool_sess, create, WAIT);
+free_sess:
+ rmr_clt_free_pool_sess(pool_sess);
+err_out:
+ mutex_unlock(&pool->sess_lock);
+ return ERR_PTR(err);
+}
+
+/* requires g_sess_lock to be held */
+static struct rmr_clt_sess *__find_and_get_clt_sess(const char *sessname)
+{
+ struct rmr_clt_sess *sess, *sn;
+
+again:
+ list_for_each_entry_safe(sess, sn, &g_sess_list, g_list) {
+ if (strcmp(sessname, sess->sessname))
+ continue;
+
+ if (rmr_clt_sess_get(sess))
+ return sess;
+
+ pr_info("failed to get ref for sess %s\n", sessname);
+ goto again; /* sess is being torn down, restart the scan */
+ }
+
+ return NULL;
+}
+
+struct rmr_clt_sess *find_and_get_or_create_clt_sess(char *sessname,
+ struct rtrs_addr *paths,
+ size_t path_cnt)
+{
+ struct rmr_clt_sess *sess;
+
+ mutex_lock(&g_sess_lock);
+ sess = __find_and_get_clt_sess(sessname);
+ mutex_unlock(&g_sess_lock);
+
+ if (!sess) {
+ pr_info("%s: Cannot find rmr_clt_sess with name %s\n", __func__, sessname);
+ sess = rmr_clt_create_sess(sessname, paths, path_cnt);
+ if (IS_ERR(sess))
+ return sess;
+ pr_info("%s: rmr_clt_sess %s created\n", __func__, sessname);
+ }
+
+ return sess;
+}
+
+/**
+ * rmr_clt_del_pool_sess() - Remove a session from the pool session list.
+ * @pool_sess: Pool session to remove.
+ * + * Removes @pool_sess from the pool's session list, waits for any in-progress + * SRCU readers to finish, and clears any per-CPU cached references to it. + * + * Context: Caller must hold pool->sess_lock. + */ +void rmr_clt_del_pool_sess(struct rmr_clt_pool_sess *pool_sess) +{ + int cpu; + bool dosync = false; + struct rmr_clt_pool_sess __rcu **ppcpu_sess; + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + + list_del_rcu(&pool_sess->entry); + synchronize_srcu(&pool->sess_list_srcu); + + for_each_possible_cpu(cpu) { + preempt_disable(); + ppcpu_sess = per_cpu_ptr(clt_pool->pcpu_sess, cpu); + if (pool_sess == rcu_access_pointer(*ppcpu_sess)) { + rcu_assign_pointer(*ppcpu_sess, NULL); + dosync = true; + } + preempt_enable(); + } + + if (dosync) + synchronize_srcu(&pool->sess_list_srcu); +} + +/** + * rmr_clt_destroy_pool_sess() - Send leave_pool and free a pool session + * + * @pool_sess: Pool session to destroy. + * @delete: True for a permanent pool deletion; false for a temporary + * disassembly. This flag is forwarded in the leave_pool message + * so the server can act accordingly. + */ +void rmr_clt_destroy_pool_sess(struct rmr_clt_pool_sess *pool_sess, bool delete) +{ + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + send_msg_leave_pool(pool_sess, delete, WAIT); + rmr_clt_free_pool_sess(pool_sess); + rmr_clt_sess_put(clt_sess); +} + +static void rmr_clt_destroy_pool(struct rmr_pool *pool) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + struct rmr_clt_pool_sess *pool_sess, *tmp; + + destroy_clt_pool(pool); + + list_for_each_entry_safe (pool_sess, tmp, &pool->sess_list, entry) { + mutex_lock(&pool->sess_lock); + list_del_rcu(&pool_sess->entry); + mutex_unlock(&pool->sess_lock); + + rmr_clt_destroy_pool_sess(pool_sess, false /* never delete */); + } + + rmr_put_clt_pool(clt_pool); +} + +int rmr_clt_remove_pool_from_sysfs(struct rmr_pool *pool, + const struct attribute *sysfs_self) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (!pool->sync) + cancel_delayed_work_sync(&clt_pool->recover_dwork); + + rmr_clt_destroy_pool_sysfs_files(pool, sysfs_self); + rmr_clt_destroy_pool(pool); + return 0; +} + +/* + * Pre-requisite: rcu read lock should be held by caller + */ +static struct rmr_clt_pool_sess * +rmr_clt_next_sess(struct rmr_pool *pool, struct rmr_clt_pool_sess *prev) +{ + struct rmr_clt_pool_sess *next; + + next = list_next_or_null_rcu(&pool->sess_list, + &prev->entry, + struct rmr_clt_pool_sess, + entry); + if (next) + return next; + + return list_first_or_null_rcu(&pool->sess_list, + struct rmr_clt_pool_sess, + entry); +} + +static inline bool rmr_clt_pool_sess_in_iu(struct rmr_iu *iu, + struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(iu->sess_list), entry) { + + if (sess_iu->pool_sess == pool_sess) + return true; + } + + return false; +} + +/* + * Pre-requisite: rcu read lock should be held by caller + */ +static struct rmr_clt_pool_sess *rmr_clt_round_robin_sess(struct rmr_pool *pool, + struct rmr_iu *iu) +{ + struct rmr_clt_pool_sess *old, *next, *pool_sess; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess __rcu **ppcpu_sess; + + clt_pool = (struct rmr_clt_pool *)pool->priv; + ppcpu_sess = this_cpu_ptr(clt_pool->pcpu_sess); + + if (iu) { + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) 
{ + if (rmr_clt_pool_sess_in_iu(iu, pool_sess)) + continue; + + rcu_assign_pointer(*ppcpu_sess, pool_sess); + return pool_sess; + } + + return NULL; + } + + old = rcu_dereference(*ppcpu_sess); + if (!old) { + next = rmr_clt_get_first_normal_session(pool); + if (!next) + return NULL; + rcu_assign_pointer(*ppcpu_sess, next); + return next; + } + + for (next = rmr_clt_next_sess(pool, old); + next && next != old; + next = rmr_clt_next_sess(pool, next)) { + /* + * It could happen that the state of pool_sess hasn't been able to + * represent the recent rtrs-clt sess state. + */ + if (next->clt_sess->state == RMR_CLT_SESS_DISCONNECTED) + continue; + + if (atomic_read(&next->state) == RMR_CLT_POOL_SESS_NORMAL) { + rcu_assign_pointer(*ppcpu_sess, next); + return next; + } + } + + /* + * There may be just one session with normal state i.e. old. + * In this case per-cpu sess pointer does not need update. + */ + return rmr_clt_get_first_normal_session(pool); +} + +int rmr_clt_query(struct rmr_pool *pool, struct rmr_attrs *attr) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + int idx; + + if (unlikely(!clt_pool)) + return -EINVAL; + + attr->chunk_size = pool->chunk_size; + attr->sync = pool->sync; + + attr->queue_depth = U32_MAX; + attr->max_io_size = U32_MAX; + attr->max_segments = U32_MAX; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + if (list_empty(&pool->sess_list)) { + srcu_read_unlock(&pool->sess_list_srcu, idx); + return -ENOENT; + } + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + attr->queue_depth = min_t(int, clt_sess->queue_depth, attr->queue_depth); + attr->max_io_size = min_t(u32, clt_sess->max_io_size, attr->max_io_size); + attr->max_segments = min_t(u32, clt_sess->max_segments, attr->max_segments); + } + attr->pool_kobj = &(pool->kobj); + + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return 0; +} +EXPORT_SYMBOL(rmr_clt_query); + +struct rmr_iu *rmr_clt_get_iu(struct rmr_pool *pool, enum rmr_io_flags flag, + enum rmr_wait_type wait) +{ + int err = 0, idx; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_iu *iu; + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + bool reset = false; + + clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state)) { + pr_err("%s: Pool %s not in use state\n", __func__, pool->poolname); + rmr_clt_dump_state(clt_pool); + return NULL; + } + + /* + * We get the inflight ref first. + * If we see that an IO freeze is in progress, we put the ref, and wait for it to unfreeze + * + * The while loop protects us from parallel freeze, like + * A leg deletion, and right after that a call to rmr_clt_close. + * + * We are guranteed to not go on an infinite loop, since rmr_clt_close can be called only + * once, And, there are limited legs to delete + */ + percpu_ref_get(&pool->ids_inflight_ref); + while (atomic_read(&clt_pool->io_freeze) > 0) { + percpu_ref_put(&pool->ids_inflight_ref); + /* + * Coincidentally, the rcu lock might be held when the wait event occurs, + * violating the constraint that no sleeping during general rcu critical section. + * Temporarily release the rcu lock, and re-acquire it after waking up. + * + * TODO: This approach is simple but may need to be revisited. 
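+ *
+ * The "reset" flag below remembers that the RCU read lock was dropped
+ * here so that it can be re-acquired right after wait_event() returns.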
+ */
+ if (rcu_read_lock_held()) {
+ rcu_read_unlock();
+ reset = true;
+ }
+
+ wait_event(clt_pool->map_update_wq, !atomic_read(&clt_pool->io_freeze));
+
+ if (reset)
+ rcu_read_lock();
+
+ /*
+ * Once IO is unfrozen, we check if the state of the pool has changed.
+ * It could be that rmr_clt_close was called, and hence the state is not IN_USE.
+ * Or it could be that the last leg was deleted, and we are not in the JOINED state.
+ *
+ * In both cases we cannot service IOs, hence we fail.
+ */
+ if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state) ||
+ !test_bit(RMR_CLT_POOL_STATE_JOINED, &clt_pool->state)) {
+ pr_err("%s: Failed to get inflight IO ref.\n", __func__);
+ pr_err("%s: Pool %s is not joined or used\n",
+ __func__, pool->poolname);
+ rmr_clt_dump_state(clt_pool);
+ return NULL;
+ }
+
+ percpu_ref_get(&pool->ids_inflight_ref);
+ }
+
+ iu = rmr_alloc_iu();
+ if (unlikely(!iu)) {
+ percpu_ref_put(&pool->ids_inflight_ref);
+ return NULL;
+ }
+
+ idx = srcu_read_lock(&pool->sess_list_srcu);
+ if (rmr_op(flag) == RMR_OP_READ) {
+ /*
+ * Round-robin use of one of the sessions in normal state for READ.
+ *
+ * This call is always from rmr_clt_request, so for READ
+ * this is the first pool_sess we are trying.
+ */
+ pool_sess = rmr_clt_round_robin_sess(pool, NULL);
+ if (unlikely(!pool_sess)) {
+ err = -ENODEV;
+ goto put_iu;
+ }
+
+ sess_iu = rmr_get_sess_iu(pool_sess, RTRS_IO_CON, (enum wait_type) wait);
+ if (unlikely(!sess_iu))
+ goto put_iu;
+
+ sess_iu->rmr_iu = iu;
+ iu->num_sessions = 1;
+ list_add_tail(&(sess_iu->entry), (&iu->sess_list));
+ } else {
+ /*
+ * For WRITE operations we need to submit to all sessions.
+ */
+ list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry,
+ (srcu_read_lock_held(&pool->sess_list_srcu))) {
+ /* Sessions must be in normal state for I/O */
+ if (atomic_read(&pool_sess->state) != RMR_CLT_POOL_SESS_NORMAL)
+ continue;
+
+ sess_iu = rmr_get_sess_iu(pool_sess,
+ RTRS_IO_CON, (enum wait_type) wait);
+ if (unlikely(!sess_iu))
+ goto put_sessions;
+
+ sess_iu->rmr_iu = iu;
+ /*
+ * The mem_id of sess_iu tracks the next free slot in the permit bitmap
+ * of an RTRS-clt session, which is used by the RMR server to store
+ * write IO chunk info.
+ */
+ sess_iu->mem_id = sess_iu->permit->mem_id;
+ iu->num_sessions++;
+ list_add_tail(&(sess_iu->entry), (&iu->sess_list));
+ }
+ }
+
+ refcount_set(&iu->refcount, iu->num_sessions);
+ iu->errno = 0;
+
+ srcu_read_unlock(&pool->sess_list_srcu, idx);
+
+ return iu;
+
+put_sessions:
+ list_for_each_entry_safe(sess_iu, tmp_sess_iu,
+ &(iu->sess_list), entry) {
+ if (!list_empty(&sess_iu->entry))
+ list_del_init(&sess_iu->entry);
+ rmr_put_sess_iu(sess_iu->pool_sess, sess_iu);
+ }
+put_iu:
+ srcu_read_unlock(&pool->sess_list_srcu, idx);
+ rmr_put_iu(iu);
+ percpu_ref_put(&pool->ids_inflight_ref);
+
+ if (err)
+ return ERR_PTR(err);
+
+ return NULL;
+}
+EXPORT_SYMBOL(rmr_clt_get_iu);
+
+void rmr_clt_put_iu(struct rmr_pool *pool, struct rmr_iu *iu)
+{
+ rmr_put_iu(iu);
+ percpu_ref_put(&pool->ids_inflight_ref);
+}
+EXPORT_SYMBOL(rmr_clt_put_iu);
+
+/**
+ * Returns 1 if the errno represents a condition in the
+ * storage server that prevents the operation from being executed.
+ * The opposite is an error with respect to the storage server,
+ * where the operation can be retried on a different one.
+ *
+ * An example is an attempt to read a block that does not exist
+ * versus a server that has crashed.
+ *
+ * Note that when in doubt we have to trigger the retry.
+ */ +/* +static inline int rmr_is_op_error(int errno) +{ + switch (-errno) { + case ENOENT: + case EINVAL: + case EEXIST: + case ENODEV: + return 1; + default: + return 0; + } +} +*/ + +static void msg_read_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + struct rmr_iu *iu = sess_iu->rmr_iu; + rmr_conf_fn *clt_conf = iu->conf; + + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED); + + if (errno) { + if (!iu->errno) + /* only first error is reported */ + iu->errno = errno; + + pr_err_ratelimited("%s got errno: %d for session %d. Schedule retry.\n", + __func__, errno, pool_sess->member_id); + if (!pool_sess->pool->sync) + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED); + + INIT_WORK(&iu->work, retry_failed_read); + schedule_work(&iu->work); + } else { + (*clt_conf)(iu->priv, errno); + } +} + +static void retry_failed_read(struct work_struct *work) +{ + struct rmr_iu *iu = container_of(work, struct rmr_iu, work); + struct rmr_pool *pool = iu->pool; + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + rmr_conf_fn *clt_conf = iu->conf; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu; + struct rtrs_clt_req_ops req_ops; + struct kvec vec; + int err, idx; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + pool_sess = rmr_clt_round_robin_sess(pool, iu); + if (!pool_sess) + goto give_up; + + sess_iu = rmr_get_sess_iu(pool_sess, RTRS_IO_CON, RTRS_PERMIT_WAIT); + if (unlikely(!sess_iu)) + goto give_up; + + pr_debug("%s: Pool %s to session %d, chunk [%llu, %llu]\n", + __func__, pool->poolname, pool_sess->member_id, + le64_to_cpu(iu->msg.id_a), le64_to_cpu(iu->msg.id_b)); + + sess_iu->rmr_iu = iu; + iu->msg.member_id = pool_sess->member_id; + atomic_inc(&clt_pool->stats.read_retries); + + list_add_tail(&(sess_iu->entry), (&iu->sess_list)); + + vec = (struct kvec) { + .iov_base = &iu->msg, + .iov_len = sizeof(iu->msg) + }; + + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_read_conf, + }; + + trace_retry_failed_read(READ, sess_iu); + + err = rtrs_clt_request(RMR_OP_READ, &req_ops, pool_sess->clt_sess->rtrs, sess_iu->permit, + &vec, 1, le32_to_cpu(iu->msg.length), iu->sg, iu->sg_cnt); + + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (err) + /* beware! recursion!! */ + msg_read_conf(sess_iu, err); + + return; +give_up: + srcu_read_unlock(&pool->sess_list_srcu, idx); + /* recursion termination! */ + (*clt_conf)(iu->priv, iu->errno); +} + +/* +static int rmr_clt_map_remove_id(struct rmr_pool *pool, int srv_id, rmr_id_t id) +{ + struct rmr_dirty_id_map *map; + + pr_debug("pool %s, remove id (%llu, %llu) for stg_id %d\n", + pool->poolname, id.a, id.b, srv_id); + + map = rmr_pool_find_map(pool, srv_id); + if (!map) { + pr_err("pool %s no map found for pool_id %u\n", + pool->poolname, srv_id); + return -EINVAL; + //TODO: handle this , probably initialize map, or just throw err? 
+ } + + if (!rmr_map_empty(map)) { + void *val; + + val = rmr_map_find(map, id); + if (!val) { + pr_debug("pool %s value for id (%llu, %llu) is not in the dirty map\n", + pool->poolname, id.a, id.b); + return 0; + } + rmr_map_erase(map, id); + pr_debug("pool %s, id (%llu, %llu) is removed from map for stg_id %d\n", + pool->poolname, id.a, id.b, srv_id); + } + + return 0; +} +*/ + +static void msg_io_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + struct rmr_iu *iu = sess_iu->rmr_iu; + rmr_conf_fn *clt_conf = iu->conf; + void *clt_priv = iu->priv; + + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED); + WARN_ON(pool_sess->pool->sync); + + if (errno) { + pr_err("%s: For sess %s, id (%llu, %llu), got errno: %d\n", + __func__, pool_sess->sessname, iu->msg.id_a, iu->msg.id_b, errno); + sess_iu->errno = errno; + if (!iu->errno) + /* only first error is reported */ + iu->errno = errno; + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED); + pr_debug("iu->errno %d, errno %d, before dec refcnt %d\n", + iu->errno, errno, refcount_read(&iu->refcount)); + } else { + atomic_inc(&iu->succeeded); + // TODO: is it ok to clear it here? + // rmr_clt_map_remove_id(session->pool, session->pool_id, iu->id); + } + + pr_debug("called for id (%llu, %llu), errno %d, sessname %s\n", + iu->msg.id_a, iu->msg.id_b, errno, pool_sess->sessname); + + if (refcount_dec_and_test(&iu->refcount)) { + if (atomic_read(&iu->succeeded) == 0) { + /* + * None of the IOs succeeded. + * Map add is not needed; Just fail the IO. + */ + pr_err("Write IO failed. Passing it up. errno %d\n", iu->errno); + (*clt_conf)(clt_priv, iu->errno); + } else if (iu->errno) { + /* + * Some IOs failed. Send map update (add). + * The clt conf will be called when map update is done. + * + * We are using the same iu to send map update + * So reset the refcount. + */ + refcount_set(&iu->refcount, iu->num_sessions); + + /* + * we are in interrupt here, so sched map update + */ + pr_debug("%s: some IOs failed for %s. Starts map_add\n", __func__, + pool_sess->sessname); + INIT_WORK(&iu->work, sched_map_add); + schedule_work(&iu->work); + } else { + /* + * All good. + */ + errno = 0; + (*clt_conf)(clt_priv, errno); + } + } +} + +static inline void rmr_clt_put_cu(struct rmr_clt_cmd_unit *cmd_unit) +{ + percpu_ref_put(&cmd_unit->clt_pool->pool->ids_inflight_ref); + kfree(cmd_unit); +} + +/** + * msg_cmd_conf() - Confirmation function called for command user commands sent + * + * priv: Pointer to private data passed to rtrs. sess_iu in this case. + * errno: error status passed by rtrs + */ +static void msg_cmd_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + struct rmr_clt_cmd_unit *cmd_unit = sess_iu->rmr_cmd_unit; + rmr_conf_fn *clt_conf = cmd_unit->conf; + void *clt_priv = cmd_unit->priv; + int total_failed; + + pr_debug("%s: sessname:%s, errno=%d\n", __func__, sess_iu->pool_sess->sessname, errno); + if (!errno) + atomic_inc(&cmd_unit->succeeded); + + if (refcount_dec_and_test(&cmd_unit->refcount)) { + if (atomic_read(&cmd_unit->succeeded) == 0) { + /* + * None of the IOs succeeded. + */ + pr_err("CMD failed with err %pe. Passing it up.\n", ERR_PTR(errno)); + (*clt_conf)(clt_priv, errno); + } else { + total_failed = cmd_unit->failed_state + + (cmd_unit->num_sessions - atomic_read(&cmd_unit->succeeded)); + /* + * Pass the number of failures up to the user. 
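+ * total_failed is the number of sessions that were already FAILED when
+ * the command unit was built (failed_state) plus the sessions whose
+ * completion reported an error (num_sessions - succeeded).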
+ */ + (*clt_conf)(clt_priv, total_failed); + } + + rmr_clt_put_cu(cmd_unit); + } + + rmr_put_sess_iu(sess_iu->pool_sess, sess_iu); +} + +/* The amount of data that belongs to an I/O and the amount of data that + * should be read or written to the disk (bi_size) can differ. + * + * E.g. When WRITE_SAME is used, only a small amount of data is + * transferred that is then written repeatedly over a lot of sectors. + * + * Get the size of data to be transferred via RTRS by summing up the size + * of the scather-gather list entries. + */ +static size_t rmr_clt_get_sg_size(struct scatterlist *sglist, u32 len) +{ + struct scatterlist *sg; + size_t tsize = 0; + int i; + + for_each_sg(sglist, sg, len, i) + tsize += sg->length; + return tsize; +} + +/** + * rmr_clt_request() - Request data transfer to/from storage node via given pool + * + * @pool: The Pool + * @iu: Iu allocated by pevious rmr_clt_get_iu call. + * @offset: offset inside the object to read/write: + * @length: length of data starting from offset + * @flag: READ/WRITE/REMOVE + * @prio: priority of IO + * @priv: User provided data, passed back with corresponding + * @(conf) confirmation. + * @conf: callback function to be called as confirmation + * @sg: Pages to be sent/received to/from server. + * @sg_cnt: Number of elements in the @sg + * + * Description: + * Data transfer through the given pool, using the underlying RTRS <-> RDMA + * While sending write IOs, if there are FAILED or RECONNECTING pool sessions, that IO + * would be added as dirty for such sessions. + * This is used by both pserver client, and the rmr server on the storage node to perform + * sync reads. + * + * Return: + * 0 on success. This means IO was sent. Final confirmation would be sent via conf function + * Error value on failure + */ +int rmr_clt_request(struct rmr_pool *pool, struct rmr_iu *iu, + size_t offset, size_t length, enum rmr_io_flags flag, unsigned short prio, + void *priv, rmr_conf_fn *conf, struct scatterlist *sg, unsigned int sg_cnt) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + struct rtrs_clt_req_ops req_ops; + rmr_id_t id; + struct kvec vec; + size_t sg_len; + int dir, err, idx; + u32 rmr_flag; + + rmr_get_iu(iu); + rmr_flag = rmr_op(flag); + dir = (rmr_flag == RMR_OP_READ) ? READ : WRITE; + + sg_len = rmr_clt_get_sg_size(sg, sg_cnt); + if (!(flag & RMR_OP_DISCARD || flag & RMR_OP_WRITE_ZEROES)) + WARN_ON(length != sg_len); + + iu->msg.hdr.group_id = cpu_to_le32(pool->group_id); + iu->msg.hdr.type = cpu_to_le16(RMR_MSG_IO); + iu->msg.hdr.__padding = 0; + + iu->msg.offset = cpu_to_le32(offset); + iu->msg.length = cpu_to_le32(length); + iu->msg.flags = cpu_to_le32(flag); + iu->msg.prio = cpu_to_le16(prio); + + iu->msg.sync = pool->sync; + + iu->priv = priv; + iu->conf = conf; + iu->pool = pool; + + if (rmr_flag != RMR_OP_FLUSH && sg_len) { + rmr_map_calc_chunk(pool, offset, length, &id); + /* + * We are not ready to process IO requests which are across chunk boundary. + * The main area which needs work is triggering sync IO (see rmr-req.c) which + * holding the IO which touches multiple chunks. And then making sure other IOs + * which overlap these chunks are held properly, and restarted once the corresponding + * chunk is synced. 
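+ *
+ * Until that work is done, such cross-chunk requests are treated as a
+ * programming error and are caught by the BUG_ON() below.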
+ */ + BUG_ON(id.a > 1); + iu->msg.id_a = cpu_to_le64(id.a); + iu->msg.id_b = cpu_to_le64(id.b); + } + + if (rmr_flag == RMR_OP_READ) { + iu->sg = sg; + iu->sg_cnt = sg_cnt; + } else if (!pool->sync && rmr_flag == RMR_OP_WRITE) { + /* + * We take this path only for request from client side + * Never from rmr_req_remote_read. + */ + int failed_cnt = 0; + int i; + + atomic_set(&iu->succeeded, 0); + idx = srcu_read_lock(&pool->sess_list_srcu); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + struct rmr_clt_pool_sess *ps; + enum rmr_clt_pool_sess_state state; + u8 mid = pool->pool_md.srv_md[i].member_id; + + if (!mid) + continue; + + ps = xa_load(&pool->stg_members, mid); + if (ps) { + state = atomic_read(&ps->state); + if (state != RMR_CLT_POOL_SESS_FAILED && + state != RMR_CLT_POOL_SESS_RECONNECTING) + continue; + } + /* ps == NULL (disassembled) or FAILED/RECONNECTING */ + if (WARN_ON(failed_cnt >= RMR_POOL_MAX_SESS)) + break; + iu->msg.map_ver = cpu_to_le64(pool->map_ver); + iu->msg.failed_id[failed_cnt] = mid; + failed_cnt++; + rmr_clt_map_add_id(pool, mid, id); + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + iu->msg.failed_cnt = failed_cnt; + } else if (pool->sync) { + pr_err("rmr_clt_request: Sync sessions do not process writes\n"); + return -EPERM; + } + + vec = (struct kvec) { + .iov_base = &iu->msg, + .iov_len = sizeof(iu->msg) + }; + + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(iu->sess_list), entry) { + struct rmr_clt_sess *clt_sess; + + pool_sess = sess_iu->pool_sess; + clt_sess = pool_sess->clt_sess; + iu->msg.member_id = pool_sess->member_id; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_REMOVING || + pool_sess->maintenance_mode) { + /* + * The storage for this session is getting removed from + * the pool, or is in maintenance mode. + * Simply complete this IO with error + */ + err = -EAGAIN; + goto complete_io; + } + + pr_debug("Sending %x request to pool %s session %s " + "chunk (%llu, %llu) offset %lu length %lu)\n", + rmr_flag, + pool->poolname, pool_sess->sessname, + id.a, id.b, offset, length); + + if (rmr_flag == RMR_OP_READ) { + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_read_conf, + }; + } else { + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_io_conf, + }; + + /* + * Update mem_id before transmitting each write IO to the corresponding + * server. + */ + iu->msg.mem_id = cpu_to_le32(sess_iu->mem_id); + } + + trace_rmr_clt_request(dir, sess_iu); + + err = rtrs_clt_request(dir, &req_ops, clt_sess->rtrs, + sess_iu->permit, &vec, 1, sg_len, + sg, sg_cnt); + +complete_io: + if (err) { + if (rmr_flag == RMR_OP_READ) + msg_read_conf(sess_iu, err); + else + msg_io_conf(sess_iu, err); + } + } + rmr_put_iu(iu); + + return 0; +} +EXPORT_SYMBOL(rmr_clt_request); + +/** + * rmr_clt_get_cu() - Allocate and return a command unit. + * + * @pool: rmr pool for which the command unit is to be allocated + * + * Description: + * Allocates and returns a command unit for the rmr pool. The command unit contains a list of + * session units, for each session which is not in the "REMOVING" state. 
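+ * Sessions in the FAILED state are skipped as well, but are counted in
+ * the command unit's failed_state so the caller can report them.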
+ *
+ * Return:
+ * Pointer to the command unit
+ */
+static struct rmr_clt_cmd_unit *rmr_clt_get_cu(struct rmr_pool *pool)
+{
+ struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv;
+ struct rmr_clt_pool_sess *pool_sess;
+ struct rmr_clt_cmd_unit *cmd_unit;
+ struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu;
+ int idx;
+
+ if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state)) {
+ pr_err("%s: Pool %s not in use\n", __func__, pool->poolname);
+ rmr_clt_dump_state(clt_pool);
+ return NULL;
+ }
+
+ /*
+ * We get the inflight ref first.
+ * If we see that an IO freeze is in progress, we put the ref and wait for it to unfreeze.
+ *
+ * The while loop protects us from a parallel freeze, such as
+ * a leg deletion followed right away by a call to rmr_clt_close.
+ *
+ * We are guaranteed not to loop forever, since rmr_clt_close can be called only
+ * once, and there is a limited number of legs to delete.
+ */
+ percpu_ref_get(&pool->ids_inflight_ref);
+ while (atomic_read(&clt_pool->io_freeze) > 0) {
+ percpu_ref_put(&pool->ids_inflight_ref);
+ wait_event(clt_pool->map_update_wq, !atomic_read(&clt_pool->io_freeze));
+
+ /*
+ * Once IO is unfrozen, we check if the state of the pool has changed.
+ * It could be that rmr_clt_close was called, and hence the state is not IN_USE.
+ * Or it could be that the last leg was deleted, and we are not in the JOINED state.
+ *
+ * In both cases we cannot service IOs, hence we fail.
+ */
+ if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state) ||
+ !test_bit(RMR_CLT_POOL_STATE_JOINED, &clt_pool->state)) {
+ pr_err("%s: Failed to get inflight IO ref.\n", __func__);
+ pr_err("%s: Pool %s is not joined or used\n", __func__, pool->poolname);
+ rmr_clt_dump_state(clt_pool);
+ return NULL;
+ }
+
+ percpu_ref_get(&pool->ids_inflight_ref);
+ }
+
+ cmd_unit = kzalloc(sizeof(*cmd_unit), GFP_KERNEL);
+ if (!cmd_unit) {
+ percpu_ref_put(&pool->ids_inflight_ref);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&cmd_unit->sess_list);
+ cmd_unit->pool = pool;
+ cmd_unit->clt_pool = clt_pool;
+ atomic_set(&cmd_unit->succeeded, 0);
+
+ idx = srcu_read_lock(&pool->sess_list_srcu);
+ /*
+ * Acquire the permits for all sessions.
+ * We continue only if we manage to get permits for all "normal" sessions.
+ */ + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_REMOVING) + continue; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED) { + cmd_unit->failed_state++; + continue; + } + + sess_iu = rmr_get_sess_iu(pool_sess, RTRS_ADMIN_CON, RTRS_PERMIT_NOWAIT); + if (unlikely(!sess_iu)) + goto put_sessions; + + sess_iu->rmr_cmd_unit = cmd_unit; + + cmd_unit->num_sessions++; + list_add_tail(&(sess_iu->entry), (&cmd_unit->sess_list)); + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + refcount_set(&cmd_unit->refcount, cmd_unit->num_sessions); + + return cmd_unit; + +put_sessions: + srcu_read_unlock(&pool->sess_list_srcu, idx); + + /* Free sess_ius */ + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(cmd_unit->sess_list), entry) { + if (!list_empty(&sess_iu->entry)) + list_del_init(&sess_iu->entry); + rmr_put_sess_iu(sess_iu->pool_sess, sess_iu); + } + + rmr_clt_put_cu(cmd_unit); + + return NULL; +} + +/** + * rmr_clt_cmd_err_conf() - Calls confirmation function for commands + * + * @work: schedules work + * + * Description: + * In case of error in the user command path, we cannot call the confirmation function + * directly, since it might end up calling confirmation function of the user itself. + * Hence a work is scheduled to call the confirmation function in case the code for sending + * user commands itself fails. + */ +static void rmr_clt_cmd_err_conf(struct work_struct *work) +{ + struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work); + + msg_cmd_conf(sess_iu, sess_iu->errno); +} + +/** + * rmr_clt_cmd_with_rsp() - Sends a user command to all sessions of an rmr pool + * + * @pool: rmr pool to which the command is for + * @conf: confirmation function to be called after completion + * @priv: pointer to priv data, to be returned to user while calling conf function + * @usr_vec: kvec containing user data (mostly command messages?) + * @nr: number of kvecs + * @buf: buf where the response from the user server is to be directed + * The buf must be physically contiguous in memory (kmalloc()'d). + * @buf_len: length of the buffer + * @size: size of the buf to be sent to a single session + * + * Description: + * This function provides an interface for the user to send commands to the server side. + * The command is sent as a read, so that the response from the user srv side can be received + * The buffer sent by the user is meant to receive the response from the user server side. + * The size of the buffer is set during rmr_clt_open. + * + * Return: + * 0 on success + * negative errno in case of error + * + * Context: + * Inflight commands will block map update, until the inflights are completed. + */ +int rmr_clt_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv, + const struct kvec *usr_vec, size_t nr, void *buf, int buf_len, size_t size) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + struct rmr_clt_cmd_unit *cmd_unit; + struct rmr_msg_pool_cmd msg = {}; + struct rtrs_clt_req_ops req_ops; + struct kvec *vec; + int i, j, err = 0; + + /* + * TODO: kvmalloc() memory is yet to be supported for SG I/O. 
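+ *
+ * Until then, vmalloc()/kvmalloc()-backed buffers are rejected with
+ * -EINVAL by the is_vmalloc_addr() check below.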
+ */ + if (is_vmalloc_addr(buf)) + return -EINVAL; + + if (buf_len != (RMR_POOL_MAX_SESS * size)) + return -EINVAL; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_USER; + + /* + * RMR msg struct + user vecs + */ + vec = kzalloc((1 + nr) * sizeof(*vec), GFP_KERNEL); + if (!vec) + return -ENOMEM; + + /* + * RMR msg struct first, + * followed by the user kvecs + */ + vec[0].iov_base = &msg; + vec[0].iov_len = sizeof(msg); + for (i = 1, j = 0; j < nr; i++, j++) { + vec[i].iov_base = usr_vec[j].iov_base; + vec[i].iov_len = usr_vec[j].iov_len; + + msg.user_cmd.usr_len += usr_vec[j].iov_len; + } + + cmd_unit = rmr_clt_get_cu(pool); + if (!cmd_unit) { + err = -ENOMEM; + goto out; + } + + cmd_unit->conf = conf; + cmd_unit->priv = priv; + + i = 0; + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(cmd_unit->sess_list), entry) { + pool_sess = sess_iu->pool_sess; + + req_ops = (struct rtrs_clt_req_ops){ + .priv = sess_iu, + .conf_fn = msg_cmd_conf, + }; + + /* + * The user expects each node to be able to send back data of this "size" as + * response. + * So divide the user buffer into chunks of "size", and send them to each leg. + */ + sg_init_one(&sess_iu->sg, buf + (i * size), size); + + trace_rmr_clt_cmd_with_rsp(READ, sess_iu); + + err = rtrs_clt_request(READ, &req_ops, pool_sess->clt_sess->rtrs, sess_iu->permit, + vec, (1 + nr), size, &sess_iu->sg, 1); + if (err) { + /* + * We want to deal with this error just like we deal with the error + * received from the conf function returned from rtrs. + * This would help us to inform the user the correct number of commands + * which failed on the rmr level (rtrs is also rmr level for user). + */ + pr_warn("rtrs_clt_request Failed with err %d\n", err); + sess_iu->errno = err; + INIT_WORK(&sess_iu->work, rmr_clt_cmd_err_conf); + schedule_work(&sess_iu->work); + err = 0; + } + + i++; + } + + /* + * No session to send command + */ + if (i == 0) { + rmr_clt_put_cu(cmd_unit); + err = -EINVAL; + } + +out: + kfree(vec); + + return err; +} +EXPORT_SYMBOL(rmr_clt_cmd_with_rsp); + +/** + * rmr_clt_send_cmd_with_data() - send command containing data buffer as a payload or response + * + * @pool: rmr pool to send command + * @pool_sess: client pool session used to send + * @msg: initialized command message describing the command + * @buf: pointer to the data buffer for data transfers + * @buflen: size of the buffer in bytes + * + * Description: + * Performs sending the command described by msg with a payload or response + * in the buf. + * + * Return: + * 0 on success, error code otherwise. + * + * Context: + * This function blocks while sending the buffer. + * + * Locks: + * should be called under srcu_read_lock since it uses pool_sess + */ +int rmr_clt_send_cmd_with_data(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen) +{ + struct rmr_clt_sess_iu *sess_iu; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct kvec vec = { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + int errno = 0, err = 0; + int dir; + + switch (msg->cmd_type) { + case RMR_CMD_MAP_CHECK: + case RMR_CMD_READ_MAP_BUF: + case RMR_CMD_MAP_GET_VER: + case RMR_CMD_MD_SEND: + case RMR_CMD_MAP_SET_VER: + dir = READ; + break; + case RMR_CMD_MAP_TEST: + case RMR_CMD_SEND_MAP_BUF: + case RMR_CMD_SEND_MD_BUF: + dir = WRITE; + break; + default: + pr_err("%s: pool %s cmd type %u is not supported\n", + __func__, pool->poolname, msg->cmd_type); + return -EINVAL; + } + + // TODO: why io_con not admin? 
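+ /*
+ * Fail fast if the transport session is already disconnected; otherwise
+ * take a permit-backed sess_iu on the I/O connection and send the
+ * command as a blocking user message.
+ */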
+ if (clt_sess->state == RMR_CLT_SESS_DISCONNECTED) { + pr_debug("The rmr client session %s state is disconnected\n", clt_sess->sessname); + err = -EINVAL; + goto err; + } + + sess_iu = rmr_msg_get_iu(pool_sess, RTRS_IO_CON, RTRS_PERMIT_WAIT, 2); + if (unlikely(!sess_iu)) { + err = -ENOMEM; + goto err; + } + + sess_iu->buf = buf; + sg_init_one(&sess_iu->sg, buf, buflen); + + err = send_usr_msg(clt_sess->rtrs, dir, sess_iu, + &vec, 1, buflen, &sess_iu->sg, 1, + msg_pool_cmd_map_content_conf, &errno, WAIT); + if (unlikely(err)) { + rmr_msg_put_iu(pool_sess, sess_iu); + } else { + err = errno; + } + + rmr_msg_put_iu(pool_sess, sess_iu); + +err: + return err; +} + +/** + * rmr_clt_pool_member_synced() - check if the pool member has no data to sync + * + * @pool: rmr pool in which we perform the check + * @member_id: id of the pool member tto check + * + * Description: + * Send the check map command to the pool member with the specified id. + * Pool member returns whether he has unsynced chunks or not. + * + * Return: + * error code if failed to send, 0 if pool member is not synced completely, + * 1 if pool member is synced (has no dirty chunks in his map). + * + * Context: + * This function blocks while sending the command. + * + * Locks: + * no + */ +int rmr_clt_pool_member_synced(struct rmr_pool *pool, u8 member_id) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd_rsp rsp = {}; + struct rmr_msg_pool_cmd msg = {}; + int ret = 0, idx; + enum rmr_clt_pool_sess_state state; + + pr_debug("start looking for session with member_id=%u\n", member_id); + idx = srcu_read_lock(&pool->sess_list_srcu); + + pool_sess = __find_sess_by_member_id(pool, member_id); + if (!pool_sess) { + pr_err("in pool %s failed to find sess with a member_id=%u\n", + pool->poolname, member_id); + ret = -ENOENT; + goto out; + } + + pr_debug("found session %s with member_id=%u\n", + pool_sess->sessname, member_id); + + state = atomic_read(&pool_sess->state); + if (state == RMR_CLT_POOL_SESS_FAILED || + state == RMR_CLT_POOL_SESS_REMOVING) { + pr_debug("pool %s session %s is in %s state, cannot send cmd %s\n", + pool->poolname, pool_sess->sessname, + rmr_clt_sess_state_str(state), rmr_get_cmd_name(msg.cmd_type)); + ret = -EINVAL; + goto out; + } + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MAP_CHECK; + + pr_debug("send cmd %u to %s\n", msg.cmd_type, pool_sess->sessname); + ret = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, &rsp, sizeof(rsp)); + if (ret) { + pr_err("%s: For pool %s failed to %s, err %d\n", + __func__, pool->poolname, rmr_get_cmd_name(msg.cmd_type), ret); + goto out; + } + + if (rsp.value) + ret = 1; // other side reported map is clear + + pr_debug("send cmd %u to %s is done\n", msg.cmd_type, pool_sess->sessname); +out: + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return ret; +} +EXPORT_SYMBOL(rmr_clt_pool_member_synced); + +/** + * rmr_pool_md_to_buf - Fill the buffer with the metadata + * + * @pool: rmr pool contains the metadata. It must be a non-sync pool, + * either client or server pool. + * @buf: buffer to fill with the metadata. 
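+ * For a client pool the whole struct rmr_pool_md is copied to the start
+ * of @buf; for a server pool only srv_md[0] is copied, at offset
+ * RMR_CLT_MD_SIZE into @buf.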
+ * + */ +static void rmr_clt_md_to_buf(struct rmr_pool *pool, u8 *buf) +{ + struct rmr_pool_md *pool_md; + struct rmr_srv_md *srv_md; + + if (pool->is_clt) { + pool_md = (struct rmr_pool_md *)buf; + /* copy the entire client pool md */ + memcpy(pool_md, &pool->pool_md, sizeof(struct rmr_pool_md)); + return; + } + + srv_md = (struct rmr_srv_md *)(&buf[RMR_CLT_MD_SIZE]); + memcpy(srv_md, &pool->pool_md.srv_md[0], RMR_SRV_MD_SIZE); +} + +/** + * rmr_clt_pool_send_md_all() - Send metadata of rmr pool + * + * Description: + * Send metadata of the src pool to all sessions of the client pool. + * 1) If the client pool is sync pool, it sends the entire server pool + * metadata back after the leader reads the metadata of its connected + * nodes. + * 2) If it is non-sync, send the client pool metadata to storage node + * backups. + */ +int rmr_clt_pool_send_md_all(struct rmr_pool *src_pool, struct rmr_pool *clt_pool) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + void *buf; + u32 buflen; + int err = 0, idx; + + if (!clt_pool) { + pr_err("Cannot send metadata when clt_pool is NULL\n"); + return -EINVAL; + } + + if (src_pool->sync) { + pr_err("Cannot send metadata when src_pool is sync\n"); + return -EINVAL; + } + + buf = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + buflen = RMR_MD_SIZE; + if (!buf) + return -ENOMEM; + + rmr_clt_md_to_buf(src_pool, buf); + + /* + * It will continue to send the md to the next session even if the previous send failed. + */ + idx = srcu_read_lock(&clt_pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &clt_pool->sess_list, entry, + (srcu_read_lock_held(&clt_pool->sess_list_srcu))) { + pr_debug("Start sending md for pool %s; to session %s with member_id %d\n", + src_pool->poolname, pool_sess->sessname, pool_sess->member_id); + + rmr_clt_init_cmd(clt_pool, &msg); + msg.cmd_type = RMR_CMD_SEND_MD_BUF; + msg.send_md_buf_cmd = (struct rmr_msg_send_md_buf_cmd) { + .sync = clt_pool->sync, + /* the receiver of buffer is the leader */ + .receiver_id = pool_sess->member_id, + /* change flags in cmd message */ + .flags = RMR_OP_MD_WRITE, + }; + + err = rmr_clt_send_cmd_with_data(clt_pool, pool_sess, &msg, buf, buflen); + if (err) { + pr_debug("Cannot send the clt/srv_md of entire pool to the pool sess %s\n", + pool_sess->sessname); + continue; + } + } + + pr_debug("send_md done\n"); + + kfree(buf); + + srcu_read_unlock(&clt_pool->sess_list_srcu, idx); + return err; +} +EXPORT_SYMBOL(rmr_clt_pool_send_md_all); + +static int rmr_clt_start_send_md(struct rmr_pool *pool) +{ + return rmr_clt_pool_send_md_all(pool, pool); +} + +/** + * rmr_clt_del_stor_from_pool() - Notify pool members that a storage node is leaving + * + * @pool_sess: The pool session of the departing storage node. + * @delete: True for a permanent deletion (%RMR_POOL_INFO_MODE_DELETE); + * false for a temporary disassembly (%RMR_POOL_INFO_MODE_DISASSEMBLE). + * + * Sends a POOL_INFO REMOVE message to all other active pool members so they + * can update their dirty maps and membership state accordingly. + * + * Return: + * 0 on success, negative error code on failure. 
+ */ +int rmr_clt_del_stor_from_pool(struct rmr_clt_pool_sess *pool_sess, bool delete) +{ + enum rmr_pool_info_mode mode; + int err; + + if (delete) + mode = RMR_POOL_INFO_MODE_DELETE; + else + mode = RMR_POOL_INFO_MODE_DISASSEMBLE; + + err = rmr_clt_send_pool_info(pool_sess, RMR_POOL_INFO_OP_REMOVE, mode, false); + if (err) { + pr_err("rmr_clt_send_pool_info failed for session\n"); + return err; + } + + return 0; +} + +static int __init rmr_client_init(void) +{ + int err; + + pr_info("Loading module %s, version %s, proto %s\n", KBUILD_MODNAME, + RMR_VER_STRING, RMR_PROTO_VER_STRING); + + err = rmr_clt_create_sysfs_files(); + if (err) { + pr_err("Failed to load module," + " creating sysfs device files failed, err: %d\n", + err); + goto out; + } + + return 0; + +out: + return err; +} + +static void __exit rmr_client_exit(void) +{ + struct rmr_pool *pool, *tmp; + + pr_info("Unloading module\n"); + + list_for_each_entry_safe(pool, tmp, &pool_list, entry) + (void) rmr_clt_remove_pool_from_sysfs(pool, NULL); + + rmr_clt_destroy_sysfs_files(); + pr_info("Module unloaded\n"); +} + +module_init(rmr_client_init); +module_exit(rmr_client_exit); diff --git a/drivers/infiniband/ulp/rmr/rmr-clt.h b/drivers/infiniband/ulp/rmr/rmr-clt.h new file mode 100644 index 000000000000..c50651efe4a3 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt.h @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_CLT_H +#define RMR_CLT_H + +#include +#include "rmr-pool.h" + +#define RECONNECT_DELAY 30 +#define MAX_RECONNECTS -1 +#define RTRS_LINK_NAME "rtrs" + +#define RMR_MAP_CLEAN_DELAY_MS 5000 +#define RMR_RECOVER_INTERVAL_MS 3000 + +enum rmr_clt_sess_state { + RMR_CLT_SESS_DISCONNECTED = 1, + RMR_CLT_SESS_CONNECTED, +}; + +struct rmr_clt_sess { + char sessname[NAME_MAX]; + struct kobject kobj; + struct mutex lock; + struct rtrs_clt_sess *rtrs; + bool rtrs_ready; + /* server this session is connected to */ + int queue_depth; + u32 max_io_size; + u32 max_segments; + struct list_head pool_sess_list; + struct list_head g_list; + struct kref kref; + enum rmr_clt_sess_state state; +}; + +/* + * NB: If you change here, make sure the changes are in sync with + * pool_sess state machine routine i.e. pool_sess_change_state(). 
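+ *
+ * The state is stored in rmr_clt_pool_sess::state as an atomic_t and is
+ * expected to change only through pool_sess_change_state().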
+ */ +enum rmr_clt_pool_sess_state { + RMR_CLT_POOL_SESS_CREATED = 1, // No IO, No dirty map addition, Yes cmd msgs + RMR_CLT_POOL_SESS_NORMAL, // Yes IO, No dirty map addition, Yes cmd msgs + RMR_CLT_POOL_SESS_FAILED, // No IO, Yes dirty map addition, No cmd msgs + RMR_CLT_POOL_SESS_RECONNECTING, // No IO, Yes, dirty map addition, Yes cmd msgs + // But not with an updated map + + RMR_CLT_POOL_SESS_REMOVING // No IO, No dirty map addition, Yes cmd msgs + // Getting removed from pool +}; + +struct rmr_clt_pool_sess { + char sessname[NAME_MAX]; + struct rmr_pool *pool; + struct kobject kobj; + u8 member_id; /* refers to the pool id on the */ + struct kobject sess_kobj; + struct list_head entry; /* for pool->sess_list */ + struct list_head clt_sess_entry; /* for clt_sess->pool_sess_list */ + struct rmr_clt_sess *clt_sess; + atomic_t state; /* rmr_clt_pool_sess_state */ + u8 ver; /* protocol version */ + u8 pool_id; /* refers to the pool id on the */ + bool maintenance_mode; /* If the pool is in maintenance mode or not */ + bool was_last_authoritative; /* last NORMAL sess before it went FAILED; + * carries complete dirty maps for all members */ +}; + +struct rmr_clt_stats { + struct kobject kobj_stats; + atomic_t read_retries; +}; + +/* + * State descriptions: + * RMR_CLT_POOL_STATE_JOINED: An rmr_clt_pool which has one or more legs (rmr_clt_pool_sess) + * added to it. This means the pool has joined into pools from + * storage nodes + * + * RMR_CLT_POOL_STATE_IN_USE: An rmr_clt_pool which is in use by an upper layer client. This + * is usually done by calling rmr_clt_open + * + * Note: When adding a new state, + * remember to add an entry in the function rmr_get_clt_pool_state_name() + */ +enum rmr_clt_pool_state { + RMR_CLT_POOL_STATE_JOINED = 0, + RMR_CLT_POOL_STATE_IN_USE, + // RMR_CLT_POOL_STATE_DEGRADED, uncomment and use + // RMR_CLT_POOL_STATE_DIRTY, + RMR_CLT_POOL_STATE_MAX, +}; + +struct rmr_clt_pool { + struct rmr_pool *pool; + refcount_t refcount; + unsigned long state; + struct mutex clt_pool_lock; + + size_t queue_depth; + + struct rmr_clt_stats stats; + struct kobject stats_kobj; + + void *priv; /* provided by user */ + rmr_clt_ev_fn *link_ev; /* deliver events to user */ + + atomic_t io_freeze; + wait_queue_head_t map_update_wq; + struct mutex io_freeze_lock; + + struct workqueue_struct *recover_wq; + struct delayed_work recover_dwork; + + /* use sessions round robbin to read */ + struct rmr_clt_pool_sess __rcu *__percpu *pcpu_sess; +}; + +struct rmr_iu_comp { + wait_queue_head_t wait; + int errno; +}; + +/** + * rmr_iu - reserves resources needed to do an I/O op on pool + */ +struct rmr_iu { + struct rmr_pool *pool; + unsigned int mem_id; + struct list_head sess_list; /* list of per-session tags */ + u8 num_sessions; + refcount_t ref; /* lifetime refcount */ + struct rmr_msg_io msg; + int errno; + atomic_t succeeded; + refcount_t refcount; + rmr_conf_fn *conf; + void *priv; + /* for retry of failed reads */ + struct work_struct work; + struct scatterlist *sg; + unsigned int sg_cnt; +}; + +struct rmr_clt_sess_iu { + void *buf; /* for session messages */ + struct rtrs_permit *permit; + struct rmr_clt_pool_sess *pool_sess; + int errno; + union { + /* for session messages only */ + struct scatterlist sg; + /* for tag->sess_list of io messages*/ + struct list_head entry; + }; + + /* for session messages only */ + struct work_struct work; + + /* for io requests */ + struct rmr_iu *rmr_iu; + unsigned int mem_id; + + /* for command messages */ + struct rmr_clt_cmd_unit *rmr_cmd_unit; + 
+ /* for session messages only */ + struct rmr_iu_comp comp; + atomic_t refcount; +}; + +struct rmr_clt_iu_comp { + wait_queue_head_t wait; + int errno; +}; + +struct rmr_clt_cmd_unit { + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + + struct list_head sess_list; + int num_sessions; + + int failed_state; + int errno; + atomic_t succeeded; + refcount_t refcount; + + rmr_conf_fn *conf; + void *priv; +}; + +/* rmr-clt.c */ +struct rmr_pool *rmr_clt_create_pool(const char *name); +void rmr_put_clt_pool(struct rmr_clt_pool *clt_pool); + +void rmr_clt_change_pool_state(struct rmr_clt_pool *rmr_clt_pool, + enum rmr_clt_pool_state new_state, bool set); +int rmr_clt_remove_pool_from_sysfs(struct rmr_pool *pool, + const struct attribute *sysfs_self); +struct rmr_clt_sess *find_and_get_or_create_clt_sess(char *sessname, + struct rtrs_addr *paths, + size_t path_cnt); +struct rmr_clt_pool_sess *rmr_clt_add_pool_sess(struct rmr_pool *pool, + struct rmr_clt_sess *clt_sess, bool create); +void rmr_clt_sess_put(struct rmr_clt_sess *sess); +void rmr_clt_del_pool_sess(struct rmr_clt_pool_sess *sess); +void rmr_clt_destroy_pool_sess(struct rmr_clt_pool_sess *sess, bool delete); + +const char *rmr_clt_sess_state_str(enum rmr_clt_pool_sess_state state); +void resend_join_pool(struct rmr_clt_sess *sess); +int rmr_clt_reconnect_sess(struct rmr_clt_sess *sess, + const struct rtrs_addr *paths, + size_t path_cnt); +int rmr_clt_start_last_io_update(struct rmr_pool *pool); +int rmr_clt_set_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess); +int rmr_clt_enable_sess(struct rmr_clt_pool_sess *sess); + +int rmr_clt_send_map_update(struct rmr_pool *pool, struct rmr_iu *iu); + +int rmr_clt_pool_send_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg); +int rmr_clt_send_cmd_with_data(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen); +int rmr_clt_map_add_id(struct rmr_pool *pool, int stg_id, rmr_id_t id); +void rmr_clt_init_cmd(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg); +int rmr_clt_pool_send_cmd(struct rmr_clt_pool_sess *sess, struct rmr_msg_pool_cmd *msg, bool wait); +int rmr_clt_del_stor_from_pool(struct rmr_clt_pool_sess *pool_sess, bool delete); +bool rmr_clt_sess_is_sync(struct rmr_clt_pool_sess *sess); +int send_msg_leave_pool(struct rmr_clt_pool_sess *pool_sess, bool delete, bool wait); +void rmr_clt_free_pool_sess(struct rmr_clt_pool_sess *pool_sess); +int rmr_clt_send_map(struct rmr_pool *map_src_pool, struct rmr_pool *clt_pool, + const struct rmr_msg_map_send_cmd *map_send_cmd, rmr_map_filter filter); +int rmr_clt_test_map(struct rmr_pool *src_pool, struct rmr_pool *dst_pool); +int rmr_clt_send_cmd_with_data_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen); +int rmr_clt_pool_send_md_all(struct rmr_pool *src_pool, struct rmr_pool *clt_pool); +int rmr_clt_pool_send_cmd_all(struct rmr_pool *pool, enum rmr_msg_cmd_type cmd_type); +void recover_work(struct work_struct *work); + +int rmr_clt_pool_member_synced(struct rmr_pool *pool, u8 member_id); + +bool pool_sess_change_state(struct rmr_clt_pool_sess *pool_sess, + enum rmr_clt_pool_sess_state newstate); + +void rmr_clt_pool_io_freeze(struct rmr_clt_pool *clt_pool); +void rmr_clt_pool_io_unfreeze(struct rmr_clt_pool *clt_pool); +void rmr_clt_pool_io_wait_complete(struct rmr_clt_pool *clt_pool); +int rmr_clt_pool_try_enable(struct rmr_pool *pool); +int send_msg_enable_pool(struct rmr_clt_pool_sess *pool_sess, bool enable); + 
+void rmr_get_iu(struct rmr_iu *iu); +void rmr_put_iu(struct rmr_iu *iu); +void rmr_msg_put_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu); +void wake_up_iu_comp(struct rmr_clt_sess_iu *sess_iu); +void msg_conf(void *priv, int errno); + +/* rmr-map-mgmt.c */ +void send_map_check(struct rmr_clt_pool_sess *pool_sess); +void send_store_check(struct rmr_clt_pool_sess *pool_sess); +int send_map_get_version(struct rmr_clt_pool_sess *pool_sess, u64 *ver); +int send_discard(struct rmr_clt_pool_sess *pool_sess, u8 cmd_type, u8 member_id); +int rmr_clt_handle_map_check_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp); +int rmr_clt_handle_store_check_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp); +int rmr_clt_read_map(struct rmr_pool *pool); +int rmr_clt_spread_map(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess_chosen, + bool enable, bool skip_normal); +int rmr_clt_unset_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess); +void sched_map_add(struct work_struct *work); +void msg_pool_cmd_map_content_conf(struct work_struct *work); + +/* rmr-clt-sysfs.c */ +int rmr_clt_create_sysfs_files(void); +void rmr_clt_destroy_sysfs_files(void); +void rmr_clt_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self); +int rmr_clt_create_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess); +void rmr_clt_destroy_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess); + +int rmr_clt_reset_read_retries(struct rmr_clt_stats *stats, bool enable); +ssize_t rmr_clt_stats_read_retries_to_str(struct rmr_clt_stats *stats, char *page); + +#endif /* RMR_CLT_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-map-mgmt.c b/drivers/infiniband/ulp/rmr/rmr-map-mgmt.c new file mode 100644 index 000000000000..cade5dbf2e20 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-map-mgmt.c @@ -0,0 +1,933 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) — client MAP-exchange management + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include + +#include "rmr-clt.h" +#include "rmr-clt-trace.h" + +void send_map_check(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MAP_CHECK; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + return; + } +} + +void send_store_check(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_STORE_CHECK; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); //am : why wait ? 
+ if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + pr_err("sess %s failed to send store check with err %d\n", + pool_sess->sessname, err); + } +} + +int send_map_get_version(struct rmr_clt_pool_sess *pool_sess, u64 *ver) +{ + struct rmr_msg_pool_cmd_rsp rsp = {}; + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MAP_GET_VER; + + err = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, &rsp, sizeof(rsp)); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + return -EINVAL; + } + + *ver = rsp.value; + + return 0; +} + +int send_discard(struct rmr_clt_pool_sess *pool_sess, u8 cmd_type, u8 member_id) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = cmd_type; + msg.send_discard_cmd.member_id = member_id; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + } + + return err; +} + +int rmr_clt_handle_map_check_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_dirty_id_map *map; + + pr_debug("pool %s sess %s member_id %u, rsp->value=%llu\n", + pool->poolname, pool_sess->sessname, rsp->member_id, rsp->value); + if (!rsp->value) // map is not empty on stg + return 0; + + pr_debug("pool %s server with id %u has empty dirty map, lets clean it.\n", + pool->poolname, rsp->member_id); + map = rmr_pool_find_map(pool, rsp->member_id); + if (!map) { + pr_err("%s: pool %s no map found for member_id %u\n", + __func__, pool->poolname, rsp->member_id); + return -EINVAL; + //TODO: handle this, how? 
+ } + + if (!rmr_map_empty(map)) { + pr_debug("pool %s dirty map for member_id %d is not empty, map->ts %lu (now %lu)\n", + pool->poolname, rsp->member_id, map->ts, jiffies); + if (time_after(jiffies, map->ts + msecs_to_jiffies(RMR_MAP_CLEAN_DELAY_MS))) { + pr_info("%s: pool %s clear dirty map for member_id %d\n", + __func__, pool->poolname, rsp->member_id); + rmr_map_unset_dirty_all(map); + map->ts = jiffies; + } + } + + pr_debug("pool %s map with member_id %u cleaned\n", + pool->poolname, map->member_id); + return 0; +} + +int rmr_clt_handle_store_check_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp) +{ + struct rmr_pool *pool = pool_sess->pool; + int err = 0; + + pr_debug("pool %s sess %s member_id %u, rsp->value=%llu\n", + pool->poolname, pool_sess->sessname, rsp->member_id, rsp->value); + if (!rsp->value) { + pr_debug("pool %s sess %s (state=%d) reported that store is not available, changing state\n", + pool->poolname, pool_sess->sessname, atomic_read(&pool_sess->state)); + return 0; + } + pr_info("pool %s sess %s (state=%d) reported that store is available, changing state\n", + pool->poolname, pool_sess->sessname, atomic_read(&pool_sess->state)); + + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING); + + if (!pool_sess->maintenance_mode) { + err = rmr_clt_pool_try_enable(pool); + if (err) { + pr_err("%s: pool %s try_enable failed for sess %s: %d\n", + __func__, pool->poolname, pool_sess->sessname, err); + return err; + } + } + + return 0; +} + +/* + * Pre-requisite: rcu read lock should be held by caller + */ +static struct rmr_clt_pool_sess *rmr_clt_get_first_reconnecting_session(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess; + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_RECONNECTING) + return pool_sess; + } + + return NULL; +} + +/** + * rmr_clt_pool_map_xfer() - transfer dirty maps between rmr client and server + * + * @pool: the rmr pool used for map transfers + * @pool_sess: client pool session that is used for map transfer + * @cmd_type: pool command type generated for this transfer, for now only + * RMR_CMD_READ_MAP_BUF, RMR_CMD_SEND_MAP_BUF, RMR_CMD_MAP_TEST are used + * @buf: pointer to the data buffer for data transfers + * @buflen: size of the buffer in bytes + * @map_idx: index of the map in dirty map array from which we start to send or receive + * the data + * @offset: key in the map from which we start to send/receive the data about the maps + * + * Description: + * Performs transfer of the information about the dirty maps starting from the map with + * position map_idx in the array of dirty maps and from the start_key at that map. + * cmd types are handled as follows: + * RMR_CMD_READ_MAP_BUF - read the information about the maps from the pool and fill buf + * RMR_CMD_SEND_MAP_BUF - send buf with filled data to the pull + * RMR_CMD_MAP_TEST - send the buf with data to the pool to perform map comparison + * + * Return: + * 0 on success, error code otherwise. + * + * Context: + * This function blocks while sending the buffer. 
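+ *
+ * Example (illustrative): rmr_clt_read_map() below drives this in a loop with
+ * RMR_CMD_READ_MAP_BUF, feeding the (map_idx, slp_idx) cursor returned in the
+ * reply's rmr_map_buf_hdr back into the next call, until the server reports
+ * member_id == 0, i.e. there is no further map data to fetch.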
+ * + * Locks: + * should be called under srcu_read_lock since it uses pool_sess + */ +static int rmr_clt_pool_map_xfer(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess, + int cmd_type, void *buf, unsigned int buflen, + u8 map_idx, u64 slp_idx) +{ + struct rmr_msg_pool_cmd msg = {}; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = cmd_type; + + msg.map_buf_cmd.map_idx = map_idx; + msg.map_buf_cmd.slp_idx = slp_idx; + + err = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, buf, buflen); + if (err) { + pr_debug("pool %s failed to send map xfer cmd %u, err %d\n", + pool->poolname, cmd_type, err); + return err; + } + + return 0; +} + +int rmr_clt_read_map(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess = NULL; + struct rmr_map_buf_hdr *map_buf_hdr; + u8 map_idx = 0; + u64 slp_idx = 0; + int err = 0, buflen, idx; + void *buf; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + pool_sess = rmr_clt_get_first_reconnecting_session(pool); + if (pool_sess == NULL) { + srcu_read_unlock(&pool->sess_list_srcu, idx); + pr_err("%s: No created session found\n", __func__); + return -EINVAL; + } + + buflen = RTRS_IO_LIMIT; + buf = kzalloc(buflen, GFP_KERNEL); + if (!buf) { + pr_err("%s: Error allocating buffer\n", __func__); + err = -ENOMEM; + goto ret; + } + + while (true) { + err = rmr_clt_pool_map_xfer(pool, pool_sess, RMR_CMD_READ_MAP_BUF, + buf, buflen, map_idx, slp_idx); + if (err) { + pr_debug("rmr_clt_pool_map_xfer failed for pool %s, err %d\n", + pool->poolname, err); + goto ret_free; + } + + map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + if (map_buf_hdr->member_id == 0) + break; + + err = rmr_pool_save_map(pool, buf, buflen, false); + if (err) { + pr_err("rmr_pool_save_map failed\n"); + goto ret_free; + } + + map_idx = map_buf_hdr->map_idx; + slp_idx = map_buf_hdr->slp_idx; + } + +ret_free: + kfree(buf); + +ret: + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return err; +} + +/** + * rmr_clt_spread_map() - Spread the map contained in storage node connected by pool_sess_chosen + * + * @pool: The pool + * @pool_sess_chosen: pool session from where the map is to be updated from + * @enable: Whether the last MAP_DONE command should have the enable param set or not + * @skip_normal: If true, freeze IOs before spreading and silently skip any NORMAL + * sessions encountered in the loop (used in Case 1 recovery where + * pool_sess_chosen is itself a NORMAL session that is still serving IOs). + * If false, encountering a NORMAL session is treated as an error. + * + * Description: + * This function spreads the map contained in the storage node connected by given pool + * session. The param enable denotes whether the map update should result in the storage + * going to NORMAL state or not. This is controlled by the enable param in the last MAP_DONE + * message. + * + * Return: + * 0 on success + * Error value on failure + * + * Context: + * srcu_read_lock should be held while calling this function. + */ +int rmr_clt_spread_map(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess_chosen, + bool enable, bool skip_normal) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + int state, err = 0; + + rmr_clt_init_cmd(pool, &msg); + + /* + * If we expect NORMAL session, then we should expect IOs running. + * Which is why we should freeze IOs before doing map_update. 
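+	 *
+	 * After the optional freeze, each eligible RECONNECTING session goes
+	 * through the three-step handshake below: RMR_CMD_MAP_READY to the
+	 * receiver, RMR_CMD_MAP_SEND to pool_sess_chosen (naming the receiver's
+	 * member_id), and RMR_CMD_MAP_DONE carrying the enable flag; any failure
+	 * jumps to the error path, which broadcasts RMR_CMD_MAP_DISABLE.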
+ */ + if (skip_normal) { + /* Freeze IOs */ + rmr_clt_pool_io_freeze(clt_pool); + + /* Wait for all completion */ + rmr_clt_pool_io_wait_complete(clt_pool); + } + + /* + * TODO: Use rmr_clt_handle_discard to check whether the pool + * session has pending discard request to be sent. + * + * Enable this when we fix replace. + * + err = rmr_clt_handle_discard(pool); + if (err) { + pr_err("%s: discard handling failed\n", __func__); + goto err; + } + */ + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (pool_sess == pool_sess_chosen) + continue; + + state = atomic_read(&pool_sess->state); + if (state == RMR_CLT_POOL_SESS_NORMAL) { + if (skip_normal) + continue; + pr_err("%s: pool %s unexpected NORMAL session %s during spread\n", + __func__, pool->poolname, pool_sess->sessname); + err = -EINVAL; + goto err_out; + } + + if (state != RMR_CLT_POOL_SESS_RECONNECTING || + pool_sess->maintenance_mode) + continue; + + msg.cmd_type = RMR_CMD_MAP_READY; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto err_dis; + } + + msg.cmd_type = RMR_CMD_MAP_SEND; + msg.map_send_cmd.receiver_member_id = pool_sess->member_id; + err = rmr_clt_pool_send_cmd(pool_sess_chosen, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto err_dis; + } + + msg.cmd_type = RMR_CMD_MAP_DONE; + msg.map_done_cmd.enable = enable; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto err_dis; + } + } + + /* Unfreeze IOs and wake up */ + if (skip_normal) + rmr_clt_pool_io_unfreeze(clt_pool); + + return 0; + +err_dis: + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (pool_sess == pool_sess_chosen) + continue; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL) { + if (skip_normal) + continue; + pr_err("%s: pool %s unexpected NORMAL session %s during spread\n", + __func__, pool->poolname, pool_sess->sessname); + } + + msg.cmd_type = RMR_CMD_MAP_DISABLE; + rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + } + +err_out: + /* Unfreeze IOs and wake up */ + if (skip_normal) + rmr_clt_pool_io_unfreeze(clt_pool); + + return err; +} + +/** + * rmr_clt_set_pool_sess_mm() - Set the rmr clt pool session to maintenance mode + * + * @pool_sess: The rmr clt pool session to set in maintenance mode + * + * Description: + * This function does the necessary work required, like setting the pool session to + * maintenance mode and updating the state. + * It then also communicates this state change to the corresponding storage node. + * + * Return: + * 0 on success + * Error value on failure + */ +int rmr_clt_set_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_pool *pool = pool_sess->pool; + int err; + + pr_info("%s: Putting sess %s of pool %s in maintenance mode\n", + __func__, pool_sess->sessname, pool->poolname); + + if (pool_sess->maintenance_mode) + goto send_message; + + /* + * If the pool_sess is to be put in maintenance mode, + * update relevant states and params, Then send message to storage node. + * + * We do not need any kind of locking for this, because of the way IO units (IU) are + * allocated & sent. The mm mode update & the state change can happen at multiple places. 
+ * + * 1) If the state changes before the pool_sess is picked up into the IU, then we are safe + * 2) If the state changes after the pool_sess is picked up into the IU, but before, + * rmr_clt_request, it will be failed in rmr_clt_request. + * 3) If the state changes after rmr_clt_request, the IO would be sent to the storage node + * for that pool_sess. Then we have 2 cases, + * a) The message for maintenance_mode is received by the storage node before the IO, + * then the storage node will fail the IO. Failure would then be handled by the client. + * b) The message for maintenance_mode is received by the storage node after the IO, + * then the storage node will process the IO, and return success to client. In this case + * also we are fine, since the IO got processes successfully. + */ + pool->map_ver++; + pool_sess->maintenance_mode = true; + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING); + +send_message: + err = send_msg_enable_pool(pool_sess, 0); + if (err) { + pr_err("%s: send_msg_enable_pool failed for pool %s. Err %d\n", + __func__, pool->poolname, err); + } + + return err; +} + +/** + * rmr_clt_unset_pool_sess_mm() - Clear the rmr clt pool sessions maintenance mode + * + * @pool_sess: The rmr clt pool session to clear maintenance mode of + * + * Description: + * This function clears the maintenance mode of the given rmr clt pool session. + * It also does the map_update which essentially brings the pool_session and its + * corresponding storage node to NORMAL state. + * + * Return: + * 0 on success + * Error value on failure + */ +int rmr_clt_unset_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_pool *pool = pool_sess->pool; + int err; + + pr_info("%s: Putting to sess %s of pool %s out of maintenance mode\n", + __func__, pool_sess->sessname, pool->poolname); + + /* + * Cannot be in NORMAL and CREATED states while in maintenance mode. + */ + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL); + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED); + + /* + * If this pool_sess is getting removed, we fail unset maintenance mode + */ + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_REMOVING) + return -EINVAL; + + /* + * First unset mm of storage node + */ + err = send_msg_enable_pool(pool_sess, 1); + if (err) { + pr_err("Failed to send enable to pool %s. Err %d\n", + pool->poolname, err); + return -EINVAL; + } + + /* Now do this */ + pool_sess->maintenance_mode = false; + + /* + * For FAILED states, further action would happen when it goes to RECONNECTING state + */ + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED) + return 0; + + /* + * Since we are in RECONNECTING state, we do map update here. 
+ */ + err = rmr_clt_pool_try_enable(pool); + if (err) { + pr_err("%s: pool %s try_enable failed for sess %s: %d\n", + __func__, pool->poolname, pool_sess->sessname, err); + return err; + } + + return 0; +} + +void msg_pool_cmd_map_content_conf(struct work_struct *work) +{ + struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work); + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + + pr_debug("%s: session %s conf with errno %d\n", + __func__, pool_sess->sessname, sess_iu->errno); + + wake_up_iu_comp(sess_iu); + rmr_msg_put_iu(pool_sess, sess_iu); +} + +static void send_map_update_done(struct work_struct *work) +{ + struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work); + struct rmr_iu *iu = sess_iu->rmr_iu; + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + int errno = sess_iu->errno; + + pr_debug("%s: Session %s, err %d, iu %p\n", + __func__, pool_sess->sessname, errno, iu); + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED); + + /* + * We leave "iu->errno" set from the IO failure. + * Even though one map_add succeeds, we clear `iu->errno` + * and the main IO succeeds. And all other map_adds + * simply trigger session state change to FAILURE. + */ + if (!errno) { + iu->errno = 0; + } else { + pr_err_ratelimited("%s: for sess %s got errno: %d\n", + __func__, pool_sess->sessname, errno); + + if (iu->errno) + /* only the last error is reported */ + iu->errno = errno; + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED); + } + + pr_debug("%s: Before dec and test iu %p refcnt=%d\n", + __func__, iu, refcount_read(&iu->refcount)); + + if (refcount_dec_and_test(&iu->refcount)) { + rmr_conf_fn *conf = iu->conf; + + pr_debug("all maps updated, call conf %p withh errno %d\n", + conf, errno); + (*conf)(iu->priv, iu->errno); + } +} + +/** + * rmr_clt_send_map_update() - Send map update to all connected storage nodes + * + * @pool: The client pool of whose sessions the update is to be sent + * @iu: The IO unit containing the information for the update + * + * Description: + * Send map update, using the underlying RTRS <-> RDMA + * Currently we use the same rmr_iu as IO, since it saves us time. + * When an IO fails, and a MAP_ADD is to be sent, the code reuses the + * same rmr_iu used for IO. This way we do not spend time acquiring + * and initializing another rmr_iu. + * + * A map update currently can either be a MAP_ADD or a MAP_CLEAR. + * The caller must make sure the basic and required information for both + * the above commands is updated in the rmr_iu. + * Basic being the pool group_id, msg hdr type, etc. + * Required being the following, + * MAP_ADD requires the rmr_id_t chunk numbers, failed_id array and failed_cnt + * MAP_CLEAR requires the rmr_id_t and the member_id + * + * Return: + * 0 on success. This means the map_update was sent successfully. + * The subsequent status (err or not) goes to iu->conf call, + * so the caller should check that too. + * + * Error value on failure. When this function returns error, + * be aware that the iu->conf will not be called. + */ +int rmr_clt_send_map_update(struct rmr_pool *pool, struct rmr_iu *iu) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + struct rtrs_clt_req_ops req_ops; + struct kvec vec; + int err; + + pr_debug("%s: rmr_id (%llu, %llu), msg %d, refcnt=%d\n", __func__, + iu->msg.id_a, iu->msg.id_b, iu->msg.hdr.type, refcount_read(&iu->refcount)); + + if (!pool) { + pr_err("Cannot send map update. 
pool is NULL\n"); + return -EINVAL; + } + + rmr_get_iu(iu); + + vec = (struct kvec){ + .iov_base = &iu->msg, + .iov_len = sizeof(iu->msg) + }; + + list_for_each_entry_safe(sess_iu, tmp_sess_iu, &(iu->sess_list), entry) { + struct rmr_clt_sess *clt_sess; + enum rmr_clt_pool_sess_state state; + + pool_sess = sess_iu->pool_sess; + clt_sess = pool_sess->clt_sess; + + INIT_WORK(&sess_iu->work, send_map_update_done); + + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_conf, + }; + + state = atomic_read(&pool_sess->state); + if (state == RMR_CLT_POOL_SESS_FAILED || + state == RMR_CLT_POOL_SESS_REMOVING) { + /* + * Sessions in failed state is probably the reason why we sending + * map add in the first place. + * We can skip those sessions, since map update will take care of this. + */ + pr_debug("%s: skipped sess %s\n", __func__, sess_iu->pool_sess->sessname); + sess_iu->errno = -EINVAL; + schedule_work(&sess_iu->work); + continue; + } + + pr_debug("Sending request flags %u to pool %s session %s " + "chunk [%llu, %llu] offset %u length %u)\n", + iu->msg.flags, pool->poolname, pool_sess->sessname, + iu->msg.id_a, iu->msg.id_b, + iu->msg.offset, iu->msg.length); + + trace_send_map_update(WRITE, sess_iu); + + err = rtrs_clt_request(WRITE, &req_ops, clt_sess->rtrs, + sess_iu->permit, &vec, 1, 0, NULL, 0); + + /* we can ignore errno since we called rmr_clt_send_map_update with NO_WAIT */ + if (err) { + sess_iu->errno = err; + + pr_err("%s: Failed with err %d, schedule work\n", + __func__, err); + schedule_work(&sess_iu->work); + } + } + rmr_put_iu(iu); + + /* + * We are handling err through iu->conf + */ + return 0; +} +EXPORT_SYMBOL(rmr_clt_send_map_update); + +int rmr_clt_map_add_id(struct rmr_pool *pool, int stg_id, rmr_id_t id) +{ + struct rmr_dirty_id_map *map; + + map = rmr_pool_find_map(pool, stg_id); + if (!map) { + pr_err("in pool %s cannot find map for member_id %u\n", + pool->poolname, stg_id); + return -EINVAL; + } + + map->ts = jiffies; + rmr_map_set_dirty(map, id, 0); + + pr_debug("pool %s id (%llu, %llu) inserted to the dirty map\n", + pool->poolname, id.a, id.b); + + return 0; +} + +void sched_map_add(struct work_struct *work) +{ + struct rmr_iu *iu = container_of(work, struct rmr_iu, work); + struct rmr_pool *pool = iu->pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu; + rmr_conf_fn *clt_conf = iu->conf; + void *clt_priv = iu->priv; + int failed_cnt = 0, err = 0; + rmr_id_t id; + + pr_debug("scheduled work process for rmr iu %p send map add id (%llu, %llu), poolname %s\n", + iu, iu->msg.id_a, iu->msg.id_b, pool->poolname); + + /* + * For MAP_ADD, we need failed_id, failed_cnt, and rmr_id_t for chunk number. + * + * We reuse the iu which was used for this IO. + * It already has the chunk number, the clt_conf function to be called, + * and other important things. 
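+	 *
+	 * Concretely, the loop below records the member_id of every session
+	 * whose sess_iu->errno is set into msg.failed_id[] (bumping failed_cnt),
+	 * marks the affected chunks dirty locally via rmr_clt_map_add_id(), and
+	 * then hands the reused iu to rmr_clt_send_map_update().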
+ */ + iu->msg.hdr.type = cpu_to_le16(RMR_MSG_MAP_ADD); + + id.a = le64_to_cpu(iu->msg.id_a); + id.b = le64_to_cpu(iu->msg.id_b); + list_for_each_entry(sess_iu, &(iu->sess_list), entry) { + pool_sess = sess_iu->pool_sess; + + if (sess_iu->errno) { + iu->msg.map_ver = cpu_to_le64(pool->map_ver); + iu->msg.failed_id[failed_cnt] = pool_sess->member_id; + failed_cnt++; + + rmr_clt_map_add_id(pool, pool_sess->member_id, id); + } + } + iu->msg.failed_cnt = failed_cnt; + + err = rmr_clt_send_map_update(pool, iu); + if (err) { + pr_err("error sending map add for id (%llu, %llu), err=%d\n", + iu->msg.id_a, iu->msg.id_b, err); + (*clt_conf)(clt_priv, err); + } +} + +/** + * rmr_clt_send_map() - Send dirty map entries + * + * @map_src_pool: Pool whose map is to be sent + * @clt_pool: Client pool through which the dest session is selected + * @map_send_cmd: Command structure containing the member_id of the target session + * where the map is to be sent. If NULL then send to all of the session + * + * Return: + * 0 on success, err code otherwise. + * + * Description: + * Sends all the dirty entries from the map in "map_src_pool" to the session with + * member_id equal to member_id mentioned in the map_send_cmd. + * The session where to send the map is picked from the clt_pool. If + * map_send_cmd is NULL then send cmd to all of the sessions in clt_pool. + * + * Context: + * This function blocks while sending the map. + */ +int rmr_clt_send_map(struct rmr_pool *map_src_pool, struct rmr_pool *clt_pool, + const struct rmr_msg_map_send_cmd *map_send_cmd, rmr_map_filter filter) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + bool sess_found = false; + void *bitmap_buf; + int err = 0, idx; + + if (!clt_pool) { + pr_err("Cannot send map, when clt_pool is NULL\n"); + return -EINVAL; + } + + bitmap_buf = kzalloc(RTRS_IO_LIMIT, GFP_KERNEL); + if (!bitmap_buf) { + pr_err("%s: pool %s error allocating buffer to send map\n", + __func__, map_src_pool->poolname); + return -ENOMEM; + } + + idx = srcu_read_lock(&clt_pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &clt_pool->sess_list, entry, + (srcu_read_lock_held(&clt_pool->sess_list_srcu))) { + int bytes = 0; + u8 map_idx = 0; + u64 slp_idx = 0; + + /* if we have a command then skip all the sessions that are not in command */ + if (map_send_cmd && pool_sess->member_id != map_send_cmd->receiver_member_id) + continue; + + sess_found = true; + pr_info("Start sending dirty map for pool %s; to session %s with member_id %d\n", + map_src_pool->poolname, pool_sess->sessname, pool_sess->member_id); + + while ((bytes = rmr_pool_maps_to_buf(map_src_pool, &map_idx, &slp_idx, + bitmap_buf, RTRS_IO_LIMIT, filter)) > 0) { + pr_debug("mapped %d bytes to bitmap_buf\n", bytes); + + err = rmr_clt_pool_map_xfer(clt_pool, pool_sess, RMR_CMD_SEND_MAP_BUF, + bitmap_buf, bytes, 0, 0); + if (err) { + pr_err("%s: Failed to send bitmap_buf, from %s to %s err %d\n", + __func__, map_src_pool->poolname, clt_pool->poolname, err); + goto err_free; + } + } + + rmr_clt_init_cmd(map_src_pool, &msg); + msg.cmd_type = RMR_CMD_MAP_BUF_DONE; + msg.map_buf_done_cmd.map_version = map_src_pool->map_ver; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For pool %s, %s failed\n", + __func__, map_src_pool->poolname, rmr_get_cmd_name(msg.cmd_type)); + goto err_free; + } + } + + if (map_send_cmd && !sess_found) { + pr_err("pool %s failed to find sess with member_id %u to send map\n", + clt_pool->poolname, 
map_send_cmd->receiver_member_id); + err = -EINVAL; + goto err_free; + } + + pr_info("%s: Sending map done\n", __func__); + +err_free: + kfree(bitmap_buf); + srcu_read_unlock(&clt_pool->sess_list_srcu, idx); + + return err; +} +EXPORT_SYMBOL(rmr_clt_send_map); + +int rmr_clt_test_map(struct rmr_pool *src_pool, struct rmr_pool *dst_pool) +{ + struct rmr_clt_pool_sess *pool_sess; + void *bitmap_buf; + int err, idx; + + pr_info("test maps from src_pool=%s to dst_pool=%s...\n", + src_pool->poolname, dst_pool->poolname); + + bitmap_buf = kzalloc(RTRS_IO_LIMIT, GFP_KERNEL); + if (!bitmap_buf) { + pr_err("%s: Error allocating buffer\n", __func__); + err = -ENOMEM; + goto err; + } + + idx = srcu_read_lock(&dst_pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &dst_pool->sess_list, entry, + (srcu_read_lock_held(&dst_pool->sess_list_srcu))) { + enum rmr_clt_pool_sess_state state; + int bytes = 0; + u8 map_idx = 0; + u64 slp_idx = 0; + + state = atomic_read(&pool_sess->state); + if (state == RMR_CLT_POOL_SESS_CREATED || + state == RMR_CLT_POOL_SESS_FAILED) { + pr_warn("sess %s is in created/failed state, skip map test.\n", + pool_sess->sessname); + continue; + } + pr_info("perform map test for sess %s\n", pool_sess->sessname); + while ((bytes = rmr_pool_maps_to_buf(src_pool, &map_idx, &slp_idx, + bitmap_buf, RTRS_IO_LIMIT, + MAP_NO_FILTER)) > 0) { + pr_debug("mapped %d bytes to bitmap_buf\n", bytes); + + err = rmr_clt_pool_map_xfer(dst_pool, pool_sess, RMR_CMD_MAP_TEST, + bitmap_buf, bytes, 0, 0); + if (err) { + pr_err("%s: For sess %s failed test map, src_pool %s dst_pool %s err %d\n", + __func__, pool_sess->sessname, src_pool->poolname, + dst_pool->poolname, err); + srcu_read_unlock(&dst_pool->sess_list_srcu, idx); + goto err_free; + } + } + pr_info("sess %s map test done\n", pool_sess->sessname); + } + srcu_read_unlock(&dst_pool->sess_list_srcu, idx); + +err_free: + kfree(bitmap_buf); +err: + pr_info("test maps from src_pool=%s to dst_pool=%s done, err %d\n", + src_pool->poolname, dst_pool->poolname, err); + + return err; +} +EXPORT_SYMBOL(rmr_clt_test_map); diff --git a/drivers/infiniband/ulp/rmr/rmr-map.c b/drivers/infiniband/ulp/rmr/rmr-map.c new file mode 100644 index 000000000000..f4b7dd7c3b50 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-map.c @@ -0,0 +1,904 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#include + +#include "rmr-map.h" +#include "rmr-pool.h" + +void rmr_map_update_page_params(struct rmr_dirty_id_map *map) +{ + unsigned long remaining_chunks; + + map->no_of_flp = (map->no_of_chunks >> CHUNKS_PER_FLP_LOG2); + + /* + * If the number of chunks are not completely filling an FLP (CHUNKS_PER_FLP), + * then the remaining would be tracked by the next FLP. Thus the next FLP would + * have unused SLP pointers. We will calculate the number of SLP slots which will + * be used in the last FLP. + */ + remaining_chunks = map->no_of_chunks & (CHUNKS_PER_FLP - 1); + if (!remaining_chunks) { + /* + * If there are no remaining chunks, then the last FLP is completely full. + */ + map->no_of_slp_in_last_flp = NO_OF_SLP_PER_FLP; + map->no_of_chunk_in_last_slp = NO_OF_CHUNKS_PER_PAGE; + } else { + /* + * If there are remaining chunks, then we add another FLP for it. + * This FLP will not be full, hence we calculate the number of SLP slots + * that will be used. + */ + map->no_of_flp += 1; + map->no_of_slp_in_last_flp = (remaining_chunks >> CHUNKS_PER_SLP_LOG2); + + /* + * Same as above. 
It could be that the number of chunks do not fit neatly + * in the last SLP (CHUNKS_PER_SLP), and the remaining ones end up in the + * SLP with remaining chunk slots. + */ + remaining_chunks &= (CHUNKS_PER_SLP - 1); + if (!remaining_chunks) { + /* + * If there are no remaining chunks, then the last SLP is completely full. + */ + map->no_of_chunk_in_last_slp = CHUNKS_PER_SLP; + } else { + /* + * If there are remaining chunks, then we add another SLP. + */ + map->no_of_slp_in_last_flp += 1; + map->no_of_chunk_in_last_slp = remaining_chunks; + } + } + + map->total_slp = ((map->no_of_flp - 1) * NO_OF_SLP_PER_FLP) + map->no_of_slp_in_last_flp; +} + +static void rmr_map_update_map_params(struct rmr_pool *pool, struct rmr_dirty_id_map *map) +{ + map->no_of_chunks = pool->no_of_chunks; + + rmr_map_update_page_params(map); + + pr_info("%s: Chunks info %u, %u, %u, %llu\n", + __func__, pool->chunk_size, ilog2(pool->chunk_size), + pool->chunk_size_shift, map->no_of_chunks); + pr_info("%s: FLPs %llu, SLPs in last FLP %llu, Total SLPs %llu, chunks in last SLP %llu\n", + __func__, map->no_of_flp, map->no_of_slp_in_last_flp, map->total_slp, + map->no_of_chunk_in_last_slp); + pr_info("%s: Dirty map size %lldB\n", __func__, (map->total_slp * PAGE_SIZE)); +} + +static int rmr_map_allocate_pages(struct rmr_pool *pool, struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + u64 no_of_slps; + int i, j; + + for (i = 0; i < map->no_of_flp;) { + map->dirty_bitmap[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!map->dirty_bitmap[i]) + goto err_alloc; + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + /* + * Move the increment to here, so that later in err_alloc: if we have to free, + * the index i, is pointing in the correct position. + */ + i++; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + *flp_ptr = get_zeroed_page(GFP_KERNEL); + if (!*flp_ptr) + goto err_alloc; + } + } + + // TODO remove this + map->bitmap_filter = kcalloc(pool->no_of_chunks, sizeof(*map->bitmap_filter), GFP_KERNEL); + if (!map->bitmap_filter) + goto err_alloc; + + return 0; + +err_alloc: + for (--i; i >= 0; i--) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + for (--j; j >= 0; j--) + free_page((unsigned long)*(flp_ptr + j)); + + j = NO_OF_SLP_PER_FLP; + free_page((unsigned long)map->dirty_bitmap[i]); + } + + return -ENOMEM; +} + +struct rmr_dirty_id_map *rmr_map_create(struct rmr_pool *pool, u8 member_id) +{ + struct rmr_dirty_id_map *map = NULL; + int ret; + + pr_info("%s: Creating map for member_id %u, in pool %s. 
Existing map_cnt %u\n", + __func__, member_id, pool->poolname, pool->maps_cnt); + + if (!pool->no_of_chunks) { + pr_err("%s: dirty map size cannot be zero\n", __func__); + return ERR_PTR(-EINVAL); + } + + mutex_lock(&pool->maps_lock); + + /* + * Don't create if already exists + */ + map = rmr_pool_find_map(pool, member_id); + if (map != NULL) { + pr_err("Map with member_id %u already exists\n", member_id); + ret = -EEXIST; + goto err_unlock; + } + + if (pool->maps_cnt >= RMR_POOL_MAX_SESS) { + pr_err("pool %s can not create new map, max number of sessions %d achieved\n", + pool->poolname, RMR_POOL_MAX_SESS); + ret = -EINVAL; + goto err_unlock; + } + + /* + * Allocate memory and init the structure + */ + map = (struct rmr_dirty_id_map *)get_zeroed_page(GFP_KERNEL); + if (!map) { + pr_err("cannot allocate map for member_id %u\n", member_id); + ret = -ENOMEM; + goto err_unlock; + } + rmr_map_update_map_params(pool, map); + + ret = rmr_map_allocate_pages(pool, map); + if (ret) { + pr_err("cannot allocate memory for member_id %u\n", member_id); + goto err_map; + } + + xa_init_flags(&map->rmr_id_map, XA_FLAGS_ALLOC); + map->member_id = member_id; + map->ts = jiffies; + + rmr_pool_maps_append(pool, map); + + mutex_unlock(&pool->maps_lock); + + return map; + +err_map: + free_page((unsigned long)map); +err_unlock: + mutex_unlock(&pool->maps_lock); + return ERR_PTR(ret); +} + +void rmr_map_destroy(struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + int i, j; + u64 no_of_slps; + + WARN_ON(!xa_empty(&map->rmr_id_map)); + map->ts = jiffies; + + pr_info("%s: member_id %u\n", __func__, map->member_id); + kfree(map->bitmap_filter); + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++) + free_page((unsigned long)*(flp_ptr + j)); + + free_page((unsigned long)map->dirty_bitmap[i]); + } + + free_page((unsigned long)map); +} + +/** + * rmr_map_calc_chunk - Calculate chunk number from offset and length of IO + * + * @pool: The pool + * @offset: Offset of the IO + * @length: Length of the IO + * @id: rmr_id_t where to populate the chunk details + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * + * For example: + * if id.a is 1, only id.b is dirty. + * if id.a is 2, id.b and (id.b+1) is dirty + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +void rmr_map_calc_chunk(struct rmr_pool *pool, size_t offset, size_t length, rmr_id_t *id) +{ + u64 off_len = offset + length; + + id->b = GET_CHUNK_NUMBER(offset, pool->chunk_size_shift); + id->a = GET_FOLLOWING_CHUNKS(off_len, pool->chunk_size_shift, id->b); +} + +/** + * rmr_get_chunk_md_from_id - Get the chunk metadata byte from rmr_id_t + * + * @map: The map to work on + * @id: rmr_id_t to use to get the chunk metadata byte + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline u8 *rmr_get_chunk_md_from_id(struct rmr_dirty_id_map *map, rmr_id_t id) +{ + unsigned long idb_slp, idb_slp_index, idb_chunk; + el_flp *flp_ptr; + u8 *slp, *chunk_md; + + /* + * First get the pointer to first level page (FLP). + * To get that, we need to find which first level page the chunk belongs, and it can + * be found by dividing the chunk number by the maximum number of chunks 1 FLP can track. + * + * After that we need to adjust the id.b to go one level down. 
This is because we just + * moved to the desired FLP, and hence that portion of id.b can be dropped. + * For this we do the modulo with CHUNKS_PER_FLP. + */ + flp_ptr = (el_flp *)(map->dirty_bitmap[id.b >> CHUNKS_PER_FLP_LOG2]); + idb_slp = id.b & (CHUNKS_PER_FLP - 1); + + /* + * Now we need to move to the second level page (SLP). + * The addresses to SLPs are stored in the FLP as a list of addresses. Hence we calculate + * the desired slp index which has the address to the SLP our chunk md resides in. + * + * We then adjust our flp_ptr according to the index. + * Note that flp_ptr is of type el_flp (flp element), which is unsigned long, since + * addresses are of that data type. This lets us move to the slp index easily. + */ + idb_slp_index = idb_slp >> CHUNKS_PER_SLP_LOG2; + flp_ptr += idb_slp_index; + + /* + * The location pointed by flp_ptr is storing the address to the SLP we want to move to. + * So we dereference it first, and then cast it to relevant pointer (to the chunk metadata + * data type, which is u8). + * + * The last step it to move to the correct chunk metadata in the SLP. + * + * Each SLP can store metadata for CHUNKS_PER_SLP chunks. So we adjust the idb_slp + * accordingly. And then move our slp pointer to the correct chunk metadata byte. + */ + slp = (u8 *)(*flp_ptr); + idb_chunk = idb_slp & (CHUNKS_PER_SLP - 1); + chunk_md = slp + idb_chunk; + + return chunk_md; +} + +static bool rmr_chunk_md_check_dirty(u8 *chunk_md) +{ + return (*chunk_md) & (0x1 << CHUNK_DIRTY_BIT); +} + +static void rmr_chunk_md_set_dirty(u8 *chunk_md) +{ + *chunk_md |= (0x1 << CHUNK_DIRTY_BIT); +} + +static void rmr_chunk_md_unset_dirty(u8 *chunk_md) +{ + *chunk_md &= ~(0x1 << CHUNK_DIRTY_BIT); +} + +/** + * rmr_map_set_dirty - Set bits from rmr_id_t + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * @filter: Filter to add to entry + * + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline void rmr_map_set_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, u8 filter) +{ + u8 *chunk_md; + u64 i; + + map->ts = jiffies; + + chunk_md = rmr_get_chunk_md_from_id(map, id); + for (i = 0; i < id.a; i++) { + rmr_chunk_md_set_dirty(chunk_md); + chunk_md++; + } +} + +inline void rmr_map_set_dirty_all(struct rmr_dirty_id_map *map, u8 filter) +{ + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks; + bool is_last_flp; + u8 *slp; + int i, j, k; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + is_last_flp = (i == (map->no_of_flp - 1)); + + if (is_last_flp) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (u8 *)(*flp_ptr); + + if (is_last_flp && j == (no_of_slps - 1)) + no_of_chunks = map->no_of_chunk_in_last_slp; + else + no_of_chunks = NO_OF_CHUNKS_PER_PAGE; + + for (k = 0; k < no_of_chunks; k++, slp++) + rmr_chunk_md_set_dirty(slp); + } + } +} + +/** + * rmr_map_unset_dirty - Clear bits from rmr_id_t, and free entry if any + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * @filter: Filter to add to entry + * + * Description: + * This version can be used by both client and server. + * If entry is found, the function frees it. 
+ * Clears the bit using info from the given rmr_id_t + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline struct rmr_map_entry *rmr_map_unset_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, + u8 filter) +{ + struct rmr_map_entry *entry; + u8 *chunk_md; + u64 i; + + map->ts = jiffies; + + chunk_md = rmr_get_chunk_md_from_id(map, id); + BUG_ON(!chunk_md); + for (i = 0; i < id.a; i++) { + rmr_chunk_md_unset_dirty(chunk_md); + chunk_md++; + } + + entry = xa_erase(&map->rmr_id_map, rmr_id_to_key(id)); + if (!entry) { + pr_debug("in the member_id %d there is no entry for id [%llu, %llu]\n", + map->member_id, id.a, id.b); + } + + return entry; +} + +/* + * rmr_map_check_dirty - Check if the following bits are set or not + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline bool rmr_map_check_dirty(struct rmr_dirty_id_map *map, rmr_id_t id) +{ + u8 *chunk_md; + + chunk_md = rmr_get_chunk_md_from_id(map, id); + return rmr_chunk_md_check_dirty(chunk_md); +} + +/** + * rmr_map_get_dirty_entry - Check and return entry if the following bits are set + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * + * Description: + * Check if a chunk is dirty or not. + * If the particular chunk is dirty, then create an entry for it and return back. + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline struct rmr_map_entry *rmr_map_get_dirty_entry(struct rmr_dirty_id_map *map, rmr_id_t id) +{ + struct rmr_map_entry *entry; + int err; + + if (rmr_map_check_dirty(map, id)) { + entry = xa_load(&map->rmr_id_map, rmr_id_to_key(id)); + if (entry) { + pr_debug("%s: For id [%llu, %llu], entry exists member_id %u\n", + __func__, id.a, id.b, map->member_id); + return entry; + } + + entry = kmem_cache_zalloc(rmr_map_entry_cachep, GFP_KERNEL); + if (!entry) { + pr_err("%s: Cannot allocate entry for member_id %d, id [[%llu, %llu]]\n", + __func__, map->member_id, id.a, id.b); + return ERR_PTR(-ENOMEM); + } + + atomic_set(&entry->sync_cnt, -1); + init_llist_head(&entry->wait_list); + + err = xa_insert(&map->rmr_id_map, rmr_id_to_key(id), entry, GFP_KERNEL); + if (err == 0) + return entry; + + kmem_cache_free(rmr_map_entry_cachep, entry); + + if (err == -EBUSY) + return xa_load(&map->rmr_id_map, rmr_id_to_key(id)); + else + return ERR_PTR(-ENOMEM); + } + + return NULL; +} + +/** + * rmr_map_clear_filter_all - Clear filter for entire bitmap + * + * @map: Map to work on + * @filter: Filter to be cleared + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline void rmr_map_clear_filter_all(struct rmr_dirty_id_map *map, u8 filter) +{ + u64 i; + + for (i = 0; i < map->no_of_chunks; i++) + map->bitmap_filter[i] &= ~filter; +} + +/** + * rmr_map_unset_dirty_all - Clear all chunk bits (the entire map) + * + * @map: Map to work on + * + * Context: + * srcu pool->map_srcu should be held while calling this function. 
+ */ +inline void rmr_map_unset_dirty_all(struct rmr_dirty_id_map *map) +{ + rmr_id_t id; + u64 i; + + /* + * TODO: memcpy zeroes or something faster + */ + + id.a = 1; + for (i = 0; i < map->no_of_chunks; i++) { + id.b = i; + + if (!rmr_map_check_dirty(map, id)) + continue; + + rmr_map_unset_dirty(map, id, MAP_NO_FILTER); + } + + rmr_map_clear_filter_all(map, MAP_ENTRY_UNSYNCED); +} + +/** + * rmr_map_empty - Check if there are any chunks dirty + * + * @map: Map to work on + * + * Return: + * True: If map is empty + * False: Otherwise + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline bool rmr_map_empty(struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks; + bool is_last_flp; + u8 *slp; + int i, j, k; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + is_last_flp = (i == (map->no_of_flp - 1)); + + if (is_last_flp) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (u8 *)(*flp_ptr); + + if (is_last_flp && j == (no_of_slps - 1)) + no_of_chunks = map->no_of_chunk_in_last_slp; + else + no_of_chunks = NO_OF_CHUNKS_PER_PAGE; + + for (k = 0; k < no_of_chunks; k++, slp++) { + if (rmr_chunk_md_check_dirty(slp)) + return false; + } + } + } + + return true; +} + +inline void rmr_map_bitwise_or_buf(void *dst_buf, void *src_buf, u32 buf_size) +{ + u8 *src_byte, *dst_byte; + + src_byte = src_buf; + dst_byte = dst_buf; + + while (buf_size--) + *(dst_byte + buf_size) |= *(src_byte + buf_size); +} + +inline int rmr_map_create_entries(struct rmr_dirty_id_map *map) +{ + struct rmr_map_entry *entry; + rmr_id_t id; + int err; + u64 i; + + id.a = 1; + for (i = 0; i < map->no_of_chunks; i++) { + id.b = i; + + if (!rmr_map_check_dirty(map, id)) + continue; + + if (xa_load(&map->rmr_id_map, rmr_id_to_key(id))) + continue; + + entry = kmem_cache_zalloc(rmr_map_entry_cachep, GFP_KERNEL); + if (!entry) { + pr_err("%s: Cannot allocate entry for member_id %d, chunk %llu\n", + __func__, map->member_id, i); + return -ENOMEM; + } + + atomic_set(&entry->sync_cnt, -1); + init_llist_head(&entry->wait_list); + + pr_debug("%s: Adding entry %p for chunk %llu\n", + __func__, entry, i); + + err = xa_insert(&map->rmr_id_map, rmr_id_to_key(id), entry, GFP_KERNEL); + if (err) { + pr_err("%s: Cannot insert entry for member_id %d, chunk %llu\n", + __func__, map->member_id, i); + return err; + } + } + + return 0; +} + +/** + * rmr_map_slps_to_buf - Copy SLPs to given buf + * + * @map: Map to work on + * @slp_idx: SLP number to start copying from + * @no_of_slp: Number of SLPs to copy + * @buf: Buffer to copy SLPs to + * + * Context: + * srcu pool->map_srcu should be held while calling this function. 
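+ *
+ * Example (assuming 4K pages and 8-byte pointers, so NO_OF_SLP_PER_FLP is
+ * 512): slp_idx = 1000 selects flp_no = 1000 >> 9 = 1 and slp_no =
+ * 1000 & 511 = 488; after 24 copied SLPs slp_no wraps and copying continues
+ * from the first SLP of FLP 2.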
+ */ +void rmr_map_slps_to_buf(struct rmr_dirty_id_map *map, u64 slp_idx, u64 no_of_slp, u8 *buf) +{ + el_flp *flp_ptr; + u64 slp_no, flp_no, i = 0; + void *slp; + + flp_no = slp_idx >> NO_OF_SLP_PER_FLP_LOG2; + slp_no = slp_idx & (NO_OF_SLP_PER_FLP - 1); + + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + while (i < no_of_slp) { + slp = (void *)(*(flp_ptr + slp_no)); + + memcpy(buf, slp, PAGE_SIZE); + buf += PAGE_SIZE; + + slp_no++; + if (slp_no >= NO_OF_SLP_PER_FLP) { + flp_no += 1; + slp_no = 0; + + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + } + + i++; + } + + return; +} + +/** + * rmr_map_buf_to_slps - Copy data from buf to SLPs + * + * @map: Map to work on + * @buf: Buffer from which to copy data + * @buf_size: Buffer size + * @slp_idx: SLP number to start copying to + * @test: Whether to compare data or copy + * + * Return: + * Number of SLPs to which data was copied. + * 0 in case of failure. + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +u64 rmr_map_buf_to_slps(struct rmr_dirty_id_map *map, u8 *buf, u32 buf_size, u64 slp_idx, + bool test) +{ + el_flp *flp_ptr; + u64 slp_no, flp_no, i = 0; + u64 no_of_slp; + void *slp; + + /* + * The buf_size should be a factor of PAGE_SIZE + */ + if (buf_size % PAGE_SIZE) { + pr_info("%s: Failed %u\n", __func__, buf_size); + return 0; + } + + no_of_slp = buf_size >> PAGE_SHIFT; + + flp_no = slp_idx >> NO_OF_SLP_PER_FLP_LOG2; + slp_no = slp_idx & (NO_OF_SLP_PER_FLP - 1); + + pr_info("%s: no_of_slp=%llu, flp_no=%llu, slp_no=%llu, slp_idx=%llu\n", + __func__, no_of_slp, flp_no, slp_no, slp_idx); + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + while (i < no_of_slp) { + slp = (void *)(*(flp_ptr + slp_no)); + + if (test && memcmp(slp, buf, PAGE_SIZE)) { + pr_info("%s: Compare failed\n", __func__); + return 0; + } else if (!test) { + memcpy(slp, buf, PAGE_SIZE); + } + buf += PAGE_SIZE; + + slp_no++; + if (slp_no >= NO_OF_SLP_PER_FLP) { + flp_no += 1; + slp_no = 0; + + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + } + + i++; + } + + return no_of_slp; +} + +void rmr_map_hexdump_bitmap_buf(u8 member_id, void *buf, u32 buf_size) +{ + u8 *buf_byte; + u32 size = 0; + + buf_byte = buf; + + pr_info("%s: Starting bitmap dump for member %u in hex, size %u\n", + __func__, member_id, buf_size); + pr_info("---------------------------------------------------------\n"); + while (size < buf_size) { + pr_cont("%02X", *(buf_byte + size)); + size++; + } + + pr_info("\n"); +} + +void rmr_map_dump_bitmap(struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks; + bool is_last_flp; + u8 *slp; + int i, j; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + is_last_flp = (i == (map->no_of_flp - 1)); + + if (is_last_flp) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (u8 *)(*flp_ptr); + + if (is_last_flp && j == (no_of_slps - 1)) + no_of_chunks = map->no_of_chunk_in_last_slp; + else + no_of_chunks = NO_OF_CHUNKS_PER_PAGE; + + /* Each chunk is represented by a byte */ + rmr_map_hexdump_bitmap_buf(map->member_id, slp, no_of_chunks); + } + } +} + +/** + * rmr_map_summary_format - Format a per-member dirty-chunk summary into buf + * + * @pool: Pool whose maps to summarise + * @buf: Output buffer (must be at least @buf_size bytes) + * @buf_size: Size of @buf in bytes + * + * Description: + * Output format (one line per member that has a map): + * member : [ ...] 
/ dirty + * At most 50 dirty chunk indices are listed per member; if there + * are more, a "..." marker appears before the closing bracket. + * + * Context: caller must hold srcu pool->map_srcu. + * + * Return: number of bytes written (excluding trailing NUL). + */ +int rmr_map_summary_format(struct rmr_pool *pool, char *buf, size_t buf_size) +{ + struct rmr_dirty_id_map *map; + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks_in_slp; + u64 chunk_idx, dirty_count; + bool is_last_flp; + u8 *slp; + int printed_ids; + int pos = 0; + int i, fi, si; + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + pos += scnprintf(buf + pos, buf_size - pos, + "member %u: [", map->member_id); + + dirty_count = 0; + chunk_idx = 0; + printed_ids = 0; + for (fi = 0; fi < map->no_of_flp; fi++) { + flp_ptr = (el_flp *)map->dirty_bitmap[fi]; + is_last_flp = (fi == (map->no_of_flp - 1)); + no_of_slps = is_last_flp ? + map->no_of_slp_in_last_flp : NO_OF_SLP_PER_FLP; + + for (si = 0; si < no_of_slps; si++, flp_ptr++) { + u64 ci; + + slp = (u8 *)(*flp_ptr); + no_of_chunks_in_slp = + (is_last_flp && si == (no_of_slps - 1)) ? + map->no_of_chunk_in_last_slp : + NO_OF_CHUNKS_PER_PAGE; + + for (ci = 0; ci < no_of_chunks_in_slp; + ci++, chunk_idx++) { + if (!(slp[ci] & (1 << CHUNK_DIRTY_BIT))) + continue; + dirty_count++; + /* Cap listed IDs to fit all members in PAGE_SIZE */ + if (printed_ids < 50) { + pos += scnprintf(buf + pos, + buf_size - pos, + "%llu ", chunk_idx); + printed_ids++; + } + } + } + } + + /* Overwrite trailing space before ']' */ + if (pos > 0 && buf[pos - 1] == ' ') + pos--; + if (printed_ids < dirty_count) + pos += scnprintf(buf + pos, buf_size - pos, + "...] %llu/%llu dirty\n", + dirty_count, map->no_of_chunks); + else + pos += scnprintf(buf + pos, buf_size - pos, + "] %llu/%llu dirty\n", + dirty_count, map->no_of_chunks); + } + + return pos; +} + +void rmr_map_bidump_bitmap_buf(void *buf, u8 member_id, u32 buf_long) +{ + char box[65]; + u64 *buf_byte; + u64 the_byte; + int i, j; + u32 count = 0; + + buf_byte = buf; + + pr_info("%s: bitmap for member %d dump in binary, the size in longs %u\n", + __func__, member_id, buf_long); + while (count < buf_long) { + the_byte = *(buf_byte + count); + for (i = 63, j = 0; i >= 0; i--, j++) + box[j] = (the_byte & (1ULL << i)) ? '1' : '0'; + box[j] = '\0'; + pr_cont("[%s]", box); + count++; + } + + pr_info("\n"); + pr_info("---------------------------------------------------------\n"); +} diff --git a/drivers/infiniband/ulp/rmr/rmr-map.h b/drivers/infiniband/ulp/rmr/rmr-map.h new file mode 100644 index 000000000000..76ef6506421f --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-map.h @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_MAP_H +#define RMR_MAP_H + +#include +#include + +#include "rmr.h" + +/** + * The dirty map buffer is used to track dirty chunks through bits. + * The position of the bit denotes the chunk number it tracks. + * + * Bitmap structure + * ---------------- + * The dirty bitmap is stored in a 2 level tree-like structure. + * The main unit of storage are memory pages; They act as nodes of this structure. + * The first level pages (FLP) stores the address of the second level pages. + * There can be a total of 256 first level pages. + * The second level pages (SLP, also the leaf nodes/pages) stores the bitmap. 
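+ *
+ * Roughly (assuming 4K pages and 8-byte pointers):
+ *
+ *	dirty_bitmap[0..255]  ->  FLP: page holding up to 512 SLP addresses
+ *	                              SLP: page of 4096 one-byte chunk entries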
+ *
+ * The first level pages have to store the address of the second level pages.
+ * An address being 8B (default/max) long, the addresses of a maximum of 512 pages can
+ * be stored in a first level page. This then decides the maximum leaf pages a pool can
+ * have, which, for our example, is [(# pages of FLP) * (PAGE_SIZE / address_size)],
+ * (256*512)=131072.
+ * With the above info, the available space for the bitmap is 131072*4KB(PAGE_SIZE)=512MB.
+ *
+ * A chunk is the smallest unit of data which is tracked for being dirty. A chunk is
+ * called dirty/unsynced, even if a single byte in it is dirty/unsynced.
+ * To track a chunk, a single byte (1B) is used. The least significant bit is used to signify
+ * if the chunk is dirty (set) or not. Other bits can be used for other purposes (for example,
+ * filters). The maximum number of chunks RMR can manage is then (512MB)/1B=536870912.
+ * This number is fixed, as one can see from the calculations, and hence the maximum size of
+ * metadata RMR can allocate and use is fixed.
+ *
+ * The user configurable part is the chunk size. Its range is 128KB-1MB, and it has to be a
+ * power of 2.
+ * The chunk size decides the maximum mapped size for an RMR pool.
+ * For example, for chunk size 1MB, and taking the maximum number of chunks RMR can allocate
+ * and handle (536870912, see above), the maximum mapped size would be (536870912*1MB)=512TB.
+ * The table showing the relation between chunk size and maximum mapped size is as follows:
+ * Chunk size	Maximum mapped size
+ * 128KB	64TB
+ * 256KB	128TB
+ * 512KB	256TB
+ * 1MB		512TB
+ *
+ * Calculating chunk number
+ * ------------------------
+ * Some key points:
+ * 1) The Linux kernel has a fixed sector size of 512 bytes (a shift of 9)
+ * 2) The mapped_size provided and stored in the rmr_pool structure is in sectors.
+ * 3) The chunk_size provided and stored in the rmr_pool structure is in bytes.
+ * 4) The code calculates and stores chunk_size_shift in the rmr_pool structure to do fast
+ *    calculation.
+ * 5) The IO offset given to RMR (through the function rmr_clt_request) is in bytes.
+ * + * -- + * With the above points, lets have a sample scenario with mapped_size 1GB and chunk_size 128KB + * The numbers would then be, + * + * no_of_chunks = (mapped_size / chunk_size) + * no_of_chunks = 8192 + * + * chunk_size = 131072 + * chunk_size_shift = 17 + * + * dirty_map buffer size (in BYTES) = (no_of_chunks / bits in a byte) + * dirty_map buffer size (in BYTES) = 1024 + * + * -- + * Lets do a sample calculation of chunk_no from offset and length of an IO + * + * For offset 30801920 and length 4096 + * + * chunk_no = (offset >> chunk_size_shift) + * chunk_no = 235 + * + */ + +#define RMR_KEY_SHIFT 32 + +// Each chunk requires 1B of metadata +#define PER_CHUNK_MD 1 +#define PER_CHUNK_MD_LOG2 ilog2(PER_CHUNK_MD) + +#define GET_CHUNK_NUMBER(offset, shift) (offset >> shift) +#define GET_FOLLOWING_CHUNKS(offset_len, shift, start) (((offset_len - 1) >> shift) - start + 1) + +#define CHUNK_TO_OFFSET(chunk_no, shift) (chunk_no << shift) + +// The element type stored in FLP +typedef unsigned long el_flp; + +enum { + CHUNK_DIRTY_BIT = 0, + CHUNK_FILTER_BIT, +}; + +enum { + MAX_NO_OF_FLP = 256, + NO_OF_SLP_PER_FLP = (PAGE_SIZE >> ilog2(sizeof(void *))), + NO_OF_SLP_PER_FLP_LOG2 = ilog2(NO_OF_SLP_PER_FLP), + MAX_NO_OF_SLP = (MAX_NO_OF_FLP * NO_OF_SLP_PER_FLP), + + NO_OF_CHUNKS_PER_PAGE = (PAGE_SIZE >> PER_CHUNK_MD_LOG2), + // Chunks data is stored only in SLP + MAX_NO_OF_CHUNKS = (MAX_NO_OF_SLP * NO_OF_CHUNKS_PER_PAGE), + + CHUNKS_PER_SLP = (PAGE_SIZE >> PER_CHUNK_MD_LOG2), + CHUNKS_PER_SLP_LOG2 = ilog2(CHUNKS_PER_SLP), + CHUNKS_PER_FLP = (CHUNKS_PER_SLP * NO_OF_SLP_PER_FLP), + CHUNKS_PER_FLP_LOG2 = ilog2(CHUNKS_PER_FLP), +}; + +typedef enum { + MAP_NO_FILTER = 0, + MAP_ENTRY_UNSYNCED +} rmr_map_filter; + +enum rmr_map_state { + RMR_MAP_STATE_NO_CHECK = 0, + RMR_MAP_STATE_CHECKING, + // do we have some other useful states ? +}; + +struct rmr_dirty_id_map { + u8 member_id; + struct xarray rmr_id_map; + unsigned long ts; + atomic_t check_state; + + /* + * The usage of this is restricted to form a linked lised + * during mass deletion. Since this is in an RCU list (maps + * in rmr_pool), we cannot use this or change any data until + * the RCU period completes. So we use this next variable + * during mass deletion so we can have a list and don't have + * to wait and restart the search on every individual deletion + * of a map. Refer destroy_clt_pool(). + */ + struct rmr_dirty_id_map *next; + + u64 no_of_chunks; + u64 no_of_flp; + u64 no_of_slp_in_last_flp; + u64 no_of_chunk_in_last_slp; + u64 total_slp; + u8 *bitmap_filter; + void *dirty_bitmap[MAX_NO_OF_FLP]; +}; + +struct rmr_map_entry { + atomic_t sync_cnt; + struct llist_head wait_list; +}; + +/* + * The header of the bitmap buffer. 
+ */ +struct rmr_map_cbuf_hdr { + u64 version; + u8 member_id; + + u64 no_of_chunks; + u64 no_of_flp; + u64 no_of_slp_in_last_flp; + u64 no_of_chunk_in_last_slp; + u64 total_slp; +} __packed; + +static inline unsigned long rmr_id_to_key(rmr_id_t id) +{ + unsigned long res; + + // highest bits for id.a, the rest are for id.b; + res = ((id.a << RMR_KEY_SHIFT) | id.b); + return res; +} + +static inline u64 key_to_a(unsigned long key) +{ + return key >> RMR_KEY_SHIFT; +} + +static inline u64 key_to_b(unsigned long key) +{ + return key & ((1ULL << RMR_KEY_SHIFT) - 1); +} + +void rmr_map_update_page_params(struct rmr_dirty_id_map *map); +struct rmr_dirty_id_map *rmr_map_create(struct rmr_pool *pool, u8 member_id); +void rmr_map_destroy(struct rmr_dirty_id_map *map); +void rmr_map_calc_chunk(struct rmr_pool *pool, size_t offset, size_t length, rmr_id_t *id); +void rmr_map_set_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, u8 filter); +void rmr_map_set_dirty_all(struct rmr_dirty_id_map *map, u8 filter); +struct rmr_map_entry *rmr_map_unset_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, u8 filter); +bool rmr_map_check_dirty(struct rmr_dirty_id_map *map, rmr_id_t id); +struct rmr_map_entry *rmr_map_get_dirty_entry(struct rmr_dirty_id_map *map, rmr_id_t id); +void rmr_map_clear_filter_all(struct rmr_dirty_id_map *map, u8 filter); +void rmr_map_unset_dirty_all(struct rmr_dirty_id_map *map); +bool rmr_map_empty(struct rmr_dirty_id_map *map); + +void rmr_map_bitwise_or_buf(void *dst_buf, void *src_buf, u32 buf_size); +int rmr_map_create_entries(struct rmr_dirty_id_map *map); + +void rmr_map_hexdump_bitmap_buf(u8 member_id, void *buf, u32 buf_size); +void rmr_map_slps_to_buf(struct rmr_dirty_id_map *map, u64 slp_idx, u64 no_of_slp, u8 *buf); +u64 rmr_map_buf_to_slps(struct rmr_dirty_id_map *map, u8 *buf, u32 buf_size, u64 slp_idx, + bool test); +void rmr_map_dump_bitmap(struct rmr_dirty_id_map *map); +int rmr_map_summary_format(struct rmr_pool *pool, char *buf, size_t buf_size); +void rmr_map_bidump_bitmap_buf(void *buf, u8 member_id, u32 buf_size); + +static inline void map_entry_get_sync(struct rmr_map_entry *entry) +{ + atomic_inc(&entry->sync_cnt); + pr_debug("after get ref for entry %p, sync cnt %d\n", + entry, atomic_read(&entry->sync_cnt)); +} + +static inline int map_entry_put_sync(struct rmr_map_entry *entry) +{ + pr_debug("before dec_and_test for entry %p, sync cnt %d\n", + entry, atomic_read(&entry->sync_cnt)); + return atomic_dec_and_test(&entry->sync_cnt); +} + +static inline void rmr_maplist_destroy(struct rmr_dirty_id_map *maplist) +{ + struct rmr_dirty_id_map *mp; + + while (maplist != NULL) { + mp = maplist; + maplist = maplist->next; + rmr_map_destroy(mp); + } +} +#endif /* RMR_MAP_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-pool.c b/drivers/infiniband/ulp/rmr/rmr-pool.c new file mode 100644 index 000000000000..5e5632d9d701 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-pool.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#include +#include +#include + +#include "rmr-pool.h" + +LIST_HEAD(pool_list); +DEFINE_MUTEX(pool_mutex); /* mutex to protect pool_list */ +struct kmem_cache *rmr_map_entry_cachep; + +const char *rmr_get_cmd_name(enum rmr_msg_cmd_type cmd) +{ + switch (cmd) { + case RMR_CMD_MAP_READY: return "RMR_CMD_MAP_READY"; + case RMR_CMD_MAP_SEND: return "RMR_CMD_MAP_SEND"; + case RMR_CMD_SEND_MAP_BUF: return "RMR_CMD_SEND_MAP_BUF"; + case RMR_CMD_MAP_BUF_DONE: 
return "RMR_CMD_MAP_BUF_DONE"; + case RMR_CMD_MAP_DONE: return "RMR_CMD_MAP_DONE"; + case RMR_CMD_MAP_DISABLE: return "RMR_CMD_MAP_DISABLE"; + case RMR_CMD_READ_MAP_BUF: return "RMR_CMD_READ_MAP_BUF"; + case RMR_CMD_MAP_CHECK: return "RMR_CMD_MAP_CHECK"; + case RMR_CMD_LAST_IO_TO_MAP: return "RMR_CMD_LAST_IO_TO_MAP"; + case RMR_CMD_STORE_CHECK: return "RMR_CMD_STORE_CHECK"; + case RMR_CMD_MAP_TEST: return "RMR_CMD_MAP_TEST"; + case RMR_CMD_SEND_MD_BUF: return "RMR_CMD_SEND_MD_BUF"; + case RMR_CMD_MD_SEND: return "RMR_CMD_MD_SEND"; + + case RMR_CMD_MAP_GET_VER: return "RMR_CMD_MAP_GET_VER"; + case RMR_CMD_MAP_SET_VER: return "RMR_CMD_MAP_SET_VER"; + case RMR_CMD_DISCARD_CLEAR_FLAG: return "RMR_CMD_DISCARD_CLEAR_FLAG"; + case RMR_CMD_SEND_DISCARD: return "RMR_CMD_SEND_DISCARD"; + + case RMR_MAP_CMD_MAX: return "RMR_MAP_CMD_MAX"; + + case RMR_CMD_POOL_INFO: return "RMR_CMD_POOL_INFO"; + case RMR_CMD_JOIN_POOL: return "RMR_CMD_JOIN_POOL"; + + case RMR_CMD_REJOIN_POOL: return "RMR_CMD_REJOIN_POOL"; + + case RMR_CMD_LEAVE_POOL: return "RMR_CMD_LEAVE_POOL"; + case RMR_CMD_ENABLE_POOL: return "RMR_CMD_ENABLE_POOL"; + + case RMR_CMD_USER: return "RMR_CMD_USER"; + + case RMR_POOL_CMD_MAX: return "RMR_POOL_CMD_MAX"; + + default: return "Unknown command"; + } +} + +void free_pool(struct rmr_pool *pool) +{ + WARN_ON(!list_empty(&pool->sess_list)); + + cleanup_srcu_struct(&pool->sess_list_srcu); + cleanup_srcu_struct(&pool->map_srcu); + + if (!list_empty(&pool->entry)) { + mutex_lock(&pool_mutex); + list_del(&pool->entry); + mutex_unlock(&pool_mutex); + } + + percpu_ref_exit(&pool->ids_inflight_ref); + kfree(pool); +} + +/** + * rmr_find_pool_by_group_id - Find a pool with group_id in global pool list + * + * @group_id: Group_id of the pool being searched + * + * Locks: + * Caller should hold global pool_mutex + */ +struct rmr_pool *rmr_find_pool_by_group_id(u32 group_id) +{ + struct rmr_pool *pool; + + list_for_each_entry(pool, &pool_list, entry) + if (pool->group_id == group_id) + return pool; + + return NULL; +} + +/** + * rmr_find_pool - Find a pool named poolname in the global pool list + * + * @poolname: Name of the pool to be searched + * + * Locks: + * Caller must hold global pool_mutex + */ +struct rmr_pool *rmr_find_pool(const char *poolname) +{ + struct rmr_pool *pool; + + lockdep_assert_held(&pool_mutex); + + list_for_each_entry(pool, &pool_list, entry) { + if (!strcmp(poolname, pool->poolname)) + return pool; + } + + return NULL; +} + +static void rmr_pool_inflight_ref_release(struct percpu_ref *ref) +{ + struct rmr_pool *pool = container_of(ref, struct rmr_pool, ids_inflight_ref); + + complete_all(&pool->complete_done); +} + +void rmr_pool_confirm_inflight_ref(struct percpu_ref *ref) +{ + struct rmr_pool *pool = container_of(ref, struct rmr_pool, ids_inflight_ref); + + complete_all(&pool->confirm_done); +} + +static struct rmr_pool *alloc_pool(const char *poolname, u32 group_id) +{ + struct rmr_pool *pool; + int ret; + + pr_debug("%s: allocate pool %s with group_id %u\n", + __func__, poolname, group_id); + + if (strlen(poolname) > NAME_MAX) { + pr_err("%s: Failed to create '%s': name too long\n", __func__, poolname); + return ERR_PTR(-EINVAL); + } + + pool = kzalloc(sizeof(struct rmr_pool), GFP_KERNEL); + if (unlikely(!pool)) + return ERR_PTR(-ENOMEM); + + ret = init_srcu_struct(&pool->sess_list_srcu); + if (ret) { + pr_err("%s: Sess list srcu init failed, err: %d\n", __func__, ret); + pool = ERR_PTR(ret); + goto free_pool; + } + + ret = init_srcu_struct(&pool->map_srcu); + if (ret) { + 
pr_err("%s: Map srcu init failed, err: %d\n", __func__, ret); + pool = ERR_PTR(ret); + goto cleanup_sess_srcu; + } + + ret = percpu_ref_init(&pool->ids_inflight_ref, + rmr_pool_inflight_ref_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (ret) { + pr_err("%s: Percpu reference init failed for pool %s\n", __func__, poolname); + pool = ERR_PTR(ret); + goto cleanup_map_srcu; + } + + pool->group_id = group_id; + pool->map_ver = 1; + pool->mapped_size = 0; + xa_init_flags(&pool->stg_members, XA_FLAGS_ALLOC); + init_completion(&pool->complete_done); + init_completion(&pool->confirm_done); + mutex_init(&pool->sess_lock); + mutex_init(&pool->maps_lock); + INIT_LIST_HEAD(&pool->entry); + INIT_LIST_HEAD(&pool->sess_list); + + init_completion(&pool->discard_done); + atomic_set(&pool->discard_waiting, 0); + atomic_set(&pool->normal_count, 0); + + strscpy(pool->poolname, poolname, sizeof(pool->poolname)); + + return pool; + +cleanup_map_srcu: + cleanup_srcu_struct(&pool->map_srcu); +cleanup_sess_srcu: + cleanup_srcu_struct(&pool->sess_list_srcu); +free_pool: + kfree(pool); + return pool; +} + +struct rmr_pool *rmr_create_pool(const char *poolname, void *priv) +{ + u32 group_id; + struct rmr_pool *pool; + + mutex_lock(&pool_mutex); + + pool = rmr_find_pool(poolname); + if (unlikely(pool)) { + pr_err("Pool '%s' already exists\n", poolname); + pool = ERR_PTR(-EEXIST); + goto out; + } + + /* Calculate the poolname hash */ + group_id = rmr_pool_hash(poolname); + + /* Double ensure there is no hash-clash */ + pool = rmr_find_pool_by_group_id(group_id); + if (unlikely(pool)) { + pr_err("Pool '%s' already exists\n", poolname); + pool = ERR_PTR(-EEXIST); + goto out; + } + + pool = alloc_pool(poolname, group_id); + if (IS_ERR(pool)) { + pr_err("Pool allocation failed for pool %s\n", poolname); + goto out; + } + + list_add(&pool->entry, &pool_list); + pool->priv = priv; + pool->pool_md.magic = RMR_POOL_MD_MAGIC; + +out: + mutex_unlock(&pool_mutex); + return pool; +} + +/** + * rmr_pool_maps_to_buf - Copy dirty_bitmap buffer of pool to buf + * + * @pool: The pool whose map is to be copied + * @map_idx: The map index in the pool's map array + * @offset: The offset to read from in the maps dirty_bitmap buffer + * @buf: Pointer to buf where to copy the dirty_bitmap buffer + * @buflen: Length of the buf available to copy to + * @filter: TODO + * + * Description: + * This function is one half of the (map <-> buf) pair. It is used to save map into a buf. + * The other half is rmr_pool_save_map, which is used to save a buf into the map. + * This function is used while both sending a map and reading a map. + * The process for both of them is largely same. + * + * The relevant params like member_id, offset for the dirty_bitmap buffer + * are stored in the rmr_map_buf_hdr, which is kept at the starting of buf. + * + * The caller has to take care of sending the correct map index and offset to copy from. + * For this, the function provides some help in the form of updating the map_idx and + * offset values (for map send), and storing it those in map_buf_hdr (for map read). 
+ * + * Return value: + * 0 If there is no more data to send + * Total size copied to buf + */ +int rmr_pool_maps_to_buf(struct rmr_pool *pool, u8 *map_idx, u64 *slp_idx, + void *buf, size_t buflen, rmr_map_filter filter) +{ + struct rmr_map_buf_hdr *map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + struct rmr_dirty_id_map *map = NULL; + int lock_idx; + u64 no_of_slp; + + /* Adjust buf and buflen */ + buf += sizeof(struct rmr_map_buf_hdr); + buflen -= sizeof(struct rmr_map_buf_hdr); + + lock_idx = srcu_read_lock(&pool->map_srcu); + for ( ; ; *map_idx += 1) { + + if (*map_idx >= pool->maps_cnt) { + srcu_read_unlock(&pool->map_srcu, lock_idx); + return 0; + } + + map = rcu_dereference(pool->maps[*map_idx]); + if (map) + break; + } + + map_buf_hdr->version = RMR_MAP_FORMAT_VER; + + /* This is for the destination, to inform where to store */ + map_buf_hdr->member_id = map->member_id; + map_buf_hdr->dst_slp_idx = (*slp_idx); + + /* + * SLPs are pages. Duh! + */ + no_of_slp = buflen >> PAGE_SHIFT; + no_of_slp = min(no_of_slp, (map->total_slp - *slp_idx)); + rmr_map_slps_to_buf(map, *slp_idx, no_of_slp, buf); + map_buf_hdr->buf_size = no_of_slp * PAGE_SIZE; + + if ((*slp_idx + no_of_slp) >= map->total_slp) { + /* + * All done for this map. + * Now move on to the next one, and reset the index. + */ + *map_idx += 1; + *slp_idx = 0; + } else { + /* + * Copy the number of SLPs we can, and increment the index. + */ + *slp_idx += no_of_slp; + } + + pr_info("%s: buf_size %u, buflen w/o hdr %lu\n", + __func__, map_buf_hdr->buf_size, buflen); + + /* This is for MAP_READ, to inform where to ask from next */ + map_buf_hdr->map_idx = *map_idx; + map_buf_hdr->slp_idx = *slp_idx; + + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return (map_buf_hdr->buf_size + sizeof(struct rmr_map_buf_hdr)); +} + +/** + * rmr_pool_save_map - Copy given buf to dirty_bitmap buffer of pool + * + * @pool: The pool whose map is the dest for the copy + * @buf: Pointer to buf from where to copy + * @buflen: Length of the buf available to copy + * @test_only: Only test if the buf given matches with dirty_bitmap buf of pool + * @map_clean: TODO + * + * Description: + * This function is the other half of the (map <-> buf) pair. + * It saves buf into the map of pool. The relevant params are read from the + * rmr_map_buf_hdr which lies in the start of the given buf. + * + * Return value: + * 0 on success + * -errno on error + */ +int rmr_pool_save_map(struct rmr_pool *pool, void *buf, size_t buflen, + bool test_only) +{ + struct rmr_map_buf_hdr *map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + struct rmr_dirty_id_map *map = NULL; + int err = 0, lock_idx; + u32 buf_size; + u64 slp_idx; + + if (map_buf_hdr->version != RMR_MAP_FORMAT_VER) { + pr_err("Wrong map format. 
Expected %d but received %llu\n", + RMR_MAP_FORMAT_VER, map_buf_hdr->version); + return -EINVAL; + } + + /* Adjust buf and buflen */ + buf += sizeof(struct rmr_map_buf_hdr); + buflen -= sizeof(struct rmr_map_buf_hdr); + + lock_idx = srcu_read_lock(&pool->map_srcu); + map = rmr_pool_find_map(pool, map_buf_hdr->member_id); + if (!map) { + pr_err("%s: No map found for member_id %llu\n", + __func__, map_buf_hdr->member_id); + err = -ENOENT; + goto out; + } + + slp_idx = map_buf_hdr->dst_slp_idx; + buf_size = map_buf_hdr->buf_size; + + pr_info("%s: For pool %s, received map for %llu, slp_idx %llu, buf_size %u, buflen %lu\n", + __func__, pool->poolname, map_buf_hdr->member_id, slp_idx, buf_size, buflen); + + /* Sanity */ + WARN_ON(buf_size > buflen); + WARN_ON(buf_size % PAGE_SIZE); + + pr_info("%s: buf_size %u, buflen w/o hdr %lu\n", __func__, map_buf_hdr->buf_size, buflen); + + /* + * The buf_size is a multiple of PAGE_SIZE, + * and that's how we know the number of SLPs to save. + */ + if (!rmr_map_buf_to_slps(map, buf, buf_size, slp_idx, test_only)) { + pr_err("%s: rmr_map_buf_to_slps failed\n", __func__); + err = -EINVAL; + goto out; + } + +out: + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return err; +} diff --git a/drivers/infiniband/ulp/rmr/rmr-pool.h b/drivers/infiniband/ulp/rmr/rmr-pool.h new file mode 100644 index 000000000000..3cb7d3ae84b9 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-pool.h @@ -0,0 +1,400 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_POOL_H +#define RMR_POOL_H + +#include /* for NAME_MAX */ +#include +#include +#include /* for jhash() */ +#include /* for round_up */ +#include "rmr.h" +#include "rmr-map.h" + +#define RMR_POOL_MD_MAGIC 0xDEADBEEF +#define XA_TRUE ((void *)1UL) +#define XA_FALSE ((void *)2UL) + +extern struct kmem_cache *rmr_map_entry_cachep; +/* + * enum srv_sync_thread_state + */ +enum srv_sync_thread_state { + SYNC_THREAD_REQ_STOP, /* 0 */ + SYNC_THREAD_STOPPED, + SYNC_THREAD_RUNNING, + SYNC_THREAD_WAIT, +}; + +enum srv_map_update_state { + MAP_UPDATE_STATE_DISABLED, + MAP_UPDATE_STATE_READY, + MAP_UPDATE_STATE_DONE, +}; + +/* The srv pool specific structure */ +struct rmr_srv_md { + u64 map_ver; + u64 mapped_size; /* server store size in sectors */ + u8 member_id; + u8 srv_pool_state; /* server pool state */ + u8 store_state; /* state of io_store */ + u8 map_update_state; + bool discard_entries; +}; + +/* Shared by each pool */ +struct rmr_pool_md { + char poolname[NAME_MAX]; + u64 magic; + u32 group_id; + u32 chunk_size; /* rmr client */ + u64 mapped_size; /* client view of store size */ + u32 queue_depth; + u64 map_ver; + struct rmr_srv_md srv_md[RMR_POOL_MAX_SESS]; +} __packed; + +struct rmr_pool { + char poolname[NAME_MAX]; + u32 group_id; /* jhash() on poolname */ + struct kobject kobj; + struct kobject sessions_kobj; + struct list_head entry; /* for global pool_list */ + + struct list_head sess_list; /* list of sessions */ + struct mutex sess_lock; /* protect list of sessions */ + struct srcu_struct sess_list_srcu; + + void *priv; + u64 mapped_size; + u32 chunk_size; + u8 chunk_size_shift; + u64 no_of_chunks; + + struct percpu_ref ids_inflight_ref; + struct completion complete_done; + struct completion confirm_done; + + struct completion discard_done; /* for sync client pool */ + /* Set when waiting for response of discard request */ + atomic_t discard_waiting; + + u8 maps_cnt; + struct mutex maps_lock; + struct rmr_dirty_id_map __rcu +
*maps[RMR_POOL_MAX_SESS]; + /* All member ids of the storage nodes */ + struct xarray stg_members; + u64 map_ver; + atomic_t normal_count; /* number of pool sessions currently in NORMAL state */ + struct srcu_struct map_srcu; + + struct rmr_pool_md pool_md; + + bool is_clt; + bool sync; +}; + +/** + * rmr_pool_find_md - find the index of the srv_md with the provided key in the pool_md + * + * @pool_md: the pool_md to search + * @key: the member_id of the server pool to search for + * @empty_slot: the empty slot is required by caller or not + * + * Description: + * Find the index of the srv_md with the matched key. If there is no such a key and the empty + * slot is not required, return -1. + * + * Return: + * >= 0, the index of the key in the pool_md. Return the index of an empty slot when the key + * is not found and the empty_slot flag is true + * -1 if the key is not found and empty_slot is false, or the pool_md doesn't exist + */ +static inline int rmr_pool_find_md(struct rmr_pool_md *pool_md, u8 key, bool empty_slot) +{ + int i; + int empty_i = -1; + + if (!pool_md) + return -1; + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + if (!pool_md->srv_md[i].member_id) + empty_i = i; + + if (pool_md->srv_md[i].member_id == key) + return i; + } + + if (empty_slot) + return empty_i; + return -1; +} + +/** + * rmr_pool_md_check_discard - check the discard_entries flag of the srv_md + * + * @pool: the pool to check pool_md + * @member_id: the member_id of the srv_md to check + * + * Description: + * Check if the pool has received the discards from the server pool with the provided + * member_id. + * + * Return: + * 1 (true) if the pool has received the discards, + * 0 (false) if the pool has not received the discards, + * <0 if the pool has no info of the server pool + */ +static inline int rmr_pool_md_check_discard(struct rmr_pool *pool, u8 member_id) +{ + int md_i = rmr_pool_find_md(&pool->pool_md, member_id, false); + + if (md_i < 0) { + pr_err("Failed to find md for member_id %u\n", member_id); + return -EINVAL; + } + + /* If the flag is set, this pool has received the discards. */ + return pool->pool_md.srv_md[md_i].discard_entries; +} + +#define RMR_MAP_FORMAT_VER 1 +/* + * Get the first most significant bit of map_ver. If it is one, then the store of that storage node + * is being replaced. + */ +#define RMR_STORE_IS_REPLACE(map_ver) (map_ver >> 63 & 1ULL) +#define RMR_STORE_GET_VER(map_ver) (map_ver & ~(1ULL << 63)) +#define RMR_STORE_SET_REPLACE(map_ver) (map_ver |= 1ULL << 63) +#define RMR_STORE_UNSET_REPLACE(map_ver) (map_ver &= ~(1ULL << 63)) +#define RTRS_IO_LIMIT 102400 +//#define RTRS_IO_LIMIT 40 //for tests only + +/* + * TODO: + * We currently do not have mapped_size while creating dirty maps, + * which means we cannot calculate no_of_chunks, hence cannot allocate bitmap + * So, as a workaround, we allocate max size bitmap, + * and to reduce that allocation, we cap max mapped_size. + * + * 1GB max mapped size for now. + * (Size mentioned in number of sectors, just like nr_sects) + */ +#define RMR_MAX_MAPPED_SIZE 2097152 + +/* The header structure of rmr pool metadata will not over this limit. 
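The high-bit encoding of map_ver defined by the RMR_STORE_* macros above can be exercised on its own. A small userspace sketch, which simply repeats those macro definitions so it compiles stand-alone:

#include <assert.h>
#include <stdint.h>

#define RMR_STORE_IS_REPLACE(map_ver)		(map_ver >> 63 & 1ULL)
#define RMR_STORE_GET_VER(map_ver)		(map_ver & ~(1ULL << 63))
#define RMR_STORE_SET_REPLACE(map_ver)		(map_ver |= 1ULL << 63)
#define RMR_STORE_UNSET_REPLACE(map_ver)	(map_ver &= ~(1ULL << 63))

int main(void)
{
	uint64_t map_ver = 7;			/* plain version number */

	RMR_STORE_SET_REPLACE(map_ver);		/* mark the store as being replaced */
	assert(RMR_STORE_IS_REPLACE(map_ver));
	assert(RMR_STORE_GET_VER(map_ver) == 7);	/* the version itself is unchanged */

	RMR_STORE_UNSET_REPLACE(map_ver);
	assert(map_ver == 7);
	return 0;
}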
*/ +#define RMR_MD_SIZE PAGE_SIZE +#define RMR_MD_SIZE_SECTORS (PAGE_SIZE / SECTOR_SIZE) +#define RMR_MAP_BUF_HDR_SIZE PAGE_SIZE +#define RMR_SRV_MD_SIZE (sizeof(struct rmr_srv_md) * RMR_POOL_MAX_SESS) +#define RMR_CLT_MD_SIZE (sizeof(struct rmr_pool_md) - RMR_SRV_MD_SIZE) +#define RMR_SECTOR_SIZE 512 +#define RMR_INT_ROUND_UP(x, y) (((x) + (y) - 1) / (y)) +#define RMR_ROUND_UP(x) round_up(x, RMR_SECTOR_SIZE) + +#define RMR_SRV_MAX_QDEPTH 512 + +/* last_io region starts right after the pool_md header page */ +#define RMR_LAST_IO_OFFSET RMR_MD_SIZE + +static inline u64 rmr_last_io_len(u32 queue_depth) +{ + return RMR_ROUND_UP((u64)queue_depth * sizeof(rmr_id_t)); +} + +static inline u64 rmr_bitmap_offset(u32 queue_depth) +{ + return RMR_LAST_IO_OFFSET + rmr_last_io_len(queue_depth); +} + +static inline u64 rmr_per_map_bitmap_size(u64 no_of_chunks) +{ + return DIV_ROUND_UP(no_of_chunks, CHUNKS_PER_SLP) * PAGE_SIZE; +} + +static inline u64 rmr_bitmap_len(u64 no_of_chunks) +{ + return RMR_POOL_MAX_SESS * rmr_per_map_bitmap_size(no_of_chunks); +} + +struct rmr_map_buf_hdr { + u64 version; + u64 member_id; + + /* + * dst_slp_idx: SLP index in the local dirty map buffer, + * from where to write the recved dirty map buffer + */ + u64 dst_slp_idx; + u32 buf_size; + + /* + * slp_idx: Only used for MAP_READ, + * to let client know where to ask from in the next iteration + */ + u64 map_idx; + u64 slp_idx; +} __packed; + +extern struct list_head pool_list; +extern struct mutex pool_mutex; + +const char *rmr_get_cmd_name(enum rmr_msg_cmd_type cmd); + +struct rmr_pool *rmr_create_pool(const char *poolname, void *priv); +void free_pool(struct rmr_pool *pool); + +struct rmr_pool *rmr_find_pool_by_group_id(u32 group_id); +struct rmr_pool *rmr_find_pool(const char *poolname); +int rmr_pool_maps_to_buf(struct rmr_pool *pool, u8 *map_idx, u64 *slp_idx, + void *buf, size_t buflen, rmr_map_filter filter); +int rmr_pool_save_map(struct rmr_pool *pool, void *buf, size_t buflen, + bool test_only); + +static inline void rmr_pool_update_no_of_chunk(struct rmr_pool *pool) +{ + u64 calc_no_of_chunks = 0, old_no_of_chunks = pool->no_of_chunks; + + /* + * In include/linux/types.h + * + * "Linux always considers sectors to be 512 (SECTOR_SHIFT==9) bytes long independently + * of the devices real block size." + * + * mapped_size is saved in sectors. + */ + if (pool->mapped_size) { + calc_no_of_chunks = (pool->mapped_size >> (pool->chunk_size_shift - 9)); + + if (pool->chunk_size && + (pool->mapped_size << 9) % pool->chunk_size) + calc_no_of_chunks += 1; + } + + if (calc_no_of_chunks != pool->no_of_chunks) { + pool->no_of_chunks = calc_no_of_chunks; + pr_info("%s: For %s, no_of_chunks old (%llu), updated %llu\n", + __func__, pool->poolname, old_no_of_chunks, pool->no_of_chunks); + } +} + +/* + * rmr_pool_maps_append - Append a map to the dense maps array + * @pool: pool + * @map: map to add + * + * Context: Caller must hold maps_lock. + */ +static inline void rmr_pool_maps_append(struct rmr_pool *pool, + struct rmr_dirty_id_map *map) +{ + rcu_assign_pointer(pool->maps[pool->maps_cnt], map); + pool->maps_cnt++; +} + +/* + * rmr_pool_maps_swap_remove - Remove map at index @i using swap-with-last + * @pool: pool + * @i: index of the map in the map array to remove + * @map: the map being removed + * + * Description: + * Maintains the dense invariant: pool->maps[0:maps_cnt] has no NULL gaps. + * + * Context: Caller must hold maps_lock. 
+ */ +static inline void rmr_pool_maps_swap_remove(struct rmr_pool *pool, u8 i, + struct rmr_dirty_id_map *map) +{ + u8 last = pool->maps_cnt - 1; + + if (i != last) + rcu_assign_pointer(pool->maps[i], rcu_dereference_protected(pool->maps[last], + lockdep_is_held(&pool->maps_lock))); + + rcu_assign_pointer(pool->maps[last], NULL); + pool->maps_cnt--; +} + +static inline struct rmr_dirty_id_map *rmr_pool_find_map(struct rmr_pool *pool, u8 member_id) +{ + int i; + struct rmr_dirty_id_map *map; + struct rmr_dirty_id_map *res = NULL; + + rcu_read_lock(); + for (i = 0; i < pool->maps_cnt; i++) { + map = rcu_dereference(pool->maps[i]); + + if (WARN_ON(!map) || map->member_id != member_id) + continue; + + res = map; + break; + } + rcu_read_unlock(); + + return res; +} + +static inline int rmr_pool_remove_map(struct rmr_pool *pool, u8 member_id) +{ + int i; + struct rmr_dirty_id_map *mp; + struct rmr_dirty_id_map *map = NULL; + + pr_info("%s: pool %s is removing map for member_id %d\n", + __func__, pool->poolname, member_id); + + mutex_lock(&pool->maps_lock); + for (i = 0; i < pool->maps_cnt; i++) { + mp = rcu_dereference_protected(pool->maps[i], + lockdep_is_held(&pool->maps_lock)); + if (WARN_ON(!mp)) + continue; + if (mp->member_id == member_id) { + map = mp; + break; + } + } + + if (!map) { + mutex_unlock(&pool->maps_lock); + pr_err("%s: pool %s cannot find map for member_id %d\n", + __func__, pool->poolname, member_id); + return -EINVAL; + } + + /* Dirty map entries are also removed since the map no longer exists. */ + rmr_map_unset_dirty_all(map); + + rmr_pool_maps_swap_remove(pool, i, map); + synchronize_srcu(&pool->map_srcu); + + mutex_unlock(&pool->maps_lock); + + /* Free up the memory */ + rmr_map_destroy(map); + + return 0; +} + + +bool rmr_pool_change_state(struct rmr_pool *pool, enum rmr_pool_state new_state); + +void rmr_pool_confirm_inflight_ref(struct percpu_ref *ref); + +static inline u32 rmr_pool_hash(const char *poolname) +{ + return jhash(poolname, strlen(poolname), 0); +} + +#endif /* RMR_POOL_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-proto.h b/drivers/infiniband/ulp/rmr/rmr-proto.h new file mode 100644 index 000000000000..02c20ed76bef --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-proto.h @@ -0,0 +1,273 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_PROTO_H +#define RMR_PROTO_H + +#define RMR_PROTO_VER_MAJOR 0 +#define RMR_PROTO_VER_MINOR 1 + +#define RMR_PROTO_VER_STRING __stringify(RMR_PROTO_VER_MAJOR) "." \ + __stringify(RMR_PROTO_VER_MINOR) + +#ifndef RMR_VER_STRING +#define RMR_VER_STRING __stringify(RMR_PROTO_VER_MAJOR) "." 
\ + __stringify(RMR_PROTO_VER_MINOR) +#endif + +/* TODO: should be configurable */ +#define RTRS_PORT 1234 + +#define RMR_POOL_MAX_SESS 4 + +/** + * enum rmr_msg_types - RMR message types + * @RMR_MSG_JOIN_POOL: Join pool message from client to server + * @RMR_MSG_JOIN_POOL_RSP: Join pool messge response from server to client + * @RMR_MSG_LEAVE_POOL: Leave pool message from client to server + * @RMR_MSG_IO: IO(read/write) request on an object + */ +enum rmr_msg_type { + RMR_MSG_CMD, + RMR_MSG_CMD_RSP, + RMR_MSG_IO, + RMR_MSG_MD, + RMR_MSG_MAP_CLEAR, + RMR_MSG_MAP_ADD, +}; + +/** + * struct rmr_msg_hdr - header of RMR messages + * @type: Message type, valid values see: enum rmr_msg_types + */ +struct rmr_msg_hdr { + __le32 group_id; /* poolname jhash() */ + __le16 type; + __le16 __padding; +}; + +/** + * struct rmr_msg_io - message for object I/O read/write + * @hdr: message header + * @id_a: first 64bit of the object id + * @id_b: second 64bit of the object id + * @offset: offset from where to read/write + * @flags: bitmask, valid values are defined in enum rmr_io_flags + * @length: number of bytes for I/O read/write + * @pool_id: pool id to which the object belongs + */ +struct rmr_msg_io { + struct rmr_msg_hdr hdr; + __le64 id_a; + __le64 id_b; + + __le32 offset; + __le32 length; + __le32 flags; + __le16 prio; + + __le32 mem_id; + __le64 map_ver; + u8 failed_id[RMR_POOL_MAX_SESS]; + u8 failed_cnt; + + u8 member_id; + u8 sync; + u8 __padding[19]; //padding is not correct now i think +}; + +struct rmr_pool_member_info { + u8 no_of_stor; + + struct per_mem_info { + u8 member_id; + u8 c_dirty; + } p_mem_info[RMR_POOL_MAX_SESS]; +}; + +/** + * enum rmr_msg_cmd_types - RMR command types + * @RMR_CMD_MAP_READY: Get ready to receive map + * @RMR_CMD_MAP_SEND: Send map to certain node + * @RMR_CMD_MAP_DONE: Confirm map receipt + * + * When adding a command, + * make sure to add it to the function rmr_get_cmd_name. + */ +enum rmr_msg_cmd_type { + RMR_CMD_MAP_READY, // 0 + RMR_CMD_MAP_SEND, + RMR_CMD_SEND_MAP_BUF, + RMR_CMD_MAP_BUF_DONE, + RMR_CMD_MAP_DONE, + RMR_CMD_MAP_DISABLE, + RMR_CMD_READ_MAP_BUF, + RMR_CMD_MAP_CHECK, + RMR_CMD_LAST_IO_TO_MAP, + RMR_CMD_STORE_CHECK, + RMR_CMD_MAP_TEST, + /* sends the metadata of non-sync rmr-client to server */ + RMR_CMD_SEND_MD_BUF, + /*sends the message of discards to the node */ + RMR_CMD_SEND_DISCARD, + /* sends the message of md_update to the node; the node sends its srv_md back. 
*/ + RMR_CMD_MD_SEND, + + RMR_CMD_MAP_GET_VER, // 14 + RMR_CMD_MAP_SET_VER, + RMR_CMD_DISCARD_CLEAR_FLAG, + + /* + * Add map related commands above this + */ + RMR_MAP_CMD_MAX, + + RMR_CMD_POOL_INFO, // 18 + RMR_CMD_JOIN_POOL, + + RMR_CMD_REJOIN_POOL, + + RMR_CMD_LEAVE_POOL, + RMR_CMD_ENABLE_POOL, // 22 + + RMR_CMD_USER, + + /* + * Add pool related commands above this + */ + RMR_POOL_CMD_MAX, +}; + +struct rmr_msg_map_send_cmd { + u8 receiver_member_id; +}; + +struct rmr_msg_map_buf_cmd { + u64 version; + u8 map_idx; + u64 slp_idx; +}; + +struct rmr_msg_map_buf_done_cmd { + u64 map_version; +}; + +struct rmr_msg_map_done_cmd { + u8 enable; +}; + +struct rmr_msg_send_md_buf_cmd { + u8 sync; /* if the pool is sync or not */ + u8 sender_id; + u8 receiver_id; + u64 flags; +}; + +struct rmr_msg_send_discard_cmd { + u8 member_id; /* the storage node that discards all data */ +}; + +struct rmr_msg_md_send_cmd { + u64 src_mapped_size; /* the pool mapped size on the sending side */ + u8 sender_id; + u8 leader_id; + u8 read_full_md; /* 1 = return full pool_md; 0 = own entry only */ +}; + +struct rmr_msg_pool_info_cmd { + u8 member_id; + u8 operation; /* add/remove */ + u8 mode; /* For add -> create/assemble. For remove -> delete/disassemble */ + u8 dirty; /* Valid only when operation=ADD and mode=CREATE */ +}; + +enum rmr_pool_info_op { + RMR_POOL_INFO_OP_ADD = 0, + RMR_POOL_INFO_OP_REMOVE, +}; + +enum rmr_pool_info_mode { + RMR_POOL_INFO_MODE_CREATE = 0, + RMR_POOL_INFO_MODE_ASSEMBLE, + RMR_POOL_INFO_MODE_DELETE, + RMR_POOL_INFO_MODE_DISASSEMBLE, +}; + +struct rmr_msg_set_map_ver_cmd { + u8 map_ver; /* the map version to set */ +}; + +struct rmr_msg_join_pool_cmd { + u64 queue_depth; + u32 chunk_size; + struct rmr_pool_member_info mem_info; + u8 dirty; + u8 create; + u8 rejoin; +}; + +struct rmr_msg_leave_pool_cmd { + u8 member_id; + u8 delete; +}; + +struct rmr_msg_enable_pool_cmd { + u32 enable; +}; + +struct rmr_msg_user_cmd { + size_t usr_len; +}; + +struct rmr_msg_join_pool_cmd_rsp { + u64 mapped_size; + u32 chunk_size; +}; + +struct rmr_msg_pool_cmd { + struct rmr_msg_hdr hdr; + u8 ver; + u8 cmd_type; + u8 sync; + u8 rsvd[1]; + s8 pool_name[NAME_MAX]; + union { + struct rmr_msg_map_send_cmd map_send_cmd; + struct rmr_msg_map_buf_cmd map_buf_cmd; + struct rmr_msg_map_buf_done_cmd map_buf_done_cmd; + struct rmr_msg_map_done_cmd map_done_cmd; + + struct rmr_msg_send_md_buf_cmd send_md_buf_cmd; + struct rmr_msg_send_discard_cmd send_discard_cmd; + struct rmr_msg_md_send_cmd md_send_cmd; + + struct rmr_msg_pool_info_cmd pool_info_cmd; + + struct rmr_msg_set_map_ver_cmd set_map_ver_cmd; + + struct rmr_msg_join_pool_cmd join_pool_cmd; + + struct rmr_msg_leave_pool_cmd leave_pool_cmd; + struct rmr_msg_enable_pool_cmd enable_pool_cmd; + + struct rmr_msg_user_cmd user_cmd; + }; +}; + +struct rmr_msg_pool_cmd_rsp { + struct rmr_msg_hdr hdr; + enum rmr_msg_cmd_type cmd_type; + u8 err; + u8 ver; + u8 member_id; + union { + struct rmr_msg_join_pool_cmd_rsp join_pool_cmd_rsp; + u64 value; + }; +}; + +#endif /* RMR_PROTO_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-req.c b/drivers/infiniband/ulp/rmr/rmr-req.c new file mode 100644 index 000000000000..d748579c489c --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-req.c @@ -0,0 +1,796 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#include + +#include "rmr-req.h" +#include "rmr-srv.h" +#include "rmr-clt.h" + +extern struct kmem_cache *rmr_req_cachep; +extern struct 
kmem_cache *rmr_map_entry_cachep; +extern struct rmr_store_ops *pstore_ops; + +static void rmr_req_complete(struct rmr_srv_req *req); +static void rmr_req_store_done(struct rmr_srv_req *req); +static void rmr_req_sync_failed(struct rmr_srv_req *req); +static void rmr_req_send_map_clear(struct rmr_srv_req *req); +static void rmr_req_sync_complete(struct rmr_srv_req *req); +static void rmr_req_store(struct rmr_srv_req *req); + +/** + * rmr_srv_req_resp - Response from the lower level module + * + * @req: Request to be processed + * @err: Error value + * + * Description: + * This function is the return point from the below module + * where IO is submitted. + * + * Context: + * In this function the request should always be in state RMR_REQ_STATE_STORE + */ +void rmr_srv_req_resp(struct rmr_srv_req *req, int err) +{ + /* + * Use the error sent from lower layer + */ + req->err = err; + + /* + * For Normal (non-sync) requests we handle both non-error and error cases from one + * place. Since its simple. + */ + if (rmr_op(req->flags) != RMR_OP_SYNCREQ) { + rmr_req_complete(req); + return; + } + + /* + * Sync requests are complicated, since it needs extra post-processing + * once IO is done for us. + * + * 1) In case of no failure, we need to send map clear to other nodes, + * since they think we are still dirty for this chunk. + * + * 2) We need to check for waiting IO in entry->wait_list, and kick them. + */ + if (!req->err) + rmr_req_store_done(req); + else + rmr_req_sync_failed(req); +} +EXPORT_SYMBOL(rmr_srv_req_resp); + +/** + * rmr_srv_req_create - Create an rmr server request + * + * @msg: IO message containing information + * @srv_pool: Server pool creating this request + * @rtrs_op: rtrs IO context + * @data: pointer to data buf + * @datalen: len of data buf + * @endreq: Function to be called at the end of rmr request processing + * + * Description: + * RMR server request are base structures which holds the IO while they are being processed. + * They go through a state machine, while a number of checks are done. IOs which are + * destined for a chunk that is dirty, are paused while that chunk is synced. 
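To see where rmr_srv_req_resp() fits, consider a hypothetical backing-store driver: it keeps the request pointer it received through submit_req() and hands it back once its own IO completes. This is only an illustrative sketch of that contract; the handler name and ctx argument are made up:

/*
 * Hypothetical completion handler in a backing-store implementation.
 * ctx is the struct rmr_srv_req * that was passed to submit_req().
 */
static void sketch_store_io_done(void *ctx, int error)
{
	struct rmr_srv_req *req = ctx;

	/*
	 * rmr_srv_req_resp() completes normal requests directly and runs
	 * the extra post-processing for RMR_OP_SYNCREQ requests.
	 */
	rmr_srv_req_resp(req, error);
}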
+ * + * Return: + * Pointer to the create rmr server request on success + * Error pointer on failure + */ +struct rmr_srv_req *rmr_srv_req_create(const struct rmr_msg_io *msg, struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, void *data, u32 datalen, + void (*endreq)(struct rmr_srv_req *, int)) +{ + struct rmr_srv_req *req; + struct rmr_srv_io_store *store = srv_pool->io_store; + int i; + + if (!store || !atomic_read(&srv_pool->store_state)) { + pr_err("%s: store not set, or srv_pool not in correct state %s\n", + __func__, srv_pool->pool->poolname); + return ERR_PTR(-ENODEV); + } + + req = kmem_cache_zalloc(rmr_req_cachep, GFP_KERNEL); + if (!req) { + pr_err("cannot allocate memory for rmr_req.\n"); + return ERR_PTR(-ENOMEM); + } + + req->id.a = le64_to_cpu(msg->id_a); + req->id.b = le64_to_cpu(msg->id_b); + + req->offset = le32_to_cpu(msg->offset); + req->length = le32_to_cpu(msg->length); + req->flags = le32_to_cpu(msg->flags); + req->prio = le16_to_cpu(msg->prio); + + req->mem_id = le32_to_cpu(msg->mem_id); + for (i = 0; i < msg->failed_cnt; i++) + req->failed_srv_id[i] = msg->failed_id[i]; + + req->failed_cnt = msg->failed_cnt; + req->map_ver = le64_to_cpu(msg->map_ver); + req->sync = msg->sync; + + req->data = data; + req->datalen = datalen; + req->rtrs_op = rtrs_op; + req->srv_pool = srv_pool; + req->store = store; + req->endreq = endreq; + + pr_debug("req %p, chunk_size %u\n", req, req->srv_pool->pool->chunk_size); + + return req; +} + +struct rmr_srv_req *rmr_srv_md_req_create(struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, void *data, + u32 offset, u32 len, unsigned long flags, + void (*endreq)(struct rmr_srv_req *, int)) +{ + struct rmr_srv_req *req; + struct rmr_srv_io_store *store = srv_pool->io_store; + + if (!store) { + pr_err("No store_id registered for srv pool %s\n", srv_pool->pool->poolname); + return ERR_PTR(-ENODEV); + } + + req = kmem_cache_zalloc(rmr_req_cachep, GFP_KERNEL); + if (!req) { + pr_err("cannot allocate memory for rmr_req.\n"); + return ERR_PTR(-ENOMEM); + } + req->offset = offset; + req->length = len; + req->flags = flags; + req->sync = false; /* A md req is always non-sync */ + + req->data = data; + req->rtrs_op = rtrs_op; + req->srv_pool = srv_pool; + req->store = store; + req->endreq = endreq; + + pr_debug("md req %p, len %u\n", req, len); + + return req; +} + +void rmr_req_submit(struct rmr_srv_req *req); +static void rmr_req_sched(struct work_struct *work) +{ + struct rmr_srv_req *req = container_of(work, struct rmr_srv_req, work); + + pr_debug("scheduled work process for req %p\n", req); + if (req->err) + rmr_req_complete(req); + else + rmr_req_submit(req); +} + +void rmr_process_wait_list(struct rmr_map_entry *entry, int err) +{ + struct llist_node *first, *next; + struct rmr_srv_req *req; + + pr_debug("processing wait list for entry %p, sync_cnt=%d\n", + entry, atomic_read(&entry->sync_cnt)); + + WARN_ON(atomic_read(&entry->sync_cnt) > 0); + + while (!llist_empty(&entry->wait_list)) { + first = llist_del_all(&entry->wait_list); + while (first) { + next = first->next; + req = llist_entry(first, struct rmr_srv_req, node); + + pr_debug("process waiting req %p id (%llu, %llu) flags %u\n", + req, req->id.a, req->id.b, req->flags); + if (err) { + pr_err("fail waiting req %p id (%llu, %llu) flags %u err %d\n", + req, req->id.a, req->id.b, req->flags, err); + req->err = -EIO; + } + + pr_debug("schedule processing req %p with err %d\n", req, req->err); + INIT_WORK(&req->work, rmr_req_sched); + schedule_work(&req->work); + + first = 
next; + } + } +} + +void rmr_req_submit(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_map_entry *entry; + struct rmr_dirty_id_map *map; + + if (rmr_op(req->flags) == RMR_OP_FLUSH && !req->length) { + rmr_req_store(req); + return; + } + + pr_debug("check map for req %p flag %u request id [%llu, %llu] offset %u length %u\n", + req, req->flags, + req->id.a, req->id.b, req->offset, req->length); + + map = rmr_pool_find_map(srv_pool->pool, srv_pool->member_id); + if (!map) { + pr_err("no map found for pool_id %u\n", srv_pool->member_id); + req->err = -EINVAL; + goto err; + } + + rcu_read_lock(); + entry = rmr_map_get_dirty_entry(map, req->id); + if (!entry) { + /* + * The chunk containing data for this req is NOT dirty for us + */ + pr_debug("check map for req %p flags %u request id [%llu, %llu], no entry in the map\n", + req, req->flags, req->id.a, req->id.b); + rcu_read_unlock(); + rmr_req_store(req); + return; + } else { + /* + * The chunk for this data is dirty for us. + * + * we have 2 cases. + * + * 1) Its coming from a sync rmr-clt (Its an internal read). + * Then, fail the IO, since we do not want to end up in a deadlock, + * or go through multiple hops for a single read. The sender can try some other + * node itself. + */ + if (req->sync) { + WARN_ON(rmr_op(req->flags) != RMR_OP_READ); + rcu_read_unlock(); + req->err = -EIO; + goto err; + } + + /* + * 2) If its coming from a non-sync rmr-clt, + * simply go ahead with syncing the data first. + */ + llist_add(&req->node, &entry->wait_list); + pr_debug("%s: req %p flags %u id (%llu %llu) added to wait list. sync_cnt %d\n", + __func__, req, req->flags, req->id.a, req->id.b, + atomic_read(&entry->sync_cnt)); + + rcu_read_unlock(); + /* + * If we are the first who grabs the entry then start sync. + * + * Otherwise, the one syncing the data would pick us up from the entry->wait_list + * and kick us. So simply exit for now. 
+ */ + if (atomic_cmpxchg(&entry->sync_cnt, -1, 0) == -1) { + int err; + + req->priv = entry; + err = rmr_srv_sync_chunk_id(srv_pool, entry, req->id, false); + if (err) { + atomic_set(&entry->sync_cnt, -1); + rmr_process_wait_list(entry, err); + } + } + } + + return; + +err: + rmr_req_complete(req); +} + +static void rmr_req_store(struct rmr_srv_req *req) +{ + int err; + + pr_debug("submit to store req %p flags %u request id [%llu, %llu] offset %u length %u\n", + req, req->flags, + req->id.a, req->id.b, req->offset, req->length); + + err = req->store->ops->submit_req(req->store->priv, req->data, req->offset, + req->length, req->flags, req->prio, req); + if (err) { + pr_err("%s: error submitting req %p, err %d\n", __func__, req, err); + req->err = err; + if (rmr_op(req->flags) == RMR_OP_SYNCREQ) + rmr_req_sync_failed(req); + else + rmr_req_complete(req); + } +} + +static void rmr_md_req_store(struct rmr_srv_req *req) +{ + int err; + + err = req->store->ops->submit_md_req(req->store->priv, req->data, req->offset, req->length, + req->flags, req); + if (err) { + req->endreq(req, err); + pr_err("release md req %p, flags %u\n", req, req->flags); + kmem_cache_free(rmr_req_cachep, req); + } +} + +/* md req submission path*/ +void rmr_md_req_submit(struct rmr_srv_req *req) +{ + rmr_md_req_store(req); +} + +static void rmr_req_sched_store(struct work_struct *work) +{ + struct rmr_srv_req *req = container_of(work, struct rmr_srv_req, work); + + pr_debug("scheduled store for req %p\n", req); + rmr_req_store(req); +} + +static void rmr_req_remote_io_done(void *priv, int err) +{ + struct rmr_srv_req *req = priv; + + pr_debug("called for req %p, err code %d\n", req, err); + + rmr_clt_put_iu(req->srv_pool->clt, req->iu); + + if (err) { + req->err = err; + rmr_req_sync_failed(req); + return; + } + + pr_debug("schedule store for req %p with err %d\n", req, req->err); + INIT_WORK(&req->work, rmr_req_sched_store); + schedule_work(&req->work); +} + +static void rmr_req_remote_read(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_pool *clt = srv_pool->clt; + unsigned long flags; + int err; + + pr_debug("redirecting req id (%llu, %llu)\n", + req->id.a, req->id.b); + if (!clt) { + pr_err("No srv pool assigned for redirect for %s\n", srv_pool->pool->poolname); + err = -EINVAL; + goto err; + } + + if (rmr_op(req->flags) == RMR_OP_SYNCREQ) + flags = RMR_OP_READ; + else + flags = req->flags; + + req->iu = rmr_clt_get_iu(clt, flags, WAIT); + if (IS_ERR_OR_NULL(req->iu)) { + pr_err("Failed to get rmr_iu for req id (%llu, %llu)\n", + req->id.a, req->id.b); + err = -EINVAL; + goto err; + } + + sg_init_one(&req->sg, req->data, req->datalen); + + pr_debug("After sg_init_one nents=%d\n", sg_nents(&req->sg)); + + /* look at the flags here! 
*/ + err = rmr_clt_request(clt, req->iu, req->offset, req->length, flags, + req->prio, req, rmr_req_remote_io_done, + &req->sg, sg_nents(&req->sg)); + if (err) { + pr_err("rmr_clt_request error %d\n", err); + rmr_clt_put_iu(clt, req->iu); + err = -EREMOTEIO; + goto err; + } + + pr_debug("remote read submitted\n"); + return; + +err: + req->err = err; + rmr_req_sync_failed(req); +} + +static void rmr_sync_req_sched(struct work_struct *work) +{ + struct rmr_srv_req *req = container_of(work, struct rmr_srv_req, work); + + pr_debug("scheduled work process for req %p\n", req); + if (req->err) + rmr_req_sync_complete(req); + else + rmr_req_send_map_clear(req); +} + +static void rmr_req_complete(struct rmr_srv_req *req) +{ + pr_debug("send completeion for req %p flags %u request id (%llu, %llu) offset %u length %u err %d\n", + req, req->flags, + req->id.a, req->id.b, req->offset, req->length, req->err); + + /* endreq() records the Last IO buffer accordingly. */ + req->endreq(req, req->err); + + pr_debug("release req %p, flags %u\n", req, req->flags); + + kmem_cache_free(rmr_req_cachep, req); +} + +static struct rmr_srv_req *rmr_req_create_sync_req(struct rmr_srv_pool *srv_pool, rmr_id_t id, + u32 offset, u32 len, bool from_sync, + struct rmr_srv_req *parent) +{ + struct rmr_srv_req *req; + struct rmr_srv_io_store *store = srv_pool->io_store; + + if (!store) { + pr_err("No store_id registered for srv pool %s\n", srv_pool->pool->poolname); + return ERR_PTR(-ENODEV); + } + + req = kmem_cache_zalloc(rmr_req_cachep, GFP_KERNEL); + if (!req) { + pr_err("cannot allocate memory for rmr_req.\n"); + return ERR_PTR(-ENOMEM); + } + req->id.a = id.a; + req->id.b = id.b; + req->flags = RMR_OP_SYNCREQ; + req->length = len; + req->offset = offset; + req->srv_pool = srv_pool; + req->store = store; + req->from_sync = from_sync; + + if (parent) { + req->data = parent->data + offset; + } else { + req->data = kmalloc(req->length, GFP_KERNEL); + if (!req->data) { + pr_err("cannot allocate memory for sync req id [%llu, %llu]\n", + req->id.a, req->id.b); + kmem_cache_free(rmr_req_cachep, req); + return ERR_PTR(-ENOMEM); + } + } + req->datalen = len; + req->parent = parent; + + pr_debug("sync req %p created, flags %u request id (%llu, %llu) offset %u length %u parent %p\n", + req, req->flags, req->id.a, req->id.b, req->offset, req->length, parent); + + return req; +} + +//should be called only if corresponding map entry has 0 sync cnt +int rmr_srv_sync_chunk_id(struct rmr_srv_pool *srv_pool, struct rmr_map_entry *entry, + rmr_id_t id, bool from_sync) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_dirty_id_map *map; + struct rmr_srv_req *parent_req; + u32 max_io_size, total_len, offset; + + if (!srv_pool->clt) { + pr_err("For pool %s no sync pool assigned.\n", pool->poolname); + return -EINVAL; + } + max_io_size = srv_pool->max_sync_io_size; + + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("no map found for pool_id %u\n", srv_pool->member_id); + //TODO: handle this , probably initialize map, or just throw err? + return -EINVAL; + } + + offset = CHUNK_TO_OFFSET(id.b, pool->chunk_size_shift); + total_len = pool->chunk_size; + + pr_debug("pool %s sync id (%llu, %llu), total_len %u, max_io_size %u\n", + pool->poolname, id.a, id.b, total_len, max_io_size); + + /* + * The parent_req starts with total_len, then get decremented in loop below. + * The child reqs are filled one by one from end to second. + * + * Maybe refactor this to a simple loop? 
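The tail-first carving performed below is easier to follow with concrete numbers. The following userspace sketch only mirrors the offset/length arithmetic; the chunk offset, chunk size and max_io_size values are made up for illustration:

#include <stdio.h>

int main(void)
{
	unsigned int chunk_offset = 0;			/* disk offset of the chunk */
	unsigned int remaining = 1024 * 1024;		/* chunk_size: the parent starts with it all */
	const unsigned int max_io_size = 256 * 1024;	/* per-IO limit of the sync path */

	/* Children are carved from the tail of the chunk... */
	while (remaining > max_io_size) {
		printf("child : offset %7u len %u\n",
		       chunk_offset + (remaining - max_io_size), max_io_size);
		remaining -= max_io_size;
	}
	/* ...and the parent request keeps whatever is left at the front. */
	printf("parent: offset %7u len %u\n", chunk_offset, remaining);
	return 0;
}

For a 1MB chunk and 256KB max_io_size this yields three child requests at offsets 786432, 524288 and 262144, with the parent keeping the first 256KB at offset 0.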
+ */ + parent_req = rmr_req_create_sync_req(srv_pool, id, offset, total_len, from_sync, NULL); + if (IS_ERR_OR_NULL(parent_req)) { + pr_err("pool %s failed to create main sync req to sync id (%llu, %llu)\n", + pool->poolname, id.a, id.b); + return -ENOMEM; + } + parent_req->priv = entry; + + if (from_sync) { + if (rmr_srv_get_sync_permit(srv_pool)) { + pr_err("rmr_srv_sync_chunk_id failed to acquire permit for parent\n"); + kfree(parent_req->data); + kmem_cache_free(rmr_req_cachep, parent_req); + + return -EINVAL; + } + } + + // inc ref cnt for parent_req + map_entry_get_sync(entry); + while (parent_req->length > max_io_size) { + struct rmr_srv_req *req; + u32 child_offset = offset + (parent_req->length - max_io_size); + + // submit req + req = rmr_req_create_sync_req(srv_pool, id, (parent_req->length - max_io_size), + max_io_size, from_sync, parent_req); + if (IS_ERR_OR_NULL(req)) { + pr_err("%s: Pool %s, id (%llu, %llu), offset %u, len %u, err %ld\n", + __func__, pool->poolname, id.a, id.b, + (parent_req->length - max_io_size), max_io_size, PTR_ERR(req)); + parent_req->err = PTR_ERR(req); + + rmr_req_sync_failed(parent_req); + return -EINVAL; + } + + /* + * The offset sent to rmr_req_create_sync_req for this req is in context of the + * chunk. But the real offset for this req in the disk is this. + */ + req->offset = child_offset; + + if (from_sync) { + if (rmr_srv_get_sync_permit(srv_pool)) { + pr_err("rmr_srv_sync_chunk_id failed to acquire permit for child\n"); + kmem_cache_free(rmr_req_cachep, req); + + parent_req->err = -EBUSY; + rmr_req_sync_failed(parent_req); + return -EINVAL; + } + } + + // inc ref cnt for the child req just created + map_entry_get_sync(entry); + req->priv = entry; + rmr_req_remote_read(req); + + parent_req->length -= max_io_size; + parent_req->datalen -= max_io_size; + } + + //submit parent req + rmr_req_remote_read(parent_req); + + return 0; +} + +static void __release_parent_req(struct rcu_head *head) +{ + struct rmr_srv_req *req = container_of(head, struct rmr_srv_req, rcu); + struct rmr_map_entry *entry = req->priv; + + pr_debug("is called for req=%p id=(%llu,%llu) err=%d, entry=%p\n", + req, req->id.a, req->id.b, req->err, entry); + + kfree(req->data); + + //may be now we can stop saving entry in req->priv, but always rmr_map_find it + if (!req->err) { + pr_debug("req %p, completed all sync req, lets clean map\n", req); + rmr_process_wait_list(entry, 0); + } else { + pr_debug("req %p completed with err %d, process wait list\n", + req, req->err); + + /* sync of this entry failed, we reset the sync_cnt so that the other req + * or sync thread could try again in the future. Without resetting, no one + * could get the ref and start sync again. + */ + atomic_set(&entry->sync_cnt, -1); + rmr_process_wait_list(entry, req->err); + } + + pr_debug("free entry %p for req %p\n", entry, req); + kmem_cache_free(rmr_map_entry_cachep, entry); + + if (req->from_sync) + rmr_srv_put_sync_permit(req->srv_pool); + + kmem_cache_free(rmr_req_cachep, req); +} + +static void rmr_req_sync_complete(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_dirty_id_map *map; + int lock_idx; + + pr_debug("sync_req %p completed for id (%llu, %llu), offset %u, len %u, err %d, from sync %d\n", + req, req->id.a, req->id.b, req->offset, req->length, + req->err, req->from_sync); + + if (req->err) + rmr_srv_sync_req_failed(req->srv_pool); + + pr_debug("release sync req %p, flags %u\n", req, req->flags); + + /* + * Only parent sync req own the allocated data. 
+ */ + if (!req->parent) { + if (!req->err) { + map = rmr_pool_find_map(srv_pool->pool, + srv_pool->member_id); + if (map) { + lock_idx = srcu_read_lock(&srv_pool->pool->map_srcu); + rmr_map_unset_dirty(map, req->id, + MAP_NO_FILTER); + srcu_read_unlock(&srv_pool->pool->map_srcu, lock_idx); + } else { + pr_err("no map found for pool_id %u\n", srv_pool->member_id); + req->err = -EINVAL; + } + } + + pr_debug("req %p, completed all sync req, lets clean map\n", + req); + call_rcu(&req->rcu, __release_parent_req); + } else { + /* + * Child req has nothing to do but put permit and free + */ + if (req->from_sync) + rmr_srv_put_sync_permit(req->srv_pool); + + kmem_cache_free(rmr_req_cachep, req); + } +} + +static void rmr_req_sync_failed(struct rmr_srv_req *req) +{ + rmr_srv_sync_req_failed(req->srv_pool); + + pr_err("pool %s sync req %p failed for id (%llu, %llu), offset %u, len %u, err %d\n", + req->srv_pool->pool->poolname, req, req->id.a, req->id.b, + req->offset, req->length, req->err); + + rmr_req_store_done(req); +} + +// this is actually very like rmr_req_remote_io_done but without rmr_clt_put_iu +// do we want to have one function for both cases? +static void rmr_req_map_clear_done(void *priv, int err) +{ + struct rmr_srv_req *req = priv; + + rmr_clt_put_iu(req->srv_pool->clt, req->iu); + + pr_debug("called for req %p, err code %d\n", req, err); + if (err) + pr_err("pool %s, sync req with id (%llu, %llu) failed to send map clear\n", + req->srv_pool->pool->poolname, req->id.a, req->id.b); + + rmr_req_sync_complete(req); +} + +static void rmr_req_store_done(struct rmr_srv_req *req) +{ + struct rmr_map_entry *entry = req->priv; + struct rmr_srv_req *parent_req = NULL; + + pr_debug("called for req %p id (%llu, %llu ) offset %u len %u with parent req %p\n", + req, req->id.a, req->id.b, req->offset, req->length, req->parent); + + if (req->parent) + parent_req = req->parent; + else + parent_req = req; + + if (req->err) + parent_req->err = req->err; + + if (map_entry_put_sync(entry)) { + pr_debug("%s: for entry %p id (%llu, %llu) all sync req done.\n", __func__, + entry, req->id.a, req->id.b); + + /* We have to schedule the work of parent req from here since we are in the + * interrupt context of either parent req or child req + */ + pr_debug("%s: process parent_req %p\n", __func__, parent_req); + INIT_WORK(&parent_req->work, rmr_sync_req_sched); + schedule_work(&parent_req->work); + } + + if (req != parent_req) { + pr_debug("completing req %p with err %d\n", req, req->err); + rmr_req_sync_complete(req); + } +} + +static void rmr_req_send_map_clear(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_pool *pool = srv_pool->clt; + struct rmr_iu *iu; + int err; + + if (!pool) { + pr_err("Cannot send map clear. No pool client assigend for srv pool %s\n", + req->srv_pool->pool->poolname); + req->err = -EINVAL; + goto err; + } + + /* + * We try to clear map, but if we fail to, we simply ignore the error. + * Such zombie entries will be clear by rmr_srv_check_map_clear. + */ + iu = rmr_clt_get_iu(pool, RMR_OP_WRITE, WAIT); + if (IS_ERR_OR_NULL(iu)) { + pr_err("Failed to get rmr_iu for req id (%llu, %llu)\n", + req->id.a, req->id.b); + goto err; + } + + pr_debug("send map clear req id (%llu, %llu), member_id %u\n", + req->id.a, req->id.b, srv_pool->member_id); + + /* + * For MAP_CLEAR, we only need rmr_id_t for chunk number, + * and our member_id to say to clear the above chunk number for ths storage node. 
+ * + * We also update the minimum members needed for map update. + */ + iu->msg.hdr.group_id = cpu_to_le32(pool->group_id); + iu->msg.hdr.type = cpu_to_le16(RMR_MSG_MAP_CLEAR); + iu->msg.hdr.__padding = 0; + + iu->msg.id_a = cpu_to_le64(req->id.a); + iu->msg.id_b = cpu_to_le64(req->id.b); + iu->msg.member_id = srv_pool->member_id; + + iu->msg.flags = cpu_to_le32(RMR_OP_WRITE); + + iu->conf = rmr_req_map_clear_done; + iu->priv = req; + + req->iu = iu; + + err = rmr_clt_send_map_update(pool, req->iu); + if (err) { + pr_err("%s error %d\n", __func__, err); + rmr_clt_put_iu(pool, req->iu); + goto err; + } + + pr_debug("send map clear submitted\n"); + return; + +err: + rmr_req_sync_complete(req); +} diff --git a/drivers/infiniband/ulp/rmr/rmr-req.h b/drivers/infiniband/ulp/rmr/rmr-req.h new file mode 100644 index 000000000000..8f15b36fe480 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-req.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_REQ_H +#define RMR_REQ_H + +#include "rmr-pool.h" + +struct rmr_srv_req { + struct rmr_srv_pool *srv_pool; + rmr_id_t id; + + u32 offset; + u32 length; + u32 flags; + u16 prio; + + u32 mem_id; + struct rtrs_srv_op *rtrs_op; + struct rmr_srv_io_store *store; + void *data; + u32 datalen; //TODO: what is the difference between lenghth? + void (*endreq)(struct rmr_srv_req *, int err); + struct work_struct work; + int err; + u8 failed_cnt; + u8 failed_srv_id[RMR_POOL_MAX_SESS]; + u64 map_ver; + void *priv; + struct llist_node node; + bool from_sync; + struct scatterlist sg; + struct rmr_iu *iu; + struct rmr_srv_req *parent; + bool sync; + struct rcu_head rcu; +}; + +struct rmr_srv_req *rmr_srv_req_create(const struct rmr_msg_io *msg, + struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, + void *data, u32 datalen, + void (*endreq)(struct rmr_srv_req *, int)); +struct rmr_srv_req *rmr_srv_md_req_create(struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, void *data, + u32 offset, u32 len, unsigned long flags, + void (*endreq)(struct rmr_srv_req *, int)); +void rmr_req_submit(struct rmr_srv_req *req); +void rmr_md_req_submit(struct rmr_srv_req *req); +void rmr_srv_req_resp(struct rmr_srv_req *req, int err); +void rmr_srv_md_req_resp(struct rmr_srv_req *req, int err); +int rmr_srv_sync_chunk_id(struct rmr_srv_pool *srv_pool, struct rmr_map_entry *entry, + rmr_id_t id, bool from_sync); + +void rmr_process_wait_list(struct rmr_map_entry *entry, int err); + +struct rmr_map_entry_info { + rmr_id_t id; + u8 srv_id; +}; +#endif /* RMR_REQ_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-srv-md.c b/drivers/infiniband/ulp/rmr/rmr-srv-md.c new file mode 100644 index 000000000000..9dab71a810b8 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-srv-md.c @@ -0,0 +1,764 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) — server metadata subsystem + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include + +#include "rmr-srv.h" +#include "rmr-req.h" +#include "rmr-clt.h" + +/** + * process_md_io() - Process medata IO message + * + * @pool: the pool where requests go through + * @rtrs_op: rtrs IO context + * @offset: offset in bytes relative to rmr metadata. 
+ * @len: length of the buffer in bytes + * @flags: indicates metadata IO options + * @buf: pointer to metadata buffer + * + * Return: + * 0 on success + * + * Description: + * All metadata IOs go through this function to submit requests to block device. The offset it + * passes on is relative to bytes shifting on rmr medata which is composed of a header + * structure for pool metadata, bitmap and last_io array. + */ +int process_md_io(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, u32 offset, u32 len, + unsigned long flags, void *buf) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_srv_req *req; + int err = 0; + + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!percpu_ref_tryget_live(&pool->ids_inflight_ref)) { + err = -EIO; + goto no_put; + } + + req = rmr_srv_md_req_create(srv_pool, rtrs_op, buf, offset, len, flags, rmr_srv_endreq); + if (IS_ERR(req)) { + pr_err("Failed to create rmr_req %pe\n", req); + err = PTR_ERR(req); + goto put_pool; + } + + rmr_md_req_submit(req); + return 0; + +put_pool: + percpu_ref_put(&pool->ids_inflight_ref); +no_put: + return err; +} + +int rmr_srv_read_md(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, u32 offset, u32 len, + struct rmr_pool_md *pool_md_page) +{ + /* pool_md is pre-allocated */ + return process_md_io(pool, rtrs_op, offset, len, RMR_OP_MD_READ, pool_md_page); +} + +static int rmr_srv_load_last_io(struct rmr_srv_pool *srv_pool) +{ + void *buf; + u64 offset, len; + struct rmr_pool *pool = srv_pool->pool; + struct rmr_pool_md *pool_md = &pool->pool_md; + int err = 0; + + if (!pool_md->queue_depth) { + pr_err("%s: pool %s has zero queue_depth\n", + __func__, pool->poolname); + return -EINVAL; + } + offset = RMR_LAST_IO_OFFSET; + len = rmr_last_io_len(pool_md->queue_depth); + + if (!srv_pool->last_io_idx) { + srv_pool->last_io_idx = kcalloc(pool_md->queue_depth, + sizeof(*srv_pool->last_io_idx), GFP_KERNEL); + if (!srv_pool->last_io_idx) + return -ENOMEM; + } + + buf = kzalloc(len, GFP_KERNEL); + if (!buf) { + err = -ENOMEM; + return err; + } + + err = rmr_srv_read_md(pool, NULL, offset, len, buf); + if (err) { + pr_err("%s: failed to read last_io buffer of len %lld at offset %lld\n", + __func__, len, offset); + goto free_buf; + } + memcpy(srv_pool->last_io_idx, (rmr_id_t *)buf, len); + +free_buf: + kfree(buf); + return err; +} + +/** + * rmr_srv_md_maps_sync - Sync dirty maps to persistent storage + * + * Description: + * Writes maps in two passes to the map-related regions of the on-disk layout: + * + * Pass 1 — hdr_region (single PAGE_SIZE write at RMR_MD_SIZE + last_io_len): + * Fills one rmr_map_cbuf_hdr slot per map_idx in [0:maps_cnt]. + * The buffer is kzalloc'd, so slots beyond maps_cnt are zero. + * The entire PAGE_SIZE region is issued as a single I/O. + * + * Pass 2 — maps_region (slp pages at computed offsets after hdr_region): + * Each map's data offset = map_region_offset + map_idx * per_map_size. + * pool->maps[0:maps_cnt] is always dense (no NULL gaps). 
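The region offsets used by the two passes follow directly from the helpers in rmr-pool.h. A rough sketch of the offset math, with queue_depth, total_slp and the function name chosen purely as examples:

/*
 * Sketch only: where map_idx's data pages start on disk, given the layout
 * described above (pool_md page | last_io array | header region | per-map
 * SLP pages).
 */
static u64 sketch_map_data_offset(u32 queue_depth, u64 total_slp, u8 map_idx)
{
	u64 hdr_region_offset = rmr_bitmap_offset(queue_depth);
	u64 per_map_size = total_slp * PAGE_SIZE;

	return hdr_region_offset + RMR_MAP_BUF_HDR_SIZE + map_idx * per_map_size;
}

For map_idx 0 this reduces to rmr_bitmap_offset(queue_depth) + RMR_MAP_BUF_HDR_SIZE, i.e. the first data page directly follows the header region.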
+ */ +void rmr_srv_md_maps_sync(struct rmr_pool *pool) +{ + struct rmr_map_cbuf_hdr *map_cbuf_hdr; + struct rmr_dirty_id_map *map = NULL; + u32 hdr_region_offset = rmr_bitmap_offset(pool->pool_md.queue_depth); + u32 map_region_offset = hdr_region_offset + RMR_MAP_BUF_HDR_SIZE; + u64 per_map_size = 0; + int err, lock_idx; + void *buf; + u8 map_idx; + + buf = kzalloc(RMR_MAP_BUF_HDR_SIZE, GFP_KERNEL); + if (!buf) + return; + + lock_idx = srcu_read_lock(&pool->map_srcu); + + /* Fill the header region: one slot per active map */ + for (map_idx = 0; map_idx < pool->maps_cnt; map_idx++) { + map = rcu_dereference(pool->maps[map_idx]); + if (WARN_ON(!map)) + goto unlock; + + map_cbuf_hdr = buf + map_idx * sizeof(struct rmr_map_cbuf_hdr); + map_cbuf_hdr->version = RMR_MAP_FORMAT_VER; + map_cbuf_hdr->member_id = map->member_id; + map_cbuf_hdr->no_of_chunks = map->no_of_chunks; + map_cbuf_hdr->no_of_flp = map->no_of_flp; + map_cbuf_hdr->no_of_slp_in_last_flp = map->no_of_slp_in_last_flp; + map_cbuf_hdr->no_of_chunk_in_last_slp = map->no_of_chunk_in_last_slp; + map_cbuf_hdr->total_slp = map->total_slp; + per_map_size = map->total_slp * PAGE_SIZE; + } + + /* Write the entire header region as a single PAGE_SIZE I/O */ + err = process_md_io(pool, NULL, hdr_region_offset, + PAGE_SIZE, RMR_OP_MD_WRITE, buf); + if (err) { + pr_warn("%s: failed to write header region at 0x%x: %d\n", + __func__, hdr_region_offset, err); + goto unlock; + } + + if (WARN_ON(!per_map_size)) + goto unlock; + + /* Write each map's slp pages */ + for (map_idx = 0; map_idx < pool->maps_cnt; map_idx++) { + u32 map_data_offset; + el_flp *flp_ptr; + u64 no_of_slps; + void *slp; + int i, j; + + map = rcu_dereference(pool->maps[map_idx]); + if (WARN_ON(!map)) + break; + + map_data_offset = map_region_offset + map_idx * per_map_size; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (void *)(*flp_ptr); + + err = process_md_io(pool, NULL, map_data_offset, + PAGE_SIZE, RMR_OP_MD_WRITE, slp); + if (err) + pr_warn("%s: failed to write map slp at 0x%x: %d\n", + __func__, map_data_offset, err); + map_data_offset += PAGE_SIZE; + } + } + } + +unlock: + srcu_read_unlock(&pool->map_srcu, lock_idx); + kfree(buf); +} + +/** + * rmr_srv_refresh_md_maps - Restore maps from map buffers on disk + * + * Description: + * Reads back the maps written by rmr_srv_md_maps_sync(). Reads the hdr_region + * in a single I/O to obtain the per-map headers, then loads each present + * map's slp pages from maps_region: + * data offset = map_region_offset + map_idx * per_map_size + * Header slots 0..N-1 are active; remaining are zero (member_id == 0). 
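+ *
+ * Per-header-slot decision, sketched (this mirrors the checks below, it is
+ * not additional behaviour):
+ *
+ *	no in-memory map for hdr->member_id  -> create one, unpack it from disk
+ *	in-memory map exists but is empty    -> unpack it from disk
+ *	in-memory map exists and has entries -> keep it, skip the on-disk copy
+ *
+ * In every case the member is recorded in pool->stg_members.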
+ */ +static int rmr_srv_refresh_md_maps(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_map_cbuf_hdr *map_cbuf_hdr; + struct rmr_dirty_id_map *map = NULL; + u32 hdr_region_offset = rmr_bitmap_offset(pool->pool_md.queue_depth); + u32 map_region_offset = hdr_region_offset + RMR_MAP_BUF_HDR_SIZE; + int err = 0, lock_idx; + void *buf; + u8 map_idx, valid_nr = 0; + bool unpack; + + buf = kzalloc(RMR_MAP_BUF_HDR_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Read the entire header region in a single PAGE_SIZE I/O */ + err = rmr_srv_read_md(pool, NULL, hdr_region_offset, RMR_MAP_BUF_HDR_SIZE, buf); + if (err) { + pr_err("%s: failed to read header region at offset %u\n", + __func__, hdr_region_offset); + kfree(buf); + return err; + } + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (map_idx = 0; map_idx < RMR_POOL_MAX_SESS; map_idx++) { + u64 per_map_size; + u32 map_data_offset; + el_flp *flp_ptr; + u64 no_of_slps; + void *slp; + int i, j; + + map_cbuf_hdr = buf + map_idx * sizeof(struct rmr_map_cbuf_hdr); + pr_debug("%s: %llu %u %llu %llu %llu %llu %llu\n", __func__, + map_cbuf_hdr->version, + map_cbuf_hdr->member_id, + map_cbuf_hdr->no_of_chunks, + map_cbuf_hdr->no_of_flp, + map_cbuf_hdr->no_of_slp_in_last_flp, + map_cbuf_hdr->no_of_chunk_in_last_slp, + map_cbuf_hdr->total_slp); + + /* Empty slot: no more active maps beyond this point */ + if (!map_cbuf_hdr->member_id) + break; + valid_nr++; + + per_map_size = map_cbuf_hdr->total_slp * PAGE_SIZE; + map_data_offset = map_region_offset + map_idx * per_map_size; + + unpack = false; + /* + * The dirty map should be updated only when the one on disk is more updated. + * Such cases are as follows. + * 1) The dirty map does not exist in the pool. The map will be simply restored to + * the last version we have. + * 2) The dirty map of the pool is just created. If it has been updated, the one on + * disk is outdated. + */ + map = rmr_pool_find_map(pool, map_cbuf_hdr->member_id); + if (!map) { + map = rmr_map_create(pool, map_cbuf_hdr->member_id); + if (IS_ERR(map)) { + err = PTR_ERR(map); + pr_err("%s: pool %s, member_id %d failed to create map\n", + __func__, pool->poolname, map_cbuf_hdr->member_id); + goto unlock; + } + unpack = true; + } else if (rmr_map_empty(map)) { + unpack = true; + } + + if (map->no_of_chunks != map_cbuf_hdr->no_of_chunks || + map->no_of_flp != map_cbuf_hdr->no_of_flp || + map->no_of_slp_in_last_flp != map_cbuf_hdr->no_of_slp_in_last_flp || + map->no_of_chunk_in_last_slp != map_cbuf_hdr->no_of_chunk_in_last_slp || + map->total_slp != map_cbuf_hdr->total_slp) { + pr_err("%s: Sanity check failed\n", __func__); + goto unlock; + } + + xa_store(&pool->stg_members, map_cbuf_hdr->member_id, XA_TRUE, GFP_KERNEL); + + if (!unpack) + continue; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (void *)(*flp_ptr); + + err = rmr_srv_read_md(pool, NULL, map_data_offset, + PAGE_SIZE, slp); + if (err) { + pr_err("%s: failed to read bitmap at offset %u\n", + __func__, map_data_offset); + goto unlock; + } + map_data_offset += PAGE_SIZE; + } + } + } + +unlock: + if (!valid_nr) + pr_err("%s: no valid map found in metadata\n", __func__); + + /* + * TODO: We need better error handling logic here. + * Lets suppose after successfully reading few pages for a map, we fail to read next page. 
+ * We then error out and fail the registration, but leave the partially updated map in the pool. + * Later, when another registration arrives and we come here to read the maps, we will + * see a non-empty map and skip reading it from disk. + */ + srcu_read_unlock(&pool->map_srcu, lock_idx); + kfree(buf); + return err; +} +
+/** + * rmr_srv_md_update() - update the metadata of the server pool + * + * Description: + * Copy the current in-memory pool state into the srv_md entry of this pool. + */ +static int rmr_srv_md_update(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool; + struct rmr_srv_md *my_srv_md; + int md_i; + + pool = srv_pool->pool; + md_i = rmr_pool_find_md(&pool->pool_md, srv_pool->member_id, true); + if (md_i < 0) { + pr_warn("No space for new member %d.\n", srv_pool->member_id); + return -EINVAL; + } + my_srv_md = &pool->pool_md.srv_md[md_i]; + my_srv_md->member_id = srv_pool->member_id; + my_srv_md->store_state = atomic_read(&srv_pool->store_state); + my_srv_md->map_ver = srv_pool->pool->map_ver; + my_srv_md->srv_pool_state = atomic_read(&srv_pool->state); + pr_debug("Set srv_md[%d] with member_id %d.\n", md_i, srv_pool->member_id); + return 0; +} +
+/** + * rmr_srv_flush_pool_md() - Write pool_md region to disk immediately + * + * @srv_pool: Server pool whose pool_md is to be flushed + * + * Description: + * Persist pool_md without waiting for the delayed work. + */ +void rmr_srv_flush_pool_md(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + void *buf; + int err; + + if (!atomic_read(&srv_pool->store_state) || !pool->mapped_size) + return; + + err = rmr_srv_md_update(srv_pool); + if (err) { + pr_warn("%s: failed to update pool_md before flush: 0x%x\n", __func__, err); + return; + } + + buf = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + if (!buf) + return; + + memcpy(buf, &pool->pool_md, sizeof(struct rmr_pool_md)); + err = process_md_io(pool, NULL, 0, RMR_MD_SIZE, RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: failed to flush pool_md: 0x%x at offset 0 len %lu\n", + __func__, err, RMR_MD_SIZE); + kfree(buf); +} +
+/** + * rmr_srv_flush_last_io() - Write last_io region to disk + * + * @srv_pool: Server pool whose last_io is to be flushed + */ +static void rmr_srv_flush_last_io(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + u64 last_io_len = rmr_last_io_len(pool->pool_md.queue_depth); + void *buf; + int err; + + if (!last_io_len || !srv_pool->last_io) + return; + + buf = kzalloc(last_io_len, GFP_KERNEL); + if (!buf) + return; + + memcpy(srv_pool->last_io_idx, srv_pool->last_io, last_io_len); + memcpy(buf, srv_pool->last_io_idx, last_io_len); + + err = process_md_io(pool, NULL, RMR_MD_SIZE, last_io_len, + RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: failed to flush last_io: 0x%x at offset %lu len %llu\n", + __func__, err, RMR_MD_SIZE, last_io_len); + kfree(buf); +} +
+/** + * rmr_srv_md_load_buf() - Load the server metadata from buffer to the server pool. + * + * Description: + * This function loads the server-side metadata from buffer to the pool. The buffer must be + * in the format of the rmr pool metadata structure, which may contain updated srv_md of + * multiple servers.
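+ *
+ * Assumed buffer layout (derived from the pointer arithmetic below):
+ *
+ *	0                RMR_CLT_MD_SIZE
+ *	+----------------+-----------+-----------+----
+ *	| client-side md | srv_md[0] | srv_md[1] | ... up to RMR_POOL_MAX_SESS
+ *	+----------------+-----------+-----------+----
+ *
+ * Entries whose member_id is zero, or equal to this server's own member_id,
+ * are skipped.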
+ */ +static int rmr_srv_md_load_buf(struct rmr_pool *pool, void *buf) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_md *srv_md_buf; + u8 member_id = 0; + int err = 0, index, i; + bool ret = false; + + buf += (RMR_CLT_MD_SIZE - sizeof(struct rmr_srv_md)); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + buf += sizeof(struct rmr_srv_md); + srv_md_buf = (struct rmr_srv_md *)buf; + member_id = srv_md_buf->member_id; + /* skip updating the srv_md of this server pool */ + if (!member_id || member_id == srv_pool->member_id) + continue; + + index = rmr_pool_find_md(&pool->pool_md, member_id, true); + if (index < 0) { + pr_debug("%s: No space in the pool_md for new member %d\n", + __func__, member_id); + err = -EINVAL; + continue; + } + + pr_debug("Load srv_md[%d] with member_id %d\n", index, member_id); + memcpy(&pool->pool_md.srv_md[index], srv_md_buf, sizeof(struct rmr_srv_md)); + ret = true; + } + + if (!ret) { + pr_debug("No server metadata found in the buffer\n"); + err = -EINVAL; + } + + return err; +} + +/** + * rmr_srv_md_process_buf() - Load the metadata from buffer to the server pool. + * + * Description: + * This node loads the metadata from buffer to the server pool. + */ +int rmr_srv_md_process_buf(struct rmr_pool *pool, void *buf, bool sync) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_pool_md *buf_pool_md, *dest_md = &pool->pool_md; + int err = 0; + + srv_pool = (struct rmr_srv_pool *)pool->priv; + buf_pool_md = (struct rmr_pool_md *)buf; + if (!sync) { + /* Copy only the client-side header. */ + memcpy(dest_md, buf_pool_md, RMR_CLT_MD_SIZE); + } else { + err = rmr_srv_md_load_buf(pool, buf); + if (err) + pr_err("Failed to load md buf to pool %s\n", pool->poolname); + } + + return err; +} + +int rmr_srv_send_md_update(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_pool *sync_pool = srv_pool->clt; + struct rmr_msg_pool_cmd msg = {}; + int err = 0, buflen; + void *buf; + + /* Only normal-state server pools should send metadata updates. */ + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) + return -EINVAL; + + /* For a stg node A, is A->B alive? */ + if (!sync_pool) { + pr_debug("pool %s has no sync pool assigned. Cannot send md update commands.\n", + pool->poolname); + return -ENXIO; + } + + buf = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + buflen = RMR_MD_SIZE; + + rmr_clt_init_cmd(sync_pool, &msg); + msg.cmd_type = RMR_CMD_MD_SEND; + /* This node sends messages to start md_update. */ + msg.md_send_cmd.leader_id = srv_pool->member_id; + msg.md_send_cmd.src_mapped_size = pool->mapped_size; + + err = rmr_clt_send_cmd_with_data_all(sync_pool, &msg, buf, buflen); + if (err < 0) { + pr_debug("pool %s sends all sess RMR_CMD_MD_SEND failed\n", pool->poolname); + goto free_buf; + } + + /* + * keep the original slice of buffer if the corresponding send req failed. + * + * TODO: + * We need to use the err received from rmr_clt_send_cmd_with_data_all in this function, + * and match the sessions we are skipping. + * + * In general, the sessions_skipped == (RMR_POOL_MAX_SESS - (number_of_legs - 1 - err). + * If the above number does not match, then we abandon the buffers, and try again. + */ + err = rmr_srv_md_load_buf(pool, buf); + if (err) { + pr_debug("Failed to load md buf to pool %s\n", pool->poolname); + goto free_buf; + } + +free_buf: + kfree(buf); + return err; +} + +/** + * rmr_srv_refresh_md() - Refresh the metadata of the rmr pool. 
+ * + * @srv_pool: Server pool whose metadata is to be refreshed + * + * Description: + * Read the metadata of the rmr pool from the backing store. + * + * Return: + * 0 when the metadata was read and imported successfully. + * A negative error code otherwise, including when no valid metadata or no + * srv_md entry for this member is found on the store. + */ +int rmr_srv_refresh_md(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool_md *pool_md_page; + struct rmr_pool *pool = srv_pool->pool; + int index, ret; + u64 md_ver; + + pool_md_page = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + if (!pool_md_page) + return -ENOMEM; + + ret = rmr_srv_read_md(pool, NULL, 0, RMR_MD_SIZE, pool_md_page); + if (ret) { + pr_err("%s: failed reading md of rmr\n", __func__); + goto free_md; + } + + pr_info("%s: Read md of pool %s from store with magic 0x%llx\n", + __func__, pool_md_page->poolname, pool_md_page->magic); + + if (pool_md_page->magic != RMR_POOL_MD_MAGIC) { + pr_info("%s: No valid md found on the store for pool %s\n", + __func__, pool->poolname); + ret = -EINVAL; + goto free_md; + } + + /* + * TODO: Should we sanity check other params also? + */ + if (pool_md_page->chunk_size != pool->chunk_size) { + pr_err("%s: chunk size mismatch. pool chunk size %u, md chunk size %u\n", + __func__, pool->chunk_size, pool_md_page->chunk_size); + ret = -EINVAL; + goto free_md; + } + + /* Import the metadata to the states of the pool. */ + index = rmr_pool_find_md(pool_md_page, srv_pool->member_id, false); + if (index < 0) { + pr_info("%s: No md found for member_id %d\n", __func__, srv_pool->member_id); + ret = index; + goto free_md; + } + + if (pool_md_page->srv_md[index].mapped_size != pool->mapped_size) { + pr_err("%s: Mapped size mismatch. The srv pool %llu, md %llu\n", + __func__, pool->mapped_size, pool_md_page->srv_md[index].mapped_size); + ret = -EINVAL; + goto free_md; + } + + md_ver = pool_md_page->srv_md[index].map_ver; + if (md_ver < pool->map_ver) + pr_err("The current map ver is %lld but the map ver on md is %lld.\n", + pool->map_ver, md_ver); + else + pool->map_ver = md_ver; + + pool->pool_md = *pool_md_page; + + ret = rmr_srv_load_last_io(srv_pool); + if (ret) { + pr_err("%s: failed to load last_io array to memory with err 0x%x\n", + __func__, ret); + goto zero_md; + } + + pr_info("%s: no_of_chunks %lld\n", __func__, pool->no_of_chunks); + ret = rmr_srv_refresh_md_maps(srv_pool); + if (ret) { + pr_err("%s: failed to load dirty bitmap to memory with err %pe\n", + __func__, ERR_PTR(ret)); + goto free_last_io; + } + goto free_md; +
+free_last_io: + kfree(srv_pool->last_io_idx); + srv_pool->last_io_idx = NULL; +zero_md: + memset(&pool->pool_md, 0, sizeof(pool->pool_md)); +free_md: + kfree(pool_md_page); + return ret; +} +
+/** + * rmr_srv_mark_maps_dirty() - Set MD_DIRTY_MAPS and schedule delayed sync + * + * @srv_pool: Server pool with changed maps + */ +void rmr_srv_mark_maps_dirty(struct rmr_srv_pool *srv_pool) +{ + set_bit(MD_DIRTY_MAPS, &srv_pool->md_dirty); + mod_delayed_work(srv_pool->md_sync_wq, &srv_pool->md_sync_dwork, + msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS)); +} +
+/** + * rmr_srv_md_sync - sync dirty metadata regions of pool + * + * Description: + * Dirty-driven consumer: only flushes regions whose dirty bit is set. + * Producers set bits and schedule this work via mod_delayed_work(). + * Does NOT re-queue itself — the next dirty event will schedule it.
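+ *
+ * Typical producer pattern (this is what rmr_srv_mark_maps_dirty() above
+ * does; shown here only as an example):
+ *
+ *	set_bit(MD_DIRTY_MAPS, &srv_pool->md_dirty);
+ *	mod_delayed_work(srv_pool->md_sync_wq, &srv_pool->md_sync_dwork,
+ *			 msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS));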
+ */ +void rmr_srv_md_sync(struct work_struct *work) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_pool *pool; + bool ret, did_work = false; + + srv_pool = container_of(to_delayed_work(work), struct rmr_srv_pool, md_sync_dwork); + if (!srv_pool->pool) + return; + + /* + * It could happen that access the pool while the pool is not there. Use reference counting + * for server pool to avoid the issue. + */ + ret = rmr_get_srv_pool(srv_pool); + if (!ret) { + pr_err("%s: pool is not there\n", __func__); + return; + } + + pool = srv_pool->pool; + + /* + * Update srv_md snapshot and notify peers whenever any region is dirty. + */ + if (!rmr_srv_md_update(srv_pool) && rmr_srv_send_md_update(pool)) + pr_debug("failed to send md update\n"); + + /* + * The io store is ready after the store is registered and the pool metadata is + * updated, if any. + */ + if (!atomic_read(&srv_pool->store_state) || !pool->mapped_size) + goto put_pool; + + /* + * On-disk layout of rmr pool metadata: + * + * 0 RMR_MD_SIZE +last_io_len +PAGE_SIZE + * +-----------+-------------+---------------+--------------------+ + * | pool_md | last_io | hdr_region | maps_region ... | + * +-----------+-------------+---------------+--------------------+ + * <-RMR_MD_SIZE><-last_io_len><--PAGE_SIZE--><-per_map slp pages-> + * + * pool->maps[0:maps_cnt] is always dense (no NULL gaps). + * + * This I/O covers pool_md + last_io. hdr_region and maps_region are + * written separately by rmr_srv_md_maps_sync(). + */ + if (test_and_clear_bit(MD_DIRTY_POOL, &srv_pool->md_dirty)) { + rmr_srv_flush_pool_md(srv_pool); + did_work = true; + } + + if (test_and_clear_bit(MD_DIRTY_LAST_IO, &srv_pool->md_dirty)) { + rmr_srv_flush_last_io(srv_pool); + did_work = true; + } + + if (test_and_clear_bit(MD_DIRTY_MAPS, &srv_pool->md_dirty)) { + rmr_srv_md_maps_sync(pool); + did_work = true; + } + + if (did_work) + pr_debug("%s: flushed dirty regions for server pool %u of %s\n", + __func__, srv_pool->member_id, pool->poolname); + +put_pool: + rmr_put_srv_pool(srv_pool); + /* Do NOT re-queue. Producers schedule us via mod_delayed_work. 
*/ +} diff --git a/drivers/infiniband/ulp/rmr/rmr-srv-sysfs.c b/drivers/infiniband/ulp/rmr/rmr-srv-sysfs.c new file mode 100644 index 000000000000..2aa1e07235b8 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-srv-sysfs.c @@ -0,0 +1,1047 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include + +#include "rmr-srv.h" +#include "rmr-map.h" +#include "rmr-clt.h" + +#define MAX_POOL_ID 255 + +static struct class *rmr_dev_class; +static struct device *rmr_ctl_dev; +static struct device *rmr_pool_dev; + +static struct kobj_type rmr_srv_sess_ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +int rmr_srv_sysfs_add_sess(struct rmr_pool *pool, + struct rmr_srv_pool_sess *pool_sess) +{ + int ret; + + ret = kobject_init_and_add(&pool_sess->kobj, &rmr_srv_sess_ktype, + &pool->sessions_kobj, "%s", + pool_sess->sessname); + if (ret) + pr_err("Failed to add session %s into sysfs\n", + pool_sess->sessname); + + return ret; +} + +void rmr_srv_sysfs_del_sess(struct rmr_srv_pool_sess *pool_sess) +{ + kobject_del(&pool_sess->kobj); + kobject_put(&pool_sess->kobj); +} + +static ssize_t rmr_srv_member_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + return sprintf(page, "%d\n", srv_pool->member_id); +} + +static struct kobj_attribute rmr_srv_member_id_attr = + __ATTR(member_id, 0444, rmr_srv_member_id_show, NULL); + +static ssize_t rmr_srv_pool_blksize_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + /* TODO: introduce blksize for pool */ + return sprintf(page, "128k\n"); +} + +static struct kobj_attribute rmr_srv_pool_blksize_attr = + __ATTR(blksize, 0444, rmr_srv_pool_blksize_show, NULL); + +static ssize_t rmr_srv_leave_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +void rmr_srv_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self) +{ + if (pool->kobj.state_in_sysfs) { + WARN_ON(!list_empty(&pool->sess_list)); + kobject_del(&pool->sessions_kobj); + kobject_put(&pool->sessions_kobj); + if (sysfs_self) + sysfs_remove_file_self(&pool->kobj, sysfs_self); + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + } +} + +static ssize_t rmr_srv_leave_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (READ_ONCE(srv_pool->io_store)) { + pr_err("pool %s has a store registered\n", pool->poolname); + return -EINVAL; + } + + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_EMPTY) { + pr_err("pool %s cannot leave: not in EMPTY state (state=%d)\n", + pool->poolname, atomic_read(&srv_pool->state)); + return -EINVAL; + } + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, buf); + return -EINVAL; + } + + if (srv_pool->clt) { + int err; + + err = rmr_srv_remove_clt_pool(srv_pool); + if (err) { + pr_err("pool %s failed to remove clt_pool\n", pool->poolname); + return -EINVAL; + 
} + } + pr_info("srv: Deleting pool '%s'\n", pool->poolname); + + rmr_srv_destroy_pool(pool); + rmr_srv_destroy_pool_sysfs_files(pool, &attr->attr); + rmr_put_srv_pool(srv_pool); + + return count; +} + +static struct kobj_attribute rmr_srv_leave_pool_attr = + __ATTR(leave_pool, 0644, rmr_srv_leave_pool_show, + rmr_srv_leave_pool_store); + +static ssize_t rmr_srv_pool_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool = container_of(kobj, struct rmr_pool, kobj); + struct rmr_dirty_id_map *map; + int i, lock_idx; + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + rmr_map_dump_bitmap(map); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return 0; +} + +static ssize_t rmr_srv_pool_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + rmr_id_t id = { 0, 0 }; + int srv_id; + struct rmr_dirty_id_map *map; + + pool = container_of(kobj, struct rmr_pool, kobj); + if (sscanf(buf, "%llu %llu %d\n", &id.a, &id.b, &srv_id) != 3) { + pr_err("cannot parse id.a %s\n", buf); + return -EINVAL; + } + pr_debug("Add id (%llu, %llu), srv_id %d\n", id.a, id.b, srv_id); + + /* + * If given chunk number exceeds total chunks for us, ignore! + */ + if (id.b > pool->no_of_chunks) + return count; + + map = rmr_pool_find_map(pool, srv_id); + if (!map) { + pr_err("in pool %s cannot find map for srv_id %u\n", + pool->poolname, srv_id); + return -EINVAL; + } + + rmr_map_set_dirty(map, id, 0); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + pr_debug("insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + + return count; +} + +static struct kobj_attribute rmr_srv_pool_map_attr = + __ATTR(map, 0644, rmr_srv_pool_map_show, + rmr_srv_pool_map_store); + +static ssize_t rmr_srv_pool_map_ver_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + ssize_t written; + + pool = container_of(kobj, struct rmr_pool, kobj); + + written = scnprintf(page, PAGE_SIZE, "Map ver: %llu\n", pool->map_ver); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_map_ver_attr = + __ATTR(map_version, 0444, rmr_srv_pool_map_ver_show, NULL); + +static ssize_t rmr_srv_pool_last_io_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + ssize_t written = 0; + int i; + rmr_id_t *id; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + for (i = 0; i < srv_pool->queue_depth; i++) { + id = &srv_pool->last_io[i]; + + if (id->a == U64_MAX && id->b == U64_MAX) + continue; + + written += scnprintf(page + written, PAGE_SIZE - written, + "[%d]=(%llu,%llu) ", i, id->a, id->b); + } + if (written == 0) + written += scnprintf(page + written, PAGE_SIZE - written, + "(empty)"); + written += scnprintf(page + written, PAGE_SIZE - written, "\n"); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_last_io_attr = + __ATTR(last_io, 0644, rmr_srv_pool_last_io_show, NULL); + +static ssize_t rmr_srv_add_clt_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo poolname > %s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_add_clt_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct 
rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + struct rmr_pool *clt = NULL; + char name[NAME_MAX]; + int err; + struct rmr_attrs attrs; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (sscanf(buf, "%s", name) != 1) { + pr_err("cannot parse %s\n", buf); + return -EINVAL; + } + + clt = rmr_clt_open(NULL, NULL, name); + if (IS_ERR_OR_NULL(clt)) { + pr_err("cannot open pool %s err %ld\n", name, PTR_ERR(clt)); + return -EEXIST; + } + + pr_info("%s: Adding client pool %s, to server pool %s\n", + __func__, pool->poolname, clt->poolname); + + err = rmr_clt_query(clt, &attrs); + if (unlikely(err)) + goto close_rmr; + + if (!attrs.sync) { + pr_err("%s: Add clt called for non-sync rmr client pool %s\n", __func__, name); + err = -EINVAL; + goto close_rmr; + } + + srv_pool->max_sync_io_size = attrs.max_io_size; + + /* The sync client holds a pointer to its parent server pool. */ + srv_pool->clt = clt; + + /* Re-trigger md sync now that the sync path is available. */ + rmr_srv_mark_pool_md_dirty(srv_pool); + + /* + * Check if the device paramters of connected servers share the same values. + */ + err = rmr_srv_check_params(srv_pool); + if (err) + goto close_clt; + + return count; + +close_clt: + srv_pool->clt = NULL; + srv_pool->max_sync_io_size = 0; +close_rmr: + pr_err("%s: Adding client pool failed\n", __func__); + rmr_clt_close(clt); + return err; +} + +static struct kobj_attribute rmr_srv_add_clt_pool_attr = + __ATTR(add_clt, 0644, rmr_srv_add_clt_pool_show, + rmr_srv_add_clt_pool_store); + +static ssize_t rmr_srv_pool_sync_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + return scnprintf(page, PAGE_SIZE, "Usage: echo \"start|stop\" > /%s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_pool_sync_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int err = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!strncasecmp(buf, "start", 5)) { + /* + * Start + */ + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + pr_info("For pool %s, sync thread already running\n", pool->poolname); + goto out; + } + + mutex_lock(&srv_pool->srv_pool_lock); + + if (!atomic_read(&srv_pool->store_state) && + atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_err("Pool %s not in working state. 
Sync thread start failed\n", + pool->poolname); + err = -EINVAL; + goto unlock_mutex; + } + + err = rmr_srv_sync_thread_start(srv_pool); + if (err) { + pr_err("For pool %s, rmr_srv_sync_thread_start Error %d\n", + pool->poolname, err); + goto unlock_mutex; + } + + mutex_unlock(&srv_pool->srv_pool_lock); + + } else if (!strncasecmp(buf, "stop", 4)) { + /* + * Stop + */ + if (atomic_read(&srv_pool->thread_state) == SYNC_THREAD_STOPPED) { + pr_info("For pool %s, sync thread already stopped\n", pool->poolname); + goto out; + } + + err = rmr_srv_sync_thread_stop(srv_pool); + if (err) { + pr_err("For pool %s, rmr_srv_sync_thread_stop Error %d\n", + pool->poolname, err); + goto err; + } + } else { + pr_err("Unknown value\n"); + err = -EINVAL; + goto err; + } + +out: + return count; + +unlock_mutex: + mutex_unlock(&srv_pool->srv_pool_lock); +err: + return err; +} + +static struct kobj_attribute rmr_srv_pool_sync_attr = + __ATTR(sync, 0644, rmr_srv_pool_sync_show, + rmr_srv_pool_sync_store); + +static ssize_t sync_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int state; + ssize_t written = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + state = atomic_read(&srv_pool->thread_state); + switch (state) { + case SYNC_THREAD_RUNNING: + written = sysfs_emit(page, "Running\n"); + break; + case SYNC_THREAD_STOPPED: + written = sysfs_emit(page, "Stopped\n"); + break; + case SYNC_THREAD_REQ_STOP: + written = sysfs_emit(page, "Request_to_stop\n"); + break; + case SYNC_THREAD_WAIT: + written = sysfs_emit(page, "Wait\n"); + break; + default: + written = sysfs_emit(page, "Unknown value %d\n", state); + break; + } + + return written; +} + +static struct kobj_attribute rmr_srv_pool_sync_state_attr = + __ATTR_RO(sync_state); + +static ssize_t rmr_srv_pool_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int state; + ssize_t written = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + state = atomic_read(&srv_pool->state); + + switch (state) { + case RMR_SRV_POOL_STATE_EMPTY: + written = sysfs_emit(page, "empty\n"); + + break; + case RMR_SRV_POOL_STATE_REGISTERED: + written = sysfs_emit(page, "registered\n"); + + break; + case RMR_SRV_POOL_STATE_CREATED: + written = sysfs_emit(page, "created\n"); + + break; + case RMR_SRV_POOL_STATE_NORMAL: + written = sysfs_emit(page, "normal\n"); + + break; + case RMR_SRV_POOL_STATE_NO_IO: + written = sysfs_emit(page, "no_io\n"); + + break; + default: + written = sysfs_emit(page, "Unknown value %d\n", state); + + break; + } + + written += sysfs_emit_at(page, written, "Maintenance mode: %d\n", + srv_pool->maintenance_mode); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_state_attr = + __ATTR(state, 0644, rmr_srv_pool_state_show, NULL); + +static ssize_t rmr_srv_remove_clt_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_remove_clt_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int err; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!sysfs_streq(buf, "1")) 
{ + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, buf); + return -EINVAL; + } + err = rmr_srv_remove_clt_pool(srv_pool); + if (err) { + pr_err("pool %s failed to remove clt_pool\n", pool->poolname); + return -EINVAL; + } + + return count; +} + +static struct kobj_attribute rmr_srv_remove_clt_pool_attr = + __ATTR(remove_clt, 0644, rmr_srv_remove_clt_pool_show, + rmr_srv_remove_clt_pool_store); + +static ssize_t rmr_srv_pool_test_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_pool_test_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int err; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, buf); + return -EINVAL; + } + + if (!srv_pool->clt) { + pr_err("pool %s no clt pool assigned to this rmr pool. cannot do map test.\n", + pool->poolname); + return -EINVAL; + } + + pr_info("pool %s start test map...\n", pool->poolname); + err = rmr_clt_test_map(pool, srv_pool->clt); + if (err) { + pr_err("pool %s, test map failed, err %d\n", + pool->poolname, err); + return err; + } + pr_info("pool %s test map done.", pool->poolname); + + return count; +} + +static struct kobj_attribute rmr_srv_pool_test_map_attr = + __ATTR(test_map, 0644, rmr_srv_pool_test_map_show, + rmr_srv_pool_test_map_store); + +static ssize_t rmr_srv_pool_metadata_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_pool_md *pool_md; + struct rmr_srv_md *srv_md; + int i; + ssize_t written = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + pool_md = &pool->pool_md; + + written += sysfs_emit_at(page, written, + "The metadata of %s is: group_id %u, chunk_size %u, " + "mapped_size %llu, queue_depth %u, " + "bitmap_offset %llu, bitmap_len %llu, " + "last_io_offset %llu, last_io_len %llu\n\n", + pool_md->poolname, pool_md->group_id, pool_md->chunk_size, + pool_md->mapped_size, pool_md->queue_depth, + rmr_bitmap_offset(pool_md->queue_depth), + rmr_bitmap_len(pool->no_of_chunks), + (u64)RMR_LAST_IO_OFFSET, + rmr_last_io_len(pool_md->queue_depth)); + written += sysfs_emit_at(page, written, + "The client pool: map_ver %llu\n\n", pool_md->map_ver); + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + srv_md = &pool_md->srv_md[i]; + if (!srv_md->member_id) + continue; + + written += sysfs_emit_at(page, written, "The server pool with member_id %u: " + "mapped_size %llu, store_state %u, " + "pool_state %u, map_update_state %u, " + "map_ver %llu, discard_entries %x.\n\n", + srv_md->member_id, srv_md->mapped_size, + srv_md->store_state, + srv_md->srv_pool_state, + srv_md->map_update_state, srv_md->map_ver, + srv_md->discard_entries); + } + + return written; +} + +static struct kobj_attribute rmr_srv_pool_metadata_attr = + __ATTR(metadata, 0444, rmr_srv_pool_metadata_show, NULL); + +static const char *map_update_state_str(enum srv_map_update_state state) +{ + switch (state) { + case MAP_UPDATE_STATE_DISABLED: + return "disabled"; + case MAP_UPDATE_STATE_READY: + return "ready"; + case MAP_UPDATE_STATE_DONE: + return "done"; + } + return "unknown"; +} + +static ssize_t rmr_srv_pool_map_update_state_show(struct kobject *kobj, + struct kobj_attribute 
*attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + return sysfs_emit(page, "%s\n", map_update_state_str(srv_pool->map_update_state)); +} + +static struct kobj_attribute rmr_srv_pool_map_update_state_attr = + __ATTR(map_update_state, 0644, rmr_srv_pool_map_update_state_show, NULL); + +static ssize_t rmr_srv_pool_map_unsynced_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + ssize_t written = 0; + struct rmr_pool *pool; + struct rmr_dirty_id_map *map; + rmr_id_t id; + int i, j, lock_idx; + + pool = container_of(kobj, struct rmr_pool, kobj); + + id.a = 1; + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; (i < RMR_POOL_MAX_SESS && written < PAGE_SIZE); i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + written += sysfs_emit_at(page, written, "member_id : %d\n", map->member_id); + for (j = 0; j < map->no_of_chunks; j++) { + size_t len; + + id.b = j; + if (rmr_map_check_dirty(map, id) && + (map->bitmap_filter[id.b] & MAP_ENTRY_UNSYNCED)) { + len = sysfs_emit_at(page, written, "(%llu, %llu) ", + id.a, id.b); + if (!len) // break early if map is too big + break; + written += len; + } + } + written += sysfs_emit_at(page, written, "\n"); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return written; +} + +static ssize_t rmr_srv_pool_map_unsynced_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + rmr_id_t id = { 0, 0 }; + int srv_id; + struct rmr_dirty_id_map *map; + + pool = container_of(kobj, struct rmr_pool, kobj); + if (sscanf(buf, "%llu %llu %d\n", &id.a, &id.b, &srv_id) != 3) { + pr_err("cannot parse id.a %s\n", buf); + return -EINVAL; + } + pr_debug("add id (%llu, %llu), srv_id %d\n", id.a, id.b, srv_id); + + map = rmr_pool_find_map(pool, srv_id); + if (!map) { + pr_err("in pool %s cannot find map for srv_id %u\n", + pool->poolname, srv_id); + return -EINVAL; + } + + rmr_map_set_dirty(map, id, MAP_ENTRY_UNSYNCED); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + pr_debug("insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + + return count; +} +static struct kobj_attribute rmr_srv_pool_map_unsynced_attr = + __ATTR(map_unsynced, 0644, rmr_srv_pool_map_unsynced_show, + rmr_srv_pool_map_unsynced_store); + +static ssize_t map_summary_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + int lock_idx; + int written; + + pool = container_of(kobj, struct rmr_pool, kobj); + + lock_idx = srcu_read_lock(&pool->map_srcu); + written = rmr_map_summary_format(pool, page, PAGE_SIZE); + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_map_summary_attr = + __ATTR_RO(map_summary); + +static struct attribute *rmr_srv_pool_attrs[] = { + &rmr_srv_leave_pool_attr.attr, + &rmr_srv_member_id_attr.attr, + &rmr_srv_pool_blksize_attr.attr, + &rmr_srv_pool_map_attr.attr, + &rmr_srv_pool_map_ver_attr.attr, + &rmr_srv_pool_last_io_attr.attr, + &rmr_srv_add_clt_pool_attr.attr, + &rmr_srv_pool_sync_attr.attr, + &rmr_srv_pool_sync_state_attr.attr, + &rmr_srv_pool_state_attr.attr, + &rmr_srv_remove_clt_pool_attr.attr, + &rmr_srv_pool_test_map_attr.attr, + &rmr_srv_pool_metadata_attr.attr, + &rmr_srv_pool_map_update_state_attr.attr, + &rmr_srv_pool_map_unsynced_attr.attr, + &rmr_srv_pool_map_summary_attr.attr, + NULL, +}; 
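+
+/*
+ * With the "rmr-server" class and the "pools"/"ctl" devices created in
+ * rmr_srv_create_sysfs_files() below, the attributes above are expected to
+ * appear roughly as follows (exact paths depend on the sysfs class layout):
+ *
+ *	/sys/class/rmr-server/pools/<poolname>/{member_id,state,map,last_io,...}
+ *	/sys/class/rmr-server/pools/<poolname>/sessions/<sessname>
+ *	/sys/class/rmr-server/ctl/join_pool
+ */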
+ATTRIBUTE_GROUPS(rmr_srv_pool); + +static struct kobj_type rmr_srv_pool_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_srv_pool_groups, +}; + +static struct kobj_type ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +static int rmr_srv_create_pool_sysfs_files(struct rmr_pool *pool) +{ + int ret; + + ret = kobject_init_and_add(&pool->kobj, &rmr_srv_pool_ktype, + &rmr_pool_dev->kobj, "%s", pool->poolname); + if (ret) { + pr_err("Failed to create sysfs dir for pool '%s': %d\n", + pool->poolname, ret); + return ret; + } + + ret = kobject_init_and_add(&pool->sessions_kobj, &ktype, &pool->kobj, + "sessions"); + if (unlikely(ret)) { + pr_err("Failed to create sessions dir for pool '%s': %d\n", + pool->poolname, ret); + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + } + + return ret; +} + +/* remove new line from string */ +static void strip(char *s) +{ + char *p = s; + + while (*s != '\0') { + if (*s != '\n') + *p++ = *s++; + else + ++s; + } + *p = '\0'; +} + +enum rmr_srv_opts { + RMR_SRV_OPT_POOL_NAME, + RMR_SRV_OPT_MEMBER_ID, + RMR_JOIN_OPT_Mandatory_count, + RMR_SRV_OPT_ERR, +}; + +static const char * const rmr_srv_opts_mandatory_names[] = { + [RMR_SRV_OPT_POOL_NAME] = "poolname", + [RMR_SRV_OPT_MEMBER_ID] = "member_id", +}; + +static const match_table_t rmr_srv_opt_tokens = { + { RMR_SRV_OPT_POOL_NAME, "poolname=%s" }, + { RMR_SRV_OPT_MEMBER_ID, "member_id=%s" }, + { RMR_SRV_OPT_ERR, NULL }, +}; + +static int rmr_srv_parse_options(const char *buf, char *poolname, + u32 *member_id) +{ + char *options, *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token, ret = 0, i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + options = strstrip(options); + strip(options); + while ((p = strsep(&options, " ")) != NULL) { + if (!*p) + continue; + token = match_token(p, rmr_srv_opt_tokens, args); + opt_mask |= (1 << token); + + switch (token) { + case RMR_SRV_OPT_POOL_NAME: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("join_pool: name too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(poolname, p, NAME_MAX); + kfree(p); + break; + + case RMR_SRV_OPT_MEMBER_ID: + p = match_strdup(args); + + ret = kstrtou32(p, 0, member_id); + if (ret) { + pr_err("member_id isn't an integer: %d\n", ret); + kfree(p); + goto out; + } + + kfree(p); + break; + + default: + pr_err("join_pool: Unknown parameter or missing value" + " '%s'\n", p); + ret = -EINVAL; + goto out; + } + }; + + for (i = 0; i < RMR_JOIN_OPT_Mandatory_count; i++) { + if ((opt_mask & (1 << rmr_srv_opt_tokens[i].token))) { + ret = 0; + } else { + pr_err("join_pool: Mandatory parameter missing: %s\n", + rmr_srv_opts_mandatory_names[i]); + ret = -EINVAL; + break; + } + } + +out: + kfree(options); + return ret; +} + + +static ssize_t rmr_srv_join_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + char poolname[NAME_MAX]; + u32 member_id = UINT_MAX; + int err; + + err = rmr_srv_parse_options(buf, poolname, &member_id); + if (unlikely(err)) + return err; + + if (member_id > MAX_POOL_ID) { + pr_err("%s: member_id gt max allowed pools (%u > %u)\n", + __func__, member_id, MAX_POOL_ID); + return -EINVAL; + } + + if (member_id == 0) { + pr_err("%s: member_id is not allowed to be zero\n", __func__); + return -EINVAL; + } + + strip(poolname); + + pr_info("%s: Creating server pool with poolname %s, member_id 
%u\n", + __func__, poolname, member_id); + + srv_pool = rmr_create_srv_pool(poolname, member_id); + if (IS_ERR(srv_pool)) { + pr_err("failed to create srv pool %s\n", poolname); + return PTR_ERR(srv_pool); + } + + pool = rmr_create_pool(poolname, srv_pool); + if (IS_ERR(pool)) { + err = PTR_ERR(pool); + goto destroy_pool; + } + + srv_pool->pool = pool; + pool->is_clt = false; + rmr_srv_pool_update_params(pool); + + err = rmr_srv_create_pool_sysfs_files(pool); + if (err) { + pr_err("%s: pool %s failed to create sysfs files\n", __func__, pool->poolname); + goto destroy_pool; + } + + return count; + +destroy_pool: + rmr_put_srv_pool(srv_pool); + + return err; +} + +static ssize_t rmr_srv_join_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo \"poolname= member_id= > %s\n", + attr->attr.name); +} + +static struct kobj_attribute rmr_srv_join_pool_attr = + __ATTR(join_pool, 0644, rmr_srv_join_pool_show, + rmr_srv_join_pool_store); + +static struct attribute *default_attrs[] = { + &rmr_srv_join_pool_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +int rmr_srv_create_sysfs_files(void) +{ + int err; + dev_t devt = MKDEV(0, 0); + + rmr_dev_class = class_create("rmr-server"); + if (IS_ERR(rmr_dev_class)) + return PTR_ERR(rmr_dev_class); + + rmr_ctl_dev = device_create(rmr_dev_class, NULL, devt, NULL, "ctl"); + if (IS_ERR(rmr_ctl_dev)) { + err = PTR_ERR(rmr_ctl_dev); + goto cls_destroy; + } + + rmr_pool_dev = device_create(rmr_dev_class, NULL, devt, NULL, "pools"); + if (IS_ERR(rmr_pool_dev)) { + err = PTR_ERR(rmr_pool_dev); + goto ctl_destroy; + } + + err = sysfs_create_group(&rmr_ctl_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto pool_destroy; + + return 0; + +pool_destroy: + device_unregister(rmr_pool_dev); +ctl_destroy: + device_unregister(rmr_ctl_dev); +cls_destroy: + class_destroy(rmr_dev_class); + + return err; +} + +void rmr_srv_destroy_sysfs_files(void) +{ + sysfs_remove_group(&rmr_ctl_dev->kobj, &default_attr_group); + device_unregister(rmr_pool_dev); + device_unregister(rmr_ctl_dev); + class_destroy(rmr_dev_class); +} diff --git a/drivers/infiniband/ulp/rmr/rmr-srv.c b/drivers/infiniband/ulp/rmr/rmr-srv.c new file mode 100644 index 000000000000..66af29b90c53 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-srv.c @@ -0,0 +1,3306 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include + +#include "rmr-srv.h" +#include "rmr-req.h" +#include "rmr-clt.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(RMR_VER_STRING); +MODULE_DESCRIPTION("RMR Server"); +MODULE_LICENSE("GPL"); + +static struct rtrs_srv_ctx *rtrs_ctx; +struct kmem_cache *rmr_req_cachep; + +static LIST_HEAD(g_sess_list); +static DEFINE_MUTEX(g_sess_lock); + +#define MIN_CHUNK_SIZE (128 << 10) +#define MAX_CHUNK_SIZE (1024 << 10) +#define DEFAULT_CHUNK_SIZE MIN_CHUNK_SIZE + +static int __read_mostly chunk_size = DEFAULT_CHUNK_SIZE; + +module_param_named(chunk_size, chunk_size, uint, 0444); +MODULE_PARM_DESC(chunk_size, + "Unit size which is tracked for being dirty. 
(default: " + /* cppcheck-suppress unknownMacro */ + __stringify(DEFAULT_CHUNK_SIZE) "KB)"); + +static int __read_mostly sync_queue_depth = DEFAULT_SYNC_QUEUE_DEPTH; + +module_param_named(sync_queue_depth, sync_queue_depth, uint, 0644); +MODULE_PARM_DESC(sync_queue_depth, + "Max in-flight sync requests per pool (default: " + __stringify(DEFAULT_SYNC_QUEUE_DEPTH) ")"); + +bool rmr_get_srv_pool(struct rmr_srv_pool *srv_pool) +{ + pr_debug("pool %s, before inc refcount %d\n", + srv_pool->pool->poolname, refcount_read(&srv_pool->refcount)); + return refcount_inc_not_zero(&srv_pool->refcount); +} + +static struct rmr_srv_pool *rmr_find_and_get_srv_pool(u32 group_id) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + mutex_lock(&pool_mutex); + pool = rmr_find_pool_by_group_id(group_id); + if (!pool) { + mutex_unlock(&pool_mutex); + return ERR_PTR(-ENOENT); + } + + srv_pool = (struct rmr_srv_pool *)pool->priv; + if (!rmr_get_srv_pool(srv_pool)) { + mutex_unlock(&pool_mutex); + return ERR_PTR(-EINVAL); + } + mutex_unlock(&pool_mutex); + + return srv_pool; +} + +void rmr_put_srv_pool(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + + might_sleep(); + + pr_debug("pool %s, before dec refcnt %d\n", + (pool ? pool->poolname : "(empty)"), refcount_read(&srv_pool->refcount)); + if (refcount_dec_and_test(&srv_pool->refcount)) { + mutex_destroy(&srv_pool->srv_pool_lock); + + if (srv_pool->clt) + rmr_clt_close(srv_pool->clt); + + kfree(srv_pool->last_io); + srv_pool->last_io = NULL; + kfree(srv_pool->last_io_idx); + srv_pool->last_io_idx = NULL; + + if (pool) { + pr_info("srv: destroy pool %s\n", pool->poolname); + free_pool(pool); + } + + cancel_delayed_work_sync(&srv_pool->md_sync_dwork); + destroy_workqueue(srv_pool->md_sync_wq); + + cancel_delayed_work_sync(&srv_pool->clean_dwork); + destroy_workqueue(srv_pool->clean_wq); + + kfree(srv_pool); + } +} + +static const char *rmr_get_srv_pool_state_name(enum rmr_srv_pool_state state) +{ + switch (state) { + case RMR_SRV_POOL_STATE_EMPTY: return "RMR_SRV_POOL_STATE_EMPTY"; + case RMR_SRV_POOL_STATE_REGISTERED: return "RMR_SRV_POOL_STATE_REGISTERED"; + case RMR_SRV_POOL_STATE_CREATED: return "RMR_SRV_POOL_STATE_CREATED"; + case RMR_SRV_POOL_STATE_NORMAL: return "RMR_SRV_POOL_STATE_NORMAL"; + case RMR_SRV_POOL_STATE_NO_IO: return "RMR_SRV_POOL_STATE_NO_IO"; + + default: return "Unknown state"; + } +} + +/** + * rmr_srv_change_pool_state() - Change srv pool state + * + * @srv_pool: Server pool whose state is to be changed + * @new_state: State to which the transition is to be made + * + * Return: + * old state on succes + * negative error code on failure + * + * Description: + * This function controls the state transitions for rmr-srv pool state. + * Every state transition is controlled by this except to NORMAL. + * Function rmr_srv_set_pool_state_normal handles transition to state NORMAL. 
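+ *
+ * Allowed transitions (a summary of the checks below and in
+ * rmr_srv_set_pool_state_normal(), not a separate state machine):
+ *
+ *	EMPTY <-> REGISTERED -> CREATED -> NORMAL
+ *	{REGISTERED, CREATED, NORMAL} -> NO_IO -> {EMPTY, REGISTERED, NORMAL}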
+ * "always-invalid" state transitions are checked and prevented here + * Case dependent valid/invalid state transition, should be handled by caller + */ +static inline int rmr_srv_change_pool_state(struct rmr_srv_pool *srv_pool, + enum rmr_srv_pool_state new_state) +{ + enum rmr_srv_pool_state old_state = atomic_read(&srv_pool->state); + int cmp_state; + + WARN_ON(new_state == RMR_SRV_POOL_STATE_NORMAL); + + if (old_state == new_state) + return old_state; + + pr_info("%s: Old state %s, Requested state %s\n", + __func__, rmr_get_srv_pool_state_name(old_state), + rmr_get_srv_pool_state_name(new_state)); + + switch (new_state) { + case RMR_SRV_POOL_STATE_NO_IO: + /* + * NO_IO can be reached from REGISTERED, CREATED, or NORMAL. + * EMPTY -> NO_IO is illegal: a pool with no store cannot have + * active sessions that fail. + */ + if (WARN_ON(old_state == RMR_SRV_POOL_STATE_EMPTY)) + goto err; + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_NO_IO); + break; + case RMR_SRV_POOL_STATE_EMPTY: + /* + * EMPTY is reached from REGISTERED (store unregistered, no + * sessions) or from NO_IO (last session left, no store). A + * direct jump from CREATED or NORMAL is illegal — those states + * must pass through NO_IO first. + */ + if (WARN_ON(old_state == RMR_SRV_POOL_STATE_CREATED || + old_state == RMR_SRV_POOL_STATE_NORMAL)) + goto err; + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_EMPTY); + break; + case RMR_SRV_POOL_STATE_REGISTERED: + /* + * REGISTERED is entered from EMPTY (store just registered, no + * sessions) or from NO_IO (last session left, store still + * present). A direct jump from CREATED or NORMAL is illegal — + * those states must pass through NO_IO first. + */ + if (WARN_ON(old_state == RMR_SRV_POOL_STATE_CREATED || + old_state == RMR_SRV_POOL_STATE_NORMAL)) + goto err; + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_REGISTERED); + + break; + case RMR_SRV_POOL_STATE_CREATED: + /* + * CREATED is entered only from REGISTERED, when the first + * non-sync create-mode join arrives. Any other predecessor + * is illegal. + */ + cmp_state = RMR_SRV_POOL_STATE_REGISTERED; + if (atomic_try_cmpxchg(&srv_pool->state, &cmp_state, RMR_SRV_POOL_STATE_CREATED)) + goto out; + WARN_ON(1); + goto err; + default: + pr_err("%s: Unknown state %d\n", __func__, new_state); + goto err; + } + +out: + rmr_srv_mark_pool_md_dirty(srv_pool); + return old_state; + +err: + pr_err("%s: Failed. Old state %s, Requested state %s\n", + __func__, rmr_get_srv_pool_state_name(old_state), + rmr_get_srv_pool_state_name(new_state)); + return -EINVAL; +} + +/** + * rmr_srv_set_pool_state_normal() - Change srv pool state to NORMAL + * + * @srv_pool: Server pool whose state is to be changed to NORMAL + * + * Return: + * old state on succes + * negative error code on failure + * + * Description: + * This function controls the state transitions for rmr-srv pool state to NORMAL + * "always-invalid" state transitions are checked and prevented here + * Case dependent valid/invalid state transition, should be handled by caller + */ +static int rmr_srv_set_pool_state_normal(struct rmr_srv_pool *srv_pool) +{ + int old_state; + + mutex_lock(&srv_pool->srv_pool_lock); + old_state = atomic_read(&srv_pool->state); + + pr_info("%s: Old state %s\n", __func__, + rmr_get_srv_pool_state_name(old_state)); + + if (old_state == RMR_SRV_POOL_STATE_NORMAL) + goto out; + + /* + * CREATED -> NORMAL: normal enable on a newly created pool. + * NO_IO -> NORMAL: map update completed, pool can serve IOs again. + * Any other predecessor is illegal. 
+ */ + if (WARN_ON(old_state != RMR_SRV_POOL_STATE_CREATED && + old_state != RMR_SRV_POOL_STATE_NO_IO)) { + old_state = -EINVAL; + goto out; + } + + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_NORMAL); + rmr_srv_mark_pool_md_dirty(srv_pool); + pr_info("%s: Server pool state changed to NORMAL\n", __func__); + +out: + mutex_unlock(&srv_pool->srv_pool_lock); + + return old_state; +} + +/** + * rmr_srv_clear_map() - clear the dirty map if other pool member completely synced it + * + * @pool: rmr pool that holds the maps to clean + * @member_id: pool member id for which map is reported as clean + * + * Description: + * If other pool member responded that he finished syncing his data, then we can + * clear his map replicated to this nodes, in case of some clear commands were + * lost or failed. + * + * Return: + * no + * + * Context: + * This function can wait on spin_lock if the deleted entry should be inserted back + * + * Locks: + * no + */ +static void rmr_srv_clear_map(struct rmr_pool *pool, u8 member_id) +{ + // TODO: this looks like rmr_pool_map_remove_entries, can we do something about this? + // I was not able to merge them, but it would be nice. + struct rmr_dirty_id_map *map = NULL; + rmr_id_t id; + int i, lock_idx; + + pr_debug("pool %s clear map entries for member_id=%u\n", + pool->poolname, member_id); + + lock_idx = srcu_read_lock(&pool->map_srcu); + map = rmr_pool_find_map(pool, member_id); + if (!map) { + pr_err("for pool %s cannot find map for member id %u\n", pool->poolname, member_id); + goto unlock; + } + + /* if the map state changed since we send our CHECK_MAP command, it means that + * some entries were added and the map is not clean and we should not wipe it. + * rsp of CHECK_MAP cmd can be outdated a little so we do not trust it then. + */ + if (atomic_read(&map->check_state) != RMR_MAP_STATE_CHECKING) + pr_debug("map for member_id=%u cannot be cleared now, state changed\n", + map->member_id); + + for (i = 0; i < map->no_of_chunks; i++) { + id.a = 1; + id.b = i; + + rmr_map_unset_dirty(map, id, MAP_NO_FILTER); + + /* If the state changed since the last check then it is possible that after + * clear_bit of RMR_MAP_STATE_CHECK_CLEAR in the rmr_req_check_map we called + * rmr_map_insert. There we check that entry is already in the map and leave + * the function. But the following erease here would delete it. So we return + * erased entry back to the table if the state of checking changed. + */ + if (atomic_read(&map->check_state) != RMR_MAP_STATE_CHECKING) { + pr_debug("map for member_id=%u cannot be cleared now, state changed\n", + map->member_id); + + rmr_map_set_dirty(map, id, 0); + goto unlock; + } + } + pr_debug("clear map entries for member_id=%u is done\n", member_id); +unlock: + srcu_read_unlock(&pool->map_srcu, lock_idx); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); +} + +/** + * rmr_srv_check_map_clear() - periodic work that checks if the other node finished sync + * + * @work: delayed work structure to start and repeat the work + * + * Description: + * Check the dirty maps of all of the other pool members. If any of the maps is dirty + * then send check command and if the pool member responds that it has cleared his map, + * then we should clear it locally. When checking is done reschedule itself again. + * + * Return: + * no + * + * Context: + * runs in the process context. 
+ * + * Locks: + * no + */ +static void rmr_srv_check_map_clear(struct work_struct *work) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_pool *pool; + int i, lock_idx; + + srv_pool = container_of(to_delayed_work(work), struct rmr_srv_pool, clean_dwork); + + if (!srv_pool->pool) { + pr_debug("no rmr pool assigend to srv_pool yet.\n"); + goto out; + } + + pool = srv_pool->pool; + pr_debug("check map for srv pool %s started...\n", pool->poolname); + + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_debug("srv pool %s is not in normal state, skip map clear check", + pool->poolname); + goto out; + } + + if (!srv_pool->clt) { + pr_debug("srv pool %s does not have sync pool assigned, skip map clear check\n", + pool->poolname); + goto out; + } + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; i < pool->maps_cnt; i++) { + struct rmr_dirty_id_map *map; + u8 member_id; + int ret; + + map = rcu_dereference(pool->maps[i]); + if (WARN_ON(!map)) + break; + + member_id = map->member_id; + if (member_id == srv_pool->member_id) { + pr_debug("srv pool %s skip checking map with id %u, since it is me.\n", + pool->poolname, member_id); + continue; + } + + if (rmr_map_empty(map)) { + pr_debug("srv pool %s map for member_id=%u is empty, no need to check\n", + pool->poolname, map->member_id); + continue; + } + + atomic_set(&map->check_state, RMR_MAP_STATE_CHECKING); + + ret = rmr_clt_pool_member_synced(srv_pool->clt, member_id); + if (ret < 0) { + pr_debug("pool %s failed to check if member_id=%u synced, ret %d\n", + pool->poolname, member_id, ret); + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + continue; + } + + pr_debug("pool %s check if pool member %u synced, reported %u\n\n", + pool->poolname, member_id, ret); + if (ret) + rmr_srv_clear_map(pool, member_id); + + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + pr_debug("check map for pool %s done. 
schedule next one.\n", pool->poolname);
+
+out:
+	queue_delayed_work(srv_pool->clean_wq, &srv_pool->clean_dwork,
+			   msecs_to_jiffies(RMR_SRV_CHECK_MAPS_INTERVAL_MS));
+}
+
+struct rmr_srv_pool *rmr_create_srv_pool(char *poolname, u32 member_id)
+{
+	struct rmr_srv_pool *srv_pool;
+
+	srv_pool = kzalloc(sizeof(struct rmr_srv_pool), GFP_KERNEL);
+	if (unlikely(!srv_pool))
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_EMPTY);
+	srv_pool->maintenance_mode = false;
+	refcount_set(&srv_pool->refcount, 1);
+	mutex_init(&srv_pool->srv_pool_lock);
+
+	atomic_set(&srv_pool->store_state, false);
+
+	srv_pool->member_id = member_id;
+	srv_pool->max_sync_io_size = U32_MAX;
+
+	/* Sync thread */
+	srv_pool->th_tsk = NULL;
+	atomic_set(&srv_pool->thread_state, SYNC_THREAD_STOPPED);
+	atomic_set(&srv_pool->in_flight_sync_reqs, 0);
+
+	/* Work that cleans outdated entries from the maps */
+	srv_pool->clean_wq = alloc_workqueue("%s_clean_wq", 0, 0, poolname);
+	if (!srv_pool->clean_wq) {
+		pr_err("failed to create clean_wq for pool %s\n", poolname);
+		kfree(srv_pool);
+		return ERR_PTR(-ENOMEM);
+	}
+	INIT_DELAYED_WORK(&srv_pool->clean_dwork, rmr_srv_check_map_clear);
+	queue_delayed_work(srv_pool->clean_wq, &srv_pool->clean_dwork,
+			   msecs_to_jiffies(RMR_SRV_CHECK_MAPS_INTERVAL_MS));
+
+	/* sync metadata of the rmr pool */
+	srv_pool->md_sync_wq = alloc_workqueue("%s_md_sync_wq", 0, 0, poolname);
+	if (!srv_pool->md_sync_wq) {
+		pr_err("failed to create md_sync_wq for pool %s\n", poolname);
+		/* The clean work is already queued; tear it down before freeing. */
+		cancel_delayed_work_sync(&srv_pool->clean_dwork);
+		destroy_workqueue(srv_pool->clean_wq);
+		kfree(srv_pool);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	INIT_DELAYED_WORK(&srv_pool->md_sync_dwork, rmr_srv_md_sync);
+	/* Not queued initially; the first dirty event schedules the work. */
+	return srv_pool;
+}
+
+void rmr_srv_pool_update_params(struct rmr_pool *pool)
+{
+	pr_info("%s: Setting chunk_size for pool %s to %d\n",
+		__func__, pool->poolname, chunk_size);
+	pool->chunk_size = chunk_size;
+	pool->chunk_size_shift = ilog2(chunk_size);
+}
+
+static struct rmr_pool *rmr_srv_sess_get_pool(struct rmr_srv_sess *srv_sess, u32 group_id)
+{
+	struct rmr_pool *pool;
+	struct rmr_srv_pool *srv_pool;
+	bool ret;
+
+	rcu_read_lock();
+	pool = xa_load(&srv_sess->pools, group_id);
+	if (!pool) {
+		pool = ERR_PTR(-ENXIO);
+		goto out;
+	}
+
+	srv_pool = (struct rmr_srv_pool *)pool->priv;
+	ret = rmr_get_srv_pool(srv_pool);
+	if (!ret)
+		pool = ERR_PTR(-ENXIO);
+
+out:
+	rcu_read_unlock();
+	return pool;
+}
+
+static void rmr_srv_sess_put_pool(struct rmr_pool *pool)
+{
+	struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv;
+
+	rmr_put_srv_pool(srv_pool);
+}
+
+/**
+ * rmr_srv_endreq() - Function called when an rmr server request finishes processing
+ *
+ * @req: Pointer to the request ending
+ * @err: Error value.
Would be 0 for a successful request + */ +void rmr_srv_endreq(struct rmr_srv_req *req, int err) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_pool *pool = srv_pool->pool; + struct rtrs_srv_op *rtrs_op = req->rtrs_op; + struct rmr_dirty_id_map *map; + int i; + + if (req->flags == RMR_OP_MD_WRITE || req->flags == RMR_OP_MD_READ) { + if (unlikely(err)) + pr_err("Failed to complete the md req %x\n", req->flags); + goto put_ref; + } else if (unlikely(err) && !req->sync) { + struct rmr_srv_pool *srv_pool = req->srv_pool; + + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + } else if (rmr_op(req->flags) == RMR_OP_WRITE) { + srv_pool->last_io[req->mem_id].a = req->id.a; + srv_pool->last_io[req->mem_id].b = req->id.b; + + if (!test_and_set_bit(MD_DIRTY_LAST_IO, &srv_pool->md_dirty)) { + mod_delayed_work(srv_pool->md_sync_wq, + &srv_pool->md_sync_dwork, + msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS)); + } + + for (i = 0; i < req->failed_cnt; i++) { + int err; + + map = rmr_pool_find_map(srv_pool->pool, req->failed_srv_id[i]); + if (!map) { + pr_err("Cannot find map for srv_id %u\n", req->failed_srv_id[i]); + err = -EINVAL; + goto out; + } + + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + rmr_map_set_dirty(map, req->id, 0); + + if (req->map_ver > srv_pool->pool->map_ver) + srv_pool->pool->map_ver = req->map_ver; + } + if (req->failed_cnt) { + rmr_srv_mark_pool_md_dirty(srv_pool); + rmr_srv_mark_maps_dirty(srv_pool); + } + } + +out: + /* The requests created by rmr-srv don't use rtrs_op. */ + rtrs_srv_resp_rdma(rtrs_op, err); + rmr_srv_sess_put_pool(req->srv_pool->pool); +put_ref: + percpu_ref_put(&pool->ids_inflight_ref); +} + +static void rmr_srv_stop_sync_and_unset_store(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + atomic_set(&srv_pool->store_state, false); + + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_REQ_STOP); + wake_up_process(srv_pool->th_tsk); + + while (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(1000)); + } + } +} + +static void rmr_srv_delete_store_member(struct rmr_pool *pool, unsigned long id) +{ + rmr_pool_remove_map(pool, id); + xa_erase(&pool->stg_members, id); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); +} + +/** + * rmr_srv_add_store_member() - Register a storage member and create its dirty map + * + * @pool: The pool to which the member belongs. + * @id: Member ID of the storage node to add. + * + * Records @id in pool->stg_members and allocates a dirty map for it. + * On failure the stg_members entry is removed before returning. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_add_store_member(struct rmr_pool *pool, unsigned long id) +{ + struct rmr_dirty_id_map *map; + int ret; + + map = rmr_pool_find_map(pool, id); + if (map) { + pr_err("%s: pool %s, member_id %lu map already exists\n", + __func__, pool->poolname, id); + return -EEXIST; + } + + ret = xa_err(xa_store(&pool->stg_members, id, XA_TRUE, GFP_KERNEL)); + if (ret) { + pr_err("%s: Failed to add storage member %lu: %d\n", + __func__, id, ret); + return ret; + } + + /* + * Create the map of the newly added member. 
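+	 * If the map creation fails, the stg_members entry stored above is
+	 * erased again so the xarray and the map list stay consistent.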
+ */ + map = rmr_map_create(pool, id); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + pr_err("%s: pool %s, member_id %lu failed to create map on err %d: %pe\n", + __func__, pool->poolname, id, ret, map); + goto rem_store; + } + return 0; + +rem_store: + xa_erase(&pool->stg_members, id); + return ret; +} + +/** + * rmr_srv_handle_other_member_add() - Handle a POOL_INFO ADD message for a different member + * + * @srv_pool: The server pool receiving the notification. + * @pool_info_cmd: The received POOL_INFO command carrying member_id, mode, and dirty. + * + * For %RMR_POOL_INFO_MODE_ASSEMBLE, verifies that the member and its dirty map + * already exist (the node is rejoining a pool it was previously part of). + * For %RMR_POOL_INFO_MODE_CREATE, adds the member via rmr_srv_add_store_member() + * and optionally marks its map fully dirty if the client reported outstanding data. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_handle_other_member_add(struct rmr_srv_pool *srv_pool, + const struct rmr_msg_pool_info_cmd *pool_info_cmd) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_dirty_id_map *map; + int ret; + + if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_ASSEMBLE) { + pr_info("%s: Member %u got add of member %u with mode assemble\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + + /* + * For assemble, member info should already exist. + */ + if (xa_load(&pool->stg_members, pool_info_cmd->member_id) != XA_TRUE) { + pr_err("%s: pool %s, member_id %u not present\n", + __func__, pool->poolname, pool_info_cmd->member_id); + return -ENOENT; + } + + map = rmr_pool_find_map(pool, pool_info_cmd->member_id); + if (!map) { + pr_err("%s: pool %s, member_id %u, map not present\n", + __func__, pool->poolname, pool_info_cmd->member_id); + return -ENOENT; + } + } else if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_CREATE && + pool_info_cmd->member_id != srv_pool->member_id) { + pr_info("%s: Member %u got add of member %u with mode create\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + + ret = rmr_srv_add_store_member(pool, pool_info_cmd->member_id); + if (ret) { + pr_err("%s: rmr_srv_add_store_member failed %d\n", __func__, ret); + return ret; + } + + if (pool_info_cmd->dirty) { + map = rmr_pool_find_map(pool, pool_info_cmd->member_id); + if (WARN_ON(!map)) { + xa_erase(&pool->stg_members, pool_info_cmd->member_id); + return -EINVAL; + } + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + } + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + } else { + pr_err("%s: pool %s, member_id %u, unexpected mode %u for ADD operation\n", + __func__, pool->poolname, pool_info_cmd->member_id, + pool_info_cmd->mode); + return -EINVAL; + } + + return 0; +} + +int rmr_srv_query(struct rmr_pool *pool, u64 mapped_size, struct rmr_attrs *attr) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_dirty_id_map *map; + size_t queue_depth; + + if (pool) { + srv_pool = (struct rmr_srv_pool *)pool->priv; + queue_depth = srv_pool->queue_depth; + } else { + /* + * If pool is NULL, we are being called to estimate the md size + * before the pool is created. Use max queue depth in that case. + */ + queue_depth = RMR_SRV_MAX_QDEPTH; + } + + /* + * Dummy map structure, so that we can reuse the update map param function. 
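+	 * Only no_of_chunks is filled in; rmr_map_update_page_params() then
+	 * derives the per-map page counts (total_slp) used to size the metadata.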
+ */ + map = (struct rmr_dirty_id_map *)get_zeroed_page(GFP_KERNEL); + if (!map) { + pr_err("%s: Cannot allocate map\n", __func__); + return -ENOMEM; + } + + map->no_of_chunks = (mapped_size >> (ilog2(chunk_size) - 9)); + rmr_map_update_page_params(map); + + attr->rmr_md_size = (map->total_slp * PAGE_SIZE * RMR_POOL_MAX_SESS) + RMR_MD_SIZE; + attr->rmr_md_size += (queue_depth * sizeof(*srv_pool->last_io_idx)); + + attr->rmr_md_size = attr->rmr_md_size / SECTOR_SIZE; + + free_page((unsigned long)map); + return 0; +} +EXPORT_SYMBOL(rmr_srv_query); + +/** + * rmr_srv_set_map() - Create the dirty map for this server's member in the pool + * + * @pool: The pool for which the map is to be created. + * @mode: Registration mode; if %RMR_SRV_DISK_REPLACE, any existing map for + * this member is removed before creating the new one. + * + * Description: + * Invoked after the mapped size of the pool has been validated. Updates + * pool metadata with the mapped size, recalculates the chunk count, and + * calls rmr_srv_add_store_member() to register this node's map. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_set_map(struct rmr_pool *pool, enum rmr_srv_register_disk_mode mode) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + int ret, md_i; + + pr_info("%s: Mapped size of the pool %s is set to %lld\n", + __func__, pool->poolname, pool->mapped_size); + + /* Update mapped_size in the pool metadata. */ + md_i = rmr_pool_find_md(&pool->pool_md, srv_pool->member_id, true); + if (md_i < 0) { + pr_err("No space for new member %d.\n", srv_pool->member_id); + return -ENOMEM; + } + pool->pool_md.srv_md[md_i].mapped_size = pool->mapped_size; + + /* + * The existing map is irrelevant if user asked for store REPLACE. + */ + if (mode == RMR_SRV_DISK_REPLACE) + rmr_pool_remove_map(pool, srv_pool->member_id); + + ret = rmr_srv_add_store_member(pool, srv_pool->member_id); + if (ret) { + pr_err("%s: rmr_srv_add_store_member failed %d\n", __func__, ret); + goto err_out; + } + + return ret; + +err_out: + pool->pool_md.srv_md[md_i].mapped_size = 0; + return ret; +} + +/** + * rmr_srv_register() - Register a backend store with an RMR server pool + * + * @poolname: Name of the pool to which the store is to be registered. + * @ops: Store operations pointer. + * @priv: Private data for the store. + * @mapped_size: Size of the storage device in sectors. + * @mode: Registration mode: %RMR_SRV_DISK_CREATE for a new store, + * %RMR_SRV_DISK_REPLACE to replace an existing one, or + * %RMR_SRV_DISK_ADD to rejoin an existing pool. + * + * Description: + * An RMR server pool requires a backend store to service I/Os. + * This function registers that store, sets up the pool's dirty map for + * this member, and records the marked_create flag for validation when + * the first client joins. + * + * Return: + * Pointer to the rmr_pool on success, NULL on error. 
+ */ +static bool rmr_srv_pool_has_non_sync_sess(struct rmr_pool *pool) +{ + struct rmr_srv_pool_sess *pool_sess; + + list_for_each_entry(pool_sess, &pool->sess_list, pool_entry) { + if (!pool_sess->sync) + return true; + } + return false; +} + +struct rmr_pool *rmr_srv_register(char *poolname, struct rmr_srv_store_ops *ops, void *priv, + u64 mapped_size, enum rmr_srv_register_disk_mode mode) +{ + struct rmr_pool *pool; + struct rmr_srv_io_store *io_store; + struct rmr_srv_pool *srv_pool; + u32 group_id = rmr_pool_hash(poolname); + enum rmr_srv_pool_state state; + int ret; + + srv_pool = rmr_find_and_get_srv_pool(group_id); + if (IS_ERR(srv_pool)) { + pr_err("pool %s does not exists: %pe\n", poolname, srv_pool); + return NULL; + } + pool = srv_pool->pool; + + mutex_lock(&srv_pool->srv_pool_lock); + if (mode == RMR_SRV_DISK_CREATE && + (rmr_srv_pool_has_non_sync_sess(pool) || + rmr_pool_find_map(pool, srv_pool->member_id))) { + pr_err("%s: Cannot register (create) new backend for %s; Sessions/Map exists\n", + __func__, poolname); + ret = -EEXIST; + goto put_err; + } + + if (mode == RMR_SRV_DISK_REPLACE && + (!rmr_srv_pool_has_non_sync_sess(pool))) { + pr_err("%s: Cannot register (replace) new backend for %s; No non-sync session\n", + __func__, poolname); + ret = -EINVAL; + goto put_err; + } + + if (srv_pool->io_store) { + pr_err("Srv pool %s already has store registered\n", poolname); + goto put_err; + } + + if (pool->mapped_size && pool->mapped_size != mapped_size) { + pr_err("Pool %s already has mapped size %lld, cannot register store with %lld\n", + poolname, pool->mapped_size, mapped_size); + ret = -EINVAL; + goto put_err; + } + + io_store = kzalloc(sizeof(*io_store), GFP_KERNEL); + if (!io_store) { + pr_err("Failed to allocate io_store for %s\n", poolname); + goto put_err; + } + + pool->mapped_size = mapped_size; + io_store->ops = ops; + io_store->priv = priv; + srv_pool->io_store = io_store; + + /* The pool updates its number of tracking chunks with the mapped size just provided. */ + rmr_pool_update_no_of_chunk(pool); + + if (mode == RMR_SRV_DISK_CREATE || mode == RMR_SRV_DISK_REPLACE) { + ret = rmr_srv_set_map(pool, mode); + if (ret) { + pr_err("%s: failed to set maps in rmr pool %s, err %d\n", + __func__, poolname, ret); + goto free_io_store; + } + } else if (mode == RMR_SRV_DISK_ADD) { + /* + * Read the pool metadata stored on this device before md_sync writes + * new metadata to the store. 
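+		 * This way a re-added member comes back with the maps and
+		 * last_io information recorded during its previous membership
+		 * instead of overwriting them with fresh state.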
+ */ + ret = rmr_srv_refresh_md(srv_pool); + if (ret) { + pr_err("%s: cannot refresh md of the pool\n", __func__); + goto free_io_store; + } + } else { + pr_err("%s: Wrong register disk mode %d\n", __func__, mode); + ret = -EINVAL; + goto free_io_store; + } + + srv_pool->marked_create = (mode == RMR_SRV_DISK_CREATE); + atomic_set(&srv_pool->store_state, true); + rmr_srv_mark_pool_md_dirty(srv_pool); + state = atomic_read(&srv_pool->state); + if (state != RMR_SRV_POOL_STATE_NORMAL && + state != RMR_SRV_POOL_STATE_NO_IO) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_REGISTERED); + mutex_unlock(&srv_pool->srv_pool_lock); + + __module_get(THIS_MODULE); + pr_info("Registered store with pool %s\n", poolname); + + return srv_pool->pool; + +free_io_store: + kfree(io_store); + srv_pool->io_store = NULL; +put_err: + mutex_unlock(&srv_pool->srv_pool_lock); + rmr_put_srv_pool(srv_pool); + return NULL; +} +EXPORT_SYMBOL(rmr_srv_register); + +static void rmr_srv_delete_md(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map = NULL; + int err, lock_idx; + u32 map_region_offset = rmr_bitmap_offset(pool->pool_md.queue_depth) + RMR_MAP_BUF_HDR_SIZE; + u64 per_map_size = 0; + u64 len; + u8 map_idx; + void *buf; + + /* + * It could happen to access the pool while the pool is not there. Use reference counting + * for server pool to avoid the issue. + */ + err = rmr_get_srv_pool(srv_pool); + if (!err) { + pr_err("%s: pool is not there\n", __func__); + return; + } + + len = rmr_bitmap_offset(pool->pool_md.queue_depth) + PAGE_SIZE; + buf = kzalloc(len, GFP_KERNEL); + if (!buf) + goto put_pool; + + /* + * On-disk layout of rmr pool metadata: + * + * 0 RMR_MD_SIZE +last_io_len +PAGE_SIZE + * +-----------+-------------+---------------+--------------------+ + * | pool_md | last_io | hdr_region | maps_region ... | + * +-----------+-------------+---------------+--------------------+ + * <-RMR_MD_SIZE><-last_io_len><--PAGE_SIZE--> maps_cnt * per_map + */ + err = process_md_io(pool, NULL, 0, len, RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: failed to process md write io with err 0x%x.\n", __func__, err); + + /* + * Zero the bitmap on disk using O(1) offset formula. 
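+	 * Each map occupies per_map_size = total_slp * PAGE_SIZE bytes starting
+	 * at map_region_offset + map_idx * per_map_size, so the zeroed buffer
+	 * can simply be written out page by page over that range.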
+ */ + lock_idx = srcu_read_lock(&pool->map_srcu); + for (map_idx = 0; map_idx < pool->maps_cnt; map_idx++) { + u32 map_data_offset; + el_flp *flp_ptr; + u64 no_of_slps; + int i, j; + + map = rcu_dereference(pool->maps[map_idx]); + if (WARN_ON(!map)) + break; + + per_map_size = map->total_slp * PAGE_SIZE; + map_data_offset = map_region_offset + map_idx * per_map_size; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + err = process_md_io(pool, NULL, map_data_offset, + PAGE_SIZE, RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: bitmap write failed at 0x%x, err 0x%x.\n", + __func__, map_data_offset, err); + map_data_offset += PAGE_SIZE; + } + } + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + rmr_srv_delete_store_member(pool, srv_pool->member_id); + + free_page((unsigned long)buf); +put_pool: + rmr_put_srv_pool(srv_pool); +} + +/** + * rmr_srv_unregister() - Unregister the backend store from rmr server pool + * + * @poolname: Name of the pool from which the store is to be unregistered + * @delete: If true, delete all the metadata associated with this pool + * + * Description: + * rmr server pool needs a backend store which serves the IOs + * This function is used to unregister a backend store from rmr server pool. + * + * Return: + * None + */ +void rmr_srv_unregister(char *poolname, bool delete) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + struct rmr_srv_io_store *io_store; + + mutex_lock(&pool_mutex); + pool = rmr_find_pool(poolname); + mutex_unlock(&pool_mutex); + + if (!pool) { + pr_err("%s, Pool %s does not exists\n", __func__, poolname); + return; + } + + srv_pool = (struct rmr_srv_pool *)pool->priv; + mutex_lock(&srv_pool->srv_pool_lock); + + if (!srv_pool->io_store) { + pr_err("Srv pool %s not registered\n", poolname); + mutex_unlock(&srv_pool->srv_pool_lock); + return; + } + + if (srv_pool->marked_delete) { + if (!delete) { + pr_err("%s: Storage server marked for delete, but delete mode not set\n", + __func__); + pr_err("%s: Continuing with only removal", __func__); + } + } else if (!srv_pool->marked_create && delete) { + pr_err("%s: Storage server not marked for delete, abandoning delete.\n", __func__); + delete = false; + } + + io_store = srv_pool->io_store; + + rmr_srv_stop_sync_and_unset_store(pool); + + percpu_ref_kill_and_confirm(&pool->ids_inflight_ref, rmr_pool_confirm_inflight_ref); + wait_for_completion(&pool->complete_done); + wait_for_completion(&pool->confirm_done); + + /* + * Re-init so metadata IO can go in if needed + */ + reinit_completion(&pool->complete_done); + reinit_completion(&pool->confirm_done); + percpu_ref_reinit(&pool->ids_inflight_ref); + + if (delete) + rmr_srv_delete_md(pool); + + kfree(srv_pool->io_store); + srv_pool->io_store = NULL; + + mutex_lock(&pool->sess_lock); + if (!rmr_srv_pool_has_non_sync_sess(pool)) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_EMPTY); + mutex_unlock(&pool->sess_lock); + + srv_pool->marked_delete = false; + mutex_unlock(&srv_pool->srv_pool_lock); + + pool->mapped_size = 0; + + rmr_put_srv_pool(srv_pool); + + pr_info("Unregistered store with pool %s\n", poolname); + + module_put(THIS_MODULE); +} +EXPORT_SYMBOL(rmr_srv_unregister); + +/** + * rmr_srv_pool_cmd_with_rsp() - Sends a user command to all sessions of the internal (sync) clt + * + * @pool: rmr pool to which the command is for 
+ * @conf: confirmation function to be called after completion + * @priv: pointer to priv data, to be returned to user while calling conf function + * @usr_vec: kvec containing user data (mostly command messages?) + * @nr: number of kvecs + * @buf: buf where the response from the user server is to be directed + * @buf_len: length of the buffer + * @size: size of the buf to be sent to a single session + * + * Description: + * This function provides an interface for the user to send commands to storage nodes connected + * through the internal network of this rmr pool. + * It redirects the command through the rmr-client pool in this storage node, which then sends + * the command to all the storage nodes it is connected to. + * The command is sent as a read, so that the response from the user srv side can be received + * The buffer sent by the user is meant to receive the response from the user server side. + * The size of the buffer is set during rmr_clt_open. + * + * Return: + * 0 on success + * negative errno in case of error + * + * Context: + * Inflight commands will block map update, until the inflights are completed. + */ +int rmr_srv_pool_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv, + const struct kvec *usr_vec, size_t nr, void *buf, int buf_len, + size_t size) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!srv_pool->clt) { + pr_warn("srv pool %s does not have sync pool assigned.\n", + pool->poolname); + return -EAGAIN; + } + + return rmr_clt_cmd_with_rsp(srv_pool->clt, conf, priv, usr_vec, nr, buf, buf_len, size); +} +EXPORT_SYMBOL(rmr_srv_pool_cmd_with_rsp); + +static int rmr_srv_send_discard_all(struct rmr_pool *pool, u8 member_id) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_pool *sync_pool = srv_pool->clt; + struct rmr_msg_pool_cmd msg = {}; + int err; + + /* + * If the member_id is not this server's member_id, it means this server is the receiving + * node of the discard request. + */ + if (srv_pool->member_id != member_id) + return 0; + + pr_info("%s: Send discards across storage nodes for pool %s\n", + __func__, pool->poolname); + + rmr_clt_init_cmd(sync_pool, &msg); + msg.cmd_type = RMR_CMD_SEND_DISCARD; + msg.send_discard_cmd.member_id = member_id; + + err = rmr_clt_pool_send_all(sync_pool, &msg); + if (err) { + pr_err("Failed to send discard cmd for pool %s: %d\n", + pool->poolname, err); + } + return err; +} + +/** + * rmr_srv_discard_id() - discard the data chunks of length from offset on disk + * + * @pool: source pool. + * @offset offset in bytes. + * @length: length in bytes + * @member_id: member id of the storage node to discard the data from. If 0, then the node is + * this server pool. + * @sync: indicates whether to send sync requests to other connected nodes. + * + * Return: + * 0 on success, err code otherwise + * + * Description: + * This function discards the data chunks on the server with member_id. It will mark the + * data chunks as dirty and set the discard_entries flag of the corresponding srv_md true. + * Then it notifies all the connected nodes it has discarded data. 
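+ * A zero @length discards the whole mapped range: the complete map is marked
+ * dirty instead of a single chunk.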
+ */ +int rmr_srv_discard_id(struct rmr_pool *pool, u64 offset, u64 length, u8 member_id, bool sync) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + rmr_id_t id; + int md_i, err; + + if (!member_id) + member_id = srv_pool->member_id; + + map = rmr_pool_find_map(pool, member_id); + if (!map) { + pr_err("for srv pool %s cannot find map for member_id %u\n", + pool->poolname, member_id); + return -EINVAL; + } + + md_i = rmr_pool_find_md(&pool->pool_md, member_id, false); + if (md_i < 0) { + pr_err("%s: for srv pool %s cannot find md for member_id %u\n", + __func__, pool->poolname, member_id); + return -EINVAL; + } + + /* + * If this node has received a response of the discard request from a normal server, + * the node will continue to mark all the data chunks as dirty. + */ + if (member_id == srv_pool->member_id && sync) { + if (!srv_pool->clt) { + pr_err("pool %s has no sync pool assigned. Cannot send discards.\n", + pool->poolname); + return -ENXIO; + } + + /* + * This node tries to send discards to all its connected nodes. The other node + * that has received the discards will start a new round. In the end, all normal + * nodes that are connected to this node should receive the discards. + */ + err = rmr_srv_send_discard_all(pool, member_id); + if (err) { + pr_err("%s: no server receives discards for pool %s: %d\n", + __func__, pool->poolname, err); + return err; + } + } + + /* + * Set the discard_entries flag of the corresponding srv_md true. Be careful that setting + * the wrong srv_md will lead to loops of discards. + */ + pool->pool_md.srv_md[md_i].discard_entries = true; + rmr_srv_mark_pool_md_dirty(srv_pool); + + if (length) { + rmr_map_calc_chunk(pool, offset, length, &id); + rmr_map_set_dirty(map, id, MAP_ENTRY_UNSYNCED); + } else { + /* discard all data chunks */ + rmr_map_set_dirty_all(map, MAP_ENTRY_UNSYNCED); + pr_info("%s: Discard all data chunks for member_id %u in srv_pool %s: %u\n", + __func__, member_id, pool->poolname, srv_pool->member_id); + } + + rmr_map_clear_filter_all(map, MAP_ENTRY_UNSYNCED); + rmr_srv_mark_maps_dirty(srv_pool); + + return 0; +} +EXPORT_SYMBOL(rmr_srv_discard_id); + +void rmr_srv_replace_store(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + RMR_STORE_SET_REPLACE(pool->map_ver); + rmr_srv_flush_pool_md(srv_pool); +} +EXPORT_SYMBOL(rmr_srv_replace_store); + +/** + * rmr_srv_pool_check_store() - Check whether IO is allowed for a pool or not + * + * @pool: pool to check + * + * Return: + * 1 if IO is allowed, 0 therwise + * + * Description: + * For a rmr-srv pool, the store registered provides a way to check whether it can process + * IOs or not. 
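+ * The decision is delegated to the store's io_allowed() callback.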
+ */ +static int rmr_srv_pool_check_store(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_io_store *store = srv_pool->io_store; + void *store_priv; + + if (!store) { + pr_debug("for srv pool %s no store assigned\n", pool->poolname); + return false; + } + + if (!store->ops) { + pr_err("for pool %s store has no ops assigned\n", pool->poolname); + return false; + } + store_priv = store->priv; + + return store->ops->io_allowed(store_priv); +} + +/** + * process_msg_io() - Process IO message + * + * @srv_sess: rmr srv session over which the message was received + * @rtrs_op: rtrs IO context + * @data: pointer to data buf + * @datalen: len of data buf + * @usr: pointer to user buf + * @usrlen: len of user buf + * + * Return: + * 0 on success + * negative error code otherwise + * + * Description: + * Perform some basic checks. + * Create an IO request and start its state machine. + */ +static int process_msg_io(struct rmr_srv_sess *srv_sess, + struct rtrs_srv_op *rtrs_op, void *data, + u32 datalen, const void *usr, size_t usrlen) +{ + const struct rmr_msg_io *msg = usr; + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + struct rmr_srv_req *req; + int err = 0; + u32 group_id = le32_to_cpu(msg->hdr.group_id); + + pool = rmr_srv_sess_get_pool(srv_sess, group_id); + if (IS_ERR(pool)) { + pr_err_ratelimited("Got I/O request on session %s for unknown pool group id %d: %pe\n", + srv_sess->sessname, group_id, pool); + return PTR_ERR(pool); + } + + srv_pool = (struct rmr_srv_pool *)pool->priv; + + /* + * No new references will come in after we have killed the percpu_ref. + * Percpu_ref_tryget_live() returns false when @confirm_kill in + * percpu_ref_kill_and_confirm() is done. + */ + if (!percpu_ref_tryget_live(&pool->ids_inflight_ref)) { + err = -EIO; + goto no_put; + } + + if (!atomic_read(&srv_pool->store_state) || + atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_err_ratelimited("server pool %s is not up for IO (state = %s)\n", + pool->poolname, + rmr_get_srv_pool_state_name(atomic_read(&srv_pool->state))); + err = -EIO; + goto put_pool; + } + + /* + * The IOs coming from internal sync sessions are always READ. + */ + if (msg->sync && rmr_op(le32_to_cpu(msg->flags)) != RMR_OP_READ) { + pr_err_ratelimited("process_msg_io: pool %s write IO from internal connection.\n", + pool->poolname); + err = -EIO; + goto put_pool; + } + + /* + * For non internal IOs, make sure the underlying store is ready for IO + */ + if (!msg->sync && !rmr_srv_pool_check_store(pool)) { + pr_err("process_msg_io: pool %s IO not allowed\n", pool->poolname); + err = -EIO; + goto put_pool; + } + + req = rmr_srv_req_create(msg, srv_pool, rtrs_op, data, datalen, rmr_srv_endreq); + if (IS_ERR(req)) { + pr_err("Failed to create rmr_req %pe\n", req); + + //TODO: do we have to rtrs_srv_resp_rdma here ? 
+ err = PTR_ERR(req); + goto put_pool; + } + + rmr_req_submit(req); + return 0; + +put_pool: + percpu_ref_put(&pool->ids_inflight_ref); + +no_put: + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + rmr_srv_sess_put_pool(pool); + return err; +} + +int rmr_srv_get_sync_permit(struct rmr_srv_pool *srv_pool) +{ + atomic_inc(&srv_pool->in_flight_sync_reqs); + + while (atomic_read(&srv_pool->in_flight_sync_reqs) >= sync_queue_depth) { + /* Permit overslow; sleep */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_RUNNING) { + atomic_dec(&srv_pool->in_flight_sync_reqs); + + return -EINTR; + } + } + + return 0; +} + +void rmr_srv_put_sync_permit(struct rmr_srv_pool *srv_pool) +{ + atomic_dec(&srv_pool->in_flight_sync_reqs); + + wake_up_process(srv_pool->th_tsk); +} + +static int rmr_srv_sync_map(void *arg) +{ + struct rmr_srv_pool *srv_pool = arg; + struct rmr_pool *pool = srv_pool->pool; + struct rmr_dirty_id_map *map; + rmr_id_t rmr_id; + struct rmr_map_entry *entry; + int err = 0; + u64 i; + + pr_info("Sync thread starting!\n"); + + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + /* + * We do not need to error out here. + * Since no session has ever been added to this pool, + * it technically means this pool is in sync state. + */ + pr_info("No map found for pool %s\n", pool->poolname); + goto out; + } + + rmr_id.a = 1; + for (i = 0; i < map->no_of_chunks; i++) { + if (atomic_read(&srv_pool->thread_state) == SYNC_THREAD_REQ_STOP) { + pr_info("Request to stop sync thread\n"); + err = -EINTR; + goto err; + } + + if (!atomic_read(&srv_pool->store_state) || + atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_WAIT); + pr_err("Pool not in desired state\n"); + /* Unsure what error to return here */ + err = -EINVAL; + goto err; + } + + rmr_id.b = i; + entry = rmr_map_get_dirty_entry(map, rmr_id); + if (entry) { + if (atomic_cmpxchg(&entry->sync_cnt, -1, 0) != -1) { + /* someone has already started sync for this id */ + continue; + } + + err = rmr_srv_sync_chunk_id(srv_pool, entry, rmr_id, true); + if (err) { + /* this is to undo the previous cmpxchg if the error in + * rmr_srv_sync_chunk_id happened before any requests were created + */ + atomic_cmpxchg(&entry->sync_cnt, 0, -1); + pr_err("Failed to sync chunk (%llu, %llu)\n", rmr_id.a, rmr_id.b); + goto err; + } + } + } + + /* + * Finished syncing chunks, + * Now change the thread state to wait, + * to wait for the in flight syncs + */ + atomic_set(&srv_pool->thread_state, SYNC_THREAD_WAIT); + +err: + while (atomic_read(&srv_pool->in_flight_sync_reqs) != 0) { + /* + * Wait for all permits to get freed. + * Since the completion path needs this thread to + * be up and running + */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + //TODO: should it be timeout? 
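+		/*
+		 * rmr_srv_put_sync_permit() wakes this thread for every
+		 * completed sync request, so the loop makes progress without
+		 * a timeout as long as completions keep arriving.
+		 */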
+ } + +out: + atomic_set(&srv_pool->thread_state, SYNC_THREAD_STOPPED); + + pr_info("Sync thread exiting with err %d\n", err); + return err; +} + +int rmr_srv_sync_thread_start(struct rmr_srv_pool *srv_pool) +{ + atomic_set(&srv_pool->in_flight_sync_reqs, 0); + srv_pool->th_tsk = kthread_run(rmr_srv_sync_map, srv_pool, + "rmr_srv_sync_thread"); + if (IS_ERR(srv_pool->th_tsk)) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_STOPPED); + return -ENOMEM; + } + + atomic_set(&srv_pool->thread_state, SYNC_THREAD_RUNNING); + return 0; +} + +int rmr_srv_sync_thread_stop(struct rmr_srv_pool *srv_pool) +{ + if (atomic_read(&srv_pool->thread_state) == SYNC_THREAD_RUNNING) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_REQ_STOP); + wake_up_process(srv_pool->th_tsk); + } + + return 0; +} + +void rmr_srv_sync_req_failed(struct rmr_srv_pool *srv_pool) +{ + /* + * TODO: Investigate the necessity to change server state + * to RMR_SRV_POOL_STATE_NO_IO for sync_req failure. + */ + // rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + rmr_srv_sync_thread_stop(srv_pool); +} + +static void rmr_srv_read_map_buf(struct rmr_pool *pool, void *buf, size_t buflen, + const struct rmr_msg_map_buf_cmd *map_buf_cmd) +{ + int size; + u8 map_idx = map_buf_cmd->map_idx; + u64 slp_idx = map_buf_cmd->slp_idx; + + size = rmr_pool_maps_to_buf(pool, &map_idx, &slp_idx, buf, buflen, MAP_NO_FILTER); + if (size == 0) { + // No more dirty map to write + struct rmr_map_buf_hdr *map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + + map_buf_hdr->version = RMR_MAP_FORMAT_VER; + map_buf_hdr->member_id = 0; + } +} + +static void rmr_srv_update_md_buf(struct rmr_srv_pool *srv_pool, void *buf, size_t buflen) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_pool_md *pool_md = &pool->pool_md; + struct rmr_pool_md *buf_md = (struct rmr_pool_md *)buf; + u8 member_id = srv_pool->member_id; + int idx, buf_idx; + + /* Zero out the buffer in case data is corrupted somehow. */ + memset(buf, 0, buflen); + idx = rmr_pool_find_md(pool_md, member_id, false); + if (idx < 0) { + pr_err("The server pool hasn't updated srv_md yet %d\n", member_id); + return; + } + + buf_idx = rmr_pool_find_md(buf_md, member_id, true); + if (buf_idx < 0) { + pr_err("The buffer has no space for the member_id %d\n", member_id); + return; + } + + memcpy(&buf_md->srv_md[buf_idx], &pool_md->srv_md[idx], sizeof(struct rmr_srv_md)); +} + +static int rmr_srv_save_last_io_to_map(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + int i, j, lock_idx; + + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("no map found for member_id %u\n", srv_pool->member_id); + return -EINVAL; + } + + for (i = 0; i < srv_pool->queue_depth; i++) { + rmr_id_t *id; + struct rmr_dirty_id_map *mp; + + id = &srv_pool->last_io[i]; + + if (id->a == U64_MAX && id->b == U64_MAX) + continue; + + if (rmr_map_check_dirty(map, *id)) { + /* + * We already have this id added to our map, and which says + * that its dirty for us. This means that last_io info about + * this id is outdated. 
+ * We honor the info in the map, and skip this entry + */ + continue; + } + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (j = 0; j < pool->maps_cnt; j++) { + mp = rcu_dereference(pool->maps[j]); + if (WARN_ON(!mp) || mp->member_id == srv_pool->member_id) + continue; + + rmr_map_set_dirty(mp, *id, 0); + + // Clean the entry since it has been used up + id->a = U64_MAX; + id->b = U64_MAX; + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + } + + rmr_srv_mark_maps_dirty(srv_pool); + return 0; +} + +/** + * process_msg_user_cmd() - Process user command + * + * @pool: rmr pool + * @cmd_msg: pointer to command message. The user data is right after this struct. + * @data: data buffer to be passed down the user + * @datalen: length of the user buffer + * + * Description: + * Pass down the user command to the user server side. + * The user command data is kept right after the pool command (see arranging of kvec) + * + * Return: + * 0 in case of success + * negative is case of failure + * + * Context: + * The call goes to the user server side. Care must be taken not to block. + */ +static int process_msg_user_cmd(struct rmr_srv_pool *srv_pool, + const struct rmr_msg_pool_cmd *cmd_msg, void *data, int datalen) +{ + struct rmr_srv_io_store *store = srv_pool->io_store; + size_t usr_len = cmd_msg->user_cmd.usr_len; + int ret; + + pr_debug("%s: cmd_len=%zu usr_len=%zu\n", __func__, sizeof(*cmd_msg), usr_len); + + if (!store) { + pr_err("%s: No store registered\n", __func__); + return -EAGAIN; + } + + ret = store->ops->submit_cmd(store->priv, cmd_msg + 1, usr_len, data, datalen); + + return ret; +} + +static void do_sess_leave_srv_sess(struct rmr_srv_pool_sess *pool_sess) +{ + struct rmr_srv_sess *srv_sess = pool_sess->srv_sess; + + mutex_lock(&srv_sess->lock); + list_del(&pool_sess->srv_sess_entry); + mutex_unlock(&srv_sess->lock); +} + +static void sess_leave_pool(struct rmr_pool *pool, + struct rmr_srv_pool_sess *pool_sess) +{ + struct rmr_srv_sess *srv_sess = pool_sess->srv_sess; + + pr_info("pool sesss %s leaves pool %s\n", + pool_sess->sessname, pool->poolname); + + mutex_lock(&pool->sess_lock); + list_del(&pool_sess->pool_entry); + xa_erase(&srv_sess->pools, pool->group_id); + mutex_unlock(&pool->sess_lock); + + rmr_srv_sysfs_del_sess(pool_sess); + + pool_sess->srv_pool = NULL; +} + +static void rmr_srv_free_pool_sess(struct rmr_srv_pool_sess *pool_sess) +{ + kfree(pool_sess); +} + +static void destroy_sess(struct rmr_srv_sess *srv_sess) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_srv_pool_sess *pool_sess, *tmp; + + // why do they do this in rnbd srv ? 
+	// if (list_empty(&srv_sess->pool_sess_list))
+	//	goto out;
+
+	mutex_lock(&srv_sess->lock);
+	list_for_each_entry_safe(pool_sess, tmp, &srv_sess->pool_sess_list, srv_sess_entry) {
+		list_del(&pool_sess->srv_sess_entry);
+		srv_pool = pool_sess->srv_pool;
+
+		// A network disconnect event
+		if (!pool_sess->sync)
+			rmr_srv_change_pool_state(pool_sess->srv_pool, RMR_SRV_POOL_STATE_NO_IO);
+
+		sess_leave_pool(srv_pool->pool, pool_sess);
+		rmr_put_srv_pool(srv_pool);
+		rmr_srv_free_pool_sess(pool_sess);
+	}
+	mutex_unlock(&srv_sess->lock);
+
+	xa_destroy(&srv_sess->pools);
+	might_sleep();
+
+	mutex_lock(&g_sess_lock);
+	list_del(&srv_sess->g_list_entry);
+	mutex_unlock(&g_sess_lock);
+
+	mutex_destroy(&srv_sess->lock);
+	kfree(srv_sess);
+}
+
+void rmr_srv_destroy_pool(struct rmr_pool *pool)
+{
+	struct rmr_srv_pool_sess *pool_sess, *tmp;
+	struct rmr_srv_pool *srv_pool;
+
+	if (!pool) {
+		pr_err("%s: pool is NULL\n", __func__);
+		return;
+	}
+	srv_pool = (struct rmr_srv_pool *)pool->priv;
+
+	list_for_each_entry_safe(pool_sess, tmp, &pool->sess_list, pool_entry) {
+		WARN_ON(!pool_sess->srv_pool);
+
+		do_sess_leave_srv_sess(pool_sess);
+		sess_leave_pool(srv_pool->pool, pool_sess);
+		rmr_put_srv_pool(srv_pool);
+		rmr_srv_free_pool_sess(pool_sess);
+	}
+}
+
+int rmr_srv_remove_clt_pool(struct rmr_srv_pool *srv_pool)
+{
+	struct rmr_pool *clt;
+
+	clt = srv_pool->clt;
+	if (!clt) {
+		pr_info("Srv pool %s has no internal clt pool assigned\n",
+			srv_pool->pool->poolname);
+		return -EINVAL;
+	}
+
+	pr_info("from pool %s remove sync (internal) pool %s\n",
+		srv_pool->pool->poolname, clt->poolname);
+	srv_pool->clt = NULL;
+
+	rmr_clt_close(clt);
+
+	pr_info("pool %s removed\n", clt->poolname);
+
+	return 0;
+}
+
+static int create_srv_sess(struct rtrs_srv_sess *rtrs)
+{
+	struct rmr_srv_sess *srv_sess;
+	char sessname[NAME_MAX];
+	int err;
+
+	err = rtrs_srv_get_path_name(rtrs, sessname, sizeof(sessname));
+	if (unlikely(err)) {
+		pr_err("rtrs_srv_get_path_name failed: %d\n", err);
+		return err;
+	}
+	srv_sess = kzalloc(sizeof(*srv_sess), GFP_KERNEL);
+	if (!srv_sess)
+		return -ENOMEM;
+
+	mutex_init(&srv_sess->lock);
+	srv_sess->rtrs = rtrs;
+	strscpy(srv_sess->sessname, sessname, NAME_MAX);
+	xa_init_flags(&srv_sess->pools, XA_FLAGS_ALLOC);
+	INIT_LIST_HEAD(&srv_sess->pool_sess_list);
+
+	mutex_lock(&g_sess_lock);
+	list_add(&srv_sess->g_list_entry, &g_sess_list);
+	mutex_unlock(&g_sess_lock);
+
+	rtrs_srv_set_sess_priv(rtrs, srv_sess);
+
+	return 0;
+}
+
+static int rmr_srv_link_ev(struct rtrs_srv_sess *rtrs,
+			   enum rtrs_srv_link_ev ev, void *priv)
+{
+	struct rmr_srv_sess *srv_sess = priv;
+
+	switch (ev) {
+	case RTRS_SRV_LINK_EV_CONNECTED:
+		return create_srv_sess(rtrs);
+
+	case RTRS_SRV_LINK_EV_DISCONNECTED:
+		if (WARN_ON(!srv_sess))
+			return -EINVAL;
+
+		destroy_sess(srv_sess);
+		return 0;
+
+	default:
+		pr_warn("Received unknown rtrs session event %d from session %s\n",
+			ev, srv_sess->sessname);
+		return -EINVAL;
+	}
+}
+
+static struct rmr_srv_pool_sess *__find_sess_in_pool(struct rmr_pool *pool,
+						     const char *sessname)
+{
+	struct rmr_srv_pool_sess *pool_sess;
+
+	list_for_each_entry(pool_sess, &pool->sess_list, pool_entry) {
+		if (!strcmp(pool_sess->sessname, sessname))
+			return pool_sess;
+	}
+
+	return NULL;
+}
+
+static int sess_join_pool(struct rmr_pool *pool, struct rmr_srv_pool_sess *pool_sess)
+{
+	struct rmr_srv_pool_sess *find;
+	struct rmr_srv_sess *srv_sess = pool_sess->srv_sess;
+	int ret = 0;
+
+	mutex_lock(&pool->sess_lock);
+	find =
__find_sess_in_pool(pool, pool_sess->sessname); + if (find) { + ret = -EEXIST; + goto unlock; + } + + ret = xa_err(xa_store(&srv_sess->pools, pool->group_id, pool, GFP_KERNEL)); + if (ret) { + pr_err("can not add pool %s err %d\n", pool->poolname, ret); + goto unlock; + } + pr_info("%s: Added pool %s to rmr_srv_sess %s\n", + __func__, pool->poolname, srv_sess->sessname); + + ret = rmr_srv_sysfs_add_sess(pool, pool_sess); + if (ret) { + pr_err("failed to create sysfs for pool sess %s in pool %s\n", + pool_sess->sessname, pool->poolname); + + xa_erase(&srv_sess->pools, pool->group_id); + goto unlock; + } + list_add(&pool_sess->pool_entry, &pool->sess_list); + +unlock: + mutex_unlock(&pool->sess_lock); + + return ret; +} + +static void do_sess_leave_pool(struct rmr_pool *pool, struct rmr_srv_pool_sess *pool_sess) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + do_sess_leave_srv_sess(pool_sess); + sess_leave_pool(pool, pool_sess); + rmr_put_srv_pool(srv_pool); + rmr_srv_free_pool_sess(pool_sess); +} + +/** + * process_msg_pool_info() - Process a POOL_INFO membership change notification + * + * @pool: Pool which received the command. + * @pool_info_cmd: The received POOL_INFO command carrying member_id, + * operation, mode, and dirty flag. + * + * Dispatches on (operation, mode) pairs notified by the client: + * - ADD + CREATE: a new storage node is joining; add it via + * rmr_srv_handle_other_member_add(). + * - ADD + ASSEMBLE: an existing node is reassembling; verify its map and + * stg_members entry already exist. + * - REMOVE + DELETE: a storage node is permanently leaving; remove its map + * and stg_members entry via rmr_srv_delete_store_member(). + * - REMOVE + DISASSEMBLE: temporary leave; no map changes needed (TODO). + * + * Return: + * 0 on success, negative error code on failure. + */ +static int process_msg_pool_info(struct rmr_pool *pool, + const struct rmr_msg_pool_info_cmd *pool_info_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + int ret = 0; + + pr_info("%s: Server pool %s with member_id %u, received pool_info message\n", + __func__, pool->poolname, srv_pool->member_id); + + if (pool_info_cmd->operation == RMR_POOL_INFO_OP_ADD) { + ret = rmr_srv_handle_other_member_add(srv_pool, pool_info_cmd); + if (ret) { + pr_err("%s: Failed to create maps for other pools: %d\n", + __func__, ret); + return ret; + } + } else if (pool_info_cmd->operation == RMR_POOL_INFO_OP_REMOVE) { + if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_DELETE) { + pr_info("%s: Member %u got remove of member %u with mode delete\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + rmr_srv_delete_store_member(pool, pool_info_cmd->member_id); + } else if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_DISASSEMBLE) { + pr_info("%s: Member %u got remove of member %u with mode disassemble, " + "preserving dirty map\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + /* + * Do NOT remove the dirty map or stg_members entry for the + * disassembled member. IOs arriving after this point will + * continue to accumulate dirty entries for that member via + * the piggyback mechanism, so it can resync on reassembly. 
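+			 * A member is removed permanently only in the DELETE
+			 * branch above via rmr_srv_delete_store_member().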
+ */ + } + } + rmr_srv_flush_pool_md(srv_pool); + + return ret; +} + +static struct rmr_srv_pool_sess *alloc_pool_sess(struct rmr_srv_pool *srv_pool, + struct rmr_srv_sess *srv_sess) +{ + struct rmr_srv_pool_sess *pool_sess; + + pool_sess = kzalloc_node(sizeof(*pool_sess), GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!pool_sess)) { + pr_err("Failed to allocate session for srv pool %s\n", srv_pool->pool->poolname); + return ERR_PTR(-ENOMEM); + } + + strscpy(pool_sess->sessname, srv_sess->sessname, NAME_MAX); + INIT_LIST_HEAD(&pool_sess->pool_entry); + INIT_LIST_HEAD(&pool_sess->srv_sess_entry); + pool_sess->srv_sess = srv_sess; + pool_sess->srv_pool = srv_pool; + + return pool_sess; +} + +/** + * rmr_srv_process_join_create() - Handle the CREATE case of a join_pool message + * + * @pool: The pool being created. + * @join_pool_cmd: The received join_pool command carrying dirty flag and + * per-member info for any pre-existing pool members. + * + * If the client reports that this server's existing data is dirty, marks own + * map fully dirty. Then iterates the per-member list in the message and adds + * each member via rmr_srv_add_store_member(), marking its map dirty if the + * client flagged it. On failure, all members added so far are cleaned up. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_process_join_create(struct rmr_pool *pool, + const struct rmr_msg_join_pool_cmd *join_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + int i, ret; + u8 member_id; + + /* + * Mark our maps dirty if client asked us to. + */ + if (join_pool_cmd->dirty) { + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("%s: No map found for %u\n", + __func__, srv_pool->member_id); + return -EINVAL; + } + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + } + + /* + * Add other storage members in case its a create message. + */ + for (i = 0; i < join_pool_cmd->mem_info.no_of_stor; i++) { + member_id = join_pool_cmd->mem_info.p_mem_info[i].member_id; + + ret = rmr_srv_add_store_member(pool, member_id); + if (ret) { + pr_err("%s: rmr_srv_add_store_member failed %d\n", __func__, ret); + goto cleanup; + } + + if (join_pool_cmd->mem_info.p_mem_info[i].c_dirty) { + map = rmr_pool_find_map(pool, member_id); + if (WARN_ON(!map)) { + xa_erase(&pool->stg_members, member_id); + ret = -EINVAL; + goto cleanup; + } + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + } + } + + return 0; + +cleanup: + while (i--) + rmr_srv_delete_store_member(pool, + join_pool_cmd->mem_info.p_mem_info[i].member_id); + return ret; +} + +static void rmr_srv_process_leave_delete(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + void *entry; + unsigned long id; + + /* + * When we are leaving a pool (not disassembly), we have to, + * 1) Delete dirty entries from all the maps of other storage nodes, since we do not + * need them anymore + * 2) Delete all the maps of other storage nodes. + * + * Map for this storage node is created/deleted during register/unregister. 
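+	 * rmr_srv_delete_store_member() below removes both the dirty map and
+	 * the stg_members entry for each of those other members.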
+ */ + xa_for_each(&pool->stg_members, id, entry) { + if (id == srv_pool->member_id) + continue; + + rmr_srv_delete_store_member(pool, id); + } +} + +static int process_msg_join_pool(struct rmr_pool *pool, struct rmr_srv_sess *srv_sess, + struct rtrs_srv_sess *rtrs, bool sync, + const struct rmr_msg_join_pool_cmd *join_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_pool_sess *pool_sess; + int ret = 0, i; + bool alloced_last_io = false; + + pr_info("Client %s requests to join pool %s (state=%d)\n", + srv_sess->sessname, pool->poolname, atomic_read(&srv_pool->state)); + + mutex_lock(&srv_sess->lock); + + /* + * Here we only do chunk size check, + * to make sure different storage nodes do not use different chunk sizes. + */ + if (join_pool_cmd->chunk_size && pool->chunk_size != join_pool_cmd->chunk_size) { + pr_err("pool %s has chunksize %u != msg chunksize %u\n", + pool->poolname, pool->chunk_size, join_pool_cmd->chunk_size); + ret = -EINVAL; + goto unlock; + } + + mutex_lock(&srv_pool->srv_pool_lock); + if (atomic_read(&srv_pool->state) == RMR_SRV_POOL_STATE_EMPTY) { + pr_err("%s: pool %s has no store registered; join rejected\n", + __func__, pool->poolname); + ret = -EINVAL; + goto unlock_srv_pool_lock; + } + + if (!sync) { + if (join_pool_cmd->create) { + if (srv_pool->last_io || srv_pool->last_io_idx) { + pr_err("%s: pool %s already has last_io buffer allocated\n", + __func__, pool->poolname); + ret = -EEXIST; + goto unlock_srv_pool_lock; + } + + if (!srv_pool->marked_create) { + pr_err("%s: pool %s not in create state\n", + __func__, pool->poolname); + ret = -EINVAL; + goto unlock_srv_pool_lock; + } + } else if (srv_pool->marked_create) { + pr_err("%s: pool %s should not be in create state\n", + __func__, pool->poolname); + ret = -EINVAL; + goto unlock_srv_pool_lock; + } + } + + pool_sess = alloc_pool_sess(srv_pool, srv_sess); + if (IS_ERR(pool_sess)) { + pr_err("failed to allc pool_sees for pool %s sev_sess %s: %pe\n", + pool->poolname, srv_sess->sessname, pool_sess); + ret = PTR_ERR(pool_sess); + goto unlock_srv_pool_lock; + } + srv_pool->queue_depth = join_pool_cmd->queue_depth; + + ret = sess_join_pool(pool, pool_sess); + if (ret) { + pr_err("Failed to join pool\n"); + goto free_sess; + } + pool_sess->sync = sync; + + if (!pool_sess->sync && !srv_pool->last_io) { + /* Joining for the first time */ + srv_pool->last_io = kcalloc(srv_pool->queue_depth, sizeof(*srv_pool->last_io), + GFP_KERNEL); + if (!srv_pool->last_io) { + pr_err("Memory allocation failed for srv_pool->last_io\n"); + ret = -ENOMEM; + goto sess_leave; + } + alloced_last_io = true; + + /* The previous last_io buffer exists. */ + if (srv_pool->last_io_idx) { + memcpy(srv_pool->last_io, srv_pool->last_io_idx, + rmr_last_io_len(srv_pool->queue_depth)); + } else { + for (i = 0; i < srv_pool->queue_depth; i++) { + srv_pool->last_io[i].a = U64_MAX; + srv_pool->last_io[i].b = U64_MAX; + } + + srv_pool->last_io_idx = kcalloc(srv_pool->queue_depth, + sizeof(*srv_pool->last_io_idx), GFP_KERNEL); + if (!srv_pool->last_io_idx) { + ret = -ENOMEM; + goto free_last_io; + } + } + pr_info("Allocated %ld B last_io buffer for pool %s\n", + srv_pool->queue_depth * sizeof(*srv_pool->last_io), pool->poolname); + } + + /* + * Join/Rejoin messages from sync sessions do not affect our state. 
+ * + * For non-sync sessions, if our state is NO_IO, pserver can either send a, + * - rejoin message in case our state NO_IO due to network/IO issue + * - join message in case pserver crashed + * hence, no state transition is needed. + */ + if (!pool_sess->sync) { + if (join_pool_cmd->create) { + /* + * First-time pool creation: set up member info and maps, + * then move to CREATED awaiting enable_pool(1). + */ + ret = rmr_srv_process_join_create(pool, join_pool_cmd); + if (ret) { + pr_err("%s: rmr_srv_process_join_create failed %d\n", + __func__, ret); + goto free_last_io; + } + + /* + * In the CREATE path pool_md has only magic set; all other + * header fields are normally populated later by + * RMR_CMD_SEND_MD_BUF. Initialise them now so that + * queue_depth (and the bitmap/last_io offsets derived from + * it) are correct before the first on-demand map flush fires. + */ + pool->pool_md.queue_depth = join_pool_cmd->queue_depth; + pool->pool_md.chunk_size = pool->chunk_size; + pool->pool_md.mapped_size = pool->mapped_size; + pool->pool_md.group_id = pool->group_id; + strscpy(pool->pool_md.poolname, pool->poolname, + sizeof(pool->pool_md.poolname)); + rmr_srv_mark_pool_md_dirty(srv_pool); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + + ret = rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_CREATED); + if (ret < 0) + goto leave_delete; + + srv_pool->marked_create = false; + } else if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NO_IO) { + /* + * Assemble or rejoin: a map update is needed before IOs + * can resume, so move to NO_IO. If we are already in + * NO_IO (e.g. pserver reconnecting after a network event + * that already drove us there), no transition is needed. + */ + ret = rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + if (ret < 0) + goto leave_delete; + } + } + + mutex_unlock(&srv_pool->srv_pool_lock); + + rmr_get_srv_pool(srv_pool); + list_add_tail(&pool_sess->srv_sess_entry, &srv_sess->pool_sess_list); + + mutex_unlock(&srv_sess->lock); + + return 0; + +leave_delete: + if (!pool_sess->sync && join_pool_cmd->create) + rmr_srv_process_leave_delete(pool); +free_last_io: + if (alloced_last_io) { + kfree(srv_pool->last_io); + srv_pool->last_io = NULL; + + kfree(srv_pool->last_io_idx); + srv_pool->last_io_idx = NULL; + } +sess_leave: + sess_leave_pool(pool, pool_sess); +free_sess: + rmr_srv_free_pool_sess(pool_sess); +unlock_srv_pool_lock: + mutex_unlock(&srv_pool->srv_pool_lock); +unlock: + mutex_unlock(&srv_sess->lock); + return ret; +} + +void rmr_srv_stop_sync_and_go_offline(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_REQ_STOP); + wake_up_process(srv_pool->th_tsk); + + while (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(1000)); + } + } +} + +static int process_msg_leave_pool(struct rmr_pool *pool, struct rmr_srv_sess *sess, bool sync, + const struct rmr_msg_leave_pool_cmd *leave_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_pool_sess *pool_sess; + u64 last_io_len; + int ret = 0; + void *buf; + + pr_info("Session %s requests to leave pool %d\n", sess->sessname, + leave_pool_cmd->member_id); + + if (srv_pool->member_id != leave_pool_cmd->member_id) { + 
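+		/*
+		 * The leave request is addressed to a different member; this
+		 * node only acts on requests carrying its own member_id.
+		 */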
pr_err("%s: For sess %s, Srv pool member_id %d, Message member_id %d\n", + __func__, sess->sessname, srv_pool->member_id, leave_pool_cmd->member_id); + return -ENOENT; + } + + mutex_lock(&pool->sess_lock); + pool_sess = __find_sess_in_pool(pool, sess->sessname); + if (!pool_sess) { + mutex_unlock(&pool->sess_lock); + pr_err("Session %s is not in pool %s\n", sess->sessname, + pool->poolname); + return -ENOENT; + } + mutex_unlock(&pool->sess_lock); + + do_sess_leave_pool(pool, pool_sess); + + mutex_lock(&srv_pool->srv_pool_lock); + srv_pool->marked_delete = leave_pool_cmd->delete; + mutex_unlock(&srv_pool->srv_pool_lock); + + if (!sync) { + /* + * Stop the sync thread if its running, and go offline. + */ + rmr_srv_stop_sync_and_go_offline(pool); + + if (leave_pool_cmd->delete) { + rmr_srv_process_leave_delete(pool); + } else { + /* + * Disassemble: flush the dirty map to disk first so that + * the on-disk map reflects all dirty entries accumulated + * up to this point. On reassembly the map is read back + * and used to drive resync of any members that missed IOs. + */ + rmr_srv_md_maps_sync(pool); + + /* + * Clear last_io and persist it to disk so that it is not + * used after reassembly. Note: maps are always flushed + * above regardless of whether last_io is valid; the two + * operations are independent. + */ + last_io_len = rmr_last_io_len(pool->pool_md.queue_depth); + + if (!srv_pool->last_io || !last_io_len) + goto change_state; + + memset(srv_pool->last_io, 0, last_io_len); + if (srv_pool->last_io_idx) + memset(srv_pool->last_io_idx, 0, last_io_len); + + buf = kzalloc(last_io_len, GFP_KERNEL); + if (!buf) + goto change_state; + + ret = process_md_io(pool, NULL, + RMR_LAST_IO_OFFSET, + last_io_len, + RMR_OP_MD_WRITE, buf); + if (ret) { + pr_err("%s: For pool %s process_md_io failed\n", + __func__, pool->poolname); + } + kfree(buf); + } + +change_state: + /* + * All sessions have left. Transition back to REGISTERED if the + * backend store is still present, or to EMPTY if it is not. + */ + mutex_lock(&srv_pool->srv_pool_lock); + if (srv_pool->io_store) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_REGISTERED); + else + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_EMPTY); + mutex_unlock(&srv_pool->srv_pool_lock); + } + + return 0; +} + +static int process_msg_map_clear(struct rmr_srv_sess *srv_sess, + const void *usr) +{ + const struct rmr_msg_io *msg = usr; + struct rmr_pool *pool; + rmr_id_t id; + unsigned long key; + struct rmr_map_entry *entry; + struct rmr_dirty_id_map *map; + u8 member_id; + int err = 0; + u32 group_id = le32_to_cpu(msg->hdr.group_id); + + id.a = le64_to_cpu(msg->id_a); + id.b = le64_to_cpu(msg->id_b); + key = rmr_id_to_key(id); + member_id = msg->member_id; + + pr_debug("received map clear msg, id (%llu, %llu), member_id %u\n", + id.a, id.b, member_id); + + pool = rmr_srv_sess_get_pool(srv_sess, group_id); + if (IS_ERR(pool)) { + pr_err_ratelimited("Got I/O request on session %s for unknown pool: %pe\n", + srv_sess->sessname, pool); + return PTR_ERR(pool); + } + + map = rmr_pool_find_map(pool, member_id); + if (!map) { + pr_err("no map found for member_id %u\n", member_id); + err = -EINVAL; + goto put_pool; + //TODO: handle this , probably initialize map, or just throw err? + } + + entry = rmr_map_unset_dirty(map, id, MAP_NO_FILTER); + if (entry) { + /* We do not need any rcu protection here since it is deleted by the other + * rmr server. And sync can only be done for entries that are + * dirty for this particaular server. 
+ */ + kmem_cache_free(rmr_map_entry_cachep, entry); + } + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + +put_pool: + rmr_srv_sess_put_pool(pool); + return err; +} + +static int process_msg_map_add(struct rmr_srv_sess *srv_sess, + const void *usr) +{ + const struct rmr_msg_io *msg = usr; + struct rmr_pool *pool; + int i, ret = 0; + struct rmr_dirty_id_map *map; + u32 group_id = le32_to_cpu(msg->hdr.group_id); + + pr_debug("received map add member_id %u, id (%llu %llu)\n", + msg->member_id, msg->id_a, msg->id_b); + + pool = rmr_srv_sess_get_pool(srv_sess, group_id); + if (IS_ERR(pool)) { + pr_err_ratelimited("Got I/O request on session %s for unknown pool: %pe\n", + srv_sess->sessname, pool); + return PTR_ERR(pool); + } + + for (i = 0; i < msg->failed_cnt; i++) { + u64 msg_map_ver = le64_to_cpu(msg->map_ver); + rmr_id_t id; + + map = rmr_pool_find_map(pool, msg->failed_id[i]); + if (!map) { + pr_err("no map found for member_id %u\n", msg->failed_id[i]); + ret = -EINVAL; + goto put_pool; + } + + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + id.a = le64_to_cpu(msg->id_a); + id.b = le64_to_cpu(msg->id_b); + rmr_map_set_dirty(map, id, 0); + + if (msg_map_ver > pool->map_ver) + pool->map_ver = msg_map_ver; + } + if (msg->failed_cnt) { + rmr_srv_mark_pool_md_dirty((struct rmr_srv_pool *)pool->priv); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + } + +put_pool: + rmr_srv_sess_put_pool(pool); + + return ret; +} + +/** + * rmr_srv_set_pool_mm() - Set the rmr srv pool to maintenance mode + * + * @srv_pool: The rmr srv pool to set in maintenance mode + * + * Description: + * While in maintenance mode, we do not serve IOs either, so we set state to NO_IO + * + * Return: + * 0 on success + * Error value on failure + */ +static int rmr_srv_set_pool_mm(struct rmr_srv_pool *srv_pool) +{ + srv_pool->maintenance_mode = true; + + return rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); +} + +/** + * rmr_srv_unset_pool_mm() - Clear the rmr srv pool maintenance mode + * + * @srv_pool: The rmr srv pool to clear maintenance mode of + * + * Description: + * While in maintenance mode, we do not serve IOs either, so we set state to NO_IO + * + * Return: + * 0 on success + * Error value on failure + */ +static int rmr_srv_unset_pool_mm(struct rmr_srv_pool *srv_pool) +{ + srv_pool->maintenance_mode = false; + rmr_srv_flush_pool_md(srv_pool); + + return 0; +} + +static int process_msg_enable_pool(struct rmr_pool *pool, struct rmr_srv_sess *sess, bool sync, + const struct rmr_msg_enable_pool_cmd *enable_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + enum rmr_srv_pool_state old_state = atomic_read(&srv_pool->state); + int ret = 0; + + /* + * Enable/Disable messages from sync sessions do not affect us. + */ + if (sync) { + pr_info("%s: From sync sess %s, for pool %s\n", __func__, sess->sessname, + pool->poolname); + return 0; + } + + pr_info("Client %s requests to set enable=%d pool %s current state %s\n", + sess->sessname, enable_pool_cmd->enable, pool->poolname, + rmr_get_srv_pool_state_name(old_state)); + + /* + * Enable when not in maintenance mode, can be handled simply + */ + if (enable_pool_cmd->enable && !srv_pool->maintenance_mode) { + /* + * CREATED -> NORMAL: initial enable after create-mode join. + * NO_IO -> NORMAL: was_last_authoritative recovery (pserver + * enables this node directly without a map update because its + * dirty map is already authoritative). 
+ */ + if (old_state != RMR_SRV_POOL_STATE_CREATED && + old_state != RMR_SRV_POOL_STATE_NO_IO) { + pr_err("%s: pool %s cannot be enabled in state %s\n", + __func__, pool->poolname, + rmr_get_srv_pool_state_name(old_state)); + return -EINVAL; + } + + ret = rmr_srv_set_pool_state_normal(srv_pool); + if (ret < 0) + goto out_err; + + return 0; + } + + /* + * Any other case involves considering maintenance mode settings + */ + if (!enable_pool_cmd->enable) { + if (old_state != RMR_SRV_POOL_STATE_NORMAL && + old_state != RMR_SRV_POOL_STATE_NO_IO) { + pr_err("%s: pool %s can only disable from NORMAL or NO_IO state (current: %s)\n", + __func__, pool->poolname, + rmr_get_srv_pool_state_name(old_state)); + return -EINVAL; + } + ret = rmr_srv_set_pool_mm(srv_pool); + } else { + ret = rmr_srv_unset_pool_mm(srv_pool); + } + + if (ret < 0) + goto out_err; + + return 0; + +out_err: + /* + * Put srv pool state to old one + */ + atomic_set(&srv_pool->state, old_state); + return ret; +} + +/** + * process_msg_map_ready() - Process RMR_CMD_MAP_READY command + * + * @pool: Pool which received the command + * @sync: Whether the command was sent from an internal (sync) rmr-client or not + * + * Return: + * 0 on success + * Negative errno on failure + * + * Description: + * A RMR_CMD_MAP_READY command is the first command that is sent to a storage node which will + * receive a map from another storage node as part of a map update. + * + * It checks whether this storage node is ready and in an expected state to receive a map. + */ +static int process_msg_map_ready(struct rmr_pool *pool, bool sync) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + int i, err = 0, pool_state; + + mutex_lock(&srv_pool->srv_pool_lock); + pool_state = atomic_read(&srv_pool->state); + + /* A map update from another storage node is not allowed. */ + if (sync) { + pr_err("%s: (sync) Cannot receive map from other storage nodes\n", __func__); + err = -EINVAL; + goto out; + } + + /* + * A map update from pserver should start only when in, + * NO_IO - after a network/IO error + * CREATED - For extend (This is not nice. + * Extend should inform the storage node that it is being + * used for an extend leg for an already existing node, and + * the state should be set accordingly. So that we can allow + * this only when in NO_IO state.) + */ + if (pool_state != RMR_SRV_POOL_STATE_NO_IO && pool_state != RMR_SRV_POOL_STATE_CREATED) { + pr_err("(non-sync) pool state not correct %d", pool_state); + err = -EINVAL; + goto out; + } + + /* + * We seem to be in process of another map update. + */ + if (srv_pool->map_update_state != MAP_UPDATE_STATE_DISABLED) { + pr_err("rmr_srv_send_map Map update already in progress\n"); + err = -EINVAL; + goto out; + } + + /* + * If pserver is instructing us to receive a map, then the map we + * hold is meaningless. 
+ */ + mutex_lock(&pool->maps_lock); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference_protected(pool->maps[i], + lockdep_is_held(&pool->maps_lock)); + if (!map) + continue; + + rmr_map_unset_dirty_all(map); + } + mutex_unlock(&pool->maps_lock); + rmr_srv_mark_maps_dirty(srv_pool); + + srv_pool->map_update_state = MAP_UPDATE_STATE_READY; + + pr_info("%s: process_msg_cmd: moved to MAP_UPDATE_STATE_READY\n", __func__); + +out: + mutex_unlock(&srv_pool->srv_pool_lock); + return err; +} + +/** + * process_msg_cmd_handler() - Processes rmr command message + * + * @work: scheduled work structure + * + * Description: + * The command messages being processed here, can be broadly divided into 2 categories. + * Ones which are able to use the rsp buffer to send back status. + * Ones which cannot use the rsp buffer to send back status. These ones use the rsp buffer + * for other purposes; like sending map data, or read user rsp buffer. + * + * Context: + * Execution time depends on the command. It may take a long time for commands which sends + * data (map). + */ +static void process_msg_cmd_handler(struct work_struct *work) +{ + struct rmr_cmd_work_info *work_info = container_of(work, struct rmr_cmd_work_info, cmd_work); + struct rmr_pool *pool = work_info->pool; + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_sess *sess = work_info->sess; + struct rtrs_srv_sess *rtrs = work_info->rtrs; + const struct rmr_msg_pool_cmd *cmd_msg = work_info->cmd_msg; + struct rmr_dirty_id_map *map; + u8 sync, flags; + u64 src_mapped_size; + int md_i, err = 0; + + /* + * The switch cases below are used by either map sending node, + * or the node which is to receive the map, but not both. + */ + switch (cmd_msg->cmd_type) { + case RMR_CMD_REJOIN_POOL: + /* + * For now, we do not have any difference between joinand + * rejoin on the storage server side + */ + case RMR_CMD_JOIN_POOL: + /* + * Server node, received a request for a new session + */ + err = process_msg_join_pool(pool, sess, rtrs, cmd_msg->sync, + &cmd_msg->join_pool_cmd); + if (err) { + pr_err("process_msg_join_pool failed with err %d\n", err); + goto out; + } + work_info->rsp->join_pool_cmd_rsp.chunk_size = pool->chunk_size; + + if (pool->mapped_size) { + work_info->rsp->join_pool_cmd_rsp.mapped_size = pool->mapped_size; + pr_info("srv pool %s sets mapped size %llu\n", + pool->poolname, pool->mapped_size); + } else + work_info->rsp->join_pool_cmd_rsp.mapped_size = 0; + + break; + case RMR_CMD_POOL_INFO: + /* + * Server node, received pool info command + */ + err = process_msg_pool_info(pool, &cmd_msg->pool_info_cmd); + if (err) { + pr_err("process_msg_pool_info failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_LEAVE_POOL: + err = process_msg_leave_pool(pool, sess, cmd_msg->sync, &cmd_msg->leave_pool_cmd); + if (err) { + pr_err("process_msg_leave_pool failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_ENABLE_POOL: + err = process_msg_enable_pool(pool, sess, cmd_msg->sync, &cmd_msg->enable_pool_cmd); + if (err) { + pr_err("process_msg_enable_pool failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_MAP_READY: + /* + * Map receiving node. + * Getting ready to receive dirty map + */ + pr_info("%s: RMR_CMD_MAP_READY\n", __func__); + + err = process_msg_map_ready(pool, cmd_msg->sync); + if (err) { + pr_err("process_msg_map_ready failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_MAP_SEND: + /* + * Map sending node. 
+ * Send map to the node with member_id == map_send_cmd->receiver_member_id + */ + pr_info("%s: RMR_CMD_MAP_SEND\n", __func__); + + err = rmr_clt_send_map(pool, srv_pool->clt, &cmd_msg->map_send_cmd, MAP_NO_FILTER); + if (err) { + pr_err("rmr_clt_send_map failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_SEND_MAP_BUF: + /* + * Map receiving node. + * Received the map from another node. Save it. + */ + pr_info("%s: RMR_CMD_SEND_MAP_BUF\n", __func__); + + if (srv_pool->map_update_state != MAP_UPDATE_STATE_READY) { + pr_err("rmr_srv_send_map Node not ready to receive map\n"); + err = -EINVAL; + goto out; + } + + err = rmr_pool_save_map(pool, work_info->data, work_info->datalen, + false); + if (err) { + if (!cmd_msg->sync) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + + pr_err("rmr_pool_save_map failed\n"); + goto out; + } + break; + case RMR_CMD_MAP_BUF_DONE: + /* + * Map receiving node. + * A confirmation that all map updates have been sent. + */ + pr_info("%s: RMR_CMD_MAP_BUF_DONE\n", __func__); + + if (srv_pool->map_update_state != MAP_UPDATE_STATE_READY) { + pr_err("rmr_srv_send_map Node state not correct\n"); + err = -EINVAL; + goto out; + } + + if (cmd_msg->map_buf_done_cmd.map_version < pool->map_ver) { + pr_err("Map version received (%llu) is older than ours (%llu)\n", + cmd_msg->map_buf_done_cmd.map_version, pool->map_ver); + err = -EINVAL; + goto out; + } + + pool->map_ver = cmd_msg->map_buf_done_cmd.map_version; + rmr_srv_mark_pool_md_dirty(srv_pool); + + srv_pool->map_update_state = MAP_UPDATE_STATE_DONE; + + break; + case RMR_CMD_MAP_DONE: + /* + * Map receiving node. + * A confirmation from the client, that map update was done successfully or not. + */ + pr_info("%s: RMR_CMD_MAP_DONE\n", __func__); + + if (srv_pool->map_update_state != MAP_UPDATE_STATE_DONE) { + pr_err("rmr_srv_send_map Map not updated succesfully\n"); + err = -EINVAL; + } + + /* + * On a successful map update, we go to NORMAL state. + * + * map_done_cmd.enable says whether this map update should make us go to + * NORMAL state or not. This is controlled by the pserver. + */ + if (cmd_msg->map_done_cmd.enable) { + if (rmr_srv_set_pool_state_normal(srv_pool) < 0) + err = -EINVAL; + } + + srv_pool->map_update_state = MAP_UPDATE_STATE_DISABLED; + break; + case RMR_CMD_MAP_DISABLE: + /* + * Something went wrong on the client side; we need to reset everything. + */ + pr_info("%s: RMR_CMD_MAP_DISABLE\n", __func__); + + if (!cmd_msg->sync) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + + srv_pool->map_update_state = MAP_UPDATE_STATE_DISABLED; + break; + case RMR_CMD_READ_MAP_BUF: + /* + * Pserver wants to read our dirty map. So send it. 
+ */ + pr_info("%s: RMR_CMD_READ_MAP_BUF\n", __func__); + + rmr_srv_read_map_buf(pool, work_info->data, work_info->datalen, + &cmd_msg->map_buf_cmd); + + goto out_no_rsp; + case RMR_CMD_MAP_CHECK: + pr_debug("%s: RMR_CMD_MAP_CHECK\n", __func__); + + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_debug("srv pool %s is not in normal state, cannot do map check\n", + pool->poolname); + work_info->rsp->value = false; + break; + } + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("pool %s no map found for member_id %u\n", + pool->poolname, srv_pool->member_id); + err = -EINVAL; + goto out; + } + work_info->rsp->value = rmr_map_empty(map); + pr_debug("pool %s member_id %d rsp with map_empty=%llu\n", + pool->poolname, srv_pool->member_id, + work_info->rsp->value); + + break; + + case RMR_CMD_LAST_IO_TO_MAP: + /* + * Use the last_io list, and add those IOs as dirty IDs to the map + * for every other storage server other than this one. + */ + pr_info("%s: RMR_CMD_LAST_IO_TO_MAP\n", __func__); + err = rmr_srv_save_last_io_to_map(pool); + if (err) { + pr_err("rmr_srv_save_last_io_to_map failed\n"); + goto out; + } + + break; + + case RMR_CMD_MAP_TEST: + /* + * Received the map test from another node. + * Check that we have everything that other node has. + */ + pr_info("%s: RMR_CMD_MAP_TEST\n", __func__); + + err = rmr_pool_save_map(pool, work_info->data, work_info->datalen, true); + if (err) { + pr_err("rmr_srv_save_map failed, test_only, err %d\n", err); + } + goto out_no_rsp; + case RMR_CMD_MD_SEND: + /* + * Received the message to copy metadata of server pool to the sender. + */ + src_mapped_size = cmd_msg->md_send_cmd.src_mapped_size; + pr_debug("stg %u: receives md_update message from pool %u\n", + srv_pool->member_id, cmd_msg->md_send_cmd.leader_id); + + /* Check the pool mapped_sizes are consistent or not */ + if (pool->mapped_size && src_mapped_size && pool->mapped_size != src_mapped_size) { + pr_err_ratelimited("This %s mapped_size %llu != src %d mapped_size %llu\n", + pool->poolname, pool->mapped_size, cmd_msg->md_send_cmd.leader_id, + src_mapped_size); + goto out; + } + + if (cmd_msg->md_send_cmd.read_full_md) { + if (work_info->datalen < sizeof(struct rmr_pool_md)) { + pr_err("%s: buffer too small for full pool_md (%zu < %zu)\n", + __func__, work_info->datalen, + sizeof(struct rmr_pool_md)); + err = -EINVAL; + goto out; + } + memcpy(work_info->data, &pool->pool_md, sizeof(struct rmr_pool_md)); + } else { + /* If updating buf incurs error, it simply waits for next md_update. */ + rmr_srv_update_md_buf(srv_pool, work_info->data, work_info->datalen); + } + + break; + case RMR_CMD_SEND_MD_BUF: + /* + * Received the client pool metadata. Save it. + */ + sync = cmd_msg->send_md_buf_cmd.sync; + flags = cmd_msg->send_md_buf_cmd.flags; + if (flags == RMR_OP_MD_WRITE) { + err = rmr_srv_md_process_buf(pool, work_info->data, sync); + if (err) { + pr_err("rmr_srv_write_md failed\n"); + goto out; + } + + if (atomic_read(&srv_pool->store_state)) { + /* write back to disk */ + err = process_md_io(pool, NULL, 0, work_info->datalen, flags, + &pool->pool_md); + if (err) { + pr_err("Failed to process md io\n"); + goto out; + } + } + } + + if (!sync && flags == RMR_OP_MD_READ) + memcpy(work_info->data, &pool->pool_md, sizeof(struct rmr_pool_md)); + + break; + case RMR_CMD_SEND_DISCARD: + /* Received the message to handle discards. 
*/ + pr_info("%s: RMR_CMD_SEND_DISCARD for srv %u\n", + __func__, cmd_msg->send_discard_cmd.member_id); + if (!cmd_msg->sync) { + err = rmr_pool_md_check_discard(pool, cmd_msg->send_discard_cmd.member_id); + if (err > 0) { + /* This node has received discards. */ + err = 0; + pr_info("pool %s member_id %d has received discards\n", + pool->poolname, srv_pool->member_id); + goto out; + } + } + + /* + * For sync requests, even if the server that is not in normal state has received + * the discard request, its dirty map is still outdated. However, non-sync + * requests can overlook this check and proceed discarding directly. + */ + if (cmd_msg->sync && atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL){ + pr_err("srv pool %s not in normal state for sync discard request\n", + pool->poolname); + err = -EINVAL; + goto out; + } + + err = rmr_srv_discard_id(pool, 0, 0, cmd_msg->send_discard_cmd.member_id, + cmd_msg->sync); + if (err) + pr_err("Failed to discard id\n"); + + break; + case RMR_CMD_STORE_CHECK: + pr_debug("%s: RMR_CMD_STORE_CHECK\n", __func__); + + work_info->rsp->value = rmr_srv_pool_check_store(pool); + pr_debug("pool %s member_id %d rsp with value=%llu\n", + pool->poolname, srv_pool->member_id, + work_info->rsp->value); + + break; + case RMR_CMD_MAP_GET_VER: + pr_debug("%s: RMR_CMD_MAP_GET_VER\n", __func__); + + work_info->rsp->value = pool->map_ver; + pr_debug("pool %s member_id %d rsp with value=%llu\n", + pool->poolname, srv_pool->member_id, + work_info->rsp->value); + + break; + case RMR_CMD_MAP_SET_VER: + pr_debug("%s: RMR_CMD_MAP_SET_VER\n", __func__); + + pool->map_ver = work_info->cmd_msg->set_map_ver_cmd.map_ver; + rmr_srv_mark_pool_md_dirty(srv_pool); + break; + case RMR_CMD_DISCARD_CLEAR_FLAG: + pr_info("%s: RMR_CMD_DISCARD_CLEAR_FLAG\n", __func__); + + md_i = rmr_pool_find_md(&pool->pool_md, cmd_msg->send_discard_cmd.member_id, false); + if (md_i < 0) { + pr_info("Didn't find md for member_id %u\n", + cmd_msg->send_discard_cmd.member_id); + goto out; + } + + pool->pool_md.srv_md[md_i].discard_entries = false; + rmr_srv_flush_pool_md(srv_pool); + break; + case RMR_CMD_USER: + pr_debug("%s: RMR_CMD_USER\n", __func__); + + err = process_msg_user_cmd(srv_pool, cmd_msg, work_info->data, work_info->datalen); + if (err) { + pr_err("process_msg_user_cmd failed with err %d\n", err); + goto out_no_rsp; + } + + goto out_no_rsp; + default: + pr_warn("%s: switch default type: %d\n", __func__, cmd_msg->cmd_type); + + err = -EINVAL; + } + +out: + work_info->rsp->err = err; + work_info->rsp->member_id = srv_pool->member_id; + work_info->rsp->cmd_type = cmd_msg->cmd_type; + +out_no_rsp: + // Should we return err in rdma_resp ? 
+ pr_debug("send rtrs completion from msg_cmd_handler, err:%d\n", err); + rtrs_srv_resp_rdma(work_info->rtrs_op, err); + + rmr_put_srv_pool(srv_pool); + kfree(work_info); +} + +static int schedule_process_msg_cmd(struct rmr_srv_sess *srv_sess, + struct rtrs_srv_op *rtrs_op, + void *data, size_t datalen, + const void *msg, size_t len) +{ + struct rmr_srv_pool *srv_pool; + const struct rmr_msg_pool_cmd *cmd_msg = msg; + const char *poolname = cmd_msg->pool_name; + struct rmr_cmd_work_info *work_info; + u32 group_id = le32_to_cpu(cmd_msg->hdr.group_id); + + pr_debug("pool %s received cmd %d\n", + poolname, cmd_msg->cmd_type); + + srv_pool = rmr_find_and_get_srv_pool(group_id); + if (IS_ERR(srv_pool)) { + pr_err("Cmd %s: pool %s does not exists: %pe\n", + rmr_get_cmd_name(cmd_msg->cmd_type), poolname, srv_pool); + return PTR_ERR(srv_pool); + } + + pr_debug("process_msg_cmd: pool %s found\n", poolname); + + work_info = kzalloc(sizeof(struct rmr_cmd_work_info), GFP_KERNEL); + if (!work_info) { + pr_err("failed to allocate work info to send map\n"); + rmr_put_srv_pool(srv_pool); + return -ENOMEM; + } + work_info->pool = srv_pool->pool; + work_info->sess = srv_sess; + work_info->rtrs = srv_sess->rtrs; + work_info->rtrs_op = rtrs_op; + work_info->cmd_msg = cmd_msg; + work_info->rsp = data; + work_info->data = data; + work_info->datalen = datalen; + + INIT_WORK(&work_info->cmd_work, process_msg_cmd_handler); + schedule_work(&work_info->cmd_work); + + return 0; +} + +static int rmr_srv_rdma_ev(void *priv, struct rtrs_srv_op *id, + void *data, size_t datalen, + const void *usr, size_t usrlen) +{ + struct rmr_srv_sess *srv_sess = priv; + const struct rmr_msg_hdr *hdr = usr; + int ret = 0; + u16 type; + + if (unlikely(WARN_ON(!srv_sess))) + return -ENODEV; + + type = le16_to_cpu(hdr->type); + + switch (type) { + case RMR_MSG_IO: + return process_msg_io(srv_sess, id, data, datalen, + usr, usrlen); + case RMR_MSG_MAP_CLEAR: + ret = process_msg_map_clear(srv_sess, usr); + break; + case RMR_MSG_MAP_ADD: + ret = process_msg_map_add(srv_sess, usr); + break; + case RMR_MSG_CMD: + return schedule_process_msg_cmd(srv_sess, id, data, datalen, + usr, usrlen); + default: + pr_warn("Received unexpected message type %d from session %s\n", + type, srv_sess->sessname); + return -EINVAL; + } + + rtrs_srv_resp_rdma(id, ret); + + return 0; +} + +/** + * rmr_srv_check_params() - Check the parameters of the storage node + * + * @srv_pool: The rmr srv pool to check parameters for + * + * Description: + * Checks the device params with other connected server nodes. + * + * Return: + * 0 on success. + * -Negative error code on failure. + */ +int rmr_srv_check_params(struct rmr_srv_pool *srv_pool) +{ + void *dev; + int err; + + /* If the store has not been added to this server pool, ignore device param checks. */ + if (!srv_pool->io_store) + return 0; + + dev = srv_pool->io_store->priv; + err = srv_pool->io_store->ops->get_params(dev); + if (err) { + pr_err("%s: store get_params failed for pool %s, err %d\n", + __func__, srv_pool->pool->poolname, err); + return err; + } + return 0; +} +EXPORT_SYMBOL(rmr_srv_check_params); + +static struct rtrs_srv_ops rtrs_ops; +static int __init rmr_srv_init_module(void) +{ + int err; + + if (!is_power_of_2(chunk_size) || + chunk_size < MIN_CHUNK_SIZE || chunk_size > MAX_CHUNK_SIZE) { + pr_err("Loading module %s failed. 
Invalid chunk_size %u\n",
+ KBUILD_MODNAME, chunk_size);
+ pr_err("Chunk size should be a power of 2, and between (min %u - max %u)\n",
+ MIN_CHUNK_SIZE, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+
+ pr_info("Loading module %s, version %s, proto %s, chunk_size %u\n",
+ KBUILD_MODNAME, RMR_VER_STRING, RMR_PROTO_VER_STRING, chunk_size);
+
+ rtrs_ops = (struct rtrs_srv_ops){
+ .rdma_ev = rmr_srv_rdma_ev,
+ .link_ev = rmr_srv_link_ev,
+ };
+
+ rmr_req_cachep = kmem_cache_create("rmr_req_cachep", sizeof(struct rmr_srv_req),
+ 0, 0, NULL);
+ if (!rmr_req_cachep) {
+ pr_err("cannot allocate cachep for rmr_req\n");
+ err = -ENOMEM;
+ goto out;
+ }
+ rmr_map_entry_cachep = kmem_cache_create("rmr_map_entry_cachep",
+ sizeof(struct rmr_map_entry),
+ 0, 0, NULL);
+ if (!rmr_map_entry_cachep) {
+ pr_err("cannot allocate cachep for rmr_map_entry\n");
+ err = -ENOMEM;
+ goto req_destroy;
+ }
+
+ BUILD_BUG_ON(PAGE_SIZE / sizeof(struct rmr_map_cbuf_hdr) < RMR_POOL_MAX_SESS);
+
+ rtrs_ctx = rtrs_srv_open(&rtrs_ops, RTRS_PORT);
+ if (IS_ERR(rtrs_ctx)) {
+ err = PTR_ERR(rtrs_ctx);
+ pr_err("rtrs_srv_open(), err: %pe\n", rtrs_ctx);
+ goto map_destroy;
+ }
+
+ err = rmr_srv_create_sysfs_files();
+ if (err) {
+ pr_err("rmr_srv_create_sysfs_files(), err: %d\n", err);
+ goto srv_close;
+ }
+
+ return 0;
+
+srv_close:
+ rtrs_srv_close(rtrs_ctx);
+map_destroy:
+ kmem_cache_destroy(rmr_map_entry_cachep);
+req_destroy:
+ kmem_cache_destroy(rmr_req_cachep);
+out:
+ return err;
+}
+
+static void __exit rmr_srv_cleanup_module(void)
+{
+ struct rmr_pool *pool, *tmp;
+ struct rmr_srv_pool *srv_pool;
+
+ pr_info("Unloading module\n");
+ kmem_cache_destroy(rmr_req_cachep);
+
+ rtrs_srv_close(rtrs_ctx);
+
+ list_for_each_entry_safe(pool, tmp, &pool_list, entry) {
+ srv_pool = (struct rmr_srv_pool *)pool->priv;
+
+ WARN_ON(!list_empty(&pool->sess_list));
+ rmr_srv_destroy_pool(pool);
+ rmr_srv_destroy_pool_sysfs_files(pool, NULL);
+ rmr_put_srv_pool(srv_pool);
+ }
+
+ rmr_srv_destroy_sysfs_files();
+ pr_info("Module unloaded\n");
+}
+
+module_init(rmr_srv_init_module);
+module_exit(rmr_srv_cleanup_module);
diff --git a/drivers/infiniband/ulp/rmr/rmr-srv.h b/drivers/infiniband/ulp/rmr/rmr-srv.h
new file mode 100644
index 000000000000..a84586aa78bd
--- /dev/null
+++ b/drivers/infiniband/ulp/rmr/rmr-srv.h
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Reliable multicast over RTRS (RMR)
+ *
+ * Copyright (c) 2026 IONOS SE
+ */
+
+#ifndef RMR_SRV_H
+#define RMR_SRV_H
+
+/* rmr-srv-sysfs.c */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "rmr-pool.h"
+
+/*
+ * IO store interface implemented by an upper-layer consumer of rmr-server.
+ * All consumer-specific types are passed as void * so RMR remains
+ * independent of any particular client.
+ */ +struct rmr_srv_store_ops { + int (*submit_req)(void *device, void *data, u32 offset, u32 length, + unsigned long flags, u16 prio, void *priv); + int (*submit_md_req)(void *device, void *data, u32 offset, u32 length, + unsigned long flags, void *priv); + int (*submit_cmd)(void *device, const void *usr_buf, int usr_len, + void *data, int datalen); + bool (*io_allowed)(void *store_priv); + int (*get_params)(void *device); +}; + +#define DEFAULT_SYNC_QUEUE_DEPTH 32 +#define RMR_SRV_CHECK_MAPS_INTERVAL_MS 3000 +#define RMR_SRV_MD_SYNC_INTERVAL_MS 500 +#define RMR_SRV_DISCARD_TIMEOUT_MS 500 + +/* Bit indices for srv_pool->md_dirty — used with set_bit / test_and_clear_bit */ +enum rmr_srv_md_dirty_bit { + MD_DIRTY_POOL, /* pool_md fields changed */ + MD_DIRTY_MAPS, /* map bitmap changed */ + MD_DIRTY_LAST_IO, /* last_io updated */ +}; + +extern struct kmem_cache *rmr_req_cachep; +extern struct kmem_cache *rmr_map_entry_cachep; + +enum rmr_srv_register_disk_mode { + RMR_SRV_DISK_CREATE, /* Fresh store, new pool */ + RMR_SRV_DISK_ADD, /* Rejoin an existing pool */ + RMR_SRV_DISK_REPLACE, /* Replace an existing store */ +}; + +/* + * When adding state, remember to add an entry in the function rmr_get_srv_pool_state_name() + */ +enum rmr_srv_pool_state { + RMR_SRV_POOL_STATE_EMPTY, + RMR_SRV_POOL_STATE_REGISTERED, + RMR_SRV_POOL_STATE_CREATED, + RMR_SRV_POOL_STATE_NORMAL, + RMR_SRV_POOL_STATE_NO_IO, +}; + +struct rmr_srv_pool { + u8 member_id; + refcount_t refcount; + atomic_t state; + bool maintenance_mode; + + struct rmr_pool *pool; + + /* Sync thread */ + struct task_struct *th_tsk; + atomic_t thread_state; + atomic_t in_flight_sync_reqs; + + struct rmr_srv_io_store *io_store; + struct mutex srv_pool_lock; + atomic_t store_state; + + bool marked_create; + bool marked_delete; + + unsigned long md_dirty; /* bitmask of dirty regions */ + unsigned long map_update_state; + /* The internal client pool assigned to this server pool. */ + struct rmr_pool *clt; + size_t queue_depth; + rmr_id_t *last_io; + /* + * Each storage node keeps a command array with the length of queue depth to track the IOs + * in the last round. Use an array of chunk indexes as a copy of srv_pool->last_io so that + * it can be written back to/read from backing store as needed. 
+ */ + rmr_id_t *last_io_idx; + + u32 max_sync_io_size; + struct workqueue_struct *clean_wq; + struct delayed_work clean_dwork; + + struct workqueue_struct *md_sync_wq; + struct delayed_work md_sync_dwork; + struct delayed_work last_io_sync_dwork; +}; + +/** + * rmr_srv_mark_pool_md_dirty() - Set MD_DIRTY_POOL and schedule delayed sync + * @srv_pool: Server pool with changed pool_md fields + */ +static inline void rmr_srv_mark_pool_md_dirty(struct rmr_srv_pool *srv_pool) +{ + set_bit(MD_DIRTY_POOL, &srv_pool->md_dirty); + mod_delayed_work(srv_pool->md_sync_wq, &srv_pool->md_sync_dwork, + msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS)); +} + +struct rmr_srv_sess { + struct list_head pool_sess_list; + struct rtrs_srv_sess *rtrs; + struct kobject kobj; + char sessname[NAME_MAX]; + struct mutex lock; + u8 ver; + struct xarray pools; + struct list_head g_list_entry; +}; + +struct rmr_srv_pool_sess { + struct list_head pool_entry; /* for pool->sess_list */ + struct list_head srv_sess_entry; + struct rmr_srv_pool *srv_pool; + struct kobject kobj; + char sessname[NAME_MAX]; + struct rmr_srv_sess *srv_sess; + bool sync; +}; + +struct rmr_srv_io_store { + struct rmr_srv_store_ops *ops; + void *priv; +}; + +struct rmr_cmd_work_info { + struct work_struct cmd_work; + struct rmr_pool *pool; + struct rmr_srv_sess *sess; + struct rtrs_srv_sess *rtrs; + const struct rmr_msg_pool_cmd *cmd_msg; + struct rmr_msg_pool_cmd_rsp *rsp; + struct rtrs_srv_op *rtrs_op; + void *data; + size_t datalen; +}; + +void rmr_put_srv_pool(struct rmr_srv_pool *srv_pool); +struct rmr_srv_pool *rmr_create_srv_pool(char *poolname, u32 member_id); +void rmr_srv_pool_update_params(struct rmr_pool *pool); +int rmr_srv_read_md(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, u32 offset, u32 len, + struct rmr_pool_md *pool_md_page); +int rmr_srv_send_md_update(struct rmr_pool *pool); +int rmr_srv_check_params(struct rmr_srv_pool *srv_pool); +void rmr_srv_mark_maps_dirty(struct rmr_srv_pool *srv_pool); + +/* rmr-srv-md.c */ +struct rmr_srv_req; /* forward decl for endreq prototype */ + +bool rmr_get_srv_pool(struct rmr_srv_pool *srv_pool); +void rmr_srv_endreq(struct rmr_srv_req *req, int err); + +int process_md_io(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, + u32 offset, u32 len, unsigned long flags, void *buf); +void rmr_srv_md_maps_sync(struct rmr_pool *pool); +void rmr_srv_flush_pool_md(struct rmr_srv_pool *srv_pool); +void rmr_srv_md_sync(struct work_struct *work); +int rmr_srv_md_process_buf(struct rmr_pool *pool, void *buf, bool sync); +int rmr_srv_refresh_md(struct rmr_srv_pool *srv_pool); + +/* rmr-srv-sysfs.c */ + +int rmr_srv_create_sysfs_files(void); +void rmr_srv_destroy_sysfs_files(void); +void rmr_srv_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self); +int rmr_srv_sysfs_add_sess(struct rmr_pool *pool, + struct rmr_srv_pool_sess *pool_sess); +void rmr_srv_sysfs_del_sess(struct rmr_srv_pool_sess *pool_sess); + +void rmr_srv_free_sync_permits(struct rmr_pool *pool); +void rmr_srv_destroy_pool(struct rmr_pool *pool); +int rmr_srv_remove_clt_pool(struct rmr_srv_pool *srv_pool); + +void rmr_srv_stop_sync_and_go_offline(struct rmr_pool *pool); + +int rmr_srv_get_sync_permit(struct rmr_srv_pool *srv_pool); +void rmr_srv_put_sync_permit(struct rmr_srv_pool *srv_pool); + +int rmr_srv_sync_thread_start(struct rmr_srv_pool *srv_pool); +int rmr_srv_sync_thread_stop(struct rmr_srv_pool *srv_pool); + +void rmr_srv_sync_req_failed(struct rmr_srv_pool *srv_pool); + +int rmr_srv_query(struct 
rmr_pool *pool, u64 mapped_size, struct rmr_attrs *attr);
+/* register/unregister rmr-srv */
+struct rmr_pool *rmr_srv_register(char *poolname, struct rmr_srv_store_ops *ops, void *priv,
+ u64 mapped_size, enum rmr_srv_register_disk_mode mode);
+void rmr_srv_unregister(char *poolname, bool delete);
+
+int rmr_srv_pool_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv,
+ const struct kvec *usr_vec, size_t nr, void *buf, int buf_len,
+ size_t size);
+int rmr_srv_discard_id(struct rmr_pool *pool, u64 offset, u64 length, u8 member_id, bool sync);
+void rmr_srv_replace_store(struct rmr_pool *pool);
+
+#endif /* RMR_SRV_H */
diff --git a/drivers/infiniband/ulp/rmr/rmr.h b/drivers/infiniband/ulp/rmr/rmr.h
new file mode 100644
index 000000000000..72d591ccc047
--- /dev/null
+++ b/drivers/infiniband/ulp/rmr/rmr.h
@@ -0,0 +1,229 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Reliable multicast over RTRS (RMR)
+ *
+ * Copyright (c) 2026 IONOS SE
+ */
+
+#ifndef RMR_H
+#define RMR_H
+
+#include
+#include
+#include
+
+#include "rmr-proto.h"
+struct rmr_pool;
+
+typedef void (rmr_conf_fn)(void *priv, int errno);
+enum rmr_wait_type {
+ NO_WAIT = RTRS_PERMIT_NOWAIT,
+ WAIT = RTRS_PERMIT_WAIT
+};
+
+/*
+ * RMR client API
+ */
+
+/**
+ * rmr_clt_put_pool() - Decrements the pool refcount and frees the pool
+ * when it reaches 0.
+ * @pool: Pool to put
+ */
+void rmr_clt_put_pool(struct rmr_pool *pool);
+
+/**
+ * enum rmr_clt_link_ev - Events about connectivity state of a client
+ * @RMR_CLT_LINK_EV_RECONNECTED: Client was reconnected.
+ * @RMR_CLT_LINK_EV_DISCONNECTED: Client was disconnected.
+ */
+enum rmr_clt_link_ev {
+ RMR_CLT_LINK_EV_RECONNECTED,
+ RMR_CLT_LINK_EV_DISCONNECTED,
+};
+
+typedef void (rmr_clt_ev_fn)(void *priv, enum rmr_clt_link_ev ev);
+/**
+ * rmr_clt_open() - Opens a pool from the RMR client
+ * @priv: User supplied private data.
+ * @link_ev: Event notification callback for connection state changes;
+ * it is called with @priv and the occurred event.
+ * @poolname: name of the pool
+ *
+ * Only one user can open a pool at the same time.
+ * However administrative operations are possible.
+ *
+ * Return: a valid pointer on success, otherwise an ERR_PTR().
+ */
+struct rmr_pool *rmr_clt_open(void *priv, rmr_clt_ev_fn *link_ev, const char *poolname);
+
+/**
+ * rmr_clt_get_priv() - Returns the priv data that was provided to rmr_clt_open()
+ * @pool: Pool handle
+ */
+void *rmr_clt_get_priv(struct rmr_pool *pool);
+
+/**
+ * rmr_clt_close() - Closes a pool
+ * @pool: Pool handle, freed on return
+ */
+void rmr_clt_close(struct rmr_pool *pool);
+
+#define RMR_OP_BITS 8
+#define RMR_OP_MASK ((1 << RMR_OP_BITS) - 1)
+
+/**
+ * enum rmr_io_flags - RMR request types from rq_flag_bits
+ * @RMR_OP_READ: read object
+ * @RMR_OP_WRITE: write object
+ * @RMR_OP_DISCARD: remove object
+ * @RMR_OP_SYNCREQ: sync request
+ * @RMR_OP_WRITE_ZEROES: write zeroes
+ * @RMR_OP_FLUSH: flush object
+ * @RMR_OP_MD_READ: read metadata of rmr
+ * @RMR_OP_MD_WRITE: write metadata of rmr
+ */
+enum rmr_io_flags {
+ /* Operations */
+ RMR_OP_READ = 0,
+ RMR_OP_WRITE = 1,
+ RMR_OP_DISCARD = 2,
+ RMR_OP_SYNCREQ = 3,
+ RMR_OP_WRITE_ZEROES = 4,
+ RMR_OP_FLUSH = 5,
+ /* Add metadata related operations below this.
 */
+ RMR_OP_MD_READ = 6,
+ RMR_OP_MD_WRITE = 7,
+
+ /* Flags */
+ RMR_F_SYNC = 1 << (RMR_OP_BITS + 0), /* 0x100 */
+ RMR_F_FUA = 1 << (RMR_OP_BITS + 1), /* 0x200 */
+};
+
+static inline u32 rmr_op(u32 flag)
+{
+ return flag & RMR_OP_MASK;
+}
+
+static inline u32 rmr_flags(u32 flag)
+{
+ return flag & ~RMR_OP_MASK;
+}
+
+/*
+ * Holds the 128-bit block id (a.k.a. object id).
+ */
+typedef struct {
+ u64 a;
+ u64 b;
+} rmr_id_t;
+
+struct rmr_iu;
+
+/**
+ * rmr_clt_get_iu() - allocates an iu for a future RDMA operation
+ * @pool: Current pool
+ * @flag: READ/WRITE/REMOVE
+ * @wait: WAIT/NO_WAIT
+ *
+ * Description:
+ * Allocates an iu for the following RDMA operation. The iu is used
+ * to preallocate all resources and to propagate memory pressure
+ * up early.
+ */
+struct rmr_iu *rmr_clt_get_iu(struct rmr_pool *pool,
+ enum rmr_io_flags flag,
+ enum rmr_wait_type wait);
+
+/**
+ * rmr_clt_put_iu() - puts an allocated iu
+ * @pool: Current pool
+ * @iu: Iu to be freed
+
+ *
+ * Context:
+ * Any context.
+ */
+void rmr_clt_put_iu(struct rmr_pool *pool, struct rmr_iu *iu);
+
+/**
+ * rmr_clt_request() - Request data transfer to/from server via RDMA.
+ *
+ * @pool: The pool
+ * @iu: Iu allocated by a previous rmr_clt_get_iu() call.
+ * @offset: offset inside the object to read/write
+ * @length: length of data starting from offset
+ * @flag: READ/WRITE/REMOVE
+ * @prio: priority of IO
+ * @priv: User provided data, passed back with the corresponding
+ * @conf confirmation.
+ * @conf: callback function to be called as confirmation
+ * @sg: Pages to be sent/received to/from server.
+ * @sg_cnt: Number of elements in @sg
+ *
+ * Return:
+ * 0: Success
+ * -EAGAIN: Currently there are no resources to execute the request.
+ * Retry later.
+ * <0: Error
+ *
+ * On flag=READ the rtrs client will request a data transfer from server to
+ * client. The data the server responds with is stored in @sg when the user
+ * confirmation function is called.
+ * On flag=WRITE the rtrs client will RDMA-write the data in @sg to the
+ * server side.
+ */
+int rmr_clt_request(struct rmr_pool *pool, struct rmr_iu *iu,
+ size_t offset, size_t length, enum rmr_io_flags flag, unsigned short prio,
+ void *priv, rmr_conf_fn *conf, struct scatterlist *sg, unsigned int sg_cnt);
+
+int rmr_clt_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv,
+ const struct kvec *usr_vec, size_t nr, void *buf, int buf_len,
+ size_t size);
+
+/**
+ * struct rmr_attrs - RMR pool attributes
+ */
+struct rmr_attrs {
+ u32 queue_depth;
+ u32 max_io_size;
+ u32 chunk_size;
+ u32 max_segments;
+ u64 rmr_md_size; /* in sectors */
+ u8 sync;
+ struct kobject *pool_kobj;
+};
+
+/**
+ * rmr_clt_query() - queries RMR pool attributes
+ * @pool: Pool to query
+ * @attr: Attributes to fill in
+ *
+ * Return:
+ * 0 on success
+ * -EINVAL if there is no session in the pool
+ */
+int rmr_clt_query(struct rmr_pool *pool, struct rmr_attrs *attr);
+
+typedef enum {
+ RMR_MAP_ADD,
+ RMR_MAP_REMOVE,
+} rmr_map_cmd;
+
+#define RMR_STORE_ID_BITS 32
+#define RMR_STORE_ID_OFFSET (64 - RMR_STORE_ID_BITS)
+
+#define RMR_CHUNK_BITS 32
+#define RMR_CHUNK_OFFSET 0
+
+enum rmr_pool_state {
+ RMR_POOL_STATE_CREATED = 0,
+ RMR_POOL_STATE_JOINED,
+ RMR_POOL_STATE_ONLINE,
+ /* maybe we will use this later */
+ RMR_POOL_STATE_DEGRADED,
+ RMR_POOL_STATE_SYNCING,
+};
+
+#endif /* RMR_H */
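
Editor's note (not part of the patch): to illustrate how the client API declared in rmr.h above is meant to be used together, here is a minimal, hypothetical consumer sketch. Only rmr_clt_get_iu(), rmr_clt_request(), rmr_clt_put_iu() and the RMR_OP_*/RMR_F_* flags are taken from the header; the callback, the helper name and the error handling (including whether the iu must be put again after a successful request) are assumptions made for illustration.

#include <linux/completion.h>
#include <linux/printk.h>
#include <linux/scatterlist.h>
#include "rmr.h"

/* Hypothetical confirmation callback, matching the rmr_conf_fn typedef. */
static void example_write_done(void *priv, int errno)
{
	if (errno)
		pr_err("example write failed: %d\n", errno);
	complete(priv);
}

/* Hypothetical helper: synchronously write one page at offset 0 of an object. */
static int example_write_page(struct rmr_pool *pool, struct page *page)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct scatterlist sg;
	struct rmr_iu *iu;
	int err;

	/* Preallocate resources first; WAIT may sleep until a permit is free. */
	iu = rmr_clt_get_iu(pool, RMR_OP_WRITE, WAIT);
	if (!iu)
		return -EAGAIN;

	sg_init_table(&sg, 1);
	sg_set_page(&sg, page, PAGE_SIZE, 0);

	/* RMR_F_SYNC is OR-ed on top of the opcode bits (see RMR_OP_BITS). */
	err = rmr_clt_request(pool, iu, 0, PAGE_SIZE, RMR_OP_WRITE | RMR_F_SYNC,
			      0, &done, example_write_done, &sg, 1);
	if (err) {
		rmr_clt_put_iu(pool, iu);
		return err;
	}

	wait_for_completion(&done);
	rmr_clt_put_iu(pool, iu);
	return 0;
}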
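
Editor's note (not part of the patch): the rmr_srv_store_ops interface in rmr-srv.h is what an upper layer such as the BRMR server implements so that RMR can drive its backing store. The sketch below is hypothetical glue code using only the ops structure, rmr_srv_register()/rmr_srv_unregister() and the disk-mode enum from the header; the function and variable names, and the assumption that rmr_srv_register() returns an ERR_PTR() on failure, are illustrative.

#include <linux/err.h>
#include "rmr-srv.h"

/* Hypothetical data-path callback: issue @data against the local store. */
static int example_submit_req(void *device, void *data, u32 offset, u32 length,
			      unsigned long flags, u16 prio, void *priv)
{
	/* Queue the IO on the backing device; complete it later through the
	 * consumer's own completion path (e.g. rmr_srv_endreq()). */
	return 0;
}

static bool example_io_allowed(void *store_priv)
{
	return true;
}

static struct rmr_srv_store_ops example_store_ops = {
	.submit_req	= example_submit_req,
	.io_allowed	= example_io_allowed,
	/* .submit_md_req, .submit_cmd and .get_params omitted for brevity */
};

/* Hypothetical attach path: export a fresh local store as a new pool. */
static struct rmr_pool *example_attach_store(char *poolname, void *dev, u64 mapped_size)
{
	struct rmr_pool *pool;

	pool = rmr_srv_register(poolname, &example_store_ops, dev,
				mapped_size, RMR_SRV_DISK_CREATE);
	if (IS_ERR(pool))
		return pool;

	/* ... and on teardown: rmr_srv_unregister(poolname, false); */
	return pool;
}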
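
Editor's note (not part of the patch): the md_dirty bit scheme and rmr_srv_mark_pool_md_dirty() in rmr-srv.h imply a delayed-work consumer that flushes whichever metadata regions are marked dirty. The real handler is rmr_srv_md_sync() in rmr-srv-md.c, which is not shown in this excerpt; the sketch below is only a guess at its shape, built from the helpers the header does declare.

#include <linux/workqueue.h>
#include "rmr-srv.h"

/* Illustrative only: a possible shape for the delayed metadata sync worker. */
static void example_md_sync(struct work_struct *work)
{
	struct rmr_srv_pool *srv_pool =
		container_of(to_delayed_work(work), struct rmr_srv_pool, md_sync_dwork);

	if (test_and_clear_bit(MD_DIRTY_POOL, &srv_pool->md_dirty))
		rmr_srv_flush_pool_md(srv_pool);	/* persist pool_md */

	if (test_and_clear_bit(MD_DIRTY_MAPS, &srv_pool->md_dirty))
		rmr_srv_md_maps_sync(srv_pool->pool);	/* persist the dirty maps */

	/* MD_DIRTY_LAST_IO would be handled analogously, likely via
	 * last_io_sync_dwork rather than here. */
}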