From d8b8ab043a0452e17fa846f44fae64abe775d2f1 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:13 +0200 Subject: [PATCH 01/13] RDMA/rmr: add public and private headers Reliable Multicast over RTRS (RMR) is an RDMA ULP that provides active-active block-level replication on top of the RTRS transport. It guarantees delivery of an I/O to a group of storage nodes and handles resynchronization of data between storage nodes without involving the compute client. Add the public interface header (rmr.h) used by upper-layer consumers, plus the private headers shared between the client and server modules: rmr-proto.h wire protocol definitions rmr-pool.h pool data structures rmr-map.h dirty-map data structures rmr-req.h server-side request lifecycle helpers rmr-clt.h client-side structs and function declarations rmr-srv.h server-side structs and function declarations, including the IO store interface (struct rmr_srv_store_ops) implemented by an upper-layer consumer No code is compiled by this patch on its own; the modules are wired into the build in a later patch in this series. Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/infiniband/ulp/rmr/rmr-clt.h | 291 ++++++++++++++++++ drivers/infiniband/ulp/rmr/rmr-map.h | 246 +++++++++++++++ drivers/infiniband/ulp/rmr/rmr-pool.h | 400 +++++++++++++++++++++++++ drivers/infiniband/ulp/rmr/rmr-proto.h | 273 +++++++++++++++++ drivers/infiniband/ulp/rmr/rmr-req.h | 65 ++++ drivers/infiniband/ulp/rmr/rmr-srv.h | 219 ++++++++++++++ drivers/infiniband/ulp/rmr/rmr.h | 229 ++++++++++++++ 7 files changed, 1723 insertions(+) create mode 100644 drivers/infiniband/ulp/rmr/rmr-clt.h create mode 100644 drivers/infiniband/ulp/rmr/rmr-map.h create mode 100644 drivers/infiniband/ulp/rmr/rmr-pool.h create mode 100644 drivers/infiniband/ulp/rmr/rmr-proto.h create mode 100644 drivers/infiniband/ulp/rmr/rmr-req.h create mode 100644 drivers/infiniband/ulp/rmr/rmr-srv.h create mode 100644 drivers/infiniband/ulp/rmr/rmr.h diff --git a/drivers/infiniband/ulp/rmr/rmr-clt.h b/drivers/infiniband/ulp/rmr/rmr-clt.h new file mode 100644 index 000000000000..c50651efe4a3 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt.h @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_CLT_H +#define RMR_CLT_H + +#include +#include "rmr-pool.h" + +#define RECONNECT_DELAY 30 +#define MAX_RECONNECTS -1 +#define RTRS_LINK_NAME "rtrs" + +#define RMR_MAP_CLEAN_DELAY_MS 5000 +#define RMR_RECOVER_INTERVAL_MS 3000 + +enum rmr_clt_sess_state { + RMR_CLT_SESS_DISCONNECTED = 1, + RMR_CLT_SESS_CONNECTED, +}; + +struct rmr_clt_sess { + char sessname[NAME_MAX]; + struct kobject kobj; + struct mutex lock; + struct rtrs_clt_sess *rtrs; + bool rtrs_ready; + /* server this session is connected to */ + int queue_depth; + u32 max_io_size; + u32 max_segments; + struct list_head pool_sess_list; + struct list_head g_list; + struct kref kref; + enum rmr_clt_sess_state state; +}; + +/* + * NB: If you change here, make sure the changes are in sync with + * pool_sess state machine routine i.e. pool_sess_change_state(). 
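+ * When adding or removing a state here, the string helper
+ * rmr_clt_sess_state_str() declared below most likely needs the matching
+ * update as well.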
+ */ +enum rmr_clt_pool_sess_state { + RMR_CLT_POOL_SESS_CREATED = 1, // No IO, No dirty map addition, Yes cmd msgs + RMR_CLT_POOL_SESS_NORMAL, // Yes IO, No dirty map addition, Yes cmd msgs + RMR_CLT_POOL_SESS_FAILED, // No IO, Yes dirty map addition, No cmd msgs + RMR_CLT_POOL_SESS_RECONNECTING, // No IO, Yes, dirty map addition, Yes cmd msgs + // But not with an updated map + + RMR_CLT_POOL_SESS_REMOVING // No IO, No dirty map addition, Yes cmd msgs + // Getting removed from pool +}; + +struct rmr_clt_pool_sess { + char sessname[NAME_MAX]; + struct rmr_pool *pool; + struct kobject kobj; + u8 member_id; /* refers to the pool id on the */ + struct kobject sess_kobj; + struct list_head entry; /* for pool->sess_list */ + struct list_head clt_sess_entry; /* for clt_sess->pool_sess_list */ + struct rmr_clt_sess *clt_sess; + atomic_t state; /* rmr_clt_pool_sess_state */ + u8 ver; /* protocol version */ + u8 pool_id; /* refers to the pool id on the */ + bool maintenance_mode; /* If the pool is in maintenance mode or not */ + bool was_last_authoritative; /* last NORMAL sess before it went FAILED; + * carries complete dirty maps for all members */ +}; + +struct rmr_clt_stats { + struct kobject kobj_stats; + atomic_t read_retries; +}; + +/* + * State descriptions: + * RMR_CLT_POOL_STATE_JOINED: An rmr_clt_pool which has one or more legs (rmr_clt_pool_sess) + * added to it. This means the pool has joined into pools from + * storage nodes + * + * RMR_CLT_POOL_STATE_IN_USE: An rmr_clt_pool which is in use by an upper layer client. This + * is usually done by calling rmr_clt_open + * + * Note: When adding a new state, + * remember to add an entry in the function rmr_get_clt_pool_state_name() + */ +enum rmr_clt_pool_state { + RMR_CLT_POOL_STATE_JOINED = 0, + RMR_CLT_POOL_STATE_IN_USE, + // RMR_CLT_POOL_STATE_DEGRADED, uncomment and use + // RMR_CLT_POOL_STATE_DIRTY, + RMR_CLT_POOL_STATE_MAX, +}; + +struct rmr_clt_pool { + struct rmr_pool *pool; + refcount_t refcount; + unsigned long state; + struct mutex clt_pool_lock; + + size_t queue_depth; + + struct rmr_clt_stats stats; + struct kobject stats_kobj; + + void *priv; /* provided by user */ + rmr_clt_ev_fn *link_ev; /* deliver events to user */ + + atomic_t io_freeze; + wait_queue_head_t map_update_wq; + struct mutex io_freeze_lock; + + struct workqueue_struct *recover_wq; + struct delayed_work recover_dwork; + + /* use sessions round robbin to read */ + struct rmr_clt_pool_sess __rcu *__percpu *pcpu_sess; +}; + +struct rmr_iu_comp { + wait_queue_head_t wait; + int errno; +}; + +/** + * rmr_iu - reserves resources needed to do an I/O op on pool + */ +struct rmr_iu { + struct rmr_pool *pool; + unsigned int mem_id; + struct list_head sess_list; /* list of per-session tags */ + u8 num_sessions; + refcount_t ref; /* lifetime refcount */ + struct rmr_msg_io msg; + int errno; + atomic_t succeeded; + refcount_t refcount; + rmr_conf_fn *conf; + void *priv; + /* for retry of failed reads */ + struct work_struct work; + struct scatterlist *sg; + unsigned int sg_cnt; +}; + +struct rmr_clt_sess_iu { + void *buf; /* for session messages */ + struct rtrs_permit *permit; + struct rmr_clt_pool_sess *pool_sess; + int errno; + union { + /* for session messages only */ + struct scatterlist sg; + /* for tag->sess_list of io messages*/ + struct list_head entry; + }; + + /* for session messages only */ + struct work_struct work; + + /* for io requests */ + struct rmr_iu *rmr_iu; + unsigned int mem_id; + + /* for command messages */ + struct rmr_clt_cmd_unit *rmr_cmd_unit; + 
+ /* for session messages only */ + struct rmr_iu_comp comp; + atomic_t refcount; +}; + +struct rmr_clt_iu_comp { + wait_queue_head_t wait; + int errno; +}; + +struct rmr_clt_cmd_unit { + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + + struct list_head sess_list; + int num_sessions; + + int failed_state; + int errno; + atomic_t succeeded; + refcount_t refcount; + + rmr_conf_fn *conf; + void *priv; +}; + +/* rmr-clt.c */ +struct rmr_pool *rmr_clt_create_pool(const char *name); +void rmr_put_clt_pool(struct rmr_clt_pool *clt_pool); + +void rmr_clt_change_pool_state(struct rmr_clt_pool *rmr_clt_pool, + enum rmr_clt_pool_state new_state, bool set); +int rmr_clt_remove_pool_from_sysfs(struct rmr_pool *pool, + const struct attribute *sysfs_self); +struct rmr_clt_sess *find_and_get_or_create_clt_sess(char *sessname, + struct rtrs_addr *paths, + size_t path_cnt); +struct rmr_clt_pool_sess *rmr_clt_add_pool_sess(struct rmr_pool *pool, + struct rmr_clt_sess *clt_sess, bool create); +void rmr_clt_sess_put(struct rmr_clt_sess *sess); +void rmr_clt_del_pool_sess(struct rmr_clt_pool_sess *sess); +void rmr_clt_destroy_pool_sess(struct rmr_clt_pool_sess *sess, bool delete); + +const char *rmr_clt_sess_state_str(enum rmr_clt_pool_sess_state state); +void resend_join_pool(struct rmr_clt_sess *sess); +int rmr_clt_reconnect_sess(struct rmr_clt_sess *sess, + const struct rtrs_addr *paths, + size_t path_cnt); +int rmr_clt_start_last_io_update(struct rmr_pool *pool); +int rmr_clt_set_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess); +int rmr_clt_enable_sess(struct rmr_clt_pool_sess *sess); + +int rmr_clt_send_map_update(struct rmr_pool *pool, struct rmr_iu *iu); + +int rmr_clt_pool_send_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg); +int rmr_clt_send_cmd_with_data(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen); +int rmr_clt_map_add_id(struct rmr_pool *pool, int stg_id, rmr_id_t id); +void rmr_clt_init_cmd(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg); +int rmr_clt_pool_send_cmd(struct rmr_clt_pool_sess *sess, struct rmr_msg_pool_cmd *msg, bool wait); +int rmr_clt_del_stor_from_pool(struct rmr_clt_pool_sess *pool_sess, bool delete); +bool rmr_clt_sess_is_sync(struct rmr_clt_pool_sess *sess); +int send_msg_leave_pool(struct rmr_clt_pool_sess *pool_sess, bool delete, bool wait); +void rmr_clt_free_pool_sess(struct rmr_clt_pool_sess *pool_sess); +int rmr_clt_send_map(struct rmr_pool *map_src_pool, struct rmr_pool *clt_pool, + const struct rmr_msg_map_send_cmd *map_send_cmd, rmr_map_filter filter); +int rmr_clt_test_map(struct rmr_pool *src_pool, struct rmr_pool *dst_pool); +int rmr_clt_send_cmd_with_data_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen); +int rmr_clt_pool_send_md_all(struct rmr_pool *src_pool, struct rmr_pool *clt_pool); +int rmr_clt_pool_send_cmd_all(struct rmr_pool *pool, enum rmr_msg_cmd_type cmd_type); +void recover_work(struct work_struct *work); + +int rmr_clt_pool_member_synced(struct rmr_pool *pool, u8 member_id); + +bool pool_sess_change_state(struct rmr_clt_pool_sess *pool_sess, + enum rmr_clt_pool_sess_state newstate); + +void rmr_clt_pool_io_freeze(struct rmr_clt_pool *clt_pool); +void rmr_clt_pool_io_unfreeze(struct rmr_clt_pool *clt_pool); +void rmr_clt_pool_io_wait_complete(struct rmr_clt_pool *clt_pool); +int rmr_clt_pool_try_enable(struct rmr_pool *pool); +int send_msg_enable_pool(struct rmr_clt_pool_sess *pool_sess, bool enable); + 
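+/*
+ * Illustrative pairing of the freeze helpers declared above (a sketch,
+ * not code from this patch): a caller that needs to quiesce the pool
+ * blocks new I/O first, waits for what is already in flight, does its
+ * work and only then lets I/O through again:
+ *
+ *	rmr_clt_pool_io_freeze(clt_pool);
+ *	rmr_clt_pool_io_wait_complete(clt_pool);
+ *	... update dirty maps / pool state ...
+ *	rmr_clt_pool_io_unfreeze(clt_pool);
+ */
+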
+void rmr_get_iu(struct rmr_iu *iu); +void rmr_put_iu(struct rmr_iu *iu); +void rmr_msg_put_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu); +void wake_up_iu_comp(struct rmr_clt_sess_iu *sess_iu); +void msg_conf(void *priv, int errno); + +/* rmr-map-mgmt.c */ +void send_map_check(struct rmr_clt_pool_sess *pool_sess); +void send_store_check(struct rmr_clt_pool_sess *pool_sess); +int send_map_get_version(struct rmr_clt_pool_sess *pool_sess, u64 *ver); +int send_discard(struct rmr_clt_pool_sess *pool_sess, u8 cmd_type, u8 member_id); +int rmr_clt_handle_map_check_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp); +int rmr_clt_handle_store_check_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp); +int rmr_clt_read_map(struct rmr_pool *pool); +int rmr_clt_spread_map(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess_chosen, + bool enable, bool skip_normal); +int rmr_clt_unset_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess); +void sched_map_add(struct work_struct *work); +void msg_pool_cmd_map_content_conf(struct work_struct *work); + +/* rmr-clt-sysfs.c */ +int rmr_clt_create_sysfs_files(void); +void rmr_clt_destroy_sysfs_files(void); +void rmr_clt_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self); +int rmr_clt_create_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess); +void rmr_clt_destroy_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess); + +int rmr_clt_reset_read_retries(struct rmr_clt_stats *stats, bool enable); +ssize_t rmr_clt_stats_read_retries_to_str(struct rmr_clt_stats *stats, char *page); + +#endif /* RMR_CLT_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-map.h b/drivers/infiniband/ulp/rmr/rmr-map.h new file mode 100644 index 000000000000..76ef6506421f --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-map.h @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_MAP_H +#define RMR_MAP_H + +#include +#include + +#include "rmr.h" + +/** + * The dirty map buffer is used to track dirty chunks through bits. + * The position of the bit denotes the chunk number it tracks. + * + * Bitmap structure + * ---------------- + * The dirty bitmap is stored in a 2 level tree-like structure. + * The main unit of storage are memory pages; They act as nodes of this structure. + * The first level pages (FLP) stores the address of the second level pages. + * There can be a total of 256 first level pages. + * The second level pages (SLP, also the leaf nodes/pages) stores the bitmap. + * + * The first level pages have to store the address of the second level pages. + * An address being 8B (default/max) long, the addresses of a maximum of 512 pages can + * be stored in a first level page. This then decides the maximum leaf pages a pool can + * have, which, for our example, is [(# pages of FLP) * (PAGE_SIZE / address_size)], + * (256*512)=131072. + * With the above info, the available space for bitmap is 131072*4KB(PAGE_SIZE)=512MB. + * + * A chunk is the smallest unit of data which is tracked for being dirty. A chunk is + * called dirty/unsynced, even if a single byte in it is dirty/unsynced. + * To track a chunk, a single byte (1B) is used. The least significant bit is used to signify + * if the chunk is dirty (set) or not. Other bits can be used for other purposes (for example, + * filters). The maximum number of chunks RMR can manage are then, (512MB)/1B=536870912. 
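+ * (Expressed with the constants defined below: MAX_NO_OF_CHUNKS =
+ * MAX_NO_OF_FLP (256) * NO_OF_SLP_PER_FLP (512) * NO_OF_CHUNKS_PER_PAGE
+ * (4096 for 4KB pages and 1B of metadata per chunk), i.e. 536870912.)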
+ * This number is fixed, as one can see from the calculations, and hence the maximum size of + * metadata RMR can allocate and use is fixed. + + * The user configurable part is the chunk size. Its range is 128KB-1MB, and it has to be a + * power of 2. + * The chunk size decides the maximum mapped size for an RMR pool. + * For example, for chunk size 1MB, and taking the maximum number of chunks RMR can allocate + * and handle (536870912, see above), the maximum mapped size would be (536870912*1MB)=512TB. + * The table showing the relation between chunk size and maximum mapped size is as follows, + * Chunk size Maximum mapped size + * 128KB 64TB + * 256KB 128TB + * 512KB 256TB + * 1MB 512TB + * + * Calculating chunk number + * ------------------------ + * Some key points + * 1) The Linux kernel has a fixed size for sector, which is 512 (or 9 bitshift) + * 2) The mapped_size provided and stores in the rmr_pool structure is in sectors. + * 3) The chunk_size provided and stored in the rmr_pool structure is in bytes. + * 4) The code calculates and stores chunk_size_shift in the rmr_pool structure to do fast + * calculation. + * 5) The IO offset give to RMR (through function rmr_clt_request) is in bytes. + * + * -- + * With the above points, lets have a sample scenario with mapped_size 1GB and chunk_size 128KB + * The numbers would then be, + * + * no_of_chunks = (mapped_size / chunk_size) + * no_of_chunks = 8192 + * + * chunk_size = 131072 + * chunk_size_shift = 17 + * + * dirty_map buffer size (in BYTES) = (no_of_chunks / bits in a byte) + * dirty_map buffer size (in BYTES) = 1024 + * + * -- + * Lets do a sample calculation of chunk_no from offset and length of an IO + * + * For offset 30801920 and length 4096 + * + * chunk_no = (offset >> chunk_size_shift) + * chunk_no = 235 + * + */ + +#define RMR_KEY_SHIFT 32 + +// Each chunk requires 1B of metadata +#define PER_CHUNK_MD 1 +#define PER_CHUNK_MD_LOG2 ilog2(PER_CHUNK_MD) + +#define GET_CHUNK_NUMBER(offset, shift) (offset >> shift) +#define GET_FOLLOWING_CHUNKS(offset_len, shift, start) (((offset_len - 1) >> shift) - start + 1) + +#define CHUNK_TO_OFFSET(chunk_no, shift) (chunk_no << shift) + +// The element type stored in FLP +typedef unsigned long el_flp; + +enum { + CHUNK_DIRTY_BIT = 0, + CHUNK_FILTER_BIT, +}; + +enum { + MAX_NO_OF_FLP = 256, + NO_OF_SLP_PER_FLP = (PAGE_SIZE >> ilog2(sizeof(void *))), + NO_OF_SLP_PER_FLP_LOG2 = ilog2(NO_OF_SLP_PER_FLP), + MAX_NO_OF_SLP = (MAX_NO_OF_FLP * NO_OF_SLP_PER_FLP), + + NO_OF_CHUNKS_PER_PAGE = (PAGE_SIZE >> PER_CHUNK_MD_LOG2), + // Chunks data is stored only in SLP + MAX_NO_OF_CHUNKS = (MAX_NO_OF_SLP * NO_OF_CHUNKS_PER_PAGE), + + CHUNKS_PER_SLP = (PAGE_SIZE >> PER_CHUNK_MD_LOG2), + CHUNKS_PER_SLP_LOG2 = ilog2(CHUNKS_PER_SLP), + CHUNKS_PER_FLP = (CHUNKS_PER_SLP * NO_OF_SLP_PER_FLP), + CHUNKS_PER_FLP_LOG2 = ilog2(CHUNKS_PER_FLP), +}; + +typedef enum { + MAP_NO_FILTER = 0, + MAP_ENTRY_UNSYNCED +} rmr_map_filter; + +enum rmr_map_state { + RMR_MAP_STATE_NO_CHECK = 0, + RMR_MAP_STATE_CHECKING, + // do we have some other useful states ? +}; + +struct rmr_dirty_id_map { + u8 member_id; + struct xarray rmr_id_map; + unsigned long ts; + atomic_t check_state; + + /* + * The usage of this is restricted to form a linked lised + * during mass deletion. Since this is in an RCU list (maps + * in rmr_pool), we cannot use this or change any data until + * the RCU period completes. 
So we use this next variable + * during mass deletion so we can have a list and don't have + * to wait and restart the search on every individual deletion + * of a map. Refer destroy_clt_pool(). + */ + struct rmr_dirty_id_map *next; + + u64 no_of_chunks; + u64 no_of_flp; + u64 no_of_slp_in_last_flp; + u64 no_of_chunk_in_last_slp; + u64 total_slp; + u8 *bitmap_filter; + void *dirty_bitmap[MAX_NO_OF_FLP]; +}; + +struct rmr_map_entry { + atomic_t sync_cnt; + struct llist_head wait_list; +}; + +/* + * The header of the bitmap buffer. + */ +struct rmr_map_cbuf_hdr { + u64 version; + u8 member_id; + + u64 no_of_chunks; + u64 no_of_flp; + u64 no_of_slp_in_last_flp; + u64 no_of_chunk_in_last_slp; + u64 total_slp; +} __packed; + +static inline unsigned long rmr_id_to_key(rmr_id_t id) +{ + unsigned long res; + + // highest bits for id.a, the rest are for id.b; + res = ((id.a << RMR_KEY_SHIFT) | id.b); + return res; +} + +static inline u64 key_to_a(unsigned long key) +{ + return key >> RMR_KEY_SHIFT; +} + +static inline u64 key_to_b(unsigned long key) +{ + return key & ((1ULL << RMR_KEY_SHIFT) - 1); +} + +void rmr_map_update_page_params(struct rmr_dirty_id_map *map); +struct rmr_dirty_id_map *rmr_map_create(struct rmr_pool *pool, u8 member_id); +void rmr_map_destroy(struct rmr_dirty_id_map *map); +void rmr_map_calc_chunk(struct rmr_pool *pool, size_t offset, size_t length, rmr_id_t *id); +void rmr_map_set_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, u8 filter); +void rmr_map_set_dirty_all(struct rmr_dirty_id_map *map, u8 filter); +struct rmr_map_entry *rmr_map_unset_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, u8 filter); +bool rmr_map_check_dirty(struct rmr_dirty_id_map *map, rmr_id_t id); +struct rmr_map_entry *rmr_map_get_dirty_entry(struct rmr_dirty_id_map *map, rmr_id_t id); +void rmr_map_clear_filter_all(struct rmr_dirty_id_map *map, u8 filter); +void rmr_map_unset_dirty_all(struct rmr_dirty_id_map *map); +bool rmr_map_empty(struct rmr_dirty_id_map *map); + +void rmr_map_bitwise_or_buf(void *dst_buf, void *src_buf, u32 buf_size); +int rmr_map_create_entries(struct rmr_dirty_id_map *map); + +void rmr_map_hexdump_bitmap_buf(u8 member_id, void *buf, u32 buf_size); +void rmr_map_slps_to_buf(struct rmr_dirty_id_map *map, u64 slp_idx, u64 no_of_slp, u8 *buf); +u64 rmr_map_buf_to_slps(struct rmr_dirty_id_map *map, u8 *buf, u32 buf_size, u64 slp_idx, + bool test); +void rmr_map_dump_bitmap(struct rmr_dirty_id_map *map); +int rmr_map_summary_format(struct rmr_pool *pool, char *buf, size_t buf_size); +void rmr_map_bidump_bitmap_buf(void *buf, u8 member_id, u32 buf_size); + +static inline void map_entry_get_sync(struct rmr_map_entry *entry) +{ + atomic_inc(&entry->sync_cnt); + pr_debug("after get ref for entry %p, sync cnt %d\n", + entry, atomic_read(&entry->sync_cnt)); +} + +static inline int map_entry_put_sync(struct rmr_map_entry *entry) +{ + pr_debug("before dec_and_test for entry %p, sync cnt %d\n", + entry, atomic_read(&entry->sync_cnt)); + return atomic_dec_and_test(&entry->sync_cnt); +} + +static inline void rmr_maplist_destroy(struct rmr_dirty_id_map *maplist) +{ + struct rmr_dirty_id_map *mp; + + while (maplist != NULL) { + mp = maplist; + maplist = maplist->next; + rmr_map_destroy(mp); + } +} +#endif /* RMR_MAP_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-pool.h b/drivers/infiniband/ulp/rmr/rmr-pool.h new file mode 100644 index 000000000000..3cb7d3ae84b9 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-pool.h @@ -0,0 +1,400 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + 
* Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_POOL_H +#define RMR_POOL_H + +#include /* for NAME_MAX */ +#include +#include +#include /* for jhash() */ +#include /* for round_up */ +#include "rmr.h" +#include "rmr-map.h" + +#define RMR_POOL_MD_MAGIC 0xDEADBEEF +#define XA_TRUE ((void *)1UL) +#define XA_FALSE ((void *)2UL) + +extern struct kmem_cache *rmr_map_entry_cachep; +/* + * enum srv_sync_thread_state + */ +enum srv_sync_thread_state { + SYNC_THREAD_REQ_STOP, /* 0 */ + SYNC_THREAD_STOPPED, + SYNC_THREAD_RUNNING, + SYNC_THREAD_WAIT, +}; + +enum srv_map_update_state { + MAP_UPDATE_STATE_DISABLED, + MAP_UPDATE_STATE_READY, + MAP_UPDATE_STATE_DONE, +}; + +/* The srv pool specific structure */ +struct rmr_srv_md { + u64 map_ver; + u64 mapped_size; /* server store size in sectors */ + u8 member_id; + u8 srv_pool_state; /* server pool state */ + u8 store_state; /* state of io_store */ + u8 map_update_state; + bool discard_entries; +}; + +/* Shared by each pool */ +struct rmr_pool_md { + char poolname[NAME_MAX]; + u64 magic; + u32 group_id; + u32 chunk_size; /* rmr client */ + u64 mapped_size; /* client view of store size */ + u32 queue_depth; + u64 map_ver; + struct rmr_srv_md srv_md[RMR_POOL_MAX_SESS]; +} __packed; + +struct rmr_pool { + char poolname[NAME_MAX]; + u32 group_id; /* jhash() on poolname */ + struct kobject kobj; + struct kobject sessions_kobj; + struct list_head entry; /* for global pool_list */ + + struct list_head sess_list; /* list of sessions */ + struct mutex sess_lock; /* protect list of sessions */ + struct srcu_struct sess_list_srcu; + + void *priv; + u64 mapped_size; + u32 chunk_size; + u8 chunk_size_shift; + u64 no_of_chunks; + + struct percpu_ref ids_inflight_ref; + struct completion complete_done; + struct completion confirm_done; + + struct completion discard_done; /* for sync client pool */ + /* Set when waiting for response of discard request */ + atomic_t discard_waiting; + + u8 maps_cnt; + struct mutex maps_lock; + struct rmr_dirty_id_map __rcu + *maps[RMR_POOL_MAX_SESS]; + /* All member ids of the storage nodes */ + struct xarray stg_members; + u64 map_ver; + atomic_t normal_count; /* number of pool sessions currently in NORMAL state */ + struct srcu_struct map_srcu; + + struct rmr_pool_md pool_md; + + bool is_clt; + bool sync; +}; + +/** + * rmr_pool_find_md - find the index of the srv_md with the provided key in the pool_md + * + * @pool_md: the pool_md to search + * @key: the member_id of the server pool to search for + * @empty_slot: the empty slot is required by caller or not + * + * Description: + * Find the index of the srv_md with the matched key. If there is no such a key and the empty + * slot is not required, return -1. + * + * Return: + * >= 0, the index of the key in the pool_md. 
Return the index of an empty slot when the key + * is not found and the empty_slot flag is true + * -1 if the key is not found and empty_slot is false, or the pool_md doesn't exist + */ +static inline int rmr_pool_find_md(struct rmr_pool_md *pool_md, u8 key, bool empty_slot) +{ + int i; + int empty_i = -1; + + if (!pool_md) + return -1; + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + if (!pool_md->srv_md[i].member_id) + empty_i = i; + + if (pool_md->srv_md[i].member_id == key) + return i; + } + + if (empty_slot) + return empty_i; + return -1; +} + +/** + * rmr_pool_md_check_discard - check the discard_entries flag of the srv_md + * + * @pool: the pool to check pool_md + * @member_id: the member_id of the srv_md to check + * + * Description: + * Check if the pool has received the discards from the server pool with the provided + * member_id. + * + * Return: + * 1 (true) if the pool has received the discards, + * 0 (false) if the pool has not received the discards, + * <0 if the pool has no info of the server pool + */ +static inline int rmr_pool_md_check_discard(struct rmr_pool *pool, u8 member_id) +{ + int md_i = rmr_pool_find_md(&pool->pool_md, member_id, false); + + if (md_i < 0) { + pr_err("Failed to find md for member_id %u\n", member_id); + return -EINVAL; + } + + /* If the flag is set, this pool has received the discards. */ + return pool->pool_md.srv_md[md_i].discard_entries; +} + +#define RMR_MAP_FORMAT_VER 1 +/* + * Get the first most significant bit of map_ver. If it is one, then the store of that storage node + * is being replaced. + */ +#define RMR_STORE_IS_REPLACE(map_ver) (map_ver >> 63 & 1ULL) +#define RMR_STORE_GET_VER(map_ver) (map_ver & ~(1ULL << 63)) +#define RMR_STORE_SET_REPLACE(map_ver) (map_ver |= 1ULL << 63) +#define RMR_STORE_UNSET_REPLACE(map_ver) (map_ver &= ~(1ULL << 63)) +#define RTRS_IO_LIMIT 102400 +//#define RTRS_IO_LIMIT 40 //for tests only + +/* + * TODO: + * We currently do not have mapped_size while creating dirty maps, + * which means we cannot calculate no_of_chunks, hence cannot allocate bitmap + * So, as a workaround, we allocate max size bitmap, + * and to reduce that allocation, we cap max mapped_size. + * + * 1GB max mapped size for now. + * (Size mentioned in number of sectors, just like nr_sects) + */ +#define RMR_MAX_MAPPED_SIZE 2097152 + +/* The header structure of rmr pool metadata will not over this limit. 
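+ *
+ * The layout implied by the offsets below is:
+ *	[0, RMR_MD_SIZE)		pool_md header page
+ *	[RMR_LAST_IO_OFFSET, ...)	last_io region, rmr_last_io_len() long
+ *	[rmr_bitmap_offset(), ...)	per-member dirty bitmap region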
*/ +#define RMR_MD_SIZE PAGE_SIZE +#define RMR_MD_SIZE_SECTORS (PAGE_SIZE / SECTOR_SIZE) +#define RMR_MAP_BUF_HDR_SIZE PAGE_SIZE +#define RMR_SRV_MD_SIZE (sizeof(struct rmr_srv_md) * RMR_POOL_MAX_SESS) +#define RMR_CLT_MD_SIZE (sizeof(struct rmr_pool_md) - RMR_SRV_MD_SIZE) +#define RMR_SECTOR_SIZE 512 +#define RMR_INT_ROUND_UP(x, y) (((x) + (y) - 1) / (y)) +#define RMR_ROUND_UP(x) round_up(x, RMR_SECTOR_SIZE) + +#define RMR_SRV_MAX_QDEPTH 512 + +/* last_io region starts right after the pool_md header page */ +#define RMR_LAST_IO_OFFSET RMR_MD_SIZE + +static inline u64 rmr_last_io_len(u32 queue_depth) +{ + return RMR_ROUND_UP((u64)queue_depth * sizeof(rmr_id_t)); +} + +static inline u64 rmr_bitmap_offset(u32 queue_depth) +{ + return RMR_LAST_IO_OFFSET + rmr_last_io_len(queue_depth); +} + +static inline u64 rmr_per_map_bitmap_size(u64 no_of_chunks) +{ + return DIV_ROUND_UP(no_of_chunks, CHUNKS_PER_SLP) * PAGE_SIZE; +} + +static inline u64 rmr_bitmap_len(u64 no_of_chunks) +{ + return RMR_POOL_MAX_SESS * rmr_per_map_bitmap_size(no_of_chunks); +} + +struct rmr_map_buf_hdr { + u64 version; + u64 member_id; + + /* + * dst_slp_idx: SLP index in the local dirty map buffer, + * from where to write the recved dirty map buffer + */ + u64 dst_slp_idx; + u32 buf_size; + + /* + * slp_idx: Only used for MAP_READ, + * to let client know where to ask from in the next iteration + */ + u64 map_idx; + u64 slp_idx; +} __packed; + +extern struct list_head pool_list; +extern struct mutex pool_mutex; + +const char *rmr_get_cmd_name(enum rmr_msg_cmd_type cmd); + +struct rmr_pool *rmr_create_pool(const char *poolname, void *priv); +void free_pool(struct rmr_pool *pool); + +struct rmr_pool *rmr_find_pool_by_group_id(u32 group_id); +struct rmr_pool *rmr_find_pool(const char *poolname); +int rmr_pool_maps_to_buf(struct rmr_pool *pool, u8 *map_idx, u64 *slp_idx, + void *buf, size_t buflen, rmr_map_filter filter); +int rmr_pool_save_map(struct rmr_pool *pool, void *buf, size_t buflen, + bool test_only); + +static inline void rmr_pool_update_no_of_chunk(struct rmr_pool *pool) +{ + u64 calc_no_of_chunks = 0, old_no_of_chunks = pool->no_of_chunks; + + /* + * In include/linux/types.h + * + * "Linux always considers sectors to be 512 (SECTOR_SHIFT==9) bytes long independently + * of the devices real block size." + * + * mapped_size is saved in sectors. + */ + if (pool->mapped_size) { + calc_no_of_chunks = (pool->mapped_size >> (pool->chunk_size_shift - 9)); + + if (pool->chunk_size && + (pool->mapped_size << 9) % pool->chunk_size) + calc_no_of_chunks += 1; + } + + if (calc_no_of_chunks != pool->no_of_chunks) { + pool->no_of_chunks = calc_no_of_chunks; + pr_info("%s: For %s, no_of_chunks old (%llu), updated %llu\n", + __func__, pool->poolname, old_no_of_chunks, pool->no_of_chunks); + } +} + +/* + * rmr_pool_maps_append - Append a map to the dense maps array + * @pool: pool + * @map: map to add + * + * Context: Caller must hold maps_lock. + */ +static inline void rmr_pool_maps_append(struct rmr_pool *pool, + struct rmr_dirty_id_map *map) +{ + rcu_assign_pointer(pool->maps[pool->maps_cnt], map); + pool->maps_cnt++; +} + +/* + * rmr_pool_maps_swap_remove - Remove map at index @i using swap-with-last + * @pool: pool + * @i: index of the map in the map array to remove + * @map: the map being removed + * + * Description: + * Maintains the dense invariant: pool->maps[0:maps_cnt] has no NULL gaps. + * + * Context: Caller must hold maps_lock. 
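+ *
+ * Example: with maps_cnt == 3 and maps == {A, B, C}, removing index 0
+ * leaves maps == {C, B} and maps_cnt == 2; ordering is not preserved.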
+ */ +static inline void rmr_pool_maps_swap_remove(struct rmr_pool *pool, u8 i, + struct rmr_dirty_id_map *map) +{ + u8 last = pool->maps_cnt - 1; + + if (i != last) + rcu_assign_pointer(pool->maps[i], rcu_dereference_protected(pool->maps[last], + lockdep_is_held(&pool->maps_lock))); + + rcu_assign_pointer(pool->maps[last], NULL); + pool->maps_cnt--; +} + +static inline struct rmr_dirty_id_map *rmr_pool_find_map(struct rmr_pool *pool, u8 member_id) +{ + int i; + struct rmr_dirty_id_map *map; + struct rmr_dirty_id_map *res = NULL; + + rcu_read_lock(); + for (i = 0; i < pool->maps_cnt; i++) { + map = rcu_dereference(pool->maps[i]); + + if (WARN_ON(!map) || map->member_id != member_id) + continue; + + res = map; + break; + } + rcu_read_unlock(); + + return res; +} + +static inline int rmr_pool_remove_map(struct rmr_pool *pool, u8 member_id) +{ + int i; + struct rmr_dirty_id_map *mp; + struct rmr_dirty_id_map *map = NULL; + + pr_info("%s: pool %s is removing map for member_id %d\n", + __func__, pool->poolname, member_id); + + mutex_lock(&pool->maps_lock); + for (i = 0; i < pool->maps_cnt; i++) { + mp = rcu_dereference_protected(pool->maps[i], + lockdep_is_held(&pool->maps_lock)); + if (WARN_ON(!mp)) + continue; + if (mp->member_id == member_id) { + map = mp; + break; + } + } + + if (!map) { + mutex_unlock(&pool->maps_lock); + pr_err("%s: pool %s cannot find map for member_id %d\n", + __func__, pool->poolname, member_id); + return -EINVAL; + } + + /* Dirty map entries are also removed since the map no longer exists. */ + rmr_map_unset_dirty_all(map); + + rmr_pool_maps_swap_remove(pool, i, map); + synchronize_srcu(&pool->map_srcu); + + mutex_unlock(&pool->maps_lock); + + /* Free up the memory */ + rmr_map_destroy(map); + + return 0; +} + + +bool rmr_pool_change_state(struct rmr_pool *pool, enum rmr_pool_state new_state); + +void rmr_pool_confirm_inflight_ref(struct percpu_ref *ref); + +static inline u32 rmr_pool_hash(const char *poolname) +{ + return jhash(poolname, strlen(poolname), 0); +} + +#endif /* RMR_POOL_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-proto.h b/drivers/infiniband/ulp/rmr/rmr-proto.h new file mode 100644 index 000000000000..02c20ed76bef --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-proto.h @@ -0,0 +1,273 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_PROTO_H +#define RMR_PROTO_H + +#define RMR_PROTO_VER_MAJOR 0 +#define RMR_PROTO_VER_MINOR 1 + +#define RMR_PROTO_VER_STRING __stringify(RMR_PROTO_VER_MAJOR) "." \ + __stringify(RMR_PROTO_VER_MINOR) + +#ifndef RMR_VER_STRING +#define RMR_VER_STRING __stringify(RMR_PROTO_VER_MAJOR) "." 
\ + __stringify(RMR_PROTO_VER_MINOR) +#endif + +/* TODO: should be configurable */ +#define RTRS_PORT 1234 + +#define RMR_POOL_MAX_SESS 4 + +/** + * enum rmr_msg_types - RMR message types + * @RMR_MSG_JOIN_POOL: Join pool message from client to server + * @RMR_MSG_JOIN_POOL_RSP: Join pool messge response from server to client + * @RMR_MSG_LEAVE_POOL: Leave pool message from client to server + * @RMR_MSG_IO: IO(read/write) request on an object + */ +enum rmr_msg_type { + RMR_MSG_CMD, + RMR_MSG_CMD_RSP, + RMR_MSG_IO, + RMR_MSG_MD, + RMR_MSG_MAP_CLEAR, + RMR_MSG_MAP_ADD, +}; + +/** + * struct rmr_msg_hdr - header of RMR messages + * @type: Message type, valid values see: enum rmr_msg_types + */ +struct rmr_msg_hdr { + __le32 group_id; /* poolname jhash() */ + __le16 type; + __le16 __padding; +}; + +/** + * struct rmr_msg_io - message for object I/O read/write + * @hdr: message header + * @id_a: first 64bit of the object id + * @id_b: second 64bit of the object id + * @offset: offset from where to read/write + * @flags: bitmask, valid values are defined in enum rmr_io_flags + * @length: number of bytes for I/O read/write + * @pool_id: pool id to which the object belongs + */ +struct rmr_msg_io { + struct rmr_msg_hdr hdr; + __le64 id_a; + __le64 id_b; + + __le32 offset; + __le32 length; + __le32 flags; + __le16 prio; + + __le32 mem_id; + __le64 map_ver; + u8 failed_id[RMR_POOL_MAX_SESS]; + u8 failed_cnt; + + u8 member_id; + u8 sync; + u8 __padding[19]; //padding is not correct now i think +}; + +struct rmr_pool_member_info { + u8 no_of_stor; + + struct per_mem_info { + u8 member_id; + u8 c_dirty; + } p_mem_info[RMR_POOL_MAX_SESS]; +}; + +/** + * enum rmr_msg_cmd_types - RMR command types + * @RMR_CMD_MAP_READY: Get ready to receive map + * @RMR_CMD_MAP_SEND: Send map to certain node + * @RMR_CMD_MAP_DONE: Confirm map receipt + * + * When adding a command, + * make sure to add it to the function rmr_get_cmd_name. + */ +enum rmr_msg_cmd_type { + RMR_CMD_MAP_READY, // 0 + RMR_CMD_MAP_SEND, + RMR_CMD_SEND_MAP_BUF, + RMR_CMD_MAP_BUF_DONE, + RMR_CMD_MAP_DONE, + RMR_CMD_MAP_DISABLE, + RMR_CMD_READ_MAP_BUF, + RMR_CMD_MAP_CHECK, + RMR_CMD_LAST_IO_TO_MAP, + RMR_CMD_STORE_CHECK, + RMR_CMD_MAP_TEST, + /* sends the metadata of non-sync rmr-client to server */ + RMR_CMD_SEND_MD_BUF, + /*sends the message of discards to the node */ + RMR_CMD_SEND_DISCARD, + /* sends the message of md_update to the node; the node sends its srv_md back. 
*/ + RMR_CMD_MD_SEND, + + RMR_CMD_MAP_GET_VER, // 14 + RMR_CMD_MAP_SET_VER, + RMR_CMD_DISCARD_CLEAR_FLAG, + + /* + * Add map related commands above this + */ + RMR_MAP_CMD_MAX, + + RMR_CMD_POOL_INFO, // 18 + RMR_CMD_JOIN_POOL, + + RMR_CMD_REJOIN_POOL, + + RMR_CMD_LEAVE_POOL, + RMR_CMD_ENABLE_POOL, // 22 + + RMR_CMD_USER, + + /* + * Add pool related commands above this + */ + RMR_POOL_CMD_MAX, +}; + +struct rmr_msg_map_send_cmd { + u8 receiver_member_id; +}; + +struct rmr_msg_map_buf_cmd { + u64 version; + u8 map_idx; + u64 slp_idx; +}; + +struct rmr_msg_map_buf_done_cmd { + u64 map_version; +}; + +struct rmr_msg_map_done_cmd { + u8 enable; +}; + +struct rmr_msg_send_md_buf_cmd { + u8 sync; /* if the pool is sync or not */ + u8 sender_id; + u8 receiver_id; + u64 flags; +}; + +struct rmr_msg_send_discard_cmd { + u8 member_id; /* the storage node that discards all data */ +}; + +struct rmr_msg_md_send_cmd { + u64 src_mapped_size; /* the pool mapped size on the sending side */ + u8 sender_id; + u8 leader_id; + u8 read_full_md; /* 1 = return full pool_md; 0 = own entry only */ +}; + +struct rmr_msg_pool_info_cmd { + u8 member_id; + u8 operation; /* add/remove */ + u8 mode; /* For add -> create/assemble. For remove -> delete/disassemble */ + u8 dirty; /* Valid only when operation=ADD and mode=CREATE */ +}; + +enum rmr_pool_info_op { + RMR_POOL_INFO_OP_ADD = 0, + RMR_POOL_INFO_OP_REMOVE, +}; + +enum rmr_pool_info_mode { + RMR_POOL_INFO_MODE_CREATE = 0, + RMR_POOL_INFO_MODE_ASSEMBLE, + RMR_POOL_INFO_MODE_DELETE, + RMR_POOL_INFO_MODE_DISASSEMBLE, +}; + +struct rmr_msg_set_map_ver_cmd { + u8 map_ver; /* the map version to set */ +}; + +struct rmr_msg_join_pool_cmd { + u64 queue_depth; + u32 chunk_size; + struct rmr_pool_member_info mem_info; + u8 dirty; + u8 create; + u8 rejoin; +}; + +struct rmr_msg_leave_pool_cmd { + u8 member_id; + u8 delete; +}; + +struct rmr_msg_enable_pool_cmd { + u32 enable; +}; + +struct rmr_msg_user_cmd { + size_t usr_len; +}; + +struct rmr_msg_join_pool_cmd_rsp { + u64 mapped_size; + u32 chunk_size; +}; + +struct rmr_msg_pool_cmd { + struct rmr_msg_hdr hdr; + u8 ver; + u8 cmd_type; + u8 sync; + u8 rsvd[1]; + s8 pool_name[NAME_MAX]; + union { + struct rmr_msg_map_send_cmd map_send_cmd; + struct rmr_msg_map_buf_cmd map_buf_cmd; + struct rmr_msg_map_buf_done_cmd map_buf_done_cmd; + struct rmr_msg_map_done_cmd map_done_cmd; + + struct rmr_msg_send_md_buf_cmd send_md_buf_cmd; + struct rmr_msg_send_discard_cmd send_discard_cmd; + struct rmr_msg_md_send_cmd md_send_cmd; + + struct rmr_msg_pool_info_cmd pool_info_cmd; + + struct rmr_msg_set_map_ver_cmd set_map_ver_cmd; + + struct rmr_msg_join_pool_cmd join_pool_cmd; + + struct rmr_msg_leave_pool_cmd leave_pool_cmd; + struct rmr_msg_enable_pool_cmd enable_pool_cmd; + + struct rmr_msg_user_cmd user_cmd; + }; +}; + +struct rmr_msg_pool_cmd_rsp { + struct rmr_msg_hdr hdr; + enum rmr_msg_cmd_type cmd_type; + u8 err; + u8 ver; + u8 member_id; + union { + struct rmr_msg_join_pool_cmd_rsp join_pool_cmd_rsp; + u64 value; + }; +}; + +#endif /* RMR_PROTO_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-req.h b/drivers/infiniband/ulp/rmr/rmr-req.h new file mode 100644 index 000000000000..8f15b36fe480 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-req.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_REQ_H +#define RMR_REQ_H + +#include "rmr-pool.h" + +struct rmr_srv_req { + struct rmr_srv_pool *srv_pool; + rmr_id_t id; + + 
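+	/* I/O parameters, cf. the fields of the same name in struct rmr_msg_io */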
u32 offset; + u32 length; + u32 flags; + u16 prio; + + u32 mem_id; + struct rtrs_srv_op *rtrs_op; + struct rmr_srv_io_store *store; + void *data; + u32 datalen; //TODO: what is the difference between lenghth? + void (*endreq)(struct rmr_srv_req *, int err); + struct work_struct work; + int err; + u8 failed_cnt; + u8 failed_srv_id[RMR_POOL_MAX_SESS]; + u64 map_ver; + void *priv; + struct llist_node node; + bool from_sync; + struct scatterlist sg; + struct rmr_iu *iu; + struct rmr_srv_req *parent; + bool sync; + struct rcu_head rcu; +}; + +struct rmr_srv_req *rmr_srv_req_create(const struct rmr_msg_io *msg, + struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, + void *data, u32 datalen, + void (*endreq)(struct rmr_srv_req *, int)); +struct rmr_srv_req *rmr_srv_md_req_create(struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, void *data, + u32 offset, u32 len, unsigned long flags, + void (*endreq)(struct rmr_srv_req *, int)); +void rmr_req_submit(struct rmr_srv_req *req); +void rmr_md_req_submit(struct rmr_srv_req *req); +void rmr_srv_req_resp(struct rmr_srv_req *req, int err); +void rmr_srv_md_req_resp(struct rmr_srv_req *req, int err); +int rmr_srv_sync_chunk_id(struct rmr_srv_pool *srv_pool, struct rmr_map_entry *entry, + rmr_id_t id, bool from_sync); + +void rmr_process_wait_list(struct rmr_map_entry *entry, int err); + +struct rmr_map_entry_info { + rmr_id_t id; + u8 srv_id; +}; +#endif /* RMR_REQ_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr-srv.h b/drivers/infiniband/ulp/rmr/rmr-srv.h new file mode 100644 index 000000000000..a84586aa78bd --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-srv.h @@ -0,0 +1,219 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_SRV_H +#define RMR_SRV_H + +/* rmr-srv-sysfs.c */ + +#include +#include +#include +#include +#include + +#include "rmr-pool.h" + +/* + * IO store interface implemented by an upper-layer consumer of rmr-server. + * All consumer-specific types are passed as void * so RMR remains + * independent of any particular client. 
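+ *
+ * A consumer typically wires these ops up through rmr_srv_register()
+ * (declared below) and detaches again with rmr_srv_unregister(). Purely
+ * illustrative sketch, "my_store_ops" and "my_device" are made-up names:
+ *
+ *	pool = rmr_srv_register("pool0", &my_store_ops, my_device,
+ *				mapped_size, RMR_SRV_DISK_CREATE);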
+ */ +struct rmr_srv_store_ops { + int (*submit_req)(void *device, void *data, u32 offset, u32 length, + unsigned long flags, u16 prio, void *priv); + int (*submit_md_req)(void *device, void *data, u32 offset, u32 length, + unsigned long flags, void *priv); + int (*submit_cmd)(void *device, const void *usr_buf, int usr_len, + void *data, int datalen); + bool (*io_allowed)(void *store_priv); + int (*get_params)(void *device); +}; + +#define DEFAULT_SYNC_QUEUE_DEPTH 32 +#define RMR_SRV_CHECK_MAPS_INTERVAL_MS 3000 +#define RMR_SRV_MD_SYNC_INTERVAL_MS 500 +#define RMR_SRV_DISCARD_TIMEOUT_MS 500 + +/* Bit indices for srv_pool->md_dirty — used with set_bit / test_and_clear_bit */ +enum rmr_srv_md_dirty_bit { + MD_DIRTY_POOL, /* pool_md fields changed */ + MD_DIRTY_MAPS, /* map bitmap changed */ + MD_DIRTY_LAST_IO, /* last_io updated */ +}; + +extern struct kmem_cache *rmr_req_cachep; +extern struct kmem_cache *rmr_map_entry_cachep; + +enum rmr_srv_register_disk_mode { + RMR_SRV_DISK_CREATE, /* Fresh store, new pool */ + RMR_SRV_DISK_ADD, /* Rejoin an existing pool */ + RMR_SRV_DISK_REPLACE, /* Replace an existing store */ +}; + +/* + * When adding state, remember to add an entry in the function rmr_get_srv_pool_state_name() + */ +enum rmr_srv_pool_state { + RMR_SRV_POOL_STATE_EMPTY, + RMR_SRV_POOL_STATE_REGISTERED, + RMR_SRV_POOL_STATE_CREATED, + RMR_SRV_POOL_STATE_NORMAL, + RMR_SRV_POOL_STATE_NO_IO, +}; + +struct rmr_srv_pool { + u8 member_id; + refcount_t refcount; + atomic_t state; + bool maintenance_mode; + + struct rmr_pool *pool; + + /* Sync thread */ + struct task_struct *th_tsk; + atomic_t thread_state; + atomic_t in_flight_sync_reqs; + + struct rmr_srv_io_store *io_store; + struct mutex srv_pool_lock; + atomic_t store_state; + + bool marked_create; + bool marked_delete; + + unsigned long md_dirty; /* bitmask of dirty regions */ + unsigned long map_update_state; + /* The internal client pool assigned to this server pool. */ + struct rmr_pool *clt; + size_t queue_depth; + rmr_id_t *last_io; + /* + * Each storage node keeps a command array with the length of queue depth to track the IOs + * in the last round. Use an array of chunk indexes as a copy of srv_pool->last_io so that + * it can be written back to/read from backing store as needed. 
+ */ + rmr_id_t *last_io_idx; + + u32 max_sync_io_size; + struct workqueue_struct *clean_wq; + struct delayed_work clean_dwork; + + struct workqueue_struct *md_sync_wq; + struct delayed_work md_sync_dwork; + struct delayed_work last_io_sync_dwork; +}; + +/** + * rmr_srv_mark_pool_md_dirty() - Set MD_DIRTY_POOL and schedule delayed sync + * @srv_pool: Server pool with changed pool_md fields + */ +static inline void rmr_srv_mark_pool_md_dirty(struct rmr_srv_pool *srv_pool) +{ + set_bit(MD_DIRTY_POOL, &srv_pool->md_dirty); + mod_delayed_work(srv_pool->md_sync_wq, &srv_pool->md_sync_dwork, + msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS)); +} + +struct rmr_srv_sess { + struct list_head pool_sess_list; + struct rtrs_srv_sess *rtrs; + struct kobject kobj; + char sessname[NAME_MAX]; + struct mutex lock; + u8 ver; + struct xarray pools; + struct list_head g_list_entry; +}; + +struct rmr_srv_pool_sess { + struct list_head pool_entry; /* for pool->sess_list */ + struct list_head srv_sess_entry; + struct rmr_srv_pool *srv_pool; + struct kobject kobj; + char sessname[NAME_MAX]; + struct rmr_srv_sess *srv_sess; + bool sync; +}; + +struct rmr_srv_io_store { + struct rmr_srv_store_ops *ops; + void *priv; +}; + +struct rmr_cmd_work_info { + struct work_struct cmd_work; + struct rmr_pool *pool; + struct rmr_srv_sess *sess; + struct rtrs_srv_sess *rtrs; + const struct rmr_msg_pool_cmd *cmd_msg; + struct rmr_msg_pool_cmd_rsp *rsp; + struct rtrs_srv_op *rtrs_op; + void *data; + size_t datalen; +}; + +void rmr_put_srv_pool(struct rmr_srv_pool *srv_pool); +struct rmr_srv_pool *rmr_create_srv_pool(char *poolname, u32 member_id); +void rmr_srv_pool_update_params(struct rmr_pool *pool); +int rmr_srv_read_md(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, u32 offset, u32 len, + struct rmr_pool_md *pool_md_page); +int rmr_srv_send_md_update(struct rmr_pool *pool); +int rmr_srv_check_params(struct rmr_srv_pool *srv_pool); +void rmr_srv_mark_maps_dirty(struct rmr_srv_pool *srv_pool); + +/* rmr-srv-md.c */ +struct rmr_srv_req; /* forward decl for endreq prototype */ + +bool rmr_get_srv_pool(struct rmr_srv_pool *srv_pool); +void rmr_srv_endreq(struct rmr_srv_req *req, int err); + +int process_md_io(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, + u32 offset, u32 len, unsigned long flags, void *buf); +void rmr_srv_md_maps_sync(struct rmr_pool *pool); +void rmr_srv_flush_pool_md(struct rmr_srv_pool *srv_pool); +void rmr_srv_md_sync(struct work_struct *work); +int rmr_srv_md_process_buf(struct rmr_pool *pool, void *buf, bool sync); +int rmr_srv_refresh_md(struct rmr_srv_pool *srv_pool); + +/* rmr-srv-sysfs.c */ + +int rmr_srv_create_sysfs_files(void); +void rmr_srv_destroy_sysfs_files(void); +void rmr_srv_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self); +int rmr_srv_sysfs_add_sess(struct rmr_pool *pool, + struct rmr_srv_pool_sess *pool_sess); +void rmr_srv_sysfs_del_sess(struct rmr_srv_pool_sess *pool_sess); + +void rmr_srv_free_sync_permits(struct rmr_pool *pool); +void rmr_srv_destroy_pool(struct rmr_pool *pool); +int rmr_srv_remove_clt_pool(struct rmr_srv_pool *srv_pool); + +void rmr_srv_stop_sync_and_go_offline(struct rmr_pool *pool); + +int rmr_srv_get_sync_permit(struct rmr_srv_pool *srv_pool); +void rmr_srv_put_sync_permit(struct rmr_srv_pool *srv_pool); + +int rmr_srv_sync_thread_start(struct rmr_srv_pool *srv_pool); +int rmr_srv_sync_thread_stop(struct rmr_srv_pool *srv_pool); + +void rmr_srv_sync_req_failed(struct rmr_srv_pool *srv_pool); + +int rmr_srv_query(struct 
rmr_pool *pool, u64 mapped_size, struct rmr_attrs *attr); +/* register/unregister rmr-srv */ +struct rmr_pool *rmr_srv_register(char *poolname, struct rmr_srv_store_ops *ops, void *priv, + u64 mapped_size, enum rmr_srv_register_disk_mode mode); +void rmr_srv_unregister(char *poolname, bool delete); + +int rmr_srv_pool_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv, + const struct kvec *usr_vec, size_t nr, void *buf, int buf_len, + size_t size); +int rmr_srv_discard_id(struct rmr_pool *pool, u64 offset, u64 length, u8 member_id, bool sync); +void rmr_srv_replace_store(struct rmr_pool *pool); + +#endif /* RMR_SRV_H */ diff --git a/drivers/infiniband/ulp/rmr/rmr.h b/drivers/infiniband/ulp/rmr/rmr.h new file mode 100644 index 000000000000..72d591ccc047 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr.h @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef RMR_H +#define RMR_H + +#include +#include +#include + +#include "rmr-proto.h" +struct rmr_pool; + +typedef void (rmr_conf_fn)(void *priv, int errno); +enum rmr_wait_type { + NO_WAIT = RTRS_PERMIT_NOWAIT, + WAIT = RTRS_PERMIT_WAIT +}; + +/* + * Here goes RMR client API + */ + +/** + * inverse operation. decrements refcount + * and free if it reaches 0. + */ +void rmr_clt_put_pool(struct rmr_pool *pool); + +/** + * enum rmr_clt_link_ev - Events about connectivity state of a client + * @RMR_CLT_LINK_EV_RECONNECTED Client was reconnected. + * @RMR_CLT_LINK_EV_DISCONNECTED Client was disconnected. + */ +enum rmr_clt_link_ev { + RMR_CLT_LINK_EV_RECONNECTED, + RMR_CLT_LINK_EV_DISCONNECTED, +}; + +typedef void (rmr_clt_ev_fn)(void *priv, enum rmr_clt_link_ev ev); +/** + * rmr_clt_open() - Opens a pool from the RMR client + * @priv: User supplied private data. + * @link_ev: Event notification for connection state changes + * @priv: user supplied data that was passed to rmr_clt_open() + * @ev: Occurred event + * @poolname: name of the pool + * + * Only one user can open a pool at the same time. + * However administrative operations are possible. + * + * Return a valid pointer on success otherwise PTR_ERR. + */ +struct rmr_pool *rmr_clt_open(void *priv, rmr_clt_ev_fn *link_ev, const char *poolname); + +/** + returns the priv data that had been provided with open() +*/ +void *rmr_clt_get_priv(struct rmr_pool *pool); + +/** + * rmr_clt_close() - Closes a pool + * @pool: Pool handler, is freed on return + */ +void rmr_clt_close(struct rmr_pool *pool); + +#define RMR_OP_BITS 8 +#define RMR_OP_MASK ((1 << RMR_OP_BITS) - 1) + +/** + * enum rmr_io_flags - RMR request types from rq_flag_bits + * @RMR_OP_READ: read object + * @RMR_OP_WRITE: write object + * @RMR_OP_DISCARD: remove object + * @RMR_OP_SYNCREQ: sync request + * @RMR_OP_WRITE_ZEROES: write zeroes + * @RMR_OP_FLUSH: flush object + * @RMR_OP_MD_READ: read metadata of rmr + * @RMR_OP_MD_WRITE: write metadata of rmr + */ +enum rmr_io_flags { + /* Operations */ + RMR_OP_READ = 0, + RMR_OP_WRITE = 1, + RMR_OP_DISCARD = 2, + RMR_OP_SYNCREQ = 3, + RMR_OP_WRITE_ZEROES = 4, + RMR_OP_FLUSH = 5, + /* Add metadata related operations below this. 
*/ + RMR_OP_MD_READ = 6, + RMR_OP_MD_WRITE = 7, + + /* Flags */ + RMR_F_SYNC = 1 <<(RMR_OP_BITS + 0), // 0x100, 0b0100000000 + RMR_F_FUA = 1 <<(RMR_OP_BITS + 1), // 0x200, 0b1000000000 +}; + +static inline u32 rmr_op(u32 flag) +{ + return flag & RMR_OP_MASK; +} + +static inline u32 rmr_flags(u32 flag) +{ + return flag & ~RMR_OP_MASK; +} + +/** + * Something to keep the 128 bit block_id (a.k.a object_id) + */ +typedef struct { + u64 a; + u64 b; +} rmr_id_t; + +struct rmr_iu; + +/** + * rmr_clt_get_iu() - allocates iu for future RDMA operation + * @pool: Current pool + * @id: Id of the object/block + * @flag: READ/WRITE/REMOVE + * @wait: WAIT/NO_WAIT + * + * Description: + * Allocates iu for the following RDMA operation. Iu is used + * to preallocate all resources and to propagate memory pressure + * up earlier. + * + */ +struct rmr_iu *rmr_clt_get_iu(struct rmr_pool *pool, + enum rmr_io_flags flag, + enum rmr_wait_type wait); + +/** + * rmr_clt_put_iu() - puts allocated iu + * @pool: Current pool + * @id: Id of the object/block + * @flag: READ/WRITE/REMOVE + * @iu: Iu to be freed + * + * Context: + * Does not matter + */ +void rmr_clt_put_iu(struct rmr_pool *pool, struct rmr_iu *iu); + +/** + * rmr_clt_request() - Request data transfer to/from server via RDMA. + * + * + * @pool: The Pool + * @iu: Iu allocated by pevious rmr_clt_get_iu call. + * @offset: offset inside the object to read/write: + * @length: length of data starting from offset + * @flag: READ/WRITE/REMOVE + * @prio: priority of IO + * @priv: User provided data, passed back with corresponding + * @(conf) confirmation. + * @conf: callback function to be called as confirmation + * @sg: Pages to be sent/received to/from server. + * @sg_cnt: Number of elements in the @sg + * + * Return: + * 0: Success + * -EAGAIN: Currently there are no resources to execute the request. + * Retry again later. + * <0: Error + * + * On flag=READ rtrs client will request a data transfer from Server to client. + * The data that the server will respond with will be stored in @sg when + * the user confirmation function is called. + * On flag=WRITE rtrs client will rdma write data in sg to server side. 
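+ *
+ * Minimal write sketch (illustrative only; "my_done" and "my_ctx" are
+ * made-up names, error handling omitted):
+ *
+ *	iu = rmr_clt_get_iu(pool, RMR_OP_WRITE, WAIT);
+ *	err = rmr_clt_request(pool, iu, offset, length, RMR_OP_WRITE, 0,
+ *			      my_ctx, my_done, sg, sg_cnt);
+ *
+ * my_done(my_ctx, errno) is called as the confirmation; the iu is then
+ * released with rmr_clt_put_iu(pool, iu).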
+ */ +int rmr_clt_request(struct rmr_pool *pool, struct rmr_iu *iu, + size_t offset, size_t length, enum rmr_io_flags flag, unsigned short prio, + void *priv, rmr_conf_fn *conf, struct scatterlist *sg, unsigned int sg_cnt); + +int rmr_clt_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv, + const struct kvec *usr_vec, size_t nr, void *buf, int buf_len, + size_t size); + + +/** + * rmr_attrs - RMR pool attributes + */ +struct rmr_attrs { + u32 queue_depth; + u32 max_io_size; + u32 chunk_size; + u32 max_segments; + u64 rmr_md_size; /* in sectors */ + u8 sync; + struct kobject *pool_kobj; +}; + +/** + * rmr_clt_query() - queries RMR pool attributes + * + * Returns: + * 0 on success + * -EINVAL no session in the pool + */ +int rmr_clt_query(struct rmr_pool *pool, struct rmr_attrs *attr); + +typedef enum { + RMR_MAP_ADD, + RMR_MAP_REMOVE, +} rmr_map_cmd; + +#define RMR_STORE_ID_BITS 32 +#define RMR_STORE_ID_OFFSET (64 - RMR_STORE_ID_BITS) + +#define RMR_CHUNK_BITS 32 +#define RMR_CHUNK_OFFSET 0 + +enum rmr_pool_state { + RMR_POOL_STATE_CREATED = 0, + RMR_POOL_STATE_JOINED, + RMR_POOL_STATE_ONLINE, + /* maybe we will use this later */ + RMR_POOL_STATE_DEGRADED, + RMR_POOL_STATE_SYNCING, +}; + +#endif From 18a97ed859fadacf2d3772cb72070dac8f25efe2 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:14 +0200 Subject: [PATCH 02/13] RDMA/rmr: add shared library code (pool, map, request) Add the three source files that provide functionality shared by both the RMR client and the RMR server: rmr-pool.c pool refcounting, lookup and lifecycle helpers used by both client and server pool implementations. rmr-map.c dirty-map data structure used to track which blocks have not yet been replicated to a given pool member. rmr-req.c server-side request infrastructure that submits an I/O to an upper-layer store via struct rmr_srv_store_ops and propagates the completion back into RMR. These files are not compiled until the modules are wired into the build in a later patch in this series. Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/infiniband/ulp/rmr/rmr-map.c | 904 ++++++++++++++++++++++++++ drivers/infiniband/ulp/rmr/rmr-pool.c | 401 ++++++++++++ drivers/infiniband/ulp/rmr/rmr-req.c | 796 +++++++++++++++++++++++ 3 files changed, 2101 insertions(+) create mode 100644 drivers/infiniband/ulp/rmr/rmr-map.c create mode 100644 drivers/infiniband/ulp/rmr/rmr-pool.c create mode 100644 drivers/infiniband/ulp/rmr/rmr-req.c diff --git a/drivers/infiniband/ulp/rmr/rmr-map.c b/drivers/infiniband/ulp/rmr/rmr-map.c new file mode 100644 index 000000000000..f4b7dd7c3b50 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-map.c @@ -0,0 +1,904 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#include + +#include "rmr-map.h" +#include "rmr-pool.h" + +void rmr_map_update_page_params(struct rmr_dirty_id_map *map) +{ + unsigned long remaining_chunks; + + map->no_of_flp = (map->no_of_chunks >> CHUNKS_PER_FLP_LOG2); + + /* + * If the number of chunks are not completely filling an FLP (CHUNKS_PER_FLP), + * then the remaining would be tracked by the next FLP. Thus the next FLP would + * have unused SLP pointers. We will calculate the number of SLP slots which will + * be used in the last FLP. + */ + remaining_chunks = map->no_of_chunks & (CHUNKS_PER_FLP - 1); + if (!remaining_chunks) { + /* + * If there are no remaining chunks, then the last FLP is completely full. 
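+		 * (That is, no_of_chunks is an exact multiple of CHUNKS_PER_FLP.)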
+ */ + map->no_of_slp_in_last_flp = NO_OF_SLP_PER_FLP; + map->no_of_chunk_in_last_slp = NO_OF_CHUNKS_PER_PAGE; + } else { + /* + * If there are remaining chunks, then we add another FLP for it. + * This FLP will not be full, hence we calculate the number of SLP slots + * that will be used. + */ + map->no_of_flp += 1; + map->no_of_slp_in_last_flp = (remaining_chunks >> CHUNKS_PER_SLP_LOG2); + + /* + * Same as above. It could be that the number of chunks do not fit neatly + * in the last SLP (CHUNKS_PER_SLP), and the remaining ones end up in the + * SLP with remaining chunk slots. + */ + remaining_chunks &= (CHUNKS_PER_SLP - 1); + if (!remaining_chunks) { + /* + * If there are no remaining chunks, then the last SLP is completely full. + */ + map->no_of_chunk_in_last_slp = CHUNKS_PER_SLP; + } else { + /* + * If there are remaining chunks, then we add another SLP. + */ + map->no_of_slp_in_last_flp += 1; + map->no_of_chunk_in_last_slp = remaining_chunks; + } + } + + map->total_slp = ((map->no_of_flp - 1) * NO_OF_SLP_PER_FLP) + map->no_of_slp_in_last_flp; +} + +static void rmr_map_update_map_params(struct rmr_pool *pool, struct rmr_dirty_id_map *map) +{ + map->no_of_chunks = pool->no_of_chunks; + + rmr_map_update_page_params(map); + + pr_info("%s: Chunks info %u, %u, %u, %llu\n", + __func__, pool->chunk_size, ilog2(pool->chunk_size), + pool->chunk_size_shift, map->no_of_chunks); + pr_info("%s: FLPs %llu, SLPs in last FLP %llu, Total SLPs %llu, chunks in last SLP %llu\n", + __func__, map->no_of_flp, map->no_of_slp_in_last_flp, map->total_slp, + map->no_of_chunk_in_last_slp); + pr_info("%s: Dirty map size %lldB\n", __func__, (map->total_slp * PAGE_SIZE)); +} + +static int rmr_map_allocate_pages(struct rmr_pool *pool, struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + u64 no_of_slps; + int i, j; + + for (i = 0; i < map->no_of_flp;) { + map->dirty_bitmap[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!map->dirty_bitmap[i]) + goto err_alloc; + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + /* + * Move the increment to here, so that later in err_alloc: if we have to free, + * the index i, is pointing in the correct position. + */ + i++; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + *flp_ptr = get_zeroed_page(GFP_KERNEL); + if (!*flp_ptr) + goto err_alloc; + } + } + + // TODO remove this + map->bitmap_filter = kcalloc(pool->no_of_chunks, sizeof(*map->bitmap_filter), GFP_KERNEL); + if (!map->bitmap_filter) + goto err_alloc; + + return 0; + +err_alloc: + for (--i; i >= 0; i--) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + for (--j; j >= 0; j--) + free_page((unsigned long)*(flp_ptr + j)); + + j = NO_OF_SLP_PER_FLP; + free_page((unsigned long)map->dirty_bitmap[i]); + } + + return -ENOMEM; +} + +struct rmr_dirty_id_map *rmr_map_create(struct rmr_pool *pool, u8 member_id) +{ + struct rmr_dirty_id_map *map = NULL; + int ret; + + pr_info("%s: Creating map for member_id %u, in pool %s. 
Existing map_cnt %u\n", + __func__, member_id, pool->poolname, pool->maps_cnt); + + if (!pool->no_of_chunks) { + pr_err("%s: dirty map size cannot be zero\n", __func__); + return ERR_PTR(-EINVAL); + } + + mutex_lock(&pool->maps_lock); + + /* + * Don't create if already exists + */ + map = rmr_pool_find_map(pool, member_id); + if (map != NULL) { + pr_err("Map with member_id %u already exists\n", member_id); + ret = -EEXIST; + goto err_unlock; + } + + if (pool->maps_cnt >= RMR_POOL_MAX_SESS) { + pr_err("pool %s can not create new map, max number of sessions %d achieved\n", + pool->poolname, RMR_POOL_MAX_SESS); + ret = -EINVAL; + goto err_unlock; + } + + /* + * Allocate memory and init the structure + */ + map = (struct rmr_dirty_id_map *)get_zeroed_page(GFP_KERNEL); + if (!map) { + pr_err("cannot allocate map for member_id %u\n", member_id); + ret = -ENOMEM; + goto err_unlock; + } + rmr_map_update_map_params(pool, map); + + ret = rmr_map_allocate_pages(pool, map); + if (ret) { + pr_err("cannot allocate memory for member_id %u\n", member_id); + goto err_map; + } + + xa_init_flags(&map->rmr_id_map, XA_FLAGS_ALLOC); + map->member_id = member_id; + map->ts = jiffies; + + rmr_pool_maps_append(pool, map); + + mutex_unlock(&pool->maps_lock); + + return map; + +err_map: + free_page((unsigned long)map); +err_unlock: + mutex_unlock(&pool->maps_lock); + return ERR_PTR(ret); +} + +void rmr_map_destroy(struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + int i, j; + u64 no_of_slps; + + WARN_ON(!xa_empty(&map->rmr_id_map)); + map->ts = jiffies; + + pr_info("%s: member_id %u\n", __func__, map->member_id); + kfree(map->bitmap_filter); + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++) + free_page((unsigned long)*(flp_ptr + j)); + + free_page((unsigned long)map->dirty_bitmap[i]); + } + + free_page((unsigned long)map); +} + +/** + * rmr_map_calc_chunk - Calculate chunk number from offset and length of IO + * + * @pool: The pool + * @offset: Offset of the IO + * @length: Length of the IO + * @id: rmr_id_t where to populate the chunk details + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * + * For example: + * if id.a is 1, only id.b is dirty. + * if id.a is 2, id.b and (id.b+1) is dirty + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +void rmr_map_calc_chunk(struct rmr_pool *pool, size_t offset, size_t length, rmr_id_t *id) +{ + u64 off_len = offset + length; + + id->b = GET_CHUNK_NUMBER(offset, pool->chunk_size_shift); + id->a = GET_FOLLOWING_CHUNKS(off_len, pool->chunk_size_shift, id->b); +} + +/** + * rmr_get_chunk_md_from_id - Get the chunk metadata byte from rmr_id_t + * + * @map: The map to work on + * @id: rmr_id_t to use to get the chunk metadata byte + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline u8 *rmr_get_chunk_md_from_id(struct rmr_dirty_id_map *map, rmr_id_t id) +{ + unsigned long idb_slp, idb_slp_index, idb_chunk; + el_flp *flp_ptr; + u8 *slp, *chunk_md; + + /* + * First get the pointer to first level page (FLP). + * To get that, we need to find which first level page the chunk belongs, and it can + * be found by dividing the chunk number by the maximum number of chunks 1 FLP can track. + * + * After that we need to adjust the id.b to go one level down. 
This is because we just + * moved to the desired FLP, and hence that portion of id.b can be dropped. + * For this we do the modulo with CHUNKS_PER_FLP. + */ + flp_ptr = (el_flp *)(map->dirty_bitmap[id.b >> CHUNKS_PER_FLP_LOG2]); + idb_slp = id.b & (CHUNKS_PER_FLP - 1); + + /* + * Now we need to move to the second level page (SLP). + * The addresses to SLPs are stored in the FLP as a list of addresses. Hence we calculate + * the desired slp index which has the address to the SLP our chunk md resides in. + * + * We then adjust our flp_ptr according to the index. + * Note that flp_ptr is of type el_flp (flp element), which is unsigned long, since + * addresses are of that data type. This lets us move to the slp index easily. + */ + idb_slp_index = idb_slp >> CHUNKS_PER_SLP_LOG2; + flp_ptr += idb_slp_index; + + /* + * The location pointed by flp_ptr is storing the address to the SLP we want to move to. + * So we dereference it first, and then cast it to relevant pointer (to the chunk metadata + * data type, which is u8). + * + * The last step it to move to the correct chunk metadata in the SLP. + * + * Each SLP can store metadata for CHUNKS_PER_SLP chunks. So we adjust the idb_slp + * accordingly. And then move our slp pointer to the correct chunk metadata byte. + */ + slp = (u8 *)(*flp_ptr); + idb_chunk = idb_slp & (CHUNKS_PER_SLP - 1); + chunk_md = slp + idb_chunk; + + return chunk_md; +} + +static bool rmr_chunk_md_check_dirty(u8 *chunk_md) +{ + return (*chunk_md) & (0x1 << CHUNK_DIRTY_BIT); +} + +static void rmr_chunk_md_set_dirty(u8 *chunk_md) +{ + *chunk_md |= (0x1 << CHUNK_DIRTY_BIT); +} + +static void rmr_chunk_md_unset_dirty(u8 *chunk_md) +{ + *chunk_md &= ~(0x1 << CHUNK_DIRTY_BIT); +} + +/** + * rmr_map_set_dirty - Set bits from rmr_id_t + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * @filter: Filter to add to entry + * + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline void rmr_map_set_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, u8 filter) +{ + u8 *chunk_md; + u64 i; + + map->ts = jiffies; + + chunk_md = rmr_get_chunk_md_from_id(map, id); + for (i = 0; i < id.a; i++) { + rmr_chunk_md_set_dirty(chunk_md); + chunk_md++; + } +} + +inline void rmr_map_set_dirty_all(struct rmr_dirty_id_map *map, u8 filter) +{ + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks; + bool is_last_flp; + u8 *slp; + int i, j, k; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + is_last_flp = (i == (map->no_of_flp - 1)); + + if (is_last_flp) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (u8 *)(*flp_ptr); + + if (is_last_flp && j == (no_of_slps - 1)) + no_of_chunks = map->no_of_chunk_in_last_slp; + else + no_of_chunks = NO_OF_CHUNKS_PER_PAGE; + + for (k = 0; k < no_of_chunks; k++, slp++) + rmr_chunk_md_set_dirty(slp); + } + } +} + +/** + * rmr_map_unset_dirty - Clear bits from rmr_id_t, and free entry if any + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * @filter: Filter to add to entry + * + * Description: + * This version can be used by both client and server. + * If entry is found, the function frees it. 
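+ * (Strictly speaking, the entry is only erased from the map's xarray and
+ * returned to the caller, which then owns freeing it.)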
+ * Clears the bit using info from the given rmr_id_t + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline struct rmr_map_entry *rmr_map_unset_dirty(struct rmr_dirty_id_map *map, rmr_id_t id, + u8 filter) +{ + struct rmr_map_entry *entry; + u8 *chunk_md; + u64 i; + + map->ts = jiffies; + + chunk_md = rmr_get_chunk_md_from_id(map, id); + BUG_ON(!chunk_md); + for (i = 0; i < id.a; i++) { + rmr_chunk_md_unset_dirty(chunk_md); + chunk_md++; + } + + entry = xa_erase(&map->rmr_id_map, rmr_id_to_key(id)); + if (!entry) { + pr_debug("in the member_id %d there is no entry for id [%llu, %llu]\n", + map->member_id, id.a, id.b); + } + + return entry; +} + +/* + * rmr_map_check_dirty - Check if the following bits are set or not + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline bool rmr_map_check_dirty(struct rmr_dirty_id_map *map, rmr_id_t id) +{ + u8 *chunk_md; + + chunk_md = rmr_get_chunk_md_from_id(map, id); + return rmr_chunk_md_check_dirty(chunk_md); +} + +/** + * rmr_map_get_dirty_entry - Check and return entry if the following bits are set + * + * @map: Map to work on + * @id: rmr_id_t containing the chunk info + * id.b: chunk number denoted by this entry + * id.a: Number of chunks dirty starting (and including) id.b + * + * Description: + * Check if a chunk is dirty or not. + * If the particular chunk is dirty, then create an entry for it and return back. + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline struct rmr_map_entry *rmr_map_get_dirty_entry(struct rmr_dirty_id_map *map, rmr_id_t id) +{ + struct rmr_map_entry *entry; + int err; + + if (rmr_map_check_dirty(map, id)) { + entry = xa_load(&map->rmr_id_map, rmr_id_to_key(id)); + if (entry) { + pr_debug("%s: For id [%llu, %llu], entry exists member_id %u\n", + __func__, id.a, id.b, map->member_id); + return entry; + } + + entry = kmem_cache_zalloc(rmr_map_entry_cachep, GFP_KERNEL); + if (!entry) { + pr_err("%s: Cannot allocate entry for member_id %d, id [[%llu, %llu]]\n", + __func__, map->member_id, id.a, id.b); + return ERR_PTR(-ENOMEM); + } + + atomic_set(&entry->sync_cnt, -1); + init_llist_head(&entry->wait_list); + + err = xa_insert(&map->rmr_id_map, rmr_id_to_key(id), entry, GFP_KERNEL); + if (err == 0) + return entry; + + kmem_cache_free(rmr_map_entry_cachep, entry); + + if (err == -EBUSY) + return xa_load(&map->rmr_id_map, rmr_id_to_key(id)); + else + return ERR_PTR(-ENOMEM); + } + + return NULL; +} + +/** + * rmr_map_clear_filter_all - Clear filter for entire bitmap + * + * @map: Map to work on + * @filter: Filter to be cleared + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline void rmr_map_clear_filter_all(struct rmr_dirty_id_map *map, u8 filter) +{ + u64 i; + + for (i = 0; i < map->no_of_chunks; i++) + map->bitmap_filter[i] &= ~filter; +} + +/** + * rmr_map_unset_dirty_all - Clear all chunk bits (the entire map) + * + * @map: Map to work on + * + * Context: + * srcu pool->map_srcu should be held while calling this function. 
+ */ +inline void rmr_map_unset_dirty_all(struct rmr_dirty_id_map *map) +{ + rmr_id_t id; + u64 i; + + /* + * TODO: memcpy zeroes or something faster + */ + + id.a = 1; + for (i = 0; i < map->no_of_chunks; i++) { + id.b = i; + + if (!rmr_map_check_dirty(map, id)) + continue; + + rmr_map_unset_dirty(map, id, MAP_NO_FILTER); + } + + rmr_map_clear_filter_all(map, MAP_ENTRY_UNSYNCED); +} + +/** + * rmr_map_empty - Check if there are any chunks dirty + * + * @map: Map to work on + * + * Return: + * True: If map is empty + * False: Otherwise + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +inline bool rmr_map_empty(struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks; + bool is_last_flp; + u8 *slp; + int i, j, k; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + is_last_flp = (i == (map->no_of_flp - 1)); + + if (is_last_flp) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (u8 *)(*flp_ptr); + + if (is_last_flp && j == (no_of_slps - 1)) + no_of_chunks = map->no_of_chunk_in_last_slp; + else + no_of_chunks = NO_OF_CHUNKS_PER_PAGE; + + for (k = 0; k < no_of_chunks; k++, slp++) { + if (rmr_chunk_md_check_dirty(slp)) + return false; + } + } + } + + return true; +} + +inline void rmr_map_bitwise_or_buf(void *dst_buf, void *src_buf, u32 buf_size) +{ + u8 *src_byte, *dst_byte; + + src_byte = src_buf; + dst_byte = dst_buf; + + while (buf_size--) + *(dst_byte + buf_size) |= *(src_byte + buf_size); +} + +inline int rmr_map_create_entries(struct rmr_dirty_id_map *map) +{ + struct rmr_map_entry *entry; + rmr_id_t id; + int err; + u64 i; + + id.a = 1; + for (i = 0; i < map->no_of_chunks; i++) { + id.b = i; + + if (!rmr_map_check_dirty(map, id)) + continue; + + if (xa_load(&map->rmr_id_map, rmr_id_to_key(id))) + continue; + + entry = kmem_cache_zalloc(rmr_map_entry_cachep, GFP_KERNEL); + if (!entry) { + pr_err("%s: Cannot allocate entry for member_id %d, chunk %llu\n", + __func__, map->member_id, i); + return -ENOMEM; + } + + atomic_set(&entry->sync_cnt, -1); + init_llist_head(&entry->wait_list); + + pr_debug("%s: Adding entry %p for chunk %llu\n", + __func__, entry, i); + + err = xa_insert(&map->rmr_id_map, rmr_id_to_key(id), entry, GFP_KERNEL); + if (err) { + pr_err("%s: Cannot insert entry for member_id %d, chunk %llu\n", + __func__, map->member_id, i); + return err; + } + } + + return 0; +} + +/** + * rmr_map_slps_to_buf - Copy SLPs to given buf + * + * @map: Map to work on + * @slp_idx: SLP number to start copying from + * @no_of_slp: Number of SLPs to copy + * @buf: Buffer to copy SLPs to + * + * Context: + * srcu pool->map_srcu should be held while calling this function. 
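+ *
+ * Note:
+ * @buf must provide at least @no_of_slp * PAGE_SIZE bytes, since one
+ * full page is copied per SLP.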
+ */ +void rmr_map_slps_to_buf(struct rmr_dirty_id_map *map, u64 slp_idx, u64 no_of_slp, u8 *buf) +{ + el_flp *flp_ptr; + u64 slp_no, flp_no, i = 0; + void *slp; + + flp_no = slp_idx >> NO_OF_SLP_PER_FLP_LOG2; + slp_no = slp_idx & (NO_OF_SLP_PER_FLP - 1); + + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + while (i < no_of_slp) { + slp = (void *)(*(flp_ptr + slp_no)); + + memcpy(buf, slp, PAGE_SIZE); + buf += PAGE_SIZE; + + slp_no++; + if (slp_no >= NO_OF_SLP_PER_FLP) { + flp_no += 1; + slp_no = 0; + + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + } + + i++; + } + + return; +} + +/** + * rmr_map_buf_to_slps - Copy data from buf to SLPs + * + * @map: Map to work on + * @buf: Buffer from which to copy data + * @buf_size: Buffer size + * @slp_idx: SLP number to start copying to + * @test: Whether to compare data or copy + * + * Return: + * Number of SLPs to which data was copied. + * 0 in case of failure. + * + * Context: + * srcu pool->map_srcu should be held while calling this function. + */ +u64 rmr_map_buf_to_slps(struct rmr_dirty_id_map *map, u8 *buf, u32 buf_size, u64 slp_idx, + bool test) +{ + el_flp *flp_ptr; + u64 slp_no, flp_no, i = 0; + u64 no_of_slp; + void *slp; + + /* + * The buf_size should be a factor of PAGE_SIZE + */ + if (buf_size % PAGE_SIZE) { + pr_info("%s: Failed %u\n", __func__, buf_size); + return 0; + } + + no_of_slp = buf_size >> PAGE_SHIFT; + + flp_no = slp_idx >> NO_OF_SLP_PER_FLP_LOG2; + slp_no = slp_idx & (NO_OF_SLP_PER_FLP - 1); + + pr_info("%s: no_of_slp=%llu, flp_no=%llu, slp_no=%llu, slp_idx=%llu\n", + __func__, no_of_slp, flp_no, slp_no, slp_idx); + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + while (i < no_of_slp) { + slp = (void *)(*(flp_ptr + slp_no)); + + if (test && memcmp(slp, buf, PAGE_SIZE)) { + pr_info("%s: Compare failed\n", __func__); + return 0; + } else if (!test) { + memcpy(slp, buf, PAGE_SIZE); + } + buf += PAGE_SIZE; + + slp_no++; + if (slp_no >= NO_OF_SLP_PER_FLP) { + flp_no += 1; + slp_no = 0; + + flp_ptr = (el_flp *)map->dirty_bitmap[flp_no]; + } + + i++; + } + + return no_of_slp; +} + +void rmr_map_hexdump_bitmap_buf(u8 member_id, void *buf, u32 buf_size) +{ + u8 *buf_byte; + u32 size = 0; + + buf_byte = buf; + + pr_info("%s: Starting bitmap dump for member %u in hex, size %u\n", + __func__, member_id, buf_size); + pr_info("---------------------------------------------------------\n"); + while (size < buf_size) { + pr_cont("%02X", *(buf_byte + size)); + size++; + } + + pr_info("\n"); +} + +void rmr_map_dump_bitmap(struct rmr_dirty_id_map *map) +{ + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks; + bool is_last_flp; + u8 *slp; + int i, j; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + is_last_flp = (i == (map->no_of_flp - 1)); + + if (is_last_flp) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (u8 *)(*flp_ptr); + + if (is_last_flp && j == (no_of_slps - 1)) + no_of_chunks = map->no_of_chunk_in_last_slp; + else + no_of_chunks = NO_OF_CHUNKS_PER_PAGE; + + /* Each chunk is represented by a byte */ + rmr_map_hexdump_bitmap_buf(map->member_id, slp, no_of_chunks); + } + } +} + +/** + * rmr_map_summary_format - Format a per-member dirty-chunk summary into buf + * + * @pool: Pool whose maps to summarise + * @buf: Output buffer (must be at least @buf_size bytes) + * @buf_size: Size of @buf in bytes + * + * Description: + * Output format (one line per member that has a map): + * member : [ ...] 
/ dirty + * At most 50 dirty chunk indices are listed per member; if there + * are more, a "..." marker appears before the closing bracket. + * + * Context: caller must hold srcu pool->map_srcu. + * + * Return: number of bytes written (excluding trailing NUL). + */ +int rmr_map_summary_format(struct rmr_pool *pool, char *buf, size_t buf_size) +{ + struct rmr_dirty_id_map *map; + el_flp *flp_ptr; + u64 no_of_slps, no_of_chunks_in_slp; + u64 chunk_idx, dirty_count; + bool is_last_flp; + u8 *slp; + int printed_ids; + int pos = 0; + int i, fi, si; + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + pos += scnprintf(buf + pos, buf_size - pos, + "member %u: [", map->member_id); + + dirty_count = 0; + chunk_idx = 0; + printed_ids = 0; + for (fi = 0; fi < map->no_of_flp; fi++) { + flp_ptr = (el_flp *)map->dirty_bitmap[fi]; + is_last_flp = (fi == (map->no_of_flp - 1)); + no_of_slps = is_last_flp ? + map->no_of_slp_in_last_flp : NO_OF_SLP_PER_FLP; + + for (si = 0; si < no_of_slps; si++, flp_ptr++) { + u64 ci; + + slp = (u8 *)(*flp_ptr); + no_of_chunks_in_slp = + (is_last_flp && si == (no_of_slps - 1)) ? + map->no_of_chunk_in_last_slp : + NO_OF_CHUNKS_PER_PAGE; + + for (ci = 0; ci < no_of_chunks_in_slp; + ci++, chunk_idx++) { + if (!(slp[ci] & (1 << CHUNK_DIRTY_BIT))) + continue; + dirty_count++; + /* Cap listed IDs to fit all members in PAGE_SIZE */ + if (printed_ids < 50) { + pos += scnprintf(buf + pos, + buf_size - pos, + "%llu ", chunk_idx); + printed_ids++; + } + } + } + } + + /* Overwrite trailing space before ']' */ + if (pos > 0 && buf[pos - 1] == ' ') + pos--; + if (printed_ids < dirty_count) + pos += scnprintf(buf + pos, buf_size - pos, + "...] %llu/%llu dirty\n", + dirty_count, map->no_of_chunks); + else + pos += scnprintf(buf + pos, buf_size - pos, + "] %llu/%llu dirty\n", + dirty_count, map->no_of_chunks); + } + + return pos; +} + +void rmr_map_bidump_bitmap_buf(void *buf, u8 member_id, u32 buf_long) +{ + char box[65]; + u64 *buf_byte; + u64 the_byte; + int i, j; + u32 count = 0; + + buf_byte = buf; + + pr_info("%s: bitmap for member %d dump in binary, the size in longs %u\n", + __func__, member_id, buf_long); + while (count < buf_long) { + the_byte = *(buf_byte + count); + for (i = 63, j = 0; i >= 0; i--, j++) + box[j] = (the_byte & (1ULL << i)) ? 
'1' : '0'; + box[j] = '\0'; + pr_cont("[%s]", box); + count++; + } + + pr_info("\n"); + pr_info("---------------------------------------------------------\n"); +} diff --git a/drivers/infiniband/ulp/rmr/rmr-pool.c b/drivers/infiniband/ulp/rmr/rmr-pool.c new file mode 100644 index 000000000000..5e5632d9d701 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-pool.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#include +#include +#include + +#include "rmr-pool.h" + +LIST_HEAD(pool_list); +DEFINE_MUTEX(pool_mutex); /* mutex to protect pool_list */ +struct kmem_cache *rmr_map_entry_cachep; + +const char *rmr_get_cmd_name(enum rmr_msg_cmd_type cmd) +{ + switch (cmd) { + case RMR_CMD_MAP_READY: return "RMR_CMD_MAP_READY"; + case RMR_CMD_MAP_SEND: return "RMR_CMD_MAP_SEND"; + case RMR_CMD_SEND_MAP_BUF: return "RMR_CMD_SEND_MAP_BUF"; + case RMR_CMD_MAP_BUF_DONE: return "RMR_CMD_MAP_BUF_DONE"; + case RMR_CMD_MAP_DONE: return "RMR_CMD_MAP_DONE"; + case RMR_CMD_MAP_DISABLE: return "RMR_CMD_MAP_DISABLE"; + case RMR_CMD_READ_MAP_BUF: return "RMR_CMD_READ_MAP_BUF"; + case RMR_CMD_MAP_CHECK: return "RMR_CMD_MAP_CHECK"; + case RMR_CMD_LAST_IO_TO_MAP: return "RMR_CMD_LAST_IO_TO_MAP"; + case RMR_CMD_STORE_CHECK: return "RMR_CMD_STORE_CHECK"; + case RMR_CMD_MAP_TEST: return "RMR_CMD_MAP_TEST"; + case RMR_CMD_SEND_MD_BUF: return "RMR_CMD_SEND_MD_BUF"; + case RMR_CMD_MD_SEND: return "RMR_CMD_MD_SEND"; + + case RMR_CMD_MAP_GET_VER: return "RMR_CMD_MAP_GET_VER"; + case RMR_CMD_MAP_SET_VER: return "RMR_CMD_MAP_SET_VER"; + case RMR_CMD_DISCARD_CLEAR_FLAG: return "RMR_CMD_DISCARD_CLEAR_FLAG"; + case RMR_CMD_SEND_DISCARD: return "RMR_CMD_SEND_DISCARD"; + + case RMR_MAP_CMD_MAX: return "RMR_MAP_CMD_MAX"; + + case RMR_CMD_POOL_INFO: return "RMR_CMD_POOL_INFO"; + case RMR_CMD_JOIN_POOL: return "RMR_CMD_JOIN_POOL"; + + case RMR_CMD_REJOIN_POOL: return "RMR_CMD_REJOIN_POOL"; + + case RMR_CMD_LEAVE_POOL: return "RMR_CMD_LEAVE_POOL"; + case RMR_CMD_ENABLE_POOL: return "RMR_CMD_ENABLE_POOL"; + + case RMR_CMD_USER: return "RMR_CMD_USER"; + + case RMR_POOL_CMD_MAX: return "RMR_POOL_CMD_MAX"; + + default: return "Unknown command"; + } +} + +void free_pool(struct rmr_pool *pool) +{ + WARN_ON(!list_empty(&pool->sess_list)); + + cleanup_srcu_struct(&pool->sess_list_srcu); + cleanup_srcu_struct(&pool->map_srcu); + + if (!list_empty(&pool->entry)) { + mutex_lock(&pool_mutex); + list_del(&pool->entry); + mutex_unlock(&pool_mutex); + } + + percpu_ref_exit(&pool->ids_inflight_ref); + kfree(pool); +} + +/** + * rmr_find_pool_by_group_id - Find a pool with group_id in global pool list + * + * @group_id: Group_id of the pool being searched + * + * Locks: + * Caller should hold global pool_mutex + */ +struct rmr_pool *rmr_find_pool_by_group_id(u32 group_id) +{ + struct rmr_pool *pool; + + list_for_each_entry(pool, &pool_list, entry) + if (pool->group_id == group_id) + return pool; + + return NULL; +} + +/** + * rmr_find_pool - Find a pool named poolname in the global pool list + * + * @poolname: Name of the pool to be searched + * + * Locks: + * Caller must hold global pool_mutex + */ +struct rmr_pool *rmr_find_pool(const char *poolname) +{ + struct rmr_pool *pool; + + lockdep_assert_held(&pool_mutex); + + list_for_each_entry(pool, &pool_list, entry) { + if (!strcmp(poolname, pool->poolname)) + return pool; + } + + return NULL; +} + +static void rmr_pool_inflight_ref_release(struct percpu_ref *ref) +{ + struct rmr_pool *pool = 
container_of(ref, struct rmr_pool, ids_inflight_ref); + + complete_all(&pool->complete_done); +} + +void rmr_pool_confirm_inflight_ref(struct percpu_ref *ref) +{ + struct rmr_pool *pool = container_of(ref, struct rmr_pool, ids_inflight_ref); + + complete_all(&pool->confirm_done); +} + +static struct rmr_pool *alloc_pool(const char *poolname, u32 group_id) +{ + struct rmr_pool *pool; + int ret; + + pr_debug("%s: allocate pool %s with group_id %u\n", + __func__, poolname, group_id); + + if (strlen(poolname) > NAME_MAX) { + pr_err("%s: Failed to create '%s': name too long\n", __func__, poolname); + return ERR_PTR(-EINVAL); + } + + pool = kzalloc(sizeof(struct rmr_pool), GFP_KERNEL); + if (unlikely(!pool)) + return ERR_PTR(-ENOMEM); + + ret = init_srcu_struct(&pool->sess_list_srcu); + if (ret) { + pr_err("%s: Sess list srcu init failed, err: %d\n", __func__, ret); + pool = ERR_PTR(ret); + goto free_pool; + } + + ret = init_srcu_struct(&pool->map_srcu); + if (ret) { + pr_err("%s: Map srcu init failed, err: %d\n", __func__, ret); + pool = ERR_PTR(ret); + goto cleanup_sess_srcu; + } + + ret = percpu_ref_init(&pool->ids_inflight_ref, + rmr_pool_inflight_ref_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (ret) { + pr_err("%s: Percpu reference init failed for pool %s\n", __func__, poolname); + pool = ERR_PTR(ret); + goto cleanup_map_srcu; + } + + pool->group_id = group_id; + pool->map_ver = 1; + pool->mapped_size = 0; + xa_init_flags(&pool->stg_members, XA_FLAGS_ALLOC); + init_completion(&pool->complete_done); + init_completion(&pool->confirm_done); + mutex_init(&pool->sess_lock); + mutex_init(&pool->maps_lock); + INIT_LIST_HEAD(&pool->entry); + INIT_LIST_HEAD(&pool->sess_list); + + init_completion(&pool->discard_done); + atomic_set(&pool->discard_waiting, 0); + atomic_set(&pool->normal_count, 0); + + strscpy(pool->poolname, poolname, sizeof(pool->poolname)); + + return pool; + +cleanup_map_srcu: + cleanup_srcu_struct(&pool->map_srcu); +cleanup_sess_srcu: + cleanup_srcu_struct(&pool->sess_list_srcu); +free_pool: + kfree(pool); + return pool; +} + +struct rmr_pool *rmr_create_pool(const char *poolname, void *priv) +{ + u32 group_id; + struct rmr_pool *pool; + + mutex_lock(&pool_mutex); + + pool = rmr_find_pool(poolname); + if (unlikely(pool)) { + pr_err("Pool '%s' already exists\n", poolname); + pool = ERR_PTR(-EEXIST); + goto out; + } + + /* Calculate the poolname hash */ + group_id = rmr_pool_hash(poolname); + + /* Double ensure there is no hash-clash */ + pool = rmr_find_pool_by_group_id(group_id); + if (unlikely(pool)) { + pr_err("Pool '%s' already exists\n", poolname); + pool = ERR_PTR(-EEXIST); + goto out; + } + + pool = alloc_pool(poolname, group_id); + if (IS_ERR(pool)) { + pr_err("Pool allocation failed for pool %s\n", poolname); + goto out; + } + + list_add(&pool->entry, &pool_list); + pool->priv = priv; + pool->pool_md.magic = RMR_POOL_MD_MAGIC; + +out: + mutex_unlock(&pool_mutex); + return pool; +} + +/** + * rmr_pool_maps_to_buf - Copy dirty_bitmap buffer of pool to buf + * + * @pool: The pool whose map is to be copied + * @map_idx: The map index in the pool's map array + * @offset: The offset to read from in the maps dirty_bitmap buffer + * @buf: Pointer to buf where to copy the dirty_bitmap buffer + * @buflen: Length of the buf available to copy to + * @filter: TODO + * + * Description: + * This function is one half of the (map <-> buf) pair. It is used to save map into a buf. + * The other half is rmr_pool_save_map, which is used to save a buf into the map. 
+ * This function is used while both sending a map and reading a map. + * The process for both of them is largely same. + * + * The relevant params like member_id, offset for the dirty_bitmap buffer + * are stored in the rmr_map_buf_hdr, which is kept at the starting of buf. + * + * The caller has to take care of sending the correct map index and offset to copy from. + * For this, the function provides some help in the form of updating the map_idx and + * offset values (for map send), and storing it those in map_buf_hdr (for map read). + * + * Return value: + * 0 If there is no more data to send + * Total size copied to buf + */ +int rmr_pool_maps_to_buf(struct rmr_pool *pool, u8 *map_idx, u64 *slp_idx, + void *buf, size_t buflen, rmr_map_filter filter) +{ + struct rmr_map_buf_hdr *map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + struct rmr_dirty_id_map *map = NULL; + int lock_idx; + u64 no_of_slp; + + /* Adjust buf and buflen */ + buf += sizeof(struct rmr_map_buf_hdr); + buflen -= sizeof(struct rmr_map_buf_hdr); + + lock_idx = srcu_read_lock(&pool->map_srcu); + for ( ; ; *map_idx += 1) { + + if (*map_idx >= pool->maps_cnt) { + srcu_read_unlock(&pool->map_srcu, lock_idx); + return 0; + } + + map = rcu_dereference(pool->maps[*map_idx]); + if (map) + break; + } + + map_buf_hdr->version = RMR_MAP_FORMAT_VER; + + /* This is for the destination, to inform where to store */ + map_buf_hdr->member_id = map->member_id; + map_buf_hdr->dst_slp_idx = (*slp_idx); + + /* + * SLPs are pages. Duh! + */ + no_of_slp = buflen >> PAGE_SHIFT; + no_of_slp = min(no_of_slp, (map->total_slp - *slp_idx)); + rmr_map_slps_to_buf(map, *slp_idx, no_of_slp, buf); + map_buf_hdr->buf_size = no_of_slp * PAGE_SIZE; + + if ((*slp_idx + no_of_slp) >= map->total_slp) { + /* + * All done for this map. + * Now move on to the next one, and reset the index. + */ + *map_idx += 1; + *slp_idx = 0; + } else { + /* + * Copy the number of SLPs we can, and increment the index. + */ + *slp_idx += no_of_slp; + } + + pr_info("%s: buf_size %u, buflen w/o hdr %lu\n", + __func__, map_buf_hdr->buf_size, buflen); + + /* This is for MAP_READ, to inform where to ask from next */ + map_buf_hdr->map_idx = *map_idx; + map_buf_hdr->slp_idx = *slp_idx; + + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return (map_buf_hdr->buf_size + sizeof(struct rmr_map_buf_hdr)); +} + +/** + * rmr_pool_save_map - Copy given buf to dirty_bitmap buffer of pool + * + * @pool: The pool whose map is the dest for the copy + * @buf: Pointer to buf from where to copy + * @buflen: Length of the buf available to copy + * @test_only: Only test if the buf given matches with dirty_bitmap buf of pool + * @map_clean: TODO + * + * Description: + * This function is the other half of the (map <-> buf) pair. + * It saves buf into the map of pool. The relevant params are read from the + * rmr_map_buf_hdr which lies in the start of the given buf. + * + * Return value: + * 0 on success + * -errno on error + */ +int rmr_pool_save_map(struct rmr_pool *pool, void *buf, size_t buflen, + bool test_only) +{ + struct rmr_map_buf_hdr *map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + struct rmr_dirty_id_map *map = NULL; + int err = 0, lock_idx; + u32 buf_size; + u64 slp_idx; + + if (map_buf_hdr->version != RMR_MAP_FORMAT_VER) { + pr_err("Wrong map format. 
Expected %d but received %llu\n", + RMR_MAP_FORMAT_VER, map_buf_hdr->version); + return -EINVAL; + } + + /* Adjust buf and buflen */ + buf += sizeof(struct rmr_map_buf_hdr); + buflen -= sizeof(struct rmr_map_buf_hdr); + + lock_idx = srcu_read_lock(&pool->map_srcu); + map = rmr_pool_find_map(pool, map_buf_hdr->member_id); + if (!map) { + pr_err("%s: No map found for member_id %llu\n", + __func__, map_buf_hdr->member_id); + err = -ENOENT; + goto out; + } + + slp_idx = map_buf_hdr->dst_slp_idx; + buf_size = map_buf_hdr->buf_size; + + pr_info("%s: For pool %s, received map for %llu, slp_idx %llu, buf_size %u, buflen %lu\n", + __func__, pool->poolname, map_buf_hdr->member_id, slp_idx, buf_size, buflen); + + /* Sanity */ + WARN_ON(buf_size > buflen); + WARN_ON(buf_size % PAGE_SIZE); + + pr_info("%s: buf_size %u, buflen w/o hdr %lu\n", __func__, map_buf_hdr->buf_size, buflen); + + /* + * The buf_size would be a factor of PAGE_SIZE, + * and thats how we know no_of_slp(s) to save. + */ + if (!rmr_map_buf_to_slps(map, buf, buf_size, slp_idx, test_only)) { + pr_err("%s: rmr_map_buf_to_slps failed\n", __func__); + goto out; + } + +out: + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return err; +} diff --git a/drivers/infiniband/ulp/rmr/rmr-req.c b/drivers/infiniband/ulp/rmr/rmr-req.c new file mode 100644 index 000000000000..d748579c489c --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-req.c @@ -0,0 +1,796 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#include + +#include "rmr-req.h" +#include "rmr-srv.h" +#include "rmr-clt.h" + +extern struct kmem_cache *rmr_req_cachep; +extern struct kmem_cache *rmr_map_entry_cachep; +extern struct rmr_store_ops *pstore_ops; + +static void rmr_req_complete(struct rmr_srv_req *req); +static void rmr_req_store_done(struct rmr_srv_req *req); +static void rmr_req_sync_failed(struct rmr_srv_req *req); +static void rmr_req_send_map_clear(struct rmr_srv_req *req); +static void rmr_req_sync_complete(struct rmr_srv_req *req); +static void rmr_req_store(struct rmr_srv_req *req); + +/** + * rmr_srv_req_resp - Response from the lower level module + * + * @req: Request to be processed + * @err: Error value + * + * Description: + * This function is the return point from the below module + * where IO is submitted. + * + * Context: + * In this function the request should always be in state RMR_REQ_STATE_STORE + */ +void rmr_srv_req_resp(struct rmr_srv_req *req, int err) +{ + /* + * Use the error sent from lower layer + */ + req->err = err; + + /* + * For Normal (non-sync) requests we handle both non-error and error cases from one + * place. Since its simple. + */ + if (rmr_op(req->flags) != RMR_OP_SYNCREQ) { + rmr_req_complete(req); + return; + } + + /* + * Sync requests are complicated, since it needs extra post-processing + * once IO is done for us. + * + * 1) In case of no failure, we need to send map clear to other nodes, + * since they think we are still dirty for this chunk. + * + * 2) We need to check for waiting IO in entry->wait_list, and kick them. 
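+	 *
+	 * Step 1) is handled by rmr_req_send_map_clear(); step 2) is done by
+	 * rmr_process_wait_list() once the parent sync request completes
+	 * (see rmr_req_store_done(), rmr_req_sync_complete() and
+	 * __release_parent_req() below).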
+ */ + if (!req->err) + rmr_req_store_done(req); + else + rmr_req_sync_failed(req); +} +EXPORT_SYMBOL(rmr_srv_req_resp); + +/** + * rmr_srv_req_create - Create an rmr server request + * + * @msg: IO message containing information + * @srv_pool: Server pool creating this request + * @rtrs_op: rtrs IO context + * @data: pointer to data buf + * @datalen: len of data buf + * @endreq: Function to be called at the end of rmr request processing + * + * Description: + * RMR server request are base structures which holds the IO while they are being processed. + * They go through a state machine, while a number of checks are done. IOs which are + * destined for a chunk that is dirty, are paused while that chunk is synced. + * + * Return: + * Pointer to the create rmr server request on success + * Error pointer on failure + */ +struct rmr_srv_req *rmr_srv_req_create(const struct rmr_msg_io *msg, struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, void *data, u32 datalen, + void (*endreq)(struct rmr_srv_req *, int)) +{ + struct rmr_srv_req *req; + struct rmr_srv_io_store *store = srv_pool->io_store; + int i; + + if (!store || !atomic_read(&srv_pool->store_state)) { + pr_err("%s: store not set, or srv_pool not in correct state %s\n", + __func__, srv_pool->pool->poolname); + return ERR_PTR(-ENODEV); + } + + req = kmem_cache_zalloc(rmr_req_cachep, GFP_KERNEL); + if (!req) { + pr_err("cannot allocate memory for rmr_req.\n"); + return ERR_PTR(-ENOMEM); + } + + req->id.a = le64_to_cpu(msg->id_a); + req->id.b = le64_to_cpu(msg->id_b); + + req->offset = le32_to_cpu(msg->offset); + req->length = le32_to_cpu(msg->length); + req->flags = le32_to_cpu(msg->flags); + req->prio = le16_to_cpu(msg->prio); + + req->mem_id = le32_to_cpu(msg->mem_id); + for (i = 0; i < msg->failed_cnt; i++) + req->failed_srv_id[i] = msg->failed_id[i]; + + req->failed_cnt = msg->failed_cnt; + req->map_ver = le64_to_cpu(msg->map_ver); + req->sync = msg->sync; + + req->data = data; + req->datalen = datalen; + req->rtrs_op = rtrs_op; + req->srv_pool = srv_pool; + req->store = store; + req->endreq = endreq; + + pr_debug("req %p, chunk_size %u\n", req, req->srv_pool->pool->chunk_size); + + return req; +} + +struct rmr_srv_req *rmr_srv_md_req_create(struct rmr_srv_pool *srv_pool, + struct rtrs_srv_op *rtrs_op, void *data, + u32 offset, u32 len, unsigned long flags, + void (*endreq)(struct rmr_srv_req *, int)) +{ + struct rmr_srv_req *req; + struct rmr_srv_io_store *store = srv_pool->io_store; + + if (!store) { + pr_err("No store_id registered for srv pool %s\n", srv_pool->pool->poolname); + return ERR_PTR(-ENODEV); + } + + req = kmem_cache_zalloc(rmr_req_cachep, GFP_KERNEL); + if (!req) { + pr_err("cannot allocate memory for rmr_req.\n"); + return ERR_PTR(-ENOMEM); + } + req->offset = offset; + req->length = len; + req->flags = flags; + req->sync = false; /* A md req is always non-sync */ + + req->data = data; + req->rtrs_op = rtrs_op; + req->srv_pool = srv_pool; + req->store = store; + req->endreq = endreq; + + pr_debug("md req %p, len %u\n", req, len); + + return req; +} + +void rmr_req_submit(struct rmr_srv_req *req); +static void rmr_req_sched(struct work_struct *work) +{ + struct rmr_srv_req *req = container_of(work, struct rmr_srv_req, work); + + pr_debug("scheduled work process for req %p\n", req); + if (req->err) + rmr_req_complete(req); + else + rmr_req_submit(req); +} + +void rmr_process_wait_list(struct rmr_map_entry *entry, int err) +{ + struct llist_node *first, *next; + struct rmr_srv_req *req; + + pr_debug("processing 
wait list for entry %p, sync_cnt=%d\n", + entry, atomic_read(&entry->sync_cnt)); + + WARN_ON(atomic_read(&entry->sync_cnt) > 0); + + while (!llist_empty(&entry->wait_list)) { + first = llist_del_all(&entry->wait_list); + while (first) { + next = first->next; + req = llist_entry(first, struct rmr_srv_req, node); + + pr_debug("process waiting req %p id (%llu, %llu) flags %u\n", + req, req->id.a, req->id.b, req->flags); + if (err) { + pr_err("fail waiting req %p id (%llu, %llu) flags %u err %d\n", + req, req->id.a, req->id.b, req->flags, err); + req->err = -EIO; + } + + pr_debug("schedule processing req %p with err %d\n", req, req->err); + INIT_WORK(&req->work, rmr_req_sched); + schedule_work(&req->work); + + first = next; + } + } +} + +void rmr_req_submit(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_map_entry *entry; + struct rmr_dirty_id_map *map; + + if (rmr_op(req->flags) == RMR_OP_FLUSH && !req->length) { + rmr_req_store(req); + return; + } + + pr_debug("check map for req %p flag %u request id [%llu, %llu] offset %u length %u\n", + req, req->flags, + req->id.a, req->id.b, req->offset, req->length); + + map = rmr_pool_find_map(srv_pool->pool, srv_pool->member_id); + if (!map) { + pr_err("no map found for pool_id %u\n", srv_pool->member_id); + req->err = -EINVAL; + goto err; + } + + rcu_read_lock(); + entry = rmr_map_get_dirty_entry(map, req->id); + if (!entry) { + /* + * The chunk containing data for this req is NOT dirty for us + */ + pr_debug("check map for req %p flags %u request id [%llu, %llu], no entry in the map\n", + req, req->flags, req->id.a, req->id.b); + rcu_read_unlock(); + rmr_req_store(req); + return; + } else { + /* + * The chunk for this data is dirty for us. + * + * we have 2 cases. + * + * 1) Its coming from a sync rmr-clt (Its an internal read). + * Then, fail the IO, since we do not want to end up in a deadlock, + * or go through multiple hops for a single read. The sender can try some other + * node itself. + */ + if (req->sync) { + WARN_ON(rmr_op(req->flags) != RMR_OP_READ); + rcu_read_unlock(); + req->err = -EIO; + goto err; + } + + /* + * 2) If its coming from a non-sync rmr-clt, + * simply go ahead with syncing the data first. + */ + llist_add(&req->node, &entry->wait_list); + pr_debug("%s: req %p flags %u id (%llu %llu) added to wait list. sync_cnt %d\n", + __func__, req, req->flags, req->id.a, req->id.b, + atomic_read(&entry->sync_cnt)); + + rcu_read_unlock(); + /* + * If we are the first who grabs the entry then start sync. + * + * Otherwise, the one syncing the data would pick us up from the entry->wait_list + * and kick us. So simply exit for now. 
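+		 *
+		 * entry->sync_cnt is -1 while no sync is in flight; the request
+		 * that wins the cmpxchg below (-1 -> 0) starts the sync, and
+		 * rmr_srv_sync_chunk_id() then takes one reference per sync
+		 * sub-request via map_entry_get_sync().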
+ */ + if (atomic_cmpxchg(&entry->sync_cnt, -1, 0) == -1) { + int err; + + req->priv = entry; + err = rmr_srv_sync_chunk_id(srv_pool, entry, req->id, false); + if (err) { + atomic_set(&entry->sync_cnt, -1); + rmr_process_wait_list(entry, err); + } + } + } + + return; + +err: + rmr_req_complete(req); +} + +static void rmr_req_store(struct rmr_srv_req *req) +{ + int err; + + pr_debug("submit to store req %p flags %u request id [%llu, %llu] offset %u length %u\n", + req, req->flags, + req->id.a, req->id.b, req->offset, req->length); + + err = req->store->ops->submit_req(req->store->priv, req->data, req->offset, + req->length, req->flags, req->prio, req); + if (err) { + pr_err("%s: error submitting req %p, err %d\n", __func__, req, err); + req->err = err; + if (rmr_op(req->flags) == RMR_OP_SYNCREQ) + rmr_req_sync_failed(req); + else + rmr_req_complete(req); + } +} + +static void rmr_md_req_store(struct rmr_srv_req *req) +{ + int err; + + err = req->store->ops->submit_md_req(req->store->priv, req->data, req->offset, req->length, + req->flags, req); + if (err) { + req->endreq(req, err); + pr_err("release md req %p, flags %u\n", req, req->flags); + kmem_cache_free(rmr_req_cachep, req); + } +} + +/* md req submission path*/ +void rmr_md_req_submit(struct rmr_srv_req *req) +{ + rmr_md_req_store(req); +} + +static void rmr_req_sched_store(struct work_struct *work) +{ + struct rmr_srv_req *req = container_of(work, struct rmr_srv_req, work); + + pr_debug("scheduled store for req %p\n", req); + rmr_req_store(req); +} + +static void rmr_req_remote_io_done(void *priv, int err) +{ + struct rmr_srv_req *req = priv; + + pr_debug("called for req %p, err code %d\n", req, err); + + rmr_clt_put_iu(req->srv_pool->clt, req->iu); + + if (err) { + req->err = err; + rmr_req_sync_failed(req); + return; + } + + pr_debug("schedule store for req %p with err %d\n", req, req->err); + INIT_WORK(&req->work, rmr_req_sched_store); + schedule_work(&req->work); +} + +static void rmr_req_remote_read(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_pool *clt = srv_pool->clt; + unsigned long flags; + int err; + + pr_debug("redirecting req id (%llu, %llu)\n", + req->id.a, req->id.b); + if (!clt) { + pr_err("No srv pool assigned for redirect for %s\n", srv_pool->pool->poolname); + err = -EINVAL; + goto err; + } + + if (rmr_op(req->flags) == RMR_OP_SYNCREQ) + flags = RMR_OP_READ; + else + flags = req->flags; + + req->iu = rmr_clt_get_iu(clt, flags, WAIT); + if (IS_ERR_OR_NULL(req->iu)) { + pr_err("Failed to get rmr_iu for req id (%llu, %llu)\n", + req->id.a, req->id.b); + err = -EINVAL; + goto err; + } + + sg_init_one(&req->sg, req->data, req->datalen); + + pr_debug("After sg_init_one nents=%d\n", sg_nents(&req->sg)); + + /* look at the flags here! 
*/ + err = rmr_clt_request(clt, req->iu, req->offset, req->length, flags, + req->prio, req, rmr_req_remote_io_done, + &req->sg, sg_nents(&req->sg)); + if (err) { + pr_err("rmr_clt_request error %d\n", err); + rmr_clt_put_iu(clt, req->iu); + err = -EREMOTEIO; + goto err; + } + + pr_debug("remote read submitted\n"); + return; + +err: + req->err = err; + rmr_req_sync_failed(req); +} + +static void rmr_sync_req_sched(struct work_struct *work) +{ + struct rmr_srv_req *req = container_of(work, struct rmr_srv_req, work); + + pr_debug("scheduled work process for req %p\n", req); + if (req->err) + rmr_req_sync_complete(req); + else + rmr_req_send_map_clear(req); +} + +static void rmr_req_complete(struct rmr_srv_req *req) +{ + pr_debug("send completeion for req %p flags %u request id (%llu, %llu) offset %u length %u err %d\n", + req, req->flags, + req->id.a, req->id.b, req->offset, req->length, req->err); + + /* endreq() records the Last IO buffer accordingly. */ + req->endreq(req, req->err); + + pr_debug("release req %p, flags %u\n", req, req->flags); + + kmem_cache_free(rmr_req_cachep, req); +} + +static struct rmr_srv_req *rmr_req_create_sync_req(struct rmr_srv_pool *srv_pool, rmr_id_t id, + u32 offset, u32 len, bool from_sync, + struct rmr_srv_req *parent) +{ + struct rmr_srv_req *req; + struct rmr_srv_io_store *store = srv_pool->io_store; + + if (!store) { + pr_err("No store_id registered for srv pool %s\n", srv_pool->pool->poolname); + return ERR_PTR(-ENODEV); + } + + req = kmem_cache_zalloc(rmr_req_cachep, GFP_KERNEL); + if (!req) { + pr_err("cannot allocate memory for rmr_req.\n"); + return ERR_PTR(-ENOMEM); + } + req->id.a = id.a; + req->id.b = id.b; + req->flags = RMR_OP_SYNCREQ; + req->length = len; + req->offset = offset; + req->srv_pool = srv_pool; + req->store = store; + req->from_sync = from_sync; + + if (parent) { + req->data = parent->data + offset; + } else { + req->data = kmalloc(req->length, GFP_KERNEL); + if (!req->data) { + pr_err("cannot allocate memory for sync req id [%llu, %llu]\n", + req->id.a, req->id.b); + kmem_cache_free(rmr_req_cachep, req); + return ERR_PTR(-ENOMEM); + } + } + req->datalen = len; + req->parent = parent; + + pr_debug("sync req %p created, flags %u request id (%llu, %llu) offset %u length %u parent %p\n", + req, req->flags, req->id.a, req->id.b, req->offset, req->length, parent); + + return req; +} + +//should be called only if corresponding map entry has 0 sync cnt +int rmr_srv_sync_chunk_id(struct rmr_srv_pool *srv_pool, struct rmr_map_entry *entry, + rmr_id_t id, bool from_sync) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_dirty_id_map *map; + struct rmr_srv_req *parent_req; + u32 max_io_size, total_len, offset; + + if (!srv_pool->clt) { + pr_err("For pool %s no sync pool assigned.\n", pool->poolname); + return -EINVAL; + } + max_io_size = srv_pool->max_sync_io_size; + + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("no map found for pool_id %u\n", srv_pool->member_id); + //TODO: handle this , probably initialize map, or just throw err? + return -EINVAL; + } + + offset = CHUNK_TO_OFFSET(id.b, pool->chunk_size_shift); + total_len = pool->chunk_size; + + pr_debug("pool %s sync id (%llu, %llu), total_len %u, max_io_size %u\n", + pool->poolname, id.a, id.b, total_len, max_io_size); + + /* + * The parent_req starts with total_len, then get decremented in loop below. + * The child reqs are filled one by one from end to second. + * + * Maybe refactor this to a simple loop? 
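+	 *
+	 * For example, with a 1 MiB chunk and a max_sync_io_size of 256 KiB,
+	 * three child requests read the last three 256 KiB pieces of the
+	 * chunk, and the parent request ends up reading the first 256 KiB.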
+ */ + parent_req = rmr_req_create_sync_req(srv_pool, id, offset, total_len, from_sync, NULL); + if (IS_ERR_OR_NULL(parent_req)) { + pr_err("pool %s failed to create main sync req to sync id (%llu, %llu)\n", + pool->poolname, id.a, id.b); + return -ENOMEM; + } + parent_req->priv = entry; + + if (from_sync) { + if (rmr_srv_get_sync_permit(srv_pool)) { + pr_err("rmr_srv_sync_chunk_id failed to acquire permit for parent\n"); + kfree(parent_req->data); + kmem_cache_free(rmr_req_cachep, parent_req); + + return -EINVAL; + } + } + + // inc ref cnt for parent_req + map_entry_get_sync(entry); + while (parent_req->length > max_io_size) { + struct rmr_srv_req *req; + u32 child_offset = offset + (parent_req->length - max_io_size); + + // submit req + req = rmr_req_create_sync_req(srv_pool, id, (parent_req->length - max_io_size), + max_io_size, from_sync, parent_req); + if (IS_ERR_OR_NULL(req)) { + pr_err("%s: Pool %s, id (%llu, %llu), offset %u, len %u, err %ld\n", + __func__, pool->poolname, id.a, id.b, + (parent_req->length - max_io_size), max_io_size, PTR_ERR(req)); + parent_req->err = PTR_ERR(req); + + rmr_req_sync_failed(parent_req); + return -EINVAL; + } + + /* + * The offset sent to rmr_req_create_sync_req for this req is in context of the + * chunk. But the real offset for this req in the disk is this. + */ + req->offset = child_offset; + + if (from_sync) { + if (rmr_srv_get_sync_permit(srv_pool)) { + pr_err("rmr_srv_sync_chunk_id failed to acquire permit for child\n"); + kmem_cache_free(rmr_req_cachep, req); + + parent_req->err = -EBUSY; + rmr_req_sync_failed(parent_req); + return -EINVAL; + } + } + + // inc ref cnt for the child req just created + map_entry_get_sync(entry); + req->priv = entry; + rmr_req_remote_read(req); + + parent_req->length -= max_io_size; + parent_req->datalen -= max_io_size; + } + + //submit parent req + rmr_req_remote_read(parent_req); + + return 0; +} + +static void __release_parent_req(struct rcu_head *head) +{ + struct rmr_srv_req *req = container_of(head, struct rmr_srv_req, rcu); + struct rmr_map_entry *entry = req->priv; + + pr_debug("is called for req=%p id=(%llu,%llu) err=%d, entry=%p\n", + req, req->id.a, req->id.b, req->err, entry); + + kfree(req->data); + + //may be now we can stop saving entry in req->priv, but always rmr_map_find it + if (!req->err) { + pr_debug("req %p, completed all sync req, lets clean map\n", req); + rmr_process_wait_list(entry, 0); + } else { + pr_debug("req %p completed with err %d, process wait list\n", + req, req->err); + + /* sync of this entry failed, we reset the sync_cnt so that the other req + * or sync thread could try again in the future. Without resetting, no one + * could get the ref and start sync again. + */ + atomic_set(&entry->sync_cnt, -1); + rmr_process_wait_list(entry, req->err); + } + + pr_debug("free entry %p for req %p\n", entry, req); + kmem_cache_free(rmr_map_entry_cachep, entry); + + if (req->from_sync) + rmr_srv_put_sync_permit(req->srv_pool); + + kmem_cache_free(rmr_req_cachep, req); +} + +static void rmr_req_sync_complete(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_dirty_id_map *map; + int lock_idx; + + pr_debug("sync_req %p completed for id (%llu, %llu), offset %u, len %u, err %d, from sync %d\n", + req, req->id.a, req->id.b, req->offset, req->length, + req->err, req->from_sync); + + if (req->err) + rmr_srv_sync_req_failed(req->srv_pool); + + pr_debug("release sync req %p, flags %u\n", req, req->flags); + + /* + * Only parent sync req own the allocated data. 
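+	 *
+	 * Child reqs only point into the parent's buffer (see
+	 * rmr_req_create_sync_req()), so the buffer is freed in
+	 * __release_parent_req() when the parent is released.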
+ */ + if (!req->parent) { + if (!req->err) { + map = rmr_pool_find_map(srv_pool->pool, + srv_pool->member_id); + if (map) { + lock_idx = srcu_read_lock(&srv_pool->pool->map_srcu); + rmr_map_unset_dirty(map, req->id, + MAP_NO_FILTER); + srcu_read_unlock(&srv_pool->pool->map_srcu, lock_idx); + } else { + pr_err("no map found for pool_id %u\n", srv_pool->member_id); + req->err = -EINVAL; + } + } + + pr_debug("req %p, completed all sync req, lets clean map\n", + req); + call_rcu(&req->rcu, __release_parent_req); + } else { + /* + * Child req has nothing to do but put permit and free + */ + if (req->from_sync) + rmr_srv_put_sync_permit(req->srv_pool); + + kmem_cache_free(rmr_req_cachep, req); + } +} + +static void rmr_req_sync_failed(struct rmr_srv_req *req) +{ + rmr_srv_sync_req_failed(req->srv_pool); + + pr_err("pool %s sync req %p failed for id (%llu, %llu), offset %u, len %u, err %d\n", + req->srv_pool->pool->poolname, req, req->id.a, req->id.b, + req->offset, req->length, req->err); + + rmr_req_store_done(req); +} + +// this is actually very like rmr_req_remote_io_done but without rmr_clt_put_iu +// do we want to have one function for both cases? +static void rmr_req_map_clear_done(void *priv, int err) +{ + struct rmr_srv_req *req = priv; + + rmr_clt_put_iu(req->srv_pool->clt, req->iu); + + pr_debug("called for req %p, err code %d\n", req, err); + if (err) + pr_err("pool %s, sync req with id (%llu, %llu) failed to send map clear\n", + req->srv_pool->pool->poolname, req->id.a, req->id.b); + + rmr_req_sync_complete(req); +} + +static void rmr_req_store_done(struct rmr_srv_req *req) +{ + struct rmr_map_entry *entry = req->priv; + struct rmr_srv_req *parent_req = NULL; + + pr_debug("called for req %p id (%llu, %llu ) offset %u len %u with parent req %p\n", + req, req->id.a, req->id.b, req->offset, req->length, req->parent); + + if (req->parent) + parent_req = req->parent; + else + parent_req = req; + + if (req->err) + parent_req->err = req->err; + + if (map_entry_put_sync(entry)) { + pr_debug("%s: for entry %p id (%llu, %llu) all sync req done.\n", __func__, + entry, req->id.a, req->id.b); + + /* We have to schedule the work of parent req from here since we are in the + * interrupt context of either parent req or child req + */ + pr_debug("%s: process parent_req %p\n", __func__, parent_req); + INIT_WORK(&parent_req->work, rmr_sync_req_sched); + schedule_work(&parent_req->work); + } + + if (req != parent_req) { + pr_debug("completing req %p with err %d\n", req, req->err); + rmr_req_sync_complete(req); + } +} + +static void rmr_req_send_map_clear(struct rmr_srv_req *req) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_pool *pool = srv_pool->clt; + struct rmr_iu *iu; + int err; + + if (!pool) { + pr_err("Cannot send map clear. No pool client assigend for srv pool %s\n", + req->srv_pool->pool->poolname); + req->err = -EINVAL; + goto err; + } + + /* + * We try to clear map, but if we fail to, we simply ignore the error. + * Such zombie entries will be clear by rmr_srv_check_map_clear. + */ + iu = rmr_clt_get_iu(pool, RMR_OP_WRITE, WAIT); + if (IS_ERR_OR_NULL(iu)) { + pr_err("Failed to get rmr_iu for req id (%llu, %llu)\n", + req->id.a, req->id.b); + goto err; + } + + pr_debug("send map clear req id (%llu, %llu), member_id %u\n", + req->id.a, req->id.b, srv_pool->member_id); + + /* + * For MAP_CLEAR, we only need rmr_id_t for chunk number, + * and our member_id to say to clear the above chunk number for ths storage node. 
+ * + * We also update the minimum members needed for map update. + */ + iu->msg.hdr.group_id = cpu_to_le32(pool->group_id); + iu->msg.hdr.type = cpu_to_le16(RMR_MSG_MAP_CLEAR); + iu->msg.hdr.__padding = 0; + + iu->msg.id_a = cpu_to_le64(req->id.a); + iu->msg.id_b = cpu_to_le64(req->id.b); + iu->msg.member_id = srv_pool->member_id; + + iu->msg.flags = cpu_to_le32(RMR_OP_WRITE); + + iu->conf = rmr_req_map_clear_done; + iu->priv = req; + + req->iu = iu; + + err = rmr_clt_send_map_update(pool, req->iu); + if (err) { + pr_err("%s error %d\n", __func__, err); + rmr_clt_put_iu(pool, req->iu); + goto err; + } + + pr_debug("send map clear submitted\n"); + return; + +err: + rmr_req_sync_complete(req); +} From d35f05d7d79f339e20b97b52612f23da09afab33 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:15 +0200 Subject: [PATCH 03/13] RDMA/rmr: client: main functionality Add the RMR client implementation: rmr-clt.c client core: session and pool-session state machine, RTRS transport setup, IO submission and completion paths, command messaging. rmr-map-mgmt.c client-side dirty-map management: spreading updates to pool members, handling map check responses and resync coordination. rmr-clt-stats.c client per-pool statistics counters. rmr-clt-trace.c tracepoint definitions for client state rmr-clt-trace.h transitions and IO submission events. The trace points are referenced from rmr-clt.c and rmr-map-mgmt.c, so they are added together with the client core. These files are not compiled until the modules are wired into the build in a later patch in this series. Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/infiniband/ulp/rmr/rmr-clt-stats.c | 29 + drivers/infiniband/ulp/rmr/rmr-clt-trace.c | 11 + drivers/infiniband/ulp/rmr/rmr-clt-trace.h | 110 + drivers/infiniband/ulp/rmr/rmr-clt.c | 3866 ++++++++++++++++++++ drivers/infiniband/ulp/rmr/rmr-map-mgmt.c | 933 +++++ 5 files changed, 4949 insertions(+) create mode 100644 drivers/infiniband/ulp/rmr/rmr-clt-stats.c create mode 100644 drivers/infiniband/ulp/rmr/rmr-clt-trace.c create mode 100644 drivers/infiniband/ulp/rmr/rmr-clt-trace.h create mode 100644 drivers/infiniband/ulp/rmr/rmr-clt.c create mode 100644 drivers/infiniband/ulp/rmr/rmr-map-mgmt.c diff --git a/drivers/infiniband/ulp/rmr/rmr-clt-stats.c b/drivers/infiniband/ulp/rmr/rmr-clt-stats.c new file mode 100644 index 000000000000..83a4089defc0 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-stats.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include "rmr-clt.h" + +int rmr_clt_reset_read_retries(struct rmr_clt_stats *stats, bool enable) +{ + if (unlikely(!enable)) + return -EINVAL; + + atomic_set(&stats->read_retries, 0); + + return 0; +} + +ssize_t rmr_clt_stats_read_retries_to_str( + struct rmr_clt_stats *stats, char *page) +{ + return sysfs_emit(page, "%u\n", + atomic_read(&stats->read_retries)); +} + diff --git a/drivers/infiniband/ulp/rmr/rmr-clt-trace.c b/drivers/infiniband/ulp/rmr/rmr-clt-trace.c new file mode 100644 index 000000000000..2e6d9adee7c8 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-trace.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ +#include "rmr-clt.h" + +#define CREATE_TRACE_POINTS +#include "rmr-clt-trace.h" + diff --git 
a/drivers/infiniband/ulp/rmr/rmr-clt-trace.h b/drivers/infiniband/ulp/rmr/rmr-clt-trace.h new file mode 100644 index 000000000000..1d9a511dc763 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-trace.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rmr_clt + +#if !defined(_TRACE_RMR_CLT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RMR_CLT_H + +#include + +struct rmr_clt_pool_sess; + +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_CREATED); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_NORMAL); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_FAILED); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_RECONNECTING); +TRACE_DEFINE_ENUM(RMR_CLT_POOL_SESS_REMOVING); + +#define show_pool_sess_state(x) \ + __print_symbolic(x, \ + { RMR_CLT_POOL_SESS_CREATED, "CREATED" }, \ + { RMR_CLT_POOL_SESS_NORMAL, "NORMAL" }, \ + { RMR_CLT_POOL_SESS_FAILED, "FAILED" }, \ + { RMR_CLT_POOL_SESS_RECONNECTING, "RECONNECTING" }, \ + { RMR_CLT_POOL_SESS_REMOVING, "REMOVING" }) + +TRACE_EVENT(pool_sess_change_state, + TP_PROTO(struct rmr_clt_pool_sess *pool_sess, + int newstate, + int oldstate, + int changed), + + TP_ARGS(pool_sess, newstate, oldstate, changed), + + TP_STRUCT__entry( + __string(sessname, pool_sess->sessname) + __field(int, newstate) + __field(int, oldstate) + __field(int, changed) + ), + + TP_fast_assign( + __assign_str(sessname); + __entry->newstate = newstate; + __entry->oldstate = oldstate; + __entry->changed = changed; + ), + + TP_printk("RMR-CLT: sessname=%s newstate='%s' oldstate='%s' state-changed='%d'", + __get_str(sessname), + show_pool_sess_state(__entry->newstate), + show_pool_sess_state(__entry->oldstate), + __entry->changed + ) +); + +DECLARE_EVENT_CLASS(rtrs_clt_request_class, + TP_PROTO(int dir, struct rmr_clt_sess_iu *sess_iu), + + TP_ARGS(dir, sess_iu), + + TP_STRUCT__entry( + __field(int, dir) + __array(char, sessname, NAME_MAX) + __field(void *, rtrs) + __field(void *, clt_sess) + ), + + TP_fast_assign( + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + __entry->dir = dir; + memcpy(__entry->sessname, pool_sess->sessname, NAME_MAX); + __entry->rtrs = clt_sess->rtrs; + __entry->clt_sess = clt_sess; + ), + + TP_printk("rtrs clt request: sessname=%s dir=%s rtrs=%p clt_sess=%p", + __entry->sessname, + __print_symbolic(__entry->dir, + { READ, "READ" }, + { WRITE, "WRITE" }), + __entry->rtrs, + __entry->clt_sess + ) +); + +#define DEFINE_RTRS_CLT_EVENT(name) \ +DEFINE_EVENT(rtrs_clt_request_class, name, \ + TP_PROTO(int dir, struct rmr_clt_sess_iu *sess_iu), \ + TP_ARGS(dir, sess_iu)) + +DEFINE_RTRS_CLT_EVENT(send_usr_msg); +DEFINE_RTRS_CLT_EVENT(retry_failed_read); +DEFINE_RTRS_CLT_EVENT(rmr_clt_request); +DEFINE_RTRS_CLT_EVENT(rmr_clt_cmd_with_rsp); +DEFINE_RTRS_CLT_EVENT(send_map_update); + +#endif /* _TRACE_RMR_CLT_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE rmr-clt-trace +#include + diff --git a/drivers/infiniband/ulp/rmr/rmr-clt.c b/drivers/infiniband/ulp/rmr/rmr-clt.c new file mode 100644 index 000000000000..33e4b6d84b0b --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt.c @@ -0,0 +1,3866 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include + +#include "rmr-clt.h" +#include "rmr-clt-trace.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_DESCRIPTION("RMR Client"); +MODULE_VERSION(RMR_VER_STRING); +MODULE_LICENSE("GPL"); + +#define RMR_CLT_SEND_MSG_TIMEOUT_MS 30000 + +//static int send_msg_leave_pool(struct rmr_clt_pool_sess *pool_sess, bool wait); +static void retry_failed_read(struct work_struct *work); +static DEFINE_MUTEX(g_sess_lock); +static LIST_HEAD(g_sess_list); + +static bool rmr_get_clt_pool(struct rmr_clt_pool *clt_pool) +{ + pr_debug("pool %s, before inc refcount %d\n", + clt_pool->pool->poolname, refcount_read(&clt_pool->refcount)); + return refcount_inc_not_zero(&clt_pool->refcount); +} + +static struct rmr_clt_pool *rmr_find_and_get_clt_pool(const char *poolname) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + + mutex_lock(&pool_mutex); + pool = rmr_find_pool(poolname); + if (!pool) { + clt_pool = ERR_PTR(-ENOENT); + goto out; + } + + clt_pool = (struct rmr_clt_pool *)pool->priv; + if (!rmr_get_clt_pool(clt_pool)) + clt_pool = ERR_PTR(-EINVAL); + +out: + mutex_unlock(&pool_mutex); + return clt_pool; +} + +void rmr_put_clt_pool(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + might_sleep(); + + pr_debug("clt pool %s, before dec refcnt %d\n", + (pool ? 
pool->poolname : "(empty)"), refcount_read(&clt_pool->refcount)); + if (refcount_dec_and_test(&clt_pool->refcount)) { + + destroy_workqueue(clt_pool->recover_wq); + mutex_destroy(&clt_pool->io_freeze_lock); + mutex_destroy(&clt_pool->clt_pool_lock); + + if (pool) { + pr_info("clt: destroy pool %s\n", pool->poolname); + free_pool(pool); + } + + kfree(clt_pool); + } +} + +static inline int rmr_clt_sess_get(struct rmr_clt_sess *sess) +{ + return kref_get_unless_zero(&sess->kref); +} + +static void rmr_clt_sess_release(struct kref *kref) +{ + struct rmr_clt_sess *clt_sess; + + clt_sess = container_of(kref, struct rmr_clt_sess, kref); + + mutex_lock(&g_sess_lock); + + rmr_clt_destroy_clt_sess_sysfs_files(clt_sess); + + pr_info("close rtrs for session %s\n", clt_sess->sessname); + rtrs_clt_close(clt_sess->rtrs); + list_del(&clt_sess->g_list); + kfree(clt_sess); + + mutex_unlock(&g_sess_lock); +} + +void rmr_clt_sess_put(struct rmr_clt_sess *sess) +{ + kref_put(&sess->kref, rmr_clt_sess_release); +} + +static const char *rmr_get_clt_pool_state_name(enum rmr_clt_pool_state state) +{ + switch (state) { + case RMR_CLT_POOL_STATE_JOINED: return "RMR_CLT_POOL_STATE_JOINED"; + case RMR_CLT_POOL_STATE_IN_USE: return "RMR_CLT_POOL_STATE_IN_USE"; + + default: return "Unknown state"; + } +} + +static void rmr_clt_dump_state(struct rmr_clt_pool *rmr_clt_pool) +{ + char current_state[1024] = {0}; + int i, n = 0, len = sizeof(current_state); + + for (i = 0; i < RMR_CLT_POOL_STATE_MAX; i++) { + enum rmr_clt_pool_state state = (enum rmr_clt_pool_state)i; + + if (test_bit(state, &rmr_clt_pool->state)) + n += scnprintf(current_state + n, len - n, "%s, ", + rmr_get_clt_pool_state_name(state)); + } + + pr_info("%s: RMR client pool current state: %s\n", __func__, current_state); +} + +/** + * rmr_clt_change_pool_state() - Change clt pool state + * + * @clt_pool: Client pool whose state is to be changed + * @new_state: New state to set + * @set: Informs whether to set/unset the given new+state + */ +void rmr_clt_change_pool_state(struct rmr_clt_pool *rmr_clt_pool, + enum rmr_clt_pool_state new_state, bool set) +{ + if (set) { + set_bit(new_state, &rmr_clt_pool->state); + pr_info("%s: state %s set\n", + __func__, rmr_get_clt_pool_state_name(new_state)); + } else { + clear_bit(new_state, &rmr_clt_pool->state); + pr_info("%s: state %s cleared\n", + __func__, rmr_get_clt_pool_state_name(new_state)); + } + + rmr_clt_dump_state(rmr_clt_pool); +} + +/** + * send_map_get_version() - Send a map get version command + * + * @pool_sess: pool session where to send the message + * + * Description: + * Ask the storage node to send back its map_version. 
+ * + * Return: + * 0 on success + * Negative error in case of failure + */ + +/** + * rmr_clt_md_update() - Update the client (non-sync) pool metadata + */ +static void rmr_clt_md_update(struct rmr_pool *pool) +{ + struct rmr_pool_md *clt_md = &pool->pool_md; + + if (pool->sync) + return; + + clt_md->map_ver = pool->map_ver; +} + +#if 0 +static int send_map_set_version(struct rmr_clt_pool_sess *pool_sess, u64 ver) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MAP_SET_VER; + msg.set_map_ver_cmd.map_ver = ver; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + } + return err; +} + +/** + * rmr_clt_coordinate_discard() - Coordinate the discard_entries flag + * + * @pool: the client pool + * @member_id: member id of the source node + * + * Description: + * This function sends discard request to all normal pool sessions of the pool. + * It is to solve the case where network is partitioned between the server nodes + * and only the client connects those partitions. Any request that failed on a session + * would fail this call. + * + * TODO: To address the network partitions (including the client), wait for consistency + * protocols. + * + * Return: + * 0 on success + * Negative error in case of failure + * + * Pre-requisite: rcu read lock should be held by caller + */ +static int rmr_clt_coordinate_discard(struct rmr_pool *pool, u8 cmd_type, u8 member_id) +{ + struct rmr_clt_pool_sess *pool_sess; + int err = 0; + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + /* + * If the pool session state is not normal, the dirty maps of the that pool is + * likely corrupted. Don't bother to send the discards. + */ + if (atomic_read(&pool_sess->state) != RMR_CLT_POOL_SESS_NORMAL) + continue; + + pr_info("%s: send discards to (pool_sess %s: %d) with member_id %u\n", + __func__, pool_sess->sessname, pool_sess->member_id, member_id); + + /* Send discard request to the pool session. */ + err = send_discard(pool_sess, cmd_type, member_id); + if (err) { + pr_err("%s: Failed discard request on sess %s for member_id %u\n", + __func__, pool_sess->sessname, member_id); + return err; + } + } + + return err; +} + +static int rmr_clt_handle_discard(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_dirty_id_map *map; + int idx, ret, err = 0; + u64 map_ver; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + /* Find out if there is pending discard requests on the server side */ + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + ret = send_map_get_version(pool_sess, &map_ver); + if (ret) + continue; + + /* + * When disk replacement appears at the storage node, pserver will set the all + * map entries of that server to dirty. 
+ */ + if (RMR_STORE_IS_REPLACE(map_ver)) { + map = rmr_pool_find_map(pool, pool_sess->member_id); + if (!map) { + pr_err("The clt pool %s cannot find map for member_id %u\n", + pool->poolname, pool_sess->member_id); + err = -EINVAL; + goto out; + } + + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + + /* Check any normal pool session failed to receive discards */ + err = rmr_clt_coordinate_discard(pool, RMR_CMD_SEND_DISCARD, + pool_sess->member_id); + if (err) { + pr_err("%s: Failed to coordinate discard state for member_id %u\n", + __func__, pool_sess->member_id); + goto out; + } + + /* update the map version */ + err = send_map_set_version(pool_sess, RMR_STORE_UNSET_REPLACE(map_ver)); + if (err) { + pr_err("%s: Failed to reset map version for %s\n", + __func__, pool_sess->sessname); + goto out; + } + + /* Everyone knows about the discarded entries now. */ + err = rmr_clt_coordinate_discard(pool, RMR_CMD_DISCARD_CLEAR_FLAG, + pool_sess->member_id); + if (err) { + pr_err("%s: Failed to clear discard flag for S%u\n", + __func__, pool_sess->member_id); + goto out; + } + } + } + +out: + srcu_read_unlock(&pool->sess_list_srcu, idx); + return err; +} +#endif + +static int rmr_clt_start_send_md(struct rmr_pool *pool); + +/** + * recover_work() - A work thread, which performs a number of tasks at regular intervals + * + * @work: The work struct holding the data + * + * Description: + * Every client pool has its own work thread. It performs the following 3 tasks. + * 1) Pool sessions in NORMAL state, and having dirty map entries associated with it, + * are checked, and if the entries are cleared from the particular storage node, then + * they are deleted from the pserver also. + * 2) If the pool session state is FAILED, but the network state (clt session) is connected, + * then a store check message is send to the pool session. The storage node wil confirm + * with the backend, if IOs can be send or not. + * 3) Send the client pool metadata to the servers. 
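+ *
+ * Context:
+ * Runs on the pool's recover_wq; the work re-queues itself at the end of
+ * every run, so it keeps executing every RMR_RECOVER_INTERVAL_MS until it
+ * is cancelled or the recover workqueue is destroyed.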
+ */ +void recover_work(struct work_struct *work) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_pool_md *clt_md; + int index, lock_idx = 0; + + clt_pool = container_of(to_delayed_work(work), struct rmr_clt_pool, recover_dwork); + pool = clt_pool->pool; + + pr_debug("check map for pool %s started...\n", pool->poolname); + + lock_idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + pr_debug("pool %s sess %s sess->member_id %d sess->state %d\n", + pool->poolname, pool_sess->sessname, + pool_sess->member_id, atomic_read(&pool_sess->state)); + + clt_md = &pool->pool_md; + index = rmr_pool_find_md(clt_md, pool_sess->member_id, false); + if (index < 0) { + pr_debug("%s failed to find pool_sess %u\n", + __func__, pool_sess->member_id); + continue; + } + if (pool_sess->maintenance_mode) + goto pool_sess_state_check; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL) { + struct rmr_dirty_id_map *map; + + map = rmr_pool_find_map(pool, pool_sess->member_id); + if (!map) { + pr_debug("pool %s no map found for member_id %u\n", + pool->poolname, pool_sess->member_id); + continue; + } + if (!rmr_map_empty(map)) { + pr_debug("pool %s sess %s map is not empty, check stg map...\n", + pool->poolname, pool_sess->sessname); + send_map_check(pool_sess); + } + } +pool_sess_state_check: + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED && + clt_sess->state == RMR_CLT_SESS_CONNECTED) { + pr_debug("pool %s sess %s try pool sess recover\n", + pool->poolname, pool_sess->sessname); + send_store_check(pool_sess); + } + } + srcu_read_unlock(&pool->sess_list_srcu, lock_idx); + + rmr_clt_md_update(pool); + /* If the send fails, wait for the next update. */ + rmr_clt_start_send_md(pool); + + pr_debug("check map for pool %s done. 
schedule next one.\n", pool->poolname); + + queue_delayed_work(clt_pool->recover_wq, &clt_pool->recover_dwork, + msecs_to_jiffies(RMR_RECOVER_INTERVAL_MS)); +} + +static int init_clt_pool(struct rmr_clt_pool *clt_pool) +{ + int err; + + clt_pool->pcpu_sess = alloc_percpu(typeof(*clt_pool->pcpu_sess)); + if (unlikely(!clt_pool->pcpu_sess)) { + err = -ENOMEM; + goto out_err; + } + + return 0; + +out_err: + return err; +} + +static void destroy_clt_pool(struct rmr_pool *pool) +{ + int i; + struct rmr_clt_pool *clt_pool; + struct rmr_dirty_id_map *map; + struct rmr_dirty_id_map *maplist = NULL; + + clt_pool = (struct rmr_clt_pool *)pool->priv; + if (clt_pool) { + free_percpu(clt_pool->pcpu_sess); + clt_pool->pcpu_sess = NULL; + } + + mutex_lock(&pool->maps_lock); + for (i = 0; i < pool->maps_cnt; i++) { + map = rcu_dereference_protected(pool->maps[i], + lockdep_is_held(&pool->maps_lock)); + if (WARN_ON(!map)) + continue; + rcu_assign_pointer(pool->maps[i], NULL); + map->next = maplist; + maplist = map; + } + pool->maps_cnt = 0; + + if (maplist) + synchronize_srcu(&pool->map_srcu); + + mutex_unlock(&pool->maps_lock); + + rmr_maplist_destroy(maplist); +} + +static void rmr_put_sess_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu); + +static struct rmr_iu * +rmr_alloc_iu(void) +{ + struct rmr_iu *iu; + + iu = kzalloc(sizeof(*iu), GFP_KERNEL); + if (!iu) + return NULL; + INIT_LIST_HEAD(&iu->sess_list); + iu->num_sessions = 0; + refcount_set(&iu->ref, 1); + return iu; +} + +void rmr_get_iu(struct rmr_iu *iu) +{ + refcount_inc(&iu->ref); +} + +void rmr_put_iu(struct rmr_iu *iu) +{ + struct rmr_clt_sess_iu *sess_iu, *tmp; + + if (refcount_dec_and_test(&iu->ref)) { + list_for_each_entry_safe(sess_iu, tmp, + &iu->sess_list, entry) { + if (!list_empty(&sess_iu->entry)) + list_del_init(&sess_iu->entry); + rmr_put_sess_iu(sess_iu->pool_sess, sess_iu); + } + kfree(iu); + } +} + +void rmr_clt_free_pool_sess(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_clt_pool *clt_pool; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + clt_pool = (struct rmr_clt_pool *)pool_sess->pool->priv; + + if (!list_empty(&pool_sess->clt_sess_entry)) { + mutex_lock(&clt_sess->lock); + list_del(&pool_sess->clt_sess_entry); + mutex_unlock(&clt_sess->lock); + } + + pr_info("before free pool_sess %s, clt_sess refcount=%d\n", + pool_sess->sessname, kref_read(&clt_sess->kref)); + + kfree(pool_sess); +} + +void rmr_clt_put_pool(struct rmr_pool *pool) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + + rmr_put_clt_pool(clt_pool); +} +EXPORT_SYMBOL(rmr_clt_put_pool); + +/** + * rmr_clt_open() - Open a client for use + * + * @priv: private data for the user + * @link_ev: holds the link event callback + * @poolname: name of the pool to open + * + * Description: + * Open an RMR pool for the user to use. The rmr pool must have at least one session. + * A single pool can be opened and used by only a single user. + * + * Return: + * Returns pointer to the rmr pool opened. 
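+ *
+ * A minimal usage sketch from an upper-layer consumer (my_dev and
+ * my_link_ev are hypothetical consumer-side names; only the RMR calls
+ * shown are part of this API):
+ *
+ *	pool = rmr_clt_open(my_dev, my_link_ev, "pool0");
+ *	if (IS_ERR(pool))
+ *		return PTR_ERR(pool);
+ *	... submit I/O; rmr_clt_get_priv(pool) hands back my_dev ...
+ *	rmr_clt_close(pool);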
+ */ +struct rmr_pool *rmr_clt_open(void *priv, rmr_clt_ev_fn *link_ev, const char *poolname) +{ + struct rmr_clt_pool *clt_pool; + int err; + + clt_pool = rmr_find_and_get_clt_pool(poolname); + if (IS_ERR(clt_pool)) { + pr_err("RMR client pool '%s' is not found\n", poolname); + err = PTR_ERR(clt_pool); + goto err_out; + } + + if (!mutex_trylock(&clt_pool->clt_pool_lock)) { + pr_err("RMR client pool '%s' is busy, recovery in progress\n", poolname); + err = -EBUSY; + goto put_err; + } + if (test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state)) { + pr_err("RMR client pool '%s' is already in use\n", poolname); + err = -ENOENT; + goto put_err; + } + + if (!test_bit(RMR_CLT_POOL_STATE_JOINED, &clt_pool->state)) { + pr_err("RMR client pool '%s' has no sessions open\n", poolname); + err = -ENOENT; + goto put_err; + } + + clt_pool->link_ev = link_ev; + clt_pool->priv = priv; + + err = init_clt_pool(clt_pool); + if (unlikely(err)) { + pr_err("RMR client pool '%s' failed to initialize: %d\n", poolname, err); + goto put_err; + } + + rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_IN_USE, true); + + mutex_unlock(&clt_pool->clt_pool_lock); + return clt_pool->pool; + +put_err: + mutex_unlock(&clt_pool->clt_pool_lock); + rmr_put_clt_pool(clt_pool); +err_out: + return ERR_PTR(err); +} +EXPORT_SYMBOL(rmr_clt_open); + +void rmr_clt_close(struct rmr_pool *pool) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + + mutex_lock(&clt_pool->clt_pool_lock); + rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_IN_USE, false); + + pr_info("%s: RMR client close called for pool %s\n", __func__, pool->poolname); + + /* + * Freeze I/O. + * Degrade ref count to the usual model with a single shared + * atomic_t counter + */ + rmr_clt_pool_io_freeze(clt_pool); + pr_info("pool %s wait for inflight io to complete\n", clt_pool->pool->poolname); + + /* Wait for all completion */ + rmr_clt_pool_io_wait_complete(clt_pool); + + pr_info("pool %s inflight io completed\n", clt_pool->pool->poolname); + + clt_pool->link_ev = NULL; + clt_pool->priv = NULL; + + /* Unfreeze and Resurrect */ + rmr_clt_pool_io_unfreeze(clt_pool); + + mutex_unlock(&clt_pool->clt_pool_lock); + + rmr_put_clt_pool(clt_pool); +} +EXPORT_SYMBOL(rmr_clt_close); + +void *rmr_clt_get_priv(struct rmr_pool *pool) +{ + struct rmr_clt_pool *clt_pool; + + clt_pool = (struct rmr_clt_pool *)pool->priv; + if (clt_pool) + return clt_pool->priv; + + return NULL; +} +EXPORT_SYMBOL(rmr_clt_get_priv); + +static struct rmr_clt_sess *alloc_clt_sess(const char *sessname) +{ + struct rmr_clt_sess *sess; + + sess = kzalloc_node(sizeof(*sess), GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!sess)) { + pr_err("Failed to create session %s," + " allocating session struct failed\n", + sessname); + return ERR_PTR(-ENOMEM); + } + strscpy(sess->sessname, sessname, sizeof(sess->sessname)); + mutex_init(&sess->lock); + INIT_LIST_HEAD(&sess->pool_sess_list); + kref_init(&sess->kref); + sess->state = RMR_CLT_SESS_DISCONNECTED; + + return sess; +} + +static struct rmr_clt_pool_sess *alloc_pool_sess(struct rmr_pool *pool, + struct rmr_clt_sess *clt_sess) +{ + struct rmr_clt_pool_sess *pool_sess; + + pool_sess = kzalloc_node(sizeof(*pool_sess), GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!pool_sess)) { + pr_err("Failed to allocate session for pool %s\n", pool->poolname); + return ERR_PTR(-ENOMEM); + } + + strscpy(pool_sess->sessname, clt_sess->sessname, NAME_MAX); + INIT_LIST_HEAD(&pool_sess->entry); + INIT_LIST_HEAD(&pool_sess->clt_sess_entry); + pool_sess->pool = pool; 
+ pool_sess->clt_sess = clt_sess; + pool_sess->maintenance_mode = false; + atomic_set(&pool_sess->state, RMR_CLT_POOL_SESS_CREATED); + + return pool_sess; +} + +/* + * Checks if the session already exists (search by session name) + * Returns TRUE if session found, FALSE otherwise. + */ +static bool __find_sess_by_name(struct rmr_pool *pool, const char *sessname) +{ + struct rmr_clt_pool_sess *pool_sess; + int idx; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (!strcmp(sessname, pool_sess->sessname)) { + srcu_read_unlock(&pool->sess_list_srcu, idx); + return true; + } + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return false; +} + +/** + * __find_sess_by_member_id() - Find and return pool_sess with a given member_id + * + * @pool: RMR pool to search pool_sess in + * @member_id: member ID to search + * + * Return: + * Pointer to rmr_clt_pool_sess on success + * NULL if no pool session exists with the given member_id + * + * Context: + * The caller should hold srcu_read_lock + */ +static struct rmr_clt_pool_sess *__find_sess_by_member_id(struct rmr_pool *pool, u8 member_id) +{ + struct rmr_clt_pool_sess *pool_sess = NULL, *tmp_pool_sess; + + list_for_each_entry_srcu(tmp_pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (member_id == tmp_pool_sess->member_id) { + pool_sess = tmp_pool_sess; + break; + } + } + + return pool_sess; +} + +/** + * pool_sess_change_state() - Change pool session state + * + * @pool_sess: Pool session whose state is to be changed + * @newstate: New state which is to be set + * + * Description: + * Pool session states decide a number of crucial things. + * Where the IOs can be sent, which node has an outdated map, etc. + * As such, transitioning of states are important and is tightly controlled through + * this function. All state transitions should happen through this function. + * + * Return: + * True in case the state was changed + * False in case the state was not changed + */ +bool pool_sess_change_state(struct rmr_clt_pool_sess *pool_sess, + enum rmr_clt_pool_sess_state newstate) +{ + bool changed = false; + int oldstate = atomic_read(&pool_sess->state); + + if (WARN_ON(oldstate == RMR_CLT_POOL_SESS_REMOVING)) + goto out; + + switch (newstate) { + case RMR_CLT_POOL_SESS_NORMAL: + if (pool_sess->maintenance_mode) + break; + /* + * Non-sync sessions must pass through RECONNECTING before + * reaching NORMAL so that a map update can take place first. + * Sync sessions skip RECONNECTING entirely and go FAILED→NORMAL + * directly. + */ + if (!rmr_clt_sess_is_sync(pool_sess)) { + if (WARN_ON(oldstate == RMR_CLT_POOL_SESS_FAILED)) + break; + if (oldstate == RMR_CLT_POOL_SESS_CREATED || + oldstate == RMR_CLT_POOL_SESS_RECONNECTING) + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + } else { + if (oldstate == RMR_CLT_POOL_SESS_CREATED || + oldstate == RMR_CLT_POOL_SESS_FAILED || + oldstate == RMR_CLT_POOL_SESS_RECONNECTING) + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + } + break; + case RMR_CLT_POOL_SESS_RECONNECTING: + /* + * Sync sessions never need a map update and must not enter + * RECONNECTING. 
+ */ + if (WARN_ON(rmr_clt_sess_is_sync(pool_sess) && + !pool_sess->maintenance_mode)) + break; + if (oldstate == RMR_CLT_POOL_SESS_FAILED || + oldstate == RMR_CLT_POOL_SESS_CREATED || + (oldstate == RMR_CLT_POOL_SESS_NORMAL && pool_sess->maintenance_mode)) + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + break; + case RMR_CLT_POOL_SESS_FAILED: + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + /* + * TODO + * We should really be updating map version with the state, + * Or before it. + */ + if (changed && oldstate != RMR_CLT_POOL_SESS_FAILED) + pool_sess->pool->map_ver++; + break; + case RMR_CLT_POOL_SESS_REMOVING: + changed = atomic_try_cmpxchg(&pool_sess->state, + &oldstate, + newstate); + break; + default: + pr_err("%s: Unknown state %d\n", __func__, newstate); + break; + } + + if (changed && !rmr_clt_sess_is_sync(pool_sess)) { + if (newstate == RMR_CLT_POOL_SESS_NORMAL) { + /* + * Entering NORMAL: this session is no longer the last + * authoritative holder of the dirty map. + */ + pool_sess->was_last_authoritative = false; + atomic_inc(&pool_sess->pool->normal_count); + } else if (oldstate == RMR_CLT_POOL_SESS_NORMAL) { + /* + * Leaving NORMAL via FAILED or maintenance-mode + * RECONNECTING: decrement the count of NORMAL sessions. + * If this was the last one, mark it as authoritative so + * that recovery can enable it directly (without a map + * update) when it comes back — its dirty map was the last + * complete one the pool had. + * + * REMOVING is not marked authoritative: a deliberate + * removal (delete or disassemble) is not an uncontrolled + * failure. On reassembly the leg goes through the full + * map update path and does not need the direct-enable + * shortcut. + */ + if (newstate == RMR_CLT_POOL_SESS_FAILED || + (newstate == RMR_CLT_POOL_SESS_RECONNECTING && + pool_sess->maintenance_mode)) { + if (atomic_dec_and_test(&pool_sess->pool->normal_count)) + pool_sess->was_last_authoritative = true; + } else { + /* REMOVING */ + atomic_dec(&pool_sess->pool->normal_count); + } + } + } + +out: + + trace_pool_sess_change_state(pool_sess, newstate, oldstate, changed); + + return changed; +} + +void rmr_clt_pool_io_freeze(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + mutex_lock(&clt_pool->io_freeze_lock); + if (atomic_inc_return(&clt_pool->io_freeze) == 1) + percpu_ref_kill(&pool->ids_inflight_ref); + mutex_unlock(&clt_pool->io_freeze_lock); +} + +void rmr_clt_pool_io_unfreeze(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + mutex_lock(&clt_pool->io_freeze_lock); + if (atomic_dec_return(&clt_pool->io_freeze) == 0) { + reinit_completion(&pool->complete_done); + percpu_ref_reinit(&pool->ids_inflight_ref); + + wake_up_all(&clt_pool->map_update_wq); + } + mutex_unlock(&clt_pool->io_freeze_lock); +} + +void rmr_clt_pool_io_wait_complete(struct rmr_clt_pool *clt_pool) +{ + struct rmr_pool *pool = clt_pool->pool; + + wait_for_completion(&pool->complete_done); +} + +//am: what kind of locking is rquired for that ? 
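+/*
+ * Walks clt_sess->pool_sess_list under clt_sess->lock and moves every leg
+ * of this transport session to FAILED, so it must be called without
+ * clt_sess->lock held.
+ */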
+static void set_pool_sess_states_to_failed(struct rmr_clt_sess *clt_sess) +{ + struct rmr_clt_pool_sess *pool_sess; + + mutex_lock(&clt_sess->lock); + + list_for_each_entry(pool_sess, &clt_sess->pool_sess_list, clt_sess_entry) { + if (pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED)) + pr_info("set sess %s to failed due to link_ev\n", pool_sess->sessname); + } + mutex_unlock(&clt_sess->lock); +} + +static void rmr_clt_link_ev(void *priv, enum rtrs_clt_link_ev ev) +{ + struct rmr_clt_sess *clt_sess = priv; + + switch (ev) { + case RTRS_CLT_LINK_EV_DISCONNECTED: + pr_info("Rtrs link ev disconnected: session %s\n", + clt_sess->sessname); + clt_sess->state = RMR_CLT_SESS_DISCONNECTED; + set_pool_sess_states_to_failed(clt_sess); + break; + case RTRS_CLT_LINK_EV_RECONNECTED: + pr_info("Rtrs link ev reconnected: session %s\n", + clt_sess->sessname); + clt_sess->state = RMR_CLT_SESS_CONNECTED; + resend_join_pool(clt_sess); + break; + default: + pr_err("Unknown rtrs link event received (%d), " + "session: %s\n", + ev, clt_sess->sessname); + } +} + +/* + * Gets an iu for I/O operations. + * + * Context: + * The call to this function should be protected with an srcu_read_lock. + */ +static struct rmr_clt_sess_iu *rmr_get_sess_iu(struct rmr_clt_pool_sess *pool_sess, + enum rtrs_clt_con_type con_type, + enum wait_type wait) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct rmr_clt_sess_iu *sess_iu; + struct rtrs_permit *permit; + + WARN_ON(!srcu_read_lock_held(&pool->sess_list_srcu)); + + if (clt_sess->state == RMR_CLT_SESS_DISCONNECTED) { + pr_info("The rmr client session %s state is disconnected\n", clt_sess->sessname); + return NULL; + } + + sess_iu = kzalloc(sizeof(*sess_iu), GFP_KERNEL); + if (!sess_iu) + return NULL; + + permit = rtrs_clt_get_permit(clt_sess->rtrs, con_type, wait); + if (unlikely(!permit)) { + kfree(sess_iu); + return NULL; + } + + INIT_LIST_HEAD(&sess_iu->entry); + sess_iu->permit = permit; + sess_iu->pool_sess = pool_sess; + + return sess_iu; +} + +/* + * Gets the iu for user messages. + * It will be reference counted initialized with refcount + */ +static inline struct rmr_clt_sess_iu *rmr_msg_get_iu(struct rmr_clt_pool_sess *pool_sess, + enum rtrs_clt_con_type con_type, + enum wait_type wait, int refcount) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_sess_iu *sess_iu; + int idx; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + sess_iu = rmr_get_sess_iu(pool_sess, con_type, wait); + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (unlikely(!sess_iu)) + return NULL; + + init_waitqueue_head(&sess_iu->comp.wait); + sess_iu->comp.errno = INT_MAX; + atomic_set(&sess_iu->refcount, refcount); + + return sess_iu; +} + +/* + * reference counted put, refcount has to be initialized. + */ +void rmr_msg_put_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu) +{ + if (atomic_dec_and_test(&sess_iu->refcount)) { + rtrs_clt_put_permit(pool_sess->clt_sess->rtrs, sess_iu->permit); + kfree(sess_iu); + } +} + +/* + * put the sess_iu without reference counting. + * I/O does not need reference counting. 
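+ *
+ * The per-session IUs of an I/O are owned by the parent rmr_iu: once the
+ * rmr_iu refcount drops to zero, rmr_put_iu() walks iu->sess_list and
+ * releases each of them through this helper.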
+ */ +static void rmr_put_sess_iu(struct rmr_clt_pool_sess *pool_sess, + struct rmr_clt_sess_iu *sess_iu) +{ + rtrs_clt_put_permit(pool_sess->clt_sess->rtrs, sess_iu->permit); + kfree(sess_iu); +} + +void wake_up_iu_comp(struct rmr_clt_sess_iu *sess_iu) +{ + sess_iu->comp.errno = sess_iu->errno; + wake_up(&sess_iu->comp.wait); +} + +void msg_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + + sess_iu->errno = errno; + /* just schedule the work because kfree must not be done here */ + schedule_work(&sess_iu->work); +} + +static int send_usr_msg(struct rtrs_clt_sess *rtrs, int dir, + struct rmr_clt_sess_iu *sess_iu, + struct kvec *vec, size_t nr, size_t len, + struct scatterlist *sg, unsigned int sg_len, + void (*conf)(struct work_struct *work), + int *errno, enum rmr_wait_type wait) +{ + int err; + struct rtrs_clt_req_ops req_ops; + + INIT_WORK(&sess_iu->work, conf); + req_ops = (struct rtrs_clt_req_ops){ + .priv = sess_iu, + .conf_fn = msg_conf, + }; + + trace_send_usr_msg(dir, sess_iu); + + err = rtrs_clt_request(dir, &req_ops, rtrs, sess_iu->permit, + vec, nr, len, sg, sg_len); + if (!err && wait) { + wait_event_timeout(sess_iu->comp.wait, + sess_iu->comp.errno != INT_MAX, + msecs_to_jiffies(RMR_CLT_SEND_MSG_TIMEOUT_MS)); + *errno = sess_iu->comp.errno; + if (*errno == INT_MAX) + *errno = -ETIMEDOUT; + } else { + *errno = 0; + } + return err; +} + +static int send_msg_rejoin_pool(struct rmr_clt_pool_sess *pool_sess, bool wait) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + int ret; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_REJOIN_POOL; + + msg.join_pool_cmd.rejoin = true; + msg.join_pool_cmd.chunk_size = pool->chunk_size; + msg.join_pool_cmd.queue_depth = clt_sess->queue_depth; + + ret = rmr_clt_pool_send_cmd(pool_sess, &msg, wait); + if (ret) + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + + return ret; +} + +static int send_msg_join_pool(struct rmr_clt_pool_sess *pool_sess, bool create, + bool dirty, bool wait) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool_member_info *mem_info; + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_pool_sess *t_pool_sess; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct rmr_dirty_id_map *map; + int ret, i = 0, idx; + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_JOIN_POOL; + + msg.join_pool_cmd.queue_depth = clt_sess->queue_depth; + msg.join_pool_cmd.chunk_size = pool->chunk_size; + msg.join_pool_cmd.rejoin = false; + + if (!msg.sync) { + msg.join_pool_cmd.create = create; + msg.join_pool_cmd.dirty = dirty; + mem_info = &(msg.join_pool_cmd.mem_info); + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(t_pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (t_pool_sess->member_id == pool_sess->member_id) + continue; + + map = rmr_pool_find_map(pool, t_pool_sess->member_id); + if (!map) { + pr_err("%s: Map with member_id %u does not exist\n", + __func__, t_pool_sess->member_id); + srcu_read_unlock(&pool->sess_list_srcu, idx); + return -ENOENT; + } + + mem_info->p_mem_info[i].member_id = t_pool_sess->member_id; + /* Only relevant for create */ + if (create) + mem_info->p_mem_info[i].c_dirty = !rmr_map_empty(map); + i++; + if (WARN_ON(i >= RMR_POOL_MAX_SESS)) + break; + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + mem_info->no_of_stor = i; + } + + ret = 
rmr_clt_pool_send_cmd(pool_sess, &msg, wait); + if (ret) + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + + return ret; +} + +int send_msg_leave_pool(struct rmr_clt_pool_sess *pool_sess, bool delete, bool wait) +{ + struct rmr_msg_pool_cmd msg = {}; + int ret; + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_LEAVE_POOL; + + msg.leave_pool_cmd.member_id = pool_sess->member_id; + msg.leave_pool_cmd.delete = delete; + + ret = rmr_clt_pool_send_cmd(pool_sess, &msg, wait); + if (ret) + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + + return ret; +} + +bool rmr_clt_sess_is_sync(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_pool *pool = pool_sess->pool; + bool ret = false; + + if (!pool) { + WARN(1, "for sess %s pool is not assigned\n", + pool_sess->clt_sess->sessname); + return false; + } + + if (pool->sync) { + pr_debug("sess %s pool %s is sync (internal) clt sess\n", + pool_sess->clt_sess->sessname, pool->poolname); + ret = true; + } else { + pr_debug("sess %s pool %s is not sync clt sess\n", + pool_sess->clt_sess->sessname, pool->poolname); + ret = false; + } + return ret; +} + +/** + * rmr_clt_send_pool_info() - Notify all other pool members of a membership change + * + * @pool_sess: The pool session of the member whose state is changing. + * @op: Operation: %RMR_POOL_INFO_OP_ADD or %RMR_POOL_INFO_OP_REMOVE. + * @mode: For ADD: %RMR_POOL_INFO_MODE_CREATE or %RMR_POOL_INFO_MODE_ASSEMBLE. + * For REMOVE: %RMR_POOL_INFO_MODE_DELETE or %RMR_POOL_INFO_MODE_DISASSEMBLE. + * @dirty: When op is ADD and mode is CREATE, indicates that @pool_sess + * has outstanding dirty data that the receiving node must track. + * + * Sends a POOL_INFO command to every other non-FAILED, non-REMOVING + * member in the pool so they can update their view of pool membership. + * + * Return: + * 0 on success, negative error code on failure. + * + * Context: + * This function blocks while sending the command. + */ +static int rmr_clt_send_pool_info(struct rmr_clt_pool_sess *pool_sess, + enum rmr_pool_info_op op, enum rmr_pool_info_mode mode, + bool dirty) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_pool_sess *t_pool_sess; + struct rmr_msg_pool_cmd msg = {}; + int idx, ret = 0; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_POOL_INFO; + + msg.pool_info_cmd.member_id = pool_sess->member_id; + msg.pool_info_cmd.operation = op; + msg.pool_info_cmd.mode = mode; + + if (op == RMR_POOL_INFO_OP_ADD && mode == RMR_POOL_INFO_MODE_CREATE && dirty) + msg.pool_info_cmd.dirty = dirty; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(t_pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + enum rmr_clt_pool_sess_state state; + + /* + * No need to send the info message to the member who just joined. + */ + if (t_pool_sess->member_id == pool_sess->member_id) + continue; + + state = atomic_read(&t_pool_sess->state); + /* + * TODO: For FAILED session we have to store the missed + * msgs and send them later when the session recovers. 
+ */ + if (state == RMR_CLT_POOL_SESS_FAILED || + state == RMR_CLT_POOL_SESS_REMOVING) + continue; + + ret = rmr_clt_pool_send_cmd(t_pool_sess, &msg, WAIT); + if (ret) { + pr_err("%s failed with err %d\n", rmr_get_cmd_name(msg.cmd_type), ret); + break; + } + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return ret; +} + +void resend_join_pool(struct rmr_clt_sess *clt_sess) +{ + struct rmr_clt_pool_sess *pool_sess; + + mutex_lock(&clt_sess->lock); + + list_for_each_entry(pool_sess, &clt_sess->pool_sess_list, clt_sess_entry) { + int err; + + err = send_msg_rejoin_pool(pool_sess, WAIT); + if (err) { + pr_err("send_msg_rejoin_pool failed for sess %s error %d\n", + pool_sess->sessname, err); + } + } + mutex_unlock(&clt_sess->lock); + + return; +} + +int send_msg_enable_pool(struct rmr_clt_pool_sess *pool_sess, bool enable) +{ + struct rmr_msg_pool_cmd msg = {}; + int ret; + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_ENABLE_POOL; + + msg.enable_pool_cmd.enable = enable; + + ret = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (ret) { + pr_err("%s failed\n", rmr_get_cmd_name(msg.cmd_type)); + goto err; + } + +err: + return ret; +} + +static const char *rmr_clt_pool_sess_state_names[] = { + [0] = "invalid state", + [RMR_CLT_POOL_SESS_CREATED] = "created", + [RMR_CLT_POOL_SESS_NORMAL] = "normal", + [RMR_CLT_POOL_SESS_FAILED] = "failed", + [RMR_CLT_POOL_SESS_RECONNECTING] = "reconnecting", + [RMR_CLT_POOL_SESS_REMOVING] = "removing" +}; + +const char *rmr_clt_sess_state_str(enum rmr_clt_pool_sess_state state) +{ + return rmr_clt_pool_sess_state_names[state]; +} + +int rmr_clt_reconnect_sess(struct rmr_clt_sess *clt_sess, + const struct rtrs_addr *paths, + size_t path_cnt) +{ + struct rtrs_attrs attrs; + struct rtrs_clt_ops rtrs_ops; + int err = 0; + + rtrs_ops = (struct rtrs_clt_ops){ + .priv = clt_sess, + .link_ev = rmr_clt_link_ev, + }; + + clt_sess->rtrs = rtrs_clt_open(&rtrs_ops, clt_sess->sessname, + paths, path_cnt, RTRS_PORT, + 0, /* Do not use pdu of rtrs */ + RECONNECT_DELAY, + MAX_RECONNECTS, 0); + if (IS_ERR(clt_sess->rtrs)) { + err = PTR_ERR(clt_sess->rtrs); + pr_err("rtrs_clt_open error %d\n", err); + goto err; + } + + err = rtrs_clt_query(clt_sess->rtrs, &attrs); + if (unlikely(err)) { + pr_err("rtrs_clt_query error %d\n", err); + goto close_sess; + } + clt_sess->max_io_size = attrs.max_io_size; + clt_sess->queue_depth = attrs.queue_depth; + clt_sess->max_segments = attrs.max_segments; + + clt_sess->state = RMR_CLT_SESS_CONNECTED; + + resend_join_pool(clt_sess); + + return err; + +close_sess: + rtrs_clt_close(clt_sess->rtrs); +err: + return err; +} + +//TODO: we do not use rsp in this function, do we need it as an argument? +static int rmr_clt_handle_rejoin_rsp(struct rmr_clt_pool_sess *pool_sess, struct rmr_msg_pool_cmd_rsp *rsp) +{ + int err = 0; + + if (rmr_clt_sess_is_sync(pool_sess)) { + /* + * The client on sync side does not need map update + * hence goes to "normal" state directly. + * NB: FAILED => NORMAL + */ + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL); + } else { + /* + * The client on non-sync side needs map update, + * + * A map update is to be triggered, which updates the map, + * and then sets state to "normal" + */ + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING); + + /* + * Send the info about the pool to all the storages. + * Contains IDs of storages connected to this pool. 
+ */ + err = rmr_clt_send_pool_info(pool_sess, RMR_POOL_INFO_OP_ADD, + RMR_POOL_INFO_MODE_ASSEMBLE, false); + if (err) { + pr_err("Rejoin: rmr_clt_send_pool_info failed for session %s", + pool_sess->sessname); + return -EINVAL; + } + + err = rmr_clt_pool_try_enable(pool_sess->pool); + if (err) + pr_err("%s: pool %s try_enable failed for sess %s: %d\n", + __func__, pool_sess->pool->poolname, + pool_sess->sessname, err); + } + + return err; +} + +static void rmr_clt_handle_join_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_pool_md *clt_md; + u64 mapped_size; + + clt_md = &pool->pool_md; + + pool_sess->ver = min_t(u8, rsp->ver, RMR_PROTO_VER_MAJOR); + pool_sess->member_id = rsp->member_id; + xa_store(&pool->stg_members, pool_sess->member_id, pool_sess, GFP_KERNEL); + + pool->chunk_size = rsp->join_pool_cmd_rsp.chunk_size; + pool->chunk_size_shift = ilog2(pool->chunk_size); + clt_md->chunk_size = pool->chunk_size; + + mapped_size = rsp->join_pool_cmd_rsp.mapped_size; + if (mapped_size) { + pool->mapped_size = mapped_size; + pool->pool_md.mapped_size = mapped_size; + rmr_pool_update_no_of_chunk(pool); + pr_info("clt join_pool: mapped size %llu\n", pool->mapped_size); + } +} + +static int cmd_process_rsp(struct rmr_clt_pool_sess *pool_sess, struct rmr_msg_pool_cmd_rsp *rsp) +{ + int err = 0; + + pr_debug("rsp, cmd_type %d, member_id %d, err %d\n", + rsp->cmd_type, rsp->member_id, rsp->err); + + if (rsp->err) + return rsp->err; + + switch (rsp->cmd_type) { + case RMR_CMD_MAP_CHECK: + return rmr_clt_handle_map_check_rsp(pool_sess, rsp); + case RMR_CMD_STORE_CHECK: + return rmr_clt_handle_store_check_rsp(pool_sess, rsp); + case RMR_CMD_MAP_READY: + case RMR_CMD_MAP_SEND: + case RMR_CMD_MAP_BUF_DONE: + case RMR_CMD_MAP_DONE: + case RMR_CMD_MAP_DISABLE: + case RMR_CMD_LEAVE_POOL: + case RMR_CMD_LAST_IO_TO_MAP: + case RMR_CMD_MD_SEND: + case RMR_CMD_MAP_SET_VER: + case RMR_CMD_SEND_DISCARD: + case RMR_CMD_DISCARD_CLEAR_FLAG: + case RMR_CMD_POOL_INFO: + pr_debug("%s: No rsp handling for %s\n", __func__, rmr_get_cmd_name(rsp->cmd_type)); + break; + case RMR_CMD_REJOIN_POOL: + return rmr_clt_handle_rejoin_rsp(pool_sess, rsp); + case RMR_CMD_JOIN_POOL: + rmr_clt_handle_join_rsp(pool_sess, rsp); + break; + case RMR_CMD_ENABLE_POOL: + pool_sess->ver = min_t(u8, rsp->ver, RMR_PROTO_VER_MAJOR); + break; + default: + pr_warn("%s: switch default type: %d\n", __func__, rsp->cmd_type); + + err = -EINVAL; + } + + return err; +} + +static void msg_pool_cmd_conf(struct work_struct *work) +{ + struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work); + struct rmr_msg_pool_cmd_rsp *rsp = sess_iu->buf; + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + + pr_debug("pool cmd for %s session %s member_id %d conf with errno %d\n", + pool_sess->pool->poolname, pool_sess->sessname, + pool_sess->member_id, sess_iu->errno); + + if (!sess_iu->errno) { + /* + * We need to check if there was an error while processing the cmd + * on the server side. If there was, then we fail the command. 
+ */ + sess_iu->errno = cmd_process_rsp(pool_sess, rsp); + } + + kfree(rsp); + wake_up_iu_comp(sess_iu); + rmr_msg_put_iu(pool_sess, sess_iu); +} + +void rmr_clt_init_cmd(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.group_id = cpu_to_le32(pool->group_id); + msg->hdr.type = cpu_to_le16(RMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->ver = RMR_PROTO_VER_MAJOR; + msg->sync = pool->sync; + + strncpy(msg->pool_name, pool->poolname, sizeof(msg->pool_name)); +} +EXPORT_SYMBOL(rmr_clt_init_cmd); + +int rmr_clt_pool_send_cmd(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd *msg, bool wait) +{ + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct rmr_msg_pool_cmd_rsp *rsp; + struct rmr_clt_sess_iu *sess_iu; + struct kvec vec = { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + int err, errno; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (unlikely(!rsp)) + return -ENOMEM; + + sess_iu = rmr_msg_get_iu(pool_sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT, 2); + if (unlikely(!sess_iu)) { + kfree(rsp); + return -ENOMEM; + } + + sess_iu->buf = rsp; + sg_init_one(&sess_iu->sg, rsp, sizeof(*rsp)); + + err = send_usr_msg(clt_sess->rtrs, READ, sess_iu, + &vec, 1, sizeof(*rsp), &sess_iu->sg, 1, + msg_pool_cmd_conf, &errno, wait); + if (unlikely(err)) { + rmr_msg_put_iu(pool_sess, sess_iu); + kfree(rsp); + } else { + err = errno; + } + + rmr_msg_put_iu(pool_sess, sess_iu); + + return err; +} + +/* + * Pre-requisite: rcu read lock should be held by caller + */ +static struct rmr_clt_pool_sess * +rmr_clt_get_first_normal_session(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess; + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL) + return pool_sess; + } + + return NULL; +} + +/** + * rmr_clt_pool_send_all - Send a command to all sessions in the pool + * + * @pool: The client pool which sends the command message + * @msg: The command message of pool + * + * Description: + * When sending messages to all pool sessions, it will continue to send + * regardless of the failure of the previous communication. + * + * Return: + * 0 if at least one successful request + * less than 0 if all requests failed + */ +int rmr_clt_pool_send_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg) +{ + struct rmr_clt_pool_sess *pool_sess; + int idx, err = 0; + u8 member_id = 0; + int ret = 0; + + if (msg->cmd_type == RMR_CMD_SEND_DISCARD) + member_id = msg->send_discard_cmd.member_id; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + /* The node has had discards. */ + if (pool_sess->member_id == member_id) + continue; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED) + continue; + + pr_info("pool %s send cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + + /* The err code reflects the response from this pool_sess. 
*/ + err = rmr_clt_pool_send_cmd(pool_sess, msg, WAIT); + if (err) { + pr_err("pool %s sending cmd to sess %s failed, err=%d\n", + pool->poolname, pool_sess->sessname, err); + continue; + } + + pr_info("pool %s done sending cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + ret++; + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (ret) + return 0; + + return -ENETUNREACH; +} +EXPORT_SYMBOL(rmr_clt_pool_send_all); + +/** + * rmr_clt_send_cmd_with_data_all - Send a command with data to all sessions in the pool + * + * Return: + * 0 on success of all sends + * less than 0 if all sends failed + * positive number of failed sends + */ +int rmr_clt_send_cmd_with_data_all(struct rmr_pool *pool, struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen) +{ + struct rmr_clt_pool_sess *pool_sess; + int idx, err = 0; + bool ret = false; + int errno = 0; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED) { + errno++; + continue; + } + + pr_debug("pool %s send cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + err = rmr_clt_send_cmd_with_data(pool, pool_sess, msg, buf, buflen); + if (err) { + errno++; + pr_debug("pool %s sending cmd to sess %s failed, err=%d\n", + pool->poolname, pool_sess->sessname, err); + continue; + } + + pr_debug("pool %s done sending cmd %d to sess %s\n", + pool->poolname, msg->cmd_type, pool_sess->sessname); + ret = true; + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (ret) + return errno; + + return -EINVAL; +} +EXPORT_SYMBOL(rmr_clt_send_cmd_with_data_all); + +/** + * rmr_clt_start_last_io_update() - Do the last IO update + * + * @pool: The pool + * + * Description: + * Last IO update is needed in case a pserver went down while connected to a pool. + * A pserver going down while performing IOs could mean that some IOs could have been + * executed in some nodes but not all. This function takes the last 'queue_depth' number of + * IOs on each storage node and makes sure they are synced in between all the nodes. + * Before performing the last IO conversion, it also makes sure that all the storage nodes + * have the lastest map. + * + * Return: + * 0 on success + * Error value on failure + * + * Context: + * srcu_read_lock should be held while calling this function. + */ +int rmr_clt_start_last_io_update(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess_chosen, *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + u64 map_ver, highest_map_ver = 0; + int j, err, idx, ret = 0; + int discard_ids[RMR_POOL_MAX_SESS]; + u8 id, nr_discards = 0; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + for (j = 0; j < RMR_POOL_MAX_SESS; j++) { + struct rmr_clt_pool_sess *ps; + u8 mid = pool->pool_md.srv_md[j].member_id; + + if (!mid) + continue; + + ps = xa_load(&pool->stg_members, mid); + if (!ps) { + pr_err("%s: member_id %u not yet assembled\n", + __func__, mid); + err = -EINVAL; + goto out; + } + if (atomic_read(&ps->state) != RMR_CLT_POOL_SESS_RECONNECTING) { + pr_err("%s: member_id %u not in reconnecting state\n", + __func__, mid); + err = -EINVAL; + goto out; + } + } + + /* + * Before pserver died, it could be that one or more storage nodes were down. + * This would mean there is a possibility that those storage nodes will not have + * the latest map. But that can create problems. 
+ * We need to make sure that every storage node has the latest map. + * Hence, find out which node has the latest map first, + */ + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + err = send_map_get_version(pool_sess, &map_ver); + if (err) { + pr_err("%s: Failed to read map version for sess %s\n", + __func__, pool_sess->sessname); + err = -EINVAL; + goto out; + } + + if (RMR_STORE_IS_REPLACE(map_ver)) { + map_ver = RMR_STORE_GET_VER(map_ver); + discard_ids[nr_discards] = pool_sess->member_id; + nr_discards++; + } + + if (map_ver > highest_map_ver) { + highest_map_ver = map_ver; + pool_sess_chosen = pool_sess; + } + } + + for (j = 0; j < nr_discards; j++) { + id = discard_ids[j]; + pr_info("%s: Send discard req %d to S%d\n", + __func__, id, pool_sess_chosen->member_id); + err = send_discard(pool_sess_chosen, RMR_CMD_SEND_DISCARD, id); + if (err) { + pr_err("%s: Failed to send discard request to %s\n", + __func__, pool_sess_chosen->sessname); + goto out; + } + } + + /* + * We have the storage node with the latest map, + * make sure the latest map is sent to all other storage nodes. + */ + err = rmr_clt_spread_map(pool, pool_sess_chosen, false, false); + if (err) { + pr_err("%s: Failed to spread the latest map\n", __func__); + goto out; + } + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + for (j = 0; j < nr_discards; j++) { + id = discard_ids[j]; + pr_info("%s: Send discard clear req %d to S%d\n", + __func__, id, pool_sess->member_id); + err = send_discard(pool_sess, RMR_CMD_DISCARD_CLEAR_FLAG, id); + if (err) { + pr_err("%s: Failed to clear discard state on %s\n", + __func__, pool_sess->sessname); + } else { + ret++; + } + } + } + + if (nr_discards && !ret) { + pr_err("%s: Failed to clear discard state on any storage node\n", __func__); + err = -EINVAL; + goto out; + } + + /* + * Now that we are done with the dispersing of the latest map, + * we can start last IO update. + */ + rmr_clt_init_cmd(pool, &msg); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + msg.cmd_type = RMR_CMD_LAST_IO_TO_MAP; + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto out; + } + + err = rmr_clt_spread_map(pool, pool_sess, true, false); + if (err) { + pr_err("%s: Failed to spread last_io converted map\n", __func__); + goto out; + } + } + + err = rmr_clt_read_map(pool); + if (err) { + pr_err("%s: rmr_clt_read_map failed with err %d\n", __func__, err); + goto out; + } + +out: + srcu_read_unlock(&pool->sess_list_srcu, idx); + return err; +} + +/** + * rmr_clt_enable_sess() - Enable the rmr clt pool sessions + * + * @pool_sess: The rmr clt pool session to enable + * + * Description: + * This function takes care of enable request, for pool sessions + * not in maintenance mode and in mm. + * + * Return: + * 0 on success + * Error value on failure + */ +int rmr_clt_enable_sess(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_pool *pool = pool_sess->pool; + int pool_sess_state, err = 0; + + pr_info("%s: For session %s of pool %s\n", + __func__, pool_sess->sessname, pool->poolname); + + if (!pool_sess->maintenance_mode) { + /* + * Simple enable, not related to maintenance. 
+ * Manual enable is only allowed for sessions in "created" state + */ + pool_sess_state = atomic_read(&pool_sess->state); + if (pool_sess_state != RMR_CLT_POOL_SESS_CREATED) { + pr_err("Cannot manually enable session: state %d\n", pool_sess_state); + err = -EINVAL; + goto out; + } + + err = send_msg_enable_pool(pool_sess, 1); + if (err) { + pr_err("Failed to send enable to pool %s. Err %d\n", + pool->poolname, err); + goto out; + } + + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL); + } else { + /* + * Enable when in maintenance mode. + */ + err = rmr_clt_unset_pool_sess_mm(pool_sess); + } + +out: + return err; +} + +/** + * rmr_clt_create_sess() - allocate and initialize rmr client session, rmr_clt_pool sess can use it + * to submit io to the rtrs connection + * + * @sessname: Name to be given to the new session being created. + * @paths: RTRS paths created for the session. + * @path_cnt: Number of paths. + * + * Return: + * Pointer to rmr_clt_sess on success + * ERR_PTR on failure + * + * Description: + * Create a new session to storage node with address "rtrs_addr". + * After this function is done, rmr_clt_pool_sess caan use this sess to submit io + * + * Context: + * This function blocks while creating the session + */ +static struct rmr_clt_sess *rmr_clt_create_sess(const char *sessname, + const struct rtrs_addr *paths, + size_t path_cnt) +{ + struct rmr_clt_sess *clt_sess; + struct rtrs_attrs attrs; + struct rtrs_clt_ops rtrs_ops; + int err; + + clt_sess = alloc_clt_sess(sessname); + if (IS_ERR(clt_sess)) { + pr_err("Session '%s' can not be allocated in pool\n", sessname); + return clt_sess; // TODO: isit err_cast here? + } + + rtrs_ops = (struct rtrs_clt_ops){ + .priv = clt_sess, + .link_ev = rmr_clt_link_ev, + }; + /* + * Nothing was found, establish rtrs connection and proceed further. 
+ */ + clt_sess->rtrs = rtrs_clt_open(&rtrs_ops, sessname, + paths, path_cnt, RTRS_PORT, + 0, /* Do not use pdu of rtrs */ + RECONNECT_DELAY, + MAX_RECONNECTS, 0); + if (IS_ERR(clt_sess->rtrs)) { + err = PTR_ERR(clt_sess->rtrs); + pr_err("rtrs_clt_open error %d\n", err); + goto free_clt_sess; + } + err = rtrs_clt_query(clt_sess->rtrs, &attrs); + if (unlikely(err)) { + pr_err("rtrs_clt_query error %d\n", err); + goto close_sess; + } + clt_sess->max_io_size = attrs.max_io_size; + clt_sess->queue_depth = attrs.queue_depth; + clt_sess->max_segments = attrs.max_segments; + //sess->sess_kobj = &sess->rtrs->dev.dev.kobj; + + err = rmr_clt_create_clt_sess_sysfs_files(clt_sess); + if (err) { + pr_err("failed to crete sysfs files for sess %s, err=%d\n", + clt_sess->sessname, err); + goto close_sess; + } + clt_sess->state = RMR_CLT_SESS_CONNECTED; + + mutex_lock(&g_sess_lock); + list_add(&clt_sess->g_list, &g_sess_list); + mutex_unlock(&g_sess_lock); + + return clt_sess; + +close_sess: + rtrs_clt_close(clt_sess->rtrs); + +free_clt_sess: + kfree(clt_sess); + + return ERR_PTR(err); +} + +/** + * rmr_clt_pool_try_enable() - Trigger pool session recovery if conditions are met + * + * @pool: The pool to check + * + * Scans pool sessions and fires the appropriate recovery action: + * + * Case 1: ≥1 NORMAL session exists → spread its map (with enable=true) to all + * non-NORMAL sessions, then set them to NORMAL on the client side + * Case 2: Exactly one was_last_authoritative RECONNECTING session exists → + * enable it directly (data is complete, no map needed), then spread + * its map to remaining sessions + * Cases 3/4: All pool_md members present and RECONNECTING → last_io_update + * + * Return: 0 on success or when conditions are not yet met, negative error on failure. + */ +int rmr_clt_pool_try_enable(struct rmr_pool *pool) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + struct rmr_clt_pool_sess *pool_sess, *normal_sess, *auth_sess; + bool any_member = false; + int idx, j, err = 0; + + pr_info("%s: Started for pool %s\n", __func__, pool->poolname); + + /* + * clt_pool_lock is held across all RPC round-trips below (MAP_READY, + * MAP_SEND, MAP_DONE, last_io_update exchanges). This serialises + * concurrent try_enable calls and prevents rmr_clt_open/close from + * racing with recovery. The RPC send path (rmr_clt_pool_send_cmd) + * uses per-session permits and does not acquire clt_pool_lock, so + * there is no deadlock. rmr_clt_open and rmr_clt_close use + * mutex_trylock and mutex_lock respectively to handle this. + */ + mutex_lock(&clt_pool->clt_pool_lock); + + normal_sess = NULL; + auth_sess = NULL; + + idx = srcu_read_lock(&pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + int state = atomic_read(&pool_sess->state); + + if (state == RMR_CLT_POOL_SESS_NORMAL) { + if (!normal_sess) + normal_sess = pool_sess; + } else if (state == RMR_CLT_POOL_SESS_RECONNECTING && + pool_sess->was_last_authoritative && + !pool_sess->maintenance_mode && + !auth_sess) { + auth_sess = pool_sess; + } + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + /* + * Invariant: at most one was_last_authoritative session can exist + * (guaranteed by atomic_dec_and_test in pool_sess_change_state), and + * it cannot coexist with a NORMAL session (if a NORMAL session exists, + * the pool never fully went to FAILED, so no session gets the flag). 
+ */ + if (WARN_ON(auth_sess && normal_sess)) { + err = -EINVAL; + goto out; + } + + /* Case 2: was_last_authoritative session — enable it directly, then spread */ + if (auth_sess) { + err = send_msg_enable_pool(auth_sess, 1); + if (err) { + pr_err("%s: pool %s failed to enable auth sess %s: %d\n", + __func__, pool->poolname, auth_sess->sessname, err); + goto out; + } + pool_sess_change_state(auth_sess, RMR_CLT_POOL_SESS_NORMAL); + normal_sess = auth_sess; + } + + /* Case 1: ≥1 NORMAL session → spread map to all non-NORMAL sessions */ + if (normal_sess) { + idx = srcu_read_lock(&pool->sess_list_srcu); + err = rmr_clt_spread_map(pool, normal_sess, true, true); + if (err) + pr_err("%s: pool %s spread map from %s failed: %d\n", + __func__, pool->poolname, normal_sess->sessname, err); + else + goto out_normal; + + srcu_read_unlock(&pool->sess_list_srcu, idx); + goto out; + } + + /* Cases 3/4: all pool_md members present and RECONNECTING */ + for (j = 0; j < RMR_POOL_MAX_SESS; j++) { + struct rmr_clt_pool_sess *ps; + u8 mid = pool->pool_md.srv_md[j].member_id; + + if (!mid) + continue; + + any_member = true; + ps = xa_load(&pool->stg_members, mid); + if (!ps || atomic_read(&ps->state) != RMR_CLT_POOL_SESS_RECONNECTING || + ps->maintenance_mode) { + pr_info("%s: pool %s member_id %u not yet in reconnecting/mm, waiting\n", + __func__, pool->poolname, mid); + goto out; + } + } + + if (!any_member) { + pr_info("%s: pool %s has no members in pool_md, nothing to do\n", + __func__, pool->poolname); + goto out; + } + + pr_info("%s: pool %s all members reconnecting, starting last_io_update\n", + __func__, pool->poolname); + + err = rmr_clt_start_last_io_update(pool); + if (err) { + pr_err("%s: pool %s last_io_update failed: %d\n", + __func__, pool->poolname, err); + goto out; + } + + idx = srcu_read_lock(&pool->sess_list_srcu); +out_normal: + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) != RMR_CLT_POOL_SESS_RECONNECTING || + pool_sess->maintenance_mode) + continue; + + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL); + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + +out: + mutex_unlock(&clt_pool->clt_pool_lock); + return err; +} + +/** + * rmr_clt_read_pool_md() - Read the full pool_md from a storage server's disk + * + * @pool_sess: The pool session to read from. + * + * Sends RMR_CMD_MD_SEND with read_full_md=1 to the given session and imports + * the returned srv_md[] entries into pool->pool_md, skipping already-known + * members. Used during add_sess mode=assemble so the client learns all pool + * member IDs from the server's on-disk metadata, not only the one being + * assembled. + * + * Return: + * 0 on success, negative error code on failure. 
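+ *
+ * Note: the @first argument is only forwarded to rmr_pool_find_md() when
+ * each returned member_id is looked up in the local pool_md.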
+ */ +static int rmr_clt_read_pool_md(struct rmr_clt_pool_sess *pool_sess, bool first) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool_md *remote_md; + int i, err; + + remote_md = kzalloc(sizeof(*remote_md), GFP_KERNEL); + if (!remote_md) + return -ENOMEM; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MD_SEND; + msg.md_send_cmd.src_mapped_size = pool->mapped_size; + msg.md_send_cmd.sender_id = pool_sess->member_id; + msg.md_send_cmd.read_full_md = 1; + + err = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, + remote_md, sizeof(*remote_md)); + if (err) { + pr_err("%s: failed to read pool_md from sess %s: %d\n", + __func__, pool_sess->sessname, err); + goto out; + } + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + u8 mid = remote_md->srv_md[i].member_id; + int idx; + + if (!mid) + continue; + + idx = rmr_pool_find_md(&pool->pool_md, mid, first); + if (idx < 0) + continue; + + if (!pool->pool_md.srv_md[idx].member_id) { + /* New entry — import blindly */ + memcpy(&pool->pool_md.srv_md[idx], &remote_md->srv_md[i], + sizeof(struct rmr_srv_md)); + } else { + /* Already known — verify stable fields are consistent */ + if (pool->pool_md.srv_md[idx].mapped_size != + remote_md->srv_md[i].mapped_size) + pr_warn("%s: member_id %u mapped_size mismatch: " + "expected %llu, got %llu from sess %s\n", + __func__, mid, + pool->pool_md.srv_md[idx].mapped_size, + remote_md->srv_md[i].mapped_size, + pool_sess->sessname); + } + } + +out: + kfree(remote_md); + return err; +} + +/** + * rmr_clt_process_non_sync_sess() - Set up map and notify peers for a new non-sync session + * + * @pool_sess: The newly added pool session. + * @create: True if this is a fresh pool creation; false for an assemble of an + * existing pool. + * @dirty: True if there are already other sessions in the pool; the new member's + * map will be marked fully dirty to trigger a resync. + * + * Creates the dirty map for @pool_sess and informs all existing pool members + * about the new storage node joining. On failure the map is removed. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_clt_process_non_sync_sess(struct rmr_clt_pool_sess *pool_sess, bool create, + bool dirty) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_dirty_id_map *map; + enum rmr_pool_info_mode mode; + u8 created_mids[RMR_POOL_MAX_SESS]; + int created_cnt = 0; + int i, err = 0; + + /* + * The mapped size of the pool is set after a backend device is mapped to the + * client. If a new client pool session is extended to this pool, the map for that + * new server node needs to be created for the client pool as well. + */ + if (!pool->mapped_size) { + pr_err("%s: pool %s mapped_size is 0\n", + __func__, pool->poolname); + err = -EINVAL; + goto out; + } + + pr_info("Through add_sess, pool %s mapped_size %llu\n", + pool->poolname, pool->mapped_size); + + rmr_pool_update_no_of_chunk(pool); + + if (create) { + if (rmr_pool_find_map(pool, pool_sess->member_id)) { + pr_err("%s: pool %s map for member_id %u already exists\n", + __func__, pool->poolname, pool_sess->member_id); + err = -EEXIST; + goto out; + } + + map = rmr_map_create(pool, pool_sess->member_id); + if (IS_ERR(map)) { + err = PTR_ERR(map); + pr_err("%s: pool %s failed to create map for member_id %u\n", + __func__, pool->poolname, pool_sess->member_id); + goto out; + } + + /* + * During pool creation, all storage nodes must start with identical + * data. 
The first node added is taken as the clean reference; any + * subsequent node joining must be fully synced from it. + * Mark the entire map dirty to trigger that initial resync. + */ + if (dirty) + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + + mode = RMR_POOL_INFO_MODE_CREATE; + } else { + /* + * For assemble, read pool_md first so we know all member IDs, + * then create maps for every member in the pool. + */ + mode = RMR_POOL_INFO_MODE_ASSEMBLE; + + err = rmr_clt_read_pool_md(pool_sess, !dirty); + if (err) { + pr_err("%s: failed to read pool_md from sess %s: %d\n", + __func__, pool_sess->sessname, err); + goto out; + } + + if (!dirty) { + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + u8 mid = pool->pool_md.srv_md[i].member_id; + + if (!mid) + continue; + + map = rmr_map_create(pool, mid); + if (IS_ERR(map)) { + err = PTR_ERR(map); + pr_err("%s: pool %s failed to create map for member_id %u\n", + __func__, pool->poolname, mid); + goto del_maps; + } + created_mids[created_cnt++] = mid; + } + } + } + + /* + * We need to send the info about this node joining to other storage nodes. + */ + err = rmr_clt_send_pool_info(pool_sess, RMR_POOL_INFO_OP_ADD, mode, dirty); + if (err) { + pr_err("rmr_clt_send_pool_info failed for session %s\n", + pool_sess->sessname); + if (create) + rmr_pool_remove_map(pool, pool_sess->member_id); + else + goto del_maps; + goto out; + } + + if (!create) { + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING); + err = rmr_clt_pool_try_enable(pool); + if (err) + pr_err("%s: pool %s try_enable failed for sess %s: %d\n", + __func__, pool->poolname, pool_sess->sessname, err); + } + + return err; + +del_maps: + for (i = 0; i < created_cnt; i++) + rmr_pool_remove_map(pool, created_mids[i]); +out: + return err; +} + +/** + * rmr_clt_add_pool_sess() - Add a client session to an RMR pool + * + * @pool: The pool to join. + * @clt_sess: The client transport session to associate. + * @create: True if this is a fresh pool creation; false for an assemble of an + * existing pool. + * + * Sends a join_pool command to the server, allocates a pool session, creates + * the dirty map for this storage node (for non-sync pools), and notifies the + * other pool members via a pool_info message. + * + * Return: + * Pointer to the new pool session on success, ERR_PTR on failure. 
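+ *
+ * Context:
+ * Takes and releases pool->sess_lock and clt_sess->lock internally, so the
+ * caller must not hold them.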
+ */ +struct rmr_clt_pool_sess *rmr_clt_add_pool_sess(struct rmr_pool *pool, + struct rmr_clt_sess *clt_sess, bool create) +{ + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_pool_md *clt_md; + int err, idx; + bool dirty = false; + + mutex_lock(&pool->sess_lock); + + if (__find_sess_by_name(pool, clt_sess->sessname)) { + pr_err("Session '%s' already exists in pool %s\n", + clt_sess->sessname, pool->poolname); + err = -EEXIST; + goto err_out; + } + + pool_sess = alloc_pool_sess(pool, clt_sess); + if (IS_ERR(pool_sess)) { + pr_err("pool session '%s' can not be allocated in pool %s\n", + clt_sess->sessname, pool->poolname); + err = PTR_ERR(pool_sess); + goto err_out; + } + + clt_pool = (struct rmr_clt_pool *)pool->priv; + + /* TODO handle case where tags are alreaydy initialized */ + clt_pool->queue_depth = clt_sess->queue_depth; + clt_md = &clt_pool->pool->pool_md; + clt_md->queue_depth = clt_sess->queue_depth; + + if (!pool->sync) + dirty = !list_empty(&pool->sess_list); + + err = send_msg_join_pool(pool_sess, create, dirty, WAIT); + if (unlikely(err)) { + pr_err("send_msg_join_pool error %d\n", err); + goto free_sess; + } + + /* + * Now that we have the member_id of the new storage node, + * check if it is unique. + */ + idx = srcu_read_lock(&pool->sess_list_srcu); + if (__find_sess_by_member_id(pool, pool_sess->member_id)) { + srcu_read_unlock(&pool->sess_list_srcu, idx); + pr_err("%s: Session with member_id %u already exists\n", + __func__, pool_sess->member_id); + err = -EEXIST; + goto err_leave_pool; + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + + list_add_tail_rcu(&pool_sess->entry, &pool->sess_list); + + if (!pool->sync) { + err = rmr_clt_process_non_sync_sess(pool_sess, create, dirty); + if (err) { + pr_err("%s: rmr_clt_process_non_sync_sess failed for sess %s with err %d\n", + __func__, clt_sess->sessname, err); + goto rem_from_list; + } + } else + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_NORMAL); + + mutex_unlock(&pool->sess_lock); + + mutex_lock(&clt_sess->lock); + list_add_tail(&pool_sess->clt_sess_entry, &clt_sess->pool_sess_list); + mutex_unlock(&clt_sess->lock); + + return pool_sess; + +rem_from_list: + rmr_clt_del_pool_sess(pool_sess); +err_leave_pool: + send_msg_leave_pool(pool_sess, create, WAIT); +free_sess: + rmr_clt_free_pool_sess(pool_sess); +err_out: + mutex_unlock(&pool->sess_lock); + return ERR_PTR(err); +} + +//reauire g_sess_lock acquired +static struct rmr_clt_sess *__find_and_get_clt_sess(const char *sessname) +{ + struct rmr_clt_sess *sess, *sn; + +again: + list_for_each_entry_safe (sess, sn, &g_sess_list, g_list) { + if (strcmp(sessname, sess->sessname)) + continue; + + if (rmr_clt_sess_get(sess)) + return sess; + + pr_info("failed to get ref for sess %s\n", sessname); + goto again; //don't like it + } + + return NULL; +} + +struct rmr_clt_sess *find_and_get_or_create_clt_sess(char *sessname, + struct rtrs_addr *paths, + size_t path_cnt) +{ + struct rmr_clt_sess *sess; + + mutex_lock(&g_sess_lock); + sess = __find_and_get_clt_sess(sessname); + mutex_unlock(&g_sess_lock); + + if (!sess) { + pr_info("%s: Cannot find rmr_clt_sess with name %s\n", __func__, sessname); + sess = rmr_clt_create_sess(sessname, paths, path_cnt); + if (IS_ERR(sess)) { + return sess; + } + pr_info("%s: rmr_clt_sess %s created\n", __func__, sessname); + } + + return sess; +} + +/** + * rmr_clt_del_pool_sess() - Remove a session from the pool session list. + * @pool_sess: Pool session to remove. 
+ * + * Removes @pool_sess from the pool's session list, waits for any in-progress + * SRCU readers to finish, and clears any per-CPU cached references to it. + * + * Context: Caller must hold pool->sess_lock. + */ +void rmr_clt_del_pool_sess(struct rmr_clt_pool_sess *pool_sess) +{ + int cpu; + bool dosync = false; + struct rmr_clt_pool_sess __rcu **ppcpu_sess; + struct rmr_pool *pool = pool_sess->pool; + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + + list_del_rcu(&pool_sess->entry); + synchronize_srcu(&pool->sess_list_srcu); + + for_each_possible_cpu(cpu) { + preempt_disable(); + ppcpu_sess = per_cpu_ptr(clt_pool->pcpu_sess, cpu); + if (pool_sess == rcu_access_pointer(*ppcpu_sess)) { + rcu_assign_pointer(*ppcpu_sess, NULL); + dosync = true; + } + preempt_enable(); + } + + if (dosync) + synchronize_srcu(&pool->sess_list_srcu); +} + +/** + * rmr_clt_destroy_pool_sess() - Send leave_pool and free a pool session + * + * @pool_sess: Pool session to destroy. + * @delete: True for a permanent pool deletion; false for a temporary + * disassembly. This flag is forwarded in the leave_pool message + * so the server can act accordingly. + */ +void rmr_clt_destroy_pool_sess(struct rmr_clt_pool_sess *pool_sess, bool delete) +{ + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + send_msg_leave_pool(pool_sess, delete, WAIT); + rmr_clt_free_pool_sess(pool_sess); + rmr_clt_sess_put(clt_sess); +} + +static void rmr_clt_destroy_pool(struct rmr_pool *pool) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + struct rmr_clt_pool_sess *pool_sess, *tmp; + + destroy_clt_pool(pool); + + list_for_each_entry_safe (pool_sess, tmp, &pool->sess_list, entry) { + mutex_lock(&pool->sess_lock); + list_del_rcu(&pool_sess->entry); + mutex_unlock(&pool->sess_lock); + + rmr_clt_destroy_pool_sess(pool_sess, false /* never delete */); + } + + rmr_put_clt_pool(clt_pool); +} + +int rmr_clt_remove_pool_from_sysfs(struct rmr_pool *pool, + const struct attribute *sysfs_self) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (!pool->sync) + cancel_delayed_work_sync(&clt_pool->recover_dwork); + + rmr_clt_destroy_pool_sysfs_files(pool, sysfs_self); + rmr_clt_destroy_pool(pool); + return 0; +} + +/* + * Pre-requisite: rcu read lock should be held by caller + */ +static struct rmr_clt_pool_sess * +rmr_clt_next_sess(struct rmr_pool *pool, struct rmr_clt_pool_sess *prev) +{ + struct rmr_clt_pool_sess *next; + + next = list_next_or_null_rcu(&pool->sess_list, + &prev->entry, + struct rmr_clt_pool_sess, + entry); + if (next) + return next; + + return list_first_or_null_rcu(&pool->sess_list, + struct rmr_clt_pool_sess, + entry); +} + +static inline bool rmr_clt_pool_sess_in_iu(struct rmr_iu *iu, + struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(iu->sess_list), entry) { + + if (sess_iu->pool_sess == pool_sess) + return true; + } + + return false; +} + +/* + * Pre-requisite: rcu read lock should be held by caller + */ +static struct rmr_clt_pool_sess *rmr_clt_round_robin_sess(struct rmr_pool *pool, + struct rmr_iu *iu) +{ + struct rmr_clt_pool_sess *old, *next, *pool_sess; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess __rcu **ppcpu_sess; + + clt_pool = (struct rmr_clt_pool *)pool->priv; + ppcpu_sess = this_cpu_ptr(clt_pool->pcpu_sess); + + if (iu) { + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) 
{ + if (rmr_clt_pool_sess_in_iu(iu, pool_sess)) + continue; + + rcu_assign_pointer(*ppcpu_sess, pool_sess); + return pool_sess; + } + + return NULL; + } + + old = rcu_dereference(*ppcpu_sess); + if (!old) { + next = rmr_clt_get_first_normal_session(pool); + if (!next) + return NULL; + rcu_assign_pointer(*ppcpu_sess, next); + return next; + } + + for (next = rmr_clt_next_sess(pool, old); + next && next != old; + next = rmr_clt_next_sess(pool, next)) { + /* + * It could happen that the state of pool_sess hasn't been able to + * represent the recent rtrs-clt sess state. + */ + if (next->clt_sess->state == RMR_CLT_SESS_DISCONNECTED) + continue; + + if (atomic_read(&next->state) == RMR_CLT_POOL_SESS_NORMAL) { + rcu_assign_pointer(*ppcpu_sess, next); + return next; + } + } + + /* + * There may be just one session with normal state i.e. old. + * In this case per-cpu sess pointer does not need update. + */ + return rmr_clt_get_first_normal_session(pool); +} + +int rmr_clt_query(struct rmr_pool *pool, struct rmr_attrs *attr) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + int idx; + + if (unlikely(!clt_pool)) + return -EINVAL; + + attr->chunk_size = pool->chunk_size; + attr->sync = pool->sync; + + attr->queue_depth = U32_MAX; + attr->max_io_size = U32_MAX; + attr->max_segments = U32_MAX; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + if (list_empty(&pool->sess_list)) { + srcu_read_unlock(&pool->sess_list_srcu, idx); + return -ENOENT; + } + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + + attr->queue_depth = min_t(int, clt_sess->queue_depth, attr->queue_depth); + attr->max_io_size = min_t(u32, clt_sess->max_io_size, attr->max_io_size); + attr->max_segments = min_t(u32, clt_sess->max_segments, attr->max_segments); + } + attr->pool_kobj = &(pool->kobj); + + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return 0; +} +EXPORT_SYMBOL(rmr_clt_query); + +struct rmr_iu *rmr_clt_get_iu(struct rmr_pool *pool, enum rmr_io_flags flag, + enum rmr_wait_type wait) +{ + int err = 0, idx; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_iu *iu; + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + bool reset = false; + + clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state)) { + pr_err("%s: Pool %s not in use state\n", __func__, pool->poolname); + rmr_clt_dump_state(clt_pool); + return NULL; + } + + /* + * We get the inflight ref first. + * If we see that an IO freeze is in progress, we put the ref, and wait for it to unfreeze + * + * The while loop protects us from parallel freeze, like + * A leg deletion, and right after that a call to rmr_clt_close. + * + * We are guranteed to not go on an infinite loop, since rmr_clt_close can be called only + * once, And, there are limited legs to delete + */ + percpu_ref_get(&pool->ids_inflight_ref); + while (atomic_read(&clt_pool->io_freeze) > 0) { + percpu_ref_put(&pool->ids_inflight_ref); + /* + * Coincidentally, the rcu lock might be held when the wait event occurs, + * violating the constraint that no sleeping during general rcu critical section. + * Temporarily release the rcu lock, and re-acquire it after waking up. + * + * TODO: This approach is simple but may need to be revisited. 
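+		 *
+		 * Note: this assumes the caller holds at most one level of
+		 * rcu_read_lock(); only a single unlock/re-lock is done here.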
+ */ + if (rcu_read_lock_held()) { + rcu_read_unlock(); + reset = true; + } + + wait_event(clt_pool->map_update_wq, !atomic_read(&clt_pool->io_freeze)); + + if (reset) + rcu_read_lock(); + + /* + * Once IO is unfrozen, we check if the state of the pool has changed. + * It could be that rmr_clt_close was called, and hence state is not IN_USE. + * Or, it could be that the last leg was deleted, and we are not in JOINED state + * + * In both the case, we cannot service IOs, hence fail. + */ + if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state) || + !test_bit(RMR_CLT_POOL_STATE_JOINED, &clt_pool->state)) { + pr_err("%s: Failed to get inflight IO ref.\n", __func__); + pr_err("%s: Pool %s is not joined or used\n", + __func__, pool->poolname); + rmr_clt_dump_state(clt_pool); + return NULL; + } + + percpu_ref_get(&pool->ids_inflight_ref); + } + + iu = rmr_alloc_iu(); + if (unlikely(!iu)) { + percpu_ref_put(&pool->ids_inflight_ref); + return NULL; + } + + idx = srcu_read_lock(&pool->sess_list_srcu); + if (rmr_op(flag) == RMR_OP_READ) { + /* + * Round robin use of one of the sessions in normal state for READ. + * + * This call is always from rmr_clt_request, so for READ, + * this is the first pool_sess we are trying + */ + pool_sess = rmr_clt_round_robin_sess(pool, NULL); + if (unlikely(!pool_sess)) { + err = -ENODEV; + goto put_iu; + } + + sess_iu = rmr_get_sess_iu(pool_sess, RTRS_IO_CON, (enum wait_type) wait); + if (unlikely(!sess_iu)) + goto put_iu; + + sess_iu->rmr_iu = iu; + iu->num_sessions = 1; + list_add_tail(&(sess_iu->entry), (&iu->sess_list)); + } else { + /* + * For WRITE operations we need to submit to all sessions. + */ + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + /* Sessions must be in normal state for I/O */ + if (atomic_read(&pool_sess->state) != RMR_CLT_POOL_SESS_NORMAL) + continue; + + sess_iu = rmr_get_sess_iu(pool_sess, + RTRS_IO_CON, (enum wait_type) wait); + if (unlikely(!sess_iu)) + goto put_sessions; + + sess_iu->rmr_iu = iu; + /* + * The mem_id of sess_iu tracks the next free slot in the permit bitmap + * of an RTRS-clt session, which is used to store write IO chunk info by + * RMR-server. + */ + sess_iu->mem_id = sess_iu->permit->mem_id; + iu->num_sessions++; + list_add_tail(&(sess_iu->entry), (&iu->sess_list)); + } + } + + refcount_set(&iu->refcount, iu->num_sessions); + iu->errno = 0; + + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return iu; + +put_sessions: + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(iu->sess_list), entry) { + if (!list_empty(&sess_iu->entry)) + list_del_init(&sess_iu->entry); + rmr_put_sess_iu(sess_iu->pool_sess, sess_iu); + } +put_iu: + srcu_read_unlock(&pool->sess_list_srcu, idx); + rmr_put_iu(iu); + percpu_ref_put(&pool->ids_inflight_ref); + + if (err) + return ERR_PTR(err); + + return NULL; +} +EXPORT_SYMBOL(rmr_clt_get_iu); + +void rmr_clt_put_iu(struct rmr_pool *pool, struct rmr_iu *iu) +{ + rmr_put_iu(iu); + percpu_ref_put(&pool->ids_inflight_ref); +} +EXPORT_SYMBOL(rmr_clt_put_iu); + +/** + * Returns 1 if the errno represents a condition in the + * storage server that prevents the operation to be executed. + * The oposite is an error with respect to the storage server + * where the operation can be re-tried on a different one. + * + * Example is attemp to read a block that does not exists + * versus server has been crashed. + * + * Note that in doubt we have to trigger the re-try. 
+ */ +/* +static inline int rmr_is_op_error(int errno) +{ + switch (-errno) { + case ENOENT: + case EINVAL: + case EEXIST: + case ENODEV: + return 1; + default: + return 0; + } +} +*/ + +static void msg_read_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + struct rmr_iu *iu = sess_iu->rmr_iu; + rmr_conf_fn *clt_conf = iu->conf; + + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED); + + if (errno) { + if (!iu->errno) + /* only first error is reported */ + iu->errno = errno; + + pr_err_ratelimited("%s got errno: %d for session %d. Schedule retry.\n", + __func__, errno, pool_sess->member_id); + if (!pool_sess->pool->sync) + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED); + + INIT_WORK(&iu->work, retry_failed_read); + schedule_work(&iu->work); + } else { + (*clt_conf)(iu->priv, errno); + } +} + +static void retry_failed_read(struct work_struct *work) +{ + struct rmr_iu *iu = container_of(work, struct rmr_iu, work); + struct rmr_pool *pool = iu->pool; + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + rmr_conf_fn *clt_conf = iu->conf; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu; + struct rtrs_clt_req_ops req_ops; + struct kvec vec; + int err, idx; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + pool_sess = rmr_clt_round_robin_sess(pool, iu); + if (!pool_sess) + goto give_up; + + sess_iu = rmr_get_sess_iu(pool_sess, RTRS_IO_CON, RTRS_PERMIT_WAIT); + if (unlikely(!sess_iu)) + goto give_up; + + pr_debug("%s: Pool %s to session %d, chunk [%llu, %llu]\n", + __func__, pool->poolname, pool_sess->member_id, + le64_to_cpu(iu->msg.id_a), le64_to_cpu(iu->msg.id_b)); + + sess_iu->rmr_iu = iu; + iu->msg.member_id = pool_sess->member_id; + atomic_inc(&clt_pool->stats.read_retries); + + list_add_tail(&(sess_iu->entry), (&iu->sess_list)); + + vec = (struct kvec) { + .iov_base = &iu->msg, + .iov_len = sizeof(iu->msg) + }; + + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_read_conf, + }; + + trace_retry_failed_read(READ, sess_iu); + + err = rtrs_clt_request(RMR_OP_READ, &req_ops, pool_sess->clt_sess->rtrs, sess_iu->permit, + &vec, 1, le32_to_cpu(iu->msg.length), iu->sg, iu->sg_cnt); + + srcu_read_unlock(&pool->sess_list_srcu, idx); + + if (err) + /* beware! recursion!! */ + msg_read_conf(sess_iu, err); + + return; +give_up: + srcu_read_unlock(&pool->sess_list_srcu, idx); + /* recursion termination! */ + (*clt_conf)(iu->priv, iu->errno); +} + +/* +static int rmr_clt_map_remove_id(struct rmr_pool *pool, int srv_id, rmr_id_t id) +{ + struct rmr_dirty_id_map *map; + + pr_debug("pool %s, remove id (%llu, %llu) for stg_id %d\n", + pool->poolname, id.a, id.b, srv_id); + + map = rmr_pool_find_map(pool, srv_id); + if (!map) { + pr_err("pool %s no map found for pool_id %u\n", + pool->poolname, srv_id); + return -EINVAL; + //TODO: handle this , probably initialize map, or just throw err? 
+ } + + if (!rmr_map_empty(map)) { + void *val; + + val = rmr_map_find(map, id); + if (!val) { + pr_debug("pool %s value for id (%llu, %llu) is not in the dirty map\n", + pool->poolname, id.a, id.b); + return 0; + } + rmr_map_erase(map, id); + pr_debug("pool %s, id (%llu, %llu) is removed from map for stg_id %d\n", + pool->poolname, id.a, id.b, srv_id); + } + + return 0; +} +*/ + +static void msg_io_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess; + struct rmr_iu *iu = sess_iu->rmr_iu; + rmr_conf_fn *clt_conf = iu->conf; + void *clt_priv = iu->priv; + + WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED); + WARN_ON(pool_sess->pool->sync); + + if (errno) { + pr_err("%s: For sess %s, id (%llu, %llu), got errno: %d\n", + __func__, pool_sess->sessname, iu->msg.id_a, iu->msg.id_b, errno); + sess_iu->errno = errno; + if (!iu->errno) + /* only first error is reported */ + iu->errno = errno; + pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED); + pr_debug("iu->errno %d, errno %d, before dec refcnt %d\n", + iu->errno, errno, refcount_read(&iu->refcount)); + } else { + atomic_inc(&iu->succeeded); + // TODO: is it ok to clear it here? + // rmr_clt_map_remove_id(session->pool, session->pool_id, iu->id); + } + + pr_debug("called for id (%llu, %llu), errno %d, sessname %s\n", + iu->msg.id_a, iu->msg.id_b, errno, pool_sess->sessname); + + if (refcount_dec_and_test(&iu->refcount)) { + if (atomic_read(&iu->succeeded) == 0) { + /* + * None of the IOs succeeded. + * Map add is not needed; Just fail the IO. + */ + pr_err("Write IO failed. Passing it up. errno %d\n", iu->errno); + (*clt_conf)(clt_priv, iu->errno); + } else if (iu->errno) { + /* + * Some IOs failed. Send map update (add). + * The clt conf will be called when map update is done. + * + * We are using the same iu to send map update + * So reset the refcount. + */ + refcount_set(&iu->refcount, iu->num_sessions); + + /* + * we are in interrupt here, so sched map update + */ + pr_debug("%s: some IOs failed for %s. Starts map_add\n", __func__, + pool_sess->sessname); + INIT_WORK(&iu->work, sched_map_add); + schedule_work(&iu->work); + } else { + /* + * All good. + */ + errno = 0; + (*clt_conf)(clt_priv, errno); + } + } +} + +static inline void rmr_clt_put_cu(struct rmr_clt_cmd_unit *cmd_unit) +{ + percpu_ref_put(&cmd_unit->clt_pool->pool->ids_inflight_ref); + kfree(cmd_unit); +} + +/** + * msg_cmd_conf() - Confirmation function called for command user commands sent + * + * priv: Pointer to private data passed to rtrs. sess_iu in this case. + * errno: error status passed by rtrs + */ +static void msg_cmd_conf(void *priv, int errno) +{ + struct rmr_clt_sess_iu *sess_iu = (struct rmr_clt_sess_iu *)priv; + struct rmr_clt_cmd_unit *cmd_unit = sess_iu->rmr_cmd_unit; + rmr_conf_fn *clt_conf = cmd_unit->conf; + void *clt_priv = cmd_unit->priv; + int total_failed; + + pr_debug("%s: sessname:%s, errno=%d\n", __func__, sess_iu->pool_sess->sessname, errno); + if (!errno) + atomic_inc(&cmd_unit->succeeded); + + if (refcount_dec_and_test(&cmd_unit->refcount)) { + if (atomic_read(&cmd_unit->succeeded) == 0) { + /* + * None of the IOs succeeded. + */ + pr_err("CMD failed with err %pe. Passing it up.\n", ERR_PTR(errno)); + (*clt_conf)(clt_priv, errno); + } else { + total_failed = cmd_unit->failed_state + + (cmd_unit->num_sessions - atomic_read(&cmd_unit->succeeded)); + /* + * Pass the number of failures up to the user. 
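+			 *
+			 * For example, with three pool sessions where one was
+			 * already FAILED at submit time (failed_state = 1) and
+			 * one of the two issued commands completed with an
+			 * error (num_sessions = 2, succeeded = 1), the user
+			 * sees total_failed = 1 + (2 - 1) = 2.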
+ */
+			(*clt_conf)(clt_priv, total_failed);
+		}
+
+		rmr_clt_put_cu(cmd_unit);
+	}
+
+	rmr_put_sess_iu(sess_iu->pool_sess, sess_iu);
+}
+
+/* The amount of data that belongs to an I/O and the amount of data that
+ * should be read or written to the disk (bi_size) can differ.
+ *
+ * E.g. When WRITE_SAME is used, only a small amount of data is
+ * transferred that is then written repeatedly over a lot of sectors.
+ *
+ * Get the size of data to be transferred via RTRS by summing up the size
+ * of the scatter-gather list entries.
+ */
+static size_t rmr_clt_get_sg_size(struct scatterlist *sglist, u32 len)
+{
+	struct scatterlist *sg;
+	size_t tsize = 0;
+	int i;
+
+	for_each_sg(sglist, sg, len, i)
+		tsize += sg->length;
+	return tsize;
+}
+
+/**
+ * rmr_clt_request() - Request data transfer to/from storage node via given pool
+
+ * @pool:	The pool
+ * @iu:		IU allocated by a previous rmr_clt_get_iu() call.
+ * @offset:	offset inside the object to read/write
+ * @length:	length of data starting from offset
+ * @flag:	READ/WRITE/REMOVE
+ * @prio:	priority of IO
+ * @priv:	User provided data, passed back with the corresponding
+ *		@conf confirmation.
+ * @conf:	callback function to be called as confirmation
+ * @sg:		Pages to be sent/received to/from server.
+ * @sg_cnt:	Number of elements in the @sg
+
+ * Description:
+ * Data transfer through the given pool, using the underlying RTRS <-> RDMA transport.
+ * While sending write IOs, if there are FAILED or RECONNECTING pool sessions, the IO
+ * is recorded as dirty for such sessions.
+ * This is used both by the pserver client and by the rmr server on the storage node to
+ * perform sync reads.
+
+ * Return:
+ * 0 on success. This means the IO was sent; the final confirmation is delivered via the
+ * @conf function.
+ * Negative error value on failure.
+ */
+int rmr_clt_request(struct rmr_pool *pool, struct rmr_iu *iu,
+		size_t offset, size_t length, enum rmr_io_flags flag, unsigned short prio,
+		void *priv, rmr_conf_fn *conf, struct scatterlist *sg, unsigned int sg_cnt)
+{
+	struct rmr_clt_pool_sess *pool_sess;
+	struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu;
+	struct rtrs_clt_req_ops req_ops;
+	rmr_id_t id;
+	struct kvec vec;
+	size_t sg_len;
+	int dir, err, idx;
+	u32 rmr_flag;
+
+	rmr_get_iu(iu);
+	rmr_flag = rmr_op(flag);
+	dir = (rmr_flag == RMR_OP_READ) ? READ : WRITE;
+
+	sg_len = rmr_clt_get_sg_size(sg, sg_cnt);
+	if (!(flag & RMR_OP_DISCARD || flag & RMR_OP_WRITE_ZEROES))
+		WARN_ON(length != sg_len);
+
+	iu->msg.hdr.group_id = cpu_to_le32(pool->group_id);
+	iu->msg.hdr.type = cpu_to_le16(RMR_MSG_IO);
+	iu->msg.hdr.__padding = 0;
+
+	iu->msg.offset = cpu_to_le32(offset);
+	iu->msg.length = cpu_to_le32(length);
+	iu->msg.flags = cpu_to_le32(flag);
+	iu->msg.prio = cpu_to_le16(prio);
+
+	iu->msg.sync = pool->sync;
+
+	iu->priv = priv;
+	iu->conf = conf;
+	iu->pool = pool;
+
+	if (rmr_flag != RMR_OP_FLUSH && sg_len) {
+		rmr_map_calc_chunk(pool, offset, length, &id);
+		/*
+		 * We are not ready to process IO requests which cross a chunk boundary.
+		 * The main area which needs work is triggering sync IO (see rmr-req.c) while
+		 * holding the IO which touches multiple chunks, and then making sure other IOs
+		 * which overlap these chunks are held properly and restarted once the corresponding
+		 * chunk is synced.
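+		 * The BUG_ON() below asserts this single-chunk assumption for
+		 * now.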
+ */ + BUG_ON(id.a > 1); + iu->msg.id_a = cpu_to_le64(id.a); + iu->msg.id_b = cpu_to_le64(id.b); + } + + if (rmr_flag == RMR_OP_READ) { + iu->sg = sg; + iu->sg_cnt = sg_cnt; + } else if (!pool->sync && rmr_flag == RMR_OP_WRITE) { + /* + * We take this path only for request from client side + * Never from rmr_req_remote_read. + */ + int failed_cnt = 0; + int i; + + atomic_set(&iu->succeeded, 0); + idx = srcu_read_lock(&pool->sess_list_srcu); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + struct rmr_clt_pool_sess *ps; + enum rmr_clt_pool_sess_state state; + u8 mid = pool->pool_md.srv_md[i].member_id; + + if (!mid) + continue; + + ps = xa_load(&pool->stg_members, mid); + if (ps) { + state = atomic_read(&ps->state); + if (state != RMR_CLT_POOL_SESS_FAILED && + state != RMR_CLT_POOL_SESS_RECONNECTING) + continue; + } + /* ps == NULL (disassembled) or FAILED/RECONNECTING */ + if (WARN_ON(failed_cnt >= RMR_POOL_MAX_SESS)) + break; + iu->msg.map_ver = cpu_to_le64(pool->map_ver); + iu->msg.failed_id[failed_cnt] = mid; + failed_cnt++; + rmr_clt_map_add_id(pool, mid, id); + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + iu->msg.failed_cnt = failed_cnt; + } else if (pool->sync) { + pr_err("rmr_clt_request: Sync sessions do not process writes\n"); + return -EPERM; + } + + vec = (struct kvec) { + .iov_base = &iu->msg, + .iov_len = sizeof(iu->msg) + }; + + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(iu->sess_list), entry) { + struct rmr_clt_sess *clt_sess; + + pool_sess = sess_iu->pool_sess; + clt_sess = pool_sess->clt_sess; + iu->msg.member_id = pool_sess->member_id; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_REMOVING || + pool_sess->maintenance_mode) { + /* + * The storage for this session is getting removed from + * the pool, or is in maintenance mode. + * Simply complete this IO with error + */ + err = -EAGAIN; + goto complete_io; + } + + pr_debug("Sending %x request to pool %s session %s " + "chunk (%llu, %llu) offset %lu length %lu)\n", + rmr_flag, + pool->poolname, pool_sess->sessname, + id.a, id.b, offset, length); + + if (rmr_flag == RMR_OP_READ) { + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_read_conf, + }; + } else { + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_io_conf, + }; + + /* + * Update mem_id before transmitting each write IO to the corresponding + * server. + */ + iu->msg.mem_id = cpu_to_le32(sess_iu->mem_id); + } + + trace_rmr_clt_request(dir, sess_iu); + + err = rtrs_clt_request(dir, &req_ops, clt_sess->rtrs, + sess_iu->permit, &vec, 1, sg_len, + sg, sg_cnt); + +complete_io: + if (err) { + if (rmr_flag == RMR_OP_READ) + msg_read_conf(sess_iu, err); + else + msg_io_conf(sess_iu, err); + } + } + rmr_put_iu(iu); + + return 0; +} +EXPORT_SYMBOL(rmr_clt_request); + +/** + * rmr_clt_get_cu() - Allocate and return a command unit. + * + * @pool: rmr pool for which the command unit is to be allocated + * + * Description: + * Allocates and returns a command unit for the rmr pool. The command unit contains a list of + * session units, for each session which is not in the "REMOVING" state. 
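+ * Sessions in the FAILED state do not get a session unit either; they are
+ * only counted in the unit's failed_state field.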
+ *
+ * Return:
+ * Pointer to the command unit
+ */
+static struct rmr_clt_cmd_unit *rmr_clt_get_cu(struct rmr_pool *pool)
+{
+	struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv;
+	struct rmr_clt_pool_sess *pool_sess;
+	struct rmr_clt_cmd_unit *cmd_unit;
+	struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu;
+	int idx;
+
+	if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state)) {
+		pr_err("%s: Pool %s not in use\n", __func__, pool->poolname);
+		rmr_clt_dump_state(clt_pool);
+		return NULL;
+	}
+
+	/*
+	 * We get the inflight ref first.
+	 * If we see that an IO freeze is in progress, we put the ref and wait for it to unfreeze.
+	 *
+	 * The while loop protects us from a second freeze starting right away, e.g.
+	 * a leg deletion and right after that a call to rmr_clt_close.
+	 *
+	 * We are guaranteed not to loop forever, since rmr_clt_close can be called only
+	 * once, and there are a limited number of legs to delete.
+	 */
+	percpu_ref_get(&pool->ids_inflight_ref);
+	while (atomic_read(&clt_pool->io_freeze) > 0) {
+		percpu_ref_put(&pool->ids_inflight_ref);
+		wait_event(clt_pool->map_update_wq, !atomic_read(&clt_pool->io_freeze));
+
+		/*
+		 * Once IO is unfrozen, we check if the state of the pool has changed.
+		 * It could be that rmr_clt_close was called, and hence the state is not IN_USE.
+		 * Or, it could be that the last leg was deleted, and we are not in JOINED state.
+		 *
+		 * In either case, we cannot service IOs, hence fail.
+		 */
+		if (!test_bit(RMR_CLT_POOL_STATE_IN_USE, &clt_pool->state) ||
+		    !test_bit(RMR_CLT_POOL_STATE_JOINED, &clt_pool->state)) {
+			pr_err("%s: Failed to get inflight IO ref.\n", __func__);
+			pr_err("%s: Pool %s is not joined or used\n", __func__, pool->poolname);
+			rmr_clt_dump_state(clt_pool);
+			return NULL;
+		}
+
+		percpu_ref_get(&pool->ids_inflight_ref);
+	}
+
+	cmd_unit = kzalloc(sizeof(*cmd_unit), GFP_KERNEL);
+	if (!cmd_unit) {
+		percpu_ref_put(&pool->ids_inflight_ref);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&cmd_unit->sess_list);
+	cmd_unit->pool = pool;
+	cmd_unit->clt_pool = clt_pool;
+	atomic_set(&cmd_unit->succeeded, 0);
+
+	idx = srcu_read_lock(&pool->sess_list_srcu);
+	/*
+	 * Acquire permits for all eligible sessions. The command proceeds only
+	 * if a permit is obtained for every session that is not FAILED or
+	 * REMOVING; any failure to get a permit aborts via put_sessions below.
+ */ + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_REMOVING) + continue; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED) { + cmd_unit->failed_state++; + continue; + } + + sess_iu = rmr_get_sess_iu(pool_sess, RTRS_ADMIN_CON, RTRS_PERMIT_NOWAIT); + if (unlikely(!sess_iu)) + goto put_sessions; + + sess_iu->rmr_cmd_unit = cmd_unit; + + cmd_unit->num_sessions++; + list_add_tail(&(sess_iu->entry), (&cmd_unit->sess_list)); + } + srcu_read_unlock(&pool->sess_list_srcu, idx); + refcount_set(&cmd_unit->refcount, cmd_unit->num_sessions); + + return cmd_unit; + +put_sessions: + srcu_read_unlock(&pool->sess_list_srcu, idx); + + /* Free sess_ius */ + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(cmd_unit->sess_list), entry) { + if (!list_empty(&sess_iu->entry)) + list_del_init(&sess_iu->entry); + rmr_put_sess_iu(sess_iu->pool_sess, sess_iu); + } + + rmr_clt_put_cu(cmd_unit); + + return NULL; +} + +/** + * rmr_clt_cmd_err_conf() - Calls confirmation function for commands + * + * @work: schedules work + * + * Description: + * In case of error in the user command path, we cannot call the confirmation function + * directly, since it might end up calling confirmation function of the user itself. + * Hence a work is scheduled to call the confirmation function in case the code for sending + * user commands itself fails. + */ +static void rmr_clt_cmd_err_conf(struct work_struct *work) +{ + struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work); + + msg_cmd_conf(sess_iu, sess_iu->errno); +} + +/** + * rmr_clt_cmd_with_rsp() - Sends a user command to all sessions of an rmr pool + * + * @pool: rmr pool to which the command is for + * @conf: confirmation function to be called after completion + * @priv: pointer to priv data, to be returned to user while calling conf function + * @usr_vec: kvec containing user data (mostly command messages?) + * @nr: number of kvecs + * @buf: buf where the response from the user server is to be directed + * The buf must be physically contiguous in memory (kmalloc()'d). + * @buf_len: length of the buffer + * @size: size of the buf to be sent to a single session + * + * Description: + * This function provides an interface for the user to send commands to the server side. + * The command is sent as a read, so that the response from the user srv side can be received + * The buffer sent by the user is meant to receive the response from the user server side. + * The size of the buffer is set during rmr_clt_open. + * + * Return: + * 0 on success + * negative errno in case of error + * + * Context: + * Inflight commands will block map update, until the inflights are completed. + */ +int rmr_clt_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv, + const struct kvec *usr_vec, size_t nr, void *buf, int buf_len, size_t size) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu; + struct rmr_clt_cmd_unit *cmd_unit; + struct rmr_msg_pool_cmd msg = {}; + struct rtrs_clt_req_ops req_ops; + struct kvec *vec; + int i, j, err = 0; + + /* + * TODO: kvmalloc() memory is yet to be supported for SG I/O. 
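+	 *
+	 * The response buffer is split up with sg_init_one() below, so it must
+	 * be a physically contiguous kmalloc()'d allocation.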
+ */ + if (is_vmalloc_addr(buf)) + return -EINVAL; + + if (buf_len != (RMR_POOL_MAX_SESS * size)) + return -EINVAL; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_USER; + + /* + * RMR msg struct + user vecs + */ + vec = kzalloc((1 + nr) * sizeof(*vec), GFP_KERNEL); + if (!vec) + return -ENOMEM; + + /* + * RMR msg struct first, + * followed by the user kvecs + */ + vec[0].iov_base = &msg; + vec[0].iov_len = sizeof(msg); + for (i = 1, j = 0; j < nr; i++, j++) { + vec[i].iov_base = usr_vec[j].iov_base; + vec[i].iov_len = usr_vec[j].iov_len; + + msg.user_cmd.usr_len += usr_vec[j].iov_len; + } + + cmd_unit = rmr_clt_get_cu(pool); + if (!cmd_unit) { + err = -ENOMEM; + goto out; + } + + cmd_unit->conf = conf; + cmd_unit->priv = priv; + + i = 0; + list_for_each_entry_safe(sess_iu, tmp_sess_iu, + &(cmd_unit->sess_list), entry) { + pool_sess = sess_iu->pool_sess; + + req_ops = (struct rtrs_clt_req_ops){ + .priv = sess_iu, + .conf_fn = msg_cmd_conf, + }; + + /* + * The user expects each node to be able to send back data of this "size" as + * response. + * So divide the user buffer into chunks of "size", and send them to each leg. + */ + sg_init_one(&sess_iu->sg, buf + (i * size), size); + + trace_rmr_clt_cmd_with_rsp(READ, sess_iu); + + err = rtrs_clt_request(READ, &req_ops, pool_sess->clt_sess->rtrs, sess_iu->permit, + vec, (1 + nr), size, &sess_iu->sg, 1); + if (err) { + /* + * We want to deal with this error just like we deal with the error + * received from the conf function returned from rtrs. + * This would help us to inform the user the correct number of commands + * which failed on the rmr level (rtrs is also rmr level for user). + */ + pr_warn("rtrs_clt_request Failed with err %d\n", err); + sess_iu->errno = err; + INIT_WORK(&sess_iu->work, rmr_clt_cmd_err_conf); + schedule_work(&sess_iu->work); + err = 0; + } + + i++; + } + + /* + * No session to send command + */ + if (i == 0) { + rmr_clt_put_cu(cmd_unit); + err = -EINVAL; + } + +out: + kfree(vec); + + return err; +} +EXPORT_SYMBOL(rmr_clt_cmd_with_rsp); + +/** + * rmr_clt_send_cmd_with_data() - send command containing data buffer as a payload or response + * + * @pool: rmr pool to send command + * @pool_sess: client pool session used to send + * @msg: initialized command message describing the command + * @buf: pointer to the data buffer for data transfers + * @buflen: size of the buffer in bytes + * + * Description: + * Performs sending the command described by msg with a payload or response + * in the buf. + * + * Return: + * 0 on success, error code otherwise. + * + * Context: + * This function blocks while sending the buffer. + * + * Locks: + * should be called under srcu_read_lock since it uses pool_sess + */ +int rmr_clt_send_cmd_with_data(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd *msg, + void *buf, unsigned int buflen) +{ + struct rmr_clt_sess_iu *sess_iu; + struct rmr_clt_sess *clt_sess = pool_sess->clt_sess; + struct kvec vec = { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + int errno = 0, err = 0; + int dir; + + switch (msg->cmd_type) { + case RMR_CMD_MAP_CHECK: + case RMR_CMD_READ_MAP_BUF: + case RMR_CMD_MAP_GET_VER: + case RMR_CMD_MD_SEND: + case RMR_CMD_MAP_SET_VER: + dir = READ; + break; + case RMR_CMD_MAP_TEST: + case RMR_CMD_SEND_MAP_BUF: + case RMR_CMD_SEND_MD_BUF: + dir = WRITE; + break; + default: + pr_err("%s: pool %s cmd type %u is not supported\n", + __func__, pool->poolname, msg->cmd_type); + return -EINVAL; + } + + // TODO: why io_con not admin? 
+	if (clt_sess->state == RMR_CLT_SESS_DISCONNECTED) {
+		pr_debug("The rmr client session %s state is disconnected\n", clt_sess->sessname);
+		err = -EINVAL;
+		goto err;
+	}
+
+	sess_iu = rmr_msg_get_iu(pool_sess, RTRS_IO_CON, RTRS_PERMIT_WAIT, 2);
+	if (unlikely(!sess_iu)) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	sess_iu->buf = buf;
+	sg_init_one(&sess_iu->sg, buf, buflen);
+
+	err = send_usr_msg(clt_sess->rtrs, dir, sess_iu,
+			   &vec, 1, buflen, &sess_iu->sg, 1,
+			   msg_pool_cmd_map_content_conf, &errno, WAIT);
+	if (unlikely(err)) {
+		rmr_msg_put_iu(pool_sess, sess_iu);
+	} else {
+		err = errno;
+	}
+
+	rmr_msg_put_iu(pool_sess, sess_iu);
+
+err:
+	return err;
+}
+
+/**
+ * rmr_clt_pool_member_synced() - check if the pool member has no data to sync
+
+ * @pool: rmr pool in which we perform the check
+ * @member_id: id of the pool member to check
+
+ * Description:
+ * Send the check map command to the pool member with the specified id.
+ * The pool member returns whether it has unsynced chunks or not.
+
+ * Return:
+ * error code if failed to send, 0 if pool member is not synced completely,
+ * 1 if pool member is synced (has no dirty chunks in its map).
+
+ * Context:
+ * This function blocks while sending the command.
+
+ * Locks:
+ * none
+ */
+int rmr_clt_pool_member_synced(struct rmr_pool *pool, u8 member_id)
+{
+	struct rmr_clt_pool_sess *pool_sess;
+	struct rmr_msg_pool_cmd_rsp rsp = {};
+	struct rmr_msg_pool_cmd msg = {};
+	int ret = 0, idx;
+	enum rmr_clt_pool_sess_state state;
+
+	pr_debug("start looking for session with member_id=%u\n", member_id);
+	idx = srcu_read_lock(&pool->sess_list_srcu);
+
+	pool_sess = __find_sess_by_member_id(pool, member_id);
+	if (!pool_sess) {
+		pr_err("in pool %s failed to find sess with a member_id=%u\n",
+		       pool->poolname, member_id);
+		ret = -ENOENT;
+		goto out;
+	}
+
+	pr_debug("found session %s with member_id=%u\n",
+		 pool_sess->sessname, member_id);
+
+	state = atomic_read(&pool_sess->state);
+	if (state == RMR_CLT_POOL_SESS_FAILED ||
+	    state == RMR_CLT_POOL_SESS_REMOVING) {
+		pr_debug("pool %s session %s is in %s state, cannot send cmd %s\n",
+			 pool->poolname, pool_sess->sessname,
+			 rmr_clt_sess_state_str(state), rmr_get_cmd_name(msg.cmd_type));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rmr_clt_init_cmd(pool, &msg);
+	msg.cmd_type = RMR_CMD_MAP_CHECK;
+
+	pr_debug("send cmd %u to %s\n", msg.cmd_type, pool_sess->sessname);
+	ret = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, &rsp, sizeof(rsp));
+	if (ret) {
+		pr_err("%s: For pool %s failed to %s, err %d\n",
+		       __func__, pool->poolname, rmr_get_cmd_name(msg.cmd_type), ret);
+		goto out;
+	}
+
+	if (rsp.value)
+		ret = 1; // other side reported map is clear
+
+	pr_debug("send cmd %u to %s is done\n", msg.cmd_type, pool_sess->sessname);
+out:
+	srcu_read_unlock(&pool->sess_list_srcu, idx);
+
+	return ret;
+}
+EXPORT_SYMBOL(rmr_clt_pool_member_synced);
+
+/**
+ * rmr_clt_md_to_buf() - Fill the buffer with the metadata
+
+ * @pool: rmr pool containing the metadata. It must be a non-sync pool,
+ *        either a client or a server pool.
+ * @buf: buffer to fill with the metadata.
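+ *
+ * Layout: for a client pool the whole struct rmr_pool_md is copied to the
+ * start of @buf; for a server pool only srv_md[0] is copied, at offset
+ * RMR_CLT_MD_SIZE.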
+ * + */ +static void rmr_clt_md_to_buf(struct rmr_pool *pool, u8 *buf) +{ + struct rmr_pool_md *pool_md; + struct rmr_srv_md *srv_md; + + if (pool->is_clt) { + pool_md = (struct rmr_pool_md *)buf; + /* copy the entire client pool md */ + memcpy(pool_md, &pool->pool_md, sizeof(struct rmr_pool_md)); + return; + } + + srv_md = (struct rmr_srv_md *)(&buf[RMR_CLT_MD_SIZE]); + memcpy(srv_md, &pool->pool_md.srv_md[0], RMR_SRV_MD_SIZE); +} + +/** + * rmr_clt_pool_send_md_all() - Send metadata of rmr pool + * + * Description: + * Send metadata of the src pool to all sessions of the client pool. + * 1) If the client pool is sync pool, it sends the entire server pool + * metadata back after the leader reads the metadata of its connected + * nodes. + * 2) If it is non-sync, send the client pool metadata to storage node + * backups. + */ +int rmr_clt_pool_send_md_all(struct rmr_pool *src_pool, struct rmr_pool *clt_pool) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + void *buf; + u32 buflen; + int err = 0, idx; + + if (!clt_pool) { + pr_err("Cannot send metadata when clt_pool is NULL\n"); + return -EINVAL; + } + + if (src_pool->sync) { + pr_err("Cannot send metadata when src_pool is sync\n"); + return -EINVAL; + } + + buf = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + buflen = RMR_MD_SIZE; + if (!buf) + return -ENOMEM; + + rmr_clt_md_to_buf(src_pool, buf); + + /* + * It will continue to send the md to the next session even if the previous send failed. + */ + idx = srcu_read_lock(&clt_pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &clt_pool->sess_list, entry, + (srcu_read_lock_held(&clt_pool->sess_list_srcu))) { + pr_debug("Start sending md for pool %s; to session %s with member_id %d\n", + src_pool->poolname, pool_sess->sessname, pool_sess->member_id); + + rmr_clt_init_cmd(clt_pool, &msg); + msg.cmd_type = RMR_CMD_SEND_MD_BUF; + msg.send_md_buf_cmd = (struct rmr_msg_send_md_buf_cmd) { + .sync = clt_pool->sync, + /* the receiver of buffer is the leader */ + .receiver_id = pool_sess->member_id, + /* change flags in cmd message */ + .flags = RMR_OP_MD_WRITE, + }; + + err = rmr_clt_send_cmd_with_data(clt_pool, pool_sess, &msg, buf, buflen); + if (err) { + pr_debug("Cannot send the clt/srv_md of entire pool to the pool sess %s\n", + pool_sess->sessname); + continue; + } + } + + pr_debug("send_md done\n"); + + kfree(buf); + + srcu_read_unlock(&clt_pool->sess_list_srcu, idx); + return err; +} +EXPORT_SYMBOL(rmr_clt_pool_send_md_all); + +static int rmr_clt_start_send_md(struct rmr_pool *pool) +{ + return rmr_clt_pool_send_md_all(pool, pool); +} + +/** + * rmr_clt_del_stor_from_pool() - Notify pool members that a storage node is leaving + * + * @pool_sess: The pool session of the departing storage node. + * @delete: True for a permanent deletion (%RMR_POOL_INFO_MODE_DELETE); + * false for a temporary disassembly (%RMR_POOL_INFO_MODE_DISASSEMBLE). + * + * Sends a POOL_INFO REMOVE message to all other active pool members so they + * can update their dirty maps and membership state accordingly. + * + * Return: + * 0 on success, negative error code on failure. 
+ */ +int rmr_clt_del_stor_from_pool(struct rmr_clt_pool_sess *pool_sess, bool delete) +{ + enum rmr_pool_info_mode mode; + int err; + + if (delete) + mode = RMR_POOL_INFO_MODE_DELETE; + else + mode = RMR_POOL_INFO_MODE_DISASSEMBLE; + + err = rmr_clt_send_pool_info(pool_sess, RMR_POOL_INFO_OP_REMOVE, mode, false); + if (err) { + pr_err("rmr_clt_send_pool_info failed for session\n"); + return err; + } + + return 0; +} + +static int __init rmr_client_init(void) +{ + int err; + + pr_info("Loading module %s, version %s, proto %s\n", KBUILD_MODNAME, + RMR_VER_STRING, RMR_PROTO_VER_STRING); + + err = rmr_clt_create_sysfs_files(); + if (err) { + pr_err("Failed to load module," + " creating sysfs device files failed, err: %d\n", + err); + goto out; + } + + return 0; + +out: + return err; +} + +static void __exit rmr_client_exit(void) +{ + struct rmr_pool *pool, *tmp; + + pr_info("Unloading module\n"); + + list_for_each_entry_safe(pool, tmp, &pool_list, entry) + (void) rmr_clt_remove_pool_from_sysfs(pool, NULL); + + rmr_clt_destroy_sysfs_files(); + pr_info("Module unloaded\n"); +} + +module_init(rmr_client_init); +module_exit(rmr_client_exit); diff --git a/drivers/infiniband/ulp/rmr/rmr-map-mgmt.c b/drivers/infiniband/ulp/rmr/rmr-map-mgmt.c new file mode 100644 index 000000000000..cade5dbf2e20 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-map-mgmt.c @@ -0,0 +1,933 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) — client MAP-exchange management + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include + +#include "rmr-clt.h" +#include "rmr-clt-trace.h" + +void send_map_check(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MAP_CHECK; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + return; + } +} + +void send_store_check(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_STORE_CHECK; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); //am : why wait ? 
+ if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + pr_err("sess %s failed to send store check with err %d\n", + pool_sess->sessname, err); + } +} + +int send_map_get_version(struct rmr_clt_pool_sess *pool_sess, u64 *ver) +{ + struct rmr_msg_pool_cmd_rsp rsp = {}; + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = RMR_CMD_MAP_GET_VER; + + err = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, &rsp, sizeof(rsp)); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + return -EINVAL; + } + + *ver = rsp.value; + + return 0; +} + +int send_discard(struct rmr_clt_pool_sess *pool_sess, u8 cmd_type, u8 member_id) +{ + struct rmr_msg_pool_cmd msg = {}; + struct rmr_pool *pool = pool_sess->pool; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = cmd_type; + msg.send_discard_cmd.member_id = member_id; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For sess %s, %s failed with err %d\n", + __func__, pool_sess->sessname, rmr_get_cmd_name(msg.cmd_type), err); + } + + return err; +} + +int rmr_clt_handle_map_check_rsp(struct rmr_clt_pool_sess *pool_sess, + struct rmr_msg_pool_cmd_rsp *rsp) +{ + struct rmr_pool *pool = pool_sess->pool; + struct rmr_dirty_id_map *map; + + pr_debug("pool %s sess %s member_id %u, rsp->value=%llu\n", + pool->poolname, pool_sess->sessname, rsp->member_id, rsp->value); + if (!rsp->value) // map is not empty on stg + return 0; + + pr_debug("pool %s server with id %u has empty dirty map, lets clean it.\n", + pool->poolname, rsp->member_id); + map = rmr_pool_find_map(pool, rsp->member_id); + if (!map) { + pr_err("%s: pool %s no map found for member_id %u\n", + __func__, pool->poolname, rsp->member_id); + return -EINVAL; + //TODO: handle this, how? 
+	}
+
+	if (!rmr_map_empty(map)) {
+		pr_debug("pool %s dirty map for member_id %d is not empty, map->ts %lu (now %lu)\n",
+			 pool->poolname, rsp->member_id, map->ts, jiffies);
+		if (time_after(jiffies, map->ts + msecs_to_jiffies(RMR_MAP_CLEAN_DELAY_MS))) {
+			pr_info("%s: pool %s clear dirty map for member_id %d\n",
+				__func__, pool->poolname, rsp->member_id);
+			rmr_map_unset_dirty_all(map);
+			map->ts = jiffies;
+		}
+	}
+
+	pr_debug("pool %s map with member_id %u cleaned\n",
+		 pool->poolname, map->member_id);
+	return 0;
+}
+
+int rmr_clt_handle_store_check_rsp(struct rmr_clt_pool_sess *pool_sess,
+				   struct rmr_msg_pool_cmd_rsp *rsp)
+{
+	struct rmr_pool *pool = pool_sess->pool;
+	int err = 0;
+
+	pr_debug("pool %s sess %s member_id %u, rsp->value=%llu\n",
+		 pool->poolname, pool_sess->sessname, rsp->member_id, rsp->value);
+	if (!rsp->value) {
+		pr_debug("pool %s sess %s (state=%d) reported that the store is not available, not changing state\n",
+			 pool->poolname, pool_sess->sessname, atomic_read(&pool_sess->state));
+		return 0;
+	}
+	pr_info("pool %s sess %s (state=%d) reported that the store is available, changing state\n",
+		pool->poolname, pool_sess->sessname, atomic_read(&pool_sess->state));
+
+	pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING);
+
+	if (!pool_sess->maintenance_mode) {
+		err = rmr_clt_pool_try_enable(pool);
+		if (err) {
+			pr_err("%s: pool %s try_enable failed for sess %s: %d\n",
+			       __func__, pool->poolname, pool_sess->sessname, err);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Pre-requisite: rcu read lock should be held by caller
+ */
+static struct rmr_clt_pool_sess *rmr_clt_get_first_reconnecting_session(struct rmr_pool *pool)
+{
+	struct rmr_clt_pool_sess *pool_sess;
+
+	list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry,
+				 (srcu_read_lock_held(&pool->sess_list_srcu))) {
+		if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_RECONNECTING)
+			return pool_sess;
+	}
+
+	return NULL;
+}
+
+/**
+ * rmr_clt_pool_map_xfer() - transfer dirty maps between rmr client and server
+
+ * @pool: the rmr pool used for map transfers
+ * @pool_sess: client pool session that is used for map transfer
+ * @cmd_type: pool command type generated for this transfer, for now only
+ *            RMR_CMD_READ_MAP_BUF, RMR_CMD_SEND_MAP_BUF, RMR_CMD_MAP_TEST are used
+ * @buf: pointer to the data buffer for data transfers
+ * @buflen: size of the buffer in bytes
+ * @map_idx: index of the map in the dirty map array from which we start to send or
+ *           receive the data
+ * @slp_idx: key in the map from which we start to send/receive the data about the maps
+
+ * Description:
+ * Performs transfer of the information about the dirty maps, starting from the map at
+ * position map_idx in the array of dirty maps and from the key slp_idx within that map.
+ * cmd types are handled as follows:
+ * RMR_CMD_READ_MAP_BUF - read the information about the maps from the pool and fill buf
+ * RMR_CMD_SEND_MAP_BUF - send buf with filled data to the pool
+ * RMR_CMD_MAP_TEST - send the buf with data to the pool to perform map comparison
+
+ * Return:
+ * 0 on success, error code otherwise.
+
+ * Context:
+ * This function blocks while sending the buffer.
+ * + * Locks: + * should be called under srcu_read_lock since it uses pool_sess + */ +static int rmr_clt_pool_map_xfer(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess, + int cmd_type, void *buf, unsigned int buflen, + u8 map_idx, u64 slp_idx) +{ + struct rmr_msg_pool_cmd msg = {}; + int err; + + rmr_clt_init_cmd(pool, &msg); + msg.cmd_type = cmd_type; + + msg.map_buf_cmd.map_idx = map_idx; + msg.map_buf_cmd.slp_idx = slp_idx; + + err = rmr_clt_send_cmd_with_data(pool, pool_sess, &msg, buf, buflen); + if (err) { + pr_debug("pool %s failed to send map xfer cmd %u, err %d\n", + pool->poolname, cmd_type, err); + return err; + } + + return 0; +} + +int rmr_clt_read_map(struct rmr_pool *pool) +{ + struct rmr_clt_pool_sess *pool_sess = NULL; + struct rmr_map_buf_hdr *map_buf_hdr; + u8 map_idx = 0; + u64 slp_idx = 0; + int err = 0, buflen, idx; + void *buf; + + idx = srcu_read_lock(&pool->sess_list_srcu); + + pool_sess = rmr_clt_get_first_reconnecting_session(pool); + if (pool_sess == NULL) { + srcu_read_unlock(&pool->sess_list_srcu, idx); + pr_err("%s: No created session found\n", __func__); + return -EINVAL; + } + + buflen = RTRS_IO_LIMIT; + buf = kzalloc(buflen, GFP_KERNEL); + if (!buf) { + pr_err("%s: Error allocating buffer\n", __func__); + err = -ENOMEM; + goto ret; + } + + while (true) { + err = rmr_clt_pool_map_xfer(pool, pool_sess, RMR_CMD_READ_MAP_BUF, + buf, buflen, map_idx, slp_idx); + if (err) { + pr_debug("rmr_clt_pool_map_xfer failed for pool %s, err %d\n", + pool->poolname, err); + goto ret_free; + } + + map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + if (map_buf_hdr->member_id == 0) + break; + + err = rmr_pool_save_map(pool, buf, buflen, false); + if (err) { + pr_err("rmr_pool_save_map failed\n"); + goto ret_free; + } + + map_idx = map_buf_hdr->map_idx; + slp_idx = map_buf_hdr->slp_idx; + } + +ret_free: + kfree(buf); + +ret: + srcu_read_unlock(&pool->sess_list_srcu, idx); + + return err; +} + +/** + * rmr_clt_spread_map() - Spread the map contained in storage node connected by pool_sess_chosen + * + * @pool: The pool + * @pool_sess_chosen: pool session from where the map is to be updated from + * @enable: Whether the last MAP_DONE command should have the enable param set or not + * @skip_normal: If true, freeze IOs before spreading and silently skip any NORMAL + * sessions encountered in the loop (used in Case 1 recovery where + * pool_sess_chosen is itself a NORMAL session that is still serving IOs). + * If false, encountering a NORMAL session is treated as an error. + * + * Description: + * This function spreads the map contained in the storage node connected by given pool + * session. The param enable denotes whether the map update should result in the storage + * going to NORMAL state or not. This is controlled by the enable param in the last MAP_DONE + * message. + * + * Return: + * 0 on success + * Error value on failure + * + * Context: + * srcu_read_lock should be held while calling this function. + */ +int rmr_clt_spread_map(struct rmr_pool *pool, struct rmr_clt_pool_sess *pool_sess_chosen, + bool enable, bool skip_normal) +{ + struct rmr_clt_pool *clt_pool = (struct rmr_clt_pool *)pool->priv; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + int state, err = 0; + + rmr_clt_init_cmd(pool, &msg); + + /* + * If we expect NORMAL session, then we should expect IOs running. + * Which is why we should freeze IOs before doing map_update. 
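+	 *
+	 * The matching rmr_clt_pool_io_unfreeze() is called on both the success
+	 * path and the error paths below.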
+ */ + if (skip_normal) { + /* Freeze IOs */ + rmr_clt_pool_io_freeze(clt_pool); + + /* Wait for all completion */ + rmr_clt_pool_io_wait_complete(clt_pool); + } + + /* + * TODO: Use rmr_clt_handle_discard to check whether the pool + * session has pending discard request to be sent. + * + * Enable this when we fix replace. + * + err = rmr_clt_handle_discard(pool); + if (err) { + pr_err("%s: discard handling failed\n", __func__); + goto err; + } + */ + + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (pool_sess == pool_sess_chosen) + continue; + + state = atomic_read(&pool_sess->state); + if (state == RMR_CLT_POOL_SESS_NORMAL) { + if (skip_normal) + continue; + pr_err("%s: pool %s unexpected NORMAL session %s during spread\n", + __func__, pool->poolname, pool_sess->sessname); + err = -EINVAL; + goto err_out; + } + + if (state != RMR_CLT_POOL_SESS_RECONNECTING || + pool_sess->maintenance_mode) + continue; + + msg.cmd_type = RMR_CMD_MAP_READY; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto err_dis; + } + + msg.cmd_type = RMR_CMD_MAP_SEND; + msg.map_send_cmd.receiver_member_id = pool_sess->member_id; + err = rmr_clt_pool_send_cmd(pool_sess_chosen, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto err_dis; + } + + msg.cmd_type = RMR_CMD_MAP_DONE; + msg.map_done_cmd.enable = enable; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: %s failed\n", __func__, rmr_get_cmd_name(msg.cmd_type)); + goto err_dis; + } + } + + /* Unfreeze IOs and wake up */ + if (skip_normal) + rmr_clt_pool_io_unfreeze(clt_pool); + + return 0; + +err_dis: + list_for_each_entry_srcu(pool_sess, &pool->sess_list, entry, + (srcu_read_lock_held(&pool->sess_list_srcu))) { + if (pool_sess == pool_sess_chosen) + continue; + + if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL) { + if (skip_normal) + continue; + pr_err("%s: pool %s unexpected NORMAL session %s during spread\n", + __func__, pool->poolname, pool_sess->sessname); + } + + msg.cmd_type = RMR_CMD_MAP_DISABLE; + rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + } + +err_out: + /* Unfreeze IOs and wake up */ + if (skip_normal) + rmr_clt_pool_io_unfreeze(clt_pool); + + return err; +} + +/** + * rmr_clt_set_pool_sess_mm() - Set the rmr clt pool session to maintenance mode + * + * @pool_sess: The rmr clt pool session to set in maintenance mode + * + * Description: + * This function does the necessary work required, like setting the pool session to + * maintenance mode and updating the state. + * It then also communicates this state change to the corresponding storage node. + * + * Return: + * 0 on success + * Error value on failure + */ +int rmr_clt_set_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess) +{ + struct rmr_pool *pool = pool_sess->pool; + int err; + + pr_info("%s: Putting sess %s of pool %s in maintenance mode\n", + __func__, pool_sess->sessname, pool->poolname); + + if (pool_sess->maintenance_mode) + goto send_message; + + /* + * If the pool_sess is to be put in maintenance mode, + * update relevant states and params, Then send message to storage node. + * + * We do not need any kind of locking for this, because of the way IO units (IU) are + * allocated & sent. The mm mode update & the state change can happen at multiple places. 
+ *
+ * 1) If the state changes before the pool_sess is picked up into the IU, then we are safe
+ * 2) If the state changes after the pool_sess is picked up into the IU, but before
+ * rmr_clt_request, the IO will be failed in rmr_clt_request.
+ * 3) If the state changes after rmr_clt_request, the IO would be sent to the storage node
+ * for that pool_sess. Then we have two cases:
+ * a) The message for maintenance_mode is received by the storage node before the IO,
+ * then the storage node will fail the IO. The failure is then handled by the client.
+ * b) The message for maintenance_mode is received by the storage node after the IO,
+ * then the storage node will process the IO and return success to the client. In this case
+ * we are also fine, since the IO got processed successfully.
+ */
+ pool->map_ver++;
+ pool_sess->maintenance_mode = true;
+ pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_RECONNECTING);
+
+send_message:
+ err = send_msg_enable_pool(pool_sess, 0);
+ if (err) {
+ pr_err("%s: send_msg_enable_pool failed for pool %s. Err %d\n",
+ __func__, pool->poolname, err);
+ }
+
+ return err;
+}
+
+/**
+ * rmr_clt_unset_pool_sess_mm() - Clear the rmr clt pool session's maintenance mode
+ *
+ * @pool_sess: The rmr clt pool session to clear maintenance mode of
+ *
+ * Description:
+ * This function clears the maintenance mode of the given rmr clt pool session.
+ * It also does the map_update, which essentially brings the pool_session and its
+ * corresponding storage node to the NORMAL state.
+ *
+ * Return:
+ * 0 on success
+ * Error value on failure
+ */
+int rmr_clt_unset_pool_sess_mm(struct rmr_clt_pool_sess *pool_sess)
+{
+ struct rmr_pool *pool = pool_sess->pool;
+ int err;
+
+ pr_info("%s: Taking sess %s of pool %s out of maintenance mode\n",
+ __func__, pool_sess->sessname, pool->poolname);
+
+ /*
+ * Cannot be in NORMAL or CREATED state while in maintenance mode.
+ */
+ WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_NORMAL);
+ WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED);
+
+ /*
+ * If this pool_sess is getting removed, fail the unset of maintenance mode
+ */
+ if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_REMOVING)
+ return -EINVAL;
+
+ /*
+ * First unset maintenance mode on the storage node
+ */
+ err = send_msg_enable_pool(pool_sess, 1);
+ if (err) {
+ pr_err("Failed to send enable to pool %s. Err %d\n",
+ pool->poolname, err);
+ return -EINVAL;
+ }
+
+ /* Now clear maintenance mode locally */
+ pool_sess->maintenance_mode = false;
+
+ /*
+ * For the FAILED state, further action happens when the session goes to RECONNECTING state
+ */
+ if (atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_FAILED)
+ return 0;
+
+ /*
+ * Since we are in RECONNECTING state, we do the map update here.
+ */
+ err = rmr_clt_pool_try_enable(pool);
+ if (err) {
+ pr_err("%s: pool %s try_enable failed for sess %s: %d\n",
+ __func__, pool->poolname, pool_sess->sessname, err);
+ return err;
+ }
+
+ return 0;
+}
+
+void msg_pool_cmd_map_content_conf(struct work_struct *work)
+{
+ struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work);
+ struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess;
+
+ pr_debug("%s: session %s conf with errno %d\n",
+ __func__, pool_sess->sessname, sess_iu->errno);
+
+ wake_up_iu_comp(sess_iu);
+ rmr_msg_put_iu(pool_sess, sess_iu);
+}
+
+static void send_map_update_done(struct work_struct *work)
+{
+ struct rmr_clt_sess_iu *sess_iu = container_of(work, struct rmr_clt_sess_iu, work);
+ struct rmr_iu *iu = sess_iu->rmr_iu;
+ struct rmr_clt_pool_sess *pool_sess = sess_iu->pool_sess;
+ int errno = sess_iu->errno;
+
+ pr_debug("%s: Session %s, err %d, iu %p\n",
+ __func__, pool_sess->sessname, errno, iu);
+ WARN_ON(atomic_read(&pool_sess->state) == RMR_CLT_POOL_SESS_CREATED);
+
+ /*
+ * iu->errno is left set from the original IO failure.
+ * If even one map_add succeeds, we clear iu->errno and the
+ * main IO succeeds. map_adds that fail simply trigger a
+ * session state change to FAILED.
+ */
+ if (!errno) {
+ iu->errno = 0;
+ } else {
+ pr_err_ratelimited("%s: for sess %s got errno: %d\n",
+ __func__, pool_sess->sessname, errno);
+
+ if (iu->errno)
+ /* only the last error is reported */
+ iu->errno = errno;
+ pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_FAILED);
+ }
+
+ pr_debug("%s: Before dec and test iu %p refcnt=%d\n",
+ __func__, iu, refcount_read(&iu->refcount));
+
+ if (refcount_dec_and_test(&iu->refcount)) {
+ rmr_conf_fn *conf = iu->conf;
+
+ pr_debug("all maps updated, call conf %p with errno %d\n",
+ conf, errno);
+ (*conf)(iu->priv, iu->errno);
+ }
+}
+
+/**
+ * rmr_clt_send_map_update() - Send map update to all connected storage nodes
+ *
+ * @pool: The client pool to whose sessions the update is to be sent
+ * @iu: The IO unit containing the information for the update
+ *
+ * Description:
+ * Send the map update using the underlying RTRS <-> RDMA transport.
+ * Currently we use the same rmr_iu as the IO, since it saves us time.
+ * When an IO fails, and a MAP_ADD is to be sent, the code reuses the
+ * same rmr_iu used for the IO. This way we do not spend time acquiring
+ * and initializing another rmr_iu.
+ *
+ * A map update currently can either be a MAP_ADD or a MAP_CLEAR.
+ * The caller must make sure the basic and required information for both
+ * the above commands is updated in the rmr_iu.
+ * Basic being the pool group_id, msg hdr type, etc.
+ * Required being the following:
+ * MAP_ADD requires the rmr_id_t chunk numbers, failed_id array and failed_cnt
+ * MAP_CLEAR requires the rmr_id_t and the member_id
+ *
+ * Return:
+ * 0 on success. This means the map_update was sent successfully.
+ * The subsequent status (err or not) goes to the iu->conf call,
+ * so the caller should check that too.
+ *
+ * Error value on failure. When this function returns an error,
+ * be aware that iu->conf will not be called.
+ */
+int rmr_clt_send_map_update(struct rmr_pool *pool, struct rmr_iu *iu)
+{
+ struct rmr_clt_pool_sess *pool_sess;
+ struct rmr_clt_sess_iu *sess_iu, *tmp_sess_iu;
+ struct rtrs_clt_req_ops req_ops;
+ struct kvec vec;
+ int err;
+
+ pr_debug("%s: rmr_id (%llu, %llu), msg %d, refcnt=%d\n", __func__,
+ iu->msg.id_a, iu->msg.id_b, iu->msg.hdr.type, refcount_read(&iu->refcount));
+
+ if (!pool) {
+ pr_err("Cannot send map update. 
pool is NULL\n"); + return -EINVAL; + } + + rmr_get_iu(iu); + + vec = (struct kvec){ + .iov_base = &iu->msg, + .iov_len = sizeof(iu->msg) + }; + + list_for_each_entry_safe(sess_iu, tmp_sess_iu, &(iu->sess_list), entry) { + struct rmr_clt_sess *clt_sess; + enum rmr_clt_pool_sess_state state; + + pool_sess = sess_iu->pool_sess; + clt_sess = pool_sess->clt_sess; + + INIT_WORK(&sess_iu->work, send_map_update_done); + + req_ops = (struct rtrs_clt_req_ops) { + .priv = sess_iu, + .conf_fn = msg_conf, + }; + + state = atomic_read(&pool_sess->state); + if (state == RMR_CLT_POOL_SESS_FAILED || + state == RMR_CLT_POOL_SESS_REMOVING) { + /* + * Sessions in failed state is probably the reason why we sending + * map add in the first place. + * We can skip those sessions, since map update will take care of this. + */ + pr_debug("%s: skipped sess %s\n", __func__, sess_iu->pool_sess->sessname); + sess_iu->errno = -EINVAL; + schedule_work(&sess_iu->work); + continue; + } + + pr_debug("Sending request flags %u to pool %s session %s " + "chunk [%llu, %llu] offset %u length %u)\n", + iu->msg.flags, pool->poolname, pool_sess->sessname, + iu->msg.id_a, iu->msg.id_b, + iu->msg.offset, iu->msg.length); + + trace_send_map_update(WRITE, sess_iu); + + err = rtrs_clt_request(WRITE, &req_ops, clt_sess->rtrs, + sess_iu->permit, &vec, 1, 0, NULL, 0); + + /* we can ignore errno since we called rmr_clt_send_map_update with NO_WAIT */ + if (err) { + sess_iu->errno = err; + + pr_err("%s: Failed with err %d, schedule work\n", + __func__, err); + schedule_work(&sess_iu->work); + } + } + rmr_put_iu(iu); + + /* + * We are handling err through iu->conf + */ + return 0; +} +EXPORT_SYMBOL(rmr_clt_send_map_update); + +int rmr_clt_map_add_id(struct rmr_pool *pool, int stg_id, rmr_id_t id) +{ + struct rmr_dirty_id_map *map; + + map = rmr_pool_find_map(pool, stg_id); + if (!map) { + pr_err("in pool %s cannot find map for member_id %u\n", + pool->poolname, stg_id); + return -EINVAL; + } + + map->ts = jiffies; + rmr_map_set_dirty(map, id, 0); + + pr_debug("pool %s id (%llu, %llu) inserted to the dirty map\n", + pool->poolname, id.a, id.b); + + return 0; +} + +void sched_map_add(struct work_struct *work) +{ + struct rmr_iu *iu = container_of(work, struct rmr_iu, work); + struct rmr_pool *pool = iu->pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess_iu *sess_iu; + rmr_conf_fn *clt_conf = iu->conf; + void *clt_priv = iu->priv; + int failed_cnt = 0, err = 0; + rmr_id_t id; + + pr_debug("scheduled work process for rmr iu %p send map add id (%llu, %llu), poolname %s\n", + iu, iu->msg.id_a, iu->msg.id_b, pool->poolname); + + /* + * For MAP_ADD, we need failed_id, failed_cnt, and rmr_id_t for chunk number. + * + * We reuse the iu which was used for this IO. + * It already has the chunk number, the clt_conf function to be called, + * and other important things. 
+ */ + iu->msg.hdr.type = cpu_to_le16(RMR_MSG_MAP_ADD); + + id.a = le64_to_cpu(iu->msg.id_a); + id.b = le64_to_cpu(iu->msg.id_b); + list_for_each_entry(sess_iu, &(iu->sess_list), entry) { + pool_sess = sess_iu->pool_sess; + + if (sess_iu->errno) { + iu->msg.map_ver = cpu_to_le64(pool->map_ver); + iu->msg.failed_id[failed_cnt] = pool_sess->member_id; + failed_cnt++; + + rmr_clt_map_add_id(pool, pool_sess->member_id, id); + } + } + iu->msg.failed_cnt = failed_cnt; + + err = rmr_clt_send_map_update(pool, iu); + if (err) { + pr_err("error sending map add for id (%llu, %llu), err=%d\n", + iu->msg.id_a, iu->msg.id_b, err); + (*clt_conf)(clt_priv, err); + } +} + +/** + * rmr_clt_send_map() - Send dirty map entries + * + * @map_src_pool: Pool whose map is to be sent + * @clt_pool: Client pool through which the dest session is selected + * @map_send_cmd: Command structure containing the member_id of the target session + * where the map is to be sent. If NULL then send to all of the session + * + * Return: + * 0 on success, err code otherwise. + * + * Description: + * Sends all the dirty entries from the map in "map_src_pool" to the session with + * member_id equal to member_id mentioned in the map_send_cmd. + * The session where to send the map is picked from the clt_pool. If + * map_send_cmd is NULL then send cmd to all of the sessions in clt_pool. + * + * Context: + * This function blocks while sending the map. + */ +int rmr_clt_send_map(struct rmr_pool *map_src_pool, struct rmr_pool *clt_pool, + const struct rmr_msg_map_send_cmd *map_send_cmd, rmr_map_filter filter) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + bool sess_found = false; + void *bitmap_buf; + int err = 0, idx; + + if (!clt_pool) { + pr_err("Cannot send map, when clt_pool is NULL\n"); + return -EINVAL; + } + + bitmap_buf = kzalloc(RTRS_IO_LIMIT, GFP_KERNEL); + if (!bitmap_buf) { + pr_err("%s: pool %s error allocating buffer to send map\n", + __func__, map_src_pool->poolname); + return -ENOMEM; + } + + idx = srcu_read_lock(&clt_pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &clt_pool->sess_list, entry, + (srcu_read_lock_held(&clt_pool->sess_list_srcu))) { + int bytes = 0; + u8 map_idx = 0; + u64 slp_idx = 0; + + /* if we have a command then skip all the sessions that are not in command */ + if (map_send_cmd && pool_sess->member_id != map_send_cmd->receiver_member_id) + continue; + + sess_found = true; + pr_info("Start sending dirty map for pool %s; to session %s with member_id %d\n", + map_src_pool->poolname, pool_sess->sessname, pool_sess->member_id); + + while ((bytes = rmr_pool_maps_to_buf(map_src_pool, &map_idx, &slp_idx, + bitmap_buf, RTRS_IO_LIMIT, filter)) > 0) { + pr_debug("mapped %d bytes to bitmap_buf\n", bytes); + + err = rmr_clt_pool_map_xfer(clt_pool, pool_sess, RMR_CMD_SEND_MAP_BUF, + bitmap_buf, bytes, 0, 0); + if (err) { + pr_err("%s: Failed to send bitmap_buf, from %s to %s err %d\n", + __func__, map_src_pool->poolname, clt_pool->poolname, err); + goto err_free; + } + } + + rmr_clt_init_cmd(map_src_pool, &msg); + msg.cmd_type = RMR_CMD_MAP_BUF_DONE; + msg.map_buf_done_cmd.map_version = map_src_pool->map_ver; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s: For pool %s, %s failed\n", + __func__, map_src_pool->poolname, rmr_get_cmd_name(msg.cmd_type)); + goto err_free; + } + } + + if (map_send_cmd && !sess_found) { + pr_err("pool %s failed to find sess with member_id %u to send map\n", + clt_pool->poolname, 
map_send_cmd->receiver_member_id); + err = -EINVAL; + goto err_free; + } + + pr_info("%s: Sending map done\n", __func__); + +err_free: + kfree(bitmap_buf); + srcu_read_unlock(&clt_pool->sess_list_srcu, idx); + + return err; +} +EXPORT_SYMBOL(rmr_clt_send_map); + +int rmr_clt_test_map(struct rmr_pool *src_pool, struct rmr_pool *dst_pool) +{ + struct rmr_clt_pool_sess *pool_sess; + void *bitmap_buf; + int err, idx; + + pr_info("test maps from src_pool=%s to dst_pool=%s...\n", + src_pool->poolname, dst_pool->poolname); + + bitmap_buf = kzalloc(RTRS_IO_LIMIT, GFP_KERNEL); + if (!bitmap_buf) { + pr_err("%s: Error allocating buffer\n", __func__); + err = -ENOMEM; + goto err; + } + + idx = srcu_read_lock(&dst_pool->sess_list_srcu); + list_for_each_entry_srcu(pool_sess, &dst_pool->sess_list, entry, + (srcu_read_lock_held(&dst_pool->sess_list_srcu))) { + enum rmr_clt_pool_sess_state state; + int bytes = 0; + u8 map_idx = 0; + u64 slp_idx = 0; + + state = atomic_read(&pool_sess->state); + if (state == RMR_CLT_POOL_SESS_CREATED || + state == RMR_CLT_POOL_SESS_FAILED) { + pr_warn("sess %s is in created/failed state, skip map test.\n", + pool_sess->sessname); + continue; + } + pr_info("perform map test for sess %s\n", pool_sess->sessname); + while ((bytes = rmr_pool_maps_to_buf(src_pool, &map_idx, &slp_idx, + bitmap_buf, RTRS_IO_LIMIT, + MAP_NO_FILTER)) > 0) { + pr_debug("mapped %d bytes to bitmap_buf\n", bytes); + + err = rmr_clt_pool_map_xfer(dst_pool, pool_sess, RMR_CMD_MAP_TEST, + bitmap_buf, bytes, 0, 0); + if (err) { + pr_err("%s: For sess %s failed test map, src_pool %s dst_pool %s err %d\n", + __func__, pool_sess->sessname, src_pool->poolname, + dst_pool->poolname, err); + srcu_read_unlock(&dst_pool->sess_list_srcu, idx); + goto err_free; + } + } + pr_info("sess %s map test done\n", pool_sess->sessname); + } + srcu_read_unlock(&dst_pool->sess_list_srcu, idx); + +err_free: + kfree(bitmap_buf); +err: + pr_info("test maps from src_pool=%s to dst_pool=%s done, err %d\n", + src_pool->poolname, dst_pool->poolname, err); + + return err; +} +EXPORT_SYMBOL(rmr_clt_test_map); From b42d15fcffb0e32c36c4935c3bde494ae2e369cc Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:16 +0200 Subject: [PATCH 04/13] RDMA/rmr: client: sysfs interface functions Add the client-side sysfs interface used to administer RMR pools and sessions: creating/removing pools, joining and leaving pool sessions, and exposing per-pool and per-session attributes. The sysfs hierarchy lives under /sys/devices/virtual/rmr-client/ and is the primary administrative interface for the RMR client. This file is not compiled until the modules are wired into the build in a later patch in this series. 
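As an illustration of the intended flow (the pool name, session name and
address below are placeholders, not part of this patch), a pool could be
joined and a session added and removed roughly like this:

  echo "poolname=pool0 sync=0" > /sys/devices/virtual/rmr-client/ctl/join_pool
  echo "sessname=sess0 path=ip:192.168.1.10 mode=create" > \
        /sys/devices/virtual/rmr-client/pools/pool0/add_sess
  echo "mode=delete" > \
        /sys/devices/virtual/rmr-client/pools/pool0/sessions/sess0/del_sess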
Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/infiniband/ulp/rmr/rmr-clt-sysfs.c | 1496 ++++++++++++++++++++ 1 file changed, 1496 insertions(+) create mode 100644 drivers/infiniband/ulp/rmr/rmr-clt-sysfs.c diff --git a/drivers/infiniband/ulp/rmr/rmr-clt-sysfs.c b/drivers/infiniband/ulp/rmr/rmr-clt-sysfs.c new file mode 100644 index 000000000000..7e12c526f0c9 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-clt-sysfs.c @@ -0,0 +1,1496 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include /* for NAME_MAX */ +#include +#include +#include +#include +#include +#include + +#include "rmr-clt.h" + +/* + * Wait a bit before trying to reconnect after a failure + * in order to give server time to finish clean up which + * leads to "false positives" failed reconnect attempts + */ +#define RTRS_RECONNECT_BACKOFF 1000 + +#define RMR_DEFAULT_CHUNK_SIZE 131072 /* 128 KB */ + +static struct class *rmr_dev_class; +static struct device *rmr_ctl_dev; +static struct device *rmr_pool_dev; +static struct device *rmr_sess_dev; + +enum { + RMR_OPT_ERR = 0, + RMR_ADD_OPT_PATH = 1 << 0, + RMR_ADD_OPT_SESSNAME = 1 << 1, + RMR_ADD_OPT_MODE = 1 << 2, + RMR_DEL_OPT_MODE = 1 << 3, +}; + +static unsigned int rmr_opt_add_mandatory[] = { + RMR_ADD_OPT_PATH, + RMR_ADD_OPT_SESSNAME, + RMR_ADD_OPT_MODE, +}; + +/* For sync pools mode is not meaningful; only path and sessname are required. */ +static unsigned int rmr_opt_add_sync_mandatory[] = { + RMR_ADD_OPT_PATH, + RMR_ADD_OPT_SESSNAME, +}; + +static const match_table_t rmr_opt_add_tokens = { + { RMR_ADD_OPT_PATH, "path=%s" }, + { RMR_ADD_OPT_SESSNAME, "sessname=%s" }, + { RMR_ADD_OPT_MODE, "mode=%s" }, + { RMR_OPT_ERR, NULL }, +}; + +enum rmr_opt_join { + RMR_JOIN_OPT_POOLNAME, + RMR_JOIN_OPT_Mandatory_count, + RMR_JOIN_OPT_SYNC, + RMR_JOIN_OPT_CHUNK_SIZE, + RMR_JOIN_OPT_ERR, +}; + +static const char * const rmr_srv_opts_mandatory_names[] = { + [RMR_JOIN_OPT_POOLNAME] = "poolname", +}; + +static const match_table_t rmr_opt_join_tokens = { + { RMR_JOIN_OPT_POOLNAME, "poolname=%s" }, + { RMR_JOIN_OPT_SYNC, "sync=%s" }, + { RMR_JOIN_OPT_CHUNK_SIZE, "chunk_size=%s" }, + { RMR_JOIN_OPT_ERR, NULL }, +}; + +static unsigned int rmr_opt_del_mandatory[] = { + RMR_DEL_OPT_MODE, +}; + +static const match_table_t rmr_opt_del_tokens = { + { RMR_DEL_OPT_MODE, "mode=%s" }, + { RMR_OPT_ERR, NULL }, +}; + +enum { + RMR_RECONNECT_OPT_ERR = 0, + RMR_RECONNECT_OPT_PATH = 1 << 0, +}; + +static unsigned int rmr_opt_reconnect_mandatory[] = { + RMR_RECONNECT_OPT_PATH, +}; + +static const match_table_t rmr_opt_reconnect_tokens = { + { RMR_RECONNECT_OPT_PATH, "path=%s" }, + { RMR_RECONNECT_OPT_ERR, NULL }, +}; + +/* remove new line from string */ +static void strip(char *s) +{ + char *p = s; + + while (*s != '\0') { + if (*s != '\n') + *p++ = *s++; + else + ++s; + } + *p = '\0'; +} + +static int rmr_clt_parse_add_sess_opts(const char *buf, char *sessname, int *create, + struct rtrs_addr *paths, size_t *path_cnt, + size_t max_path_cnt, const char *er_msg, + const match_table_t rmr_opt_tokens, + unsigned int *rmr_opt_mandatory, + size_t num_rmr_opt_mandatory) +{ + char *options, *options_orig, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + int p_cnt = 0; + + options_orig = kstrdup(buf, GFP_KERNEL); + if (!options_orig) + return -ENOMEM; 
+ + options = strstrip(options_orig); + strip(options); + sep_opt = options; + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case RMR_ADD_OPT_SESSNAME: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("%s: sessname too long\n", er_msg); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(sessname, p, NAME_MAX); + kfree(p); + break; + + case RMR_ADD_OPT_PATH: + p = match_strdup(args); + if (!p || p_cnt >= max_path_cnt) { + ret = -ENOMEM; + goto out; + } + + ret = rtrs_addr_to_sockaddr(p, strlen(p), RTRS_PORT, + &paths[p_cnt]); + if (ret) { + pr_err("Can't parse path %s: %d\n", p, ret); + kfree(p); + goto out; + } + + p_cnt++; + + kfree(p); + break; + + case RMR_ADD_OPT_MODE: + if (!create) { + pr_err("%s: mode option not supported here\n", er_msg); + ret = -EINVAL; + goto out; + } + + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + if (!strcmp(p, "create")) { + *create = true; + } else if (!strcmp(p, "assemble")) { + *create = false; + } else { + pr_err("%s: Unknown mode '%s' (valid: create, assemble)\n", er_msg, p); + ret = -EINVAL; + kfree(p); + goto out; + } + kfree(p); + break; + + default: + pr_err("%s: Unknown parameter or missing value" + " '%s'\n", er_msg, p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < num_rmr_opt_mandatory; i++) { + if ((opt_mask & rmr_opt_mandatory[i])) { + ret = 0; + } else { + pr_err("%s: Parameters missing\n", er_msg); + ret = -EINVAL; + break; + } + } + +out: + if (path_cnt) + *path_cnt = p_cnt; + kfree(options_orig); + return ret; +} + +static void rmr_clt_destroy_session_sysfs_files(struct rmr_clt_pool_sess *pool_sess, + const struct attribute *sysfs_self) +{ + if (pool_sess->kobj.state_in_sysfs) { + sysfs_remove_link(&pool_sess->kobj, "clt_sess"); + + if (sysfs_self) + sysfs_remove_file_self(&pool_sess->kobj, sysfs_self); + kobject_del(&pool_sess->kobj); + kobject_put(&pool_sess->kobj); + } +} + +static int rmr_clt_parse_del_sess_opts(const char *buf, bool *delete) +{ + char *options, *options_orig, *sep_opt, *p; + substring_t args[MAX_OPT_ARGS]; + int i, token, opt_mask = 0, ret = -EINVAL; + + options_orig = kstrdup(buf, GFP_KERNEL); + if (!options_orig) + return -ENOMEM; + + options = strstrip(options_orig); + strip(options); + sep_opt = options; + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_opt_del_tokens, args); + opt_mask |= token; + + switch (token) { + case RMR_DEL_OPT_MODE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + if (!strcmp(p, "delete")) { + *delete = true; + } else if (!strcmp(p, "disassemble")) { + *delete = false; + } else { + pr_err("%s: Unknown mode '%s' (valid: delete, disassemble)\n", "del_sess", p); + ret = -EINVAL; + kfree(p); + goto out; + } + kfree(p); + break; + + default: + pr_err("%s: Unknown parameter or missing value '%s'\n", "del_sess", p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(rmr_opt_del_mandatory); i++) { + if ((opt_mask & rmr_opt_del_mandatory[i])) { + ret = 0; + } else { + pr_err("%s: Parameters missing\n", "del_sess"); + ret = -EINVAL; + break; + } + } + +out: + kfree(options_orig); + return ret; +} + +static ssize_t rmr_clt_del_sess_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + 
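+/*
+ * del_sess accepts "mode=delete" and "mode=disassemble" (see
+ * rmr_clt_parse_del_sess_opts() above). For non-sync pools, delete drops the
+ * member's dirty map and clears its srv_md slot for good, while disassemble
+ * keeps them so the member can be assembled into the pool again later.
+ */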
+static ssize_t rmr_clt_del_sess_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_clt_sess *clt_sess; + int err, i, idx; + bool delete = false; + u8 srv_sess_member_id; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + clt_sess = pool_sess->clt_sess; + srv_sess_member_id = pool_sess->member_id; + pool = pool_sess->pool; + clt_pool = (struct rmr_clt_pool *)pool->priv; + + err = rmr_clt_parse_del_sess_opts(buf, &delete); + if (err) + return err; + + if (pool_sess_change_state(pool_sess, RMR_CLT_POOL_SESS_REMOVING)) { + /* + * Freeze + */ + rmr_clt_pool_io_freeze(clt_pool); + + /* + * Wait for all completion + */ + rmr_clt_pool_io_wait_complete(clt_pool); + + /* + * Remove the storage node from the pool members list. + */ + xa_erase(&pool->stg_members, srv_sess_member_id); + + /* + * We simply wait for all inflights to get over to make sure + * that they are not affected with the delete session messages + * we are going to send after this. + * Once the inflights are done, we can restart the IOs immediately, + * since the session state has been changed to "removing". + * + * Unfreeze and wake up. + */ + rmr_clt_pool_io_unfreeze(clt_pool); + + send_msg_leave_pool(pool_sess, delete, WAIT); + } + + pr_info("Closing session %s in pool %s\n", + pool_sess->sessname, pool->poolname); + + if (!pool->sync) { + if (delete) { + /* + * Delete map for this session if it exists. + * For disassemble, keep the map so the piggyback loop + * continues to accumulate dirty entries for the member. + */ + rmr_pool_remove_map(pool, srv_sess_member_id); + + /* + * Clear the srv_md entry so the piggyback loop does + * not keep referencing a gone member. + * For disassemble, leave it intact — it is needed to + * identify the member during piggyback until reassembly. + */ + idx = rmr_pool_find_md(&pool->pool_md, srv_sess_member_id, false); + + if (idx >= 0) + memset(&pool->pool_md.srv_md[idx], 0, + sizeof(struct rmr_srv_md)); + /* + * TODO: Push the srv_md change to persistence disk on remaining storages. + */ + } else { + /* + * Disassemble: if this was the last non-sync session, no IOs + * will occur and the dirty maps serve no purpose. Delete them + * all; they will be recreated for all members on the first + * assemble via rmr_clt_process_non_sync_sess. + */ + if (xa_empty(&pool->stg_members)) { + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + u8 mid = pool->pool_md.srv_md[i].member_id; + + if (!mid) + continue; + rmr_pool_remove_map(pool, mid); + } + } + } + + /* + * Send messages to all other sessions, + * Informing them that a particular stor is getting deleted + */ + err = rmr_clt_del_stor_from_pool(pool_sess, delete); + if (err) { + pr_err("pool %s, del_stor failed for sess with member_id %u, err %d\n", + pool->poolname, srv_sess_member_id, err); + return err; + } + } + + /* + * Remove the session from the list. 
+ */ + mutex_lock(&pool->sess_lock); + rmr_clt_del_pool_sess(pool_sess); + mutex_unlock(&pool->sess_lock); + + rmr_clt_destroy_session_sysfs_files(pool_sess, &attr->attr); + + rmr_clt_free_pool_sess(pool_sess); + rmr_clt_sess_put(clt_sess); + + if (list_empty(&pool->sess_list)) + rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_JOINED, false); + + return count; +} + +static struct kobj_attribute rmr_clt_del_pool_sess_attr = + __ATTR(del_sess, 0644, rmr_clt_del_sess_show, + rmr_clt_del_sess_store); + +static ssize_t rmr_clt_pool_sess_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_clt_pool_sess *pool_sess; + ssize_t written = 0; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + + written += scnprintf(page, PAGE_SIZE, "%s\n", + rmr_clt_sess_state_str(atomic_read(&pool_sess->state))); + + written += scnprintf(page + written, PAGE_SIZE - written, + "Maintenance mode: %d\n", pool_sess->maintenance_mode); + + return written; +} + +static struct kobj_attribute rmr_clt_pool_sess_state_attr = + __ATTR(state, 0444, rmr_clt_pool_sess_state_show, NULL); + +static ssize_t rmr_clt_sess_member_id_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_clt_pool_sess *pool_sess; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + + return scnprintf(page, PAGE_SIZE, "%u\n", + pool_sess->member_id); +} + +static struct kobj_attribute rmr_clt_pool_sess_member_id_attr = + __ATTR(member_id, 0644, rmr_clt_sess_member_id_show, + NULL); + +static ssize_t rmr_clt_sess_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "echo '1|0' > this_sysfs\n"); +} + +static ssize_t rmr_clt_sess_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_pool *pool; + int pool_sess_state, err; + bool enable; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + pool = pool_sess->pool; + + if (sysfs_streq(buf, "1")) + enable = true; + else if (sysfs_streq(buf, "0")) + enable = false; + else { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + goto err; + } + + pool_sess_state = atomic_read(&pool_sess->state); + + /* + * Manual disable is interpreted as switching to maintenance mode + * And it is only allowed for sessions NOT in "created" and "removing" state + * And non-sync sessions + */ + if (!enable && ((pool_sess_state == RMR_CLT_POOL_SESS_CREATED) || + (pool_sess_state == RMR_CLT_POOL_SESS_REMOVING) || + (pool_sess->pool->sync))) { + pr_err("Cannot put pool_sess in maintenance mode: state %d, sync %d\n", + pool_sess_state, pool_sess->pool->sync); + goto print_state_err; + } + + if (enable) + err = rmr_clt_enable_sess(pool_sess); + else + err = rmr_clt_set_pool_sess_mm(pool_sess); + if (err) { + pr_err("%s failed with err %d\n", __func__, err); + goto err; + } + + return count; + +print_state_err: + pr_err("Current state: %d\n", atomic_read(&pool_sess->state)); +err: + return -EINVAL; +} + +static struct kobj_attribute rmr_clt_pool_sess_enable_attr = + __ATTR(enable, 0644, rmr_clt_sess_enable_show, + rmr_clt_sess_enable_store); + +static ssize_t rmr_clt_sess_check_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "echo '1' > this_sysfs\n"); +} + +static ssize_t rmr_clt_sess_check_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char 
*buf, size_t count) +{ + struct rmr_clt_pool_sess *pool_sess; + struct rmr_msg_pool_cmd msg = {}; + int err; + + pool_sess = container_of(kobj, struct rmr_clt_pool_sess, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + goto err; + } + + rmr_clt_init_cmd(pool_sess->pool, &msg); + msg.cmd_type = RMR_CMD_MAP_CHECK; + + err = rmr_clt_pool_send_cmd(pool_sess, &msg, WAIT); + if (err) { + pr_err("%s failed with err %d\n", __func__, err); + goto err; + } + return count; + +err: + return -EINVAL; +} + +static struct kobj_attribute rmr_clt_pool_sess_check_map_attr = + __ATTR(check_map, 0644, rmr_clt_sess_check_map_show, + rmr_clt_sess_check_map_store); + +static struct attribute *rmr_clt_pool_sess_attrs[] = { + &rmr_clt_del_pool_sess_attr.attr, + &rmr_clt_pool_sess_state_attr.attr, + &rmr_clt_pool_sess_member_id_attr.attr, + &rmr_clt_pool_sess_enable_attr.attr, + &rmr_clt_pool_sess_check_map_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rmr_clt_pool_sess); + +static struct kobj_type rmr_clt_pool_sess_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_clt_pool_sess_groups, +}; + +static int rmr_clt_create_session_sysfs_files(struct rmr_clt_pool_sess *pool_sess) +{ + int ret; + + ret = kobject_init_and_add(&pool_sess->kobj, &rmr_clt_pool_sess_ktype, + &pool_sess->pool->sessions_kobj, + "%s", pool_sess->sessname); + if (ret) + pr_err("Failed to create sysfs dir for session '%s': %d\n", + pool_sess->sessname, ret); + + return ret; +} + +static ssize_t rmr_clt_pool_add_sess_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo \"" + "sessname=" + " path=<[srcaddr,]dstaddr>" + " [path=<[srcaddr,]dstaddr>]\" > %s\n\n" + "addr ::= [ ip: | ip: | gid: ]\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_add_sess_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rtrs_addr paths[3]; + struct sockaddr_storage saddr[ARRAY_SIZE(paths)]; + struct sockaddr_storage daddr[ARRAY_SIZE(paths)]; + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + struct rmr_clt_sess *clt_sess; + struct rmr_clt_pool_sess *pool_sess; + struct rmr_pool_md *clt_md; + char *sessname; + size_t path_cnt; + int ret, index, create = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + clt_pool = (struct rmr_clt_pool *)pool->priv; + + sessname = kzalloc(NAME_MAX, GFP_KERNEL); + if (unlikely(!sessname)) + return -ENOMEM; + + for (path_cnt = 0; path_cnt < ARRAY_SIZE(paths); path_cnt++) { + paths[path_cnt].src = &saddr[path_cnt]; + paths[path_cnt].dst = &daddr[path_cnt]; + } + + ret = rmr_clt_parse_add_sess_opts(buf, sessname, + pool->sync ? NULL : &create, + paths, &path_cnt, ARRAY_SIZE(paths), + "add_sess", rmr_opt_add_tokens, + pool->sync ? rmr_opt_add_sync_mandatory + : rmr_opt_add_mandatory, + pool->sync ? 
ARRAY_SIZE(rmr_opt_add_sync_mandatory) + : ARRAY_SIZE(rmr_opt_add_mandatory)); + if (ret) + goto free_name; + + pr_info("%s: Creating rmr client session %s in pool %s\n", __func__, sessname, + pool->poolname); + + clt_sess = find_and_get_or_create_clt_sess(sessname, paths, path_cnt); + if (IS_ERR(clt_sess)) { + pr_err("failed to find and get or create clt sess %s\n", sessname); + ret = PTR_ERR(clt_sess); + goto free_name; + } + + pool_sess = rmr_clt_add_pool_sess(pool, clt_sess, create); + if (IS_ERR(pool_sess)) { + pr_err("failed to add pool sess %s to the pool %s\n", + sessname, pool->poolname); + ret = PTR_ERR(pool_sess); + goto put_clt_sess; + } + ret = rmr_clt_create_session_sysfs_files(pool_sess); + if (ret) { + pr_err("Creating sysfs files for %s in %s failed: %d\n", + pool_sess->sessname, pool->poolname, ret); + goto destroy_sess; + } + + ret = sysfs_create_link(&pool_sess->kobj, &clt_sess->kobj, "clt_sess"); + if (ret) { + pr_err("Creating symlink for %s failed, err: %d\n", + pool_sess->sessname, ret); + rmr_clt_destroy_session_sysfs_files(pool_sess, NULL); + goto destroy_sess; + } + // ret = sysfs_create_link(&sess->kobj, sess->sess_kobj, + // RTRS_LINK_NAME); + // if (ret) { + // pr_err("Creating rtrs symlink for %s in %s failed: %d\n", + // sess->sessname, pool->poolname, ret); + // rmr_clt_destroy_session_sysfs_files(sess, NULL); + // goto destroy_sess; + // } + rmr_clt_change_pool_state(clt_pool, RMR_CLT_POOL_STATE_JOINED, true); + + clt_md = &pool->pool_md; + index = rmr_pool_find_md(clt_md, pool_sess->member_id, true); + if (index < 0) { + pr_err("No space for member %u in the clt_md\n", pool_sess->member_id); + goto destroy_sess; + } + clt_md->srv_md[index].member_id = pool_sess->member_id; + clt_md->srv_md[index].mapped_size = pool->mapped_size; + + kfree(sessname); + return count; + +destroy_sess: + rmr_clt_destroy_pool_sess(pool_sess, create); +put_clt_sess: + rmr_clt_sess_put(clt_sess); +free_name: + kfree(sessname); + return ret; +} + +static struct kobj_attribute rmr_clt_pool_add_sess_attr = + __ATTR(add_sess, 0644, rmr_clt_pool_add_sess_show, + rmr_clt_pool_add_sess_store); + +static ssize_t rmr_clt_pool_leave_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_leave_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + int ret; + + pool = container_of(kobj, struct rmr_pool, kobj); + clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, buf); + return -EINVAL; + } + + if (refcount_read(&clt_pool->refcount) > 1) { + pr_err("%s: Pool %s is in use.\n", __func__, pool->poolname); + return -EINVAL; + } + + pr_info("clt: Deleting pool '%s'\n", pool->poolname); + + ret = rmr_clt_remove_pool_from_sysfs(pool, &attr->attr); + if (unlikely(ret)) + return ret; + + return count; +} + +static struct kobj_attribute rmr_clt_pool_leave_pool_attr = + __ATTR(leave_pool, 0644, rmr_clt_pool_leave_pool_show, + rmr_clt_pool_leave_pool_store); + +static ssize_t rmr_clt_pool_chunk_size_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + clt_pool = (struct rmr_clt_pool *)pool->priv; + + if (pool->chunk_size == 
UINT_MAX) + return scnprintf(page, PAGE_SIZE, "undefined\n"); + + return scnprintf(page, PAGE_SIZE, "%u\n", pool->chunk_size); +} + +static struct kobj_attribute rmr_clt_pool_chunk_size_attr = + __ATTR(chunk_size, 0644, rmr_clt_pool_chunk_size_show, NULL); + +static ssize_t rmr_clt_pool_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool = container_of(kobj, struct rmr_pool, kobj); + struct rmr_dirty_id_map *map; + int i, lock_idx; + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + rmr_map_dump_bitmap(map); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return 0; +} + +static ssize_t rmr_clt_pool_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + int err; + rmr_id_t id = { 0, 0 }; + int srv_id; + + pool = container_of(kobj, struct rmr_pool, kobj); + if (sscanf(buf, "%llu %llu %d\n", &id.a, &id.b, &srv_id) != 3) { + pr_err("cannot parse id.a %s\n", buf); + return -EINVAL; + } + pr_debug("add id (%llu, %llu), srv_id %d\n", id.a, id.b, srv_id); + + /* + * If given chunk number exceeds total chunks for us, ignore! + */ + if (id.b > pool->no_of_chunks) + return count; + + err = rmr_clt_map_add_id(pool, srv_id, id); + if (err == -ENOMEM) { + pr_err("failed insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + } else { + pr_debug("insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + } + + return count; +} + +static struct kobj_attribute rmr_clt_pool_map_attr = + __ATTR(map, 0644, rmr_clt_pool_map_show, + rmr_clt_pool_map_store); + +static ssize_t rmr_clt_pool_map_ver_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + ssize_t written; + + pool = container_of(kobj, struct rmr_pool, kobj); + + written = scnprintf(page, PAGE_SIZE, "Map ver: %llu\n", pool->map_ver); + + return written; +} + +static struct kobj_attribute rmr_clt_pool_map_ver_attr = + __ATTR(map_version, 0444, rmr_clt_pool_map_ver_show, NULL); + +static ssize_t rmr_clt_pool_enable_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_enable_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + int ret; + + pool = container_of(kobj, struct rmr_pool, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + return -EINVAL; + } + + ret = rmr_clt_pool_try_enable(pool); + if (ret) { + pr_err("%s: pool %s rmr_clt_pool_try_enable failed with err %d\n", + attr->attr.name, pool->poolname, ret); + return ret; + } + + return count; +} + +static struct kobj_attribute rmr_clt_pool_enable_attr = + __ATTR(pool_enable, 0644, rmr_clt_pool_enable_show, + rmr_clt_pool_enable_store); + +static ssize_t rmr_clt_pool_test_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_clt_pool_test_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + int err; + + pool = container_of(kobj, struct rmr_pool, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, 
buf); + return -EINVAL; + } + + pr_info("pool %s start test map...\n", pool->poolname); + err = rmr_clt_test_map(pool, pool); + if (err) { + pr_err("pool %s, test map failed, err %d\n", + pool->poolname, err); + return err; + } + pr_info("pool %s test map done.", pool->poolname); + + return count; +} + +static struct kobj_attribute rmr_clt_pool_test_map_attr = + __ATTR(test_map, 0644, rmr_clt_pool_test_map_show, + rmr_clt_pool_test_map_store); + +static struct attribute *rmr_clt_pool_attrs[] = { + &rmr_clt_pool_add_sess_attr.attr, + &rmr_clt_pool_leave_pool_attr.attr, + &rmr_clt_pool_chunk_size_attr.attr, + &rmr_clt_pool_map_attr.attr, + &rmr_clt_pool_map_ver_attr.attr, + &rmr_clt_pool_enable_attr.attr, + &rmr_clt_pool_test_map_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rmr_clt_pool); + +static struct kobj_type rmr_clt_pool_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_clt_pool_groups, +}; + +static struct kobj_type ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +static ssize_t rmr_clt_join_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo \"" + "poolname= sync=y|Y|0|1 [chunk_size=]\" " + "> %s\n", + attr->attr.name); +} + +static int rmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *stats_kobj); + +static int rmr_clt_create_pool_sysfs_files(struct rmr_pool *pool) +{ + int ret; + struct rmr_clt_pool *clt_pool; + + ret = kobject_init_and_add(&pool->kobj, &rmr_clt_pool_ktype, + &rmr_pool_dev->kobj, "%s", pool->poolname); + if (ret) { + pr_err("Failed to create sysfs dir for pool '%s': %d\n", + pool->poolname, ret); + return ret; + } + + ret = kobject_init_and_add(&pool->sessions_kobj, &ktype, &pool->kobj, + "sessions"); + if (unlikely(ret)) { + pr_err("Failed to create sessions dir for pool '%s': %d\n", + pool->poolname, ret); + goto put_pool_kobj; + } + clt_pool = (struct rmr_clt_pool *)pool->priv; + ret = rmr_clt_create_stats_files(&pool->kobj, &clt_pool->stats_kobj); + if (unlikely(ret)) { + pr_err("Failed to create sysfs stats files " + "for pool '%s': %d\n", + pool->poolname, ret); + goto put_sessions_kobj; + } + + return 0; + +put_sessions_kobj: + kobject_del(&pool->sessions_kobj); + kobject_put(&pool->sessions_kobj); +put_pool_kobj: + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + + return ret; +} + +void rmr_clt_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self) +{ + struct rmr_clt_pool *clt_pool; + + if (pool->kobj.state_in_sysfs) { + clt_pool = (struct rmr_clt_pool *)pool->priv; + kobject_del(&clt_pool->stats_kobj); + kobject_put(&clt_pool->stats_kobj); + + kobject_del(&pool->sessions_kobj); + kobject_put(&pool->sessions_kobj); + if (sysfs_self) + sysfs_remove_file_self(&pool->kobj, sysfs_self); + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + } +} + +static ssize_t rmr_clt_sess_reconnect_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "echo 'path=ip:' > this_sysfs\n"); +} + +static ssize_t rmr_clt_sess_reconnect_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_clt_sess *clt_sess; + struct rmr_clt_pool_sess *pool_sess; + struct rtrs_addr paths[3]; + struct sockaddr_storage saddr[ARRAY_SIZE(paths)]; + struct sockaddr_storage daddr[ARRAY_SIZE(paths)]; + size_t path_cnt; + int err; + + + clt_sess = container_of(kobj, struct rmr_clt_sess, kobj); + + pr_info("%s: Starting manual reconnect for clt_sess 
%s\n", __func__, clt_sess->sessname); + + /* + * The IP of the server has changed. + * Close the old rtrs connection, parse the path IP, + * and reconnect the session + */ + for (path_cnt = 0; path_cnt < ARRAY_SIZE(paths); path_cnt++) { + paths[path_cnt].src = &saddr[path_cnt]; + paths[path_cnt].dst = &daddr[path_cnt]; + } + + err = rmr_clt_parse_add_sess_opts(buf, NULL, NULL, paths, &path_cnt, ARRAY_SIZE(paths), + "reconnect_sess", rmr_opt_reconnect_tokens, + rmr_opt_reconnect_mandatory, + ARRAY_SIZE(rmr_opt_reconnect_mandatory)); + if (err) { + pr_err("%s: failed to parse options, err=%d\n", __func__, err); + return err; + } + + if (!IS_ERR_OR_NULL(clt_sess->rtrs)) { + pr_info("close rtrs clt for session %s\n", clt_sess->sessname); + + clt_sess->state = RMR_CLT_SESS_DISCONNECTED; + + /* + * Wait for the state to be seen by rmr client + * + * The ones which are already in the rcu read section (see rmr_get_sess_iu) + * would complete its get_permit for rtrs. + * After that, rtrs_clt_close would wait for all the inflight permits to be + * returned. + */ + mutex_lock(&clt_sess->lock); + list_for_each_entry(pool_sess, &clt_sess->pool_sess_list, clt_sess_entry) + synchronize_srcu(&pool_sess->pool->sess_list_srcu); + mutex_unlock(&clt_sess->lock); + + rtrs_clt_close(clt_sess->rtrs); + clt_sess->rtrs = NULL; + + msleep(RTRS_RECONNECT_BACKOFF); + } + + err = rmr_clt_reconnect_sess(clt_sess, paths, path_cnt); + if (err) { + pr_err("rmr_clt_reconnect_sess Failed\n"); + return err; + } + + pr_info("%s: Manual reconnect for clt_sess %s succeeded\n", __func__, clt_sess->sessname); + return count; +} + +static struct kobj_attribute rmr_clt_sess_reconnect_attr = + __ATTR(reconnect, 0644, rmr_clt_sess_reconnect_show, + rmr_clt_sess_reconnect_store); + +static const char *rmr_clt_sess_state_names[] = { + [0] = "invalid state", + [RMR_CLT_SESS_DISCONNECTED] = "disconnected", + [RMR_CLT_SESS_CONNECTED] = "connected" +}; + +static ssize_t rmr_clt_sess_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_clt_sess *clt_sess; + + clt_sess = container_of(kobj, struct rmr_clt_sess, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", + rmr_clt_sess_state_names[clt_sess->state]); +} + +static struct kobj_attribute rmr_clt_sess_state_attr = + __ATTR(state, 0444, rmr_clt_sess_state_show, NULL); + +static struct attribute *rmr_clt_sess_attrs[] = { + &rmr_clt_sess_reconnect_attr.attr, + &rmr_clt_sess_state_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rmr_clt_sess); + +static struct kobj_type rmr_clt_sess_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_clt_sess_groups, +}; + +int rmr_clt_create_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess) +{ + int ret; + + ret = kobject_init_and_add(&clt_sess->kobj, &rmr_clt_sess_ktype, + &rmr_sess_dev->kobj, "%s", clt_sess->sessname); + if (ret) { + pr_err("Failed to create sysfs dir for sess '%s': %d\n", + clt_sess->sessname, ret); + return ret; + } + + return 0; +} + +void rmr_clt_destroy_clt_sess_sysfs_files(struct rmr_clt_sess *clt_sess) +{ + if (clt_sess->kobj.state_in_sysfs) { + kobject_del(&clt_sess->kobj); + kobject_put(&clt_sess->kobj); + } +} + +static int rmr_clt_parse_join_opts(const char *buf, char *poolname, + bool *sync, u32 *chunk_size) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + options = strstrip(options); + strip(options); + sep_opt = 
options; + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_opt_join_tokens, args); + opt_mask |= (1 << token); + + switch (token) { + case RMR_JOIN_OPT_POOLNAME: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("join_pool: poolname too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(poolname, p, NAME_MAX); + kfree(p); + break; + + case RMR_JOIN_OPT_SYNC: + p = match_strdup(args); + + ret = kstrtobool(p, sync); + if (ret) { + pr_err("sync isn't a boolean: %d\n", ret); + kfree(p); + goto out; + } + + kfree(p); + break; + + case RMR_JOIN_OPT_CHUNK_SIZE: + /* + * Min supported chunk_size is PAGE_SIZE. + * The value must be power-of-2 and multiples + * of SECTOR_SIZE. + */ + p = match_strdup(args); + + ret = kstrtou32(p, 0, chunk_size); + if (ret) { + pr_err("chunk_size isn't an integer: %d\n", ret); + kfree(p); + goto out; + } else if (*chunk_size < PAGE_SIZE) { + pr_err("Min supported chunk_size is %lu\n", PAGE_SIZE); + ret = -EINVAL; + kfree(p); + goto out; + } else if (!is_power_of_2(*chunk_size)) { + pr_err("chunk_size must be power of 2\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + + kfree(p); + break; + default: + pr_err("join_pool: Unknown parameter or missing value" + " '%s'\n", p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < RMR_JOIN_OPT_Mandatory_count; i++) { + if ((opt_mask & (1 << rmr_opt_join_tokens[i].token))) { + ret = 0; + } else { + pr_err("join_pool: Mandatory parameter missing: %s\n", + rmr_srv_opts_mandatory_names[i]); + ret = -EINVAL; + break; + } + } + +out: + kfree(options); + return ret; +} + +static struct rmr_clt_pool *rmr_create_clt_pool(char *poolname, bool sync) +{ + struct rmr_clt_pool *clt_pool; + int ret; + + clt_pool = kzalloc(sizeof(struct rmr_clt_pool), GFP_KERNEL); + if (unlikely(!clt_pool)) + return ERR_PTR(-ENOMEM); + + refcount_set(&clt_pool->refcount, 1); + + init_waitqueue_head(&clt_pool->map_update_wq); + atomic_set(&clt_pool->io_freeze, 0); + mutex_init(&clt_pool->io_freeze_lock); + mutex_init(&clt_pool->clt_pool_lock); + + clt_pool->recover_wq = alloc_workqueue("%s_recover_wq", 0, 0, poolname); + if (!clt_pool->recover_wq) { + ret = -ENOMEM; + goto free_clt_pool; + } + + if (!sync) { + INIT_DELAYED_WORK(&clt_pool->recover_dwork, recover_work); + queue_delayed_work(clt_pool->recover_wq, &clt_pool->recover_dwork, + msecs_to_jiffies(RMR_RECOVER_INTERVAL_MS)); + } + + return clt_pool; + +free_clt_pool: + kfree(clt_pool); + return ERR_PTR(ret); +} + +static ssize_t rmr_clt_join_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_clt_pool *clt_pool; + struct rmr_pool_md *clt_md; + char *poolname; + u32 chunk_size = RMR_DEFAULT_CHUNK_SIZE; + bool sync = false; + int err; + + poolname = kzalloc(NAME_MAX, GFP_KERNEL); + if (unlikely(!poolname)) + return -ENOMEM; + + err = rmr_clt_parse_join_opts(buf, poolname, &sync, &chunk_size); + if (unlikely(err)) + goto out; + + strip(poolname); + + pr_info("%s: Creating client pool with poolname %s, sync %d\n", + __func__, poolname, sync); + + clt_pool = rmr_create_clt_pool(poolname, sync); + if (IS_ERR(clt_pool)) { + pr_err("%s: Clt pool creationg failed\n", __func__); + err = PTR_ERR(clt_pool); + goto out; + } + + pool = rmr_create_pool(poolname, clt_pool); + if (IS_ERR(pool)) { + err = PTR_ERR(pool); + goto put_clt_pool; + } + + pool->is_clt = true; + pool->sync = sync; + 
clt_pool->pool = pool; + + pr_debug("pool %p, clt_pool %p\n", pool, pool->priv); + + err = rmr_clt_create_pool_sysfs_files(pool); + if (err) + goto put_clt_pool; + + if (!sync) { + clt_md = &clt_pool->pool->pool_md; + strscpy(clt_md->poolname, poolname, NAME_MAX); + clt_md->group_id = pool->group_id; + clt_md->map_ver = 1; + } + + kfree(poolname); + + return count; + +put_clt_pool: + if (!sync) + cancel_delayed_work_sync(&clt_pool->recover_dwork); + + rmr_put_clt_pool(clt_pool); +out: + kfree(poolname); + return err; +} + +static struct kobj_attribute rmr_clt_join_pool_attr = + __ATTR(join_pool, 0644, + rmr_clt_join_pool_show, rmr_clt_join_pool_store); + +static struct attribute *default_attrs[] = { + &rmr_clt_join_pool_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +void rmr_clt_destroy_sysfs_files(void) +{ + sysfs_remove_group(&rmr_ctl_dev->kobj, &default_attr_group); + + device_unregister(rmr_sess_dev); + device_unregister(rmr_pool_dev); + device_unregister(rmr_ctl_dev); + + class_destroy(rmr_dev_class); +} + +int rmr_clt_create_sysfs_files(void) +{ + int err; + dev_t devt = MKDEV(0, 0); + + rmr_dev_class = class_create("rmr-client"); + if (IS_ERR(rmr_dev_class)) + return PTR_ERR(rmr_dev_class); + + rmr_ctl_dev = device_create(rmr_dev_class, NULL, devt, NULL, "ctl"); + if (IS_ERR(rmr_ctl_dev)) { + err = PTR_ERR(rmr_ctl_dev); + goto cls_destroy; + } + + rmr_pool_dev = device_create(rmr_dev_class, NULL, devt, NULL, "pools"); + if (IS_ERR(rmr_pool_dev)) { + err = PTR_ERR(rmr_pool_dev); + goto ctl_destroy; + } + + rmr_sess_dev = device_create(rmr_dev_class, NULL, devt, NULL, "sessions"); + if (IS_ERR(rmr_sess_dev)) { + err = PTR_ERR(rmr_sess_dev); + goto pool_destroy; + } + + err = sysfs_create_group(&rmr_ctl_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto sess_destroy; + + return 0; + +sess_destroy: + device_unregister(rmr_sess_dev); +pool_destroy: + device_unregister(rmr_pool_dev); +ctl_destroy: + device_unregister(rmr_ctl_dev); +cls_destroy: + class_destroy(rmr_dev_class); + + return err; +} + +STAT_ATTR(struct rmr_clt_stats, read_retries, + rmr_clt_stats_read_retries_to_str, rmr_clt_reset_read_retries); + +static struct attribute *rmr_clt_stats_attrs[] = { + &read_retries_attr.attr, + NULL, +}; + +static struct attribute_group rmr_clt_stats_attr_group = { + .attrs = rmr_clt_stats_attrs, +}; + +static int rmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *stats_kobj) +{ + int ret; + + ret = kobject_init_and_add(stats_kobj, &ktype, kobj, "stats"); + if (ret) { + pr_err("Failed to init and add stats kobject, err: %d\n", + ret); + return ret; + } + + ret = sysfs_create_group(stats_kobj, &rmr_clt_stats_attr_group); + if (ret) { + pr_err("failed to create stats sysfs group, err: %d\n", + ret); + goto put_stats_obj; + } + + return 0; + +put_stats_obj: + kobject_del(stats_kobj); + kobject_put(stats_kobj); + + return ret; +} From 8787e1fe42195d37eae8b09326551408c58789fd Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:17 +0200 Subject: [PATCH 05/13] RDMA/rmr: server: main functionality Add the RMR server implementation: rmr-srv.c server core: session handling, pool registration via rmr_srv_register(), incoming command and IO message processing, sync thread coordination and the rmr_srv_query()/rmr_srv_unregister() entry points used by upper-layer stores. 
rmr-srv-md.c server-side metadata persistence: serialising the pool metadata (member ID, map version, mapped size, store state) and the dirty maps to the underlying store, plus the periodic md_sync delayed work. The server interacts with an upper-layer store via the struct rmr_srv_store_ops interface defined in rmr-srv.h, allowing different store implementations (block device, file, ...) to plug in without modifying RMR itself. These files are not compiled until the modules are wired into the build in a later patch in this series. Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/infiniband/ulp/rmr/rmr-srv-md.c | 764 ++++++ drivers/infiniband/ulp/rmr/rmr-srv.c | 3306 +++++++++++++++++++++++ 2 files changed, 4070 insertions(+) create mode 100644 drivers/infiniband/ulp/rmr/rmr-srv-md.c create mode 100644 drivers/infiniband/ulp/rmr/rmr-srv.c diff --git a/drivers/infiniband/ulp/rmr/rmr-srv-md.c b/drivers/infiniband/ulp/rmr/rmr-srv-md.c new file mode 100644 index 000000000000..9dab71a810b8 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-srv-md.c @@ -0,0 +1,764 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) — server metadata subsystem + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include + +#include "rmr-srv.h" +#include "rmr-req.h" +#include "rmr-clt.h" + +/** + * process_md_io() - Process medata IO message + * + * @pool: the pool where requests go through + * @rtrs_op: rtrs IO context + * @offset: offset in bytes relative to rmr metadata. + * @len: length of the buffer in bytes + * @flags: indicates metadata IO options + * @buf: pointer to metadata buffer + * + * Return: + * 0 on success + * + * Description: + * All metadata IOs go through this function to submit requests to block device. The offset it + * passes on is relative to bytes shifting on rmr medata which is composed of a header + * structure for pool metadata, bitmap and last_io array. 
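+ *
+ * Roughly (exact sizes depend on queue_depth and the map geometry, see the
+ * helpers used by the callers below): the pool metadata header comes first,
+ * followed by the last_io array at RMR_LAST_IO_OFFSET, the map buffer headers
+ * at rmr_bitmap_offset(queue_depth), and finally the per-map slp pages
+ * written by rmr_srv_md_maps_sync().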
+ */ +int process_md_io(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, u32 offset, u32 len, + unsigned long flags, void *buf) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_srv_req *req; + int err = 0; + + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!percpu_ref_tryget_live(&pool->ids_inflight_ref)) { + err = -EIO; + goto no_put; + } + + req = rmr_srv_md_req_create(srv_pool, rtrs_op, buf, offset, len, flags, rmr_srv_endreq); + if (IS_ERR(req)) { + pr_err("Failed to create rmr_req %pe\n", req); + err = PTR_ERR(req); + goto put_pool; + } + + rmr_md_req_submit(req); + return 0; + +put_pool: + percpu_ref_put(&pool->ids_inflight_ref); +no_put: + return err; +} + +int rmr_srv_read_md(struct rmr_pool *pool, struct rtrs_srv_op *rtrs_op, u32 offset, u32 len, + struct rmr_pool_md *pool_md_page) +{ + /* pool_md is pre-allocated */ + return process_md_io(pool, rtrs_op, offset, len, RMR_OP_MD_READ, pool_md_page); +} + +static int rmr_srv_load_last_io(struct rmr_srv_pool *srv_pool) +{ + void *buf; + u64 offset, len; + struct rmr_pool *pool = srv_pool->pool; + struct rmr_pool_md *pool_md = &pool->pool_md; + int err = 0; + + if (!pool_md->queue_depth) { + pr_err("%s: pool %s has zero queue_depth\n", + __func__, pool->poolname); + return -EINVAL; + } + offset = RMR_LAST_IO_OFFSET; + len = rmr_last_io_len(pool_md->queue_depth); + + if (!srv_pool->last_io_idx) { + srv_pool->last_io_idx = kcalloc(pool_md->queue_depth, + sizeof(*srv_pool->last_io_idx), GFP_KERNEL); + if (!srv_pool->last_io_idx) + return -ENOMEM; + } + + buf = kzalloc(len, GFP_KERNEL); + if (!buf) { + err = -ENOMEM; + return err; + } + + err = rmr_srv_read_md(pool, NULL, offset, len, buf); + if (err) { + pr_err("%s: failed to read last_io buffer of len %lld at offset %lld\n", + __func__, len, offset); + goto free_buf; + } + memcpy(srv_pool->last_io_idx, (rmr_id_t *)buf, len); + +free_buf: + kfree(buf); + return err; +} + +/** + * rmr_srv_md_maps_sync - Sync dirty maps to persistent storage + * + * Description: + * Writes maps in two passes to the map-related regions of the on-disk layout: + * + * Pass 1 — hdr_region (single PAGE_SIZE write at RMR_MD_SIZE + last_io_len): + * Fills one rmr_map_cbuf_hdr slot per map_idx in [0:maps_cnt]. + * The buffer is kzalloc'd, so slots beyond maps_cnt are zero. + * The entire PAGE_SIZE region is issued as a single I/O. + * + * Pass 2 — maps_region (slp pages at computed offsets after hdr_region): + * Each map's data offset = map_region_offset + map_idx * per_map_size. + * pool->maps[0:maps_cnt] is always dense (no NULL gaps). 
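+ *
+ * Offset example (numbers are illustrative only): with
+ * per_map_size = total_slp * PAGE_SIZE, the slp pages of map_idx 2 start at
+ *	rmr_bitmap_offset(queue_depth) + RMR_MAP_BUF_HDR_SIZE + 2 * per_map_size
+ * which matches the map_data_offset computation in the second pass below.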
+ */ +void rmr_srv_md_maps_sync(struct rmr_pool *pool) +{ + struct rmr_map_cbuf_hdr *map_cbuf_hdr; + struct rmr_dirty_id_map *map = NULL; + u32 hdr_region_offset = rmr_bitmap_offset(pool->pool_md.queue_depth); + u32 map_region_offset = hdr_region_offset + RMR_MAP_BUF_HDR_SIZE; + u64 per_map_size = 0; + int err, lock_idx; + void *buf; + u8 map_idx; + + buf = kzalloc(RMR_MAP_BUF_HDR_SIZE, GFP_KERNEL); + if (!buf) + return; + + lock_idx = srcu_read_lock(&pool->map_srcu); + + /* Fill the header region: one slot per active map */ + for (map_idx = 0; map_idx < pool->maps_cnt; map_idx++) { + map = rcu_dereference(pool->maps[map_idx]); + if (WARN_ON(!map)) + goto unlock; + + map_cbuf_hdr = buf + map_idx * sizeof(struct rmr_map_cbuf_hdr); + map_cbuf_hdr->version = RMR_MAP_FORMAT_VER; + map_cbuf_hdr->member_id = map->member_id; + map_cbuf_hdr->no_of_chunks = map->no_of_chunks; + map_cbuf_hdr->no_of_flp = map->no_of_flp; + map_cbuf_hdr->no_of_slp_in_last_flp = map->no_of_slp_in_last_flp; + map_cbuf_hdr->no_of_chunk_in_last_slp = map->no_of_chunk_in_last_slp; + map_cbuf_hdr->total_slp = map->total_slp; + per_map_size = map->total_slp * PAGE_SIZE; + } + + /* Write the entire header region as a single PAGE_SIZE I/O */ + err = process_md_io(pool, NULL, hdr_region_offset, + PAGE_SIZE, RMR_OP_MD_WRITE, buf); + if (err) { + pr_warn("%s: failed to write header region at 0x%x: %d\n", + __func__, hdr_region_offset, err); + goto unlock; + } + + if (WARN_ON(!per_map_size)) + goto unlock; + + /* Write each map's slp pages */ + for (map_idx = 0; map_idx < pool->maps_cnt; map_idx++) { + u32 map_data_offset; + el_flp *flp_ptr; + u64 no_of_slps; + void *slp; + int i, j; + + map = rcu_dereference(pool->maps[map_idx]); + if (WARN_ON(!map)) + break; + + map_data_offset = map_region_offset + map_idx * per_map_size; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (void *)(*flp_ptr); + + err = process_md_io(pool, NULL, map_data_offset, + PAGE_SIZE, RMR_OP_MD_WRITE, slp); + if (err) + pr_warn("%s: failed to write map slp at 0x%x: %d\n", + __func__, map_data_offset, err); + map_data_offset += PAGE_SIZE; + } + } + } + +unlock: + srcu_read_unlock(&pool->map_srcu, lock_idx); + kfree(buf); +} + +/** + * rmr_srv_refresh_md_maps - Restore maps from map buffers on disk + * + * Description: + * Reads back the maps written by rmr_srv_md_maps_sync(). Reads the hdr_region + * in a single I/O to obtain the per-map headers, then loads each present + * map's slp pages from maps_region: + * data offset = map_region_offset + map_idx * per_map_size + * Header slots 0..N-1 are active; remaining are zero (member_id == 0). 
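+ *
+ * Read-back sketch for two active maps (member_ids 3 and 5 are picked only
+ * for illustration): the single hdr_region read yields header slots
+ * {3, 5, 0, ...}; the first slot with member_id == 0 terminates the scan,
+ * and the slp pages of slot 0 and slot 1 are then read starting at
+ * map_region_offset + 0 * per_map_size and + 1 * per_map_size respectively.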
+ */ +static int rmr_srv_refresh_md_maps(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_map_cbuf_hdr *map_cbuf_hdr; + struct rmr_dirty_id_map *map = NULL; + u32 hdr_region_offset = rmr_bitmap_offset(pool->pool_md.queue_depth); + u32 map_region_offset = hdr_region_offset + RMR_MAP_BUF_HDR_SIZE; + int err = 0, lock_idx; + void *buf; + u8 map_idx, valid_nr = 0; + bool unpack; + + buf = kzalloc(RMR_MAP_BUF_HDR_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Read the entire header region in a single PAGE_SIZE I/O */ + err = rmr_srv_read_md(pool, NULL, hdr_region_offset, RMR_MAP_BUF_HDR_SIZE, buf); + if (err) { + pr_err("%s: failed to read header region at offset %u\n", + __func__, hdr_region_offset); + kfree(buf); + return err; + } + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (map_idx = 0; map_idx < RMR_POOL_MAX_SESS; map_idx++) { + u64 per_map_size; + u32 map_data_offset; + el_flp *flp_ptr; + u64 no_of_slps; + void *slp; + int i, j; + + map_cbuf_hdr = buf + map_idx * sizeof(struct rmr_map_cbuf_hdr); + pr_debug("%s: %llu %u %llu %llu %llu %llu %llu\n", __func__, + map_cbuf_hdr->version, + map_cbuf_hdr->member_id, + map_cbuf_hdr->no_of_chunks, + map_cbuf_hdr->no_of_flp, + map_cbuf_hdr->no_of_slp_in_last_flp, + map_cbuf_hdr->no_of_chunk_in_last_slp, + map_cbuf_hdr->total_slp); + + /* Empty slot: no more active maps beyond this point */ + if (!map_cbuf_hdr->member_id) + break; + valid_nr++; + + per_map_size = map_cbuf_hdr->total_slp * PAGE_SIZE; + map_data_offset = map_region_offset + map_idx * per_map_size; + + unpack = false; + /* + * The dirty map should be updated only when the one on disk is more updated. + * Such cases are as follows. + * 1) The dirty map does not exist in the pool. The map will be simply restored to + * the last version we have. + * 2) The dirty map of the pool is just created. If it has been updated, the one on + * disk is outdated. + */ + map = rmr_pool_find_map(pool, map_cbuf_hdr->member_id); + if (!map) { + map = rmr_map_create(pool, map_cbuf_hdr->member_id); + if (IS_ERR(map)) { + err = PTR_ERR(map); + pr_err("%s: pool %s, member_id %d failed to create map\n", + __func__, pool->poolname, map_cbuf_hdr->member_id); + goto unlock; + } + unpack = true; + } else if (rmr_map_empty(map)) { + unpack = true; + } + + if (map->no_of_chunks != map_cbuf_hdr->no_of_chunks || + map->no_of_flp != map_cbuf_hdr->no_of_flp || + map->no_of_slp_in_last_flp != map_cbuf_hdr->no_of_slp_in_last_flp || + map->no_of_chunk_in_last_slp != map_cbuf_hdr->no_of_chunk_in_last_slp || + map->total_slp != map_cbuf_hdr->total_slp) { + pr_err("%s: Sanity check failed\n", __func__); + goto unlock; + } + + xa_store(&pool->stg_members, map_cbuf_hdr->member_id, XA_TRUE, GFP_KERNEL); + + if (!unpack) + continue; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + slp = (void *)(*flp_ptr); + + err = rmr_srv_read_md(pool, NULL, map_data_offset, + PAGE_SIZE, slp); + if (err) { + pr_err("%s: failed to read bitmap at offset %u\n", + __func__, map_data_offset); + goto unlock; + } + map_data_offset += PAGE_SIZE; + } + } + } + +unlock: + if (!valid_nr) + pr_err("%s: no valid map found in metadata\n", __func__); + + /* + * TODO: We need better error handling logic here. + * Lets suppose after successfully reading few pages for a map, we fail to read next page. 
+ * We then error out and fail the register, but leave the partially updated map in the pool. + * Later when another register is called, and we come here to read the maps, we will + * see a non-empty map, and skip reading the map from disk. + */ + srcu_read_unlock(&pool->map_srcu, lock_idx); + kfree(buf); + return err; +} + +/** + * rmr_srv_md_update() - update the metadata of the server pool + * + * Description: + * Read current in-memory pool states that changes to the srv_md of this pool. + */ +static int rmr_srv_md_update(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool; + struct rmr_srv_md *my_srv_md; + int md_i; + + pool = srv_pool->pool; + md_i = rmr_pool_find_md(&pool->pool_md, srv_pool->member_id, true); + if (md_i < 0) { + pr_warn("No space for new member %d.\n", srv_pool->member_id); + return -EINVAL; + } + my_srv_md = &pool->pool_md.srv_md[md_i]; + my_srv_md->member_id = srv_pool->member_id; + my_srv_md->store_state = atomic_read(&srv_pool->store_state); + my_srv_md->map_ver = srv_pool->pool->map_ver; + my_srv_md->srv_pool_state = atomic_read(&srv_pool->state); + pr_debug("Set srv_md[%d] it with the member_id %d.\n", md_i, srv_pool->member_id); + return 0; +} + +/** + * rmr_srv_flush_pool_md() - Write pool_md region to disk immediately + * + * @srv_pool: Server pool whose pool_md is to be flushed + * + * Description: + * Persist pool_md without waiting for the delayed work. + */ +void rmr_srv_flush_pool_md(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + void *buf; + int err; + + if (!atomic_read(&srv_pool->store_state) || !pool->mapped_size) + return; + + err = rmr_srv_md_update(srv_pool); + if (err) { + pr_warn("%s: failed to update pool_md before flush: 0x%x\n", __func__, err); + return; + } + + buf = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + if (!buf) + return; + + memcpy(buf, &pool->pool_md, sizeof(struct rmr_pool_md)); + err = process_md_io(pool, NULL, 0, RMR_MD_SIZE, RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: failed to flush pool_md: 0x%x at offset 0 len %lu\n", + __func__, err, RMR_MD_SIZE); + kfree(buf); +} + +/** + * rmr_srv_flush_last_io() - Write last_io region to disk + * + * @srv_pool: Server pool whose last_io is to be flushed + */ +static void rmr_srv_flush_last_io(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + u64 last_io_len = rmr_last_io_len(pool->pool_md.queue_depth); + void *buf; + int err; + + if (!last_io_len || !srv_pool->last_io) + return; + + buf = kzalloc(last_io_len, GFP_KERNEL); + if (!buf) + return; + + memcpy(srv_pool->last_io_idx, srv_pool->last_io, last_io_len); + memcpy(buf, srv_pool->last_io_idx, last_io_len); + + err = process_md_io(pool, NULL, RMR_MD_SIZE, last_io_len, + RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: failed to flush last_io: 0x%x at offset %lu len %llu\n", + __func__, err, RMR_MD_SIZE, last_io_len); + kfree(buf); +} + +/** + * rmr_srv_md_load_buf() - Load the server metadata from buffer to the server pool. + * + * Description: + * This function loads the server-side metadata from buffer to the pool. The buffer must be + * in the format of rmr pool metadata structure, which may contain updated srv_md of + * multiple servers. 
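+ *
+ * Layout assumed by this parser: the srv_md[] array of RMR_POOL_MAX_SESS
+ * slots starts at offset RMR_CLT_MD_SIZE within the rmr_pool_md buffer, so
+ * slot i is read at RMR_CLT_MD_SIZE + i * sizeof(struct rmr_srv_md).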
+ */ +static int rmr_srv_md_load_buf(struct rmr_pool *pool, void *buf) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_md *srv_md_buf; + u8 member_id = 0; + int err = 0, index, i; + bool ret = false; + + buf += (RMR_CLT_MD_SIZE - sizeof(struct rmr_srv_md)); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + buf += sizeof(struct rmr_srv_md); + srv_md_buf = (struct rmr_srv_md *)buf; + member_id = srv_md_buf->member_id; + /* skip updating the srv_md of this server pool */ + if (!member_id || member_id == srv_pool->member_id) + continue; + + index = rmr_pool_find_md(&pool->pool_md, member_id, true); + if (index < 0) { + pr_debug("%s: No space in the pool_md for new member %d\n", + __func__, member_id); + err = -EINVAL; + continue; + } + + pr_debug("Load srv_md[%d] with member_id %d\n", index, member_id); + memcpy(&pool->pool_md.srv_md[index], srv_md_buf, sizeof(struct rmr_srv_md)); + ret = true; + } + + if (!ret) { + pr_debug("No server metadata found in the buffer\n"); + err = -EINVAL; + } + + return err; +} + +/** + * rmr_srv_md_process_buf() - Load the metadata from buffer to the server pool. + * + * Description: + * This node loads the metadata from buffer to the server pool. + */ +int rmr_srv_md_process_buf(struct rmr_pool *pool, void *buf, bool sync) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_pool_md *buf_pool_md, *dest_md = &pool->pool_md; + int err = 0; + + srv_pool = (struct rmr_srv_pool *)pool->priv; + buf_pool_md = (struct rmr_pool_md *)buf; + if (!sync) { + /* Copy only the client-side header. */ + memcpy(dest_md, buf_pool_md, RMR_CLT_MD_SIZE); + } else { + err = rmr_srv_md_load_buf(pool, buf); + if (err) + pr_err("Failed to load md buf to pool %s\n", pool->poolname); + } + + return err; +} + +int rmr_srv_send_md_update(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_pool *sync_pool = srv_pool->clt; + struct rmr_msg_pool_cmd msg = {}; + int err = 0, buflen; + void *buf; + + /* Only normal-state server pools should send metadata updates. */ + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) + return -EINVAL; + + /* For a stg node A, is A->B alive? */ + if (!sync_pool) { + pr_debug("pool %s has no sync pool assigned. Cannot send md update commands.\n", + pool->poolname); + return -ENXIO; + } + + buf = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + buflen = RMR_MD_SIZE; + + rmr_clt_init_cmd(sync_pool, &msg); + msg.cmd_type = RMR_CMD_MD_SEND; + /* This node sends messages to start md_update. */ + msg.md_send_cmd.leader_id = srv_pool->member_id; + msg.md_send_cmd.src_mapped_size = pool->mapped_size; + + err = rmr_clt_send_cmd_with_data_all(sync_pool, &msg, buf, buflen); + if (err < 0) { + pr_debug("pool %s sends all sess RMR_CMD_MD_SEND failed\n", pool->poolname); + goto free_buf; + } + + /* + * keep the original slice of buffer if the corresponding send req failed. + * + * TODO: + * We need to use the err received from rmr_clt_send_cmd_with_data_all in this function, + * and match the sessions we are skipping. + * + * In general, the sessions_skipped == (RMR_POOL_MAX_SESS - (number_of_legs - 1 - err). + * If the above number does not match, then we abandon the buffers, and try again. + */ + err = rmr_srv_md_load_buf(pool, buf); + if (err) { + pr_debug("Failed to load md buf to pool %s\n", pool->poolname); + goto free_buf; + } + +free_buf: + kfree(buf); + return err; +} + +/** + * rmr_srv_refresh_md() - Refresh the metadata of the rmr pool. 
+ * + * @srv_pool: Server pool whose metadata to be find + * + * Description: + * Read the metadata of the rmr pool from the backing store. + * + * Return: + * True when reading the metadata succeeds in two cases. The first case is a successful read + * but no metadata found. The second case is it found metadata which contains the srv_md. + * False otherwise. + */ +int rmr_srv_refresh_md(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool_md *pool_md_page; + struct rmr_pool *pool = srv_pool->pool; + int index, ret; + u64 md_ver; + + pool_md_page = kzalloc(RMR_MD_SIZE, GFP_KERNEL); + if (!pool_md_page) + return -ENOMEM; + + if (rmr_srv_read_md(pool, NULL, 0, RMR_MD_SIZE, pool_md_page)) { + pr_err("%s: failed reading md of rmr\n", __func__); + goto free_md; + } + + pr_info("%s: Read md of pool %s from store with magic 0x%llx\n", + __func__, pool_md_page->poolname, pool_md_page->magic); + + if (pool_md_page->magic != RMR_POOL_MD_MAGIC) { + pr_info("%s: No valid md found on the store for pool %s\n", + __func__, pool->poolname); + ret = -EINVAL; + goto free_md; + } + + /* + * TODO: Should we sanity check other params also? + */ + if (pool_md_page->chunk_size != pool->chunk_size) { + pr_err("%s: chunk size mismatched. pool chunk size %u, md chunk size %u\n", + __func__, pool->chunk_size, pool_md_page->chunk_size); + goto free_md; + } + + /* Import the metadata to the states of the pool. */ + index = rmr_pool_find_md(pool_md_page, srv_pool->member_id, false); + if (index < 0) { + pr_info("%s: No md found for member_id %d\n", __func__, srv_pool->member_id); + ret = index; + goto free_md; + } + + if (pool_md_page->srv_md[index].mapped_size != pool->mapped_size) { + pr_err("%s: Mapped size mismatched. The srv pool %llu, md %llu\n", + __func__, pool->mapped_size, pool_md_page->mapped_size); + ret = -EINVAL; + goto free_md; + } + + md_ver = pool_md_page->srv_md[index].map_ver; + if (md_ver < pool->map_ver) + pr_err("The current map ver is %lld but the map ver on md is %lld.\n", + pool->map_ver, md_ver); + else + pool->map_ver = md_ver; + + pool->pool_md = *pool_md_page; + + ret = rmr_srv_load_last_io(srv_pool); + if (ret) { + pr_err("%s: failed to load last_io array to memory with err 0x%x\n", + __func__, ret); + goto zero_md; + } + + pr_info("%s: no_of_chunks %lld\n", __func__, pool->no_of_chunks); + ret = rmr_srv_refresh_md_maps(srv_pool); + if (ret) { + pr_err("%s: failed to load dirty bitmap to memory with err %pe\n", + __func__, ERR_PTR(ret)); + goto free_last_io; + } + goto free_md; + +free_last_io: + kfree(srv_pool->last_io_idx); + srv_pool->last_io_idx = NULL; +zero_md: + memset(&pool->pool_md, 0, sizeof(pool->pool_md)); +free_md: + kfree(pool_md_page); + return ret; +} + +/** + * rmr_srv_mark_maps_dirty() - Set MD_DIRTY_MAPS and schedule delayed sync + * + * @srv_pool: Server pool with changed maps + */ +void rmr_srv_mark_maps_dirty(struct rmr_srv_pool *srv_pool) +{ + set_bit(MD_DIRTY_MAPS, &srv_pool->md_dirty); + mod_delayed_work(srv_pool->md_sync_wq, &srv_pool->md_sync_dwork, + msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS)); +} + +/** + * rmr_srv_md_sync - sync dirty metadata regions of pool + * + * Description: + * Dirty-driven consumer: only flushes regions whose dirty bit is set. + * Producers set bits and schedule this work via mod_delayed_work(). + * Does NOT re-queue itself — the next dirty event will schedule it. 
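+ *
+ * Producer sketch (this is what rmr_srv_mark_maps_dirty() above does):
+ *	set_bit(MD_DIRTY_MAPS, &srv_pool->md_dirty);
+ *	mod_delayed_work(srv_pool->md_sync_wq, &srv_pool->md_sync_dwork,
+ *			 msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS));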
+ */ +void rmr_srv_md_sync(struct work_struct *work) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_pool *pool; + bool ret, did_work = false; + + srv_pool = container_of(to_delayed_work(work), struct rmr_srv_pool, md_sync_dwork); + if (!srv_pool->pool) + return; + + /* + * It could happen that access the pool while the pool is not there. Use reference counting + * for server pool to avoid the issue. + */ + ret = rmr_get_srv_pool(srv_pool); + if (!ret) { + pr_err("%s: pool is not there\n", __func__); + return; + } + + pool = srv_pool->pool; + + /* + * Update srv_md snapshot and notify peers whenever any region is dirty. + */ + if (!rmr_srv_md_update(srv_pool) && rmr_srv_send_md_update(pool)) + pr_debug("failed to send md update\n"); + + /* + * The io store is ready after the store is registered and the pool metadata is + * updated, if any. + */ + if (!atomic_read(&srv_pool->store_state) || !pool->mapped_size) + goto put_pool; + + /* + * On-disk layout of rmr pool metadata: + * + * 0 RMR_MD_SIZE +last_io_len +PAGE_SIZE + * +-----------+-------------+---------------+--------------------+ + * | pool_md | last_io | hdr_region | maps_region ... | + * +-----------+-------------+---------------+--------------------+ + * <-RMR_MD_SIZE><-last_io_len><--PAGE_SIZE--><-per_map slp pages-> + * + * pool->maps[0:maps_cnt] is always dense (no NULL gaps). + * + * This I/O covers pool_md + last_io. hdr_region and maps_region are + * written separately by rmr_srv_md_maps_sync(). + */ + if (test_and_clear_bit(MD_DIRTY_POOL, &srv_pool->md_dirty)) { + rmr_srv_flush_pool_md(srv_pool); + did_work = true; + } + + if (test_and_clear_bit(MD_DIRTY_LAST_IO, &srv_pool->md_dirty)) { + rmr_srv_flush_last_io(srv_pool); + did_work = true; + } + + if (test_and_clear_bit(MD_DIRTY_MAPS, &srv_pool->md_dirty)) { + rmr_srv_md_maps_sync(pool); + did_work = true; + } + + if (did_work) + pr_debug("%s: flushed dirty regions for server pool %u of %s\n", + __func__, srv_pool->member_id, pool->poolname); + +put_pool: + rmr_put_srv_pool(srv_pool); + /* Do NOT re-queue. Producers schedule us via mod_delayed_work. */ +} diff --git a/drivers/infiniband/ulp/rmr/rmr-srv.c b/drivers/infiniband/ulp/rmr/rmr-srv.c new file mode 100644 index 000000000000..66af29b90c53 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-srv.c @@ -0,0 +1,3306 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include + +#include "rmr-srv.h" +#include "rmr-req.h" +#include "rmr-clt.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(RMR_VER_STRING); +MODULE_DESCRIPTION("RMR Server"); +MODULE_LICENSE("GPL"); + +static struct rtrs_srv_ctx *rtrs_ctx; +struct kmem_cache *rmr_req_cachep; + +static LIST_HEAD(g_sess_list); +static DEFINE_MUTEX(g_sess_lock); + +#define MIN_CHUNK_SIZE (128 << 10) +#define MAX_CHUNK_SIZE (1024 << 10) +#define DEFAULT_CHUNK_SIZE MIN_CHUNK_SIZE + +static int __read_mostly chunk_size = DEFAULT_CHUNK_SIZE; + +module_param_named(chunk_size, chunk_size, uint, 0444); +MODULE_PARM_DESC(chunk_size, + "Unit size which is tracked for being dirty. 
(default: " + /* cppcheck-suppress unknownMacro */ + __stringify(DEFAULT_CHUNK_SIZE) "KB)"); + +static int __read_mostly sync_queue_depth = DEFAULT_SYNC_QUEUE_DEPTH; + +module_param_named(sync_queue_depth, sync_queue_depth, uint, 0644); +MODULE_PARM_DESC(sync_queue_depth, + "Max in-flight sync requests per pool (default: " + __stringify(DEFAULT_SYNC_QUEUE_DEPTH) ")"); + +bool rmr_get_srv_pool(struct rmr_srv_pool *srv_pool) +{ + pr_debug("pool %s, before inc refcount %d\n", + srv_pool->pool->poolname, refcount_read(&srv_pool->refcount)); + return refcount_inc_not_zero(&srv_pool->refcount); +} + +static struct rmr_srv_pool *rmr_find_and_get_srv_pool(u32 group_id) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + mutex_lock(&pool_mutex); + pool = rmr_find_pool_by_group_id(group_id); + if (!pool) { + mutex_unlock(&pool_mutex); + return ERR_PTR(-ENOENT); + } + + srv_pool = (struct rmr_srv_pool *)pool->priv; + if (!rmr_get_srv_pool(srv_pool)) { + mutex_unlock(&pool_mutex); + return ERR_PTR(-EINVAL); + } + mutex_unlock(&pool_mutex); + + return srv_pool; +} + +void rmr_put_srv_pool(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *pool = srv_pool->pool; + + might_sleep(); + + pr_debug("pool %s, before dec refcnt %d\n", + (pool ? pool->poolname : "(empty)"), refcount_read(&srv_pool->refcount)); + if (refcount_dec_and_test(&srv_pool->refcount)) { + mutex_destroy(&srv_pool->srv_pool_lock); + + if (srv_pool->clt) + rmr_clt_close(srv_pool->clt); + + kfree(srv_pool->last_io); + srv_pool->last_io = NULL; + kfree(srv_pool->last_io_idx); + srv_pool->last_io_idx = NULL; + + if (pool) { + pr_info("srv: destroy pool %s\n", pool->poolname); + free_pool(pool); + } + + cancel_delayed_work_sync(&srv_pool->md_sync_dwork); + destroy_workqueue(srv_pool->md_sync_wq); + + cancel_delayed_work_sync(&srv_pool->clean_dwork); + destroy_workqueue(srv_pool->clean_wq); + + kfree(srv_pool); + } +} + +static const char *rmr_get_srv_pool_state_name(enum rmr_srv_pool_state state) +{ + switch (state) { + case RMR_SRV_POOL_STATE_EMPTY: return "RMR_SRV_POOL_STATE_EMPTY"; + case RMR_SRV_POOL_STATE_REGISTERED: return "RMR_SRV_POOL_STATE_REGISTERED"; + case RMR_SRV_POOL_STATE_CREATED: return "RMR_SRV_POOL_STATE_CREATED"; + case RMR_SRV_POOL_STATE_NORMAL: return "RMR_SRV_POOL_STATE_NORMAL"; + case RMR_SRV_POOL_STATE_NO_IO: return "RMR_SRV_POOL_STATE_NO_IO"; + + default: return "Unknown state"; + } +} + +/** + * rmr_srv_change_pool_state() - Change srv pool state + * + * @srv_pool: Server pool whose state is to be changed + * @new_state: State to which the transition is to be made + * + * Return: + * old state on succes + * negative error code on failure + * + * Description: + * This function controls the state transitions for rmr-srv pool state. + * Every state transition is controlled by this except to NORMAL. + * Function rmr_srv_set_pool_state_normal handles transition to state NORMAL. 
+ * "always-invalid" state transitions are checked and prevented here + * Case dependent valid/invalid state transition, should be handled by caller + */ +static inline int rmr_srv_change_pool_state(struct rmr_srv_pool *srv_pool, + enum rmr_srv_pool_state new_state) +{ + enum rmr_srv_pool_state old_state = atomic_read(&srv_pool->state); + int cmp_state; + + WARN_ON(new_state == RMR_SRV_POOL_STATE_NORMAL); + + if (old_state == new_state) + return old_state; + + pr_info("%s: Old state %s, Requested state %s\n", + __func__, rmr_get_srv_pool_state_name(old_state), + rmr_get_srv_pool_state_name(new_state)); + + switch (new_state) { + case RMR_SRV_POOL_STATE_NO_IO: + /* + * NO_IO can be reached from REGISTERED, CREATED, or NORMAL. + * EMPTY -> NO_IO is illegal: a pool with no store cannot have + * active sessions that fail. + */ + if (WARN_ON(old_state == RMR_SRV_POOL_STATE_EMPTY)) + goto err; + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_NO_IO); + break; + case RMR_SRV_POOL_STATE_EMPTY: + /* + * EMPTY is reached from REGISTERED (store unregistered, no + * sessions) or from NO_IO (last session left, no store). A + * direct jump from CREATED or NORMAL is illegal — those states + * must pass through NO_IO first. + */ + if (WARN_ON(old_state == RMR_SRV_POOL_STATE_CREATED || + old_state == RMR_SRV_POOL_STATE_NORMAL)) + goto err; + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_EMPTY); + break; + case RMR_SRV_POOL_STATE_REGISTERED: + /* + * REGISTERED is entered from EMPTY (store just registered, no + * sessions) or from NO_IO (last session left, store still + * present). A direct jump from CREATED or NORMAL is illegal — + * those states must pass through NO_IO first. + */ + if (WARN_ON(old_state == RMR_SRV_POOL_STATE_CREATED || + old_state == RMR_SRV_POOL_STATE_NORMAL)) + goto err; + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_REGISTERED); + + break; + case RMR_SRV_POOL_STATE_CREATED: + /* + * CREATED is entered only from REGISTERED, when the first + * non-sync create-mode join arrives. Any other predecessor + * is illegal. + */ + cmp_state = RMR_SRV_POOL_STATE_REGISTERED; + if (atomic_try_cmpxchg(&srv_pool->state, &cmp_state, RMR_SRV_POOL_STATE_CREATED)) + goto out; + WARN_ON(1); + goto err; + default: + pr_err("%s: Unknown state %d\n", __func__, new_state); + goto err; + } + +out: + rmr_srv_mark_pool_md_dirty(srv_pool); + return old_state; + +err: + pr_err("%s: Failed. Old state %s, Requested state %s\n", + __func__, rmr_get_srv_pool_state_name(old_state), + rmr_get_srv_pool_state_name(new_state)); + return -EINVAL; +} + +/** + * rmr_srv_set_pool_state_normal() - Change srv pool state to NORMAL + * + * @srv_pool: Server pool whose state is to be changed to NORMAL + * + * Return: + * old state on succes + * negative error code on failure + * + * Description: + * This function controls the state transitions for rmr-srv pool state to NORMAL + * "always-invalid" state transitions are checked and prevented here + * Case dependent valid/invalid state transition, should be handled by caller + */ +static int rmr_srv_set_pool_state_normal(struct rmr_srv_pool *srv_pool) +{ + int old_state; + + mutex_lock(&srv_pool->srv_pool_lock); + old_state = atomic_read(&srv_pool->state); + + pr_info("%s: Old state %s\n", __func__, + rmr_get_srv_pool_state_name(old_state)); + + if (old_state == RMR_SRV_POOL_STATE_NORMAL) + goto out; + + /* + * CREATED -> NORMAL: normal enable on a newly created pool. + * NO_IO -> NORMAL: map update completed, pool can serve IOs again. + * Any other predecessor is illegal. 
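+	 *
+	 * Rough overall picture (a summary of the checks here and in
+	 * rmr_srv_change_pool_state(), not an exhaustive list):
+	 *   EMPTY -> REGISTERED -> CREATED -> NORMAL <-> NO_IO
+	 *   NO_IO -> REGISTERED or EMPTY once the last session leaves.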
+ */ + if (WARN_ON(old_state != RMR_SRV_POOL_STATE_CREATED && + old_state != RMR_SRV_POOL_STATE_NO_IO)) { + old_state = -EINVAL; + goto out; + } + + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_NORMAL); + rmr_srv_mark_pool_md_dirty(srv_pool); + pr_info("%s: Server pool state changed to NORMAL\n", __func__); + +out: + mutex_unlock(&srv_pool->srv_pool_lock); + + return old_state; +} + +/** + * rmr_srv_clear_map() - clear the dirty map if other pool member completely synced it + * + * @pool: rmr pool that holds the maps to clean + * @member_id: pool member id for which map is reported as clean + * + * Description: + * If other pool member responded that he finished syncing his data, then we can + * clear his map replicated to this nodes, in case of some clear commands were + * lost or failed. + * + * Return: + * no + * + * Context: + * This function can wait on spin_lock if the deleted entry should be inserted back + * + * Locks: + * no + */ +static void rmr_srv_clear_map(struct rmr_pool *pool, u8 member_id) +{ + // TODO: this looks like rmr_pool_map_remove_entries, can we do something about this? + // I was not able to merge them, but it would be nice. + struct rmr_dirty_id_map *map = NULL; + rmr_id_t id; + int i, lock_idx; + + pr_debug("pool %s clear map entries for member_id=%u\n", + pool->poolname, member_id); + + lock_idx = srcu_read_lock(&pool->map_srcu); + map = rmr_pool_find_map(pool, member_id); + if (!map) { + pr_err("for pool %s cannot find map for member id %u\n", pool->poolname, member_id); + goto unlock; + } + + /* if the map state changed since we send our CHECK_MAP command, it means that + * some entries were added and the map is not clean and we should not wipe it. + * rsp of CHECK_MAP cmd can be outdated a little so we do not trust it then. + */ + if (atomic_read(&map->check_state) != RMR_MAP_STATE_CHECKING) + pr_debug("map for member_id=%u cannot be cleared now, state changed\n", + map->member_id); + + for (i = 0; i < map->no_of_chunks; i++) { + id.a = 1; + id.b = i; + + rmr_map_unset_dirty(map, id, MAP_NO_FILTER); + + /* If the state changed since the last check then it is possible that after + * clear_bit of RMR_MAP_STATE_CHECK_CLEAR in the rmr_req_check_map we called + * rmr_map_insert. There we check that entry is already in the map and leave + * the function. But the following erease here would delete it. So we return + * erased entry back to the table if the state of checking changed. + */ + if (atomic_read(&map->check_state) != RMR_MAP_STATE_CHECKING) { + pr_debug("map for member_id=%u cannot be cleared now, state changed\n", + map->member_id); + + rmr_map_set_dirty(map, id, 0); + goto unlock; + } + } + pr_debug("clear map entries for member_id=%u is done\n", member_id); +unlock: + srcu_read_unlock(&pool->map_srcu, lock_idx); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); +} + +/** + * rmr_srv_check_map_clear() - periodic work that checks if the other node finished sync + * + * @work: delayed work structure to start and repeat the work + * + * Description: + * Check the dirty maps of all of the other pool members. If any of the maps is dirty + * then send check command and if the pool member responds that it has cleared his map, + * then we should clear it locally. When checking is done reschedule itself again. + * + * Return: + * no + * + * Context: + * runs in the process context. 
+ * + * Locks: + * no + */ +static void rmr_srv_check_map_clear(struct work_struct *work) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_pool *pool; + int i, lock_idx; + + srv_pool = container_of(to_delayed_work(work), struct rmr_srv_pool, clean_dwork); + + if (!srv_pool->pool) { + pr_debug("no rmr pool assigend to srv_pool yet.\n"); + goto out; + } + + pool = srv_pool->pool; + pr_debug("check map for srv pool %s started...\n", pool->poolname); + + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_debug("srv pool %s is not in normal state, skip map clear check", + pool->poolname); + goto out; + } + + if (!srv_pool->clt) { + pr_debug("srv pool %s does not have sync pool assigned, skip map clear check\n", + pool->poolname); + goto out; + } + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; i < pool->maps_cnt; i++) { + struct rmr_dirty_id_map *map; + u8 member_id; + int ret; + + map = rcu_dereference(pool->maps[i]); + if (WARN_ON(!map)) + break; + + member_id = map->member_id; + if (member_id == srv_pool->member_id) { + pr_debug("srv pool %s skip checking map with id %u, since it is me.\n", + pool->poolname, member_id); + continue; + } + + if (rmr_map_empty(map)) { + pr_debug("srv pool %s map for member_id=%u is empty, no need to check\n", + pool->poolname, map->member_id); + continue; + } + + atomic_set(&map->check_state, RMR_MAP_STATE_CHECKING); + + ret = rmr_clt_pool_member_synced(srv_pool->clt, member_id); + if (ret < 0) { + pr_debug("pool %s failed to check if member_id=%u synced, ret %d\n", + pool->poolname, member_id, ret); + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + continue; + } + + pr_debug("pool %s check if pool member %u synced, reported %u\n\n", + pool->poolname, member_id, ret); + if (ret) + rmr_srv_clear_map(pool, member_id); + + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + pr_debug("check map for pool %s done. 
schedule next one.\n", pool->poolname); + +out: + queue_delayed_work(srv_pool->clean_wq, &srv_pool->clean_dwork, + msecs_to_jiffies(RMR_SRV_CHECK_MAPS_INTERVAL_MS)); +} + +struct rmr_srv_pool *rmr_create_srv_pool(char *poolname, u32 member_id) +{ + struct rmr_srv_pool *srv_pool; + srv_pool = kzalloc(sizeof(struct rmr_srv_pool), GFP_KERNEL); + if (unlikely(!srv_pool)) + return ERR_PTR(-ENOMEM); + + atomic_set(&srv_pool->state, RMR_SRV_POOL_STATE_EMPTY); + srv_pool->maintenance_mode = false; + refcount_set(&srv_pool->refcount, 1); + mutex_init(&srv_pool->srv_pool_lock); + + atomic_set(&srv_pool->store_state, false); + + srv_pool->member_id = member_id; + srv_pool->max_sync_io_size = U32_MAX; + + /* Sync thread */ + srv_pool->th_tsk = NULL; + atomic_set(&srv_pool->thread_state, SYNC_THREAD_STOPPED); + atomic_set(&srv_pool->in_flight_sync_reqs, 0); + + /* clean outdated entries from the map work */ + srv_pool->clean_wq = alloc_workqueue("%s_clean_wq", 0, 0, poolname); + if (!srv_pool->clean_wq) { + kfree(srv_pool); + pr_err("failed to create wq pool %s\n", poolname); + return ERR_PTR(-ENOMEM); + } + INIT_DELAYED_WORK(&srv_pool->clean_dwork, rmr_srv_check_map_clear); + queue_delayed_work(srv_pool->clean_wq, &srv_pool->clean_dwork, + msecs_to_jiffies(RMR_SRV_CHECK_MAPS_INTERVAL_MS)); + + /* sync metadata of the rmr pool */ + srv_pool->md_sync_wq = alloc_workqueue("%s_md_sync_wq", 0, 0, poolname); + if (!srv_pool->md_sync_wq) { + kfree(srv_pool); + pr_err("failed to create md_sync_wq pool %s\n", poolname); + return ERR_PTR(-ENOMEM); + } + + INIT_DELAYED_WORK(&srv_pool->md_sync_dwork, rmr_srv_md_sync); + /* No initial queue — first dirty event will schedule the work. */ + return srv_pool; +} + +void rmr_srv_pool_update_params(struct rmr_pool *pool) +{ + pr_info("%s: Setting chunk_size for pool %s to %d", + __func__, pool->poolname, chunk_size); + pool->chunk_size = chunk_size; + pool->chunk_size_shift = ilog2(chunk_size); +} + +static struct rmr_pool *rmr_srv_sess_get_pool(struct rmr_srv_sess *srv_sess, u32 group_id) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + bool ret; + + rcu_read_lock(); + pool = xa_load(&srv_sess->pools, group_id); + if (!pool) { + pool = ERR_PTR(-ENXIO); + goto out; + } + + srv_pool = (struct rmr_srv_pool *)pool->priv; + ret = rmr_get_srv_pool(srv_pool); + if (!ret) + pool = ERR_PTR(-ENXIO); + +out: + rcu_read_unlock(); + return pool; +} + +static void rmr_srv_sess_put_pool(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + rmr_put_srv_pool(srv_pool); +} + +/** + * rmr_srv_endreq() - Function called when an rmr server request finishes processing + * + * @req: Pointer to the request ending + * @err: Error value. 
Would be 0 for a successful request + */ +void rmr_srv_endreq(struct rmr_srv_req *req, int err) +{ + struct rmr_srv_pool *srv_pool = req->srv_pool; + struct rmr_pool *pool = srv_pool->pool; + struct rtrs_srv_op *rtrs_op = req->rtrs_op; + struct rmr_dirty_id_map *map; + int i; + + if (req->flags == RMR_OP_MD_WRITE || req->flags == RMR_OP_MD_READ) { + if (unlikely(err)) + pr_err("Failed to complete the md req %x\n", req->flags); + goto put_ref; + } else if (unlikely(err) && !req->sync) { + struct rmr_srv_pool *srv_pool = req->srv_pool; + + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + } else if (rmr_op(req->flags) == RMR_OP_WRITE) { + srv_pool->last_io[req->mem_id].a = req->id.a; + srv_pool->last_io[req->mem_id].b = req->id.b; + + if (!test_and_set_bit(MD_DIRTY_LAST_IO, &srv_pool->md_dirty)) { + mod_delayed_work(srv_pool->md_sync_wq, + &srv_pool->md_sync_dwork, + msecs_to_jiffies(RMR_SRV_MD_SYNC_INTERVAL_MS)); + } + + for (i = 0; i < req->failed_cnt; i++) { + int err; + + map = rmr_pool_find_map(srv_pool->pool, req->failed_srv_id[i]); + if (!map) { + pr_err("Cannot find map for srv_id %u\n", req->failed_srv_id[i]); + err = -EINVAL; + goto out; + } + + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + rmr_map_set_dirty(map, req->id, 0); + + if (req->map_ver > srv_pool->pool->map_ver) + srv_pool->pool->map_ver = req->map_ver; + } + if (req->failed_cnt) { + rmr_srv_mark_pool_md_dirty(srv_pool); + rmr_srv_mark_maps_dirty(srv_pool); + } + } + +out: + /* The requests created by rmr-srv don't use rtrs_op. */ + rtrs_srv_resp_rdma(rtrs_op, err); + rmr_srv_sess_put_pool(req->srv_pool->pool); +put_ref: + percpu_ref_put(&pool->ids_inflight_ref); +} + +static void rmr_srv_stop_sync_and_unset_store(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + atomic_set(&srv_pool->store_state, false); + + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_REQ_STOP); + wake_up_process(srv_pool->th_tsk); + + while (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(1000)); + } + } +} + +static void rmr_srv_delete_store_member(struct rmr_pool *pool, unsigned long id) +{ + rmr_pool_remove_map(pool, id); + xa_erase(&pool->stg_members, id); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); +} + +/** + * rmr_srv_add_store_member() - Register a storage member and create its dirty map + * + * @pool: The pool to which the member belongs. + * @id: Member ID of the storage node to add. + * + * Records @id in pool->stg_members and allocates a dirty map for it. + * On failure the stg_members entry is removed before returning. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_add_store_member(struct rmr_pool *pool, unsigned long id) +{ + struct rmr_dirty_id_map *map; + int ret; + + map = rmr_pool_find_map(pool, id); + if (map) { + pr_err("%s: pool %s, member_id %lu map already exists\n", + __func__, pool->poolname, id); + return -EEXIST; + } + + ret = xa_err(xa_store(&pool->stg_members, id, XA_TRUE, GFP_KERNEL)); + if (ret) { + pr_err("%s: Failed to add storage member %lu: %d\n", + __func__, id, ret); + return ret; + } + + /* + * Create the map of the newly added member. 
+ */ + map = rmr_map_create(pool, id); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + pr_err("%s: pool %s, member_id %lu failed to create map on err %d: %pe\n", + __func__, pool->poolname, id, ret, map); + goto rem_store; + } + return 0; + +rem_store: + xa_erase(&pool->stg_members, id); + return ret; +} + +/** + * rmr_srv_handle_other_member_add() - Handle a POOL_INFO ADD message for a different member + * + * @srv_pool: The server pool receiving the notification. + * @pool_info_cmd: The received POOL_INFO command carrying member_id, mode, and dirty. + * + * For %RMR_POOL_INFO_MODE_ASSEMBLE, verifies that the member and its dirty map + * already exist (the node is rejoining a pool it was previously part of). + * For %RMR_POOL_INFO_MODE_CREATE, adds the member via rmr_srv_add_store_member() + * and optionally marks its map fully dirty if the client reported outstanding data. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_handle_other_member_add(struct rmr_srv_pool *srv_pool, + const struct rmr_msg_pool_info_cmd *pool_info_cmd) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_dirty_id_map *map; + int ret; + + if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_ASSEMBLE) { + pr_info("%s: Member %u got add of member %u with mode assemble\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + + /* + * For assemble, member info should already exist. + */ + if (xa_load(&pool->stg_members, pool_info_cmd->member_id) != XA_TRUE) { + pr_err("%s: pool %s, member_id %u not present\n", + __func__, pool->poolname, pool_info_cmd->member_id); + return -ENOENT; + } + + map = rmr_pool_find_map(pool, pool_info_cmd->member_id); + if (!map) { + pr_err("%s: pool %s, member_id %u, map not present\n", + __func__, pool->poolname, pool_info_cmd->member_id); + return -ENOENT; + } + } else if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_CREATE && + pool_info_cmd->member_id != srv_pool->member_id) { + pr_info("%s: Member %u got add of member %u with mode create\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + + ret = rmr_srv_add_store_member(pool, pool_info_cmd->member_id); + if (ret) { + pr_err("%s: rmr_srv_add_store_member failed %d\n", __func__, ret); + return ret; + } + + if (pool_info_cmd->dirty) { + map = rmr_pool_find_map(pool, pool_info_cmd->member_id); + if (WARN_ON(!map)) { + xa_erase(&pool->stg_members, pool_info_cmd->member_id); + return -EINVAL; + } + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + } + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + } else { + pr_err("%s: pool %s, member_id %u, unexpected mode %u for ADD operation\n", + __func__, pool->poolname, pool_info_cmd->member_id, + pool_info_cmd->mode); + return -EINVAL; + } + + return 0; +} + +int rmr_srv_query(struct rmr_pool *pool, u64 mapped_size, struct rmr_attrs *attr) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_dirty_id_map *map; + size_t queue_depth; + + if (pool) { + srv_pool = (struct rmr_srv_pool *)pool->priv; + queue_depth = srv_pool->queue_depth; + } else { + /* + * If pool is NULL, we are being called to estimate the md size + * before the pool is created. Use max queue depth in that case. + */ + queue_depth = RMR_SRV_MAX_QDEPTH; + } + + /* + * Dummy map structure, so that we can reuse the update map param function. 
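+	 *
+	 * The size reported below (in sectors) is what the code computes:
+	 *   (total_slp * PAGE_SIZE * RMR_POOL_MAX_SESS + RMR_MD_SIZE +
+	 *    queue_depth * sizeof(*srv_pool->last_io_idx)) / SECTOR_SIZE
+	 * with no_of_chunks derived from mapped_size (sectors) and
+	 * chunk_size (bytes).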
+ */ + map = (struct rmr_dirty_id_map *)get_zeroed_page(GFP_KERNEL); + if (!map) { + pr_err("%s: Cannot allocate map\n", __func__); + return -ENOMEM; + } + + map->no_of_chunks = (mapped_size >> (ilog2(chunk_size) - 9)); + rmr_map_update_page_params(map); + + attr->rmr_md_size = (map->total_slp * PAGE_SIZE * RMR_POOL_MAX_SESS) + RMR_MD_SIZE; + attr->rmr_md_size += (queue_depth * sizeof(*srv_pool->last_io_idx)); + + attr->rmr_md_size = attr->rmr_md_size / SECTOR_SIZE; + + free_page((unsigned long)map); + return 0; +} +EXPORT_SYMBOL(rmr_srv_query); + +/** + * rmr_srv_set_map() - Create the dirty map for this server's member in the pool + * + * @pool: The pool for which the map is to be created. + * @mode: Registration mode; if %RMR_SRV_DISK_REPLACE, any existing map for + * this member is removed before creating the new one. + * + * Description: + * Invoked after the mapped size of the pool has been validated. Updates + * pool metadata with the mapped size, recalculates the chunk count, and + * calls rmr_srv_add_store_member() to register this node's map. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_set_map(struct rmr_pool *pool, enum rmr_srv_register_disk_mode mode) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + int ret, md_i; + + pr_info("%s: Mapped size of the pool %s is set to %lld\n", + __func__, pool->poolname, pool->mapped_size); + + /* Update mapped_size in the pool metadata. */ + md_i = rmr_pool_find_md(&pool->pool_md, srv_pool->member_id, true); + if (md_i < 0) { + pr_err("No space for new member %d.\n", srv_pool->member_id); + return -ENOMEM; + } + pool->pool_md.srv_md[md_i].mapped_size = pool->mapped_size; + + /* + * The existing map is irrelevant if user asked for store REPLACE. + */ + if (mode == RMR_SRV_DISK_REPLACE) + rmr_pool_remove_map(pool, srv_pool->member_id); + + ret = rmr_srv_add_store_member(pool, srv_pool->member_id); + if (ret) { + pr_err("%s: rmr_srv_add_store_member failed %d\n", __func__, ret); + goto err_out; + } + + return ret; + +err_out: + pool->pool_md.srv_md[md_i].mapped_size = 0; + return ret; +} + +/** + * rmr_srv_register() - Register a backend store with an RMR server pool + * + * @poolname: Name of the pool to which the store is to be registered. + * @ops: Store operations pointer. + * @priv: Private data for the store. + * @mapped_size: Size of the storage device in sectors. + * @mode: Registration mode: %RMR_SRV_DISK_CREATE for a new store, + * %RMR_SRV_DISK_REPLACE to replace an existing one, or + * %RMR_SRV_DISK_ADD to rejoin an existing pool. + * + * Description: + * An RMR server pool requires a backend store to service I/Os. + * This function registers that store, sets up the pool's dirty map for + * this member, and records the marked_create flag for validation when + * the first client joins. + * + * Return: + * Pointer to the rmr_pool on success, NULL on error. 
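+ *
+ * Usage sketch from an upper-layer store (every name except
+ * rmr_srv_register() and RMR_SRV_DISK_CREATE is hypothetical):
+ *	pool = rmr_srv_register("pool0", &my_store_ops, my_priv,
+ *				nr_sectors, RMR_SRV_DISK_CREATE);
+ *	if (!pool)
+ *		return -ENODEV;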
+ */ +static bool rmr_srv_pool_has_non_sync_sess(struct rmr_pool *pool) +{ + struct rmr_srv_pool_sess *pool_sess; + + list_for_each_entry(pool_sess, &pool->sess_list, pool_entry) { + if (!pool_sess->sync) + return true; + } + return false; +} + +struct rmr_pool *rmr_srv_register(char *poolname, struct rmr_srv_store_ops *ops, void *priv, + u64 mapped_size, enum rmr_srv_register_disk_mode mode) +{ + struct rmr_pool *pool; + struct rmr_srv_io_store *io_store; + struct rmr_srv_pool *srv_pool; + u32 group_id = rmr_pool_hash(poolname); + enum rmr_srv_pool_state state; + int ret; + + srv_pool = rmr_find_and_get_srv_pool(group_id); + if (IS_ERR(srv_pool)) { + pr_err("pool %s does not exists: %pe\n", poolname, srv_pool); + return NULL; + } + pool = srv_pool->pool; + + mutex_lock(&srv_pool->srv_pool_lock); + if (mode == RMR_SRV_DISK_CREATE && + (rmr_srv_pool_has_non_sync_sess(pool) || + rmr_pool_find_map(pool, srv_pool->member_id))) { + pr_err("%s: Cannot register (create) new backend for %s; Sessions/Map exists\n", + __func__, poolname); + ret = -EEXIST; + goto put_err; + } + + if (mode == RMR_SRV_DISK_REPLACE && + (!rmr_srv_pool_has_non_sync_sess(pool))) { + pr_err("%s: Cannot register (replace) new backend for %s; No non-sync session\n", + __func__, poolname); + ret = -EINVAL; + goto put_err; + } + + if (srv_pool->io_store) { + pr_err("Srv pool %s already has store registered\n", poolname); + goto put_err; + } + + if (pool->mapped_size && pool->mapped_size != mapped_size) { + pr_err("Pool %s already has mapped size %lld, cannot register store with %lld\n", + poolname, pool->mapped_size, mapped_size); + ret = -EINVAL; + goto put_err; + } + + io_store = kzalloc(sizeof(*io_store), GFP_KERNEL); + if (!io_store) { + pr_err("Failed to allocate io_store for %s\n", poolname); + goto put_err; + } + + pool->mapped_size = mapped_size; + io_store->ops = ops; + io_store->priv = priv; + srv_pool->io_store = io_store; + + /* The pool updates its number of tracking chunks with the mapped size just provided. */ + rmr_pool_update_no_of_chunk(pool); + + if (mode == RMR_SRV_DISK_CREATE || mode == RMR_SRV_DISK_REPLACE) { + ret = rmr_srv_set_map(pool, mode); + if (ret) { + pr_err("%s: failed to set maps in rmr pool %s, err %d\n", + __func__, poolname, ret); + goto free_io_store; + } + } else if (mode == RMR_SRV_DISK_ADD) { + /* + * Read the pool metadata stored on this device before md_sync writes + * new metadata to the store. 
+ */ + ret = rmr_srv_refresh_md(srv_pool); + if (ret) { + pr_err("%s: cannot refresh md of the pool\n", __func__); + goto free_io_store; + } + } else { + pr_err("%s: Wrong register disk mode %d\n", __func__, mode); + ret = -EINVAL; + goto free_io_store; + } + + srv_pool->marked_create = (mode == RMR_SRV_DISK_CREATE); + atomic_set(&srv_pool->store_state, true); + rmr_srv_mark_pool_md_dirty(srv_pool); + state = atomic_read(&srv_pool->state); + if (state != RMR_SRV_POOL_STATE_NORMAL && + state != RMR_SRV_POOL_STATE_NO_IO) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_REGISTERED); + mutex_unlock(&srv_pool->srv_pool_lock); + + __module_get(THIS_MODULE); + pr_info("Registered store with pool %s\n", poolname); + + return srv_pool->pool; + +free_io_store: + kfree(io_store); + srv_pool->io_store = NULL; +put_err: + mutex_unlock(&srv_pool->srv_pool_lock); + rmr_put_srv_pool(srv_pool); + return NULL; +} +EXPORT_SYMBOL(rmr_srv_register); + +static void rmr_srv_delete_md(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map = NULL; + int err, lock_idx; + u32 map_region_offset = rmr_bitmap_offset(pool->pool_md.queue_depth) + RMR_MAP_BUF_HDR_SIZE; + u64 per_map_size = 0; + u64 len; + u8 map_idx; + void *buf; + + /* + * It could happen to access the pool while the pool is not there. Use reference counting + * for server pool to avoid the issue. + */ + err = rmr_get_srv_pool(srv_pool); + if (!err) { + pr_err("%s: pool is not there\n", __func__); + return; + } + + len = rmr_bitmap_offset(pool->pool_md.queue_depth) + PAGE_SIZE; + buf = kzalloc(len, GFP_KERNEL); + if (!buf) + goto put_pool; + + /* + * On-disk layout of rmr pool metadata: + * + * 0 RMR_MD_SIZE +last_io_len +PAGE_SIZE + * +-----------+-------------+---------------+--------------------+ + * | pool_md | last_io | hdr_region | maps_region ... | + * +-----------+-------------+---------------+--------------------+ + * <-RMR_MD_SIZE><-last_io_len><--PAGE_SIZE--> maps_cnt * per_map + */ + err = process_md_io(pool, NULL, 0, len, RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: failed to process md write io with err 0x%x.\n", __func__, err); + + /* + * Zero the bitmap on disk using O(1) offset formula. 
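+	 *
+	 * Same addressing as rmr_srv_md_maps_sync():
+	 *   map_data_offset = map_region_offset + map_idx * per_map_size
+	 * but here a zeroed PAGE_SIZE buffer is written to every slp slot.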
+ */ + lock_idx = srcu_read_lock(&pool->map_srcu); + for (map_idx = 0; map_idx < pool->maps_cnt; map_idx++) { + u32 map_data_offset; + el_flp *flp_ptr; + u64 no_of_slps; + int i, j; + + map = rcu_dereference(pool->maps[map_idx]); + if (WARN_ON(!map)) + break; + + per_map_size = map->total_slp * PAGE_SIZE; + map_data_offset = map_region_offset + map_idx * per_map_size; + + for (i = 0; i < map->no_of_flp; i++) { + flp_ptr = (el_flp *)map->dirty_bitmap[i]; + + if (i == (map->no_of_flp - 1)) + no_of_slps = map->no_of_slp_in_last_flp; + else + no_of_slps = NO_OF_SLP_PER_FLP; + + for (j = 0; j < no_of_slps; j++, flp_ptr++) { + err = process_md_io(pool, NULL, map_data_offset, + PAGE_SIZE, RMR_OP_MD_WRITE, buf); + if (err) + pr_warn("%s: bitmap write failed at 0x%x, err 0x%x.\n", + __func__, map_data_offset, err); + map_data_offset += PAGE_SIZE; + } + } + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + rmr_srv_delete_store_member(pool, srv_pool->member_id); + + free_page((unsigned long)buf); +put_pool: + rmr_put_srv_pool(srv_pool); +} + +/** + * rmr_srv_unregister() - Unregister the backend store from rmr server pool + * + * @poolname: Name of the pool from which the store is to be unregistered + * @delete: If true, delete all the metadata associated with this pool + * + * Description: + * rmr server pool needs a backend store which serves the IOs + * This function is used to unregister a backend store from rmr server pool. + * + * Return: + * None + */ +void rmr_srv_unregister(char *poolname, bool delete) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + struct rmr_srv_io_store *io_store; + + mutex_lock(&pool_mutex); + pool = rmr_find_pool(poolname); + mutex_unlock(&pool_mutex); + + if (!pool) { + pr_err("%s, Pool %s does not exists\n", __func__, poolname); + return; + } + + srv_pool = (struct rmr_srv_pool *)pool->priv; + mutex_lock(&srv_pool->srv_pool_lock); + + if (!srv_pool->io_store) { + pr_err("Srv pool %s not registered\n", poolname); + mutex_unlock(&srv_pool->srv_pool_lock); + return; + } + + if (srv_pool->marked_delete) { + if (!delete) { + pr_err("%s: Storage server marked for delete, but delete mode not set\n", + __func__); + pr_err("%s: Continuing with only removal", __func__); + } + } else if (!srv_pool->marked_create && delete) { + pr_err("%s: Storage server not marked for delete, abandoning delete.\n", __func__); + delete = false; + } + + io_store = srv_pool->io_store; + + rmr_srv_stop_sync_and_unset_store(pool); + + percpu_ref_kill_and_confirm(&pool->ids_inflight_ref, rmr_pool_confirm_inflight_ref); + wait_for_completion(&pool->complete_done); + wait_for_completion(&pool->confirm_done); + + /* + * Re-init so metadata IO can go in if needed + */ + reinit_completion(&pool->complete_done); + reinit_completion(&pool->confirm_done); + percpu_ref_reinit(&pool->ids_inflight_ref); + + if (delete) + rmr_srv_delete_md(pool); + + kfree(srv_pool->io_store); + srv_pool->io_store = NULL; + + mutex_lock(&pool->sess_lock); + if (!rmr_srv_pool_has_non_sync_sess(pool)) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_EMPTY); + mutex_unlock(&pool->sess_lock); + + srv_pool->marked_delete = false; + mutex_unlock(&srv_pool->srv_pool_lock); + + pool->mapped_size = 0; + + rmr_put_srv_pool(srv_pool); + + pr_info("Unregistered store with pool %s\n", poolname); + + module_put(THIS_MODULE); +} +EXPORT_SYMBOL(rmr_srv_unregister); + +/** + * rmr_srv_pool_cmd_with_rsp() - Sends a user command to all sessions of the internal (sync) clt + * + * @pool: rmr pool to which the command is for 
+ * @conf: confirmation function to be called after completion + * @priv: pointer to priv data, to be returned to user while calling conf function + * @usr_vec: kvec containing user data (mostly command messages?) + * @nr: number of kvecs + * @buf: buf where the response from the user server is to be directed + * @buf_len: length of the buffer + * @size: size of the buf to be sent to a single session + * + * Description: + * This function provides an interface for the user to send commands to storage nodes connected + * through the internal network of this rmr pool. + * It redirects the command through the rmr-client pool in this storage node, which then sends + * the command to all the storage nodes it is connected to. + * The command is sent as a read, so that the response from the user srv side can be received + * The buffer sent by the user is meant to receive the response from the user server side. + * The size of the buffer is set during rmr_clt_open. + * + * Return: + * 0 on success + * negative errno in case of error + * + * Context: + * Inflight commands will block map update, until the inflights are completed. + */ +int rmr_srv_pool_cmd_with_rsp(struct rmr_pool *pool, rmr_conf_fn *conf, void *priv, + const struct kvec *usr_vec, size_t nr, void *buf, int buf_len, + size_t size) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!srv_pool->clt) { + pr_warn("srv pool %s does not have sync pool assigned.\n", + pool->poolname); + return -EAGAIN; + } + + return rmr_clt_cmd_with_rsp(srv_pool->clt, conf, priv, usr_vec, nr, buf, buf_len, size); +} +EXPORT_SYMBOL(rmr_srv_pool_cmd_with_rsp); + +static int rmr_srv_send_discard_all(struct rmr_pool *pool, u8 member_id) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_pool *sync_pool = srv_pool->clt; + struct rmr_msg_pool_cmd msg = {}; + int err; + + /* + * If the member_id is not this server's member_id, it means this server is the receiving + * node of the discard request. + */ + if (srv_pool->member_id != member_id) + return 0; + + pr_info("%s: Send discards across storage nodes for pool %s\n", + __func__, pool->poolname); + + rmr_clt_init_cmd(sync_pool, &msg); + msg.cmd_type = RMR_CMD_SEND_DISCARD; + msg.send_discard_cmd.member_id = member_id; + + err = rmr_clt_pool_send_all(sync_pool, &msg); + if (err) { + pr_err("Failed to send discard cmd for pool %s: %d\n", + pool->poolname, err); + } + return err; +} + +/** + * rmr_srv_discard_id() - discard the data chunks of length from offset on disk + * + * @pool: source pool. + * @offset offset in bytes. + * @length: length in bytes + * @member_id: member id of the storage node to discard the data from. If 0, then the node is + * this server pool. + * @sync: indicates whether to send sync requests to other connected nodes. + * + * Return: + * 0 on success, err code otherwise + * + * Description: + * This function discards the data chunks on the server with member_id. It will mark the + * data chunks as dirty and set the discard_entries flag of the corresponding srv_md true. + * Then it notifies all the connected nodes it has discarded data. 
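+ *
+ * Example (illustrative values): discard 1 MiB at offset 0 on this node and
+ * propagate the discard to the other connected members:
+ *	rmr_srv_discard_id(pool, 0, SZ_1M, 0, true);
+ * A member_id of 0 means "this server pool"; a length of 0 discards all
+ * data chunks.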
+ */ +int rmr_srv_discard_id(struct rmr_pool *pool, u64 offset, u64 length, u8 member_id, bool sync) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + rmr_id_t id; + int md_i, err; + + if (!member_id) + member_id = srv_pool->member_id; + + map = rmr_pool_find_map(pool, member_id); + if (!map) { + pr_err("for srv pool %s cannot find map for member_id %u\n", + pool->poolname, member_id); + return -EINVAL; + } + + md_i = rmr_pool_find_md(&pool->pool_md, member_id, false); + if (md_i < 0) { + pr_err("%s: for srv pool %s cannot find md for member_id %u\n", + __func__, pool->poolname, member_id); + return -EINVAL; + } + + /* + * If this node has received a response of the discard request from a normal server, + * the node will continue to mark all the data chunks as dirty. + */ + if (member_id == srv_pool->member_id && sync) { + if (!srv_pool->clt) { + pr_err("pool %s has no sync pool assigned. Cannot send discards.\n", + pool->poolname); + return -ENXIO; + } + + /* + * This node tries to send discards to all its connected nodes. The other node + * that has received the discards will start a new round. In the end, all normal + * nodes that are connected to this node should receive the discards. + */ + err = rmr_srv_send_discard_all(pool, member_id); + if (err) { + pr_err("%s: no server receives discards for pool %s: %d\n", + __func__, pool->poolname, err); + return err; + } + } + + /* + * Set the discard_entries flag of the corresponding srv_md true. Be careful that setting + * the wrong srv_md will lead to loops of discards. + */ + pool->pool_md.srv_md[md_i].discard_entries = true; + rmr_srv_mark_pool_md_dirty(srv_pool); + + if (length) { + rmr_map_calc_chunk(pool, offset, length, &id); + rmr_map_set_dirty(map, id, MAP_ENTRY_UNSYNCED); + } else { + /* discard all data chunks */ + rmr_map_set_dirty_all(map, MAP_ENTRY_UNSYNCED); + pr_info("%s: Discard all data chunks for member_id %u in srv_pool %s: %u\n", + __func__, member_id, pool->poolname, srv_pool->member_id); + } + + rmr_map_clear_filter_all(map, MAP_ENTRY_UNSYNCED); + rmr_srv_mark_maps_dirty(srv_pool); + + return 0; +} +EXPORT_SYMBOL(rmr_srv_discard_id); + +void rmr_srv_replace_store(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + RMR_STORE_SET_REPLACE(pool->map_ver); + rmr_srv_flush_pool_md(srv_pool); +} +EXPORT_SYMBOL(rmr_srv_replace_store); + +/** + * rmr_srv_pool_check_store() - Check whether IO is allowed for a pool or not + * + * @pool: pool to check + * + * Return: + * 1 if IO is allowed, 0 therwise + * + * Description: + * For a rmr-srv pool, the store registered provides a way to check whether it can process + * IOs or not. 
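+ *
+ * The decision is delegated to the io_allowed() callback of the registered
+ * store. Purely as an illustration (the callback names match their use in
+ * this file, everything else is hypothetical), a consumer would register
+ * something like:
+ *
+ *	static struct rmr_srv_store_ops my_store_ops = {
+ *		.io_allowed	= my_io_allowed,
+ *		.submit_cmd	= my_submit_cmd,
+ *		.get_params	= my_get_params,
+ *	};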
+ */ +static int rmr_srv_pool_check_store(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_io_store *store = srv_pool->io_store; + void *store_priv; + + if (!store) { + pr_debug("for srv pool %s no store assigned\n", pool->poolname); + return false; + } + + if (!store->ops) { + pr_err("for pool %s store has no ops assigned\n", pool->poolname); + return false; + } + store_priv = store->priv; + + return store->ops->io_allowed(store_priv); +} + +/** + * process_msg_io() - Process IO message + * + * @srv_sess: rmr srv session over which the message was received + * @rtrs_op: rtrs IO context + * @data: pointer to data buf + * @datalen: len of data buf + * @usr: pointer to user buf + * @usrlen: len of user buf + * + * Return: + * 0 on success + * negative error code otherwise + * + * Description: + * Perform some basic checks. + * Create an IO request and start its state machine. + */ +static int process_msg_io(struct rmr_srv_sess *srv_sess, + struct rtrs_srv_op *rtrs_op, void *data, + u32 datalen, const void *usr, size_t usrlen) +{ + const struct rmr_msg_io *msg = usr; + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + struct rmr_srv_req *req; + int err = 0; + u32 group_id = le32_to_cpu(msg->hdr.group_id); + + pool = rmr_srv_sess_get_pool(srv_sess, group_id); + if (IS_ERR(pool)) { + pr_err_ratelimited("Got I/O request on session %s for unknown pool group id %d: %pe\n", + srv_sess->sessname, group_id, pool); + return PTR_ERR(pool); + } + + srv_pool = (struct rmr_srv_pool *)pool->priv; + + /* + * No new references will come in after we have killed the percpu_ref. + * Percpu_ref_tryget_live() returns false when @confirm_kill in + * percpu_ref_kill_and_confirm() is done. + */ + if (!percpu_ref_tryget_live(&pool->ids_inflight_ref)) { + err = -EIO; + goto no_put; + } + + if (!atomic_read(&srv_pool->store_state) || + atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_err_ratelimited("server pool %s is not up for IO (state = %s)\n", + pool->poolname, + rmr_get_srv_pool_state_name(atomic_read(&srv_pool->state))); + err = -EIO; + goto put_pool; + } + + /* + * The IOs coming from internal sync sessions are always READ. + */ + if (msg->sync && rmr_op(le32_to_cpu(msg->flags)) != RMR_OP_READ) { + pr_err_ratelimited("process_msg_io: pool %s write IO from internal connection.\n", + pool->poolname); + err = -EIO; + goto put_pool; + } + + /* + * For non internal IOs, make sure the underlying store is ready for IO + */ + if (!msg->sync && !rmr_srv_pool_check_store(pool)) { + pr_err("process_msg_io: pool %s IO not allowed\n", pool->poolname); + err = -EIO; + goto put_pool; + } + + req = rmr_srv_req_create(msg, srv_pool, rtrs_op, data, datalen, rmr_srv_endreq); + if (IS_ERR(req)) { + pr_err("Failed to create rmr_req %pe\n", req); + + //TODO: do we have to rtrs_srv_resp_rdma here ? 
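+		/*
+		 * Note: no rtrs_srv_resp_rdma() is issued on this path; the
+		 * error is only propagated back to rmr_srv_rdma_ev(), which
+		 * returns it to RTRS.
+		 */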
+ err = PTR_ERR(req); + goto put_pool; + } + + rmr_req_submit(req); + return 0; + +put_pool: + percpu_ref_put(&pool->ids_inflight_ref); + +no_put: + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + rmr_srv_sess_put_pool(pool); + return err; +} + +int rmr_srv_get_sync_permit(struct rmr_srv_pool *srv_pool) +{ + atomic_inc(&srv_pool->in_flight_sync_reqs); + + while (atomic_read(&srv_pool->in_flight_sync_reqs) >= sync_queue_depth) { + /* Permit overslow; sleep */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_RUNNING) { + atomic_dec(&srv_pool->in_flight_sync_reqs); + + return -EINTR; + } + } + + return 0; +} + +void rmr_srv_put_sync_permit(struct rmr_srv_pool *srv_pool) +{ + atomic_dec(&srv_pool->in_flight_sync_reqs); + + wake_up_process(srv_pool->th_tsk); +} + +static int rmr_srv_sync_map(void *arg) +{ + struct rmr_srv_pool *srv_pool = arg; + struct rmr_pool *pool = srv_pool->pool; + struct rmr_dirty_id_map *map; + rmr_id_t rmr_id; + struct rmr_map_entry *entry; + int err = 0; + u64 i; + + pr_info("Sync thread starting!\n"); + + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + /* + * We do not need to error out here. + * Since no session has ever been added to this pool, + * it technically means this pool is in sync state. + */ + pr_info("No map found for pool %s\n", pool->poolname); + goto out; + } + + rmr_id.a = 1; + for (i = 0; i < map->no_of_chunks; i++) { + if (atomic_read(&srv_pool->thread_state) == SYNC_THREAD_REQ_STOP) { + pr_info("Request to stop sync thread\n"); + err = -EINTR; + goto err; + } + + if (!atomic_read(&srv_pool->store_state) || + atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_WAIT); + pr_err("Pool not in desired state\n"); + /* Unsure what error to return here */ + err = -EINVAL; + goto err; + } + + rmr_id.b = i; + entry = rmr_map_get_dirty_entry(map, rmr_id); + if (entry) { + if (atomic_cmpxchg(&entry->sync_cnt, -1, 0) != -1) { + /* someone has already started sync for this id */ + continue; + } + + err = rmr_srv_sync_chunk_id(srv_pool, entry, rmr_id, true); + if (err) { + /* this is to undo the previous cmpxchg if the error in + * rmr_srv_sync_chunk_id happened before any requests were created + */ + atomic_cmpxchg(&entry->sync_cnt, 0, -1); + pr_err("Failed to sync chunk (%llu, %llu)\n", rmr_id.a, rmr_id.b); + goto err; + } + } + } + + /* + * Finished syncing chunks, + * Now change the thread state to wait, + * to wait for the in flight syncs + */ + atomic_set(&srv_pool->thread_state, SYNC_THREAD_WAIT); + +err: + while (atomic_read(&srv_pool->in_flight_sync_reqs) != 0) { + /* + * Wait for all permits to get freed. + * Since the completion path needs this thread to + * be up and running + */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + //TODO: should it be timeout? 
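+		/*
+		 * Progress relies on rmr_srv_put_sync_permit(): every
+		 * completed sync request decrements in_flight_sync_reqs and
+		 * wakes this thread, so the count is re-checked after each
+		 * completion.
+		 */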
+ } + +out: + atomic_set(&srv_pool->thread_state, SYNC_THREAD_STOPPED); + + pr_info("Sync thread exiting with err %d\n", err); + return err; +} + +int rmr_srv_sync_thread_start(struct rmr_srv_pool *srv_pool) +{ + atomic_set(&srv_pool->in_flight_sync_reqs, 0); + srv_pool->th_tsk = kthread_run(rmr_srv_sync_map, srv_pool, + "rmr_srv_sync_thread"); + if (IS_ERR(srv_pool->th_tsk)) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_STOPPED); + return -ENOMEM; + } + + atomic_set(&srv_pool->thread_state, SYNC_THREAD_RUNNING); + return 0; +} + +int rmr_srv_sync_thread_stop(struct rmr_srv_pool *srv_pool) +{ + if (atomic_read(&srv_pool->thread_state) == SYNC_THREAD_RUNNING) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_REQ_STOP); + wake_up_process(srv_pool->th_tsk); + } + + return 0; +} + +void rmr_srv_sync_req_failed(struct rmr_srv_pool *srv_pool) +{ + /* + * TODO: Investigate the necessity to change server state + * to RMR_SRV_POOL_STATE_NO_IO for sync_req failure. + */ + // rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + rmr_srv_sync_thread_stop(srv_pool); +} + +static void rmr_srv_read_map_buf(struct rmr_pool *pool, void *buf, size_t buflen, + const struct rmr_msg_map_buf_cmd *map_buf_cmd) +{ + int size; + u8 map_idx = map_buf_cmd->map_idx; + u64 slp_idx = map_buf_cmd->slp_idx; + + size = rmr_pool_maps_to_buf(pool, &map_idx, &slp_idx, buf, buflen, MAP_NO_FILTER); + if (size == 0) { + // No more dirty map to write + struct rmr_map_buf_hdr *map_buf_hdr = (struct rmr_map_buf_hdr *)buf; + + map_buf_hdr->version = RMR_MAP_FORMAT_VER; + map_buf_hdr->member_id = 0; + } +} + +static void rmr_srv_update_md_buf(struct rmr_srv_pool *srv_pool, void *buf, size_t buflen) +{ + struct rmr_pool *pool = srv_pool->pool; + struct rmr_pool_md *pool_md = &pool->pool_md; + struct rmr_pool_md *buf_md = (struct rmr_pool_md *)buf; + u8 member_id = srv_pool->member_id; + int idx, buf_idx; + + /* Zero out the buffer in case data is corrupted somehow. */ + memset(buf, 0, buflen); + idx = rmr_pool_find_md(pool_md, member_id, false); + if (idx < 0) { + pr_err("The server pool hasn't updated srv_md yet %d\n", member_id); + return; + } + + buf_idx = rmr_pool_find_md(buf_md, member_id, true); + if (buf_idx < 0) { + pr_err("The buffer has no space for the member_id %d\n", member_id); + return; + } + + memcpy(&buf_md->srv_md[buf_idx], &pool_md->srv_md[idx], sizeof(struct rmr_srv_md)); +} + +static int rmr_srv_save_last_io_to_map(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + int i, j, lock_idx; + + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("no map found for member_id %u\n", srv_pool->member_id); + return -EINVAL; + } + + for (i = 0; i < srv_pool->queue_depth; i++) { + rmr_id_t *id; + struct rmr_dirty_id_map *mp; + + id = &srv_pool->last_io[i]; + + if (id->a == U64_MAX && id->b == U64_MAX) + continue; + + if (rmr_map_check_dirty(map, *id)) { + /* + * We already have this id added to our map, and which says + * that its dirty for us. This means that last_io info about + * this id is outdated. 
+ * We honor the info in the map, and skip this entry + */ + continue; + } + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (j = 0; j < pool->maps_cnt; j++) { + mp = rcu_dereference(pool->maps[j]); + if (WARN_ON(!mp) || mp->member_id == srv_pool->member_id) + continue; + + rmr_map_set_dirty(mp, *id, 0); + + // Clean the entry since it has been used up + id->a = U64_MAX; + id->b = U64_MAX; + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + } + + rmr_srv_mark_maps_dirty(srv_pool); + return 0; +} + +/** + * process_msg_user_cmd() - Process user command + * + * @pool: rmr pool + * @cmd_msg: pointer to command message. The user data is right after this struct. + * @data: data buffer to be passed down the user + * @datalen: length of the user buffer + * + * Description: + * Pass down the user command to the user server side. + * The user command data is kept right after the pool command (see arranging of kvec) + * + * Return: + * 0 in case of success + * negative is case of failure + * + * Context: + * The call goes to the user server side. Care must be taken not to block. + */ +static int process_msg_user_cmd(struct rmr_srv_pool *srv_pool, + const struct rmr_msg_pool_cmd *cmd_msg, void *data, int datalen) +{ + struct rmr_srv_io_store *store = srv_pool->io_store; + size_t usr_len = cmd_msg->user_cmd.usr_len; + int ret; + + pr_debug("%s: cmd_len=%zu usr_len=%zu\n", __func__, sizeof(*cmd_msg), usr_len); + + if (!store) { + pr_err("%s: No store registered\n", __func__); + return -EAGAIN; + } + + ret = store->ops->submit_cmd(store->priv, cmd_msg + 1, usr_len, data, datalen); + + return ret; +} + +static void do_sess_leave_srv_sess(struct rmr_srv_pool_sess *pool_sess) +{ + struct rmr_srv_sess *srv_sess = pool_sess->srv_sess; + + mutex_lock(&srv_sess->lock); + list_del(&pool_sess->srv_sess_entry); + mutex_unlock(&srv_sess->lock); +} + +static void sess_leave_pool(struct rmr_pool *pool, + struct rmr_srv_pool_sess *pool_sess) +{ + struct rmr_srv_sess *srv_sess = pool_sess->srv_sess; + + pr_info("pool sesss %s leaves pool %s\n", + pool_sess->sessname, pool->poolname); + + mutex_lock(&pool->sess_lock); + list_del(&pool_sess->pool_entry); + xa_erase(&srv_sess->pools, pool->group_id); + mutex_unlock(&pool->sess_lock); + + rmr_srv_sysfs_del_sess(pool_sess); + + pool_sess->srv_pool = NULL; +} + +static void rmr_srv_free_pool_sess(struct rmr_srv_pool_sess *pool_sess) +{ + kfree(pool_sess); +} + +static void destroy_sess(struct rmr_srv_sess *srv_sess) +{ + struct rmr_srv_pool *srv_pool; + struct rmr_srv_pool_sess *pool_sess, *tmp; + + // why do they do this in rnbd srv ? 
+ // if (list_empty(&srv_sess->pool_sess_list)) + // goto out; + + mutex_lock(&srv_sess->lock); + list_for_each_entry_safe (pool_sess, tmp, &srv_sess->pool_sess_list, srv_sess_entry) { + list_del(&pool_sess->srv_sess_entry); + srv_pool = pool_sess->srv_pool; + + // A network disconnect event + if (!pool_sess->sync) + rmr_srv_change_pool_state(pool_sess->srv_pool, RMR_SRV_POOL_STATE_NO_IO); + + sess_leave_pool(srv_pool->pool, pool_sess); + rmr_put_srv_pool(srv_pool); + rmr_srv_free_pool_sess(pool_sess); + } + mutex_unlock(&srv_sess->lock); + + xa_destroy(&srv_sess->pools); + might_sleep(); + + mutex_lock(&g_sess_lock); + list_del(&srv_sess->g_list_entry); + mutex_unlock(&g_sess_lock); + + mutex_destroy(&srv_sess->lock); + kfree(srv_sess); +} + +void rmr_srv_destroy_pool(struct rmr_pool *pool) +{ + struct rmr_srv_pool_sess *pool_sess, *tmp; + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!pool) { + pr_err("%s: pool is empty\n", __func__); + return; + } + + list_for_each_entry_safe (pool_sess, tmp, &pool->sess_list, pool_entry) { + WARN_ON(!pool_sess->srv_pool); + + do_sess_leave_srv_sess(pool_sess); + sess_leave_pool(srv_pool->pool, pool_sess); + rmr_put_srv_pool(srv_pool); + rmr_srv_free_pool_sess(pool_sess); + } +} + +int rmr_srv_remove_clt_pool(struct rmr_srv_pool *srv_pool) +{ + struct rmr_pool *clt; + + clt = srv_pool->clt; + if (!clt) { + pr_info("Srv pool %s has no internal clt pool assigned\n", + srv_pool->pool->poolname); + return -EINVAL; + } + + pr_info("from pool %s remove sync (internal) pool %s\n", + srv_pool->pool->poolname, clt->poolname); + srv_pool->clt = NULL; + + rmr_clt_close(clt); + + pr_info("pool %s removed\n", clt->poolname); + + return 0; +} + +static int create_srv_sess(struct rtrs_srv_sess *rtrs) +{ + struct rmr_srv_sess *srv_sess; + char sessname[NAME_MAX]; + int err; + + err = rtrs_srv_get_path_name(rtrs, sessname, sizeof(sessname)); + if (unlikely(err)) { + pr_err("rtrs_srv_get_sess_name(%s): %d\n", sessname, err); + return err; + } + srv_sess = kzalloc(sizeof(*srv_sess), GFP_KERNEL); + if (!srv_sess) + return -ENOMEM; + + mutex_init(&srv_sess->lock); + srv_sess->rtrs = rtrs; + strscpy(srv_sess->sessname, sessname, NAME_MAX); + xa_init_flags(&srv_sess->pools, XA_FLAGS_ALLOC); + INIT_LIST_HEAD(&srv_sess->pool_sess_list); + mutex_init(&srv_sess->lock); + + mutex_lock(&g_sess_lock); + list_add(&srv_sess->g_list_entry, &g_sess_list); + mutex_unlock(&g_sess_lock); + + rtrs_srv_set_sess_priv(rtrs, srv_sess); + + return 0; +} + +static int rmr_srv_link_ev(struct rtrs_srv_sess *rtrs, + enum rtrs_srv_link_ev ev, void *priv) +{ + struct rmr_srv_sess *srv_sess = priv; + + switch (ev) { + case RTRS_SRV_LINK_EV_CONNECTED: + return create_srv_sess(rtrs); + + case RTRS_SRV_LINK_EV_DISCONNECTED: + if (WARN_ON(!srv_sess)) + return -EINVAL; + + destroy_sess(srv_sess); + return 0; + + default: + pr_warn("Received unknown rtrs session event %d from session %s\n", + ev, srv_sess->sessname); + return -EINVAL; + } +} + +static struct rmr_srv_pool_sess *__find_sess_in_pool(struct rmr_pool *pool, + const char *sessname) +{ + struct rmr_srv_pool_sess *pool_sess; + + list_for_each_entry (pool_sess, &pool->sess_list, pool_entry) { + if (!strcmp(pool_sess->sessname, sessname)) { + return pool_sess; + } + } + + return NULL; +} + +static int sess_join_pool(struct rmr_pool *pool, struct rmr_srv_pool_sess *pool_sess) +{ + struct rmr_srv_pool_sess *find; + struct rmr_srv_sess *srv_sess = pool_sess->srv_sess; + int ret = 0; + + mutex_lock(&pool->sess_lock); + find = 
__find_sess_in_pool(pool, pool_sess->sessname); + if (find) { + ret = -EEXIST; + goto unlock; + } + + ret = xa_err(xa_store(&srv_sess->pools, pool->group_id, pool, GFP_KERNEL)); + if (ret) { + pr_err("can not add pool %s err %d\n", pool->poolname, ret); + goto unlock; + } + pr_info("%s: Added pool %s to rmr_srv_sess %s\n", + __func__, pool->poolname, srv_sess->sessname); + + ret = rmr_srv_sysfs_add_sess(pool, pool_sess); + if (ret) { + pr_err("failed to create sysfs for pool sess %s in pool %s\n", + pool_sess->sessname, pool->poolname); + + xa_erase(&srv_sess->pools, pool->group_id); + goto unlock; + } + list_add(&pool_sess->pool_entry, &pool->sess_list); + +unlock: + mutex_unlock(&pool->sess_lock); + + return ret; +} + +static void do_sess_leave_pool(struct rmr_pool *pool, struct rmr_srv_pool_sess *pool_sess) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + do_sess_leave_srv_sess(pool_sess); + sess_leave_pool(pool, pool_sess); + rmr_put_srv_pool(srv_pool); + rmr_srv_free_pool_sess(pool_sess); +} + +/** + * process_msg_pool_info() - Process a POOL_INFO membership change notification + * + * @pool: Pool which received the command. + * @pool_info_cmd: The received POOL_INFO command carrying member_id, + * operation, mode, and dirty flag. + * + * Dispatches on (operation, mode) pairs notified by the client: + * - ADD + CREATE: a new storage node is joining; add it via + * rmr_srv_handle_other_member_add(). + * - ADD + ASSEMBLE: an existing node is reassembling; verify its map and + * stg_members entry already exist. + * - REMOVE + DELETE: a storage node is permanently leaving; remove its map + * and stg_members entry via rmr_srv_delete_store_member(). + * - REMOVE + DISASSEMBLE: temporary leave; no map changes needed (TODO). + * + * Return: + * 0 on success, negative error code on failure. + */ +static int process_msg_pool_info(struct rmr_pool *pool, + const struct rmr_msg_pool_info_cmd *pool_info_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + int ret = 0; + + pr_info("%s: Server pool %s with member_id %u, received pool_info message\n", + __func__, pool->poolname, srv_pool->member_id); + + if (pool_info_cmd->operation == RMR_POOL_INFO_OP_ADD) { + ret = rmr_srv_handle_other_member_add(srv_pool, pool_info_cmd); + if (ret) { + pr_err("%s: Failed to create maps for other pools: %d\n", + __func__, ret); + return ret; + } + } else if (pool_info_cmd->operation == RMR_POOL_INFO_OP_REMOVE) { + if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_DELETE) { + pr_info("%s: Member %u got remove of member %u with mode delete\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + rmr_srv_delete_store_member(pool, pool_info_cmd->member_id); + } else if (pool_info_cmd->mode == RMR_POOL_INFO_MODE_DISASSEMBLE) { + pr_info("%s: Member %u got remove of member %u with mode disassemble, " + "preserving dirty map\n", + __func__, srv_pool->member_id, pool_info_cmd->member_id); + /* + * Do NOT remove the dirty map or stg_members entry for the + * disassembled member. IOs arriving after this point will + * continue to accumulate dirty entries for that member via + * the piggyback mechanism, so it can resync on reassembly. 
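+			 * (The piggybacked entries are the failed_id[] /
+			 * failed_cnt fields of the IO message, handled in
+			 * process_msg_map_add().)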
+ */ + } + } + rmr_srv_flush_pool_md(srv_pool); + + return ret; +} + +static struct rmr_srv_pool_sess *alloc_pool_sess(struct rmr_srv_pool *srv_pool, + struct rmr_srv_sess *srv_sess) +{ + struct rmr_srv_pool_sess *pool_sess; + + pool_sess = kzalloc_node(sizeof(*pool_sess), GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!pool_sess)) { + pr_err("Failed to allocate session for srv pool %s\n", srv_pool->pool->poolname); + return ERR_PTR(-ENOMEM); + } + + strscpy(pool_sess->sessname, srv_sess->sessname, NAME_MAX); + INIT_LIST_HEAD(&pool_sess->pool_entry); + INIT_LIST_HEAD(&pool_sess->srv_sess_entry); + pool_sess->srv_sess = srv_sess; + pool_sess->srv_pool = srv_pool; + + return pool_sess; +} + +/** + * rmr_srv_process_join_create() - Handle the CREATE case of a join_pool message + * + * @pool: The pool being created. + * @join_pool_cmd: The received join_pool command carrying dirty flag and + * per-member info for any pre-existing pool members. + * + * If the client reports that this server's existing data is dirty, marks own + * map fully dirty. Then iterates the per-member list in the message and adds + * each member via rmr_srv_add_store_member(), marking its map dirty if the + * client flagged it. On failure, all members added so far are cleaned up. + * + * Return: + * 0 on success, negative error code on failure. + */ +static int rmr_srv_process_join_create(struct rmr_pool *pool, + const struct rmr_msg_join_pool_cmd *join_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + int i, ret; + u8 member_id; + + /* + * Mark our maps dirty if client asked us to. + */ + if (join_pool_cmd->dirty) { + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("%s: No map found for %u\n", + __func__, srv_pool->member_id); + return -EINVAL; + } + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + } + + /* + * Add other storage members in case its a create message. + */ + for (i = 0; i < join_pool_cmd->mem_info.no_of_stor; i++) { + member_id = join_pool_cmd->mem_info.p_mem_info[i].member_id; + + ret = rmr_srv_add_store_member(pool, member_id); + if (ret) { + pr_err("%s: rmr_srv_add_store_member failed %d\n", __func__, ret); + goto cleanup; + } + + if (join_pool_cmd->mem_info.p_mem_info[i].c_dirty) { + map = rmr_pool_find_map(pool, member_id); + if (WARN_ON(!map)) { + xa_erase(&pool->stg_members, member_id); + ret = -EINVAL; + goto cleanup; + } + rmr_map_set_dirty_all(map, MAP_NO_FILTER); + } + } + + return 0; + +cleanup: + while (i--) + rmr_srv_delete_store_member(pool, + join_pool_cmd->mem_info.p_mem_info[i].member_id); + return ret; +} + +static void rmr_srv_process_leave_delete(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + void *entry; + unsigned long id; + + /* + * When we are leaving a pool (not disassembly), we have to, + * 1) Delete dirty entries from all the maps of other storage nodes, since we do not + * need them anymore + * 2) Delete all the maps of other storage nodes. + * + * Map for this storage node is created/deleted during register/unregister. 
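+	 * That is why the loop below skips our own member_id and calls
+	 * rmr_srv_delete_store_member() only for the other members.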
+ */ + xa_for_each(&pool->stg_members, id, entry) { + if (id == srv_pool->member_id) + continue; + + rmr_srv_delete_store_member(pool, id); + } +} + +static int process_msg_join_pool(struct rmr_pool *pool, struct rmr_srv_sess *srv_sess, + struct rtrs_srv_sess *rtrs, bool sync, + const struct rmr_msg_join_pool_cmd *join_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_pool_sess *pool_sess; + int ret = 0, i; + bool alloced_last_io = false; + + pr_info("Client %s requests to join pool %s (state=%d)\n", + srv_sess->sessname, pool->poolname, atomic_read(&srv_pool->state)); + + mutex_lock(&srv_sess->lock); + + /* + * Here we only do chunk size check, + * to make sure different storage nodes do not use different chunk sizes. + */ + if (join_pool_cmd->chunk_size && pool->chunk_size != join_pool_cmd->chunk_size) { + pr_err("pool %s has chunksize %u != msg chunksize %u\n", + pool->poolname, pool->chunk_size, join_pool_cmd->chunk_size); + ret = -EINVAL; + goto unlock; + } + + mutex_lock(&srv_pool->srv_pool_lock); + if (atomic_read(&srv_pool->state) == RMR_SRV_POOL_STATE_EMPTY) { + pr_err("%s: pool %s has no store registered; join rejected\n", + __func__, pool->poolname); + ret = -EINVAL; + goto unlock_srv_pool_lock; + } + + if (!sync) { + if (join_pool_cmd->create) { + if (srv_pool->last_io || srv_pool->last_io_idx) { + pr_err("%s: pool %s already has last_io buffer allocated\n", + __func__, pool->poolname); + ret = -EEXIST; + goto unlock_srv_pool_lock; + } + + if (!srv_pool->marked_create) { + pr_err("%s: pool %s not in create state\n", + __func__, pool->poolname); + ret = -EINVAL; + goto unlock_srv_pool_lock; + } + } else if (srv_pool->marked_create) { + pr_err("%s: pool %s should not be in create state\n", + __func__, pool->poolname); + ret = -EINVAL; + goto unlock_srv_pool_lock; + } + } + + pool_sess = alloc_pool_sess(srv_pool, srv_sess); + if (IS_ERR(pool_sess)) { + pr_err("failed to allc pool_sees for pool %s sev_sess %s: %pe\n", + pool->poolname, srv_sess->sessname, pool_sess); + ret = PTR_ERR(pool_sess); + goto unlock_srv_pool_lock; + } + srv_pool->queue_depth = join_pool_cmd->queue_depth; + + ret = sess_join_pool(pool, pool_sess); + if (ret) { + pr_err("Failed to join pool\n"); + goto free_sess; + } + pool_sess->sync = sync; + + if (!pool_sess->sync && !srv_pool->last_io) { + /* Joining for the first time */ + srv_pool->last_io = kcalloc(srv_pool->queue_depth, sizeof(*srv_pool->last_io), + GFP_KERNEL); + if (!srv_pool->last_io) { + pr_err("Memory allocation failed for srv_pool->last_io\n"); + ret = -ENOMEM; + goto sess_leave; + } + alloced_last_io = true; + + /* The previous last_io buffer exists. */ + if (srv_pool->last_io_idx) { + memcpy(srv_pool->last_io, srv_pool->last_io_idx, + rmr_last_io_len(srv_pool->queue_depth)); + } else { + for (i = 0; i < srv_pool->queue_depth; i++) { + srv_pool->last_io[i].a = U64_MAX; + srv_pool->last_io[i].b = U64_MAX; + } + + srv_pool->last_io_idx = kcalloc(srv_pool->queue_depth, + sizeof(*srv_pool->last_io_idx), GFP_KERNEL); + if (!srv_pool->last_io_idx) { + ret = -ENOMEM; + goto free_last_io; + } + } + pr_info("Allocated %ld B last_io buffer for pool %s\n", + srv_pool->queue_depth * sizeof(*srv_pool->last_io), pool->poolname); + } + + /* + * Join/Rejoin messages from sync sessions do not affect our state. 
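+	 * Sync sessions belong to the internal rmr-client pools that peer
+	 * storage nodes use for resync reads, not to a pserver.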
+ * + * For non-sync sessions, if our state is NO_IO, pserver can either send a, + * - rejoin message in case our state NO_IO due to network/IO issue + * - join message in case pserver crashed + * hence, no state transition is needed. + */ + if (!pool_sess->sync) { + if (join_pool_cmd->create) { + /* + * First-time pool creation: set up member info and maps, + * then move to CREATED awaiting enable_pool(1). + */ + ret = rmr_srv_process_join_create(pool, join_pool_cmd); + if (ret) { + pr_err("%s: rmr_srv_process_join_create failed %d\n", + __func__, ret); + goto free_last_io; + } + + /* + * In the CREATE path pool_md has only magic set; all other + * header fields are normally populated later by + * RMR_CMD_SEND_MD_BUF. Initialise them now so that + * queue_depth (and the bitmap/last_io offsets derived from + * it) are correct before the first on-demand map flush fires. + */ + pool->pool_md.queue_depth = join_pool_cmd->queue_depth; + pool->pool_md.chunk_size = pool->chunk_size; + pool->pool_md.mapped_size = pool->mapped_size; + pool->pool_md.group_id = pool->group_id; + strscpy(pool->pool_md.poolname, pool->poolname, + sizeof(pool->pool_md.poolname)); + rmr_srv_mark_pool_md_dirty(srv_pool); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + + ret = rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_CREATED); + if (ret < 0) + goto leave_delete; + + srv_pool->marked_create = false; + } else if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NO_IO) { + /* + * Assemble or rejoin: a map update is needed before IOs + * can resume, so move to NO_IO. If we are already in + * NO_IO (e.g. pserver reconnecting after a network event + * that already drove us there), no transition is needed. + */ + ret = rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + if (ret < 0) + goto leave_delete; + } + } + + mutex_unlock(&srv_pool->srv_pool_lock); + + rmr_get_srv_pool(srv_pool); + list_add_tail(&pool_sess->srv_sess_entry, &srv_sess->pool_sess_list); + + mutex_unlock(&srv_sess->lock); + + return 0; + +leave_delete: + if (!pool_sess->sync && join_pool_cmd->create) + rmr_srv_process_leave_delete(pool); +free_last_io: + if (alloced_last_io) { + kfree(srv_pool->last_io); + srv_pool->last_io = NULL; + + kfree(srv_pool->last_io_idx); + srv_pool->last_io_idx = NULL; + } +sess_leave: + sess_leave_pool(pool, pool_sess); +free_sess: + rmr_srv_free_pool_sess(pool_sess); +unlock_srv_pool_lock: + mutex_unlock(&srv_pool->srv_pool_lock); +unlock: + mutex_unlock(&srv_sess->lock); + return ret; +} + +void rmr_srv_stop_sync_and_go_offline(struct rmr_pool *pool) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + atomic_set(&srv_pool->thread_state, SYNC_THREAD_REQ_STOP); + wake_up_process(srv_pool->th_tsk); + + while (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(1000)); + } + } +} + +static int process_msg_leave_pool(struct rmr_pool *pool, struct rmr_srv_sess *sess, bool sync, + const struct rmr_msg_leave_pool_cmd *leave_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_pool_sess *pool_sess; + u64 last_io_len; + int ret = 0; + void *buf; + + pr_info("Session %s requests to leave pool %d\n", sess->sessname, + leave_pool_cmd->member_id); + + if (srv_pool->member_id != leave_pool_cmd->member_id) { + 
pr_err("%s: For sess %s, Srv pool member_id %d, Message member_id %d\n", + __func__, sess->sessname, srv_pool->member_id, leave_pool_cmd->member_id); + return -ENOENT; + } + + mutex_lock(&pool->sess_lock); + pool_sess = __find_sess_in_pool(pool, sess->sessname); + if (!pool_sess) { + mutex_unlock(&pool->sess_lock); + pr_err("Session %s is not in pool %s\n", sess->sessname, + pool->poolname); + return -ENOENT; + } + mutex_unlock(&pool->sess_lock); + + do_sess_leave_pool(pool, pool_sess); + + mutex_lock(&srv_pool->srv_pool_lock); + srv_pool->marked_delete = leave_pool_cmd->delete; + mutex_unlock(&srv_pool->srv_pool_lock); + + if (!sync) { + /* + * Stop the sync thread if its running, and go offline. + */ + rmr_srv_stop_sync_and_go_offline(pool); + + if (leave_pool_cmd->delete) { + rmr_srv_process_leave_delete(pool); + } else { + /* + * Disassemble: flush the dirty map to disk first so that + * the on-disk map reflects all dirty entries accumulated + * up to this point. On reassembly the map is read back + * and used to drive resync of any members that missed IOs. + */ + rmr_srv_md_maps_sync(pool); + + /* + * Clear last_io and persist it to disk so that it is not + * used after reassembly. Note: maps are always flushed + * above regardless of whether last_io is valid; the two + * operations are independent. + */ + last_io_len = rmr_last_io_len(pool->pool_md.queue_depth); + + if (!srv_pool->last_io || !last_io_len) + goto change_state; + + memset(srv_pool->last_io, 0, last_io_len); + if (srv_pool->last_io_idx) + memset(srv_pool->last_io_idx, 0, last_io_len); + + buf = kzalloc(last_io_len, GFP_KERNEL); + if (!buf) + goto change_state; + + ret = process_md_io(pool, NULL, + RMR_LAST_IO_OFFSET, + last_io_len, + RMR_OP_MD_WRITE, buf); + if (ret) { + pr_err("%s: For pool %s process_md_io failed\n", + __func__, pool->poolname); + } + kfree(buf); + } + +change_state: + /* + * All sessions have left. Transition back to REGISTERED if the + * backend store is still present, or to EMPTY if it is not. + */ + mutex_lock(&srv_pool->srv_pool_lock); + if (srv_pool->io_store) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_REGISTERED); + else + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_EMPTY); + mutex_unlock(&srv_pool->srv_pool_lock); + } + + return 0; +} + +static int process_msg_map_clear(struct rmr_srv_sess *srv_sess, + const void *usr) +{ + const struct rmr_msg_io *msg = usr; + struct rmr_pool *pool; + rmr_id_t id; + unsigned long key; + struct rmr_map_entry *entry; + struct rmr_dirty_id_map *map; + u8 member_id; + int err = 0; + u32 group_id = le32_to_cpu(msg->hdr.group_id); + + id.a = le64_to_cpu(msg->id_a); + id.b = le64_to_cpu(msg->id_b); + key = rmr_id_to_key(id); + member_id = msg->member_id; + + pr_debug("received map clear msg, id (%llu, %llu), member_id %u\n", + id.a, id.b, member_id); + + pool = rmr_srv_sess_get_pool(srv_sess, group_id); + if (IS_ERR(pool)) { + pr_err_ratelimited("Got I/O request on session %s for unknown pool: %pe\n", + srv_sess->sessname, pool); + return PTR_ERR(pool); + } + + map = rmr_pool_find_map(pool, member_id); + if (!map) { + pr_err("no map found for member_id %u\n", member_id); + err = -EINVAL; + goto put_pool; + //TODO: handle this , probably initialize map, or just throw err? + } + + entry = rmr_map_unset_dirty(map, id, MAP_NO_FILTER); + if (entry) { + /* We do not need any rcu protection here since it is deleted by the other + * rmr server. And sync can only be done for entries that are + * dirty for this particaular server. 
+ */ + kmem_cache_free(rmr_map_entry_cachep, entry); + } + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + +put_pool: + rmr_srv_sess_put_pool(pool); + return err; +} + +static int process_msg_map_add(struct rmr_srv_sess *srv_sess, + const void *usr) +{ + const struct rmr_msg_io *msg = usr; + struct rmr_pool *pool; + int i, ret = 0; + struct rmr_dirty_id_map *map; + u32 group_id = le32_to_cpu(msg->hdr.group_id); + + pr_debug("received map add member_id %u, id (%llu %llu)\n", + msg->member_id, msg->id_a, msg->id_b); + + pool = rmr_srv_sess_get_pool(srv_sess, group_id); + if (IS_ERR(pool)) { + pr_err_ratelimited("Got I/O request on session %s for unknown pool: %pe\n", + srv_sess->sessname, pool); + return PTR_ERR(pool); + } + + for (i = 0; i < msg->failed_cnt; i++) { + u64 msg_map_ver = le64_to_cpu(msg->map_ver); + rmr_id_t id; + + map = rmr_pool_find_map(pool, msg->failed_id[i]); + if (!map) { + pr_err("no map found for member_id %u\n", msg->failed_id[i]); + ret = -EINVAL; + goto put_pool; + } + + atomic_set(&map->check_state, RMR_MAP_STATE_NO_CHECK); + id.a = le64_to_cpu(msg->id_a); + id.b = le64_to_cpu(msg->id_b); + rmr_map_set_dirty(map, id, 0); + + if (msg_map_ver > pool->map_ver) + pool->map_ver = msg_map_ver; + } + if (msg->failed_cnt) { + rmr_srv_mark_pool_md_dirty((struct rmr_srv_pool *)pool->priv); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + } + +put_pool: + rmr_srv_sess_put_pool(pool); + + return ret; +} + +/** + * rmr_srv_set_pool_mm() - Set the rmr srv pool to maintenance mode + * + * @srv_pool: The rmr srv pool to set in maintenance mode + * + * Description: + * While in maintenance mode, we do not serve IOs either, so we set state to NO_IO + * + * Return: + * 0 on success + * Error value on failure + */ +static int rmr_srv_set_pool_mm(struct rmr_srv_pool *srv_pool) +{ + srv_pool->maintenance_mode = true; + + return rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); +} + +/** + * rmr_srv_unset_pool_mm() - Clear the rmr srv pool maintenance mode + * + * @srv_pool: The rmr srv pool to clear maintenance mode of + * + * Description: + * While in maintenance mode, we do not serve IOs either, so we set state to NO_IO + * + * Return: + * 0 on success + * Error value on failure + */ +static int rmr_srv_unset_pool_mm(struct rmr_srv_pool *srv_pool) +{ + srv_pool->maintenance_mode = false; + rmr_srv_flush_pool_md(srv_pool); + + return 0; +} + +static int process_msg_enable_pool(struct rmr_pool *pool, struct rmr_srv_sess *sess, bool sync, + const struct rmr_msg_enable_pool_cmd *enable_pool_cmd) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + enum rmr_srv_pool_state old_state = atomic_read(&srv_pool->state); + int ret = 0; + + /* + * Enable/Disable messages from sync sessions do not affect us. + */ + if (sync) { + pr_info("%s: From sync sess %s, for pool %s\n", __func__, sess->sessname, + pool->poolname); + return 0; + } + + pr_info("Client %s requests to set enable=%d pool %s current state %s\n", + sess->sessname, enable_pool_cmd->enable, pool->poolname, + rmr_get_srv_pool_state_name(old_state)); + + /* + * Enable when not in maintenance mode, can be handled simply + */ + if (enable_pool_cmd->enable && !srv_pool->maintenance_mode) { + /* + * CREATED -> NORMAL: initial enable after create-mode join. + * NO_IO -> NORMAL: was_last_authoritative recovery (pserver + * enables this node directly without a map update because its + * dirty map is already authoritative). 
+ */ + if (old_state != RMR_SRV_POOL_STATE_CREATED && + old_state != RMR_SRV_POOL_STATE_NO_IO) { + pr_err("%s: pool %s cannot be enabled in state %s\n", + __func__, pool->poolname, + rmr_get_srv_pool_state_name(old_state)); + return -EINVAL; + } + + ret = rmr_srv_set_pool_state_normal(srv_pool); + if (ret < 0) + goto out_err; + + return 0; + } + + /* + * Any other case involves considering maintenance mode settings + */ + if (!enable_pool_cmd->enable) { + if (old_state != RMR_SRV_POOL_STATE_NORMAL && + old_state != RMR_SRV_POOL_STATE_NO_IO) { + pr_err("%s: pool %s can only disable from NORMAL or NO_IO state (current: %s)\n", + __func__, pool->poolname, + rmr_get_srv_pool_state_name(old_state)); + return -EINVAL; + } + ret = rmr_srv_set_pool_mm(srv_pool); + } else { + ret = rmr_srv_unset_pool_mm(srv_pool); + } + + if (ret < 0) + goto out_err; + + return 0; + +out_err: + /* + * Put srv pool state to old one + */ + atomic_set(&srv_pool->state, old_state); + return ret; +} + +/** + * process_msg_map_ready() - Process RMR_CMD_MAP_READY command + * + * @pool: Pool which received the command + * @sync: Whether the command was sent from an internal (sync) rmr-client or not + * + * Return: + * 0 on success + * Negative errno on failure + * + * Description: + * A RMR_CMD_MAP_READY command is the first command that is sent to a storage node which will + * receive a map from another storage node as part of a map update. + * + * It checks whether this storage node is ready and in an expected state to receive a map. + */ +static int process_msg_map_ready(struct rmr_pool *pool, bool sync) +{ + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_dirty_id_map *map; + int i, err = 0, pool_state; + + mutex_lock(&srv_pool->srv_pool_lock); + pool_state = atomic_read(&srv_pool->state); + + /* A map update from another storage node is not allowed. */ + if (sync) { + pr_err("%s: (sync) Cannot receive map from other storage nodes\n", __func__); + err = -EINVAL; + goto out; + } + + /* + * A map update from pserver should start only when in, + * NO_IO - after a network/IO error + * CREATED - For extend (This is not nice. + * Extend should inform the storage node that it is being + * used for an extend leg for an already existing node, and + * the state should be set accordingly. So that we can allow + * this only when in NO_IO state.) + */ + if (pool_state != RMR_SRV_POOL_STATE_NO_IO && pool_state != RMR_SRV_POOL_STATE_CREATED) { + pr_err("(non-sync) pool state not correct %d", pool_state); + err = -EINVAL; + goto out; + } + + /* + * We seem to be in process of another map update. + */ + if (srv_pool->map_update_state != MAP_UPDATE_STATE_DISABLED) { + pr_err("rmr_srv_send_map Map update already in progress\n"); + err = -EINVAL; + goto out; + } + + /* + * If pserver is instructing us to receive a map, then the map we + * hold is meaningless. 
+ */ + mutex_lock(&pool->maps_lock); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference_protected(pool->maps[i], + lockdep_is_held(&pool->maps_lock)); + if (!map) + continue; + + rmr_map_unset_dirty_all(map); + } + mutex_unlock(&pool->maps_lock); + rmr_srv_mark_maps_dirty(srv_pool); + + srv_pool->map_update_state = MAP_UPDATE_STATE_READY; + + pr_info("%s: process_msg_cmd: moved to MAP_UPDATE_STATE_READY\n", __func__); + +out: + mutex_unlock(&srv_pool->srv_pool_lock); + return err; +} + +/** + * process_msg_cmd_handler() - Processes rmr command message + * + * @work: scheduled work structure + * + * Description: + * The command messages being processed here, can be broadly divided into 2 categories. + * Ones which are able to use the rsp buffer to send back status. + * Ones which cannot use the rsp buffer to send back status. These ones use the rsp buffer + * for other purposes; like sending map data, or read user rsp buffer. + * + * Context: + * Execution time depends on the command. It may take a long time for commands which sends + * data (map). + */ +static void process_msg_cmd_handler(struct work_struct *work) +{ + struct rmr_cmd_work_info *work_info = container_of(work, struct rmr_cmd_work_info, cmd_work); + struct rmr_pool *pool = work_info->pool; + struct rmr_srv_pool *srv_pool = (struct rmr_srv_pool *)pool->priv; + struct rmr_srv_sess *sess = work_info->sess; + struct rtrs_srv_sess *rtrs = work_info->rtrs; + const struct rmr_msg_pool_cmd *cmd_msg = work_info->cmd_msg; + struct rmr_dirty_id_map *map; + u8 sync, flags; + u64 src_mapped_size; + int md_i, err = 0; + + /* + * The switch cases below are used by either map sending node, + * or the node which is to receive the map, but not both. + */ + switch (cmd_msg->cmd_type) { + case RMR_CMD_REJOIN_POOL: + /* + * For now, we do not have any difference between joinand + * rejoin on the storage server side + */ + case RMR_CMD_JOIN_POOL: + /* + * Server node, received a request for a new session + */ + err = process_msg_join_pool(pool, sess, rtrs, cmd_msg->sync, + &cmd_msg->join_pool_cmd); + if (err) { + pr_err("process_msg_join_pool failed with err %d\n", err); + goto out; + } + work_info->rsp->join_pool_cmd_rsp.chunk_size = pool->chunk_size; + + if (pool->mapped_size) { + work_info->rsp->join_pool_cmd_rsp.mapped_size = pool->mapped_size; + pr_info("srv pool %s sets mapped size %llu\n", + pool->poolname, pool->mapped_size); + } else + work_info->rsp->join_pool_cmd_rsp.mapped_size = 0; + + break; + case RMR_CMD_POOL_INFO: + /* + * Server node, received pool info command + */ + err = process_msg_pool_info(pool, &cmd_msg->pool_info_cmd); + if (err) { + pr_err("process_msg_pool_info failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_LEAVE_POOL: + err = process_msg_leave_pool(pool, sess, cmd_msg->sync, &cmd_msg->leave_pool_cmd); + if (err) { + pr_err("process_msg_leave_pool failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_ENABLE_POOL: + err = process_msg_enable_pool(pool, sess, cmd_msg->sync, &cmd_msg->enable_pool_cmd); + if (err) { + pr_err("process_msg_enable_pool failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_MAP_READY: + /* + * Map receiving node. + * Getting ready to receive dirty map + */ + pr_info("%s: RMR_CMD_MAP_READY\n", __func__); + + err = process_msg_map_ready(pool, cmd_msg->sync); + if (err) { + pr_err("process_msg_map_ready failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_MAP_SEND: + /* + * Map sending node. 
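+		 * A full map update uses the commands handled in this switch:
+		 * MAP_READY prepares the receiving node, MAP_SEND (this case)
+		 * asks the sending node to transfer its dirty map, the
+		 * receiver then gets the data as SEND_MAP_BUF commands, and
+		 * MAP_BUF_DONE / MAP_DONE conclude the update.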
+ * Send map to the node with member_id == map_send_cmd->receiver_member_id + */ + pr_info("%s: RMR_CMD_MAP_SEND\n", __func__); + + err = rmr_clt_send_map(pool, srv_pool->clt, &cmd_msg->map_send_cmd, MAP_NO_FILTER); + if (err) { + pr_err("rmr_clt_send_map failed with err %d\n", err); + goto out; + } + + break; + case RMR_CMD_SEND_MAP_BUF: + /* + * Map receiving node. + * Received the map from another node. Save it. + */ + pr_info("%s: RMR_CMD_SEND_MAP_BUF\n", __func__); + + if (srv_pool->map_update_state != MAP_UPDATE_STATE_READY) { + pr_err("rmr_srv_send_map Node not ready to receive map\n"); + err = -EINVAL; + goto out; + } + + err = rmr_pool_save_map(pool, work_info->data, work_info->datalen, + false); + if (err) { + if (!cmd_msg->sync) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + + pr_err("rmr_pool_save_map failed\n"); + goto out; + } + break; + case RMR_CMD_MAP_BUF_DONE: + /* + * Map receiving node. + * A confirmation that all map updates have been sent. + */ + pr_info("%s: RMR_CMD_MAP_BUF_DONE\n", __func__); + + if (srv_pool->map_update_state != MAP_UPDATE_STATE_READY) { + pr_err("rmr_srv_send_map Node state not correct\n"); + err = -EINVAL; + goto out; + } + + if (cmd_msg->map_buf_done_cmd.map_version < pool->map_ver) { + pr_err("Map version received (%llu) is older than ours (%llu)\n", + cmd_msg->map_buf_done_cmd.map_version, pool->map_ver); + err = -EINVAL; + goto out; + } + + pool->map_ver = cmd_msg->map_buf_done_cmd.map_version; + rmr_srv_mark_pool_md_dirty(srv_pool); + + srv_pool->map_update_state = MAP_UPDATE_STATE_DONE; + + break; + case RMR_CMD_MAP_DONE: + /* + * Map receiving node. + * A confirmation from the client, that map update was done successfully or not. + */ + pr_info("%s: RMR_CMD_MAP_DONE\n", __func__); + + if (srv_pool->map_update_state != MAP_UPDATE_STATE_DONE) { + pr_err("rmr_srv_send_map Map not updated succesfully\n"); + err = -EINVAL; + } + + /* + * On a successful map update, we go to NORMAL state. + * + * map_done_cmd.enable says whether this map update should make us go to + * NORMAL state or not. This is controlled by the pserver. + */ + if (cmd_msg->map_done_cmd.enable) { + if (rmr_srv_set_pool_state_normal(srv_pool) < 0) + err = -EINVAL; + } + + srv_pool->map_update_state = MAP_UPDATE_STATE_DISABLED; + break; + case RMR_CMD_MAP_DISABLE: + /* + * Something went wrong on the client side; we need to reset everything. + */ + pr_info("%s: RMR_CMD_MAP_DISABLE\n", __func__); + + if (!cmd_msg->sync) + rmr_srv_change_pool_state(srv_pool, RMR_SRV_POOL_STATE_NO_IO); + + srv_pool->map_update_state = MAP_UPDATE_STATE_DISABLED; + break; + case RMR_CMD_READ_MAP_BUF: + /* + * Pserver wants to read our dirty map. So send it. 
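+		 * rmr_srv_read_map_buf() resumes from the map_idx/slp_idx
+		 * cursor carried in the command, so maps larger than one
+		 * response buffer are read out over several READ_MAP_BUF
+		 * round trips; a reply header with member_id 0 marks the end.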
+ */ + pr_info("%s: RMR_CMD_READ_MAP_BUF\n", __func__); + + rmr_srv_read_map_buf(pool, work_info->data, work_info->datalen, + &cmd_msg->map_buf_cmd); + + goto out_no_rsp; + case RMR_CMD_MAP_CHECK: + pr_debug("%s: RMR_CMD_MAP_CHECK\n", __func__); + + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_debug("srv pool %s is not in normal state, cannot do map check\n", + pool->poolname); + work_info->rsp->value = false; + break; + } + map = rmr_pool_find_map(pool, srv_pool->member_id); + if (!map) { + pr_err("pool %s no map found for member_id %u\n", + pool->poolname, srv_pool->member_id); + err = -EINVAL; + goto out; + } + work_info->rsp->value = rmr_map_empty(map); + pr_debug("pool %s member_id %d rsp with map_empty=%llu\n", + pool->poolname, srv_pool->member_id, + work_info->rsp->value); + + break; + + case RMR_CMD_LAST_IO_TO_MAP: + /* + * Use the last_io list, and add those IOs as dirty IDs to the map + * for every other storage server other than this one. + */ + pr_info("%s: RMR_CMD_LAST_IO_TO_MAP\n", __func__); + err = rmr_srv_save_last_io_to_map(pool); + if (err) { + pr_err("rmr_srv_save_last_io_to_map failed\n"); + goto out; + } + + break; + + case RMR_CMD_MAP_TEST: + /* + * Received the map test from another node. + * Check that we have everything that other node has. + */ + pr_info("%s: RMR_CMD_MAP_TEST\n", __func__); + + err = rmr_pool_save_map(pool, work_info->data, work_info->datalen, true); + if (err) { + pr_err("rmr_srv_save_map failed, test_only, err %d\n", err); + } + goto out_no_rsp; + case RMR_CMD_MD_SEND: + /* + * Received the message to copy metadata of server pool to the sender. + */ + src_mapped_size = cmd_msg->md_send_cmd.src_mapped_size; + pr_debug("stg %u: receives md_update message from pool %u\n", + srv_pool->member_id, cmd_msg->md_send_cmd.leader_id); + + /* Check the pool mapped_sizes are consistent or not */ + if (pool->mapped_size && src_mapped_size && pool->mapped_size != src_mapped_size) { + pr_err_ratelimited("This %s mapped_size %llu != src %d mapped_size %llu\n", + pool->poolname, pool->mapped_size, cmd_msg->md_send_cmd.leader_id, + src_mapped_size); + goto out; + } + + if (cmd_msg->md_send_cmd.read_full_md) { + if (work_info->datalen < sizeof(struct rmr_pool_md)) { + pr_err("%s: buffer too small for full pool_md (%zu < %zu)\n", + __func__, work_info->datalen, + sizeof(struct rmr_pool_md)); + err = -EINVAL; + goto out; + } + memcpy(work_info->data, &pool->pool_md, sizeof(struct rmr_pool_md)); + } else { + /* If updating buf incurs error, it simply waits for next md_update. */ + rmr_srv_update_md_buf(srv_pool, work_info->data, work_info->datalen); + } + + break; + case RMR_CMD_SEND_MD_BUF: + /* + * Received the client pool metadata. Save it. + */ + sync = cmd_msg->send_md_buf_cmd.sync; + flags = cmd_msg->send_md_buf_cmd.flags; + if (flags == RMR_OP_MD_WRITE) { + err = rmr_srv_md_process_buf(pool, work_info->data, sync); + if (err) { + pr_err("rmr_srv_write_md failed\n"); + goto out; + } + + if (atomic_read(&srv_pool->store_state)) { + /* write back to disk */ + err = process_md_io(pool, NULL, 0, work_info->datalen, flags, + &pool->pool_md); + if (err) { + pr_err("Failed to process md io\n"); + goto out; + } + } + } + + if (!sync && flags == RMR_OP_MD_READ) + memcpy(work_info->data, &pool->pool_md, sizeof(struct rmr_pool_md)); + + break; + case RMR_CMD_SEND_DISCARD: + /* Received the message to handle discards. 
*/ + pr_info("%s: RMR_CMD_SEND_DISCARD for srv %u\n", + __func__, cmd_msg->send_discard_cmd.member_id); + if (!cmd_msg->sync) { + err = rmr_pool_md_check_discard(pool, cmd_msg->send_discard_cmd.member_id); + if (err > 0) { + /* This node has received discards. */ + err = 0; + pr_info("pool %s member_id %d has received discards\n", + pool->poolname, srv_pool->member_id); + goto out; + } + } + + /* + * For sync requests, even if the server that is not in normal state has received + * the discard request, its dirty map is still outdated. However, non-sync + * requests can overlook this check and proceed discarding directly. + */ + if (cmd_msg->sync && atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL){ + pr_err("srv pool %s not in normal state for sync discard request\n", + pool->poolname); + err = -EINVAL; + goto out; + } + + err = rmr_srv_discard_id(pool, 0, 0, cmd_msg->send_discard_cmd.member_id, + cmd_msg->sync); + if (err) + pr_err("Failed to discard id\n"); + + break; + case RMR_CMD_STORE_CHECK: + pr_debug("%s: RMR_CMD_STORE_CHECK\n", __func__); + + work_info->rsp->value = rmr_srv_pool_check_store(pool); + pr_debug("pool %s member_id %d rsp with value=%llu\n", + pool->poolname, srv_pool->member_id, + work_info->rsp->value); + + break; + case RMR_CMD_MAP_GET_VER: + pr_debug("%s: RMR_CMD_MAP_GET_VER\n", __func__); + + work_info->rsp->value = pool->map_ver; + pr_debug("pool %s member_id %d rsp with value=%llu\n", + pool->poolname, srv_pool->member_id, + work_info->rsp->value); + + break; + case RMR_CMD_MAP_SET_VER: + pr_debug("%s: RMR_CMD_MAP_SET_VER\n", __func__); + + pool->map_ver = work_info->cmd_msg->set_map_ver_cmd.map_ver; + rmr_srv_mark_pool_md_dirty(srv_pool); + break; + case RMR_CMD_DISCARD_CLEAR_FLAG: + pr_info("%s: RMR_CMD_DISCARD_CLEAR_FLAG\n", __func__); + + md_i = rmr_pool_find_md(&pool->pool_md, cmd_msg->send_discard_cmd.member_id, false); + if (md_i < 0) { + pr_info("Didn't find md for member_id %u\n", + cmd_msg->send_discard_cmd.member_id); + goto out; + } + + pool->pool_md.srv_md[md_i].discard_entries = false; + rmr_srv_flush_pool_md(srv_pool); + break; + case RMR_CMD_USER: + pr_debug("%s: RMR_CMD_USER\n", __func__); + + err = process_msg_user_cmd(srv_pool, cmd_msg, work_info->data, work_info->datalen); + if (err) { + pr_err("process_msg_user_cmd failed with err %d\n", err); + goto out_no_rsp; + } + + goto out_no_rsp; + default: + pr_warn("%s: switch default type: %d\n", __func__, cmd_msg->cmd_type); + + err = -EINVAL; + } + +out: + work_info->rsp->err = err; + work_info->rsp->member_id = srv_pool->member_id; + work_info->rsp->cmd_type = cmd_msg->cmd_type; + +out_no_rsp: + // Should we return err in rdma_resp ? 
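+	/*
+	 * Commands that jump to out_no_rsp use the data buffer for payload
+	 * (map contents or the user command response) rather than a struct
+	 * rsp, so any error is reported only through the rtrs completion
+	 * status passed below.
+	 */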
+ pr_debug("send rtrs completion from msg_cmd_handler, err:%d\n", err); + rtrs_srv_resp_rdma(work_info->rtrs_op, err); + + rmr_put_srv_pool(srv_pool); + kfree(work_info); +} + +static int schedule_process_msg_cmd(struct rmr_srv_sess *srv_sess, + struct rtrs_srv_op *rtrs_op, + void *data, size_t datalen, + const void *msg, size_t len) +{ + struct rmr_srv_pool *srv_pool; + const struct rmr_msg_pool_cmd *cmd_msg = msg; + const char *poolname = cmd_msg->pool_name; + struct rmr_cmd_work_info *work_info; + u32 group_id = le32_to_cpu(cmd_msg->hdr.group_id); + + pr_debug("pool %s received cmd %d\n", + poolname, cmd_msg->cmd_type); + + srv_pool = rmr_find_and_get_srv_pool(group_id); + if (IS_ERR(srv_pool)) { + pr_err("Cmd %s: pool %s does not exists: %pe\n", + rmr_get_cmd_name(cmd_msg->cmd_type), poolname, srv_pool); + return PTR_ERR(srv_pool); + } + + pr_debug("process_msg_cmd: pool %s found\n", poolname); + + work_info = kzalloc(sizeof(struct rmr_cmd_work_info), GFP_KERNEL); + if (!work_info) { + pr_err("failed to allocate work info to send map\n"); + rmr_put_srv_pool(srv_pool); + return -ENOMEM; + } + work_info->pool = srv_pool->pool; + work_info->sess = srv_sess; + work_info->rtrs = srv_sess->rtrs; + work_info->rtrs_op = rtrs_op; + work_info->cmd_msg = cmd_msg; + work_info->rsp = data; + work_info->data = data; + work_info->datalen = datalen; + + INIT_WORK(&work_info->cmd_work, process_msg_cmd_handler); + schedule_work(&work_info->cmd_work); + + return 0; +} + +static int rmr_srv_rdma_ev(void *priv, struct rtrs_srv_op *id, + void *data, size_t datalen, + const void *usr, size_t usrlen) +{ + struct rmr_srv_sess *srv_sess = priv; + const struct rmr_msg_hdr *hdr = usr; + int ret = 0; + u16 type; + + if (unlikely(WARN_ON(!srv_sess))) + return -ENODEV; + + type = le16_to_cpu(hdr->type); + + switch (type) { + case RMR_MSG_IO: + return process_msg_io(srv_sess, id, data, datalen, + usr, usrlen); + case RMR_MSG_MAP_CLEAR: + ret = process_msg_map_clear(srv_sess, usr); + break; + case RMR_MSG_MAP_ADD: + ret = process_msg_map_add(srv_sess, usr); + break; + case RMR_MSG_CMD: + return schedule_process_msg_cmd(srv_sess, id, data, datalen, + usr, usrlen); + default: + pr_warn("Received unexpected message type %d from session %s\n", + type, srv_sess->sessname); + return -EINVAL; + } + + rtrs_srv_resp_rdma(id, ret); + + return 0; +} + +/** + * rmr_srv_check_params() - Check the parameters of the storage node + * + * @srv_pool: The rmr srv pool to check parameters for + * + * Description: + * Checks the device params with other connected server nodes. + * + * Return: + * 0 on success. + * -Negative error code on failure. + */ +int rmr_srv_check_params(struct rmr_srv_pool *srv_pool) +{ + void *dev; + int err; + + /* If the store has not been added to this server pool, ignore device param checks. */ + if (!srv_pool->io_store) + return 0; + + dev = srv_pool->io_store->priv; + err = srv_pool->io_store->ops->get_params(dev); + if (err) { + pr_err("%s: store get_params failed for pool %s, err %d\n", + __func__, srv_pool->pool->poolname, err); + return err; + } + return 0; +} +EXPORT_SYMBOL(rmr_srv_check_params); + +static struct rtrs_srv_ops rtrs_ops; +static int __init rmr_srv_init_module(void) +{ + int err; + + if (!is_power_of_2(chunk_size) || + chunk_size < MIN_CHUNK_SIZE || chunk_size > MAX_CHUNK_SIZE) { + pr_err("Loading module %s failed. 
Invalid chunk_size %u\n",
+		       KBUILD_MODNAME, chunk_size);
+		pr_err("Chunk size should be a power of 2, and between (min %u - max %u)\n",
+		       MIN_CHUNK_SIZE, MAX_CHUNK_SIZE);
+		return -EINVAL;
+	}
+
+	pr_info("Loading module %s, version %s, proto %s, chunk_size %u\n",
+		KBUILD_MODNAME, RMR_VER_STRING, RMR_PROTO_VER_STRING, chunk_size);
+
+	rtrs_ops = (struct rtrs_srv_ops){
+		.rdma_ev = rmr_srv_rdma_ev,
+		.link_ev = rmr_srv_link_ev,
+	};
+
+	rmr_req_cachep = kmem_cache_create("rmr_req_cachep", sizeof(struct rmr_srv_req),
+					   0, 0, NULL);
+	if (!rmr_req_cachep) {
+		pr_err("cannot allocate cachep for rmr_req\n");
+		err = -ENOMEM;
+		goto out;
+	}
+	rmr_map_entry_cachep = kmem_cache_create("rmr_map_entry_cachep",
+						 sizeof(struct rmr_map_entry),
+						 0, 0, NULL);
+	if (!rmr_map_entry_cachep) {
+		pr_err("cannot allocate cachep for rmr_map_entry\n");
+		err = -ENOMEM;
+		goto req_destroy;
+	}
+
+	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct rmr_map_cbuf_hdr) < RMR_POOL_MAX_SESS);
+
+	rtrs_ctx = rtrs_srv_open(&rtrs_ops, RTRS_PORT);
+	if (IS_ERR(rtrs_ctx)) {
+		err = PTR_ERR(rtrs_ctx);
+		pr_err("rtrs_srv_open(), err: %pe\n", rtrs_ctx);
+		goto map_destroy;
+	}
+
+	err = rmr_srv_create_sysfs_files();
+	if (err) {
+		pr_err("rmr_srv_create_sysfs_files(), err: %d\n", err);
+		goto srv_close;
+	}
+
+	return 0;
+
+srv_close:
+	rtrs_srv_close(rtrs_ctx);
+map_destroy:
+	kmem_cache_destroy(rmr_map_entry_cachep);
+req_destroy:
+	kmem_cache_destroy(rmr_req_cachep);
+out:
+	return err;
+}
+
+static void __exit rmr_srv_cleanup_module(void)
+{
+	struct rmr_pool *pool, *tmp;
+	struct rmr_srv_pool *srv_pool;
+
+	pr_info("Unloading module\n");
+
+	rtrs_srv_close(rtrs_ctx);
+
+	list_for_each_entry_safe (pool, tmp, &pool_list, entry) {
+		srv_pool = (struct rmr_srv_pool *)pool->priv;
+
+		WARN_ON(!list_empty(&pool->sess_list));
+		rmr_srv_destroy_pool(pool);
+		rmr_srv_destroy_pool_sysfs_files(pool, NULL);
+		rmr_put_srv_pool(srv_pool);
+	}
+
+	rmr_srv_destroy_sysfs_files();
+
+	/* Destroy the caches last, once all requests and map entries are freed. */
+	kmem_cache_destroy(rmr_map_entry_cachep);
+	kmem_cache_destroy(rmr_req_cachep);
+
+	pr_info("Module unloaded\n");
+}
+
+module_init(rmr_srv_init_module);
+module_exit(rmr_srv_cleanup_module);

From 1a42846c394bfa3b76c696816672a612432fb5a4 Mon Sep 17 00:00:00 2001
From: Md Haris Iqbal
Date: Tue, 5 May 2026 09:46:18 +0200
Subject: [PATCH 06/13] RDMA/rmr: server: sysfs interface functions

Add the server-side sysfs interface used to administer RMR server
pools and sessions, mirroring the client sysfs layout. Exposes
attributes for member ID, store state, map state and sync status, and
accepts administrative commands such as joining and leaving pool
members. The sysfs hierarchy lives under
/sys/devices/virtual/rmr-server/.

This file is not compiled until the modules are wired into the build
in a later patch in this series.

Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/infiniband/ulp/rmr/rmr-srv-sysfs.c | 1047 ++++++++++++++++++++ 1 file changed, 1047 insertions(+) create mode 100644 drivers/infiniband/ulp/rmr/rmr-srv-sysfs.c diff --git a/drivers/infiniband/ulp/rmr/rmr-srv-sysfs.c b/drivers/infiniband/ulp/rmr/rmr-srv-sysfs.c new file mode 100644 index 000000000000..2aa1e07235b8 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/rmr-srv-sysfs.c @@ -0,0 +1,1047 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Reliable multicast over RTRS (RMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include + +#include "rmr-srv.h" +#include "rmr-map.h" +#include "rmr-clt.h" + +#define MAX_POOL_ID 255 + +static struct class *rmr_dev_class; +static struct device *rmr_ctl_dev; +static struct device *rmr_pool_dev; + +static struct kobj_type rmr_srv_sess_ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +int rmr_srv_sysfs_add_sess(struct rmr_pool *pool, + struct rmr_srv_pool_sess *pool_sess) +{ + int ret; + + ret = kobject_init_and_add(&pool_sess->kobj, &rmr_srv_sess_ktype, + &pool->sessions_kobj, "%s", + pool_sess->sessname); + if (ret) + pr_err("Failed to add session %s into sysfs\n", + pool_sess->sessname); + + return ret; +} + +void rmr_srv_sysfs_del_sess(struct rmr_srv_pool_sess *pool_sess) +{ + kobject_del(&pool_sess->kobj); + kobject_put(&pool_sess->kobj); +} + +static ssize_t rmr_srv_member_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + return sprintf(page, "%d\n", srv_pool->member_id); +} + +static struct kobj_attribute rmr_srv_member_id_attr = + __ATTR(member_id, 0444, rmr_srv_member_id_show, NULL); + +static ssize_t rmr_srv_pool_blksize_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + /* TODO: introduce blksize for pool */ + return sprintf(page, "128k\n"); +} + +static struct kobj_attribute rmr_srv_pool_blksize_attr = + __ATTR(blksize, 0444, rmr_srv_pool_blksize_show, NULL); + +static ssize_t rmr_srv_leave_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +void rmr_srv_destroy_pool_sysfs_files(struct rmr_pool *pool, + const struct attribute *sysfs_self) +{ + if (pool->kobj.state_in_sysfs) { + WARN_ON(!list_empty(&pool->sess_list)); + kobject_del(&pool->sessions_kobj); + kobject_put(&pool->sessions_kobj); + if (sysfs_self) + sysfs_remove_file_self(&pool->kobj, sysfs_self); + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + } +} + +static ssize_t rmr_srv_leave_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (READ_ONCE(srv_pool->io_store)) { + pr_err("pool %s has a store registered\n", pool->poolname); + return -EINVAL; + } + + if (atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_EMPTY) { + pr_err("pool %s cannot leave: not in EMPTY state (state=%d)\n", + pool->poolname, atomic_read(&srv_pool->state)); + return -EINVAL; + } + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, 
attr->attr.name, buf); + return -EINVAL; + } + + if (srv_pool->clt) { + int err; + + err = rmr_srv_remove_clt_pool(srv_pool); + if (err) { + pr_err("pool %s failed to remove clt_pool\n", pool->poolname); + return -EINVAL; + } + } + pr_info("srv: Deleting pool '%s'\n", pool->poolname); + + rmr_srv_destroy_pool(pool); + rmr_srv_destroy_pool_sysfs_files(pool, &attr->attr); + rmr_put_srv_pool(srv_pool); + + return count; +} + +static struct kobj_attribute rmr_srv_leave_pool_attr = + __ATTR(leave_pool, 0644, rmr_srv_leave_pool_show, + rmr_srv_leave_pool_store); + +static ssize_t rmr_srv_pool_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool = container_of(kobj, struct rmr_pool, kobj); + struct rmr_dirty_id_map *map; + int i, lock_idx; + + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + rmr_map_dump_bitmap(map); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return 0; +} + +static ssize_t rmr_srv_pool_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + rmr_id_t id = { 0, 0 }; + int srv_id; + struct rmr_dirty_id_map *map; + + pool = container_of(kobj, struct rmr_pool, kobj); + if (sscanf(buf, "%llu %llu %d\n", &id.a, &id.b, &srv_id) != 3) { + pr_err("cannot parse id.a %s\n", buf); + return -EINVAL; + } + pr_debug("Add id (%llu, %llu), srv_id %d\n", id.a, id.b, srv_id); + + /* + * If given chunk number exceeds total chunks for us, ignore! + */ + if (id.b > pool->no_of_chunks) + return count; + + map = rmr_pool_find_map(pool, srv_id); + if (!map) { + pr_err("in pool %s cannot find map for srv_id %u\n", + pool->poolname, srv_id); + return -EINVAL; + } + + rmr_map_set_dirty(map, id, 0); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + pr_debug("insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + + return count; +} + +static struct kobj_attribute rmr_srv_pool_map_attr = + __ATTR(map, 0644, rmr_srv_pool_map_show, + rmr_srv_pool_map_store); + +static ssize_t rmr_srv_pool_map_ver_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + ssize_t written; + + pool = container_of(kobj, struct rmr_pool, kobj); + + written = scnprintf(page, PAGE_SIZE, "Map ver: %llu\n", pool->map_ver); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_map_ver_attr = + __ATTR(map_version, 0444, rmr_srv_pool_map_ver_show, NULL); + +static ssize_t rmr_srv_pool_last_io_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + ssize_t written = 0; + int i; + rmr_id_t *id; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + for (i = 0; i < srv_pool->queue_depth; i++) { + id = &srv_pool->last_io[i]; + + if (id->a == U64_MAX && id->b == U64_MAX) + continue; + + written += scnprintf(page + written, PAGE_SIZE - written, + "[%d]=(%llu,%llu) ", i, id->a, id->b); + } + if (written == 0) + written += scnprintf(page + written, PAGE_SIZE - written, + "(empty)"); + written += scnprintf(page + written, PAGE_SIZE - written, "\n"); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_last_io_attr = + __ATTR(last_io, 0644, rmr_srv_pool_last_io_show, NULL); + +static ssize_t rmr_srv_add_clt_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return 
scnprintf(page, PAGE_SIZE, "Usage: echo poolname > %s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_add_clt_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + struct rmr_pool *clt = NULL; + char name[NAME_MAX]; + int err; + struct rmr_attrs attrs; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (sscanf(buf, "%s", name) != 1) { + pr_err("cannot parse %s\n", buf); + return -EINVAL; + } + + clt = rmr_clt_open(NULL, NULL, name); + if (IS_ERR_OR_NULL(clt)) { + pr_err("cannot open pool %s err %ld\n", name, PTR_ERR(clt)); + return -EEXIST; + } + + pr_info("%s: Adding client pool %s, to server pool %s\n", + __func__, pool->poolname, clt->poolname); + + err = rmr_clt_query(clt, &attrs); + if (unlikely(err)) + goto close_rmr; + + if (!attrs.sync) { + pr_err("%s: Add clt called for non-sync rmr client pool %s\n", __func__, name); + err = -EINVAL; + goto close_rmr; + } + + srv_pool->max_sync_io_size = attrs.max_io_size; + + /* The sync client holds a pointer to its parent server pool. */ + srv_pool->clt = clt; + + /* Re-trigger md sync now that the sync path is available. */ + rmr_srv_mark_pool_md_dirty(srv_pool); + + /* + * Check if the device paramters of connected servers share the same values. + */ + err = rmr_srv_check_params(srv_pool); + if (err) + goto close_clt; + + return count; + +close_clt: + srv_pool->clt = NULL; + srv_pool->max_sync_io_size = 0; +close_rmr: + pr_err("%s: Adding client pool failed\n", __func__); + rmr_clt_close(clt); + return err; +} + +static struct kobj_attribute rmr_srv_add_clt_pool_attr = + __ATTR(add_clt, 0644, rmr_srv_add_clt_pool_show, + rmr_srv_add_clt_pool_store); + +static ssize_t rmr_srv_pool_sync_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + return scnprintf(page, PAGE_SIZE, "Usage: echo \"start|stop\" > /%s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_pool_sync_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int err = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!strncasecmp(buf, "start", 5)) { + /* + * Start + */ + if (atomic_read(&srv_pool->thread_state) != SYNC_THREAD_STOPPED) { + pr_info("For pool %s, sync thread already running\n", pool->poolname); + goto out; + } + + mutex_lock(&srv_pool->srv_pool_lock); + + if (!atomic_read(&srv_pool->store_state) && + atomic_read(&srv_pool->state) != RMR_SRV_POOL_STATE_NORMAL) { + pr_err("Pool %s not in working state. 
Sync thread start failed\n", + pool->poolname); + err = -EINVAL; + goto unlock_mutex; + } + + err = rmr_srv_sync_thread_start(srv_pool); + if (err) { + pr_err("For pool %s, rmr_srv_sync_thread_start Error %d\n", + pool->poolname, err); + goto unlock_mutex; + } + + mutex_unlock(&srv_pool->srv_pool_lock); + + } else if (!strncasecmp(buf, "stop", 4)) { + /* + * Stop + */ + if (atomic_read(&srv_pool->thread_state) == SYNC_THREAD_STOPPED) { + pr_info("For pool %s, sync thread already stopped\n", pool->poolname); + goto out; + } + + err = rmr_srv_sync_thread_stop(srv_pool); + if (err) { + pr_err("For pool %s, rmr_srv_sync_thread_stop Error %d\n", + pool->poolname, err); + goto err; + } + } else { + pr_err("Unknown value\n"); + err = -EINVAL; + goto err; + } + +out: + return count; + +unlock_mutex: + mutex_unlock(&srv_pool->srv_pool_lock); +err: + return err; +} + +static struct kobj_attribute rmr_srv_pool_sync_attr = + __ATTR(sync, 0644, rmr_srv_pool_sync_show, + rmr_srv_pool_sync_store); + +static ssize_t sync_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int state; + ssize_t written = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + state = atomic_read(&srv_pool->thread_state); + switch (state) { + case SYNC_THREAD_RUNNING: + written = sysfs_emit(page, "Running\n"); + break; + case SYNC_THREAD_STOPPED: + written = sysfs_emit(page, "Stopped\n"); + break; + case SYNC_THREAD_REQ_STOP: + written = sysfs_emit(page, "Request_to_stop\n"); + break; + case SYNC_THREAD_WAIT: + written = sysfs_emit(page, "Wait\n"); + break; + default: + written = sysfs_emit(page, "Unknown value %d\n", state); + break; + } + + return written; +} + +static struct kobj_attribute rmr_srv_pool_sync_state_attr = + __ATTR_RO(sync_state); + +static ssize_t rmr_srv_pool_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int state; + ssize_t written = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + state = atomic_read(&srv_pool->state); + + switch (state) { + case RMR_SRV_POOL_STATE_EMPTY: + written = sysfs_emit(page, "empty\n"); + + break; + case RMR_SRV_POOL_STATE_REGISTERED: + written = sysfs_emit(page, "registered\n"); + + break; + case RMR_SRV_POOL_STATE_CREATED: + written = sysfs_emit(page, "created\n"); + + break; + case RMR_SRV_POOL_STATE_NORMAL: + written = sysfs_emit(page, "normal\n"); + + break; + case RMR_SRV_POOL_STATE_NO_IO: + written = sysfs_emit(page, "no_io\n"); + + break; + default: + written = sysfs_emit(page, "Unknown value %d\n", state); + + break; + } + + written += sysfs_emit_at(page, written, "Maintenance mode: %d\n", + srv_pool->maintenance_mode); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_state_attr = + __ATTR(state, 0644, rmr_srv_pool_state_show, NULL); + +static ssize_t rmr_srv_remove_clt_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_remove_clt_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int err; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!sysfs_streq(buf, "1")) 
{ + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, buf); + return -EINVAL; + } + err = rmr_srv_remove_clt_pool(srv_pool); + if (err) { + pr_err("pool %s failed to remove clt_pool\n", pool->poolname); + return -EINVAL; + } + + return count; +} + +static struct kobj_attribute rmr_srv_remove_clt_pool_attr = + __ATTR(remove_clt, 0644, rmr_srv_remove_clt_pool_show, + rmr_srv_remove_clt_pool_store); + +static ssize_t rmr_srv_pool_test_map_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rmr_srv_pool_test_map_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + int err; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + pool->poolname, attr->attr.name, buf); + return -EINVAL; + } + + if (!srv_pool->clt) { + pr_err("pool %s no clt pool assigned to this rmr pool. cannot do map test.\n", + pool->poolname); + return -EINVAL; + } + + pr_info("pool %s start test map...\n", pool->poolname); + err = rmr_clt_test_map(pool, srv_pool->clt); + if (err) { + pr_err("pool %s, test map failed, err %d\n", + pool->poolname, err); + return err; + } + pr_info("pool %s test map done.", pool->poolname); + + return count; +} + +static struct kobj_attribute rmr_srv_pool_test_map_attr = + __ATTR(test_map, 0644, rmr_srv_pool_test_map_show, + rmr_srv_pool_test_map_store); + +static ssize_t rmr_srv_pool_metadata_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_pool_md *pool_md; + struct rmr_srv_md *srv_md; + int i; + ssize_t written = 0; + + pool = container_of(kobj, struct rmr_pool, kobj); + pool_md = &pool->pool_md; + + written += sysfs_emit_at(page, written, + "The metadata of %s is: group_id %u, chunk_size %u, " + "mapped_size %llu, queue_depth %u, " + "bitmap_offset %llu, bitmap_len %llu, " + "last_io_offset %llu, last_io_len %llu\n\n", + pool_md->poolname, pool_md->group_id, pool_md->chunk_size, + pool_md->mapped_size, pool_md->queue_depth, + rmr_bitmap_offset(pool_md->queue_depth), + rmr_bitmap_len(pool->no_of_chunks), + (u64)RMR_LAST_IO_OFFSET, + rmr_last_io_len(pool_md->queue_depth)); + written += sysfs_emit_at(page, written, + "The client pool: map_ver %llu\n\n", pool_md->map_ver); + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + srv_md = &pool_md->srv_md[i]; + if (!srv_md->member_id) + continue; + + written += sysfs_emit_at(page, written, "The server pool with member_id %u: " + "mapped_size %llu, store_state %u, " + "pool_state %u, map_update_state %u, " + "map_ver %llu, discard_entries %x.\n\n", + srv_md->member_id, srv_md->mapped_size, + srv_md->store_state, + srv_md->srv_pool_state, + srv_md->map_update_state, srv_md->map_ver, + srv_md->discard_entries); + } + + return written; +} + +static struct kobj_attribute rmr_srv_pool_metadata_attr = + __ATTR(metadata, 0444, rmr_srv_pool_metadata_show, NULL); + +static const char *map_update_state_str(enum srv_map_update_state state) +{ + switch (state) { + case MAP_UPDATE_STATE_DISABLED: + return "disabled"; + case MAP_UPDATE_STATE_READY: + return "ready"; + case MAP_UPDATE_STATE_DONE: + return "done"; + } + return "unknown"; +} + +static ssize_t rmr_srv_pool_map_update_state_show(struct kobject *kobj, + struct kobj_attribute 
*attr, + char *page) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + + pool = container_of(kobj, struct rmr_pool, kobj); + srv_pool = (struct rmr_srv_pool *)pool->priv; + + return sysfs_emit(page, "%s\n", map_update_state_str(srv_pool->map_update_state)); +} + +static struct kobj_attribute rmr_srv_pool_map_update_state_attr = + __ATTR(map_update_state, 0644, rmr_srv_pool_map_update_state_show, NULL); + +static ssize_t rmr_srv_pool_map_unsynced_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + ssize_t written = 0; + struct rmr_pool *pool; + struct rmr_dirty_id_map *map; + rmr_id_t id; + int i, j, lock_idx; + + pool = container_of(kobj, struct rmr_pool, kobj); + + id.a = 1; + lock_idx = srcu_read_lock(&pool->map_srcu); + for (i = 0; (i < RMR_POOL_MAX_SESS && written < PAGE_SIZE); i++) { + map = rcu_dereference(pool->maps[i]); + if (!map) + continue; + + written += sysfs_emit_at(page, written, "member_id : %d\n", map->member_id); + for (j = 0; j < map->no_of_chunks; j++) { + size_t len; + + id.b = j; + if (rmr_map_check_dirty(map, id) && + (map->bitmap_filter[id.b] & MAP_ENTRY_UNSYNCED)) { + len = sysfs_emit_at(page, written, "(%llu, %llu) ", + id.a, id.b); + if (!len) // break early if map is too big + break; + written += len; + } + } + written += sysfs_emit_at(page, written, "\n"); + } + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return written; +} + +static ssize_t rmr_srv_pool_map_unsynced_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + rmr_id_t id = { 0, 0 }; + int srv_id; + struct rmr_dirty_id_map *map; + + pool = container_of(kobj, struct rmr_pool, kobj); + if (sscanf(buf, "%llu %llu %d\n", &id.a, &id.b, &srv_id) != 3) { + pr_err("cannot parse id.a %s\n", buf); + return -EINVAL; + } + pr_debug("add id (%llu, %llu), srv_id %d\n", id.a, id.b, srv_id); + + map = rmr_pool_find_map(pool, srv_id); + if (!map) { + pr_err("in pool %s cannot find map for srv_id %u\n", + pool->poolname, srv_id); + return -EINVAL; + } + + rmr_map_set_dirty(map, id, MAP_ENTRY_UNSYNCED); + rmr_srv_mark_maps_dirty((struct rmr_srv_pool *)pool->priv); + pr_debug("insert id (%llu, %llu) srv_id %d\n", id.a, id.b, srv_id); + + return count; +} +static struct kobj_attribute rmr_srv_pool_map_unsynced_attr = + __ATTR(map_unsynced, 0644, rmr_srv_pool_map_unsynced_show, + rmr_srv_pool_map_unsynced_store); + +static ssize_t map_summary_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rmr_pool *pool; + int lock_idx; + int written; + + pool = container_of(kobj, struct rmr_pool, kobj); + + lock_idx = srcu_read_lock(&pool->map_srcu); + written = rmr_map_summary_format(pool, page, PAGE_SIZE); + srcu_read_unlock(&pool->map_srcu, lock_idx); + + return written; +} + +static struct kobj_attribute rmr_srv_pool_map_summary_attr = + __ATTR_RO(map_summary); + +static struct attribute *rmr_srv_pool_attrs[] = { + &rmr_srv_leave_pool_attr.attr, + &rmr_srv_member_id_attr.attr, + &rmr_srv_pool_blksize_attr.attr, + &rmr_srv_pool_map_attr.attr, + &rmr_srv_pool_map_ver_attr.attr, + &rmr_srv_pool_last_io_attr.attr, + &rmr_srv_add_clt_pool_attr.attr, + &rmr_srv_pool_sync_attr.attr, + &rmr_srv_pool_sync_state_attr.attr, + &rmr_srv_pool_state_attr.attr, + &rmr_srv_remove_clt_pool_attr.attr, + &rmr_srv_pool_test_map_attr.attr, + &rmr_srv_pool_metadata_attr.attr, + &rmr_srv_pool_map_update_state_attr.attr, + &rmr_srv_pool_map_unsynced_attr.attr, + &rmr_srv_pool_map_summary_attr.attr, + NULL, +}; 
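For readers not familiar with the macro used just below:
ATTRIBUTE_GROUPS(rmr_srv_pool) expands, roughly, to the snippet sketched
here (per include/linux/sysfs.h). The resulting group list is what
.default_groups in the ktype underneath consumes, so sysfs creates every
file listed in rmr_srv_pool_attrs[] automatically when the pool kobject
is added; the sketch is shown for illustration only.

/* Approximate expansion of ATTRIBUTE_GROUPS(rmr_srv_pool), for illustration */
static const struct attribute_group rmr_srv_pool_group = {
	.attrs = rmr_srv_pool_attrs,
};

static const struct attribute_group *rmr_srv_pool_groups[] = {
	&rmr_srv_pool_group,
	NULL,
};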
+ATTRIBUTE_GROUPS(rmr_srv_pool); + +static struct kobj_type rmr_srv_pool_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = rmr_srv_pool_groups, +}; + +static struct kobj_type ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +static int rmr_srv_create_pool_sysfs_files(struct rmr_pool *pool) +{ + int ret; + + ret = kobject_init_and_add(&pool->kobj, &rmr_srv_pool_ktype, + &rmr_pool_dev->kobj, "%s", pool->poolname); + if (ret) { + pr_err("Failed to create sysfs dir for pool '%s': %d\n", + pool->poolname, ret); + return ret; + } + + ret = kobject_init_and_add(&pool->sessions_kobj, &ktype, &pool->kobj, + "sessions"); + if (unlikely(ret)) { + pr_err("Failed to create sessions dir for pool '%s': %d\n", + pool->poolname, ret); + kobject_del(&pool->kobj); + kobject_put(&pool->kobj); + } + + return ret; +} + +/* remove new line from string */ +static void strip(char *s) +{ + char *p = s; + + while (*s != '\0') { + if (*s != '\n') + *p++ = *s++; + else + ++s; + } + *p = '\0'; +} + +enum rmr_srv_opts { + RMR_SRV_OPT_POOL_NAME, + RMR_SRV_OPT_MEMBER_ID, + RMR_JOIN_OPT_Mandatory_count, + RMR_SRV_OPT_ERR, +}; + +static const char * const rmr_srv_opts_mandatory_names[] = { + [RMR_SRV_OPT_POOL_NAME] = "poolname", + [RMR_SRV_OPT_MEMBER_ID] = "member_id", +}; + +static const match_table_t rmr_srv_opt_tokens = { + { RMR_SRV_OPT_POOL_NAME, "poolname=%s" }, + { RMR_SRV_OPT_MEMBER_ID, "member_id=%s" }, + { RMR_SRV_OPT_ERR, NULL }, +}; + +static int rmr_srv_parse_options(const char *buf, char *poolname, + u32 *member_id) +{ + char *options, *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token, ret = 0, i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + options = strstrip(options); + strip(options); + while ((p = strsep(&options, " ")) != NULL) { + if (!*p) + continue; + token = match_token(p, rmr_srv_opt_tokens, args); + opt_mask |= (1 << token); + + switch (token) { + case RMR_SRV_OPT_POOL_NAME: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("join_pool: name too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(poolname, p, NAME_MAX); + kfree(p); + break; + + case RMR_SRV_OPT_MEMBER_ID: + p = match_strdup(args); + + ret = kstrtou32(p, 0, member_id); + if (ret) { + pr_err("member_id isn't an integer: %d\n", ret); + kfree(p); + goto out; + } + + kfree(p); + break; + + default: + pr_err("join_pool: Unknown parameter or missing value" + " '%s'\n", p); + ret = -EINVAL; + goto out; + } + }; + + for (i = 0; i < RMR_JOIN_OPT_Mandatory_count; i++) { + if ((opt_mask & (1 << rmr_srv_opt_tokens[i].token))) { + ret = 0; + } else { + pr_err("join_pool: Mandatory parameter missing: %s\n", + rmr_srv_opts_mandatory_names[i]); + ret = -EINVAL; + break; + } + } + +out: + kfree(options); + return ret; +} + + +static ssize_t rmr_srv_join_pool_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rmr_pool *pool; + struct rmr_srv_pool *srv_pool; + char poolname[NAME_MAX]; + u32 member_id = UINT_MAX; + int err; + + err = rmr_srv_parse_options(buf, poolname, &member_id); + if (unlikely(err)) + return err; + + if (member_id > MAX_POOL_ID) { + pr_err("%s: member_id gt max allowed pools (%u > %u)\n", + __func__, member_id, MAX_POOL_ID); + return -EINVAL; + } + + if (member_id == 0) { + pr_err("%s: member_id is not allowed to be zero\n", __func__); + return -EINVAL; + } + + strip(poolname); + + pr_info("%s: Creating server pool with poolname %s, member_id 
%u\n", + __func__, poolname, member_id); + + srv_pool = rmr_create_srv_pool(poolname, member_id); + if (IS_ERR(srv_pool)) { + pr_err("failed to create srv pool %s\n", poolname); + return PTR_ERR(srv_pool); + } + + pool = rmr_create_pool(poolname, srv_pool); + if (IS_ERR(pool)) { + err = PTR_ERR(pool); + goto destroy_pool; + } + + srv_pool->pool = pool; + pool->is_clt = false; + rmr_srv_pool_update_params(pool); + + err = rmr_srv_create_pool_sysfs_files(pool); + if (err) { + pr_err("%s: pool %s failed to create sysfs files\n", __func__, pool->poolname); + goto destroy_pool; + } + + return count; + +destroy_pool: + rmr_put_srv_pool(srv_pool); + + return err; +} + +static ssize_t rmr_srv_join_pool_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo \"poolname= member_id= > %s\n", + attr->attr.name); +} + +static struct kobj_attribute rmr_srv_join_pool_attr = + __ATTR(join_pool, 0644, rmr_srv_join_pool_show, + rmr_srv_join_pool_store); + +static struct attribute *default_attrs[] = { + &rmr_srv_join_pool_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +int rmr_srv_create_sysfs_files(void) +{ + int err; + dev_t devt = MKDEV(0, 0); + + rmr_dev_class = class_create("rmr-server"); + if (IS_ERR(rmr_dev_class)) + return PTR_ERR(rmr_dev_class); + + rmr_ctl_dev = device_create(rmr_dev_class, NULL, devt, NULL, "ctl"); + if (IS_ERR(rmr_ctl_dev)) { + err = PTR_ERR(rmr_ctl_dev); + goto cls_destroy; + } + + rmr_pool_dev = device_create(rmr_dev_class, NULL, devt, NULL, "pools"); + if (IS_ERR(rmr_pool_dev)) { + err = PTR_ERR(rmr_pool_dev); + goto ctl_destroy; + } + + err = sysfs_create_group(&rmr_ctl_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto pool_destroy; + + return 0; + +pool_destroy: + device_unregister(rmr_pool_dev); +ctl_destroy: + device_unregister(rmr_ctl_dev); +cls_destroy: + class_destroy(rmr_dev_class); + + return err; +} + +void rmr_srv_destroy_sysfs_files(void) +{ + sysfs_remove_group(&rmr_ctl_dev->kobj, &default_attr_group); + device_unregister(rmr_pool_dev); + device_unregister(rmr_ctl_dev); + class_destroy(rmr_dev_class); +} From a3527883b7152a6168d9d72b93d222085037510f Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:19 +0200 Subject: [PATCH 07/13] RDMA/rmr: include client and server modules into kernel compilation Add the per-directory Kconfig and Makefile, and wire them into the parent drivers/infiniband Kconfig and drivers/infiniband/ulp Makefile so RMR can be enabled in a kernel build. Three Kconfig symbols are introduced: CONFIG_INFINIBAND_RMR (silent, selected by either side) CONFIG_INFINIBAND_RMR_CLIENT (depends on INFINIBAND_RTRS_CLIENT) CONFIG_INFINIBAND_RMR_SERVER (depends on INFINIBAND_RTRS_SERVER) The Makefile builds two modules: rmr-client.ko and rmr-server.ko, sharing the pool, map, request and library code added earlier in this series. 
Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/infiniband/Kconfig | 1 + drivers/infiniband/ulp/Makefile | 1 + drivers/infiniband/ulp/rmr/Kconfig | 35 +++++++++++++++++++++++++++++ drivers/infiniband/ulp/rmr/Makefile | 23 +++++++++++++++++++ 4 files changed, 60 insertions(+) create mode 100644 drivers/infiniband/ulp/rmr/Kconfig create mode 100644 drivers/infiniband/ulp/rmr/Makefile diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a7e3f29dc037..4b2470b5a592 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -110,5 +110,6 @@ source "drivers/infiniband/ulp/srpt/Kconfig" source "drivers/infiniband/ulp/iser/Kconfig" source "drivers/infiniband/ulp/isert/Kconfig" source "drivers/infiniband/ulp/rtrs/Kconfig" +source "drivers/infiniband/ulp/rmr/Kconfig" endif # INFINIBAND diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile index 51b0d41699b8..24c8e4b00065 100644 --- a/drivers/infiniband/ulp/Makefile +++ b/drivers/infiniband/ulp/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ obj-$(CONFIG_INFINIBAND_ISER) += iser/ obj-$(CONFIG_INFINIBAND_ISERT) += isert/ obj-$(CONFIG_INFINIBAND_RTRS) += rtrs/ +obj-$(CONFIG_INFINIBAND_RMR) += rmr/ diff --git a/drivers/infiniband/ulp/rmr/Kconfig b/drivers/infiniband/ulp/rmr/Kconfig new file mode 100644 index 000000000000..1d62322a02be --- /dev/null +++ b/drivers/infiniband/ulp/rmr/Kconfig @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config INFINIBAND_RMR + tristate + depends on INFINIBAND_ADDR_TRANS + +config INFINIBAND_RMR_CLIENT + tristate "RMR client module" + depends on INFINIBAND_ADDR_TRANS + depends on INFINIBAND_RTRS_CLIENT + select INFINIBAND_RMR + help + Reliable Multicast over RTRS (RMR) client module. + + RMR is an RDMA ULP that provides active-active block-level + replication on top of the RTRS transport. It guarantees + delivery of an I/O to a group of storage nodes and handles + resynchronization of data between storage nodes without + involving the compute client. This option builds the client + side, intended to be used by an upper-layer initiator such + as BRMR. + + If unsure, say N. + +config INFINIBAND_RMR_SERVER + tristate "RMR server module" + depends on INFINIBAND_ADDR_TRANS + depends on INFINIBAND_RTRS_SERVER + select INFINIBAND_RMR + help + RMR server module processing connection, IO and replication + requests from RMR clients on top of RTRS. It will pass IO + requests to its consumer, e.g. BRMR_server. + + If unsure, say N. 
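The client option above mentions an upper-layer initiator such as BRMR.
A hedged sketch of how such a consumer attaches to an RMR pool follows,
assuming only the public rmr.h interface as it is used elsewhere in this
series (rmr_clt_open(), rmr_clt_query(), rmr_clt_close());
example_attach_pool() and the "mypool" name are placeholders.

#include <linux/err.h>
#include <linux/printk.h>

#include "rmr.h"

static int example_attach_pool(void)
{
	struct rmr_pool *pool;
	struct rmr_attrs attrs;
	int err;

	/* First two arguments passed as NULL, as in rmr_srv_add_clt_pool_store() */
	pool = rmr_clt_open(NULL, NULL, "mypool");
	if (IS_ERR_OR_NULL(pool))
		return pool ? PTR_ERR(pool) : -ENODEV;

	err = rmr_clt_query(pool, &attrs);
	if (err) {
		rmr_clt_close(pool);
		return err;
	}

	pr_info("pool opened, max_io_size %u\n", attrs.max_io_size);

	/* Keep the pool open for I/O; call rmr_clt_close() on teardown. */
	return 0;
}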
diff --git a/drivers/infiniband/ulp/rmr/Makefile b/drivers/infiniband/ulp/rmr/Makefile new file mode 100644 index 000000000000..c173092f4cf2 --- /dev/null +++ b/drivers/infiniband/ulp/rmr/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +ccflags-y := -I$(srctree)/drivers/infiniband/ulp/rtrs + +CFLAGS_rmr-clt-trace.o = -I$(src) + +rmr-client-y := rmr-pool.o \ + rmr-clt.o \ + rmr-map-mgmt.o \ + rmr-clt-stats.o \ + rmr-clt-sysfs.o \ + rmr-map.o \ + rmr-clt-trace.o + +rmr-server-y := rmr-pool.o \ + rmr-srv.o \ + rmr-srv-md.o \ + rmr-srv-sysfs.o \ + rmr-req.o \ + rmr-map.o + +obj-$(CONFIG_INFINIBAND_RMR_CLIENT) += rmr-client.o +obj-$(CONFIG_INFINIBAND_RMR_SERVER) += rmr-server.o From 257ece095c5b67daf90380bda41d403b2d41df3a Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:20 +0200 Subject: [PATCH 08/13] block/brmr: add private headers with brmr protocol structs and helpers Block device over RMR (BRMR) is an upper-layer block driver that sits on top of the RMR ULP and exposes a standard Linux block device (/dev/brmrX) backed by an RMR pool. Add the BRMR private headers: brmr-proto.h wire-protocol structs exchanged between client and server outside of the rmr-clt/rmr-srv command channel. brmr-clt.h client-side data structures: per-pool tag set, per-CPU requeue queues, per-device statistics and gendisk state. brmr-srv.h server-side data structures: brmr_srv_blk_dev backing store description, on-disk metadata header layout and state-bit helpers. These files are not compiled until the modules are wired into the build in a later patch in this series. Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/block/brmr/brmr-clt.h | 299 ++++++++++++++++++++++++++++++++ drivers/block/brmr/brmr-proto.h | 121 +++++++++++++ drivers/block/brmr/brmr-srv.h | 133 ++++++++++++++ 3 files changed, 553 insertions(+) create mode 100644 drivers/block/brmr/brmr-clt.h create mode 100644 drivers/block/brmr/brmr-proto.h create mode 100644 drivers/block/brmr/brmr-srv.h diff --git a/drivers/block/brmr/brmr-clt.h b/drivers/block/brmr/brmr-clt.h new file mode 100644 index 000000000000..1482c7517ee8 --- /dev/null +++ b/drivers/block/brmr/brmr-clt.h @@ -0,0 +1,299 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef BRMR_PRI_H +#define BRMR_PRI_H + +#include +#include +#include "rmr-pool.h" + +#include "brmr-proto.h" + +#define BRMR_VER_MAJOR 0 +#define BRMR_VER_MINOR 1 + +#ifndef BRMR_VER_STRING +#define BRMR_VER_STRING __stringify(BRMR_VER_MAJOR) "." 
\ + __stringify(BRMR_VER_MINOR) +#endif + +#define BRMR_LINK_NAME "block" + +#ifdef CONFIG_ARCH_NO_SG_CHAIN +#define BRMR_INLINE_SG_CNT 0 +#else +#define BRMR_INLINE_SG_CNT 2 +#endif +#define BRMR_RDMA_SGL_SIZE (sizeof(struct scatterlist) * BRMR_INLINE_SG_CNT) + +enum brmr_dev_state { + DEV_STATE_INIT, + DEV_STATE_READY, + DEV_STATE_DISCONNECTED, + DEV_STATE_CLOSING, +}; + +struct brmr_clt_iu { + struct request *rq; + struct rmr_iu *rmr_iu; + struct brmr_clt_dev *dev; + blk_status_t status; + struct sg_table sgt; + struct scatterlist sgl[]; +}; + +struct brmr_queue { + struct list_head requeue_list; + unsigned long in_list; + struct blk_mq_hw_ctx *hctx; +}; + +struct brmr_cpu_qlist { + struct list_head requeue_list; + spinlock_t requeue_lock; + unsigned int cpu; +}; + +struct brmr_clt_pool { + struct list_head list; + struct rmr_pool *rmr; + wait_queue_head_t rmr_waitq; + bool rmr_ready; + int queue_depth; + u32 max_io_size; + u32 chunk_size; + u32 max_segments; + struct brmr_cpu_qlist __percpu + *cpu_queues; + DECLARE_BITMAP(cpu_queues_bm, NR_CPUS); + int __percpu *cpu_rr; /* per-cpu var for CPU round-robin */ + atomic_t busy; + struct blk_mq_tag_set tag_set; + struct mutex lock; /* protects state and devs_list */ + struct list_head devs_list; /* list of struct brmr_clt_dev */ + refcount_t refcount; + char poolname[NAME_MAX]; +}; + +/** + * Statistic of requests submitted to the rmr-clt layer. + * This means total number of requests received from blk + * is cnt_whole+(cnt_split/2) + * while total number submitted to rmr-clt is cnt_whole+cnt_split + */ +struct brmr_stats_rq { + struct { + u64 cnt_whole; + u64 cnt_split; + u64 total_sectors; + } dir[2]; +}; + +#define STATS_SIZES_NUM 16 + +struct brmr_stats_sizes { + struct { + u64 cnt_whole[STATS_SIZES_NUM]; + u64 cnt_left[STATS_SIZES_NUM]; + u64 cnt_right[STATS_SIZES_NUM]; + } dir[2]; +}; + +struct brmr_stats_sts_resource { + u64 get_iu; + u64 get_iu2; + u64 clt_request1; + u64 clt_request; +}; + +struct brmr_stats_pcpu { + + struct brmr_stats_rq submitted_requests; + struct brmr_stats_sizes request_sizes; + struct brmr_stats_sts_resource sts_resource; +}; + +struct brmr_clt_stats { + struct brmr_stats_pcpu __percpu *pcpu_stats; +}; + +struct brmr_clt_dev { + struct brmr_clt_pool *pool; + struct request_queue *queue; + struct brmr_queue *hw_queues; + u32 idx; + enum brmr_dev_state dev_state; + bool read_only; + bool map_incomplete; + u64 size_sect; /* device size in sectors */ + struct list_head list; + struct brmr_clt_stats stats; + struct gendisk *gd; + struct kobject kobj; + struct kobject kobj_stats; + char blk_symlink_name[NAME_MAX]; + refcount_t refcount; + struct work_struct unmap_on_rmmod_work; + bool wc; + bool fua; + + /* + * Params holding block device related info + */ + u32 max_hw_sectors; + u32 max_write_zeroes_sectors; + u32 max_discard_sectors; + u32 discard_granularity; + u32 discard_alignment; + u16 physical_block_size; + u16 logical_block_size; + u16 max_segments; + u16 secure_discard; + u8 cache_policy; +}; + +#define BRMR_HEADER_MAGIC_TOKEN 0x312631494f4e4f53 + +#define BRMR_HEADER_VERSION_INITIAL 1 +#define BRMR_CURRENT_HEADER_VERSION BRMR_HEADER_VERSION_INITIAL + +static inline enum rmr_io_flags rq_to_rmr_flags(struct request *rq) +{ + enum rmr_io_flags rmr_flag; + + switch (req_op(rq)) { + case REQ_OP_READ: + rmr_flag = RMR_OP_READ; + break; + case REQ_OP_WRITE: + rmr_flag = RMR_OP_WRITE; + break; + case REQ_OP_DISCARD: + rmr_flag = RMR_OP_DISCARD; + break; + case REQ_OP_WRITE_ZEROES: + rmr_flag = 
RMR_OP_WRITE_ZEROES; + break; + case REQ_OP_FLUSH: + rmr_flag = RMR_OP_FLUSH; + break; +/* TODO + case REQ_OP_SECURE_ERASE: + rmr_flag = IBNBD_OP_SECURE_ERASE; + break; +*/ + default: + WARN(1, "Unknown request type %d (flags %u)\n", + req_op(rq), rq->cmd_flags); + rmr_flag = 0; + } + + /* Set sync flag for write request. */ + if (op_is_sync(rq->cmd_flags)) + rmr_flag |= RMR_F_SYNC; + + if (op_is_flush(rq->cmd_flags)) + rmr_flag |= RMR_F_FUA; + + return rmr_flag; +} + +static inline u32 brmr_pool_chunk_size(struct brmr_clt_pool *pool) +{ + return pool->chunk_size; +} + +struct brmr_clt_dev *brmr_clt_map_device(const char *pool, u64 size); +int brmr_clt_close_device(struct brmr_clt_dev *dev, const struct attribute *sysfs_self); + +void brmr_clt_put_dev(struct brmr_clt_dev *dev); + +struct brmr_clt_dev *find_and_get_device(const char *name); + +/* brmr-sysfs.c */ + +int brmr_clt_create_sysfs_files(void); +void brmr_clt_destroy_sysfs_files(void); + +void brmr_clt_destroy_dev_sysfs_files(struct brmr_clt_dev *dev, + const struct attribute *sysfs_self); + +/* brmr-reque.c */ + +bool brmr_add_to_requeue(struct brmr_clt_pool *pool, struct brmr_queue *q); +void brmr_requeue_requests(struct brmr_clt_pool *pool); +void brmr_init_cpu_qlists(struct brmr_cpu_qlist __percpu *cpu_queues); + +/* brmr-stats.c */ + +int brmr_clt_init_stats(struct brmr_clt_stats *stats); +void brmr_clt_free_stats(struct brmr_clt_stats *stats); + +int brmr_clt_reset_submitted_req(struct brmr_clt_stats *stats, bool enable); +int brmr_clt_reset_req_sizes(struct brmr_clt_stats *stats, bool enable); +int brmr_clt_reset_sts_resource(struct brmr_clt_stats *stats, bool enable); + +/** + * size: size of the request submitted in bytes + * split: 0 when request from blk is submitted to rmr-clt as 1 + * 1 if it is one part of the split from a blk request + */ +void brmr_update_stats(struct brmr_clt_stats *stats, size_t size, int split, int d); + +/** + * which: at which place is BLK_STS_RESOURCE returned? 
+ */ +void brmr_clt_update_sts_resource(struct brmr_clt_stats *stats, int which); + +ssize_t brmr_clt_stats_sizes_to_str(struct brmr_clt_stats *stats, char *page, size_t len); + +ssize_t brmr_clt_stats_rq_to_str(struct brmr_clt_stats *stats, char *page, size_t len); + +ssize_t brmr_stats_sts_resource_to_str( + struct brmr_clt_stats *stats, char *page, size_t len); + +ssize_t brmr_stats_sts_resource_per_cpu_to_str( + struct brmr_clt_stats *stats, char *page, size_t len); + +#define STAT_STORE_FUNC(type, store, reset) \ +static ssize_t store##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + int ret = -EINVAL; \ + type *dev = container_of(kobj, type, kobj_stats); \ + \ + if (sysfs_streq(buf, "1")) \ + ret = reset(&dev->stats, true); \ + else if (sysfs_streq(buf, "0")) \ + ret = reset(&dev->stats, false); \ + if (ret) \ + return ret; \ + \ + return count; \ +} + +#define STAT_SHOW_FUNC(type, show, print) \ +static ssize_t show##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + type *dev = container_of(kobj, type, kobj_stats); \ + \ + return print(&dev->stats, page, PAGE_SIZE); \ +} + +#define STAT_ATTR(type, stat, print, reset) \ +STAT_STORE_FUNC(type, stat, reset) \ +STAT_SHOW_FUNC(type, stat, print) \ +static struct kobj_attribute stat##_attr = \ + __ATTR(stat, 0644, \ + stat##_show, \ + stat##_store) + +#endif /* BRMR_PRI_H */ diff --git a/drivers/block/brmr/brmr-proto.h b/drivers/block/brmr/brmr-proto.h new file mode 100644 index 000000000000..c5f0f25a5eb7 --- /dev/null +++ b/drivers/block/brmr/brmr-proto.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#define BRMR_PROTO_VER_MAJOR 0 +#define BRMR_PROTO_VER_MINOR 1 + +#define BRMR_CMD_RSP_MAGIC 0xDEADF00D + +struct brmr_blk_dev_params { + /* + * Params holding block device related info + */ + __le32 max_hw_sectors; + __le32 max_write_zeroes_sectors; + __le32 max_discard_sectors; + __le32 discard_granularity; + __le32 discard_alignment; + __le16 physical_block_size; + __le16 logical_block_size; + __le16 max_segments; + __le16 secure_discard; + u8 cache_policy; +}; + +enum brmr_msg_type { + BRMR_MSG_IO, + BRMR_MSG_CMD, +}; + +struct brmr_msg_hdr { + __le16 type; + __le16 __padding; +}; + +enum brmr_msg_cmd_type { + BRMR_CMD_MAP, // 0 + BRMR_CMD_REMAP, + + BRMR_CMD_UNMAP, + BRMR_CMD_GET_PARAMS, + + /* + * Add new command types above this. + */ + BRMR_CMD_RSP, +}; + +struct brmr_msg_map_new_cmd { + struct brmr_blk_dev_params dev_params; + + u32 version; /* version of the header itself */ + u64 mapped_size; /* size in 512 byte blocks of this device */ +}; + +struct brmr_msg_cmd { + struct brmr_msg_hdr hdr; + u8 ver; + u8 cmd_type; + u8 rsvd[2]; + union { + struct brmr_msg_map_new_cmd map_new_cmd; + /* May be other command(s) later */ + }; +}; + +/** + * struct brmr_cmd_get_params_rsp - response message to BRMR_CMD_GET_PARAMS + * @hdr: message header + * @nsectors: number of sectors in the usual 512b unit + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * @max_write_zeroes_sectors: max sectors for WRITE ZEROES in the 512b unit + * @max_discard_sectors: max. sectors that can be discarded at once in 512b + * unit. 
+ * @discard_granularity: size of the internal discard allocation unit in bytes + * @discard_alignment: offset from internal allocation assignment in bytes + * @physical_block_size: physical block size device supports in bytes + * @logical_block_size: logical block size device supports in bytes + * @max_segments: max segments hardware support in one transfer + * @secure_discard: supports secure discard + * @cache_policy: support write-back caching or FUA? + */ +struct brmr_cmd_get_params_rsp { + struct brmr_blk_dev_params dev_params; + + /* + * Params holding brmr device related info + */ + u8 mapped; + __le64 mapped_size; +}; + +struct brmr_msg_cmd_rsp { + struct brmr_msg_hdr hdr; + u64 magic; + u8 ver; + u8 cmd_type; + u8 status; + u8 rsvd[1]; + union { + struct brmr_cmd_get_params_rsp get_params_rsp; + //any other command responces. + }; +}; + +struct brmr_cmd_priv { + void *dev; + u8 cmd_type; + void *rsp_buf; + size_t rsp_buf_len; + int errno; + struct completion complete_done; +}; + +enum brmr_cache_policy { + BRMR_FUA = 1 << 0, + BRMR_WRITEBACK = 1 << 1, +}; diff --git a/drivers/block/brmr/brmr-srv.h b/drivers/block/brmr/brmr-srv.h new file mode 100644 index 000000000000..4180ee600e65 --- /dev/null +++ b/drivers/block/brmr/brmr-srv.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#ifndef BRMR_SRV_H +#define BRMR_SRV_H + +#include +#include +#include +#include + +#include "brmr-proto.h" +#include "rmr-req.h" + +#define BRMR_SERVER_VER_MAJOR 0 +#define BRMR_SERVER_VER_MINOR 1 + +#ifndef BRMR_SERVER_VER_STRING +#define BRMR_SERVER_VER_STRING __stringify(BRMR_SERVER_VER_MAJOR) "." \ + __stringify(BRMR_SERVER_VER_MINOR) +#endif + +#define DEFAULT_BLK_OPEN_FLAGS (BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_EXCL) + +#define BRMR_BLK_STORE_MAGIC 0xC0FFEE +#define BLK_STR_MD_SIZE PAGE_SIZE +#define BLK_STR_MD_SIZE_SECTORS (PAGE_SIZE / SECTOR_SIZE) +#define BLK_STR_MIN_MAPPED_SIZE (PAGE_SIZE + BLK_STR_MD_SIZE) + +extern struct list_head store_list; +extern struct mutex store_mutex; + +extern struct rmr_srv_store_ops pstore_blk_ops; +extern struct kobject *rmr_strs_kobj; + +/* brmr server */ + +enum brmr_srv_store_state { + BRMR_SRV_STORE_OPEN, + BRMR_SRV_STORE_MAPPED, + BRMR_SRV_STORE_NEED_SYNC, +}; + +struct brmr_srv_io_priv { + struct brmr_srv_blk_dev *dev; + void *priv; +}; + +struct rmr_blk_dev_params { + u32 max_hw_sectors; + u32 max_write_zeroes_sectors; + u32 max_discard_sectors; + u32 discard_granularity; + u32 discard_alignment; + u16 physical_block_size; + u16 logical_block_size; + u16 max_segments; + u16 secure_discard; + u8 cache_policy; +}; + +struct brmr_srv_blk_dev { + char poolname[NAME_MAX]; + struct block_device *bdev; + struct file *bdev_file; + struct list_head entry; + char name[BDEVNAME_SIZE]; + struct rmr_pool *pool; + u64 mapped_size; /* in sectors */ + u64 dev_size; /* in sectors */ + struct rmr_blk_dev_params dev_params; + struct kmem_cache *io_priv_cache; + struct kobject kobj; + unsigned long state; + struct completion comp; + struct percpu_ref kref; +}; + +struct brmr_srv_blk_dev_meta { + char poolname[NAME_MAX]; + struct rmr_blk_dev_params dev_params; + u64 magic; /* magic token to identify a header */ + u32 version; /* version of the header itself */ + u64 dev_size; + u64 mapped_size; + u64 state; + u64 offset; + u64 ts; +} __packed; + +int brmr_srv_blk_validate_md(struct brmr_srv_blk_dev *dev, struct brmr_srv_blk_dev_meta *meta); +struct brmr_srv_blk_dev 
*brmr_srv_blk_create(const char *path, char *name); +void brmr_srv_blk_destroy(struct brmr_srv_blk_dev *dev); +int brmr_srv_blk_open(struct brmr_srv_blk_dev *dev, const char *path, bool create, bool replace); +void brmr_srv_blk_close(struct brmr_srv_blk_dev *dev, bool delete); + +int brmr_srv_read_and_check_md(struct brmr_srv_blk_dev *dev, void *md_page); + +static inline void brmr_srv_blk_set_state(struct brmr_srv_blk_dev *dev, + enum brmr_srv_store_state state) +{ + set_bit(state, &dev->state); +} + +static inline void brmr_srv_blk_clear_state(struct brmr_srv_blk_dev *dev, + enum brmr_srv_store_state state) +{ + clear_bit(state, &dev->state); +} + +static inline int brmr_srv_blk_get_ref(struct brmr_srv_blk_dev *dev) +{ + return percpu_ref_tryget(&dev->kref); +} + +static inline void brmr_srv_blk_put_ref(struct brmr_srv_blk_dev *dev) +{ + percpu_ref_put(&dev->kref); +} + + +/* brmr-server-sysfs.c */ + +int brmr_srv_create_sysfs_files(void); +void brmr_srv_destroy_sysfs_files(void); +void blk_str_destroy_sysfs_files(struct brmr_srv_blk_dev *dev, + const struct attribute *sysfs_self); + +#endif /* BRMR_SRV_H */ From 3927c7ea37c7fcb4b50ef8de98f45a957cbccb92 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:21 +0200 Subject: [PATCH 09/13] block/brmr: client: main functionality Add the BRMR client implementation: brmr-clt.c client core: gendisk and tag-set creation per pool, blk-mq queue_rq() submitting block IOs to the underlying RMR pool, queue limit setup (chunk size, write-zeroes, discard, write-cache and FUA features) and device tear-down. brmr-clt-reque.c per-CPU requeue queues used to retry IOs temporarily blocked on RMR resource exhaustion. brmr-clt-stats.c per-pool statistics counters (request size distribution, BLK_STS_RESOURCE returns). These files are not compiled until the modules are wired into the build in a later patch in this series. 
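To make the submission flow described above concrete, here is a
simplified sketch of a queue_rq handler following the same pattern. It
is not the actual brmr_queue_rq(); brmr_clt_get_iu(), brmr_clt_submit()
and the use of hctx->driver_data are stand-ins assumed for illustration.

#include <linux/blk-mq.h>

#include "brmr-clt.h"

static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
				     const struct blk_mq_queue_data *bd)
{
	struct brmr_clt_dev *dev = hctx->queue->queuedata;
	struct brmr_queue *q = hctx->driver_data;	/* assumed set at init_hctx time */
	struct brmr_clt_iu *iu = blk_mq_rq_to_pdu(bd->rq);

	if (dev->dev_state != DEV_STATE_READY)
		return BLK_STS_IOERR;

	blk_mq_start_request(bd->rq);

	/* No free RMR I/O unit: park this hctx on the per-CPU requeue list */
	if (!brmr_clt_get_iu(dev, iu)) {
		if (brmr_add_to_requeue(dev->pool, q))
			return BLK_STS_RESOURCE;	/* rerun once an IU is put back */
		/* Pool went idle under us: rerun this queue ourselves shortly */
		blk_mq_delay_run_hw_queue(hctx, 10);
		return BLK_STS_RESOURCE;
	}

	return brmr_clt_submit(dev, iu) ? BLK_STS_RESOURCE : BLK_STS_OK;
}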
Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/block/brmr/brmr-clt-reque.c | 228 +++++ drivers/block/brmr/brmr-clt-stats.c | 332 ++++++++ drivers/block/brmr/brmr-clt.c | 1222 +++++++++++++++++++++++++++ 3 files changed, 1782 insertions(+) create mode 100644 drivers/block/brmr/brmr-clt-reque.c create mode 100644 drivers/block/brmr/brmr-clt-stats.c create mode 100644 drivers/block/brmr/brmr-clt.c diff --git a/drivers/block/brmr/brmr-clt-reque.c b/drivers/block/brmr/brmr-clt-reque.c new file mode 100644 index 000000000000..252661486a0a --- /dev/null +++ b/drivers/block/brmr/brmr-clt-reque.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "brmr-clt.h" +#include "rmr.h" +#include "rmr-pool.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(BRMR_VER_STRING); +MODULE_DESCRIPTION("BRMR Block Device using RMR cluster"); +MODULE_LICENSE("GPL"); + +static inline void brmr_requeue(struct brmr_queue *q) +{ + if (WARN_ON(!q->hctx)) + return; + + /* We can come here from interrupt, thus async=true */ + blk_mq_run_hw_queue(q->hctx, true); +} + +/** + * requeue implementation as used by ibnbd + */ + +void brmr_init_cpu_qlists(struct brmr_cpu_qlist __percpu *cpu_queues) +{ + unsigned int cpu; + struct brmr_cpu_qlist *cpu_q; + + for_each_possible_cpu(cpu) { + cpu_q = per_cpu_ptr(cpu_queues, cpu); + + cpu_q->cpu = cpu; + INIT_LIST_HEAD(&cpu_q->requeue_list); + spin_lock_init(&cpu_q->requeue_lock); + } +} + +/** + * brmr_get_cpu_qlist() - finds a list with HW queues to be requeued + * + * Description: + * Each CPU has a list of HW queues, which needs to be requeed. If a list + * is not empty - it is marked with a bit. This function finds first + * set bit in a bitmap and returns corresponding CPU list. + */ +static struct brmr_cpu_qlist * +brmr_get_cpu_qlist(struct brmr_clt_pool *pool, int cpu) +{ + int bit; + + /* First half */ + bit = find_next_bit(pool->cpu_queues_bm, nr_cpu_ids, cpu); + if (bit < nr_cpu_ids) { + return per_cpu_ptr(pool->cpu_queues, bit); + } else if (cpu != 0) { + /* Second half */ + bit = find_next_bit(pool->cpu_queues_bm, cpu, 0); + if (bit < cpu) + return per_cpu_ptr(pool->cpu_queues, bit); + } + + return NULL; +} + +static inline int nxt_cpu(int cpu) +{ + return (cpu + 1) % nr_cpu_ids; +} + +/** + * brmr_requeue_if_needed() - requeue if CPU queue is marked as non empty + * + * Description: + * Each CPU has it's own list of HW queues, which should be requeued. + * Function finds such list with HW queues, takes a list lock, picks up + * the first HW queue out of the list and requeues it. + * + * Return: + * True if the queue was requeued, false otherwise. + * + * Context: + * Does not matter. + */ +static inline bool brmr_requeue_if_needed(struct brmr_clt_pool *pool) +{ + struct brmr_queue *q = NULL; + struct brmr_cpu_qlist *cpu_q; + unsigned long flags; + int *cpup; + + /* + * To keep fairness and not to let other queues starve we always + * try to wake up someone else in round-robin manner. That of course + * increases latency but queues always have a chance to be executed. 
+ */ + cpup = get_cpu_ptr(pool->cpu_rr); + for (cpu_q = brmr_get_cpu_qlist(pool, nxt_cpu(*cpup)); cpu_q; + cpu_q = brmr_get_cpu_qlist(pool, nxt_cpu(cpu_q->cpu))) { + if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags)) + continue; + if (likely(test_bit(cpu_q->cpu, pool->cpu_queues_bm))) { + q = list_first_entry_or_null(&cpu_q->requeue_list, + typeof(*q), requeue_list); + if (WARN_ON(!q)) + goto clear_bit; + list_del_init(&q->requeue_list); + clear_bit_unlock(0, &q->in_list); + + if (list_empty(&cpu_q->requeue_list)) { + /* Clear bit if nothing is left */ +clear_bit: + clear_bit(cpu_q->cpu, pool->cpu_queues_bm); + } + } + spin_unlock_irqrestore(&cpu_q->requeue_lock, flags); + + if (q) + break; + } + + /** + * Saves the CPU that is going to be requeued on the per-cpu var. Just + * incrementing it doesn't work because brmr_get_cpu_qlist() will + * always return the first CPU with something on the queue list when the + * value stored on the var is greater than the last CPU with something + * on the list. + */ + if (cpu_q) + *cpup = cpu_q->cpu; + put_cpu_var(pool->cpu_rr); + + if (q) + brmr_requeue(q); + + return !!q; +} + +/** + * brmr_requeue_requests() - requeue all queues left in the list if + * brmr_clt_pool is idling (there are no requests in-flight). + * + * Description: + * This function tries to rerun all stopped queues if there are no + * requests in-flight anymore. This function tries to solve an obvious + * problem, when number of tags < than number of queues (hctx), which + * are stopped and put to sleep. If last tag, which has been just put, + * does not wake up all left queues (hctxs), IO requests hang forever. + * + * That can happen when all number of tags, say N, have been exhausted + * from one CPU, and we have many block devices per session, say M. + * Each block device has it's own queue (hctx) for each CPU, so eventually + * we can put that number of queues (hctxs) to sleep: M x nr_cpu_ids. + * If number of tags N < M x nr_cpu_ids finally we will get an IO hang. + * + * To avoid this hang last caller of brmr_put_iu() (last caller is the + * one who observes pool->busy == 0) must wake up all remaining queues. + * + * Context: + * Called from msg_io_conf which in turn is a completion handler + * that is called from interupt. + */ +void brmr_requeue_requests(struct brmr_clt_pool *pool) +{ + bool requeued; + + do { + requeued = brmr_requeue_if_needed(pool); + } while (atomic_read(&pool->busy) == 0 && requeued); +} + +bool brmr_add_to_requeue(struct brmr_clt_pool *pool, struct brmr_queue *q) +{ + struct brmr_cpu_qlist *cpu_q; + unsigned long flags; + bool added = true; + bool need_set; + + cpu_q = get_cpu_ptr(pool->cpu_queues); + spin_lock_irqsave(&cpu_q->requeue_lock, flags); + + if (likely(!test_and_set_bit_lock(0, &q->in_list))) { + if (WARN_ON(!list_empty(&q->requeue_list))) + goto unlock; + + need_set = !test_bit(cpu_q->cpu, pool->cpu_queues_bm); + if (need_set) { + set_bit(cpu_q->cpu, pool->cpu_queues_bm); + /* Paired with brmr_put_iu(). Set a bit first + * and then observe the busy counter. + */ + smp_mb__before_atomic(); + } + if (likely(atomic_read(&pool->busy))) { + list_add_tail(&q->requeue_list, &cpu_q->requeue_list); + } else { + /* Very unlikely, but possible: busy counter was + * observed as zero. Drop all bits and return + * false to restart the queue by ourselves. 
+ */ + if (need_set) + clear_bit(cpu_q->cpu, pool->cpu_queues_bm); + clear_bit_unlock(0, &q->in_list); + added = false; + } + } +unlock: + spin_unlock_irqrestore(&cpu_q->requeue_lock, flags); + put_cpu_ptr(pool->cpu_queues); + + return added; +} + diff --git a/drivers/block/brmr/brmr-clt-stats.c b/drivers/block/brmr/brmr-clt-stats.c new file mode 100644 index 000000000000..de080fde779c --- /dev/null +++ b/drivers/block/brmr/brmr-clt-stats.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +//#include +//#include +//#include + +#include "brmr-clt.h" +#include "rmr.h" +#include "rmr-pool.h" + + +int brmr_clt_init_stats(struct brmr_clt_stats *stats) +{ + stats->pcpu_stats = alloc_percpu(typeof(*stats->pcpu_stats)); + if (unlikely(!stats->pcpu_stats)) + return -ENOMEM; + + return 0; +} + +void brmr_clt_free_stats(struct brmr_clt_stats *stats) +{ + free_percpu(stats->pcpu_stats); +} + +int brmr_clt_reset_submitted_req(struct brmr_clt_stats *stats, bool enable) +{ + struct brmr_stats_pcpu *s; + int cpu; + + if (unlikely(!enable)) + return -EINVAL; + + for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + memset(&s->submitted_requests, 0, + sizeof(s->submitted_requests)); + } + + return 0; +} + +int brmr_clt_reset_req_sizes(struct brmr_clt_stats *stats, bool enable) +{ + struct brmr_stats_pcpu *s; + int cpu; + + if (unlikely(!enable)) + return -EINVAL; + + for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + memset(&s->request_sizes, 0, + sizeof(s->request_sizes)); + } + + return 0; +} + +static void brmr_update_submitted_requests(struct brmr_stats_pcpu *s, + size_t size, int split, int d) +{ + s->submitted_requests.dir[d].total_sectors += (size >> SECTOR_SHIFT); + if (split) + s->submitted_requests.dir[d].cnt_split++; + else + s->submitted_requests.dir[d].cnt_whole++; +} + +#define MAX_LEN (128*1024) +#define NUM_CLASSES 16 +#define CLASSIFY_SHIFT (ilog2(MAX_LEN)-ilog2(NUM_CLASSES)) + +/** + classifies length linearly in 16 classes: + + input length in bytes + + < 0x2000 (8K) + >= 0x2000 (8K) + >= 0x4000 (16K) + >= 0x6000 (24K) + >= 0x8000 (32K) + >= 0xa000 (40K) + >= 0xc000 (48K) + >= 0xe000 (56K) + >= 0x10000 (64K) + >= 0x12000 (72K) + >= 0x14000 (80K) + >= 0x16000 (88K) + >= 0x18000 (96K) + >= 0x1a000 (104K) + >= 0x1c000 (112K) + >= 0x1e000 (120K) + + Maximum value is 128K-1. + However everything larger is classified as class 15 as well. +*/ +static inline int classify(long length) +{ + return length < MAX_LEN ? 
(length >> CLASSIFY_SHIFT) : NUM_CLASSES-1; +} + +static void brmr_update_request_sizes(struct brmr_stats_pcpu *s, + size_t size, int split, int d) +{ + int size_class = classify(size); + switch (split) { + case 0: + s->request_sizes.dir[d].cnt_whole[size_class]++; + break; + case 1: + s->request_sizes.dir[d].cnt_left[size_class]++; + break; + case 2: + s->request_sizes.dir[d].cnt_right[size_class]++; + break; + default: + WARN_ONCE(true,"unexpected value for split"); + } +} + +void brmr_update_stats(struct brmr_clt_stats *stats, size_t size, int split, int d) +{ + struct brmr_stats_pcpu *s; + + s = this_cpu_ptr(stats->pcpu_stats); + + brmr_update_submitted_requests(s, size, split, d); + brmr_update_request_sizes(s, size, split, d); +} + +ssize_t brmr_clt_stats_rq_to_str(struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_rq sum; + struct brmr_stats_rq *r; + int cpu; int d; + + memset(&sum, 0, sizeof(sum)); + + for_each_possible_cpu(cpu) { + r = &per_cpu_ptr(stats->pcpu_stats, cpu)->submitted_requests; + + for (d=READ; d<=WRITE; d++) { + sum.dir[d].cnt_whole += r->dir[d].cnt_whole; + sum.dir[d].cnt_split += r->dir[d].cnt_split; + sum.dir[d].total_sectors += r->dir[d].total_sectors; + } + } + + return scnprintf(page, len, "%llu %llu %llu %llu %llu %llu\n", + sum.dir[READ].cnt_whole, sum.dir[READ].cnt_split, + sum.dir[READ].total_sectors, + sum.dir[WRITE].cnt_whole, sum.dir[WRITE].cnt_split, + sum.dir[WRITE].total_sectors); +} + +ssize_t brmr_clt_stats_sizes_to_str(struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_sizes *sum; + struct brmr_stats_sizes *per_cpu; + int cpu; int d; int i; int cnt = 0; + + sum = kzalloc(sizeof(*sum), GFP_KERNEL); + if (unlikely(!sum)) + return -ENOMEM; + + for (i = 0; i < STATS_SIZES_NUM; i++) { + for_each_possible_cpu(cpu) { + per_cpu = &per_cpu_ptr(stats->pcpu_stats, cpu) + ->request_sizes; + + for (d=READ; d<=WRITE; d++) { + sum->dir[d].cnt_whole[i] + += per_cpu->dir[d].cnt_whole[i]; + sum->dir[d].cnt_left[i] + += per_cpu->dir[d].cnt_left[i]; + sum->dir[d].cnt_right[i] + += per_cpu->dir[d].cnt_right[i]; + } + } + } + + cnt += scnprintf(page + cnt, len - cnt, + " READ " + " whole left right " + "\n"); + if (len - cnt <= 0) + goto free_return; + + cnt += scnprintf(page + cnt, len - cnt, + "<= 8 Kbytes: %19llu %19llu %19llu\n", + sum->dir[READ].cnt_whole[0], + sum->dir[READ].cnt_left[0], + sum->dir[READ].cnt_right[0]); + + for (i = 1; i < STATS_SIZES_NUM; i++) { + + cnt += scnprintf(page + cnt, len - cnt, + "> %3d Kbytes: %19llu %19llu %19llu\n", + (i)<<3, + sum->dir[READ].cnt_whole[i], + sum->dir[READ].cnt_left[i], + sum->dir[READ].cnt_right[i]); + + if (len - cnt <= 0) + goto free_return; + } + + cnt += scnprintf(page + cnt, len - cnt, + "\n WRITE " + " whole left right " + "\n"); + if (len - cnt <= 0) + goto free_return; + + cnt += scnprintf(page + cnt, len - cnt, + "<= 8 Kbytes: %19llu %19llu %19llu\n", + sum->dir[WRITE].cnt_whole[0], + sum->dir[WRITE].cnt_left[0], + sum->dir[WRITE].cnt_right[0]); + + for (i = 1; i < STATS_SIZES_NUM; i++) { + + cnt += scnprintf(page + cnt, len - cnt, + "> %3d Kbytes: %19llu %19llu %19llu\n", + (i)<<3, + sum->dir[WRITE].cnt_whole[i], + sum->dir[WRITE].cnt_left[i], + sum->dir[WRITE].cnt_right[i]); + + if (len - cnt <= 0) + goto free_return; + } + +free_return: + kfree(sum); + + return cnt; +} + +int brmr_clt_reset_sts_resource(struct brmr_clt_stats *stats, bool enable) +{ + struct brmr_stats_pcpu *s; + int cpu; + + if (unlikely(!enable)) + return -EINVAL; + + 
for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + memset(&s->sts_resource, 0, + sizeof(s->sts_resource)); + } + + return 0; +} + +void brmr_clt_update_sts_resource(struct brmr_clt_stats *stats, int which) +{ + struct brmr_stats_pcpu *s; + + s = this_cpu_ptr(stats->pcpu_stats); + switch (which) { + case 0: + s->sts_resource.get_iu++; + break; + case 1: + s->sts_resource.get_iu2++; + break; + case 2: + s->sts_resource.clt_request1++; + break; + case 3: + s->sts_resource.clt_request++; + break; + default: + WARN_ONCE(true,"unexpected value for which"); + } +} + +ssize_t brmr_stats_sts_resource_to_str( + struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_sts_resource sum; + struct brmr_stats_sts_resource *r; + int cpu; + + memset(&sum, 0, sizeof(sum)); + + for_each_possible_cpu(cpu) { + r = &per_cpu_ptr(stats->pcpu_stats, cpu)->sts_resource; + + sum.get_iu += r->get_iu; + sum.get_iu2 += r->get_iu2; + sum.clt_request1 += r->clt_request1; + sum.clt_request += r->clt_request; + } + + return scnprintf(page, len, "%llu %llu %llu %llu\n", + sum.get_iu, sum.get_iu2, + sum.clt_request1, sum.clt_request); +} + +ssize_t brmr_stats_sts_resource_per_cpu_to_str( + struct brmr_clt_stats *stats, char *page, size_t len) +{ + struct brmr_stats_sts_resource *r; + int cpu; int cnt = 0; + + for_each_possible_cpu(cpu) { + r = &per_cpu_ptr(stats->pcpu_stats, cpu)->sts_resource; + + cnt += scnprintf(page+cnt, len, "%d %llu %llu %llu %llu\n", + cpu, r->get_iu, r->get_iu2, + r->clt_request1, r->clt_request); + if (len - cnt <= 0) + goto return_cnt; + } + +return_cnt: + return cnt; +} + diff --git a/drivers/block/brmr/brmr-clt.c b/drivers/block/brmr/brmr-clt.c new file mode 100644 index 000000000000..6f3d2dd2a9d9 --- /dev/null +++ b/drivers/block/brmr/brmr-clt.c @@ -0,0 +1,1222 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include + +#include "brmr-clt.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(BRMR_VER_STRING); +MODULE_DESCRIPTION("BRMR Block Device using RMR cluster"); +MODULE_LICENSE("GPL"); + +/* + * Maximum number of partitions an instance can have. 
+ * 6 bits = 64 minors = 63 partitions (one minor is used for the device itself) + */ +#define BRMR_PART_BITS 6 + +static DEFINE_IDA(index_ida); +static DEFINE_MUTEX(ida_lock); +static DEFINE_MUTEX(brmr_device_lock); +static LIST_HEAD(brmr_device_list); +static int brmr_major; + +static int BRMR_DELAY_10ms = 10; + +static int index_to_minor(int index) +{ + return index << BRMR_PART_BITS; +} + +static int minor_to_index(int minor) +{ + return minor >> BRMR_PART_BITS; +} + +static inline const char *rq_op_to_str(struct request *rq) +{ + switch (req_op(rq)) { + case REQ_OP_READ: + return "READ"; + case REQ_OP_WRITE: + return "WRITE"; + case REQ_OP_DISCARD: + return "DISCARD"; + case REQ_OP_WRITE_ZEROES: + return "WRITE_ZEROES"; + case REQ_OP_FLUSH: + return "FLUSH"; + default: + return "UNKNOWN"; + } + return ""; +} + + +/* copy from blk.h */ +static inline bool biovec_phys_mergeable(struct request_queue *q, + struct bio_vec *vec1, struct bio_vec *vec2) +{ + unsigned long mask = queue_segment_boundary(q); + phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; + phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + + if (addr1 + vec1->bv_len != addr2) + return false; + // Comment out xen related code + /* + if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) + return false; + */ + if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask)) + return false; + return true; +} + +/* copy from blk_merge.c */ +static inline unsigned get_max_segment_size(const struct request_queue *q, + struct page *start_page, + unsigned long offset) +{ + unsigned long mask = queue_segment_boundary(q); + + offset = mask & (page_to_phys(start_page) + offset); + + /* + * overflow may be triggered in case of zero page physical address + * on 32bit arch, use queue's max segment size when that happens. + */ + return min_not_zero(mask - offset + 1, + (unsigned long)queue_max_segment_size(q)); +} + +static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, + struct scatterlist *sglist) +{ + if (!*sg) + return sglist; + + /* + * If the driver previously mapped a shorter list, we could see a + * termination bit prematurely unless it fully inits the sg table + * on each mapping. We KNOW that there must be more entries here + * or the driver would be buggy, so force clear the termination bit + * to avoid doing a full sg_init_table() in drivers for each command. + */ + sg_unmark_end(*sg); + return sg_next(*sg); +} + +/* only try to merge bvecs into one sg if they are from two bios */ +static inline bool +__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec, + struct bio_vec *bvprv, struct scatterlist **sg) +{ + + int nbytes = bvec->bv_len; + + if (!*sg) + return false; + + if ((*sg)->length + nbytes > queue_max_segment_size(q)) + return false; + + if (!biovec_phys_mergeable(q, bvprv, bvec)) + return false; + + (*sg)->length += nbytes; + + return true; +} + +/* + * brmr_clt_get_iu() - Get an RMR I/O unit (iu) + * + * Description: + * It gets an RMR I/O unit using rmr_clt_get_iu() and increments + * the pool busy counter. It invokes rmr_clt_get_iu() with NO_WAIT + * as brmr can requeue an I/O request. + * + * Ref. 
brmr_add_to_requeue() + */ +static inline struct rmr_iu *brmr_clt_get_iu(struct brmr_clt_pool *pool, enum rmr_io_flags flag) +{ + struct rmr_iu *iu = rmr_clt_get_iu(pool->rmr, flag, NO_WAIT); + if (IS_ERR_OR_NULL(iu)) + return iu; + + atomic_inc(&pool->busy); + + return iu; +} + +/* + * brmr_clt_put_iu() - Put the RMR I/O unit (iu) + * + * Description: + * It puts the RMR I/O unit using rmr_clt_put_iu() and decrements + * the pool busy counter. It uses memory barrier to reflect the + * busy counter. + * + * Ref. brmr_add_to_requeue() and brmr_requeue_requests() + */ +static inline void brmr_clt_put_iu(struct brmr_clt_pool *pool, struct rmr_iu *iu) +{ + rmr_clt_put_iu(pool->rmr, iu); + + atomic_dec(&pool->busy); + /* + * Paired with brmr_add_to_requeue(). Decrement first + * and then check queue bits. + */ + smp_mb__after_atomic(); + brmr_requeue_requests(pool); +} + +static void brmr_softirq_done_fn(struct request *rq) +{ + struct brmr_clt_iu *iu = blk_mq_rq_to_pdu(rq); + struct brmr_clt_dev *dev = iu->dev; + + if (blk_rq_nr_phys_segments(rq)) + sg_free_table_chained(&iu->sgt, BRMR_INLINE_SG_CNT); + + brmr_clt_put_iu(dev->pool, iu->rmr_iu); + blk_mq_end_request(rq, iu->status); +} + +static void brmr_request_conf(void *priv, int errno) +{ + struct brmr_clt_iu *iu = (struct brmr_clt_iu *)priv; + struct brmr_clt_dev *dev = iu->dev; + struct request *rq = iu->rq; + + iu->status = (errno && errno != -ENOENT) ? BLK_STS_IOERR : BLK_STS_OK; + + blk_mq_complete_request(rq); + + if (errno == -ENOENT) + pr_debug("%s request for %s IGNORED err: %d\n", + rq_op_to_str(rq), dev->gd->disk_name, errno); + else if (errno) + pr_err_ratelimited("%s request for %s failed with err: %d\n", + rq_op_to_str(rq), dev->gd->disk_name, errno); +} + +static blk_status_t brmr_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct brmr_clt_dev *dev = bd->rq->q->disk->private_data; + struct brmr_clt_pool *pool = dev->pool; + struct brmr_clt_iu *iu = blk_mq_rq_to_pdu(bd->rq); + struct request *rq = bd->rq; + struct rmr_iu *rmr_iu; + unsigned int sg_cnt = 0; + size_t offset; size_t length; + enum rmr_io_flags flag; + unsigned short prio, seg; + int data_dir, err; + blk_status_t ret = BLK_STS_IOERR; + + if (unlikely(dev->dev_state != DEV_STATE_READY)) + return ret; + + iu->rq = rq; + iu->dev = dev; + + offset = blk_rq_pos(rq) << SECTOR_SHIFT; + length = blk_rq_bytes(rq); + flag = rq_to_rmr_flags(rq); + prio = req_get_ioprio(rq); + data_dir = rq_data_dir(rq); + + rmr_iu = brmr_clt_get_iu(pool, flag); + if (unlikely(rmr_iu == NULL)) { + pr_debug("Got no tag to send a request to rmr_clt\n"); + + /* Increment statistic counter for it */ + brmr_clt_update_sts_resource(&dev->stats, 0); + + if (!brmr_add_to_requeue(pool, hctx->driver_data)) + /* + * TODO unlikely + * Restarting queue with some delay is a stupid way + * of handling resource contentions + */ + blk_mq_delay_run_hw_queue(hctx, BRMR_DELAY_10ms); + + return BLK_STS_RESOURCE; + } + if (IS_ERR(rmr_iu)) { + pr_err("Error %pe when reserving resources for io in pool %s\n", + rmr_iu, pool->rmr->poolname); + return BLK_STS_IOERR; + } + iu->rmr_iu = rmr_iu; + + iu->sgt.sgl = iu->sgl; + seg = blk_rq_nr_phys_segments(rq); + if (seg) { + err = sg_alloc_table_chained(&iu->sgt, seg, iu->sgt.sgl, BRMR_INLINE_SG_CNT); + if (err) { + pr_err("sg_alloc_table_chained failed, ret=%x\n", err); + blk_mq_delay_run_hw_queue(hctx, BRMR_DELAY_10ms); + brmr_clt_put_iu(pool, rmr_iu); + return BLK_STS_RESOURCE; + } + } + + /* We only support discards with single 
segment and write_zeroes request with no segment. */ + /* See queue limits. */ + if ((req_op(rq) != REQ_OP_DISCARD) && (req_op(rq) != REQ_OP_WRITE_ZEROES)) + sg_cnt = blk_rq_map_sg(rq, iu->sgt.sgl); + + blk_mq_start_request(rq); + brmr_update_stats(&dev->stats, length, 0, data_dir); + + pr_debug("brmr %s request with flag %x offset %lu length %lu sg_cnt: %d\n", + rq_op_to_str(rq), flag, offset, length, sg_cnt); + + err = rmr_clt_request(pool->rmr, rmr_iu, offset, length, flag, prio, + iu, brmr_request_conf, iu->sgt.sgl, sg_cnt); + if (likely(err == 0)) + return BLK_STS_OK; + + pr_err_ratelimited("sending %s request for %s failed with err: %d\n", + rq_op_to_str(rq), dev->gd->disk_name, err); + + if (unlikely(err == -EAGAIN || err == -ENOMEM)) { + pr_debug("Got resource error %d when sending a request to rmr_clt\n", err); + + brmr_clt_update_sts_resource(&dev->stats, 3); + blk_mq_delay_run_hw_queue(hctx, BRMR_DELAY_10ms); + + ret = BLK_STS_RESOURCE; + } else { + ret = BLK_STS_IOERR; + } + + if (seg) + sg_free_table_chained(&iu->sgt, BRMR_INLINE_SG_CNT); + + brmr_clt_put_iu(pool, rmr_iu); + return ret; +} + +static struct blk_mq_ops brmr_mq_ops = { + .queue_rq = brmr_queue_rq, + .complete = brmr_softirq_done_fn, +}; + +static struct brmr_clt_pool *brmr_clt_create_pool(const char *poolname) +{ + struct brmr_clt_pool *pool; + int err; + struct rmr_attrs attrs; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + pool->rmr = rmr_clt_open(pool, NULL, poolname); + if (IS_ERR_OR_NULL(pool->rmr)) { + err = PTR_ERR(pool->rmr); + goto free_pool; + } + err = rmr_clt_query(pool->rmr, &attrs); + if (unlikely(err)) + goto close_rmr; + + pool->queue_depth = attrs.queue_depth; + pool->max_io_size = attrs.max_io_size; + pool->chunk_size = attrs.chunk_size; + pool->max_segments = attrs.max_segments; + + snprintf(pool->poolname, sizeof(pool->poolname), "%s", poolname); + + /* + * When opening a new pool, allocate mq tags for that pool - they are + * going to be shared among all devices opened in that pool + */ + pool->tag_set.ops = &brmr_mq_ops; + pool->tag_set.queue_depth = pool->queue_depth; + pool->tag_set.numa_node = NUMA_NO_NODE; + pool->tag_set.flags = BLK_MQ_F_TAG_QUEUE_SHARED; + pool->tag_set.cmd_size = sizeof(struct brmr_clt_iu) + BRMR_RDMA_SGL_SIZE; + pool->tag_set.nr_hw_queues = num_online_cpus(); + + err = blk_mq_alloc_tag_set(&pool->tag_set); + if (unlikely(err)) + goto close_rmr; + + refcount_set(&pool->refcount, 1); + + atomic_set(&pool->busy, 0); + bitmap_zero(pool->cpu_queues_bm, NR_CPUS); + pool->cpu_rr = alloc_percpu(int); + if (unlikely(!pool->cpu_rr)) { + pr_err("Failed to alloc percpu var (cpu_rr)\n"); + err = -ENOMEM; + goto free_tag_set; + } + pool->cpu_queues = alloc_percpu(struct brmr_cpu_qlist); + if (unlikely(!pool->cpu_queues)) { + pr_err("Failed to alloc percpu var (cpu_queues)\n"); + err = -ENOMEM; + goto free_cpu_rr; + } + brmr_init_cpu_qlists(pool->cpu_queues); + return pool; +free_cpu_rr: + free_percpu(pool->cpu_rr); +free_tag_set: + blk_mq_free_tag_set(&pool->tag_set); +close_rmr: + rmr_clt_close(pool->rmr); +free_pool: + kfree(pool); + + return ERR_PTR(err); +} + +static void brmr_clt_free_pool(struct brmr_clt_pool *pool) +{ + free_percpu(pool->cpu_queues); + pool->cpu_queues = NULL; + free_percpu(pool->cpu_rr); + pool->cpu_rr = NULL; + blk_mq_free_tag_set(&pool->tag_set); + rmr_clt_close(pool->rmr); + kfree(pool); +} + +static void brmr_clt_put_pool(struct brmr_clt_pool *pool) +{ + if (refcount_dec_and_test(&pool->refcount)) + 
brmr_clt_free_pool(pool); + else + rmr_clt_put_pool(pool->rmr); +} + +static inline bool brmr_clt_get_dev(struct brmr_clt_dev *dev) +{ + return refcount_inc_not_zero(&dev->refcount); +} + +void brmr_clt_put_dev(struct brmr_clt_dev *dev) +{ + might_sleep(); + + if (refcount_dec_and_test(&dev->refcount)) { + + mutex_lock(&ida_lock); + ida_free(&index_ida, dev->idx); + mutex_unlock(&ida_lock); + + kfree(dev->hw_queues); + + brmr_clt_put_pool(dev->pool); + + if (!list_empty(&dev->list)) { + mutex_lock(&brmr_device_lock); + list_del(&dev->list); + mutex_unlock(&brmr_device_lock); + } + kfree(dev); + } +} + +static int brmr_open(struct gendisk *disk, blk_mode_t mode) +{ + struct brmr_clt_dev *dev = disk->private_data; + + if (READ_ONCE(dev->dev_state) != DEV_STATE_READY) + return -EIO; + + if (!brmr_clt_get_dev(dev)) + return -EIO; + + return 0; +} + +static void brmr_release(struct gendisk *gen) +{ + struct brmr_clt_dev *dev = gen->private_data; + + brmr_clt_put_dev(dev); +} + +#if 0 +static int brmr_getgeo(struct block_device *block_device, + struct hd_geometry *geo) +{ + struct brmr_clt_dev *dev = block_device->bd_disk->private_data; + + geo->cylinders = (dev->size_sect & ~0x3f) >> 6; /* size/64 */ + geo->heads = 4; + geo->sectors = 16; + geo->start = 0; + + return 0; +} +#endif + +static const struct block_device_operations brmr_ops = { + .owner = THIS_MODULE, + .open = brmr_open, + .release = brmr_release, + /*.getgeo = brmr_getgeo,*/ +}; + +/** + * brmr_clt_init_cmd() - Initialize message command + * + * @msg: command message where to init + */ +static void brmr_clt_init_cmd(struct brmr_msg_cmd *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.type = cpu_to_le16(BRMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->ver = BRMR_PROTO_VER_MAJOR; +} + +/** + * brmr_cmd_conf() - Confirmation function for brmr command message + * + * @priv: priv pointer to brmr command private data + * @errno: error number passed from RMR. + * See description of errno in RMR function. + * + * Description: + * Command response for a map new command can fail on multiple levels. + * If RMR fails to send the message to any or one of the nodes, that would reflect on the + * errno. If the command fails on BRMR level, that would reflect on the rsp struct. + * The error number will be used differently by different commands accordingly. + */ +static void brmr_clt_cmd_conf(void *priv, int errno) +{ + struct brmr_cmd_priv *cmd_priv = (struct brmr_cmd_priv *)priv; + + switch (cmd_priv->cmd_type) { + case BRMR_CMD_MAP: + pr_info("%s: BRMR_CMD_MAP err=%d\n", __func__, errno); + cmd_priv->errno = errno; + break; + case BRMR_CMD_REMAP: + pr_info("%s: BRMR_CMD_REMAP err=%d\n", __func__, errno); + break; + case BRMR_CMD_UNMAP: + pr_info("%s: BRMR_CMD_UNMAP err=%d\n", __func__, errno); + /* + * No processing needed here. 
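+ * Unmap failures are only logged; client-side device cleanup continues
+ * regardless.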
+ */ + break; + case BRMR_CMD_GET_PARAMS: + pr_info("%s: BRMR_CMD_GET_PARAMS err=%d\n", __func__, errno); + if (errno) + cmd_priv->errno = errno; + break; + + default: + pr_err("%s: Unknown command type %d err=%d\n", __func__, cmd_priv->cmd_type, errno); + } + + complete(&cmd_priv->complete_done); +} + +/** + * brmr_clt_send_msg_cmd() - Sends command message to rmr pool + * + * @dev: pointer to brmr device + * @msg: msg struct to be sent + * @rsp_buf: response buffer where the response of the storage side is stored + * @rsp_buf_len: length of the response buffer + * + * Return: + * Negative if failed to sent command + * As handled by each command in brmr_clt_cmd_conf, if succeeded to send command + * + * Context: + * Would block until response is received + */ +static int brmr_clt_send_msg_cmd(struct brmr_clt_dev *dev, struct brmr_msg_cmd *msg, void *rsp_buf, + size_t rsp_buf_len) +{ + struct brmr_cmd_priv cmd_priv; + struct kvec vec; + int ret; + + vec = (struct kvec) { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + + cmd_priv.dev = dev; + cmd_priv.cmd_type = msg->cmd_type; + cmd_priv.rsp_buf = rsp_buf; + cmd_priv.rsp_buf_len = rsp_buf_len; + cmd_priv.errno = 0; + init_completion(&cmd_priv.complete_done); + + ret = rmr_clt_cmd_with_rsp(dev->pool->rmr, brmr_clt_cmd_conf, &cmd_priv, &vec, 1, rsp_buf, + rsp_buf_len, sizeof(struct brmr_msg_cmd_rsp)); + + if (!ret) { + wait_for_completion(&cmd_priv.complete_done); + ret = cmd_priv.errno; + } + + return ret; +} + +static struct brmr_clt_dev *brmr_alloc_and_init_dev(struct brmr_clt_pool *pool, + u64 size) +{ + struct brmr_clt_dev *dev; + struct brmr_queue *q; + struct blk_mq_hw_ctx *hctx; + int ret; + unsigned long i; + + /* + * alloc device structure + */ + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&dev->list); + dev->size_sect = size; + dev->pool = pool; + dev->dev_state = DEV_STATE_INIT; + dev->map_incomplete = false; + refcount_set(&dev->refcount, 1); + + /* + * Alloc a "queue" per cpu + */ + dev->hw_queues = kcalloc(nr_cpu_ids, + sizeof(*dev->hw_queues), GFP_KERNEL); + if (unlikely(!dev->hw_queues)) { + ret = -ENOMEM; + goto free_dev; + } + + /* + * Get an id to be used in /dev/brmr + */ + mutex_lock(&ida_lock); + ret = ida_alloc_range(&index_ida, 0, minor_to_index(1 << MINORBITS) - 1, + GFP_KERNEL); + mutex_unlock(&ida_lock); + if (ret < 0) { + pr_err("%s: ida_alloc_range() failed for pool %s, err: %d\n", + __func__, pool->poolname, ret); + goto free_queues; + } + dev->idx = ret; + + /* + * Init mq queue + */ + dev->gd = blk_mq_alloc_disk(&pool->tag_set, NULL, dev); + if (IS_ERR(dev->gd)) { + ret = PTR_ERR(dev->gd); + pr_err("Failed to initialize mq: %pe\n", dev->queue); + goto remove_ida; + } + dev->queue = dev->gd->queue; + + /* + * Assign hardware contexts to our queues + */ + queue_for_each_hw_ctx(dev->queue, hctx, i) { + q = &dev->hw_queues[i]; + INIT_LIST_HEAD(&q->requeue_list); + q->hctx = hctx; + hctx->driver_data = q; + } + + return dev; + +remove_ida: + mutex_lock(&ida_lock); + ida_free(&index_ida, dev->idx); + mutex_unlock(&ida_lock); +free_queues: + kfree(dev->hw_queues); +free_dev: + kfree(dev); +out: + return ERR_PTR(ret); +} + +static int brmr_set_dev_params(struct brmr_clt_dev *dev) +{ + struct brmr_clt_pool *pool = dev->pool; + u32 chunk_size = brmr_pool_chunk_size(pool); + struct queue_limits lim; + int ret; + + /* Aligns requests with the chunks in rmr client */ + if (!is_power_of_2(chunk_size >> SECTOR_SHIFT)) { + pr_err("%u not a power of 2!\n", 
chunk_size); + return -EINVAL; + } + + /* + * Set request queue parameters via queue_limits API + */ + lim = queue_limits_start_update(dev->queue); + lim.logical_block_size = dev->logical_block_size; + lim.physical_block_size = dev->physical_block_size; + lim.max_segments = dev->max_segments; + lim.max_hw_sectors = dev->max_hw_sectors; + lim.max_write_zeroes_sectors = dev->max_write_zeroes_sectors; + lim.io_opt = brmr_pool_chunk_size(pool); + lim.chunk_sectors = chunk_size >> SECTOR_SHIFT; + + /* however we don't support discards to */ + /* discontiguous segments in one request */ + lim.max_discard_segments = 1; + lim.max_hw_discard_sectors = dev->max_discard_sectors; + if (dev->secure_discard) + lim.max_secure_erase_sectors = dev->max_discard_sectors; + + lim.discard_granularity = dev->discard_granularity; + lim.discard_alignment = dev->discard_alignment; + + /* needed for ibtrs_map_sg_fr to work */ + lim.virt_boundary_mask = SZ_4K - 1; + + /* non-rotational device */ + lim.features &= ~BLK_FEAT_ROTATIONAL; + + if (dev->wc) + lim.features |= BLK_FEAT_WRITE_CACHE; + if (dev->fua) + lim.features |= BLK_FEAT_FUA; + + ret = queue_limits_commit_update(dev->queue, &lim); + if (ret) + goto err; + + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); + blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); + + ret = brmr_clt_init_stats(&dev->stats); + if (unlikely(ret)) + goto err; + + dev->gd->major = brmr_major; + dev->gd->minors = 1 << BRMR_PART_BITS; + dev->gd->first_minor = index_to_minor(dev->idx); + dev->gd->fops = &brmr_ops; + dev->gd->queue = dev->queue; + dev->gd->private_data = dev; + snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), + "brmr%d", dev->idx); + set_capacity(dev->gd, dev->size_sect); + + return 0; + +err: + return ret; +} + +/** + * brmr_get_remote_dev_params() - Gets device params from storage nodes + * + * @dev: pointer to brmr device + * + * Description: + * Does the following (sanity) checks + * 1) For an unmapped device, param get should succeed on all legs + * 2) There should not be a mixture of mapped and unmapped devices + * + * In addition to above, it also does the following work + * 1) For a mapped device, read from a single leg is enough for success + * 2) For an unmapped device, it does validation checks for params for every leg + * + * Return: + * Negative in case of failure + * 0 for success, and a non-mapped device is found + * 1 for success, and a mapped device is found + * + * Context: + * Would block until response is received + */ +static int brmr_get_remote_dev_params(struct brmr_clt_dev *dev) +{ + struct brmr_clt_pool *pool = dev->pool; + struct brmr_msg_cmd msg; + struct brmr_msg_cmd_rsp *brmr_cmd_rsp; + void *rsp_buf; + size_t rsp_buf_len; + int err = 0, i; + bool partial_fail = false, mapped = false; + + brmr_clt_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_GET_PARAMS; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) + return -ENOMEM; + + err = brmr_clt_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (err < 0) { + pr_err("%s: brmr_clt_send_msg_cmd failed with errno %d\n", __func__, err); + goto free_data; + } else if (err) { + /* + * We cannot directly fail here, since we do not know if this is a map for a + * newly created device, or for one which has gone through mapping before. + * + * For the former, any failure should end in the whole map process failing. 
+ * For the latter, a single read from a device with mapped state set should + * be enough for us to go ahead and map. + */ + partial_fail = true; + } + + /* + * Lets do the sanity check first, because combining it with param checks makes the + * entire loop harder to read + */ + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + struct brmr_cmd_get_params_rsp *get_params_rsp; + + brmr_cmd_rsp = ((struct brmr_msg_cmd_rsp *)rsp_buf) + i; + + /* + * We do not need to worry about not seeing MAGIC. + * This would happen for a non-working sessions, OR + * for extra sessions in the end for which there are no legs in RMR (Don't care) + * + * For non-working sessions, we will be notified by RMR through the return value + */ + if (brmr_cmd_rsp->magic != BRMR_CMD_RSP_MAGIC) + continue; + + /* + * This is error returned by rmr-store. + */ + if (brmr_cmd_rsp->status) + partial_fail = true; + + get_params_rsp = &brmr_cmd_rsp->get_params_rsp; + + /* + * If we find a mapped device, we save that info. + */ + if (get_params_rsp->mapped) + mapped = true; + } + + /* + * If there is no device mapped, it means that this is the first map after device creation + * In such a case, we need all sessions to be up and running. + */ + if (mapped == false && partial_fail) { + pr_err("%s: Mapping first time, but got failure for some sessions\n", __func__); + err = -EINVAL; + goto free_data; + } + + for (i = 0; i < RMR_POOL_MAX_SESS; i++) { + struct brmr_cmd_get_params_rsp *get_params_rsp; + struct brmr_blk_dev_params *rsp_dev_params; + + brmr_cmd_rsp = ((struct brmr_msg_cmd_rsp *)rsp_buf) + i; + + /* + * We are tracking partial failures through the above loop, so + * ignore it here. + */ + if (brmr_cmd_rsp->magic != BRMR_CMD_RSP_MAGIC || + brmr_cmd_rsp->status) + continue; + + get_params_rsp = &brmr_cmd_rsp->get_params_rsp; + + /* + * We cheat a little, and do this sanity check here. + * + * If even a single device was mapped, and we have sessions with non-mapped + * devices, it will be wrong to go forward with brmr map. + */ + if (mapped && !get_params_rsp->mapped) { + /* + * This can only happen if a node went down and up. + * And instead of re-adding a MAPPED device, a create was called + * We cannot allow map this way, since this means discard could + * have been skipped. + */ + pr_err("%s: Mixed combination of mapped+unmapped metadata found\n", + __func__); + err = -EINVAL; + goto free_data; + } + + /* + * The device size_sect, which is the size provided by the user in the map + * command, should be same as the mapped_size of every storage node's backend + * device, which was provided during create_store. 
+ */ + if (dev->size_sect != le64_to_cpu(get_params_rsp->mapped_size)) { + pr_err("%s: Mismatched mapped_size: (Provide) %llu != %llu (Remote)\n", + __func__, dev->size_sect, le64_to_cpu(get_params_rsp->mapped_size)); + err = -EINVAL; + goto free_data; + } + + rsp_dev_params = &get_params_rsp->dev_params; + + dev->max_write_zeroes_sectors = min_not_zero( + dev->max_write_zeroes_sectors, + le32_to_cpu( + rsp_dev_params->max_write_zeroes_sectors)); + dev->max_discard_sectors = min_not_zero(brmr_pool_chunk_size(pool) >> SECTOR_SHIFT, + le32_to_cpu(rsp_dev_params->max_discard_sectors)); + dev->physical_block_size = max_t(u16, dev->physical_block_size, + le16_to_cpu(rsp_dev_params->physical_block_size)); + dev->logical_block_size = max_t(u16, dev->logical_block_size, + le16_to_cpu(rsp_dev_params->logical_block_size)); + + dev->discard_granularity = dev->logical_block_size; + dev->discard_alignment = dev->logical_block_size; + + /* secure_discard is actually true or false, but since we used + * __le16 to transfer this value in msg, min_t should work fine here + */ + dev->secure_discard = min_t(u16, dev->secure_discard, + le16_to_cpu(rsp_dev_params->secure_discard)); + + dev->cache_policy = rsp_dev_params->cache_policy; + dev->wc = !!(rsp_dev_params->cache_policy & BRMR_WRITEBACK); + dev->fua = !!(rsp_dev_params->cache_policy & BRMR_FUA); + } + + /* max segments and max_hw_sectors we get from rtrs sessions values + * stored in pool like in RNBD, not from bdev of the store side. + */ + dev->max_segments = pool->max_segments; + dev->max_hw_sectors = pool->max_io_size / SECTOR_SIZE; + + /* + * Return whether its a new map or an old one + */ + err = mapped; + +free_data: + kfree(rsp_buf); + + return err; +} + +/** + * brmr_clt_send_map_cmd() - Sends map command for a brmr device + * + * @dev: pointer to brmr device + * + * Return: + * Negative error value in case of failure + * 0 on success + * + * Context: + * Would block until response is received + */ +static int brmr_clt_send_map_cmd(struct brmr_clt_dev *dev) +{ + struct brmr_clt_pool *pool = dev->pool; + struct brmr_msg_cmd msg; + struct brmr_blk_dev_params *dev_params = &(msg.map_new_cmd.dev_params); + void *rsp_buf; + size_t rsp_buf_len; + int err = 0; + + brmr_clt_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_MAP; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) + return -ENOMEM; + + msg.map_new_cmd.version = BRMR_CURRENT_HEADER_VERSION; + msg.map_new_cmd.mapped_size = dev->size_sect; + + dev_params->max_hw_sectors = cpu_to_le32(dev->max_hw_sectors); + dev_params->max_write_zeroes_sectors = cpu_to_le32(dev->max_write_zeroes_sectors); + dev_params->max_discard_sectors = cpu_to_le32(dev->max_discard_sectors); + dev_params->discard_granularity = cpu_to_le32(dev->discard_granularity); + dev_params->discard_alignment = cpu_to_le32(dev->discard_alignment); + dev_params->physical_block_size = cpu_to_le16(dev->physical_block_size); + dev_params->logical_block_size = cpu_to_le16(dev->logical_block_size); + dev_params->max_segments = cpu_to_le16(dev->max_segments); + dev_params->secure_discard = cpu_to_le16(dev->secure_discard); + dev_params->cache_policy = dev->cache_policy; + + err = brmr_clt_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (err) + pr_err("Failed to send cmd msg BRMR_CMD_MAP in pool %s, err=%d\n", + pool->poolname, err); + + kfree(rsp_buf); + return err; +} + +/* + * brmr_clt_send_unmap_cmd() - Send an unmap command to the server pool + * + * Sending 
may fail (e.g. no sessions connected). The failure is logged but + * not propagated — callers always continue with local cleanup regardless. + */ +static void brmr_clt_send_unmap_cmd(struct brmr_clt_dev *dev) +{ + struct brmr_msg_cmd msg; + void *rsp_buf; + size_t rsp_buf_len; + int ret; + + brmr_clt_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_UNMAP; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) { + pr_err("Failed to alloc rsp_buf for unmap in pool %s\n", + dev->pool->poolname); + return; + } + + /* + * Sending messages could fail. For example, there are no client pool sessions + * connected to this pool. Unmap_dev still progresses and cleans up the device + * states on the client side. + */ + ret = brmr_clt_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (ret) + pr_err("Error %d when unmap device in pool %s\n", + ret, dev->pool->poolname); + + kfree(rsp_buf); +} + +/** + * brmr_clt_map_device() - Maps brmr device through an rmr pool + * + * @id: Id for the device + * @poolname: rmr poolname which is to be used for mapping + * @size: Size of the disk + * + * Description: + * Opens rmr pool with pool name "poolname" + * Allocated brmr device and initializes it + * Maps brmr device using the rmr pool only if its not already mapped + * + * Return: + * Pointer to allocated and mapped brmr device on success + * Error pointer on failure + */ +struct brmr_clt_dev *brmr_clt_map_device(const char *poolname, u64 size) +{ + struct brmr_clt_pool *pool = NULL; + struct brmr_clt_dev *dev; + int ret, mapped; + + /* Create brmr pool */ + pool = brmr_clt_create_pool(poolname); + if (IS_ERR(pool)) { + ret = PTR_ERR(pool); + goto err_out; + } + + /* Alloc device */ + dev = brmr_alloc_and_init_dev(pool, size); + if (IS_ERR(dev)) { + pr_err("Error %pe allocating brmr device in pool %s\n", + dev, pool->poolname); + brmr_clt_put_pool(pool); + ret = PTR_ERR(dev); + goto err_out; + } + + mapped = brmr_get_remote_dev_params(dev); + if (mapped < 0) { + pr_err("Failed to get remote devs block params in pool %s, err=%d\n", + pool->poolname, mapped); + ret = mapped; + goto dest_dev; + } + + /* Set device params */ + ret = brmr_set_dev_params(dev); + if (unlikely(ret)) { + pr_err("Error %d brmr_set_dev_params in pool %s\n", + ret, pool->poolname); + goto dest_dev; + } + + /* + * We send map command only if its a new map. + * This must happen before add_disk() so the server is ready to serve + * I/O by the time the kernel probes the partition table. 
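+ * (add_disk() triggers the partition scan, which immediately reads from
+ * the device.)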
+ */ + if (!mapped) { + pr_info("%s: Sending map command through pool %s\n", __func__, pool->poolname); + ret = brmr_clt_send_map_cmd(dev); + if (ret) { + pr_err("Failed to send map cmd to pool %s, err=%d\n", + pool->poolname, ret); + goto put_disk; + } + } + + dev->dev_state = DEV_STATE_READY; + + /* + * Add gendisk + */ + ret = add_disk(dev->gd); + if (ret) { + pr_err("%s: add_disk failed with err %d\n", __func__, ret); + goto unmap_dev; + } + + mutex_lock(&brmr_device_lock); + list_add(&dev->list, &brmr_device_list); + mutex_unlock(&brmr_device_lock); + + return dev; + +unmap_dev: + dev->dev_state = DEV_STATE_INIT; + if (!mapped) + brmr_clt_send_unmap_cmd(dev); +put_disk: + put_disk(dev->gd); + brmr_clt_free_stats(&dev->stats); +dest_dev: + brmr_clt_put_dev(dev); +err_out: + return ERR_PTR(ret); +} + +static void destroy_gen_disk(struct brmr_clt_dev *dev) +{ + unsigned int memflags; + + del_gendisk(dev->gd); + /* + * Before marking queue as dying (blk_cleanup_queue() does that) + * we have to be sure that everything in-flight has gone. + * Blink with freeze/unfreeze. + */ + memflags = blk_mq_freeze_queue(dev->queue); + blk_mq_unfreeze_queue(dev->queue, memflags); + put_disk(dev->gd); +} + +/** + * brmr_clt_close_device() - Closes a brmr device + * + * @dev: pointer to brmr device to close + * @sysfs_self: pointer to sysfs attribute + * + * Return: + * 0 in case of success + * negative in case of failure + */ +int brmr_clt_close_device(struct brmr_clt_dev *dev, + const struct attribute *sysfs_self) +{ + dev->dev_state = DEV_STATE_CLOSING; + destroy_gen_disk(dev); + brmr_clt_send_unmap_cmd(dev); + sysfs_remove_link(&dev->kobj, BRMR_LINK_NAME); + + if (sysfs_self) + brmr_clt_destroy_dev_sysfs_files(dev, sysfs_self); + + brmr_clt_free_stats(&dev->stats); + brmr_clt_put_dev(dev); + + return 0; +} + +struct brmr_clt_dev *find_and_get_device(const char *name) +{ + struct brmr_clt_dev *dev; + + mutex_lock(&brmr_device_lock); + list_for_each_entry(dev, &brmr_device_list, list) { + if (strncasecmp(dev->pool->poolname, name, NAME_MAX)) + continue; + + if (brmr_clt_get_dev(dev)) { + mutex_unlock(&brmr_device_lock); + return dev; + } + } + mutex_unlock(&brmr_device_lock); + + return NULL; +} + +static int __init brmr_client_init(void) +{ + int err; + + pr_info("Loading module %s, version %s\n", + KBUILD_MODNAME, BRMR_VER_STRING); + + brmr_major = register_blkdev(brmr_major, "brmr"); + if (brmr_major <= 0) { + pr_err("Failed to load module," + " block device registration failed\n"); + err = -EBUSY; + goto out; + } + + err = brmr_clt_create_sysfs_files(); +out: + return err; +} + +static void __exit brmr_client_exit(void) +{ + struct brmr_clt_dev *dev, *tmp; + + pr_info("Unloading module\n"); + + brmr_clt_destroy_sysfs_files(); + unregister_blkdev(brmr_major, "brmr"); + + list_for_each_entry_safe(dev, tmp, &brmr_device_list, list) { + brmr_clt_close_device(dev, NULL); + } + + ida_destroy(&index_ida); + + pr_info("Module %s unloaded\n", KBUILD_MODNAME); +} + +module_init(brmr_client_init); +module_exit(brmr_client_exit); From 860e4a764e631095ef8bfa007a0374bd17fa7011 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:22 +0200 Subject: [PATCH 10/13] block/brmr: client: sysfs interface functions Add the BRMR client sysfs interface used to map and unmap remote devices. 
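For example (the pool name, size value and resulting device index are
illustrative):

  echo "pool=pool1 size=2097152" > /sys/devices/virtual/brmr-client/ctl/map_device
  echo 1 > /sys/devices/virtual/brmr-client/ctl/devices/brmr0/unmap_device
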
Writes to /sys/devices/virtual/brmr-client/ctl/map_device trigger creation of a /dev/brmrN gendisk backed by the named RMR pool; per-device attribute groups expose the device state and statistics, and accept unmap requests. This file is not compiled until the modules are wired into the build in a later patch in this series. Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/block/brmr/brmr-clt-sysfs.c | 463 ++++++++++++++++++++++++++++ 1 file changed, 463 insertions(+) create mode 100644 drivers/block/brmr/brmr-clt-sysfs.c diff --git a/drivers/block/brmr/brmr-clt-sysfs.c b/drivers/block/brmr/brmr-clt-sysfs.c new file mode 100644 index 000000000000..7d2435acac6a --- /dev/null +++ b/drivers/block/brmr/brmr-clt-sysfs.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "brmr-clt.h" + +static struct device *brmr_dev; +static struct class *brmr_dev_class; +static struct kobject *brmr_devs_kobj; + +enum { + BRMR_OPT_ERR = 0, + BRMR_OPT_POOL = 1 << 1, + BRMR_OPT_SIZE = 1 << 2, +}; + +static int brmr_clt_create_dev_sysfs_files(struct brmr_clt_dev *dev); +static int brmr_add_dev_symlink(struct brmr_clt_dev *dev); + +static unsigned int brmr_opt_mandatory[] = { + BRMR_OPT_POOL, +}; + +static const match_table_t brmr_opt_tokens = { + { BRMR_OPT_POOL, "pool=%s" }, + { BRMR_OPT_SIZE, "size=%s" }, + { BRMR_OPT_ERR, NULL }, +}; + +/* remove new line from string */ +static void strip(char *s) +{ + char *p = s; + + while (*s != '\0') { + if (*s != '\n') + *p++ = *s++; + else + ++s; + } + *p = '\0'; +} + +static int brmr_clt_parse_options(const char *buf, + char *pool, + unsigned long *size) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + sep_opt = strstrip(options); + strip(sep_opt); + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, brmr_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case BRMR_OPT_POOL: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("poolname too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(pool, p, NAME_MAX); + kfree(p); + break; + + case BRMR_OPT_SIZE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + /* + * The conventional semantics are that if the number begins with 0x, it will + * be parsed as hexadecimal; if it begins with 0, it will be parsed as + * octal; otherwise, it will be parsed as decimal. 
+ */ + ret = kstrtoul(p, 0, size); + if (ret) { + pr_err("size '%s' isn't an integer: %d\n", p, ret); + kfree(p); + goto out; + } + kfree(p); + break; + + + default: + pr_err("unknown parameter or missing value" + " '%s'\n", p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(brmr_opt_mandatory); i++) { + if ((opt_mask & brmr_opt_mandatory[i])) { + ret = 0; + } else { + pr_err("parameters missing\n"); + ret = -EINVAL; + break; + } + } + +out: + kfree(options); + return ret; +} + +static ssize_t brmr_map_device_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo \"" + "pool= " + "size=\" > %s\n", + attr->attr.name); +} + +static ssize_t brmr_map_device_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_clt_dev *dev; + char pool[NAME_MAX]; + unsigned long size = 0; + int ret; + + ret = brmr_clt_parse_options(buf, pool, &size); + if (ret) + goto err; + + dev = find_and_get_device(pool); + if (dev) { + pr_err("Device exists and opened as %s\n", + dev->gd->disk_name); + brmr_clt_put_dev(dev); + ret = -EEXIST; + goto err; + } + + dev = brmr_clt_map_device(pool, size); + if (IS_ERR(dev)) { + pr_err("Error mapping device to pool %s\n", pool); + ret = PTR_ERR(dev); + goto err; + } + ret = brmr_clt_create_dev_sysfs_files(dev); + if (ret) + goto close_device; + + ret = brmr_add_dev_symlink(dev); + if (ret) + goto destroy_sysfs; + + return count; + +destroy_sysfs: + sysfs_remove_link(&dev->kobj, BRMR_LINK_NAME); + brmr_clt_destroy_dev_sysfs_files(dev, NULL); +close_device: + brmr_clt_close_device(dev, NULL); +err: + return ret; +} + +static struct kobj_attribute brmr_map_device_attr = + __ATTR(map_device, 0644, + brmr_map_device_show, brmr_map_device_store); + +static struct attribute *default_attrs[] = { + &brmr_map_device_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +static ssize_t brmr_unmap_device_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo > %s\n", + attr->attr.name); +} + +static ssize_t brmr_unmap_device_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_clt_dev *dev; + int err; + + dev = container_of(kobj, struct brmr_clt_dev, kobj); + + if (!sysfs_streq(buf, "1")) { + pr_err("%s: unknown value: '%s'\n", attr->attr.name, buf); + return -EINVAL; + } + + pr_info("Closing device %s.\n", dev->gd->disk_name); + + /* + * We take explicit module reference only for one reason: do not + * race with lockless ibnbd_destroy_sessions(). + */ + if (!try_module_get(THIS_MODULE)) { + return -ENODEV; + } + err = brmr_clt_close_device(dev, &attr->attr); + if (unlikely(err)) { + if (unlikely(err != -EALREADY)) + pr_err("unmap_device %s: %d\n", + dev->gd->disk_name, err); + goto module_put; + } + + /* + * Here device can be vanished! 
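+ * Do not dereference dev beyond this point.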
+ */ + err = count; + +module_put: + module_put(THIS_MODULE); + + return err; +} + +static struct kobj_attribute brmr_unmap_device_attr = + __ATTR(unmap_device, 0644, + brmr_unmap_device_show, brmr_unmap_device_store); + +static ssize_t brmr_clt_device_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct brmr_clt_dev *dev; + int cnt; + + dev = container_of(kobj, struct brmr_clt_dev, kobj); + + switch (dev->dev_state) { + case DEV_STATE_INIT: + cnt = sysfs_emit(page, "init\n"); + break; + case DEV_STATE_READY: + cnt = sysfs_emit(page, "ready\n"); + break; + case DEV_STATE_DISCONNECTED: + cnt = sysfs_emit(page, "disconnected\n"); + break; + case DEV_STATE_CLOSING: + cnt = sysfs_emit(page, "closing\n"); + break; + default: + cnt = sysfs_emit(page, "unknown\n"); + break; + } + + if (dev->map_incomplete) + cnt += sysfs_emit_at(page, cnt, "degraded\n"); + + return cnt; +} + +static struct kobj_attribute brmr_clt_device_state = + __ATTR(state, 0444, brmr_clt_device_state_show, NULL); + +static struct attribute *brmr_clt_dev_attrs[] = { + &brmr_unmap_device_attr.attr, + &brmr_clt_device_state.attr, + NULL, +}; +ATTRIBUTE_GROUPS(brmr_clt_dev); + +static struct kobj_type brmr_clt_device_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = brmr_clt_dev_groups, +}; + +static struct kobj_type brmr_clt_stats_ktype = { + .sysfs_ops = &kobj_sysfs_ops, +}; + +static int brmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *kobj_stats); + +static int brmr_clt_create_dev_sysfs_files(struct brmr_clt_dev *dev) +{ + int ret; + + ret = kobject_init_and_add(&dev->kobj, &brmr_clt_device_ktype, + brmr_devs_kobj, + "%s", dev->gd->disk_name); + if (ret) + pr_err("Failed to create sysfs dir for device '%s': %d\n", + dev->gd->disk_name, ret); + + ret = brmr_clt_create_stats_files(&dev->kobj, &dev->kobj_stats); + if (unlikely(ret)) { + pr_err("Failed to create sysfs stats files " + "for device '%s': %d\n", dev->gd->disk_name, ret); + kobject_del(&dev->kobj); + kobject_put(&dev->kobj); + } + return ret; +} + +static int brmr_add_dev_symlink(struct brmr_clt_dev *dev) +{ + struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj; + int ret; + + ret = sysfs_create_link(&dev->kobj, gd_kobj, BRMR_LINK_NAME); + if (ret) { + pr_err("Creating symlink for %s failed, err: %d\n", + dev->gd->disk_name, ret); + } + + return ret; +} + +void brmr_clt_destroy_dev_sysfs_files(struct brmr_clt_dev *dev, + const struct attribute *sysfs_self) +{ + if (dev->kobj.state_in_sysfs) { + + kobject_del(&dev->kobj_stats); + kobject_put(&dev->kobj_stats); + if (sysfs_self) + sysfs_remove_file_self(&dev->kobj, sysfs_self); + kobject_del(&dev->kobj); + kobject_put(&dev->kobj); + } +} + +int brmr_clt_create_sysfs_files(void) +{ + int err; + + brmr_dev_class = class_create("brmr-client"); + if (IS_ERR(brmr_dev_class)) + return PTR_ERR(brmr_dev_class); + + brmr_dev = device_create(brmr_dev_class, NULL, + MKDEV(0, 0), NULL, "ctl"); + if (IS_ERR(brmr_dev)) { + err = PTR_ERR(brmr_dev); + goto cls_destroy; + } + brmr_devs_kobj = kobject_create_and_add("devices", &brmr_dev->kobj); + if (unlikely(!brmr_devs_kobj)) { + err = -ENOMEM; + goto dev_destroy; + } + err = sysfs_create_group(&brmr_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto put_devs_kobj; + + return 0; + +put_devs_kobj: + kobject_del(brmr_devs_kobj); + kobject_put(brmr_devs_kobj); +dev_destroy: + device_unregister(brmr_dev); +cls_destroy: + class_destroy(brmr_dev_class); + + return err; +} + +void brmr_clt_destroy_sysfs_files(void) 
+{ + sysfs_remove_group(&brmr_dev->kobj, &default_attr_group); + kobject_del(brmr_devs_kobj); + kobject_put(brmr_devs_kobj); + device_unregister(brmr_dev); + class_destroy(brmr_dev_class); +} + +STAT_ATTR(struct brmr_clt_dev, requests, + brmr_clt_stats_rq_to_str, brmr_clt_reset_submitted_req); +STAT_ATTR(struct brmr_clt_dev, request_sizes, + brmr_clt_stats_sizes_to_str, brmr_clt_reset_req_sizes); +STAT_ATTR(struct brmr_clt_dev, sts_resource, + brmr_stats_sts_resource_to_str, brmr_clt_reset_sts_resource); +STAT_ATTR(struct brmr_clt_dev, sts_resource_per_cpu, + brmr_stats_sts_resource_per_cpu_to_str, brmr_clt_reset_sts_resource); + +static struct attribute *brmr_stats_attrs[] = { + &requests_attr.attr, + &request_sizes_attr.attr, + &sts_resource_attr.attr, + &sts_resource_per_cpu_attr.attr, + NULL, +}; + +static struct attribute_group brmr_stats_attr_group = { + .attrs = brmr_stats_attrs, +}; + +static int brmr_clt_create_stats_files(struct kobject *kobj, + struct kobject *kobj_stats) +{ + int ret; + + ret = kobject_init_and_add(kobj_stats, &brmr_clt_stats_ktype, kobj, "stats"); + if (ret) { + pr_err("Failed to init and add stats kobject, err: %d\n", + ret); + return ret; + } + + ret = sysfs_create_group(kobj_stats, &brmr_stats_attr_group); + if (ret) { + pr_err("failed to create stats sysfs group, err: %d\n", + ret); + goto put_stats_obj; + } + + return 0; + +put_stats_obj: + kobject_del(kobj_stats); + kobject_put(kobj_stats); + + return ret; +} From 382c1366ae860362a86a71684059e988b3bcc667 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:23 +0200 Subject: [PATCH 11/13] block/brmr: server: main functionality Add the BRMR server implementation that exports a local block device as the backing store for an RMR pool. brmr-srv.c implements the struct rmr_srv_store_ops interface provided by RMR (rmr-srv.h) and registers each backing device with rmr_srv_register(). The submit_req and submit_md_req callbacks issue bios to the underlying block_device, propagating the completion back to RMR via rmr_srv_endreq(). The on-disk metadata header at the end of the device is validated on bring-up and used to detect re-mapping into an existing pool. This file is not compiled until the modules are wired into the build in a later patch in this series. 
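The data path is a thin pass-through to the backing device: submit_req()
turns an RMR request into a bio against the backing block_device and
completes it back into RMR from the bio's end_io callback. A minimal sketch
of that completion hook follows; the request-handle type and the exact
rmr_srv_endreq() arguments are assumptions made for illustration (the real
signatures live in rmr-srv.h and brmr-srv.c):

    /* Sketch only: the request handle and rmr_srv_endreq() arguments are assumed. */
    static void brmr_srv_sketch_end_io(struct bio *bio)
    {
    	/* handle that submit_req() stashed in bi_private when issuing the bio */
    	void *rmr_req = bio->bi_private;

    	/* report the block-layer outcome back to the RMR server core */
    	rmr_srv_endreq(rmr_req, blk_status_to_errno(bio->bi_status));
    	bio_put(bio);
    }

submit_req() is expected to set bio->bi_private and bio->bi_end_io
accordingly before calling submit_bio() on the backing device.
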
Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/block/brmr/brmr-srv.c | 1402 +++++++++++++++++++++++++++++++++ 1 file changed, 1402 insertions(+) create mode 100644 drivers/block/brmr/brmr-srv.c diff --git a/drivers/block/brmr/brmr-srv.c b/drivers/block/brmr/brmr-srv.c new file mode 100644 index 000000000000..cf85a54e4511 --- /dev/null +++ b/drivers/block/brmr/brmr-srv.c @@ -0,0 +1,1402 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include + +#include "brmr-srv.h" +#include "rmr-srv.h" + +MODULE_AUTHOR("The RMR and BRMR developers"); +MODULE_VERSION(BRMR_SERVER_VER_STRING); +MODULE_DESCRIPTION("BRMR Server"); +MODULE_LICENSE("GPL"); + +LIST_HEAD(store_list); +DEFINE_MUTEX(store_mutex); /* mutex to protect store_list */ + +/** + * brmr_srv_blk_validate_md() - Parse metadata for the given rmr block device and validate it + * + * @dev: RMR block device against which the md is to be validated + * @meta: pointer to metadata to be checked + * + * Return: + * 0: On success + * -Error: On failure + */ +int brmr_srv_blk_validate_md(struct brmr_srv_blk_dev *dev, struct brmr_srv_blk_dev_meta *meta) +{ + if (meta->magic != BRMR_BLK_STORE_MAGIC) { + pr_warn("No md found. store %s md magic=%llX does not match %X\n", + dev->poolname, meta->magic, BRMR_BLK_STORE_MAGIC); + return -EINVAL; + } + + // TODO: check version! + + if (dev->dev_size && dev->dev_size != meta->dev_size) { + pr_err("store %s dev_size %llu does not match md value %llu\n", + dev->poolname, dev->dev_size, meta->dev_size); + return -EINVAL; + } + + if (dev->mapped_size != meta->mapped_size) { + pr_err("store %s mapped_size %llu does not match md value %llu\n", + dev->poolname, dev->mapped_size, meta->mapped_size); + return -EINVAL; + } + + if (strncmp(dev->poolname, meta->poolname, NAME_MAX)) { + pr_err("store %s does not match md value %s\n", + dev->poolname, meta->poolname); + return -EINVAL; + } + + pr_debug("store %s md: mapped_size=%llu\n", + dev->poolname, meta->mapped_size); + pr_debug("md parsing is done for store %s\n", dev->poolname); + + return 0; +} + +/** + * brmr_srv_blk_fill_md() - Fill metadata from brmr srv block device + * + * @dev: BRMR server block device from which data is to be taken + * @data: pointer to metadata + * + * Return: + * 0: On success + * -Error: On failure + */ +static int brmr_srv_blk_fill_md(struct brmr_srv_blk_dev *dev, void *data) +{ + struct brmr_srv_blk_dev_meta *meta = data; + + meta->magic = BRMR_BLK_STORE_MAGIC; + meta->version = 0; + meta->dev_size = dev->dev_size; + meta->offset = BLK_STR_MD_SIZE_SECTORS; + meta->ts = jiffies; // or ktime_get_real_seconds(); + meta->mapped_size = dev->mapped_size; + meta->state = dev->state; + + memcpy(&meta->dev_params, &dev->dev_params, sizeof(struct rmr_blk_dev_params)); + + strscpy(meta->poolname, dev->poolname, NAME_MAX); + + pr_debug("md filling pool %s is done for dev %s\n", meta->poolname, dev->name); + + return 0; +} + +static int brmr_srv_blk_md_io_sync(struct block_device *bdev, int rw, void *md_data) +{ + int err = 0; + struct bio *bio; + blk_opf_t bio_flags = REQ_META; + u32 bytes; + + bio = bio_alloc(bdev, 1, bio_flags, GFP_NOIO); + if (!bio) { + pr_err("Failed to allocate metadata bio\n"); + return -ENOMEM; + } + + bytes = bio_add_page(bio, virt_to_page(md_data), PAGE_SIZE, 0); + if (bytes != PAGE_SIZE) { + pr_err("Failed to 
add page to bio, bytes returned=%u, expected %lu\n", + bytes, PAGE_SIZE); + err = -EINVAL; + goto bio_put; + } + + if (rw == READ) + bio->bi_opf = REQ_OP_READ; + else + bio->bi_opf = REQ_OP_WRITE | REQ_FUA; + + bio->bi_opf |= bio_flags; + bio->bi_iter.bi_sector = 0; + bio_set_dev(bio, bdev); + + pr_debug("submit_bio_wait dev %s, rw %s\n", + bdev->bd_disk->disk_name, rw == WRITE ? "WRITE" : "READ"); + err = submit_bio_wait(bio); + if (err) { + pr_err("Error reading md from %s, err %d\n", + bdev->bd_disk->disk_name, err); + goto bio_put; + } + pr_info("%s: for dev %s md rw %s is completed with code %d\n", + __func__, bdev->bd_disk->disk_name, rw == WRITE ? "WRITE" : "READ", err); + +bio_put: + bio_put(bio); + + return err; +} + +/** + * brmr_srv_blk_read_md() - read md from given block device + * + * @bdev: block device from which to read md + * @md_page: buffer to fill with md + */ +static int brmr_srv_blk_bdev_read_md(struct block_device *bdev, char *md_page) +{ + int err = 0; + + err = brmr_srv_blk_md_io_sync(bdev, READ, md_page); + if (err) { + pr_err("error reading md from %s, err %d\n", bdev->bd_disk->disk_name, err); + return err; + } + + pr_debug("read md from dev %s is done\n", bdev->bd_disk->disk_name); + + return err; +} + +static int brmr_srv_blk_write_md(struct brmr_srv_blk_dev *dev) +{ + int err = 0; + void *md_page; + + pr_debug("flush md to dev %s\n", dev->name); + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_err("Failed to allocate page to read md\n"); + err = -ENOMEM; + goto out; + } + + err = brmr_srv_blk_fill_md(dev, md_page); + if (err) { + pr_err("error filling md for dev %s, err %d\n", dev->name, err); + goto free_md_page; + } + + err = brmr_srv_blk_md_io_sync(dev->bdev, WRITE, md_page); + if (err) { + pr_err("error writing md to %s, err %d\n", dev->name, err); + goto free_md_page; + } + pr_debug("flush md to dev is done %s\n", dev->name); + +free_md_page: + kfree(md_page); +out: + return err; +} + +static void brmr_srv_blk_zero_md(struct brmr_srv_blk_dev *dev) +{ + int err = 0; + void *md_page; + + pr_debug("zero md on dev %s\n", dev->name); + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_warn("Failed to allocate page to read md\n"); + return; + } + + err = brmr_srv_blk_md_io_sync(dev->bdev, WRITE, md_page); + if (err) + pr_warn("error writing zero md to %s, err %d\n", dev->name, err); + + pr_debug("zero md on dev is done %s\n", dev->name); + kfree(md_page); +} + +static void brmr_srv_ref_kill(struct brmr_srv_blk_dev *dev) +{ + percpu_ref_kill(&dev->kref); + wait_for_completion(&dev->comp); +} + +static void brmr_srv_blk_release(struct percpu_ref *kref) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kref, struct brmr_srv_blk_dev, kref); + complete(&dev->comp); +} + +/** + * brmr_srv_blk_close() - Close a brmr srv block device + * + * @dev: BRMR server block device to be closed + * + * Description: + * Close an opened brmr srv store block device. + * This function is the opposite of brmr_srv_blk_open. + * This function is supposed to be the check and stop for inflight IOs. + * + * Locks: + * store_mutex should be held while calling this. 
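+ * (It protects store_list; the device is unlinked from it below.)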
+ */ +void brmr_srv_blk_close(struct brmr_srv_blk_dev *dev, bool delete) +{ + pr_info("rmr store name: %s; dev %s is closing\n", dev->poolname, dev->name); + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_OPEN); + + list_del(&dev->entry); + + pr_info("brmr server store blk dev %s wait for io to complete.\n", dev->name); + brmr_srv_ref_kill(dev); + + /* + * Reinit the ref counter so that RMR can send metadata requests. + */ + reinit_completion(&dev->comp); + percpu_ref_reinit(&dev->kref); + + rmr_srv_unregister(dev->poolname, delete); + dev->pool = NULL; + brmr_srv_ref_kill(dev); + + if (delete) + brmr_srv_blk_zero_md(dev); +} + +static int brmr_srv_blk_do_discard(struct brmr_srv_blk_dev *dev) +{ + struct rmr_pool *pool = dev->pool; + int err; + + pr_info("store id %s has mapped size of %llu, send discarded chunks to rmr pool %s\n", + dev->poolname, dev->mapped_size, dev->pool->poolname); + + err = rmr_srv_discard_id(pool, 0, 0, 0, true); + if (err) + pr_err("store %s failed to discard all data\n", dev->poolname); + + return err; +} + +/** + * brmr_srv_init_cmd() - Initialize message command + * + * @msg: command message where to init + */ +static void brmr_srv_init_cmd(struct brmr_msg_cmd *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.type = cpu_to_le16(BRMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->ver = BRMR_PROTO_VER_MAJOR; +} + +/** + * brmr_srv_cmd_conf() - Confirmation function for brmr srv store internal command message + * + * @priv: priv pointer to brmr command private data + * @errno: error number passed from RMR. + * See description of errno in RMR function. + * + * Description: + * Command response for a map new command can fail on multiple levels. + * If RMR fails to send the message to any or one of the nodes, that would reflect on the + * errno. If the command fails on BRMR level, that would reflect on the rsp struct. + * The error number will be used differently by different commands accordingly. 
+ */ +static void brmr_srv_cmd_conf(void *priv, int errno) +{ + struct brmr_cmd_priv *cmd_priv = (struct brmr_cmd_priv *)priv; + + cmd_priv->errno = errno; + + switch (cmd_priv->cmd_type) { + case BRMR_CMD_GET_PARAMS: + if (cmd_priv->errno) + pr_err("%s: BRMR_CMD_GET_PARAMS failed with err=%pe on sending", + __func__, ERR_PTR(errno)); + + break; + + default: + cmd_priv->errno = -EINVAL; + pr_err("%s: Unknown command type %d err=%d\n", __func__, cmd_priv->cmd_type, errno); + } + + complete(&cmd_priv->complete_done); +} + +/** + * brmr_srv_send_msg_cmd() - Sends command message to internal rmr pool through rmr-srv pool + * + * @dev: pointer to brmr device + * @msg: msg struct to be sent + * @rsp_buf: response buffer where the response of the storage side is stored + * @rsp_buf_len: length of the response buffer + * + * Return: + * Negative if failed to sent command + * As handled by each command in brmr_cmd_conf, if succeeded to send command + * + * Context: + * Would block until response is received + */ +static int brmr_srv_send_msg_cmd(struct brmr_srv_blk_dev *dev, struct brmr_msg_cmd *msg, + void *rsp_buf, size_t rsp_buf_len) +{ + struct brmr_cmd_priv cmd_priv; + struct kvec vec; + int ret; + + vec = (struct kvec) { + .iov_base = msg, + .iov_len = sizeof(*msg) + }; + + cmd_priv.dev = dev; + cmd_priv.cmd_type = msg->cmd_type; + cmd_priv.rsp_buf = rsp_buf; + cmd_priv.rsp_buf_len = rsp_buf_len; + cmd_priv.errno = 0; + init_completion(&cmd_priv.complete_done); + + ret = rmr_srv_pool_cmd_with_rsp(dev->pool, brmr_srv_cmd_conf, &cmd_priv, &vec, 1, rsp_buf, + rsp_buf_len, sizeof(struct brmr_msg_cmd_rsp)); + + if (!ret) { + wait_for_completion(&cmd_priv.complete_done); + ret = cmd_priv.errno; + } + + return ret; +} + +/** + * brmr_srv_blk_get_params() - Get parameters from other servers + * + * @dev: Backend device for which to be checked + * + * Description: + * Check whether parameters from other servers are consistent with this server through + * internal network. + * + * Return: + * 0 on success of checks + * -Negative error value on failure of checks. + * -EAGAIN if no sync sessions are connected to this server. + */ +static int brmr_srv_blk_get_params(void *device) +{ + struct brmr_srv_blk_dev *dev; + struct brmr_msg_cmd msg; + struct brmr_msg_cmd_rsp *brmr_cmd_rsp; + void *rsp_buf; + size_t rsp_buf_len; + int err = 0, i; + bool checked = false; + + dev = (struct brmr_srv_blk_dev *)device; + brmr_srv_init_cmd(&msg); + msg.cmd_type = BRMR_CMD_GET_PARAMS; + + rsp_buf_len = sizeof(struct brmr_msg_cmd_rsp) * RMR_POOL_MAX_SESS; + rsp_buf = kzalloc(rsp_buf_len, GFP_KERNEL); + if (!rsp_buf) + return -ENOMEM; + + err = brmr_srv_send_msg_cmd(dev, &msg, rsp_buf, rsp_buf_len); + if (err < 0) { + pr_warn("%s: brmr_send_msg_cmd failed with errno %d\n", __func__, err); + /* + * Sending could fail for various reasons. The server may be isolated and has + * no connected sync sessions to other nodes. Or the connected server has no + * store attached. + */ + goto free_data; + } + + /* + * We do not care if the command failed for few storage nodes, as long as we get a good + * response from one of them. + * + * The mapped size of all storage nodes which are connected should be the same, whether + * the backend device of those nodes is mapped or not. + * + * TODO: If the responses of other storage nodes are different, then use values from + * nodes which are mapped. If there are no mapped devices in the pool, then the check + * will fail when the mapped sizes are different. 
+ */
+	brmr_cmd_rsp = (struct brmr_msg_cmd_rsp *)rsp_buf;
+	for (i = 0; i < RMR_POOL_MAX_SESS; i++, brmr_cmd_rsp++) {
+		struct brmr_cmd_get_params_rsp *get_params_rsp = &brmr_cmd_rsp->get_params_rsp;
+		struct brmr_blk_dev_params *rsp_dev_params;
+
+		/*
+		 * If there is no magic, or the command failed,
+		 * we do not use that node's info to perform the check.
+		 */
+		if (brmr_cmd_rsp->magic != BRMR_CMD_RSP_MAGIC ||
+		    brmr_cmd_rsp->status)
+			continue;
+
+		if (dev->mapped_size != le64_to_cpu(get_params_rsp->mapped_size)) {
+			pr_err("%s: Mismatch in mapped_size: %llu != %llu\n", __func__,
+			       dev->mapped_size, le64_to_cpu(get_params_rsp->mapped_size));
+			err = -EINVAL;
+			goto free_data;
+		}
+
+		rsp_dev_params = &get_params_rsp->dev_params;
+
+		dev->dev_params.max_hw_sectors = le32_to_cpu(rsp_dev_params->max_hw_sectors);
+		dev->dev_params.max_write_zeroes_sectors =
+			le32_to_cpu(rsp_dev_params->max_write_zeroes_sectors);
+		dev->dev_params.max_discard_sectors =
+			le32_to_cpu(rsp_dev_params->max_discard_sectors);
+		dev->dev_params.discard_granularity =
+			le32_to_cpu(rsp_dev_params->discard_granularity);
+		dev->dev_params.discard_alignment = le32_to_cpu(rsp_dev_params->discard_alignment);
+		dev->dev_params.physical_block_size =
+			le16_to_cpu(rsp_dev_params->physical_block_size);
+		dev->dev_params.logical_block_size =
+			le16_to_cpu(rsp_dev_params->logical_block_size);
+		dev->dev_params.max_segments = le16_to_cpu(rsp_dev_params->max_segments);
+		dev->dev_params.secure_discard = le16_to_cpu(rsp_dev_params->secure_discard);
+		dev->dev_params.cache_policy = rsp_dev_params->cache_policy;
+
+		/*
+		 * The check passed with at least one mapped storage node.
+		 *
+		 * We still perform the check for the other mapped storage nodes just for sanity.
+		 */
+		checked = true;
+	}
+
+	if (!checked) {
+		pr_err("%s: Check for mapped_size failed for dev %s.\n",
+		       __func__, dev->poolname);
+		err = -EINVAL;
+	}
+
+free_data:
+	kfree(rsp_buf);
+
+	return err;
+}
+
+/**
+ * brmr_srv_blk_add_handle_replace() - Handle check and discard for a store which was replaced
+ *
+ * @dev: BRMR server block device that is being added as a replacement
+ *
+ * Description:
+ *	When an empty disk is added to an already existing brmr server store, it means that the
+ *	empty disk is to replace the disk which was present in the existing brmr srv store.
+ *	Before replacing the disk with the new empty one, a number of things need to be done.
+ *	This function performs the following tasks:
+ *	1) Get some parameters from the other storage nodes through the internal network, and
+ *	   check whether the mapped_size passed for the new empty disk is correct.
+ *	2) If the above check passed, send a discard up to rmr-server.
+ *
+ * Return:
+ *	0 on success
+ *	-Error value on error
+ */
+static int brmr_srv_blk_add_handle_replace(struct brmr_srv_blk_dev *dev)
+{
+	int err = 0;
+
+	/*
+	 * The check passed. We can now do the discard safely.
+	 */
+	err = brmr_srv_blk_do_discard(dev);
+	if (err) {
+		pr_err("%s: brmr_srv_blk_do_discard failed for dev %s\n", __func__, dev->poolname);
+		return err;
+	}
+
+	/*
+	 * We are done with everything. Now set the MAPPED state and write the
+	 * metadata again so that it is persisted and IOs can be served.
+ */
+	brmr_srv_blk_set_state(dev, BRMR_SRV_STORE_MAPPED);
+	err = brmr_srv_blk_write_md(dev);
+	if (err) {
+		pr_err("%s: dev %s: write md error %d\n", __func__, dev->name, err);
+		brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_MAPPED);
+		return err;
+	}
+
+	/*
+	 * After the discarded entries have been sent to rmr-server, set the map version of
+	 * the rmr pool to zero.
+	 */
+	rmr_srv_replace_store(dev->pool);
+	return 0;
+}
+
+/**
+ * brmr_srv_read_and_check_md() - Read and check metadata if it exists
+ *
+ * @dev: BRMR server block device for which the metadata is to be checked
+ * @md_page: pointer to the buffer into which the metadata is read
+ *
+ * Description:
+ *	Read metadata from the given store device, and check whether metadata exists.
+ *
+ * Return:
+ *	0: read was successful and metadata exists
+ *	-1: read was successful but metadata doesn't exist
+ *	-Errno: read failed
+ */
+int brmr_srv_read_and_check_md(struct brmr_srv_blk_dev *dev, void *md_page)
+{
+	struct brmr_srv_blk_dev_meta *meta = md_page;
+	int err;
+
+	err = brmr_srv_blk_bdev_read_md(dev->bdev, md_page);
+	if (err) {
+		pr_err("%s: failed to read md, err=%d\n", __func__, err);
+		return -EINVAL;
+	}
+
+	if (meta->magic != BRMR_BLK_STORE_MAGIC) {
+		pr_info("%s: No MD exists for block device %s, md magic=%llX does not match %X\n",
+			__func__, dev->name, meta->magic, BRMR_BLK_STORE_MAGIC);
+		return -1;
+	}
+
+	pr_info("%s: %s MD exists for block device %s\n", __func__, meta->poolname, dev->name);
+
+	return 0;
+}
+
+/**
+ * brmr_srv_blk_open() - Open a brmr srv block device
+ *
+ * @dev: BRMR server block device structure to be used.
+ * @path: path to the block device.
+ * @create: Whether to create a new store or open an existing one.
+ * @replace: Whether the device is an empty disk being added to replace an existing one.
+ *
+ * Description:
+ *	Open the block device "path", and populate the brmr srv block device "dev"
+ *	with the details.
+ *	To close the device, call brmr_srv_blk_close().
+ *
+ * Return:
+ *	0 on success
+ *	-Error value on error
+ *
+ * Locks:
+ *	store_mutex should be held while calling this.
+ */
+int brmr_srv_blk_open(struct brmr_srv_blk_dev *dev, const char *path,
+		      bool create, bool replace)
+{
+	struct rmr_attrs attr;
+	int err;
+
+	err = rmr_srv_query(NULL, dev->mapped_size, &attr);
+	if (err) {
+		pr_err("dev %s: rmr srv query failed %d\n", dev->name, err);
+		return err;
+	}
+
+	if ((dev->mapped_size + BLK_STR_MD_SIZE_SECTORS + attr.rmr_md_size) > dev->dev_size) {
+		pr_err("%s: dev %s: No space for rmr metadata %llu (in sectors)\n",
+		       __func__, dev->name, attr.rmr_md_size);
+		return -ENOSPC;
+	}
+
+	/*
+	 * After the device registers with the RMR server pool, the RMR server sends metadata
+	 * requests to the device, which starts reference counting. The reference count of the
+	 * device must therefore be initialized before any in-flight requests reach BRMR.
+	 */
+	err = percpu_ref_init(&dev->kref, brmr_srv_blk_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
+	if (err) {
+		pr_err("%s: percpu ref init failed.\n", __func__);
+		return -EINVAL;
+	}
+	init_completion(&dev->comp);
+
+	dev->pool = rmr_srv_register(dev->poolname, &pstore_blk_ops, dev,
+				     dev->mapped_size, create ? RMR_SRV_DISK_CREATE :
+				     (replace ?
RMR_SRV_DISK_REPLACE : + RMR_SRV_DISK_ADD)); + if (!dev->pool) { + pr_err("Failed registering blk store %s, err\n", dev->poolname); + brmr_srv_ref_kill(dev); + return -EINVAL; + } + + brmr_srv_blk_set_state(dev, BRMR_SRV_STORE_OPEN); + + if (!create) { + err = brmr_srv_blk_get_params(dev); + if (replace) { + /* + * Any failure of getting parameters is not allowed when replacing a store. + * Either it failed to send the command or the parameters are different. + */ + if (err) { + pr_err("%s: replace_store: brmr_srv_blk_get_params failed with err %d\n", + __func__, err); + goto close_dev; + } + } else { + /* + * The store creation will fail if the connected servers to this server + * share different parameter values. If sending the command of getting + * parameters failed due to no sync sessions connected to this server + * where no parameters are received, the store will be created, delaying + * checks when this server is connected to some other servers. + */ + if (err && err != -EAGAIN) { + pr_err("%s: create_store: brmr_srv_blk_get_params failed with err %d\n", + __func__, err); + goto close_dev; + } + } + + /* + * TODO: Would we be creating the maps for replace (empty disk) at the + * same time as we create one for create_disk? + */ + if (replace) { + err = brmr_srv_blk_add_handle_replace(dev); + if (err) { + pr_err("%s: replace_store %s: handling replace failed with err %d", + __func__, dev->poolname, err); + goto close_dev; + } + } + } + + /* we write md in both cases (new or old device) just to check if device is ok + * for writing + */ + err = brmr_srv_blk_write_md(dev); + if (err) { + pr_err("dev %s: write md error %d\n", dev->name, err); + goto close_dev; + } + + list_add(&dev->entry, &store_list); + + pr_info("%s: brmr srv blk str %s, dev %s set state to open\n", __func__, dev->poolname, + dev->name); + + return 0; + +close_dev: + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_OPEN); + /* + * TODO: Ideally, the unregister should be called with (create || replace). + * But right now there is no way to RMR to go ahead with the delete, + * even if marked_delete is not set. + */ + rmr_srv_unregister(dev->poolname, create); + dev->pool = NULL; + brmr_srv_ref_kill(dev); + + return err; +} + +/** + * brmr_srv_blk_cleanup() - Cleanup all the opened and active brmr srv block devices + * + * Description: + * This function is called when the module brmr server store is getting removed. + * It closes, destroys and frees all the open and active brmr server block devices. + */ +static void brmr_srv_blk_cleanup(void) +{ + struct brmr_srv_blk_dev *dev, *tmp; + + mutex_lock(&store_mutex); + list_for_each_entry_safe(dev, tmp, &store_list, entry) { + blk_str_destroy_sysfs_files(dev, NULL); + brmr_srv_blk_close(dev, false); + + pr_info("put blkdev %s\n", dev->bdev->bd_disk->disk_name); + bdev_fput(dev->bdev_file); + + brmr_srv_blk_destroy(dev); + } + mutex_unlock(&store_mutex); +} + +/** + * brmr_srv_blk_create() - Create an brmr_srv_blk_dev with the given data + * + * @path: path to the block device. 
+ * @poolname: Name to be given to the created block device
+ *
+ * Description:
+ *	To destroy a created brmr server block device, call brmr_srv_blk_destroy()
+ *
+ * Return:
+ *	Pointer to the allocated brmr srv block device on success
+ *	Error pointer on error
+ */
+struct brmr_srv_blk_dev *brmr_srv_blk_create(const char *path, char *poolname)
+{
+	struct brmr_srv_blk_dev *dev;
+	int err = 0;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	strscpy(dev->poolname, poolname, NAME_MAX);
+
+	dev->io_priv_cache = kmem_cache_create("brmr_srv_io_priv_cache",
+					       sizeof(struct brmr_srv_io_priv), 0, 0, NULL);
+	if (!dev->io_priv_cache) {
+		pr_err("failed to create cache for device %s\n", poolname);
+		err = -ENOMEM;
+		goto free_dev;
+	}
+
+	pr_debug("brmr srv blk store with name %s created\n", poolname);
+
+	return dev;
+
+free_dev:
+	kfree(dev);
+err:
+	return ERR_PTR(err);
+}
+
+/**
+ * brmr_srv_blk_destroy() - Destroy a given brmr_srv_blk_dev
+ *
+ * @dev: brmr server block device to be destroyed
+ *
+ * Description:
+ *	This function is the opposite of brmr_srv_blk_create().
+ */
+void brmr_srv_blk_destroy(struct brmr_srv_blk_dev *dev)
+{
+	kmem_cache_destroy(dev->io_priv_cache);
+	kfree(dev);
+}
+
+/**
+ * brmr_srv_blk_map_dev() - Process a map command from the client side
+ *
+ * @dev: brmr server block device to be mapped
+ * @map_cmd: Pointer to structure holding map command info
+ *
+ * Description:
+ *	We save all the data and params sent in the command in our metadata,
+ *	since these are assured to have been validated across all storage nodes.
+ *
+ *	For future get params requests, we send back these instead of reading them
+ *	from the underlying block device.
+ * + * Return: + * 0 on success + * -Error value on error + */ +static int brmr_srv_blk_map_dev(struct brmr_srv_blk_dev *dev, + const struct brmr_msg_map_new_cmd *map_cmd) +{ + const struct brmr_blk_dev_params *cmd_dev_params = &map_cmd->dev_params; + int err; + u64 recvd_mapped_size = map_cmd->mapped_size; + + pr_info("%s: Mapping device %s with mapped_size %llu, recvd size %llu\n", + __func__, dev->name, dev->mapped_size, recvd_mapped_size); + + if (test_bit(BRMR_SRV_STORE_MAPPED, &dev->state)) { + pr_err("%s: Received map command for already mapped device %s\n", + __func__, dev->name); + return -EINVAL; + } + + if (recvd_mapped_size > dev->dev_size - BLK_STR_MD_SIZE_SECTORS) { + pr_err("can not map %llu, only %llu available %s\n", + recvd_mapped_size, dev->dev_size - BLK_STR_MD_SIZE_SECTORS, dev->name); + return -ENOSPC; + } + + if (dev->mapped_size && dev->mapped_size != recvd_mapped_size) { + pr_err("dev %s is already mapped with size %llu, does not match %llu", + dev->name, dev->mapped_size, recvd_mapped_size); + return -EINVAL; + } + + dev->mapped_size = recvd_mapped_size; + + dev->dev_params.max_hw_sectors = le32_to_cpu(cmd_dev_params->max_hw_sectors); + dev->dev_params.max_write_zeroes_sectors = + le32_to_cpu(cmd_dev_params->max_write_zeroes_sectors); + dev->dev_params.max_discard_sectors = le32_to_cpu(cmd_dev_params->max_discard_sectors); + dev->dev_params.discard_granularity = le32_to_cpu(cmd_dev_params->discard_granularity); + dev->dev_params.discard_alignment = le32_to_cpu(cmd_dev_params->discard_alignment); + dev->dev_params.physical_block_size = le16_to_cpu(cmd_dev_params->physical_block_size); + dev->dev_params.logical_block_size = le16_to_cpu(cmd_dev_params->logical_block_size); + dev->dev_params.max_segments = le16_to_cpu(cmd_dev_params->max_segments); + dev->dev_params.secure_discard = le16_to_cpu(cmd_dev_params->secure_discard); + dev->dev_params.cache_policy = cmd_dev_params->cache_policy; + + brmr_srv_blk_set_state(dev, BRMR_SRV_STORE_MAPPED); + + err = brmr_srv_blk_write_md(dev); + if (err) { + pr_err("failed to write md for %s, err %d\n", dev->name, err); + dev->mapped_size = 0; + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_MAPPED); + return -EINVAL; + } + + return 0; +} + +/* Always succeeds. */ +static int brmr_srv_blk_unmap_dev(struct brmr_srv_blk_dev *dev) +{ + pr_info("unmap device: %s\n", dev->name); + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_MAPPED); + + return 0; +} + +static bool brmr_srv_blk_io_allowed(void *store_priv) +{ + struct brmr_srv_blk_dev *dev = store_priv; + + if (!dev) { + pr_err("no store registered\n"); + return false; + } + + return test_bit(BRMR_SRV_STORE_OPEN, &dev->state) && + test_bit(BRMR_SRV_STORE_MAPPED, &dev->state); +} + +#define bio_disk_name(bio) ((bio)->bi_bdev->bd_disk->disk_name) +#define bio_first_sector(bio) ((bio_end_sector(bio) - bio_sectors(bio))) + +static void brmr_srv_bi_end_io(struct bio *bio) +{ + struct brmr_srv_io_priv *io_priv = bio->bi_private; + struct brmr_srv_blk_dev *dev = io_priv->dev; + int err; + + err = blk_status_to_errno(bio->bi_status); + pr_debug("end io called for dev %s, bio=%p, err=%d\n", dev->poolname, bio, err); + + if (err) { + brmr_srv_blk_clear_state(dev, BRMR_SRV_STORE_OPEN); + pr_err("Dev %s, Bio %p type %s, err=%d bdev_name=%s\n", dev->poolname, + bio, bio_data_dir(bio) == WRITE ? 
"W" : "R", err, bio_disk_name(bio)); + } + + rmr_srv_req_resp(io_priv->priv, err); + + kmem_cache_free(dev->io_priv_cache, io_priv); + brmr_srv_blk_put_ref(dev); + bio_put(bio); +} + +static int brmr_srv_submit_bi(struct brmr_srv_blk_dev *dev, void *data, u64 offset, u32 length, + unsigned long flags, u16 prio, void *priv) +{ + struct bio *bio; + struct brmr_srv_io_priv *io_priv; + blk_opf_t bio_flags; + int ret = 0; + bool is_md_op = false; + + switch (rmr_op(flags)) { + case RMR_OP_READ: + bio_flags = REQ_OP_READ; + break; + case RMR_OP_WRITE: + case RMR_OP_SYNCREQ: + bio_flags = REQ_OP_WRITE; + break; + case RMR_OP_DISCARD: + bio_flags = REQ_OP_DISCARD; + break; + case RMR_OP_WRITE_ZEROES: + bio_flags = REQ_OP_WRITE_ZEROES; + break; + case RMR_OP_FLUSH: + bio_flags = REQ_OP_WRITE | REQ_PREFLUSH; + break; + case RMR_OP_MD_READ: + bio_flags = REQ_OP_READ; + is_md_op = true; + break; + case RMR_OP_MD_WRITE: + bio_flags = REQ_OP_WRITE; + is_md_op = true; + break; + default: + pr_err("Wrong flags=%lu\n", flags); + return -EINVAL; + } + + /* + * Most md IO are created on rmr-srv and does not get priority value passed on from rmr-clt + */ + if (is_md_op) { + bio_flags |= REQ_META; + if (rmr_op(flags) == RMR_OP_MD_WRITE) + bio_flags |= REQ_FUA; + } + + if (flags & RMR_F_SYNC) + bio_flags |= REQ_SYNC; + + if (flags & RMR_F_FUA) + bio_flags |= REQ_FUA; + + bio = bio_alloc(dev->bdev, 1, bio_flags, GFP_KERNEL); + if (bio_add_page(bio, virt_to_page(data), length, + offset_in_page(data)) != length) { + pr_err("Failed to map data to bio\n"); + ret = -EINVAL; + goto put_bio; + } + + io_priv = kmem_cache_zalloc(dev->io_priv_cache, GFP_KERNEL); + if (!io_priv) { + pr_err("Failed to alloc io_priv for op %lx dev %s\n", flags, dev->poolname); + ret = -ENOMEM; + goto put_bio; + } + + io_priv->dev = dev; + io_priv->priv = priv; + + bio->bi_private = io_priv; + bio->bi_end_io = brmr_srv_bi_end_io; + bio->bi_iter.bi_sector = offset; + bio->bi_iter.bi_size = length; + bio_set_dev(bio, dev->bdev); + + pr_debug("Submit %s bio=%p, disk=%s, flag=[%lx], bio_flag=[%x], op=[%x]" + "first_sect=%llu, sectors=%d\n", + is_md_op ? 
"md req" : "req", bio, bio_disk_name(bio), + flags, bio_flags, rmr_op(flags), + (u64)bio_first_sector(bio), bio_sectors(bio)); + + if (is_md_op) { + ret = submit_bio_wait(bio); + if (ret) { + pr_err("Error waiting md from %s, err %d\n", + dev->bdev->bd_disk->disk_name, ret); + } + goto end_bio; + } else { + /* + * Most md IO are created on rmr-srv and does not get priority value passed on from + * rmr-clt + */ + bio->bi_ioprio = prio; + submit_bio(bio); + } + + return 0; +end_bio: + rmr_srv_req_resp(io_priv->priv, ret); + kmem_cache_free(dev->io_priv_cache, io_priv); +put_bio: + bio_put(bio); + return ret; +} + +/** + * brmr_srv_process_blk_req() - Processes brmr srv store IO messages + * + * @dev: pointer to rmr block device + * @data: pointer to data + * @data_offset: offset on disk (represented in bytes) + * @length: length of data in bytes + * @flags: IO flags + * @prio: prio from block layer + * @priv: pointer to priv data for rmr + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_process_blk_req(void *device, void *data, u32 data_offset, + u32 length, unsigned long flags, u16 prio, void *priv) +{ + struct brmr_srv_blk_dev *dev = (struct brmr_srv_blk_dev *)device; + u64 offset = 0; /* in sectors */ + int ret = 0; + + if (!brmr_srv_blk_get_ref(dev)) { + pr_err("for dev %s, name %s, failed to get_ref\n", + dev->name, dev->poolname); + return -EIO; + } + + if (!brmr_srv_blk_io_allowed(dev)) { + pr_err("Store name %s, offset %u, length %u, io is not allowed!\n", + dev->poolname, data_offset, length); + ret = -EINVAL; + goto err; + } + + offset = BLK_STR_MD_SIZE_SECTORS; + offset += (data_offset) >> SECTOR_SHIFT; //bytes to sectors; + + pr_debug("Submitted req to %s, flag %lu offset %llu length %u\n", + dev->name, flags, offset, length); + ret = brmr_srv_submit_bi(dev, data, offset, length, flags, prio, priv); + if (ret) { + pr_err("%s: bio submission failed for data IO\n", __func__); + goto err; + } + + return 0; + +err: + brmr_srv_blk_put_ref(dev); + return ret; +} + +/** + * brmr_srv_process_blk_md_req() - Process the requests for rmr metadata + * + * Return: + * 0 on success + * + * Description: + * The rmr metadata will be stored at the end of the device. + */ +static int brmr_srv_process_blk_md_req(void *device, void *data, u32 data_offset, + u32 length, unsigned long flags, void *priv) +{ + struct brmr_srv_blk_dev *dev = device; + int err; + u64 offset = 0; /* in sectors */ + + if (!brmr_srv_blk_get_ref(dev)) { + pr_err("for dev %s, name %s, failed to get_ref\n", + dev->name, dev->poolname); + return -EIO; + } + + /* The mapped_size is in sectors. */ + offset = BLK_STR_MD_SIZE_SECTORS + dev->mapped_size; + offset += (data_offset) >> SECTOR_SHIFT; //bytes to sectors; + pr_debug("Submitted md req to %s, flag %lu offset %llu length %u\n", + dev->name, flags, offset, length); + /* + * It's no need to return err to upper layer here. If the submission of md request fails, + * it will go through the endreq path after the server req finishes processing. 
+ */ + err = brmr_srv_submit_bi(dev, data, offset, length, flags, 0, priv); + if (err) + pr_err("%s: bio submission failed for metadata IO\n", __func__); + brmr_srv_blk_put_ref(dev); + return 0; +} + +/** + * brmr_srv_init_cmd_rsp() - Initialize command response + * + * @msg: command response to initialize + */ +static void brmr_srv_init_cmd_rsp(struct brmr_msg_cmd_rsp *msg) +{ + memset(msg, 0, sizeof(*msg)); + + msg->hdr.type = cpu_to_le16(BRMR_MSG_CMD); + msg->hdr.__padding = 0; + msg->magic = BRMR_CMD_RSP_MAGIC; + msg->ver = BRMR_PROTO_VER_MAJOR; + msg->cmd_type = BRMR_CMD_RSP; +} + +/** + * brmr_srv_fill_dev_param_dev() - Fill dev params from the saved params in brmr srv block device + * + * @dev: pointer to brmr server block device + * @rsp: Pointer to command response structure holding params + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_fill_dev_param_dev(struct brmr_srv_blk_dev *dev, + struct brmr_cmd_get_params_rsp *rsp) +{ + struct brmr_srv_blk_dev_meta *md_page; + struct brmr_blk_dev_params *rsp_dev_params = &rsp->dev_params; + int ret; + + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + ret = -ENOMEM; + goto out; + } + + /* + * We have to read the metadata from the device. + */ + ret = brmr_srv_blk_bdev_read_md(dev->bdev, (void *)md_page); + if (ret) { + pr_err("%s: failed to read md, err=%d\n", __func__, ret); + goto out; + } + + if (md_page->magic != BRMR_BLK_STORE_MAGIC) { + pr_warn("%s: No md found. store %s md magic=%llX does not match %X\n", + __func__, dev->poolname, md_page->magic, BRMR_BLK_STORE_MAGIC); + ret = -EINVAL; + goto out; + } + + rsp_dev_params->max_hw_sectors = cpu_to_le32(md_page->dev_params.max_hw_sectors); + rsp_dev_params->max_write_zeroes_sectors = + cpu_to_le32(md_page->dev_params.max_write_zeroes_sectors); + rsp_dev_params->max_discard_sectors = cpu_to_le32(md_page->dev_params.max_discard_sectors); + rsp_dev_params->discard_granularity = cpu_to_le32(md_page->dev_params.discard_granularity); + rsp_dev_params->discard_alignment = cpu_to_le32(md_page->dev_params.discard_alignment); + rsp_dev_params->physical_block_size = cpu_to_le16(md_page->dev_params.physical_block_size); + rsp_dev_params->logical_block_size = cpu_to_le16(md_page->dev_params.logical_block_size); + rsp_dev_params->max_segments = cpu_to_le16(md_page->dev_params.max_segments); + rsp_dev_params->secure_discard = cpu_to_le16(md_page->dev_params.secure_discard); + rsp_dev_params->cache_policy = md_page->dev_params.cache_policy; + +out: + kfree(md_page); + return ret; +} + +/** + * brmr_srv_fill_dev_param_bdev() - Fill dev params from the underlying block device + * + * @dev: pointer to brmr server block device + * @rsp: Pointer to command response structure holding params + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_fill_dev_param_bdev(struct brmr_srv_blk_dev *dev, + struct brmr_cmd_get_params_rsp *rsp) +{ + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct brmr_blk_dev_params *rsp_dev_params = &rsp->dev_params; + + if (!q) { + pr_err("%s: no queue for dev %s\n", __func__, dev->name); + return -EINVAL; + } + + rsp_dev_params->logical_block_size = + cpu_to_le16(bdev_logical_block_size(bdev)); + rsp_dev_params->physical_block_size = + cpu_to_le16(bdev_physical_block_size(bdev)); + rsp_dev_params->max_segments = + cpu_to_le16(queue_max_segments(q)); + rsp_dev_params->max_hw_sectors = + cpu_to_le32(queue_max_hw_sectors(q)); + 
rsp_dev_params->max_write_zeroes_sectors = + cpu_to_le32(bdev_write_zeroes_sectors(bdev)); + rsp_dev_params->max_discard_sectors = cpu_to_le32(bdev_max_discard_sectors(bdev)); + rsp_dev_params->discard_granularity = + cpu_to_le32(bdev_get_queue(bdev)->limits.discard_granularity); + rsp_dev_params->discard_alignment = + cpu_to_le32(bdev_get_queue(bdev)->limits.discard_alignment); + rsp_dev_params->secure_discard = cpu_to_le16(bdev_max_secure_erase_sectors(bdev)); + rsp_dev_params->cache_policy = 0; + + if (blk_queue_write_cache(q)) + rsp_dev_params->cache_policy |= BRMR_WRITEBACK; + if (bdev_fua(bdev)) + rsp_dev_params->cache_policy |= BRMR_FUA; + + return 0; +} + +/** + * brmr_srv_fill_get_params_rsp() - Fill dev params into the command response structure + * + * @dev: pointer to brmr server block device + * @brmr_cmd_rsp: Pointer to command response structure + * + * Description: + * For mapped devices, we need to pick up the params from the brmr server block device itself + * These are the same ones which are saved in the metadata of the device. + * + * For unmapped devices, we need to extract this info from the underlying block device + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_fill_get_params_rsp(struct brmr_srv_blk_dev *dev, + struct brmr_msg_cmd_rsp *brmr_cmd_rsp) +{ + struct brmr_cmd_get_params_rsp *rsp; + int ret; + + if (!dev) { + pr_err("%s: no brmr srv blk dev to get params\n", __func__); + return -ENODEV; + } + + if (!dev->bdev) { + pr_err("%s: no bdev opened for dev %s\n", __func__, dev->name); + return -EINVAL; + } + + rsp = &brmr_cmd_rsp->get_params_rsp; + + /* + * For a mapped device, we get the saved params in the device structure (read from md) + * since those are the ones which would have gone through validation, + * when the map happened. + * + * For unmapped device, we get params from the underlying bdev. 
+ */ + if (test_bit(BRMR_SRV_STORE_MAPPED, &dev->state)) + ret = brmr_srv_fill_dev_param_dev(dev, rsp); + else + ret = brmr_srv_fill_dev_param_bdev(dev, rsp); + + if (ret) { + pr_err("%s: Fill dev params failed for dev %s\n", __func__, dev->name); + return -EINVAL; + } + + rsp->mapped = test_bit(BRMR_SRV_STORE_MAPPED, &dev->state); + rsp->mapped_size = cpu_to_le64(dev->mapped_size); + pr_info("%s: dev %s, mapped_size %llu\n", __func__, + dev->name, le64_to_cpu(rsp->mapped_size)); + + return 0; +} + +/** + * brmr_srv_blk_cmd() - Processes brmr srv store command messages + * + * @device: brmr server store device + * @usr_buf: user buffer containing the command message struct (ones sent as kvec to rmr) + * @usr_len: length of the usr_buf + * @data: data buffer where the response can be sent back for brmr client to read + * @datalen: length of data buffer + * + * Return: + * 0 in case of success + * negative in case of failure + */ +static int brmr_srv_blk_cmd(void *device, const void *usr_buf, int usr_len, void *data, + int datalen) +{ + struct brmr_srv_blk_dev *dev = device; + const struct brmr_msg_cmd *msg = (const struct brmr_msg_cmd *)usr_buf; + struct brmr_msg_cmd_rsp *brmr_cmd_rsp = (struct brmr_msg_cmd_rsp *)data; + int ret = 0; + + if (datalen < sizeof(*brmr_cmd_rsp)) { + WARN_ON(1); + return -EINVAL; + } + + if (!brmr_srv_blk_get_ref(dev)) { + pr_err("for dev %s, name %s, failed to get_ref to process command %d\n", + dev->name, dev->poolname, msg->cmd_type); + return -EIO; + } + + brmr_srv_init_cmd_rsp(brmr_cmd_rsp); + + switch (msg->cmd_type) { + case BRMR_CMD_MAP: + pr_info("%s: BRMR_CMD_MAP\n", __func__); + + brmr_cmd_rsp->status = brmr_srv_blk_map_dev(dev, &msg->map_new_cmd); + if (brmr_cmd_rsp->status) { + pr_err("Failed to map new dev to %s, err %d\n", + dev->name, brmr_cmd_rsp->status); + } + break; + case BRMR_CMD_REMAP: + pr_info("%s: BRMR_CMD_REMAP\n", __func__); + break; + case BRMR_CMD_UNMAP: + pr_info("%s: BRMR_CMD_UNMAP\n", __func__); + + brmr_cmd_rsp->status = brmr_srv_blk_unmap_dev(dev); + break; + case BRMR_CMD_GET_PARAMS: + pr_info("%s: BRMR_CMD_GET_PARAMS\n", __func__); + + brmr_cmd_rsp->status = brmr_srv_fill_get_params_rsp(dev, brmr_cmd_rsp); + break; + + default: + pr_err("%s: Unknown command type %d\n", __func__, msg->cmd_type); + } + + brmr_srv_blk_put_ref(dev); + + return ret; +} + +struct rmr_srv_store_ops pstore_blk_ops = { + .submit_req = brmr_srv_process_blk_req, + .submit_md_req = brmr_srv_process_blk_md_req, + .submit_cmd = brmr_srv_blk_cmd, + .io_allowed = brmr_srv_blk_io_allowed, + .get_params = brmr_srv_blk_get_params, +}; + +static int __init brmr_srv_init_module(void) +{ + int err = 0; + + pr_info("Loading module %s, version %s\n", + KBUILD_MODNAME, BRMR_SERVER_VER_STRING); + + err = brmr_srv_create_sysfs_files(); + if (err) { + pr_err("rmr_store_create_sysfs_files(), err: %d\n", err); + goto out; + } + + return 0; +out: + return err; +} + +static void __exit brmr_srv_cleanup_module(void) +{ + brmr_srv_blk_cleanup(); + brmr_srv_destroy_sysfs_files(); + + pr_info("Module %s unloaded\n", KBUILD_MODNAME); +} + +module_init(brmr_srv_init_module); +module_exit(brmr_srv_cleanup_module); From 2227c51b677e58796702e7327a6526e0e67bf592 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:24 +0200 Subject: [PATCH 12/13] block/brmr: server: sysfs interface functions Add the BRMR server sysfs interface used to register and unregister backing block devices. 
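For illustration, registering a fresh backing device and removing it again
could look roughly as follows (pool name, device path and size are made-up
values; judging from the size checks in the code, mapped_size is given in
sectors):

    echo "pool=pool1 device=/dev/nvme0n1 mapped_size=2097152" > \
        /sys/devices/virtual/brmr-server/ctl/create_store
    echo 1 > /sys/devices/virtual/brmr-server/stores/pool1/remove_store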
Writes to /sys/devices/virtual/brmr-server/ctl/map_device open the named block device, validate or initialise its on-disk metadata, and register it with RMR as the IO store for the named pool. This file is not compiled until the modules are wired into the build in a later patch in this series. Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/block/brmr/brmr-srv-sysfs.c | 707 ++++++++++++++++++++++++++++ 1 file changed, 707 insertions(+) create mode 100644 drivers/block/brmr/brmr-srv-sysfs.c diff --git a/drivers/block/brmr/brmr-srv-sysfs.c b/drivers/block/brmr/brmr-srv-sysfs.c new file mode 100644 index 000000000000..7e413eb258bb --- /dev/null +++ b/drivers/block/brmr/brmr-srv-sysfs.c @@ -0,0 +1,707 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Block device over RMR (BRMR) + * + * Copyright (c) 2026 IONOS SE + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "brmr-srv.h" +#include "rmr-srv.h" + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +static struct class *rmr_str_class; +static struct device *rmr_ctl_dev; +static struct device *rmr_strs_dev; + +enum { + BRMR_SRV_STR_OPT_ERR = 0, + BRMR_SRV_STR_OPT_DEVICE = 1 << 0, + BRMR_SRV_STR_OPT_POOL = 1 << 2, + BRMR_SRV_STR_OPT_MAPPED_SIZE = 1 << 3, + BRMR_SRV_STR_OPT_MODE = 1 << 4, +}; + +static const unsigned int rmr_str_opt_mandatory[] = { + BRMR_SRV_STR_OPT_POOL, + BRMR_SRV_STR_OPT_DEVICE, + BRMR_SRV_STR_OPT_MAPPED_SIZE, +}; + +static const match_table_t rmr_str_opt_tokens = { + { BRMR_SRV_STR_OPT_POOL, "pool=%s" }, + { BRMR_SRV_STR_OPT_DEVICE, "device=%s" }, + { BRMR_SRV_STR_OPT_MAPPED_SIZE, "mapped_size=%s" }, + { BRMR_SRV_STR_OPT_MODE, "mode=%s" }, + { BRMR_SRV_STR_OPT_ERR, NULL }, +}; + +struct brmr_srv_str_options { + char *pool; + char *device; + unsigned long mapped_size; +}; + +static void brmr_srv_remove_store(struct brmr_srv_blk_dev *dev, struct kobj_attribute *attr, + bool delete) +{ + mutex_lock(&store_mutex); + + blk_str_destroy_sysfs_files(dev, &attr->attr); + + brmr_srv_blk_close(dev, delete); + + pr_info("put blkdev %s\n", dev->bdev->bd_disk->disk_name); + bdev_fput(dev->bdev_file); + + pr_info("%s store %s, store name %s.\n", (delete ? 
"Delete" : "Remove"), + dev->name, dev->poolname); + brmr_srv_blk_destroy(dev); + mutex_unlock(&store_mutex); +} + +static int brmr_srv_parse_add_opts(const char *buf, struct brmr_srv_str_options *opt, + unsigned int *replace) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + sep_opt = strstrip(options); + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rmr_str_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case BRMR_SRV_STR_OPT_POOL: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("add_store: pool name is too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(opt->pool, p, NAME_MAX); + kfree(p); + break; + + case BRMR_SRV_STR_OPT_DEVICE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("add_store: device name is too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strscpy(opt->device, p, NAME_MAX); + kfree(p); + break; + + case BRMR_SRV_STR_OPT_MAPPED_SIZE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + ret = kstrtoul(p, 0, &opt->mapped_size); + if (ret) { + pr_err("mapped_size isn't an integer: %d\n", ret); + kfree(p); + goto out; + } + + if (opt->mapped_size == 0) { + pr_err("mapped_size cannot be 0\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + + kfree(p); + break; + + case BRMR_SRV_STR_OPT_MODE: + if (!replace) { + pr_err("%s: mode option not supported here\n", __func__); + ret = -EINVAL; + goto out; + } + + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + if (!strcmp(p, "replace")) { + *replace = true; + } else { + pr_err("%s: Unknown mode '%s'\n", __func__, p); + ret = -EINVAL; + kfree(p); + goto out; + } + kfree(p); + break; + + default: + pr_err("add_store: Unknown parameter or missing value '%s'\n", + p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(rmr_str_opt_mandatory); i++) { + if ((opt_mask & rmr_str_opt_mandatory[i])) { + ret = 0; + } else { + pr_err("add_store: Parameters missing\n"); + ret = -EINVAL; + break; + } + } + +out: + kfree(options); + return ret; +} + +static ssize_t blk_str_dev_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + return sysfs_emit(page, "%llu\n", dev->dev_size); +} + +static struct kobj_attribute blk_str_dev_size_attr = + __ATTR(dev_size, 0644, blk_str_dev_size_show, NULL); + +static ssize_t blk_str_mapped_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + return sysfs_emit(page, "%llu\n", dev->mapped_size); +} + +static struct kobj_attribute blk_str_mapped_size_attr = + __ATTR(mapped_size, 0644, blk_str_mapped_size_show, NULL); + +static ssize_t blk_str_bdev_name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + return sysfs_emit(page, "%s\n", dev->name); +} + +static struct kobj_attribute blk_str_bdev_name_attr = + __ATTR(bdev_name, 0644, blk_str_bdev_name_show, NULL); + +static ssize_t blk_str_remove_store_show(struct kobject *kobj, + 
struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 to remove the store\n"); +} + +static ssize_t blk_str_remove_store_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + dev->name, attr->attr.name, buf); + return -EINVAL; + } + + brmr_srv_remove_store(dev, attr, false); + + return count; +} + +static struct kobj_attribute blk_str_remove_store_attr = + __ATTR(remove_store, 0644, + blk_str_remove_store_show, blk_str_remove_store_store); + +static ssize_t blk_str_delete_store_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 to delete the store\n"); +} + +static ssize_t blk_str_delete_store_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_blk_dev *dev; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + if (!sysfs_streq(buf, "1")) { + pr_err("%s, %s unknown value: '%s'\n", + dev->name, attr->attr.name, buf); + return -EINVAL; + } + + brmr_srv_remove_store(dev, attr, true); + + return count; +} + +static struct kobj_attribute blk_str_delete_store_attr = + __ATTR(delete_store, 0644, + blk_str_delete_store_show, blk_str_delete_store_store); + +static ssize_t state_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct brmr_srv_blk_dev *dev; + size_t count = 0; + + dev = container_of(kobj, struct brmr_srv_blk_dev, kobj); + + if (test_bit(BRMR_SRV_STORE_OPEN, &dev->state)) + count += sysfs_emit_at(page, count, "open\n"); + else + count += sysfs_emit_at(page, count, "closed\n"); + + if (test_bit(BRMR_SRV_STORE_MAPPED, &dev->state)) + count += sysfs_emit_at(page, count, "mapped\n"); + else + count += sysfs_emit_at(page, count, "unmapped\n"); + + return count; +} + +static struct kobj_attribute blk_str_state_attr = + __ATTR_RO(state); + +static struct attribute *blk_str_map_attrs[] = { + &blk_str_dev_size_attr.attr, + &blk_str_mapped_size_attr.attr, + &blk_str_bdev_name_attr.attr, + &blk_str_remove_store_attr.attr, + &blk_str_delete_store_attr.attr, + &blk_str_state_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(blk_str_map); + +static struct kobj_type blk_str_device_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = blk_str_map_groups, +}; + +static int blk_str_create_sysfs_files(struct brmr_srv_blk_dev *dev) +{ + int ret; + + ret = kobject_init_and_add(&dev->kobj, &blk_str_device_ktype, + &rmr_strs_dev->kobj, + "%s", dev->poolname); + if (ret) + pr_err("Failed to create sysfs dir for store %s, name %s, err=%d\n", + dev->name, dev->poolname, ret); + + return ret; +} + +void blk_str_destroy_sysfs_files(struct brmr_srv_blk_dev *dev, + const struct attribute *sysfs_self) +{ + if (sysfs_self) + sysfs_remove_file_self(&dev->kobj, sysfs_self); + + kobject_del(&dev->kobj); + kobject_put(&dev->kobj); +} + +/** + * brmr_srv_blk_dev_exit() - Destroy and put the blkdev + * + * @dev: RMR block device structure to be used. 
+ * + * Description: + * This function gives up the blkdev reference, and destroys the rmr block device + */ +static void brmr_srv_blk_dev_exit(struct brmr_srv_blk_dev *dev) +{ + pr_info("%s: put blkdev %s\n", __func__, dev->name); + bdev_fput(dev->bdev_file); + + brmr_srv_blk_destroy(dev); +} + +/** + * brmr_srv_blk_dev_init() - Create and initialize a brmr server store block device + * + * @pool_name: Name to be given to the created rmr block device + * @dev_name: path to the block device + * @mapped_size:mapped size of the block device + * + * Description: + * This function checks whether the rmr pool is available to be registered. + * It then creates the block device, and initializes it. + * + * Return: + * Pointer to the created rmr block device on success + * Error pointer on error + */ +static struct brmr_srv_blk_dev *brmr_srv_blk_dev_init(char *pool_name, char *dev_name, + u64 mapped_size) +{ + struct file *bdev_file; + struct brmr_srv_blk_dev *dev; + + dev = brmr_srv_blk_create(dev_name, pool_name); + if (IS_ERR(dev)) { + pr_err("failed to alloc store for device %s: %pe\n", pool_name, dev); + return dev; + } + + bdev_file = bdev_file_open_by_path(dev_name, DEFAULT_BLK_OPEN_FLAGS, + dev, NULL); + if (IS_ERR(bdev_file)) { + pr_err("%s: bdev_file_open_by_path for device %s failed with err (%pe)\n", + __func__, dev_name, bdev_file); + brmr_srv_blk_destroy(dev); + return ERR_CAST(bdev_file); + } + + dev->bdev_file = bdev_file; + dev->bdev = file_bdev(bdev_file); + dev->dev_size = get_capacity(dev->bdev->bd_disk); + strscpy(dev->name, dev->bdev->bd_disk->disk_name, sizeof(dev->name)); + + if (mapped_size < BLK_STR_MIN_MAPPED_SIZE) { + pr_err("%s: Given mapped size %llu less than minimum default(%lu) for dev %s\n", + __func__, mapped_size, BLK_STR_MIN_MAPPED_SIZE, dev->name); + brmr_srv_blk_dev_exit(dev); + return ERR_PTR(-ENOSPC); + } + + if (mapped_size + BLK_STR_MD_SIZE_SECTORS > dev->dev_size) { + pr_err("can not map %llu, only %llu available %s\n", + mapped_size, dev->dev_size - BLK_STR_MD_SIZE_SECTORS, dev->name); + brmr_srv_blk_dev_exit(dev); + return ERR_PTR(-ENOSPC); + } + + dev->mapped_size = mapped_size; + + pr_info("%s: succeeded\n", __func__); + + return dev; +} + +static ssize_t brmr_srv_create_store_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_str_options opt; + char dev_name[NAME_MAX]; + char pool_name[NAME_MAX]; + struct brmr_srv_blk_dev *dev; + struct brmr_srv_blk_dev_meta *md_page; + int md_state, err; + + opt.pool = pool_name; + opt.device = dev_name; + opt.mapped_size = 0; + + if (brmr_srv_parse_add_opts(buf, &opt, NULL)) + goto out; + + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_err("%s: Failed to allocate page to read md\n", __func__); + goto out; + } + + mutex_lock(&store_mutex); + + dev = brmr_srv_blk_dev_init(pool_name, dev_name, opt.mapped_size); + if (IS_ERR(dev)) { + pr_err("%s: brmr_srv_blk_dev_init failed: %pe\n", __func__, dev); + goto mut_unlock; + } + + md_state = brmr_srv_read_and_check_md(dev, md_page); + if (md_state != -1) { + /* + * read and check md failed. 
It could be read error or that md exists + */ + pr_err("%s: md read and check failed: %d\n", __func__, md_state); + goto dev_exit; + } + + err = brmr_srv_blk_open(dev, dev_name, true, false); + if (err) { + pr_err("failed to open %s, err %d\n", dev_name, err); + goto dev_exit; + } + + err = blk_str_create_sysfs_files(dev); + if (err) { + pr_err("failed to create sysfs files\n"); + goto dev_close; + } + + mutex_unlock(&store_mutex); + pr_info("Created new blk store for %s, with disk %s\n", pool_name, dev_name); + + kfree(md_page); + return count; + +dev_close: + brmr_srv_blk_close(dev, true); +dev_exit: + brmr_srv_blk_dev_exit(dev); +mut_unlock: + mutex_unlock(&store_mutex); + kfree(md_page); +out: + return -EINVAL; +} + +static ssize_t brmr_srv_create_store_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo \"pool= device= mapped_size=\" > %s\n\n", + attr->attr.name); +} + +static struct kobj_attribute brmr_srv_create_store_attr = + __ATTR(create_store, 0644, + brmr_srv_create_store_show, brmr_srv_create_store_store); + +static ssize_t brmr_srv_add_store_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct brmr_srv_blk_dev *dev; + char dev_name[NAME_MAX]; + char pool_name[NAME_MAX]; + struct brmr_srv_str_options opt; + struct brmr_srv_blk_dev_meta *md_page; + int md_state, ret; + unsigned int replace = false; + + opt.pool = pool_name; + opt.device = dev_name; + opt.mapped_size = 0; + + if (brmr_srv_parse_add_opts(buf, &opt, &replace)) + goto out; + + /* + * Disable replace mode for now. + * Most of the code for replace mode to work is present, but there are some + * edge cases which needs work, and a info exchange between storage nodes which + * needs to be added. + */ + if (replace) { + pr_err("%s: Replace mode not supported yet\n", __func__); + goto out; + } + + md_page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!md_page) { + pr_err("Failed to allocate page to read md\n"); + goto out; + } + + mutex_lock(&store_mutex); + + dev = brmr_srv_blk_dev_init(pool_name, dev_name, opt.mapped_size); + if (IS_ERR(dev)) { + pr_err("brmr_srv_blk_dev_init failed: %pe\n", dev); + goto mut_unlock; + } + + md_state = brmr_srv_read_and_check_md(dev, md_page); + if (md_state == -1) { + /* + * md doesn't exists. This means the disk is an empty one. + * We have to replace, so check the mode first + */ + if (!replace) { + pr_err("%s: Incorrect mode %d. md doesn't exists\n", __func__, replace); + goto dev_exit; + } + + /* + * we have to do the following, + * + * 1) Check params like mapped size from at least one other storage node + * 2) Do discard + */ + pr_info("%s: No md found. Replacing disk %s for pool %s, size %llu\n", + __func__, dev_name, pool_name, dev->mapped_size); + } else if (md_state == 0) { + /* + * md exists. + * We are restoring an earlier used device. + */ + if (replace) { + pr_err("%s: Incorrect mode %d. md exists\n", __func__, replace); + goto dev_exit; + } + + /* + * Validate the metadata stored with the data provided. + */ + ret = brmr_srv_blk_validate_md(dev, md_page); + if (ret) { + pr_err("Local metadata validation failed\n"); + goto dev_exit; + } + + memcpy(&dev->dev_params, &md_page->dev_params, sizeof(struct rmr_blk_dev_params)); + dev->state = md_page->state; + + pr_info("%s: md found. 
Re-adding disk %s for pool %s, size %llu\n", + __func__, dev_name, pool_name, dev->mapped_size); + } else { + pr_err("%s: md cannot be read for block device %s, Err = %d\n", + __func__, dev->name, md_state); + goto dev_exit; + } + + if (brmr_srv_blk_open(dev, dev_name, false /* create */, replace)) { + pr_err("failed to open %s\n", dev_name); + goto dev_exit; + } + + ret = blk_str_create_sysfs_files(dev); + if (ret) { + pr_err("failed to create sysfs files\n"); + goto dev_close; + } + + mutex_unlock(&store_mutex); + + kfree(md_page); + return count; + +dev_close: + brmr_srv_blk_close(dev, replace); +dev_exit: + brmr_srv_blk_dev_exit(dev); +mut_unlock: + mutex_unlock(&store_mutex); + kfree(md_page); +out: + return -EINVAL; +} + +static ssize_t brmr_srv_add_store_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo \"pool= device= mapped_size=\" > %s\n\n", + attr->attr.name); +} + +static struct kobj_attribute brmr_srv_add_store_attr = + __ATTR(add_store, 0644, + brmr_srv_add_store_show, brmr_srv_add_store_store); + +static struct attribute *default_attrs[] = { + &brmr_srv_create_store_attr.attr, + &brmr_srv_add_store_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +int brmr_srv_create_sysfs_files(void) +{ + int err; + dev_t devt = MKDEV(0, 0); + + rmr_str_class = class_create("brmr-server"); + if (IS_ERR(rmr_str_class)) + return PTR_ERR(rmr_str_class); + + rmr_ctl_dev = device_create(rmr_str_class, NULL, devt, NULL, "ctl"); + if (IS_ERR(rmr_ctl_dev)) { + err = PTR_ERR(rmr_ctl_dev); + goto cls_destroy; + } + + rmr_strs_dev = device_create(rmr_str_class, NULL, devt, NULL, "stores"); + if (IS_ERR(rmr_strs_dev)) { + err = PTR_ERR(rmr_strs_dev); + goto ctl_destroy; + } + + err = sysfs_create_group(&rmr_ctl_dev->kobj, &default_attr_group); + if (unlikely(err)) + goto strs_destroy; + + return 0; + +strs_destroy: + device_unregister(rmr_strs_dev); +ctl_destroy: + device_unregister(rmr_ctl_dev); +cls_destroy: + class_destroy(rmr_str_class); + + return err; +} + +void brmr_srv_destroy_sysfs_files(void) +{ + sysfs_remove_group(&rmr_ctl_dev->kobj, &default_attr_group); + device_unregister(rmr_strs_dev); + device_unregister(rmr_ctl_dev); + class_destroy(rmr_str_class); +} From fec55e57d14c1caa98d6c24f3cc4aafcfb2e3ded Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Tue, 5 May 2026 09:46:25 +0200 Subject: [PATCH 13/13] block/brmr: include client and server modules into kernel compilation Add the per-directory Kconfig and Makefile, and wire them into the parent drivers/block Kconfig and Makefile so BRMR can be enabled in a kernel build. Three Kconfig symbols are introduced: CONFIG_BLK_DEV_BRMR (silent, selected by either side) CONFIG_BLK_DEV_BRMR_CLIENT (depends on INFINIBAND_RMR_CLIENT) CONFIG_BLK_DEV_BRMR_SERVER (depends on INFINIBAND_RMR_SERVER) The Makefile builds two modules: brmr-client.ko and brmr-server.ko. The server side acts as a consumer of the RMR server-side IO store interface (struct rmr_srv_store_ops) to back an RMR pool with a local block device. 
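As a rough usage sketch (module names follow the Makefile added below;
further sysfs configuration, not shown here, is needed to actually map
devices):

    # on a storage node
    modprobe brmr_server
    # on a compute node
    modprobe brmr_client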
Signed-off-by: Md Haris Iqbal Signed-off-by: Jia Li --- drivers/block/Kconfig | 2 ++ drivers/block/Makefile | 1 + drivers/block/brmr/Kconfig | 28 ++++++++++++++++++++++++++++ drivers/block/brmr/Makefile | 16 ++++++++++++++++ 4 files changed, 47 insertions(+) create mode 100644 drivers/block/brmr/Kconfig create mode 100644 drivers/block/brmr/Makefile diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 858320b6ebb7..65167fcb1357 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -353,6 +353,8 @@ config BLKDEV_UBLK_LEGACY_OPCODES source "drivers/block/rnbd/Kconfig" +source "drivers/block/brmr/Kconfig" + config BLK_DEV_ZONED_LOOP tristate "Zoned loopback device support" depends on BLK_DEV_ZONED diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 2d8096eb8cdf..4793c9b0b383 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ +obj-$(CONFIG_BLK_DEV_BRMR) += brmr/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull/ diff --git a/drivers/block/brmr/Kconfig b/drivers/block/brmr/Kconfig new file mode 100644 index 000000000000..a38d59d2c1d4 --- /dev/null +++ b/drivers/block/brmr/Kconfig @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config BLK_DEV_BRMR + bool + +config BLK_DEV_BRMR_CLIENT + tristate "Block device over RMR (BRMR) client" + depends on INFINIBAND_RMR_CLIENT + select BLK_DEV_BRMR + help + BRMR client is a block device driver that sits on top of the + RMR ULP and exposes a standard Linux block device (/dev/brmrX) + backed by an RMR pool. Together with RMR it provides a + single-hop replication and resynchronization solution for + RDMA-connected storage clusters. + + If unsure, say N. + +config BLK_DEV_BRMR_SERVER + tristate "Block device over RMR (BRMR) server" + depends on INFINIBAND_RMR_SERVER + select BLK_DEV_BRMR + help + BRMR server exports a local block device as the backing store + for an RMR pool, so that BRMR clients can map it remotely + over RDMA. + + If unsure, say N. diff --git a/drivers/block/brmr/Makefile b/drivers/block/brmr/Makefile new file mode 100644 index 000000000000..894ba2720557 --- /dev/null +++ b/drivers/block/brmr/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +ccflags-y := -I$(srctree)/drivers/infiniband/ulp/rtrs \ + -I$(srctree)/drivers/infiniband/ulp/rmr \ + -I$(srctree)/drivers/block/brmr + +brmr-client-y := brmr-clt.o \ + brmr-clt-sysfs.o \ + brmr-clt-reque.o \ + brmr-clt-stats.o + +brmr-server-y := brmr-srv-sysfs.o \ + brmr-srv.o + +obj-$(CONFIG_BLK_DEV_BRMR_CLIENT) += brmr-client.o +obj-$(CONFIG_BLK_DEV_BRMR_SERVER) += brmr-server.o