From 1727a02b1051638b21f5b7379a8252dc3119ebcd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 23 Apr 2026 05:10:15 -0600
Subject: [PATCH 1/6] io_uring/epoll: switch to using do_epoll_ctl_file()
 interface

No functional changes in this patch.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/epoll.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/io_uring/epoll.c b/io_uring/epoll.c
index 8d4610246ba0a..59cd4f0096489 100644
--- a/io_uring/epoll.c
+++ b/io_uring/epoll.c
@@ -51,10 +51,21 @@ int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_epoll *ie = io_kiocb_to_cmd(req, struct io_epoll);
-	int ret;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	struct epoll_filefd efd;
+	int ret;
+
+	CLASS(fd, f)(ie->epfd);
+	if (fd_empty(f))
+		return -EBADF;
+
+	CLASS(fd, tf)(ie->fd);
+	if (fd_empty(tf))
+		return -EBADF;
 
-	ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
+	efd.file = fd_file(tf);
+	efd.fd = ie->fd;
+	ret = do_epoll_ctl_file(fd_file(f), ie->op, &efd, &ie->event, force_nonblock);
 
 	if (force_nonblock && ret == -EAGAIN)
 		return -EAGAIN;

From e89a50afc53310ddfe6b6370c53686f8957679fa Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 23 Apr 2026 05:10:45 -0600
Subject: [PATCH 2/6] io_uring/epoll: disallow adding an epoll file to an
 epoll context

One of the nastier things about epoll is how it allows adding epoll
files to other epoll contexts. This leads to all sorts of loop
detection code, and has been a source of issues in the past. Arguably
adding IORING_OP_EPOLL_CTL is a historical mistake on the io_uring
side, but we're kind of stuck with it now as it does seem to be in use
according to code searches.

But we can at least minimize the damage a bit and just disallow this
part of epoll, where nesting issues can arise.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/epoll.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/io_uring/epoll.c b/io_uring/epoll.c
index 59cd4f0096489..42057aab91247 100644
--- a/io_uring/epoll.c
+++ b/io_uring/epoll.c
@@ -62,6 +62,9 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
 	CLASS(fd, tf)(ie->fd);
 	if (fd_empty(tf))
 		return -EBADF;
+	/* disallow adding an epoll context to another epoll context */
+	if (ie->op == EPOLL_CTL_ADD && is_file_epoll(fd_file(tf)))
+		return -EINVAL;
 
 	efd.file = fd_file(tf);
 	efd.fd = ie->fd;
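
A quick way to see the behavior change from the patch above, as a
hypothetical userspace test that is not part of the series: it assumes
liburing's io_uring_prep_epoll_ctl() helper and two fresh epoll
instances. With the patch applied, the CQE result for the EPOLL_CTL_ADD
below is -EINVAL; before, the add went through epoll's usual
loop-detection paths:

#include <liburing.h>
#include <sys/epoll.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	struct epoll_event ev = { .events = EPOLLIN };
	int outer, inner;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;
	outer = epoll_create1(0);
	inner = epoll_create1(0);

	/* ask io_uring to add one epoll fd to another */
	ev.data.fd = inner;
	io_uring_prep_epoll_ctl(io_uring_get_sqe(&ring), outer, inner,
				EPOLL_CTL_ADD, &ev);
	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);

	/* patched kernels: -EINVAL (-22); earlier kernels: 0 */
	printf("epoll-on-epoll add via io_uring: %d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}

Note that the equivalent epoll_ctl(2) syscall is unaffected; only the
io_uring opcode rejects the nesting.
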
From 381e736515173a1fb78d2a86983d3ebfcf263597 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 4 May 2026 05:40:16 -0600
Subject: [PATCH 3/6] io_uring/rsrc: bump struct io_mapped_ubuf length field
 to size_t

In preparation for supporting bigger individual buffers, bump the
length field to a full 8 bytes with size_t rather than an unsigned int.
Move the u8 flags and dir fields up alongside refs at the same time, so
the bigger len field doesn't grow the struct beyond its current 56
bytes.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/fdinfo.c | 2 +-
 io_uring/rsrc.h   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index c2d3e45544bb4..f0ff4bd01b6d1 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -223,7 +223,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 		if (ctx->buf_table.nodes[i])
 			buf = ctx->buf_table.nodes[i]->buf;
 		if (buf)
-			seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
+			seq_printf(m, "%5u: 0x%llx/%zu\n", i, buf->ubuf, buf->len);
 		else
 			seq_printf(m, "%5u: <none>\n", i);
 	}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 44e3386f7c1ca..03521b50926ca 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -34,15 +34,15 @@ enum {
 
 struct io_mapped_ubuf {
 	u64		ubuf;
-	unsigned int	len;
+	size_t		len;
 	unsigned int	nr_bvecs;
 	unsigned int	folio_shift;
 	refcount_t	refs;
+	u8		flags;
+	u8		dir;
 	unsigned long	acct_pages;
 	void		(*release)(void *);
 	void		*priv;
-	u8		flags;
-	u8		dir;
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };
 
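
The field shuffle in the hunk above is what keeps this size-neutral. A
minimal userspace sketch of the two layouts, with an int standing in
for refcount_t and the flexible bvec[] array omitted since it
contributes no size, confirms the arithmetic on a 64-bit build:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* before: 8 + 4+4+4+4 + 8+8+8 + 1+1+6(pad) = 56 bytes */
struct ubuf_before {
	uint64_t ubuf;
	unsigned int len;
	unsigned int nr_bvecs;
	unsigned int folio_shift;
	int refs;			/* stand-in for refcount_t */
	unsigned long acct_pages;
	void (*release)(void *);
	void *priv;
	uint8_t flags;
	uint8_t dir;
};

/* after: len grows to 8 bytes, flags/dir move into the hole after refs */
struct ubuf_after {
	uint64_t ubuf;
	size_t len;
	unsigned int nr_bvecs;
	unsigned int folio_shift;
	int refs;
	uint8_t flags;
	uint8_t dir;
	unsigned long acct_pages;
	void (*release)(void *);
	void *priv;
};

int main(void)
{
	/* prints 56 and 56 on x86-64 */
	printf("before %zu after %zu\n", sizeof(struct ubuf_before),
	       sizeof(struct ubuf_after));
	return 0;
}

Left in their old spot after priv, the two u8 fields would have pushed
the struct out to 64 bytes once len became 8 bytes wide: 4 bytes of
padding after refs plus 6 more after dir.
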
From 5af42f737e7f253a026d23e31a153b70045cdbc6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 24 Jan 2026 10:02:41 -0700
Subject: [PATCH 4/6] io_uring/rsrc: add huge page accounting for registered
 buffers

Track huge page references in a per-ring xarray to prevent double
accounting when the same huge page is used by multiple registered
buffers, either within the same ring or across cloned rings.

When registering buffers backed by huge pages, we need to account for
RLIMIT_MEMLOCK. But if multiple buffers share the same huge page
(common with cloned buffers), we must not account for the same page
multiple times. Similarly, we must only unaccount when the last
reference to a huge page is released.

Maintain a per-ring xarray (hpage_acct) that tracks reference counts
for each huge page. When registering a buffer, increment the accounting
reference count for each unique huge page, and only account pages that
are newly added. When unregistering a buffer, decrement the refcount
for each unique huge page. Once the refcount hits zero, the page is
unaccounted.

Note: any accounting is done against the ctx->user that was assigned
when the ring was set up. As before, if root is running the operation,
no accounting is done.

With these changes, any use of imu->acct_pages is dead, hence kill it
from struct io_mapped_ubuf. This shrinks it from 56b to 48b on a 64-bit
arch. Additionally, headpage_already_acct() is gone, which was an
O(M*M) scan over the current and previous registrations.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   3 +
 io_uring/io_uring.c            |   3 +
 io_uring/rsrc.c                | 260 +++++++++++++++++++++++++--
 io_uring/rsrc.h                |   1 -
 4 files changed, 206 insertions(+), 61 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 244392026c6d5..23b8891d57045 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -446,6 +446,9 @@ struct io_ring_ctx {
 	/* Stores zcrx object pointers of type struct io_zcrx_ifq */
 	struct xarray			zcrx_ctxs;
 
+	/* Used for accounting references on pages in registered buffers */
+	struct xarray			hpage_acct;
+
 	u32				pers_next;
 	struct xarray			personalities;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4ed998d60c09c..fb6ed52bae61e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -233,6 +233,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 		return NULL;
 
 	xa_init(&ctx->io_bl_xa);
+	xa_init(&ctx->hpage_acct);
 
 	/*
 	 * Use 5 bits less than the max cq entries, that should give us around
@@ -302,6 +303,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	io_free_alloc_caches(ctx);
 	kvfree(ctx->cancel_table.hbs);
 	xa_destroy(&ctx->io_bl_xa);
+	xa_destroy(&ctx->hpage_acct);
 	kfree(ctx);
 	return NULL;
 }
@@ -2198,6 +2200,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_napi_free(ctx);
 	kvfree(ctx->cancel_table.hbs);
 	xa_destroy(&ctx->io_bl_xa);
+	xa_destroy(&ctx->hpage_acct);
 	kfree(ctx);
 }
 
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 650303626be6e..be7c5bf4e161e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -28,7 +28,52 @@ struct io_rsrc_update {
 };
 
 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
-				struct iovec *iov, struct page **last_hpage);
+				struct iovec *iov);
+
+static int hpage_acct_ref(struct io_ring_ctx *ctx, struct page *hpage,
+			  bool *acct_new)
+{
+	unsigned long key = (unsigned long) hpage;
+	unsigned long count;
+	void *entry;
+	int ret;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	entry = xa_load(&ctx->hpage_acct, key);
+	if (entry) {
+		*acct_new = false;
+		count = xa_to_value(entry) + 1;
+	} else {
+		ret = xa_reserve(&ctx->hpage_acct, key, GFP_KERNEL_ACCOUNT);
+		if (ret)
+			return ret;
+		*acct_new = true;
+		count = 1;
+	}
+	xa_store(&ctx->hpage_acct, key, xa_mk_value(count), GFP_KERNEL_ACCOUNT);
+	return 0;
+}
+
+static bool hpage_acct_unref(struct io_ring_ctx *ctx, struct page *hpage)
+{
+	unsigned long key = (unsigned long) hpage;
+	unsigned long count;
+	void *entry;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	entry = xa_load(&ctx->hpage_acct, key);
+	if (WARN_ON_ONCE(!entry))
+		return false;
+	count = xa_to_value(entry);
+	if (count == 1) {
+		xa_erase(&ctx->hpage_acct, key);
+		return true;
+	}
+	xa_store(&ctx->hpage_acct, key, xa_mk_value(count - 1), GFP_KERNEL_ACCOUNT);
+	return false;
+}
 
 /* only define max */
 #define IORING_MAX_FIXED_FILES	(1U << 20)
@@ -124,15 +169,53 @@ static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
 	kvfree(imu);
 }
 
+static unsigned long io_buffer_unaccount_pages(struct io_ring_ctx *ctx,
+					       struct io_mapped_ubuf *imu)
+{
+	struct page *seen = NULL;
+	unsigned long acct = 0;
+	int i;
+
+	if (imu->flags & IO_REGBUF_F_KBUF || !ctx->user)
+		return 0;
+
+	for (i = 0; i < imu->nr_bvecs; i++) {
+		struct page *page = imu->bvec[i].bv_page;
+		struct page *hpage;
+
+		if (!PageCompound(page)) {
+			acct++;
+			continue;
+		}
+
+		hpage = compound_head(page);
+		if (hpage == seen)
+			continue;
+		seen = hpage;
+
+		/* Unaccount on last reference */
+		if (hpage_acct_unref(ctx, hpage))
+			acct += page_size(hpage) >> PAGE_SHIFT;
+		cond_resched();
+	}
+
+	return acct;
+}
+
 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
 {
+	unsigned long acct_pages = 0;
+
+	/* Always decrement, so it works for cloned buffers too */
+	acct_pages = io_buffer_unaccount_pages(ctx, imu);
+
 	if (unlikely(refcount_read(&imu->refs) > 1)) {
 		if (!refcount_dec_and_test(&imu->refs))
 			return;
 	}
 
-	if (imu->acct_pages)
-		io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
+	if (acct_pages)
+		io_unaccount_mem(ctx->user, ctx->mm_account, acct_pages);
 	imu->release(imu->priv);
 	io_free_imu(ctx, imu);
 }
@@ -282,7 +365,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 {
 	u64 __user *tags = u64_to_user_ptr(up->tags);
 	struct iovec fast_iov, *iov;
-	struct page *last_hpage = NULL;
 	struct iovec __user *uvec;
 	u64 user_data = up->data;
 	__u32 done;
@@ -307,7 +389,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			err = -EFAULT;
 			break;
 		}
-		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
+		node = io_sqe_buffer_register(ctx, iov);
 		if (IS_ERR(node)) {
 			err = PTR_ERR(node);
 			break;
@@ -605,76 +687,79 @@ int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 }
 
 /*
- * Not super efficient, but this is just a registration time. And we do cache
- * the last compound head, so generally we'll only do a full search if we don't
- * match that one.
- *
- * We check if the given compound head page has already been accounted, to
- * avoid double accounting it. This allows us to account the full size of the
- * page, not just the constituent pages of a huge page.
+ * Undo hpage_acct_ref() calls made during io_buffer_account_pin() on failure.
+ * This operates on the pages array since imu->bvec isn't populated yet.
  */
-static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
-				  int nr_pages, struct page *hpage)
+static void io_buffer_unaccount_hpages(struct io_ring_ctx *ctx,
+				       struct page **pages, int nr_pages)
 {
-	int i, j;
+	struct page *seen = NULL;
+	int i;
+
+	if (!ctx->user)
+		return;
 
-	/* check current page array */
 	for (i = 0; i < nr_pages; i++) {
+		struct page *hpage;
+
 		if (!PageCompound(pages[i]))
 			continue;
-		if (compound_head(pages[i]) == hpage)
-			return true;
-	}
-
-	/* check previously registered pages */
-	for (i = 0; i < ctx->buf_table.nr; i++) {
-		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
-		struct io_mapped_ubuf *imu;
 
-		if (!node)
+		hpage = compound_head(pages[i]);
+		if (hpage == seen)
 			continue;
-		imu = node->buf;
-		for (j = 0; j < imu->nr_bvecs; j++) {
-			if (!PageCompound(imu->bvec[j].bv_page))
-				continue;
-			if (compound_head(imu->bvec[j].bv_page) == hpage)
-				return true;
-		}
-	}
+		seen = hpage;
 
-	return false;
+		hpage_acct_unref(ctx, hpage);
+		cond_resched();
+	}
 }
 
 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
-				 int nr_pages, struct io_mapped_ubuf *imu,
-				 struct page **last_hpage)
+				 int nr_pages)
 {
+	unsigned long acct_pages = 0;
+	struct page *seen = NULL;
 	int i, ret;
 
-	imu->acct_pages = 0;
+	if (!ctx->user)
+		return 0;
+
 	for (i = 0; i < nr_pages; i++) {
+		struct page *hpage;
+		bool acct_new;
+
 		if (!PageCompound(pages[i])) {
-			imu->acct_pages++;
-		} else {
-			struct page *hpage;
-
-			hpage = compound_head(pages[i]);
-			if (hpage == *last_hpage)
-				continue;
-			*last_hpage = hpage;
-			if (headpage_already_acct(ctx, pages, i, hpage))
-				continue;
-			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
+			acct_pages++;
+			continue;
+		}
+
+		hpage = compound_head(pages[i]);
+		if (hpage == seen)
+			continue;
+		seen = hpage;
+
+		ret = hpage_acct_ref(ctx, hpage, &acct_new);
+		if (ret) {
+			io_buffer_unaccount_hpages(ctx, pages, i);
+			return ret;
 		}
+		if (acct_new)
+			acct_pages += page_size(hpage) >> PAGE_SHIFT;
+		cond_resched();
 	}
 
-	if (!imu->acct_pages)
-		return 0;
+	/* Try to account the memory */
+	if (acct_pages) {
+		ret = io_account_mem(ctx->user, ctx->mm_account, acct_pages);
+		if (ret) {
+			/* Undo the refs we just added */
+			io_buffer_unaccount_hpages(ctx, pages, nr_pages);
+			return ret;
+		}
+	}
 
-	ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages);
-	if (ret)
-		imu->acct_pages = 0;
-	return ret;
+	return 0;
 }
 
 static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
@@ -763,8 +848,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
 }
 
 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
-						   struct iovec *iov,
-						   struct page **last_hpage)
+						   struct iovec *iov)
 {
 	struct io_mapped_ubuf *imu = NULL;
 	struct page **pages = NULL;
@@ -811,7 +895,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 		goto done;
 
 	imu->nr_bvecs = nr_pages;
-	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
+	ret = io_buffer_account_pin(ctx, pages, nr_pages);
 	if (ret)
 		goto done;
 
@@ -861,7 +945,6 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			    unsigned int nr_args, u64 __user *tags)
 {
-	struct page *last_hpage = NULL;
 	struct io_rsrc_data data;
 	struct iovec fast_iov, *iov = &fast_iov;
 	const struct iovec __user *uvec;
@@ -904,7 +987,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			}
 		}
 
-		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
+		node = io_sqe_buffer_register(ctx, iov);
 		if (IS_ERR(node)) {
 			ret = PTR_ERR(node);
 			break;
@@ -971,7 +1054,6 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
 
 	imu->ubuf = 0;
 	imu->len = blk_rq_bytes(rq);
-	imu->acct_pages = 0;
 	imu->folio_shift = PAGE_SHIFT;
 	refcount_set(&imu->refs, 1);
 	imu->release = release;
@@ -1136,6 +1218,56 @@ int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
 	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
 }
 
+static int io_buffer_acct_cloned_hpages(struct io_ring_ctx *ctx,
+					struct io_mapped_ubuf *imu)
+{
+	struct page *seen = NULL;
+	int i, ret = 0;
+
+	if (imu->flags & IO_REGBUF_F_KBUF || !ctx->user)
+		return 0;
+
+	for (i = 0; i < imu->nr_bvecs; i++) {
+		struct page *page = imu->bvec[i].bv_page;
+		struct page *hpage;
+		bool acct_new;
+
+		if (!PageCompound(page))
+			continue;
+
+		hpage = compound_head(page);
+		if (hpage == seen)
+			continue;
+		seen = hpage;
+
+		/* Add an accounting reference for the cloned buffer */
+		ret = hpage_acct_ref(ctx, hpage, &acct_new);
+		if (ret)
+			break;
+
+		cond_resched();
+	}
+
+	if (!ret)
+		return 0;
+
+	/* Undo refs we added for bvecs [0..i) */
+	seen = NULL;
+	for (int j = 0; j < i; j++) {
+		struct page *p = imu->bvec[j].bv_page;
+		struct page *hp;
+
+		if (!PageCompound(p))
+			continue;
+		hp = compound_head(p);
+		if (hp == seen)
+			continue;
+		seen = hp;
+		hpage_acct_unref(ctx, hp);
+	}
+	return ret;
+}
+
 /* Lock two rings at once. The rings must be different! */
 static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
 {
@@ -1218,6 +1350,14 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 			refcount_inc(&src_node->buf->refs);
 			dst_node->buf = src_node->buf;
+
+			/* track compound references to clones */
+			ret = io_buffer_acct_cloned_hpages(ctx, src_node->buf);
+			if (ret) {
+				refcount_dec(&src_node->buf->refs);
+				io_cache_free(&ctx->node_cache, dst_node);
+				io_rsrc_data_free(ctx, &data);
+				return ret;
+			}
 		}
 		data.nodes[off++] = dst_node;
 		i++;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 44e3386f7c1ca..c0f8a18ec7674 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -38,7 +38,6 @@ struct io_mapped_ubuf {
 	unsigned int	nr_bvecs;
 	unsigned int	folio_shift;
 	refcount_t	refs;
-	unsigned long	acct_pages;
 	void		(*release)(void *);
 	void		*priv;
 	u8		flags;
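
For the user-visible side of the accounting change, consider two small
registered buffers carved out of one 2MB huge page, then cloned to a
second ring. A hypothetical liburing sketch, not part of the series: it
assumes a hugetlb page is available and that liburing's
io_uring_clone_buffers() helper (liburing 2.8+) is present, and the
sizes are illustrative:

#include <liburing.h>
#include <sys/mman.h>

#define HPAGE_SZ	(2 * 1024 * 1024)

int main(void)
{
	struct io_uring src, dst;
	struct iovec iov[2];
	void *buf;

	io_uring_queue_init(8, &src, 0);
	io_uring_queue_init(8, &dst, 0);

	buf = mmap(NULL, HPAGE_SZ, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	iov[0] = (struct iovec) { .iov_base = buf, .iov_len = 4096 };
	iov[1] = (struct iovec) { .iov_base = (char *)buf + 4096,
				  .iov_len = 4096 };

	/*
	 * Both buffers resolve to the same compound head, so the huge page
	 * is charged against RLIMIT_MEMLOCK once (512 pages, not 1024):
	 * the second hpage_acct_ref() finds an existing entry and acct_new
	 * stays false.
	 */
	io_uring_register_buffers(&src, iov, 2);

	/*
	 * The clone takes its own references in dst's hpage_acct xarray,
	 * so tearing the rings down in either order stays balanced.
	 */
	io_uring_clone_buffers(&dst, &src);

	io_uring_queue_exit(&dst);
	io_uring_queue_exit(&src);
	return 0;
}

Previously the same no-double-charge result relied on
headpage_already_acct() rescanning every prior registration; the xarray
makes the lookup O(1) and, more importantly, gives cloned rings a
refcount to balance against on unregistration.
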
From 90f22c16e71724a783606f3d1c7acf308154f2c4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 4 May 2026 05:42:51 -0600
Subject: [PATCH 5/6] io_uring/rsrc: raise registered buffer 1GB limit

There's no real reason to have a limit, as the memory is accounted
against the memlock limits anyway, if any exist. io_pin_pages() will
still restrict the maximum allowed size per buffer, which is INT_MAX
number of pages. Cap it a bit lower than that, at 1TB for a 64-bit
system. Surely that should be enough for everyone. For now.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/rsrc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 650303626be6e..082cdd8c91c30 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -88,9 +88,14 @@ int io_validate_user_buf_range(u64 uaddr, u64 ulen)
 	unsigned long tmp, base = (unsigned long)uaddr;
 	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
 
-	/* arbitrary limit, but we need something */
-	if (ulen > SZ_1G || !ulen)
+	if (!ulen)
 		return -EFAULT;
+	/* 32-bit sanity checking */
+	if (ulen > ULONG_MAX || uaddr > ULONG_MAX)
+		return -EFAULT;
+	/* cap to 1TB for 64-bit */
+	if (ulen > SZ_1T)
+		return -EINVAL;
 	if (check_add_overflow(base, acct_len, &tmp))
 		return -EOVERFLOW;
 	return 0;

From eeb4f25e0fdd2dc80d8c87947fc18a760283fdef Mon Sep 17 00:00:00 2001
From: kernel-patches-daemon
Date: Tue, 12 May 2026 19:52:34 +0000
Subject: [PATCH 6/6] Dummy commit
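
To exercise the raised limit from patch 5: registering a buffer bigger
than the old 1GB cap used to fail with -EFAULT in
io_validate_user_buf_range() before any pages were pinned. A minimal
check, assuming a 64-bit kernel with these patches applied and enough
memory plus RLIMIT_MEMLOCK headroom (or root, which skips accounting):

#include <liburing.h>
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct iovec iov;
	size_t len = 2ULL << 30;	/* 2GB, over the old 1GB cap */
	int ret;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	iov.iov_base = mmap(NULL, len, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (iov.iov_base == MAP_FAILED)
		return 1;
	iov.iov_len = len;

	/* old kernels: -EFAULT; patched: 0 on success */
	ret = io_uring_register_buffers(&ring, &iov, 1);
	printf("register 2GB buffer: %d\n", ret);

	io_uring_queue_exit(&ring);
	return 0;
}

Anything past the new SZ_1T cap now fails with -EINVAL instead,
distinguishing "too big" from the -EFAULT used for invalid ranges.
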