From d41ffc331ad4083cdd403a5f61d741cb501b2edd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Thu, 17 Apr 2025 15:43:12 +0200 Subject: [PATCH 1/6] net: bridge: mcast: re-implement br_multicast_{enable, disable}_port functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-72330 cve-pre CVE-2025-38248 commit-author Yong Wang commit 4b30ae9adb047dd0a7982975ec3933c529537026 When a bridge port STP state is changed from BLOCKING/DISABLED to FORWARDING, the port's igmp query timer will NOT re-arm itself if the bridge has been configured as per-VLAN multicast snooping. Solve this by choosing the correct multicast context(s) to enable/disable port multicast based on whether per-VLAN multicast snooping is enabled or not, i.e. using per-{port, VLAN} context in case of per-VLAN multicast snooping by re-implementing br_multicast_enable_port() and br_multicast_disable_port() functions. Before the patch, the IGMP query does not happen in the last step of the following test sequence, i.e. no growth for tx counter: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 mcast_vlan_snooping 1 mcast_querier 1 mcast_stats_enabled 1 # bridge vlan global set vid 1 dev br1 mcast_snooping 1 mcast_querier 1 mcast_query_interval 100 mcast_startup_query_count 0 # ip link add name swp1 up master br1 type dummy # bridge link set dev swp1 state 0 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # sleep 1 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # bridge link set dev swp1 state 3 # sleep 2 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 After the patch, the IGMP query happens in the last step of the test: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 mcast_vlan_snooping 1 mcast_querier 1 mcast_stats_enabled 1 # bridge vlan global set vid 1 dev br1 mcast_snooping 1 mcast_querier 1 mcast_query_interval 100 mcast_startup_query_count 0 # ip link add name swp1 up master br1 type dummy # bridge link set dev swp1 state 0 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # sleep 1 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # bridge link set dev swp1 state 3 # sleep 2 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 3 Signed-off-by: Yong Wang Reviewed-by: Andy Roulin Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller (cherry picked from commit 4b30ae9adb047dd0a7982975ec3933c529537026) Signed-off-by: Marcin Wcisło --- net/bridge/br_multicast.c | 77 +++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 8 deletions(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 96d1fc78dd396..54d60989c535e 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2102,12 +2102,17 @@ static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) } } -void br_multicast_enable_port(struct net_bridge_port *port) +static void br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; spin_lock_bh(&br->multicast_lock); - __br_multicast_enable_port_ctx(&port->multicast_ctx); + if (br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { + spin_unlock_bh(&br->multicast_lock); + return; + } + __br_multicast_enable_port_ctx(pmctx); spin_unlock_bh(&br->multicast_lock); } @@ -2134,11 +2139,67 @@ static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) br_multicast_rport_del_notify(pmctx, del); } +static void br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) +{ + struct net_bridge *br = pmctx->port->br; + + spin_lock_bh(&br->multicast_lock); + if (br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { + spin_unlock_bh(&br->multicast_lock); + return; + } + + __br_multicast_disable_port_ctx(pmctx); + spin_unlock_bh(&br->multicast_lock); +} + +static void br_multicast_toggle_port(struct net_bridge_port *port, bool on) +{ +#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) + if (br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + rcu_read_lock(); + vg = nbp_vlan_group_rcu(port); + if (!vg) { + rcu_read_unlock(); + return; + } + + /* iterate each vlan, toggle vlan multicast context */ + list_for_each_entry_rcu(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast_port *pmctx = + &vlan->port_mcast_ctx; + u8 state = br_vlan_get_state(vlan); + /* enable vlan multicast context when state is + * LEARNING or FORWARDING + */ + if (on && br_vlan_state_allowed(state, true)) + br_multicast_enable_port_ctx(pmctx); + else + br_multicast_disable_port_ctx(pmctx); + } + rcu_read_unlock(); + return; + } +#endif + /* toggle port multicast context when vlan snooping is disabled */ + if (on) + br_multicast_enable_port_ctx(&port->multicast_ctx); + else + br_multicast_disable_port_ctx(&port->multicast_ctx); +} + +void br_multicast_enable_port(struct net_bridge_port *port) +{ + br_multicast_toggle_port(port, true); +} + void br_multicast_disable_port(struct net_bridge_port *port) { - spin_lock_bh(&port->br->multicast_lock); - __br_multicast_disable_port_ctx(&port->multicast_ctx); - spin_unlock_bh(&port->br->multicast_lock); + br_multicast_toggle_port(port, false); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) @@ -4297,9 +4358,9 @@ int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, __br_multicast_open(&br->multicast_ctx); list_for_each_entry(p, &br->port_list, list) { if (on) - br_multicast_disable_port(p); + br_multicast_disable_port_ctx(&p->multicast_ctx); else - br_multicast_enable_port(p); + br_multicast_enable_port_ctx(&p->multicast_ctx); } list_for_each_entry(vlan, &vg->vlan_list, vlist) From cb5585144d8a9f21eeb2e240e065454ff4183593 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Mon, 27 Apr 2026 16:43:59 +0200 Subject: [PATCH 2/6] net: bridge: mcast: update multicast contex when vlan state is changed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-72330 cve-pre CVE-2025-38248 commit-author Yong Wang commit 6c131043eaf1be2a6cc2d228f92ceb626fbcc0f3 When the vlan STP state is changed, which could be manipulated by "bridge vlan" commands, similar to port STP state, this also impacts multicast behaviors such as igmp query. In the scenario of per-VLAN snooping, there's a need to update the corresponding multicast context to re-arm the port query timer when vlan state becomes "forwarding" etc. Update br_vlan_set_state() function to enable vlan multicast context in such scenario. Before the patch, the IGMP query does not happen in the last step of the following test sequence, i.e. no growth for tx counter: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 mcast_vlan_snooping 1 mcast_querier 1 mcast_stats_enabled 1 # bridge vlan global set vid 1 dev br1 mcast_snooping 1 mcast_querier 1 mcast_query_interval 100 mcast_startup_query_count 0 # ip link add name swp1 up master br1 type dummy # sleep 1 # bridge vlan set vid 1 dev swp1 state 4 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # sleep 1 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # bridge vlan set vid 1 dev swp1 state 3 # sleep 2 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 After the patch, the IGMP query happens in the last step of the test: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 mcast_vlan_snooping 1 mcast_querier 1 mcast_stats_enabled 1 # bridge vlan global set vid 1 dev br1 mcast_snooping 1 mcast_querier 1 mcast_query_interval 100 mcast_startup_query_count 0 # ip link add name swp1 up master br1 type dummy # sleep 1 # bridge vlan set vid 1 dev swp1 state 4 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # sleep 1 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # bridge vlan set vid 1 dev swp1 state 3 # sleep 2 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 3 Signed-off-by: Yong Wang Reviewed-by: Andy Roulin Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller (cherry picked from commit 6c131043eaf1be2a6cc2d228f92ceb626fbcc0f3) Signed-off-by: Marcin Wcisło --- net/bridge/br_mst.c | 4 ++-- net/bridge/br_multicast.c | 26 ++++++++++++++++++++++++++ net/bridge/br_private.h | 11 ++++++++++- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 1820f09ff59ce..3f24b4ee49c27 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -80,10 +80,10 @@ static void br_mst_vlan_set_state(struct net_bridge_vlan_group *vg, if (br_vlan_get_state(v) == state) return; - br_vlan_set_state(v, state); - if (v->vid == vg->pvid) br_vlan_set_pvid_state(vg, state); + + br_vlan_set_state(v, state); } int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 54d60989c535e..68526c83bf1cf 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4265,6 +4265,32 @@ static void __br_multicast_stop(struct net_bridge_mcast *brmctx) #endif } +void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state) +{ +#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) + struct net_bridge *br; + + if (!br_vlan_should_use(v)) + return; + + if (br_vlan_is_master(v)) + return; + + br = v->port->br; + + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + return; + + if (br_vlan_state_allowed(state, true)) + br_multicast_enable_port_ctx(&v->port_mcast_ctx); + + /* Multicast is not disabled for the vlan when it goes in + * blocking state because the timers will expire and stop by + * themselves without sending more queries. + */ +#endif +} + void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { struct net_bridge *br; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1c7a03c54d174..2d98feecdc651 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1039,6 +1039,7 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx); void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx); +void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state); void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); @@ -1475,6 +1476,11 @@ static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pm { } +static inline void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, + u8 state) +{ +} + static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { @@ -1825,7 +1831,9 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts); -/* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ +/* vlan state manipulation helpers using *_ONCE to annotate lock-free access, + * while br_vlan_set_state() may access data protected by multicast_lock. + */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) { return READ_ONCE(v->state); @@ -1834,6 +1842,7 @@ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state) { WRITE_ONCE(v->state, state); + br_multicast_update_vlan_mcast_ctx(v, state); } static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg) From c17f7bed63ecc5884ec4e4132c09e075a879f896 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Thu, 19 Jun 2025 21:22:28 +0300 Subject: [PATCH 3/6] bridge: mcast: Fix use-after-free during router port configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-72330 cve CVE-2025-38248 commit-author Ido Schimmel commit 7544f3f5b0b58c396f374d060898b5939da31709 upstream-diff Context conflicts due to missing 8fa7292fee5c5240402371ea89ab285ec856c916 ("treewide: Switch/rename to timer_delete[_sync]()"). No real diffs from upstream The bridge maintains a global list of ports behind which a multicast router resides. The list is consulted during forwarding to ensure multicast packets are forwarded to these ports even if the ports are not member in the matching MDB entry. When per-VLAN multicast snooping is enabled, the per-port multicast context is disabled on each port and the port is removed from the global router port list: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 # ip link add name dummy1 up master br1 type dummy # ip link set dev dummy1 type bridge_slave mcast_router 2 $ bridge -d mdb show | grep router router ports on br1: dummy1 # ip link set dev br1 type bridge mcast_vlan_snooping 1 $ bridge -d mdb show | grep router However, the port can be re-added to the global list even when per-VLAN multicast snooping is enabled: # ip link set dev dummy1 type bridge_slave mcast_router 0 # ip link set dev dummy1 type bridge_slave mcast_router 2 $ bridge -d mdb show | grep router router ports on br1: dummy1 Since commit 4b30ae9adb04 ("net: bridge: mcast: re-implement br_multicast_{enable, disable}_port functions"), when per-VLAN multicast snooping is enabled, multicast disablement on a port will disable the per-{port, VLAN} multicast contexts and not the per-port one. As a result, a port will remain in the global router port list even after it is deleted. This will lead to a use-after-free [1] when the list is traversed (when adding a new port to the list, for example): # ip link del dev dummy1 # ip link add name dummy2 up master br1 type dummy # ip link set dev dummy2 type bridge_slave mcast_router 2 Similarly, stale entries can also be found in the per-VLAN router port list. When per-VLAN multicast snooping is disabled, the per-{port, VLAN} contexts are disabled on each port and the port is removed from the per-VLAN router port list: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 mcast_vlan_snooping 1 # ip link add name dummy1 up master br1 type dummy # bridge vlan add vid 2 dev dummy1 # bridge vlan global set vid 2 dev br1 mcast_snooping 1 # bridge vlan set vid 2 dev dummy1 mcast_router 2 $ bridge vlan global show dev br1 vid 2 | grep router router ports: dummy1 # ip link set dev br1 type bridge mcast_vlan_snooping 0 $ bridge vlan global show dev br1 vid 2 | grep router However, the port can be re-added to the per-VLAN list even when per-VLAN multicast snooping is disabled: # bridge vlan set vid 2 dev dummy1 mcast_router 0 # bridge vlan set vid 2 dev dummy1 mcast_router 2 $ bridge vlan global show dev br1 vid 2 | grep router router ports: dummy1 When the VLAN is deleted from the port, the per-{port, VLAN} multicast context will not be disabled since multicast snooping is not enabled on the VLAN. As a result, the port will remain in the per-VLAN router port list even after it is no longer member in the VLAN. This will lead to a use-after-free [2] when the list is traversed (when adding a new port to the list, for example): # ip link add name dummy2 up master br1 type dummy # bridge vlan add vid 2 dev dummy2 # bridge vlan del vid 2 dev dummy1 # bridge vlan set vid 2 dev dummy2 mcast_router 2 Fix these issues by removing the port from the relevant (global or per-VLAN) router port list in br_multicast_port_ctx_deinit(). The function is invoked during port deletion with the per-port multicast context and during VLAN deletion with the per-{port, VLAN} multicast context. Note that deleting the multicast router timer is not enough as it only takes care of the temporary multicast router states (1 or 3) and not the permanent one (2). [1] BUG: KASAN: slab-out-of-bounds in br_multicast_add_router.part.0+0x3f1/0x560 Write of size 8 at addr ffff888004a67328 by task ip/384 [...] Call Trace: dump_stack_lvl+0x6f/0xa0 print_address_description.constprop.0+0x6f/0x350 print_report+0x108/0x205 kasan_report+0xdf/0x110 br_multicast_add_router.part.0+0x3f1/0x560 br_multicast_set_port_router+0x74e/0xac0 br_setport+0xa55/0x1870 br_port_slave_changelink+0x95/0x120 __rtnl_newlink+0x5e8/0xa40 rtnl_newlink+0x627/0xb00 rtnetlink_rcv_msg+0x6fb/0xb70 netlink_rcv_skb+0x11f/0x350 netlink_unicast+0x426/0x710 netlink_sendmsg+0x75a/0xc20 __sock_sendmsg+0xc1/0x150 ____sys_sendmsg+0x5aa/0x7b0 ___sys_sendmsg+0xfc/0x180 __sys_sendmsg+0x124/0x1c0 do_syscall_64+0xbb/0x360 entry_SYSCALL_64_after_hwframe+0x4b/0x53 [2] BUG: KASAN: slab-use-after-free in br_multicast_add_router.part.0+0x378/0x560 Read of size 8 at addr ffff888009f00840 by task bridge/391 [...] Call Trace: dump_stack_lvl+0x6f/0xa0 print_address_description.constprop.0+0x6f/0x350 print_report+0x108/0x205 kasan_report+0xdf/0x110 br_multicast_add_router.part.0+0x378/0x560 br_multicast_set_port_router+0x6f9/0xac0 br_vlan_process_options+0x8b6/0x1430 br_vlan_rtm_process_one+0x605/0xa30 br_vlan_rtm_process+0x396/0x4c0 rtnetlink_rcv_msg+0x2f7/0xb70 netlink_rcv_skb+0x11f/0x350 netlink_unicast+0x426/0x710 netlink_sendmsg+0x75a/0xc20 __sock_sendmsg+0xc1/0x150 ____sys_sendmsg+0x5aa/0x7b0 ___sys_sendmsg+0xfc/0x180 __sys_sendmsg+0x124/0x1c0 do_syscall_64+0xbb/0x360 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 2796d846d74a ("net: bridge: vlan: convert mcast router global option to per-vlan entry") Fixes: 4b30ae9adb04 ("net: bridge: mcast: re-implement br_multicast_{enable, disable}_port functions") Reported-by: syzbot+7bfa4b72c6a5da128d32@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/684c18bd.a00a0220.279073.000b.GAE@google.com/T/ Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250619182228.1656906-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski (cherry picked from commit 7544f3f5b0b58c396f374d060898b5939da31709) Signed-off-by: Marcin Wcisło --- net/bridge/br_multicast.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 68526c83bf1cf..e8591a60fde35 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2009,10 +2009,19 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) { + struct net_bridge *br = pmctx->port->br; + bool del = false; + #if IS_ENABLED(CONFIG_IPV6) del_timer_sync(&pmctx->ip6_mc_router_timer); #endif del_timer_sync(&pmctx->ip4_mc_router_timer); + + spin_lock_bh(&br->multicast_lock); + del |= br_ip6_multicast_rport_del(pmctx); + del |= br_ip4_multicast_rport_del(pmctx); + br_multicast_rport_del_notify(pmctx, del); + spin_unlock_bh(&br->multicast_lock); } int br_multicast_add_port(struct net_bridge_port *port) From a5cda260331a5947e6f513c496f5462e839a37fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Mon, 4 Sep 2023 12:34:40 -0400 Subject: [PATCH 4/6] NFS: Use the correct commit info in nfs_join_page_group() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-136536 cve-pre CVE-2025-39697 commit-author Trond Myklebust commit b193a78ddb5ee7dba074d3f28dc050069ba083c0 Ensure that nfs_clear_request_commit() updates the correct counters when it removes them from the commit list. Fixes: ed5d588fe47f ("NFS: Try to join page groups before an O_DIRECT retransmission") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker (cherry picked from commit b193a78ddb5ee7dba074d3f28dc050069ba083c0) Signed-off-by: Marcin Wcisło --- fs/nfs/direct.c | 8 +++++--- fs/nfs/write.c | 23 ++++++++++++----------- include/linux/nfs_page.h | 4 +++- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 47d892a1d363d..72915678cbaa8 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -488,7 +488,9 @@ static void nfs_direct_add_page_head(struct list_head *list, kref_get(&head->wb_kref); } -static void nfs_direct_join_group(struct list_head *list, struct inode *inode) +static void nfs_direct_join_group(struct list_head *list, + struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *req, *subreq; @@ -510,7 +512,7 @@ static void nfs_direct_join_group(struct list_head *list, struct inode *inode) nfs_release_request(subreq); } } while ((subreq = subreq->wb_this_page) != req); - nfs_join_page_group(req, inode); + nfs_join_page_group(req, cinfo, inode); } } @@ -536,7 +538,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); - nfs_direct_join_group(&reqs, dreq->inode); + nfs_direct_join_group(&reqs, &cinfo, dreq->inode); dreq->count = 0; dreq->max_count = 0; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6ef150eff2b1d..58efc409a7a90 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -58,7 +58,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; static void nfs_inode_remove_request(struct nfs_page *req); -static void nfs_clear_request_commit(struct nfs_page *req); +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); static struct nfs_page * @@ -501,8 +502,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. */ -void -nfs_join_page_group(struct nfs_page *head, struct inode *inode) +void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; @@ -532,7 +533,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) * Commit list removal accounting is done after locks are dropped */ subreq = head; do { - nfs_clear_request_commit(subreq); + nfs_clear_request_commit(cinfo, subreq); subreq = subreq->wb_this_page; } while (subreq != head); @@ -565,8 +566,10 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) { struct inode *inode = folio_file_mapping(folio)->host; struct nfs_page *head; + struct nfs_commit_info cinfo; int ret; + nfs_init_cinfo_from_inode(&cinfo, inode); /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed @@ -583,7 +586,7 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) return ERR_PTR(ret); } - nfs_join_page_group(head, inode); + nfs_join_page_group(head, &cinfo, inode); return head; } @@ -942,18 +945,16 @@ static void nfs_folio_clear_commit(struct folio *folio) } /* Called holding the request lock on @req */ -static void -nfs_clear_request_commit(struct nfs_page *req) +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { struct nfs_open_context *ctx = nfs_req_openctx(req); struct inode *inode = d_inode(ctx->dentry); - struct nfs_commit_info cinfo; - nfs_init_cinfo_from_inode(&cinfo, inode); mutex_lock(&NFS_I(inode)->commit_mutex); - if (!pnfs_clear_request_commit(req, &cinfo)) { - nfs_request_remove_commit_list(req, &cinfo); + if (!pnfs_clear_request_commit(req, cinfo)) { + nfs_request_remove_commit_list(req, cinfo); } mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_folio_clear_commit(nfs_page_to_folio(req)); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index aa9f4c6ebe261..1c315f854ea80 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -157,7 +157,9 @@ extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); extern struct nfs_page *nfs_page_group_lock_head(struct nfs_page *req); extern int nfs_page_group_lock_subrequests(struct nfs_page *head); -extern void nfs_join_page_group(struct nfs_page *head, struct inode *inode); +extern void nfs_join_page_group(struct nfs_page *head, + struct nfs_commit_info *cinfo, + struct inode *inode); extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); From d099aac818da2c3e6961b9e346a45b794d772963 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Mon, 1 Jul 2024 07:26:52 +0200 Subject: [PATCH 5/6] nfs: fold nfs_page_group_lock_subrequests into nfs_lock_and_join_requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-136536 cve-pre CVE-2025-39697 commit-author Christoph Hellwig commit 25edbcac6e32eab345e470d56ca9974a577b878b upstream-diff Used linux-6.6.y backport 9a1963404cc2eef69d2f8a42861bdf63d087dd5d for the clean cherry pick Fold nfs_page_group_lock_subrequests into nfs_lock_and_join_requests to prepare for future changes to this code, and move the helpers to write.c as well. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker (cherry picked from commit 9a1963404cc2eef69d2f8a42861bdf63d087dd5d) Signed-off-by: Marcin Wcisło --- fs/nfs/pagelist.c | 77 ---------------------------------------- fs/nfs/write.c | 75 ++++++++++++++++++++++++++++++++++---- include/linux/nfs_page.h | 1 - 3 files changed, 69 insertions(+), 84 deletions(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 43582dcba66cc..b94375fbd135d 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -205,83 +205,6 @@ nfs_page_group_lock_head(struct nfs_page *req) return head; } -/* - * nfs_unroll_locks - unlock all newly locked reqs and wait on @req - * @head: head request of page group, must be holding head lock - * @req: request that couldn't lock and needs to wait on the req bit lock - * - * This is a helper function for nfs_lock_and_join_requests - * returns 0 on success, < 0 on error. - */ -static void -nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req) -{ - struct nfs_page *tmp; - - /* relinquish all the locks successfully grabbed this run */ - for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { - if (!kref_read(&tmp->wb_kref)) - continue; - nfs_unlock_and_release_request(tmp); - } -} - -/* - * nfs_page_group_lock_subreq - try to lock a subrequest - * @head: head request of page group - * @subreq: request to lock - * - * This is a helper function for nfs_lock_and_join_requests which - * must be called with the head request and page group both locked. - * On error, it returns with the page group unlocked. - */ -static int -nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq) -{ - int ret; - - if (!kref_get_unless_zero(&subreq->wb_kref)) - return 0; - while (!nfs_lock_request(subreq)) { - nfs_page_group_unlock(head); - ret = nfs_wait_on_request(subreq); - if (!ret) - ret = nfs_page_group_lock(head); - if (ret < 0) { - nfs_unroll_locks(head, subreq); - nfs_release_request(subreq); - return ret; - } - } - return 0; -} - -/* - * nfs_page_group_lock_subrequests - try to lock the subrequests - * @head: head request of page group - * - * This is a helper function for nfs_lock_and_join_requests which - * must be called with the head request locked. - */ -int nfs_page_group_lock_subrequests(struct nfs_page *head) -{ - struct nfs_page *subreq; - int ret; - - ret = nfs_page_group_lock(head); - if (ret < 0) - return ret; - /* lock each request in the page group */ - for (subreq = head->wb_this_page; subreq != head; - subreq = subreq->wb_this_page) { - ret = nfs_page_group_lock_subreq(head, subreq); - if (ret < 0) - return ret; - } - nfs_page_group_unlock(head); - return 0; -} - /* * nfs_page_set_headlock - set the request PG_HEADLOCK * @req: request that is to be locked diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 58efc409a7a90..08e1a687db6da 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -547,6 +547,57 @@ void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, nfs_destroy_unlinked_subrequests(destroy_list, head, inode); } +/* + * nfs_unroll_locks - unlock all newly locked reqs and wait on @req + * @head: head request of page group, must be holding head lock + * @req: request that couldn't lock and needs to wait on the req bit lock + * + * This is a helper function for nfs_lock_and_join_requests + * returns 0 on success, < 0 on error. + */ +static void +nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req) +{ + struct nfs_page *tmp; + + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { + if (!kref_read(&tmp->wb_kref)) + continue; + nfs_unlock_and_release_request(tmp); + } +} + +/* + * nfs_page_group_lock_subreq - try to lock a subrequest + * @head: head request of page group + * @subreq: request to lock + * + * This is a helper function for nfs_lock_and_join_requests which + * must be called with the head request and page group both locked. + * On error, it returns with the page group unlocked. + */ +static int +nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq) +{ + int ret; + + if (!kref_get_unless_zero(&subreq->wb_kref)) + return 0; + while (!nfs_lock_request(subreq)) { + nfs_page_group_unlock(head); + ret = nfs_wait_on_request(subreq); + if (!ret) + ret = nfs_page_group_lock(head); + if (ret < 0) { + nfs_unroll_locks(head, subreq); + nfs_release_request(subreq); + return ret; + } + } + return 0; +} + /* * nfs_lock_and_join_requests - join all subreqs to the head req * @folio: the folio used to lookup the "page group" of nfs_page structures @@ -565,7 +616,7 @@ void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) { struct inode *inode = folio_file_mapping(folio)->host; - struct nfs_page *head; + struct nfs_page *head, *subreq; struct nfs_commit_info cinfo; int ret; @@ -579,16 +630,28 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) if (IS_ERR_OR_NULL(head)) return head; + ret = nfs_page_group_lock(head); + if (ret < 0) + goto out_unlock; + /* lock each request in the page group */ - ret = nfs_page_group_lock_subrequests(head); - if (ret < 0) { - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); + for (subreq = head->wb_this_page; + subreq != head; + subreq = subreq->wb_this_page) { + ret = nfs_page_group_lock_subreq(head, subreq); + if (ret < 0) + goto out_unlock; } - nfs_join_page_group(head, &cinfo, inode); + nfs_page_group_unlock(head); + nfs_init_cinfo_from_inode(&cinfo, inode); + nfs_join_page_group(head, &cinfo, inode); return head; + +out_unlock: + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); } static void nfs_write_error(struct nfs_page *req, int error) diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 1c315f854ea80..3a0f7ebe53883 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -156,7 +156,6 @@ extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); extern struct nfs_page *nfs_page_group_lock_head(struct nfs_page *req); -extern int nfs_page_group_lock_subrequests(struct nfs_page *head); extern void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, struct inode *inode); From 30b7164aeb098a65b74b2f4fba0825423b68c1c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Sat, 16 Aug 2025 07:25:20 -0700 Subject: [PATCH 6/6] NFS: Fix a race when updating an existing write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-136536 cve CVE-2025-39697 commit-author Trond Myklebust commit 76d2e3890fb169168c73f2e4f8375c7cc24a765e upstream-diff Used linux-6.6.y backport 181feb41f0b268e6288bf9a7b984624d7fe2031d for the clean cherry pick After nfs_lock_and_join_requests() tests for whether the request is still attached to the mapping, nothing prevents a call to nfs_inode_remove_request() from succeeding until we actually lock the page group. The reason is that whoever called nfs_inode_remove_request() doesn't necessarily have a lock on the page group head. So in order to avoid races, let's take the page group lock earlier in nfs_lock_and_join_requests(), and hold it across the removal of the request in nfs_inode_remove_request(). Reported-by: Jeff Layton Tested-by: Joe Quanaim Tested-by: Andrew Steffen Reviewed-by: Jeff Layton Fixes: bd37d6fce184 ("NFSv4: Convert nfs_lock_and_join_requests() to use nfs_page_find_head_request()") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust (cherry picked from commit 181feb41f0b268e6288bf9a7b984624d7fe2031d) Signed-off-by: Marcin Wcisło --- fs/nfs/pagelist.c | 9 ++--- fs/nfs/write.c | 71 ++++++++++++++-------------------------- include/linux/nfs_page.h | 1 + 3 files changed, 31 insertions(+), 50 deletions(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b94375fbd135d..8dea28edcd3a8 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -271,13 +271,14 @@ nfs_page_group_unlock(struct nfs_page *req) nfs_page_clear_headlock(req); } -/* - * nfs_page_group_sync_on_bit_locked +/** + * nfs_page_group_sync_on_bit_locked - Test if all requests have @bit set + * @req: request in page group + * @bit: PG_* bit that is used to sync page group * * must be called with page group lock held */ -static bool -nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +bool nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) { struct nfs_page *head = req->wb_head; struct nfs_page *tmp; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 08e1a687db6da..748300df8eb1f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -155,20 +155,10 @@ nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) } } -static int -nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) +static void nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) { - int ret; - - if (!test_bit(PG_REMOVE, &req->wb_flags)) - return 0; - ret = nfs_page_group_lock(req); - if (ret) - return ret; if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) nfs_page_set_inode_ref(req, inode); - nfs_page_group_unlock(req); - return 0; } static struct nfs_page *nfs_folio_private_request(struct folio *folio) @@ -237,36 +227,6 @@ static struct nfs_page *nfs_folio_find_head_request(struct folio *folio) return req; } -static struct nfs_page *nfs_folio_find_and_lock_request(struct folio *folio) -{ - struct inode *inode = folio_file_mapping(folio)->host; - struct nfs_page *req, *head; - int ret; - - for (;;) { - req = nfs_folio_find_head_request(folio); - if (!req) - return req; - head = nfs_page_group_lock_head(req); - if (head != req) - nfs_release_request(req); - if (IS_ERR(head)) - return head; - ret = nfs_cancel_remove_inode(head, inode); - if (ret < 0) { - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); - } - /* Ensure that nobody removed the request before we locked it */ - if (head == nfs_folio_private_request(folio)) - break; - if (folio_test_swapcache(folio)) - break; - nfs_unlock_and_release_request(head); - } - return head; -} - /* Adjust the file length if we're writing beyond the end */ static void nfs_grow_file(struct folio *folio, unsigned int offset, unsigned int count) @@ -620,20 +580,37 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) struct nfs_commit_info cinfo; int ret; - nfs_init_cinfo_from_inode(&cinfo, inode); /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed * until the head reference is released. */ - head = nfs_folio_find_and_lock_request(folio); - if (IS_ERR_OR_NULL(head)) - return head; +retry: + head = nfs_folio_find_head_request(folio); + if (!head) + return NULL; + + while (!nfs_lock_request(head)) { + ret = nfs_wait_on_request(head); + if (ret < 0) { + nfs_release_request(head); + return ERR_PTR(ret); + } + } ret = nfs_page_group_lock(head); if (ret < 0) goto out_unlock; + /* Ensure that nobody removed the request before we locked it */ + if (head != folio->private && !folio_test_swapcache(folio)) { + nfs_page_group_unlock(head); + nfs_unlock_and_release_request(head); + goto retry; + } + + nfs_cancel_remove_inode(head, inode); + /* lock each request in the page group */ for (subreq = head->wb_this_page; subreq != head; @@ -838,7 +815,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); - if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { + nfs_page_group_lock(req); + if (nfs_page_group_sync_on_bit_locked(req, PG_REMOVE)) { struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio_file_mapping(folio); @@ -850,6 +828,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) } spin_unlock(&mapping->private_lock); } + nfs_page_group_unlock(req); if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { atomic_long_dec(&nfsi->nrequests); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 3a0f7ebe53883..6a46069c5a368 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -162,6 +162,7 @@ extern void nfs_join_page_group(struct nfs_page *head, extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); +extern bool nfs_page_group_sync_on_bit_locked(struct nfs_page *, unsigned int); extern int nfs_page_set_headlock(struct nfs_page *req); extern void nfs_page_clear_headlock(struct nfs_page *req); extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);