From a560caebcf633989052d78be37cb28bc651aa982 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:17 +0200 Subject: [PATCH 01/12] swap: remove the maxpages variable in sys_swapon Always use si->max, which is updated by setup_swap_extents, instead of copying it into and out of maxpages. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: "Darrick J. Wong" Acked-by: Chris Li --- mm/swapfile.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 9174f1eeffb0..f7ebd97e28a3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3350,10 +3350,9 @@ static unsigned long read_swap_header(struct swap_info_struct *si, } static int setup_swap_clusters_info(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned long maxpages) + union swap_header *swap_header) { - unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + unsigned long nr_clusters = DIV_ROUND_UP(si->max, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; int err = -ENOMEM; unsigned long i; @@ -3395,7 +3394,7 @@ static int setup_swap_clusters_info(struct swap_info_struct *si, if (err) goto err; } - for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { + for (i = si->max; i < round_up(si->max, SWAPFILE_CLUSTER); i++) { err = swap_cluster_setup_bad_slot(si, cluster_info, i, true); if (err) goto err; @@ -3425,7 +3424,7 @@ static int setup_swap_clusters_info(struct swap_info_struct *si, si->cluster_info = cluster_info; return 0; err: - free_swap_cluster_info(cluster_info, maxpages); + free_swap_cluster_info(cluster_info, si->max); return err; } @@ -3440,7 +3439,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) union swap_header *swap_header; int nr_extents; sector_t span; - unsigned long maxpages; struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; @@ -3512,14 +3510,13 @@ 
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } swap_header = kmap_local_folio(folio, 0); - maxpages = read_swap_header(si, swap_header, inode); - if (unlikely(!maxpages)) { + si->max = read_swap_header(si, swap_header, inode); + if (unlikely(!si->max)) { error = -EINVAL; goto bad_swap_unlock_inode; } - si->max = maxpages; - si->pages = maxpages - 1; + si->pages = si->max - 1; nr_extents = setup_swap_extents(si, swap_file, &span); if (nr_extents < 0) { error = nr_extents; @@ -3531,14 +3528,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap_unlock_inode; } - maxpages = si->max; - /* Set up the swap cluster info */ - error = setup_swap_clusters_info(si, swap_header, maxpages); + error = setup_swap_clusters_info(si, swap_header); if (error) goto bad_swap_unlock_inode; - error = swap_cgroup_swapon(si->type, maxpages); + error = swap_cgroup_swapon(si->type, si->max); if (error) goto bad_swap_unlock_inode; @@ -3546,7 +3541,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might * be above MAX_PAGE_ORDER incase of a large swap file. 
*/ - si->zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), + si->zeromap = kvmalloc_array(BITS_TO_LONGS(si->max), sizeof(long), GFP_KERNEL | __GFP_ZERO); if (!si->zeromap) { error = -ENOMEM; @@ -3597,7 +3592,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } - error = zswap_swapon(si->type, maxpages); + error = zswap_swapon(si->type, si->max); if (error) goto bad_swap_unlock_inode; From ae17f993ad00baff5bc940012120ec0e43f4a711 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:18 +0200 Subject: [PATCH 02/12] swap: move boilerplate code into the core swap code Make the core swap code calculate sis->pages, nr_extents and the span, re-set sis->max based on it and don't require passing the current offset into the swap file to add_swap_extent as all that can trivially be calculated internally. Also truncate the spans based on the available information. All this removes a lot of boilerplate code in the callers. Signed-off-by: Christoph Hellwig Acked-by: Damien Le Moal Reviewed-by: "Darrick J. 
Wong" Acked-by: Chris Li --- Documentation/filesystems/locking.rst | 2 +- Documentation/filesystems/vfs.rst | 2 +- fs/btrfs/inode.c | 58 ++----------- fs/ext4/inode.c | 5 +- fs/f2fs/data.c | 38 ++------- fs/iomap/swapfile.c | 58 +------------ fs/nfs/file.c | 9 +- fs/ntfs/aops.c | 5 +- fs/smb/client/file.c | 5 +- fs/xfs/xfs_aops.c | 6 +- fs/zonefs/file.c | 5 +- include/linux/fs.h | 3 +- include/linux/iomap.h | 5 +- include/linux/swap.h | 11 ++- mm/page_io.c | 39 ++------- mm/swapfile.c | 116 ++++++++++++++++---------- 16 files changed, 121 insertions(+), 246 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 8421ea21bd35..f3658204d070 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -264,7 +264,7 @@ prototypes:: int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_folio)(struct address_space *, struct folio *); - int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) + int (*swap_activate)(struct swap_info_struct *sis, struct file *f) int (*swap_deactivate)(struct file *); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 7c753148af88..4092b2149a5d 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -774,7 +774,7 @@ cache in your filesystem. 
The following members are defined: size_t count); void (*is_dirty_writeback)(struct folio *, bool *, bool *); int (*error_remove_folio)(struct mapping *mapping, struct folio *); - int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) + int (*swap_activate)(struct swap_info_struct *sis, struct file *f); int (*swap_deactivate)(struct file *); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 906d5c21ebc4..198d87e6f19a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10204,51 +10204,17 @@ struct btrfs_swap_info { u64 start; u64 block_start; u64 block_len; - u64 lowest_ppage; - u64 highest_ppage; - unsigned long nr_pages; - int nr_extents; }; static int btrfs_add_swap_extent(struct swap_info_struct *sis, struct btrfs_swap_info *bsi) { - unsigned long nr_pages; - unsigned long max_pages; - u64 first_ppage, first_ppage_reported, next_ppage; - int ret; - - /* - * Our swapfile may have had its size extended after the swap header was - * written. In that case activating the swapfile should not go beyond - * the max size set in the swap header. 
- */ - if (bsi->nr_pages >= sis->max) - return 0; + u64 first_ppage, next_ppage; - max_pages = sis->max - bsi->nr_pages; first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; - if (first_ppage >= next_ppage) - return 0; - nr_pages = next_ppage - first_ppage; - nr_pages = min(nr_pages, max_pages); - - first_ppage_reported = first_ppage; - if (bsi->start == 0) - first_ppage_reported++; - if (bsi->lowest_ppage > first_ppage_reported) - bsi->lowest_ppage = first_ppage_reported; - if (bsi->highest_ppage < (next_ppage - 1)) - bsi->highest_ppage = next_ppage - 1; - - ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); - if (ret < 0) - return ret; - bsi->nr_extents += ret; - bsi->nr_pages += nr_pages; - return 0; + return add_swap_extent(sis, next_ppage - first_ppage, first_ppage); } static void btrfs_swap_deactivate(struct file *file) @@ -10259,8 +10225,7 @@ static void btrfs_swap_deactivate(struct file *file) atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); } -static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -10269,9 +10234,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct extent_state *cached_state = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; - struct btrfs_swap_info bsi = { - .lowest_ppage = (sector_t)-1ULL, - }; + struct btrfs_swap_info bsi = {}; struct btrfs_backref_share_check_ctx *backref_ctx = NULL; struct btrfs_path *path = NULL; int ret = 0; @@ -10570,23 +10533,16 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, up_write(&BTRFS_I(inode)->i_mmap_lock); btrfs_free_backref_share_ctx(backref_ctx); btrfs_free_path(path); - if (ret) - return 
ret; - - if (device) + if (!ret && device) sis->bdev = device->bdev; - *span = bsi.highest_ppage - bsi.lowest_ppage + 1; - sis->max = bsi.nr_pages; - sis->pages = bsi.nr_pages - 1; - return bsi.nr_extents; + return ret; } #else static void btrfs_swap_deactivate(struct file *file) { } -static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file) { return -EOPNOTSUPP; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c2c2d6ac7f3d..ca7bac4a8b4a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3940,10 +3940,9 @@ static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) } static int ext4_iomap_swap_activate(struct swap_info_struct *sis, - struct file *file, sector_t *span) + struct file *file) { - return iomap_swapfile_activate(sis, file, span, - &ext4_iomap_report_ops); + return iomap_swapfile_activate(sis, file, &ext4_iomap_report_ops); } static const struct address_space_operations ext4_aops = { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8d4f1e75dee3..86fabacc67e6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4249,7 +4249,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, } static int check_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) + struct file *swap_file) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; @@ -4257,9 +4257,6 @@ static int check_swap_activate(struct swap_info_struct *sis, block_t cur_lblock; block_t last_lblock; block_t pblock; - block_t lowest_pblock = -1; - block_t highest_pblock = 0; - int nr_extents = 0; unsigned int nr_pblocks; unsigned int blks_per_sec = BLKS_PER_SEC(sbi); unsigned int not_aligned = 0; @@ -4272,7 +4269,7 @@ static int check_swap_activate(struct swap_info_struct *sis, cur_lblock = 0; last_lblock = F2FS_BYTES_TO_BLK(i_size_read(inode)); - while (cur_lblock 
< last_lblock && cur_lblock < sis->max) { + while (cur_lblock < last_lblock) { struct f2fs_map_blocks map; bool last_extent = false; retry: @@ -4307,8 +4304,6 @@ static int check_swap_activate(struct swap_info_struct *sis, not_aligned++; nr_pblocks = roundup(nr_pblocks, blks_per_sec); - if (cur_lblock + nr_pblocks > sis->max) - nr_pblocks -= blks_per_sec; /* this extent is last one */ if (!nr_pblocks) { @@ -4328,31 +4323,14 @@ static int check_swap_activate(struct swap_info_struct *sis, goto retry; } - if (cur_lblock + nr_pblocks >= sis->max) - nr_pblocks = sis->max - cur_lblock; - - if (cur_lblock) { /* exclude the header page */ - if (pblock < lowest_pblock) - lowest_pblock = pblock; - if (pblock + nr_pblocks - 1 > highest_pblock) - highest_pblock = pblock + nr_pblocks - 1; - } - /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ - ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock); + ret = add_swap_extent(sis, nr_pblocks, pblock); if (ret < 0) goto out; - nr_extents += ret; cur_lblock += nr_pblocks; } - ret = nr_extents; - *span = 1 + highest_pblock - lowest_pblock; - if (cur_lblock == 0) - cur_lblock = 1; /* force Empty message */ - sis->max = cur_lblock; - sis->pages = cur_lblock - 1; out: if (not_aligned) f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)", @@ -4360,8 +4338,7 @@ static int check_swap_activate(struct swap_info_struct *sis, return ret; } -static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) +static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file) { struct inode *inode = file_inode(file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -4391,14 +4368,14 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, f2fs_precache_extents(inode); - ret = check_swap_activate(sis, file, span); + ret = check_swap_activate(sis, file); if (ret < 0) return ret; 
stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); f2fs_update_time(sbi, REQ_TIME); - return ret; + return 0; } static void f2fs_swap_deactivate(struct file *file) @@ -4409,8 +4386,7 @@ static void f2fs_swap_deactivate(struct file *file) clear_inode_flag(inode, FI_PIN_FILE); } #else -static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) +static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file) { return -EOPNOTSUPP; } diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 0db77c449467..f778b2c6c922 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -11,10 +11,7 @@ struct iomap_swapfile_info { struct iomap iomap; /* accumulated iomap */ struct swap_info_struct *sis; - uint64_t lowest_ppage; /* lowest physical addr seen (pages) */ - uint64_t highest_ppage; /* highest physical addr seen (pages) */ unsigned long nr_pages; /* number of pages collected */ - int nr_extents; /* extent count */ struct file *file; }; @@ -27,16 +24,8 @@ struct iomap_swapfile_info { static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) { struct iomap *iomap = &isi->iomap; - unsigned long nr_pages; - unsigned long max_pages; uint64_t first_ppage; - uint64_t first_ppage_reported; uint64_t next_ppage; - int error; - - if (unlikely(isi->nr_pages >= isi->sis->max)) - return 0; - max_pages = isi->sis->max - isi->nr_pages; /* * Round the start up and the end down so that the physical @@ -45,33 +34,7 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> PAGE_SHIFT; - - /* Skip too-short physical extents. */ - if (first_ppage >= next_ppage) - return 0; - nr_pages = next_ppage - first_ppage; - nr_pages = min(nr_pages, max_pages); - - /* - * Calculate how much swap space we're adding; the first page contains - * the swap header and doesn't count. 
The mm still wants that first - * page fed to add_swap_extent, however. - */ - first_ppage_reported = first_ppage; - if (iomap->offset == 0) - first_ppage_reported++; - if (isi->lowest_ppage > first_ppage_reported) - isi->lowest_ppage = first_ppage_reported; - if (isi->highest_ppage < (next_ppage - 1)) - isi->highest_ppage = next_ppage - 1; - - /* Add extent, set up for the next call. */ - error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage); - if (error < 0) - return error; - isi->nr_extents += error; - isi->nr_pages += nr_pages; - return 0; + return add_swap_extent(isi->sis, next_ppage - first_ppage, first_ppage); } static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) @@ -138,8 +101,7 @@ static int iomap_swapfile_iter(struct iomap_iter *iter, * passed to the swapfile subsystem. */ int iomap_swapfile_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *pagespan, - const struct iomap_ops *ops) + struct file *swap_file, const struct iomap_ops *ops) { struct inode *inode = swap_file->f_mapping->host; struct iomap_iter iter = { @@ -150,7 +112,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, }; struct iomap_swapfile_info isi = { .sis = sis, - .lowest_ppage = (sector_t)-1ULL, .file = swap_file, }; int ret; @@ -174,19 +135,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, return ret; } - /* - * If this swapfile doesn't contain even a single page-aligned - * contiguous range of blocks, reject this useless swapfile to - * prevent confusion later on. 
- */ - if (isi.nr_pages == 0) { - pr_warn("swapon: Cannot find a single usable page in file.\n"); - return -EINVAL; - } - - *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; - sis->max = isi.nr_pages; - sis->pages = isi.nr_pages - 1; - return isi.nr_extents; + return 0; } EXPORT_SYMBOL_GPL(iomap_swapfile_activate); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 25048a3c2364..74b401aa2b3a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -567,8 +567,7 @@ static int nfs_launder_folio(struct folio *folio) return ret; } -static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) +static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file) { unsigned long blocks; long long isize; @@ -589,19 +588,17 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = rpc_clnt_swap_activate(clnt); if (ret) return ret; - ret = add_swap_extent(sis, 0, sis->max, 0); + ret = add_swap_extent(sis, sis->max, 0); if (ret < 0) { rpc_clnt_swap_deactivate(clnt); return ret; } - *span = sis->pages; - if (cl->rpc_ops->enable_swap) cl->rpc_ops->enable_swap(inode); sis->flags |= SWP_FS_OPS; - return ret; + return 0; } static void nfs_swap_deactivate(struct file *file) diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 1fbf832ad165..4b7d019bc6ed 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -271,10 +271,9 @@ static int ntfs_writepages(struct address_space *mapping, } static int ntfs_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) + struct file *swap_file) { - return iomap_swapfile_activate(sis, swap_file, span, - &ntfs_read_iomap_ops); + return iomap_swapfile_activate(sis, swap_file, &ntfs_read_iomap_ops); } const struct address_space_operations ntfs_aops = { diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 664a2c223089..11d4655ef490 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -3287,7 +3287,7 @@ void cifs_oplock_break(struct 
work_struct *work) } static int cifs_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) + struct file *swap_file) { struct cifsFileInfo *cfile = swap_file->private_data; struct inode *inode = swap_file->f_mapping->host; @@ -3308,7 +3308,6 @@ static int cifs_swap_activate(struct swap_info_struct *sis, pr_warn("swap activate: swapfile has holes\n"); return -EINVAL; } - *span = sis->pages; pr_warn_once("Swap support over SMB3 is experimental\n"); @@ -3329,7 +3328,7 @@ static int cifs_swap_activate(struct swap_info_struct *sis, */ sis->flags |= SWP_FS_OPS; - return add_swap_extent(sis, 0, sis->max, 0); + return add_swap_extent(sis, sis->max, 0); } static void cifs_swap_deactivate(struct file *file) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f279055fcea0..1e8662e0e7cd 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -799,8 +799,7 @@ xfs_vm_readahead( static int xfs_vm_swap_activate( struct swap_info_struct *sis, - struct file *swap_file, - sector_t *span) + struct file *swap_file) { struct xfs_inode *ip = XFS_I(file_inode(swap_file)); @@ -838,8 +837,7 @@ xfs_vm_swap_activate( */ sis->bdev = xfs_inode_buftarg(ip)->bt_bdev; - return iomap_swapfile_activate(sis, swap_file, span, - &xfs_read_iomap_ops); + return iomap_swapfile_activate(sis, swap_file, &xfs_read_iomap_ops); } const struct address_space_operations xfs_address_space_operations = { diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 5ada33f70bb4..214e4bf8e30a 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -168,7 +168,7 @@ static int zonefs_writepages(struct address_space *mapping, } static int zonefs_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) + struct file *swap_file) { struct inode *inode = file_inode(swap_file); @@ -178,8 +178,7 @@ static int zonefs_swap_activate(struct swap_info_struct *sis, return -EINVAL; } - return iomap_swapfile_activate(sis, swap_file, span, - &zonefs_read_iomap_ops); + 
return iomap_swapfile_activate(sis, swap_file, &zonefs_read_iomap_ops); } const struct address_space_operations zonefs_file_aops = { diff --git a/include/linux/fs.h b/include/linux/fs.h index 11559c513dfb..b8b6f7a38f4d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -435,8 +435,7 @@ struct address_space_operations { int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ - int (*swap_activate)(struct swap_info_struct *sis, struct file *file, - sector_t *span); + int (*swap_activate)(struct swap_info_struct *sis, struct file *file); void (*swap_deactivate)(struct file *file); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 2c5685adf3a9..d82126e3d086 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -604,10 +604,9 @@ struct file; struct swap_info_struct; int iomap_swapfile_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *pagespan, - const struct iomap_ops *ops); + struct file *swap_file, const struct iomap_ops *ops); #else -# define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) +# define iomap_swapfile_activate(sis, swapfile, ops) (-EIO) #endif /* CONFIG_SWAP */ extern struct bio_set iomap_ioend_bioset; diff --git a/include/linux/swap.h b/include/linux/swap.h index 7a09df6977a5..b8dfe2c6bc98 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -403,10 +403,9 @@ extern void __meminit kswapd_stop(int nid); #ifdef CONFIG_SWAP -int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, - unsigned long nr_pages, sector_t start_block); -int generic_swapfile_activate(struct swap_info_struct *, struct file *, - sector_t *); +int add_swap_extent(struct swap_info_struct *sis, unsigned long nr_pages, + sector_t start_block); +int generic_swapfile_activate(struct swap_info_struct *, struct file *); static inline unsigned long total_swapcache_pages(void) { @@ -528,8 +527,8 
@@ static inline bool folio_free_swap(struct folio *folio) } static inline int add_swap_extent(struct swap_info_struct *sis, - unsigned long start_page, - unsigned long nr_pages, sector_t start_block) + unsigned long start_page, unsigned long nr_pages, + sector_t start_block) { return -EINVAL; } diff --git a/mm/page_io.c b/mm/page_io.c index 70cea9e24d2f..f30f36ec1ed0 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -76,19 +76,14 @@ static void end_swap_bio_read(struct bio *bio) } int generic_swapfile_activate(struct swap_info_struct *sis, - struct file *swap_file, - sector_t *span) + struct file *swap_file) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; unsigned blocks_per_page; - unsigned long page_no; unsigned blkbits; sector_t probe_block; sector_t last_block; - sector_t lowest_block = -1; - sector_t highest_block = 0; - int nr_extents = 0; int ret; blkbits = inode->i_blkbits; @@ -99,10 +94,8 @@ int generic_swapfile_activate(struct swap_info_struct *sis, * to be very smart. 
*/ probe_block = 0; - page_no = 0; last_block = i_size_read(inode) >> blkbits; - while ((probe_block + blocks_per_page) <= last_block && - page_no < sis->max) { + while ((probe_block + blocks_per_page) <= last_block) { unsigned block_in_page; sector_t first_block; @@ -137,38 +130,22 @@ int generic_swapfile_activate(struct swap_info_struct *sis, } } - first_block >>= (PAGE_SHIFT - blkbits); - if (page_no) { /* exclude the header page */ - if (first_block < lowest_block) - lowest_block = first_block; - if (first_block > highest_block) - highest_block = first_block; - } - /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ - ret = add_swap_extent(sis, page_no, 1, first_block); + ret = add_swap_extent(sis, 1, + first_block >> (PAGE_SHIFT - blkbits)); if (ret < 0) - goto out; - nr_extents += ret; - page_no++; + return ret; probe_block += blocks_per_page; reprobe: continue; } - ret = nr_extents; - *span = 1 + highest_block - lowest_block; - if (page_no == 0) - page_no = 1; /* force Empty message */ - sis->max = page_no; - sis->pages = page_no - 1; -out: - return ret; + return 0; + bad_bmap: pr_err("swapon: swapfile has holes\n"); - ret = -EINVAL; - goto out; + return -EINVAL; } static bool is_folio_zero_filled(struct folio *folio) diff --git a/mm/swapfile.c b/mm/swapfile.c index f7ebd97e28a3..158620fd2978 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2704,15 +2704,21 @@ static void destroy_swap_extents(struct swap_info_struct *sis, * Add a block range (and the corresponding page range) into this swapdev's * extent tree. * - * This function rather assumes that it is called in ascending page order. + * Note that start_block is in units of PAGE_SIZE and not actually in block + * layer sectors as the sector_t would suggest. 
*/ int -add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, - unsigned long nr_pages, sector_t start_block) +add_swap_extent(struct swap_info_struct *sis, unsigned long nr_pages, + sector_t start_block) { struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct swap_extent *se; - struct swap_extent *new_se; + + if (!nr_pages) + return 0; + if (unlikely(sis->pages >= sis->max)) + return 0; + nr_pages = min(nr_pages, sis->max - sis->pages); /* * place the new node at the right most since the @@ -2725,25 +2731,25 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, if (parent) { se = rb_entry(parent, struct swap_extent, rb_node); - BUG_ON(se->start_page + se->nr_pages != start_page); - if (se->start_block + se->nr_pages == start_block) { - /* Merge it */ - se->nr_pages += nr_pages; - return 0; - } + if (WARN_ON_ONCE(se->start_page + se->nr_pages != sis->pages)) + return -EINVAL; + if (se->start_block + se->nr_pages == start_block) + goto add; } /* No merge, insert a new extent. */ - new_se = kmalloc_obj(*se); - if (new_se == NULL) + se = kzalloc_obj(*se); + if (!se) return -ENOMEM; - new_se->start_page = start_page; - new_se->nr_pages = nr_pages; - new_se->start_block = start_block; - - rb_link_node(&new_se->rb_node, parent, link); - rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); - return 1; + rb_link_node(&se->rb_node, parent, link); + rb_insert_color(&se->rb_node, &sis->swap_extent_root); + + se->start_page = sis->pages; + se->start_block = start_block; +add: + se->nr_pages += nr_pages; + sis->pages += nr_pages; + return 0; } EXPORT_SYMBOL_GPL(add_swap_extent); @@ -2775,20 +2781,17 @@ EXPORT_SYMBOL_GPL(add_swap_extent); * extents in the rbtree. - akpm. 
*/ static int setup_swap_extents(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) + struct file *swap_file) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; int ret; - if (S_ISBLK(inode->i_mode)) { - ret = add_swap_extent(sis, 0, sis->max, 0); - *span = sis->pages; - return ret; - } + if (S_ISBLK(inode->i_mode)) + return add_swap_extent(sis, sis->max, 0); if (mapping->a_ops->swap_activate) { - ret = mapping->a_ops->swap_activate(sis, swap_file, span); + ret = mapping->a_ops->swap_activate(sis, swap_file); if (ret < 0) return ret; sis->flags |= SWP_ACTIVATED; @@ -2800,7 +2803,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, return ret; } - return generic_swapfile_activate(sis, swap_file, span); + return generic_swapfile_activate(sis, swap_file); } static void _enable_swap_info(struct swap_info_struct *si) @@ -3428,6 +3431,40 @@ static int setup_swap_clusters_info(struct swap_info_struct *si, return err; } +static void swap_print_info(struct swap_info_struct *si, const char *name) +{ + unsigned int nr_extents = 0; + u64 lowest_ppage = (u64)-1; + u64 highest_ppage = 0; + struct swap_extent *se; + + /* + * Calculate how much swap space we're adding; the first page contains + * the swap header and doesn't count. + */ + for (se = first_se(si); se; se = next_se(se)) { + u64 first_ppage = se->start_block; + u64 next_ppage = se->start_block + se->nr_pages; + + if (se->start_page == 0) + first_ppage++; + + if (lowest_ppage > first_ppage) + lowest_ppage = first_ppage; + if (highest_ppage < next_ppage - 1) + highest_ppage = next_ppage - 1; + nr_extents++; + } + + pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", + K(si->pages), name, si->prio, nr_extents, + K(highest_ppage - lowest_ppage), + (si->flags & SWP_SOLIDSTATE) ? "SS" : "", + (si->flags & SWP_DISCARDABLE) ? "D" : "", + (si->flags & SWP_AREA_DISCARD) ? "s" : "", + (si->flags & SWP_PAGE_DISCARD) ? 
"c" : ""); +} + SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *si; @@ -3437,8 +3474,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) int prio; int error; union swap_header *swap_header; - int nr_extents; - sector_t span; struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; @@ -3510,24 +3545,25 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } swap_header = kmap_local_folio(folio, 0); + si->pages = 0; si->max = read_swap_header(si, swap_header, inode); if (unlikely(!si->max)) { error = -EINVAL; goto bad_swap_unlock_inode; } - si->pages = si->max - 1; - nr_extents = setup_swap_extents(si, swap_file, &span); - if (nr_extents < 0) { - error = nr_extents; + error = setup_swap_extents(si, swap_file); + if (error < 0) goto bad_swap_unlock_inode; - } - if (si->pages != si->max - 1) { - pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max); + if (si->pages != si->max) { + pr_err("swap:%u != (max:%u)\n", si->pages, si->max); error = -EINVAL; goto bad_swap_unlock_inode; } + /* Remove the first page containing the swap header. */ + si->pages--; + /* Set up the swap cluster info */ error = setup_swap_clusters_info(si, swap_header); if (error) @@ -3624,13 +3660,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) /* Sets SWP_WRITEOK, resurrect the percpu ref, expose the swap device */ enable_swap_info(si); - pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", - K(si->pages), name->name, si->prio, nr_extents, - K((unsigned long long)span), - (si->flags & SWP_SOLIDSTATE) ? "SS" : "", - (si->flags & SWP_DISCARDABLE) ? "D" : "", - (si->flags & SWP_AREA_DISCARD) ? "s" : "", - (si->flags & SWP_PAGE_DISCARD) ? 
"c" : ""); + swap_print_info(si, name->name); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); From 32ec056f0a2eb200d0d9ef424055bb63ca70da58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:19 +0200 Subject: [PATCH 03/12] swap,fs: move swapfile operations to struct file_operations The swap operations have nothing to do with the address_space, which is used for pagecache operations. Move them to struct file_operations instead. This will allow moving the block device special cases into block/fops.c subsequently. Pass struct file first to ->swap_activate as file operations typically get the file or iocb as first argument and use swap_activate instead of swapfile_activate in all names to be consistent. Note that while the trivial iomap wrappers are moved to a new file when applicable to keep them local to the file operation instances, complex implementations are kept in their existing place. It might be worth moving them in follow-on patches if the maintainers so desire. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: "Darrick J. 
Wong" Acked-by: Chris Li --- .../filesystems/iomap/operations.rst | 3 +- Documentation/filesystems/locking.rst | 35 +++++++------- Documentation/filesystems/vfs.rst | 40 ++++++++-------- fs/btrfs/btrfs_inode.h | 3 ++ fs/btrfs/file.c | 4 ++ fs/btrfs/inode.c | 15 +----- fs/ext4/file.c | 6 +++ fs/ext4/inode.c | 10 ---- fs/f2fs/data.c | 15 +----- fs/f2fs/f2fs.h | 2 + fs/f2fs/file.c | 4 ++ fs/iomap/swapfile.c | 12 ++--- fs/nfs/direct.c | 1 + fs/nfs/file.c | 12 +++-- fs/nfs/nfs4file.c | 3 ++ fs/ntfs/aops.c | 7 --- fs/ntfs/file.c | 6 +++ fs/smb/client/cifsfs.c | 18 ++++++++ fs/smb/client/cifsfs.h | 3 ++ fs/smb/client/file.c | 12 ++--- fs/xfs/xfs_aops.c | 46 ------------------- fs/xfs/xfs_file.c | 45 ++++++++++++++++++ fs/zonefs/file.c | 29 ++++++------ include/linux/fs.h | 10 ++-- include/linux/iomap.h | 6 +-- include/linux/nfs_fs.h | 3 ++ include/linux/swap.h | 2 +- mm/page_io.c | 9 ++-- mm/swapfile.c | 12 ++--- 29 files changed, 187 insertions(+), 186 deletions(-) diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index da982ca7e413..2a78037665b7 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -55,7 +55,6 @@ The following address space operations can be wrapped easily: * ``readahead`` * ``writepages`` * ``bmap`` - * ``swap_activate`` ``struct iomap_write_ops`` -------------------------- @@ -747,7 +746,7 @@ function. Swap File Activation ==================== -The ``iomap_swapfile_activate`` function finds all the base-page aligned +The ``iomap_swap_activate`` function finds all the base-page aligned regions in a file and sets them up as swap space. The file will be ``fsync()``'d before activation. 
``IOMAP_REPORT`` will be passed as the ``flags`` argument to diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index f3658204d070..e79d72a12273 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -264,9 +264,6 @@ prototypes:: int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_folio)(struct address_space *, struct folio *); - int (*swap_activate)(struct swap_info_struct *sis, struct file *f) - int (*swap_deactivate)(struct file *); - int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); locking rules: All except dirty_folio and free_folio may block @@ -289,9 +286,6 @@ migrate_folio: yes (both) launder_folio: yes is_partially_uptodate: yes error_remove_folio: yes -swap_activate: no -swap_deactivate: no -swap_rw: yes, unlocks ====================== ======================== ========= =============== ->write_begin(), ->write_end() and ->read_folio() may be called from @@ -350,19 +344,6 @@ cleaned, or an error value if not. Note that in order to prevent the folio getting mapped back in and redirtied, it needs to be kept locked across the entire operation. -->swap_activate() will be called to prepare the given file for swap. It -should perform any validation and preparation necessary to ensure that -writes can be performed with minimal memory allocation. It should call -add_swap_extent(), or the helper iomap_swapfile_activate(), and return -the number of extents added. If IO should be submitted through -->swap_rw(), it should set SWP_FS_OPS, otherwise IO will be submitted -directly to the block device ``sis->bdev``. - -->swap_deactivate() will be called in the sys_swapoff() -path after ->swap_activate() returned success. - -->swap_rw will be called for swap IO if SWP_FS_OPS was set by ->swap_activate(). 
- file_lock_operations ==================== @@ -503,6 +484,9 @@ prototypes:: struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); + int (*swap_activate)(struct file *file, struct swap_info_struct *sis); + void (*swap_deactivate)(struct file *); + int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); locking rules: All may block. @@ -555,6 +539,19 @@ used. To block changes to file contents via a memory mapping during the operation, the filesystem must take mapping->invalidate_lock to coordinate with ->page_mkwrite. +->swap_activate() is called to prepare the given file for swap. It should +perform any validation and preparation necessary to ensure that writes can be +performed with minimal memory allocation. It should call add_swap_extent(), +or the helper iomap_swap_activate(), and return the number of extents added. +If IO should be submitted through ->swap_rw(), the file system must set +SWP_FS_OPS from ->swap_activate(), otherwise IO will be submitted directly to +the block device ``sis->bdev``. + +->swap_deactivate() is called from the swapoff path to disable a swapfile +successfully activated using ->swap_activate(). + +->swap_rw will be called for swap IO if SWP_FS_OPS was set by ->swap_activate(). + dquot_operations ================ diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 4092b2149a5d..1624c1ee82d6 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -774,9 +774,6 @@ cache in your filesystem.
The following members are defined: size_t count); void (*is_dirty_writeback)(struct folio *, bool *, bool *); int (*error_remove_folio)(struct mapping *mapping, struct folio *); - int (*swap_activate)(struct swap_info_struct *sis, struct file *f); - int (*swap_deactivate)(struct file *); - int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; ``read_folio`` @@ -970,23 +967,6 @@ cache in your filesystem. The following members are defined: Setting this implies you deal with pages going away under you, unless you have them locked or reference counts increased. -``swap_activate`` - - Called to prepare the given file for swap. It should perform - any validation and preparation necessary to ensure that writes - can be performed with minimal memory allocation. It should call - add_swap_extent(), or the helper iomap_swapfile_activate(), and - return the number of extents added. If IO should be submitted - through ->swap_rw(), it should set SWP_FS_OPS, otherwise IO will - be submitted directly to the block device ``sis->bdev``. - -``swap_deactivate`` - Called during swapoff on files where swap_activate was - successful. - -``swap_rw`` - Called to read or write swap pages when SWP_FS_OPS is set. - The File Object =============== @@ -1046,6 +1026,9 @@ This describes how the VFS can manipulate an open file. As of kernel int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); int (*mmap_prepare)(struct vm_area_desc *); + int (*swap_activate)(struct file *file, struct swap_info_struct *sis); + void (*swap_deactivate)(struct file *); + int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; Again, all methods are called without any locks being held, unless @@ -1175,6 +1158,23 @@ otherwise noted. this can be specified by the vm_area_desc->action field and related parameters. +``swap_activate`` + + Called to prepare the given file for swap.
It should perform + any validation and preparation necessary to ensure that writes + can be performed with minimal memory allocation. It should call + add_swap_extent(), or the helper iomap_swap_activate(), and + return the number of extents added. If IO should be submitted + through ->swap_rw(), it should set SWP_FS_OPS, otherwise IO will + be submitted directly to the block device ``sis->bdev``. + +``swap_deactivate`` + Called during swapoff on files where swap_activate was + successful. + +``swap_rw`` + Called to read or write swap pages when SWP_FS_OPS is set. + Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 55c272fe5d92..f527126882d6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -670,4 +670,7 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, const struct btrfs_file_extent *file_extent, int type); +int btrfs_swap_activate(struct file *file, struct swap_info_struct *sis); +void btrfs_swap_deactivate(struct file *file); + #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index cf1cb5c4db75..165b8da1d7db 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3867,6 +3867,10 @@ const struct file_operations btrfs_file_operations = { .uring_cmd = btrfs_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, .setlease = generic_setlease, +#ifdef CONFIG_SWAP + .swap_activate = btrfs_swap_activate, + .swap_deactivate = btrfs_swap_deactivate, +#endif }; int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 198d87e6f19a..ee0a7947706a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10217,7 +10217,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, return add_swap_extent(sis, next_ppage - first_ppage, 
first_ppage); } -static void btrfs_swap_deactivate(struct file *file) +void btrfs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); @@ -10225,7 +10225,7 @@ static void btrfs_swap_deactivate(struct file *file) atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); } -static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file) +int btrfs_swap_activate(struct file *file, struct swap_info_struct *sis) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -10537,15 +10537,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file) sis->bdev = device->bdev; return ret; } -#else -static void btrfs_swap_deactivate(struct file *file) -{ -} - -static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file) -{ - return -EOPNOTSUPP; -} #endif /* @@ -10692,8 +10683,6 @@ static const struct address_space_operations btrfs_aops = { .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, .error_remove_folio = generic_error_remove_folio, - .swap_activate = btrfs_swap_activate, - .swap_deactivate = btrfs_swap_deactivate, }; static const struct inode_operations btrfs_file_inode_operations = { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index eb1a323962b1..fad3ed05c02a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -971,6 +971,11 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, offset, maxbytes); } +static int ext4_swap_activate(struct file *file, struct swap_info_struct *sis) +{ + return iomap_swap_activate(file, sis, &ext4_iomap_report_ops); +} + const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, @@ -992,6 +997,7 @@ const struct file_operations ext4_file_operations = { FOP_DIO_PARALLEL_WRITE | FOP_DONTCACHE, .setlease = generic_setlease, + .swap_activate = ext4_swap_activate, }; const struct inode_operations ext4_file_inode_operations = { 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ca7bac4a8b4a..efbb2ddad363 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3939,12 +3939,6 @@ static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) return block_dirty_folio(mapping, folio); } -static int ext4_iomap_swap_activate(struct swap_info_struct *sis, - struct file *file) -{ - return iomap_swapfile_activate(sis, file, &ext4_iomap_report_ops); -} - static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, @@ -3958,7 +3952,6 @@ static const struct address_space_operations ext4_aops = { .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { @@ -3974,7 +3967,6 @@ static const struct address_space_operations ext4_journalled_aops = { .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { @@ -3990,14 +3982,12 @@ static const struct address_space_operations ext4_da_aops = { .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, - .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 86fabacc67e6..8bcf630df557 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4338,7 +4338,7 @@ static int check_swap_activate(struct swap_info_struct *sis, return ret; } 
-static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file) +int f2fs_swap_activate(struct file *file, struct swap_info_struct *sis) { struct inode *inode = file_inode(file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -4378,22 +4378,13 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file) return 0; } -static void f2fs_swap_deactivate(struct file *file) +void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } -#else -static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file) -{ - return -EOPNOTSUPP; -} - -static void f2fs_swap_deactivate(struct file *file) -{ -} #endif const struct address_space_operations f2fs_dblock_aops = { @@ -4407,8 +4398,6 @@ const struct address_space_operations f2fs_dblock_aops = { .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, .bmap = f2fs_bmap, - .swap_activate = f2fs_swap_activate, - .swap_deactivate = f2fs_swap_deactivate, }; void f2fs_clear_page_cache_dirty_tag(struct folio *folio) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 91f506e7c9cf..93e9709f26fa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4195,6 +4195,8 @@ int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); +int f2fs_swap_activate(struct file *file, struct swap_info_struct *sis); +void f2fs_swap_deactivate(struct file *file); extern const struct iomap_ops f2fs_iomap_ops; /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fb12c5c9affd..aa91d5fff1cf 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -5488,4 +5488,8 @@ const struct file_operations f2fs_file_operations = { .fadvise = f2fs_file_fadvise, .fop_flags = FOP_BUFFER_RASYNC, .setlease = generic_setlease, +#ifdef CONFIG_SWAP + .swap_activate = 
f2fs_swap_activate, + .swap_deactivate = f2fs_swap_deactivate, +#endif }; diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index f778b2c6c922..cf354fdfb7c3 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -100,10 +100,10 @@ static int iomap_swapfile_iter(struct iomap_iter *iter, * Iterate a swap file's iomaps to construct physical extents that can be * passed to the swapfile subsystem. */ -int iomap_swapfile_activate(struct swap_info_struct *sis, - struct file *swap_file, const struct iomap_ops *ops) +int iomap_swap_activate(struct file *file, struct swap_info_struct *sis, + const struct iomap_ops *ops) { - struct inode *inode = swap_file->f_mapping->host; + struct inode *inode = file->f_mapping->host; struct iomap_iter iter = { .inode = inode, .pos = 0, @@ -112,7 +112,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, }; struct iomap_swapfile_info isi = { .sis = sis, - .file = swap_file, + .file = file, }; int ret; @@ -120,7 +120,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, * Persist all file mapping metadata so that we won't have any * IOMAP_F_DIRTY iomaps. 
*/ - ret = vfs_fsync(swap_file, 1); + ret = vfs_fsync(file, 1); if (ret) return ret; @@ -137,4 +137,4 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, return 0; } -EXPORT_SYMBOL_GPL(iomap_swapfile_activate); +EXPORT_SYMBOL_GPL(iomap_swap_activate); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 48d89716193a..e92a4c8f8f77 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -164,6 +164,7 @@ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) return ret; return 0; } +EXPORT_SYMBOL_GPL(nfs_swap_rw); static void nfs_direct_release_pages(struct page **pages, unsigned int npages) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 74b401aa2b3a..2bc55d9d71e1 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -567,7 +567,7 @@ static int nfs_launder_folio(struct folio *folio) return ret; } -static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file) +int nfs_swap_activate(struct file *file, struct swap_info_struct *sis) { unsigned long blocks; long long isize; @@ -600,8 +600,9 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file) sis->flags |= SWP_FS_OPS; return 0; } +EXPORT_SYMBOL_GPL(nfs_swap_activate); -static void nfs_swap_deactivate(struct file *file) +void nfs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); struct rpc_clnt *clnt = NFS_CLIENT(inode); @@ -611,6 +612,7 @@ static void nfs_swap_deactivate(struct file *file) if (cl->rpc_ops->disable_swap) cl->rpc_ops->disable_swap(file_inode(file)); } +EXPORT_SYMBOL_GPL(nfs_swap_deactivate); const struct address_space_operations nfs_file_aops = { .read_folio = nfs_read_folio, @@ -625,9 +627,6 @@ const struct address_space_operations nfs_file_aops = { .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_folio = generic_error_remove_folio, - .swap_activate = nfs_swap_activate, - .swap_deactivate = nfs_swap_deactivate, - .swap_rw = nfs_swap_rw, }; /* @@ -960,6 +959,9 @@ const struct 
file_operations nfs_file_operations = { .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, + .swap_activate = nfs_swap_activate, + .swap_deactivate = nfs_swap_deactivate, + .swap_rw = nfs_swap_rw, .fop_flags = FOP_DONTCACHE, }; EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index be40e126c539..eb1a8dbab55a 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -455,5 +455,8 @@ const struct file_operations nfs4_file_operations = { #else .llseek = nfs_file_llseek, #endif + .swap_activate = nfs_swap_activate, + .swap_deactivate = nfs_swap_deactivate, + .swap_rw = nfs_swap_rw, .fop_flags = FOP_DONTCACHE, }; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 4b7d019bc6ed..a94f5f675790 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -270,12 +270,6 @@ static int ntfs_writepages(struct address_space *mapping, return iomap_writepages(&wpc); } -static int ntfs_swap_activate(struct swap_info_struct *sis, - struct file *swap_file) -{ - return iomap_swapfile_activate(sis, swap_file, &ntfs_read_iomap_ops); -} - const struct address_space_operations ntfs_aops = { .read_folio = ntfs_read_folio, .readahead = ntfs_readahead, @@ -287,7 +281,6 @@ const struct address_space_operations ntfs_aops = { .error_remove_folio = generic_error_remove_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, - .swap_activate = ntfs_swap_activate, }; const struct address_space_operations ntfs_mft_aops = { diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index e8bea22b81a7..0dcf8479362a 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1114,6 +1114,11 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t offset, loff_t le return err; } +static int ntfs_swap_activate(struct file *file, struct swap_info_struct *sis) +{ + return iomap_swap_activate(file, sis, &ntfs_read_iomap_ops); +} + const struct file_operations ntfs_file_ops = { .llseek = 
ntfs_file_llseek, .read_iter = ntfs_file_read_iter, @@ -1130,6 +1135,7 @@ const struct file_operations ntfs_file_ops = { #endif .fallocate = ntfs_fallocate, .setlease = generic_setlease, + .swap_activate = ntfs_swap_activate, }; const struct inode_operations ntfs_file_inode_ops = { diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 9f76b0347fa9..f0d8a3a46074 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1577,6 +1577,9 @@ const struct file_operations cifs_file_ops = { .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = cifs_swap_activate, + .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_file_strict_ops = { @@ -1597,6 +1600,9 @@ const struct file_operations cifs_file_strict_ops = { .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = cifs_swap_activate, + .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_file_direct_ops = { @@ -1617,6 +1623,9 @@ const struct file_operations cifs_file_direct_ops = { .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = cifs_swap_activate, + .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_file_nobrl_ops = { @@ -1635,6 +1644,9 @@ const struct file_operations cifs_file_nobrl_ops = { .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = cifs_swap_activate, + .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_file_strict_nobrl_ops = { @@ -1653,6 +1665,9 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = 
cifs_swap_activate, + .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_file_direct_nobrl_ops = { @@ -1671,6 +1686,9 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = cifs_swap_activate, + .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_dir_ops = { diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index c455b15f2778..1e5b9fce84f9 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -115,6 +115,9 @@ int cifs_file_mmap_prepare(struct vm_area_desc *desc); int cifs_file_strict_mmap_prepare(struct vm_area_desc *desc); extern const struct file_operations cifs_dir_ops; int cifs_readdir(struct file *file, struct dir_context *ctx); +int cifs_swap_activate(struct file *swap_file, struct swap_info_struct *sis); +void cifs_swap_deactivate(struct file *file); +int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter); /* Functions related to dir entries */ extern const struct dentry_operations cifs_dentry_ops; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 11d4655ef490..84459f87907e 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -3286,8 +3286,7 @@ void cifs_oplock_break(struct work_struct *work) cifs_done_oplock_break(cinode); } -static int cifs_swap_activate(struct swap_info_struct *sis, - struct file *swap_file) +int cifs_swap_activate(struct file *swap_file, struct swap_info_struct *sis) { struct cifsFileInfo *cfile = swap_file->private_data; struct inode *inode = swap_file->f_mapping->host; @@ -3296,7 +3295,7 @@ static int cifs_swap_activate(struct swap_info_struct *sis, cifs_dbg(FYI, "swap activate\n"); - if (!swap_file->f_mapping->a_ops->swap_rw) + if (!swap_file->f_op->swap_rw) /* Cannot support swap */ return -EINVAL; @@ -3331,7 +3330,7 @@ static int cifs_swap_activate(struct 
swap_info_struct *sis, return add_swap_extent(sis, sis->max, 0); } -static void cifs_swap_deactivate(struct file *file) +void cifs_swap_deactivate(struct file *file) { struct cifsFileInfo *cfile = file->private_data; @@ -3352,7 +3351,7 @@ static void cifs_swap_deactivate(struct file *file) * * Perform IO to the swap-file. This is much like direct IO. */ -static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) +int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { ssize_t ret; @@ -3378,9 +3377,6 @@ const struct address_space_operations cifs_addr_ops = { * TODO: investigate and if useful we could add an is_dirty_writeback * helper if needed */ - .swap_activate = cifs_swap_activate, - .swap_deactivate = cifs_swap_deactivate, - .swap_rw = cifs_swap_rw, }; /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 1e8662e0e7cd..7488fc6a7b78 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -796,50 +796,6 @@ xfs_vm_readahead( iomap_readahead(&xfs_read_iomap_ops, &ctx, NULL); } -static int -xfs_vm_swap_activate( - struct swap_info_struct *sis, - struct file *swap_file) -{ - struct xfs_inode *ip = XFS_I(file_inode(swap_file)); - - if (xfs_is_zoned_inode(ip)) - return -EINVAL; - - /* - * Swap file activation can race against concurrent shared extent - * removal in files that have been cloned. If this happens, - * iomap_swapfile_iter() can fail because it encountered a shared - * extent even though an operation is in progress to remove those - * shared extents. - * - * This race becomes problematic when we defer extent removal - * operations beyond the end of a syscall (i.e. use async background - * processing algorithms). Users think the extents are no longer - * shared, but iomap_swapfile_iter() still sees them as shared - * because the refcountbt entries for the extents being removed have - * not yet been updated. Hence the swapon call fails unexpectedly. 
- * - * The race condition is currently most obvious from the unlink() - * operation as extent removal is deferred until after the last - * reference to the inode goes away. We then process the extent - * removal asynchronously, hence triggers the "syscall completed but - * work not done" condition mentioned above. To close this race - * window, we need to flush any pending inodegc operations to ensure - * they have updated the refcountbt records before we try to map the - * swapfile. - */ - xfs_inodegc_flush(ip->i_mount); - - /* - * Direct the swap code to the correct block device when this file - * sits on the RT device. - */ - sis->bdev = xfs_inode_buftarg(ip)->bt_bdev; - - return iomap_swapfile_activate(sis, swap_file, &xfs_read_iomap_ops); -} - const struct address_space_operations xfs_address_space_operations = { .read_folio = xfs_vm_read_folio, .readahead = xfs_vm_readahead, @@ -851,11 +807,9 @@ const struct address_space_operations xfs_address_space_operations = { .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = xfs_vm_swap_activate, }; const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .dirty_folio = noop_dirty_folio, - .swap_activate = xfs_vm_swap_activate, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 845a97c9b063..41f7e19bd31f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -2081,6 +2081,50 @@ xfs_file_mmap_prepare( return 0; } +static int +xfs_file_swap_activate( + struct file *file, + struct swap_info_struct *sis) +{ + struct xfs_inode *ip = XFS_I(file_inode(file)); + + if (xfs_is_zoned_inode(ip)) + return -EINVAL; + + /* + * Swap file activation can race against concurrent shared extent + * removal in files that have been cloned. 
If this happens, + * iomap_swapfile_iter() can fail because it encountered a shared + * extent even though an operation is in progress to remove those + * shared extents. + * + * This race becomes problematic when we defer extent removal + * operations beyond the end of a syscall (i.e. use async background + * processing algorithms). Users think the extents are no longer + * shared, but iomap_swapfile_iter() still sees them as shared + * because the refcountbt entries for the extents being removed have + * not yet been updated. Hence the swapon call fails unexpectedly. + * + * The race condition is currently most obvious from the unlink() + * operation as extent removal is deferred until after the last + * reference to the inode goes away. We then process the extent + * removal asynchronously, hence triggers the "syscall completed but + * work not done" condition mentioned above. To close this race + * window, we need to flush any pending inodegc operations to ensure + * they have updated the refcountbt records before we try to map the + * swapfile. + */ + xfs_inodegc_flush(ip->i_mount); + + /* + * Direct the swap code to the correct block device when this file + * sits on the RT device. 
+ */ + sis->bdev = xfs_inode_buftarg(ip)->bt_bdev; + + return iomap_swap_activate(file, sis, &xfs_read_iomap_ops); +} + const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read_iter = xfs_file_read_iter, @@ -2104,6 +2148,7 @@ const struct file_operations xfs_file_operations = { FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE | FOP_DONTCACHE, .setlease = generic_setlease, + .swap_activate = xfs_file_swap_activate, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 214e4bf8e30a..2c817917a13d 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -167,20 +167,6 @@ static int zonefs_writepages(struct address_space *mapping, return iomap_writepages(&wpc); } -static int zonefs_swap_activate(struct swap_info_struct *sis, - struct file *swap_file) -{ - struct inode *inode = file_inode(swap_file); - - if (zonefs_inode_is_seq(inode)) { - zonefs_err(inode->i_sb, - "swap file: not a conventional zone file\n"); - return -EINVAL; - } - - return iomap_swapfile_activate(sis, swap_file, &zonefs_read_iomap_ops); -} - const struct address_space_operations zonefs_file_aops = { .read_folio = zonefs_read_folio, .readahead = zonefs_readahead, @@ -191,7 +177,6 @@ const struct address_space_operations zonefs_file_aops = { .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = zonefs_swap_activate, }; int zonefs_file_truncate(struct inode *inode, loff_t isize) @@ -858,6 +843,19 @@ static int zonefs_file_release(struct inode *inode, struct file *file) return 0; } +static int zonefs_swap_activate(struct file *file, struct swap_info_struct *sis) +{ + struct inode *inode = file_inode(file); + + if (zonefs_inode_is_seq(inode)) { + zonefs_err(inode->i_sb, + "swap file: not a conventional zone file\n"); + return -EINVAL; + } + + return iomap_swap_activate(file, sis, &zonefs_read_iomap_ops); +} + const 
struct file_operations zonefs_file_operations = { .open = zonefs_file_open, .release = zonefs_file_release, @@ -869,4 +867,5 @@ const struct file_operations zonefs_file_operations = { .splice_read = zonefs_file_splice_read, .splice_write = iter_file_splice_write, .iopoll = iocb_bio_iopoll, + .swap_activate = zonefs_swap_activate, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index b8b6f7a38f4d..7564cef5405d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -433,11 +433,6 @@ struct address_space_operations { size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); int (*error_remove_folio)(struct address_space *, struct folio *); - - /* swapfile support */ - int (*swap_activate)(struct swap_info_struct *sis, struct file *file); - void (*swap_deactivate)(struct file *file); - int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; extern const struct address_space_operations empty_aops; @@ -1966,6 +1961,11 @@ struct file_operations { int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); int (*mmap_prepare)(struct vm_area_desc *); + + /* swapfile support */ + int (*swap_activate)(struct file *file, struct swap_info_struct *sis); + void (*swap_deactivate)(struct file *file); + int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); } __randomize_layout; /* Supports async buffered reads */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d82126e3d086..3fd582d375b6 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -603,10 +603,10 @@ void iomap_dio_bio_end_io(struct bio *bio); struct file; struct swap_info_struct; -int iomap_swapfile_activate(struct swap_info_struct *sis, - struct file *swap_file, const struct iomap_ops *ops); +int iomap_swap_activate(struct file *file, struct swap_info_struct *sis, + const struct iomap_ops *ops); #else -# define iomap_swapfile_activate(sis, swapfile, ops) (-EIO) +# define iomap_swap_activate(file, sis, ops) 
(-EIO) #endif /* CONFIG_SWAP */ extern struct bio_set iomap_ioend_bioset; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4623262da3c0..9746212a085e 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -538,6 +538,9 @@ extern __be32 root_nfs_parse_addr(char *name); /*__init*/ /* * linux/fs/nfs/file.c */ +int nfs_swap_activate(struct file *file, struct swap_info_struct *sis); +void nfs_swap_deactivate(struct file *file); + extern const struct file_operations nfs_file_operations; #if IS_ENABLED(CONFIG_NFS_V4) extern const struct file_operations nfs4_file_operations; diff --git a/include/linux/swap.h b/include/linux/swap.h index b8dfe2c6bc98..657779485ae4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -405,7 +405,7 @@ extern void __meminit kswapd_stop(int nid); int add_swap_extent(struct swap_info_struct *sis, unsigned long nr_pages, sector_t start_block); -int generic_swapfile_activate(struct swap_info_struct *, struct file *); +int generic_swap_activate(struct file *swap_file, struct swap_info_struct *sis); static inline unsigned long total_swapcache_pages(void) { diff --git a/mm/page_io.c b/mm/page_io.c index f30f36ec1ed0..3e1c12649448 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -75,8 +75,7 @@ static void end_swap_bio_read(struct bio *bio) bio_put(bio); } -int generic_swapfile_activate(struct swap_info_struct *sis, - struct file *swap_file) +int generic_swap_activate(struct file *swap_file, struct swap_info_struct *sis) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; @@ -451,11 +450,10 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) void swap_write_unplug(struct swap_iocb *sio) { struct iov_iter from; - struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len); - ret = mapping->a_ops->swap_rw(&sio->iocb, &from); + ret = 
sio->iocb.ki_filp->f_op->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_write_complete(&sio->iocb, ret); } @@ -640,11 +638,10 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) void __swap_read_unplug(struct swap_iocb *sio) { struct iov_iter from; - struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len); - ret = mapping->a_ops->swap_rw(&sio->iocb, &from); + ret = sio->iocb.ki_filp->f_op->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_read_complete(&sio->iocb, ret); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 158620fd2978..a183c9c95695 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2692,11 +2692,9 @@ static void destroy_swap_extents(struct swap_info_struct *sis, } if (sis->flags & SWP_ACTIVATED) { - struct address_space *mapping = swap_file->f_mapping; - sis->flags &= ~SWP_ACTIVATED; - if (mapping->a_ops->swap_deactivate) - mapping->a_ops->swap_deactivate(swap_file); + if (swap_file->f_op->swap_deactivate) + swap_file->f_op->swap_deactivate(swap_file); } } @@ -2790,8 +2788,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, if (S_ISBLK(inode->i_mode)) return add_swap_extent(sis, sis->max, 0); - if (mapping->a_ops->swap_activate) { - ret = mapping->a_ops->swap_activate(sis, swap_file); + if (swap_file->f_op->swap_activate) { + ret = swap_file->f_op->swap_activate(swap_file, sis); if (ret < 0) return ret; sis->flags |= SWP_ACTIVATED; @@ -2803,7 +2801,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, return ret; } - return generic_swapfile_activate(sis, swap_file); + return generic_swap_activate(swap_file, sis); } static void _enable_swap_info(struct swap_info_struct *si) From 9b1a6a55a13b442312c5f62c63b80bb188eff133 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:20 +0200 Subject: [PATCH 04/12] swap: restrict to regular files or block devices Various swap code assumes it runs either 
on a block device or on a regular file. Make this restriction explicit using checks right after opening the file. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: "Darrick J. Wong" Acked-by: Chris Li --- mm/swapfile.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index a183c9c95695..651c1b59ff9f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3515,6 +3515,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOENT; goto bad_swap_unlock_inode; } + if (!S_ISBLK(inode->i_mode) && !S_ISREG(inode->i_mode)) { + error = -EINVAL; + goto bad_swap_unlock_inode; + } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; From 31d49a8ada687bd4e902dbfd4d6144f8b6c432da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:21 +0200 Subject: [PATCH 05/12] swap: cleanup setup_swap_extents Reflow setup_swap_extents so that the flag checking is not conditional on a swap_activate method. This is currently a no-op because the swapoff code still checks the presence of a swap_deactivate method, but it simplifies adding a new check, and also makes the SWP_ACTIVATED flag more consistent. 
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Acked-by: Chris Li --- mm/swapfile.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 651c1b59ff9f..1b7fc03612f4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2783,25 +2783,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; - int ret; + int ret, error = 0; if (S_ISBLK(inode->i_mode)) return add_swap_extent(sis, sis->max, 0); - if (swap_file->f_op->swap_activate) { + if (swap_file->f_op->swap_activate) ret = swap_file->f_op->swap_activate(swap_file, sis); - if (ret < 0) - return ret; - sis->flags |= SWP_ACTIVATED; - if ((sis->flags & SWP_FS_OPS) && - sio_pool_init() != 0) { - destroy_swap_extents(sis, swap_file); - return -ENOMEM; - } + else + ret = generic_swap_activate(swap_file, sis); + if (ret < 0) return ret; - } - return generic_swap_activate(swap_file, sis); + sis->flags |= SWP_ACTIVATED; + if (sis->flags & SWP_FS_OPS) + error = sio_pool_init(); + if (error) + destroy_swap_extents(sis, swap_file); + return error; } static void _enable_swap_info(struct swap_info_struct *si) From 39a95ad9cfcdbbaa254b41bd1a261270a959b8e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:22 +0200 Subject: [PATCH 06/12] swap,block: move the block device swapon code into block/fops.c Make use of the abstractions we have. This is a preparation for moving more special casing down into block/. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: "Darrick J. 
Wong" Acked-by: Chris Li --- block/fops.c | 6 ++++++ mm/swapfile.c | 5 ----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/block/fops.c b/block/fops.c index bb6642b45937..453141801684 100644 --- a/block/fops.c +++ b/block/fops.c @@ -949,6 +949,11 @@ static int blkdev_mmap_prepare(struct vm_area_desc *desc) return generic_file_mmap_prepare(desc); } +static int blkdev_swap_activate(struct file *file, struct swap_info_struct *sis) +{ + return add_swap_extent(sis, sis->max, 0); +} + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_release, @@ -965,6 +970,7 @@ const struct file_operations def_blk_fops = { .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, + .swap_activate = blkdev_swap_activate, .uring_cmd = blkdev_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC, }; diff --git a/mm/swapfile.c b/mm/swapfile.c index 1b7fc03612f4..fbf11c8c5c69 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2781,13 +2781,8 @@ EXPORT_SYMBOL_GPL(add_swap_extent); static int setup_swap_extents(struct swap_info_struct *sis, struct file *swap_file) { - struct address_space *mapping = swap_file->f_mapping; - struct inode *inode = mapping->host; int ret, error = 0; - if (S_ISBLK(inode->i_mode)) - return add_swap_extent(sis, sis->max, 0); - if (swap_file->f_op->swap_activate) ret = swap_file->f_op->swap_activate(swap_file, sis); else From 8708fb9e291d325b6d60aef4aeecc17fec071a2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:23 +0200 Subject: [PATCH 07/12] swap,block: limit swap file size to device size Don't blindly pass the value from the swap header to add_swap_extent, but instead pass the device size rounded down to page granularity. This activates the sanity checking in the core code that catches a too large value in the swap header. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: "Darrick J. 
Wong" --- block/fops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/fops.c b/block/fops.c index 453141801684..067e46299666 100644 --- a/block/fops.c +++ b/block/fops.c @@ -951,7 +951,9 @@ static int blkdev_mmap_prepare(struct vm_area_desc *desc) static int blkdev_swap_activate(struct file *file, struct swap_info_struct *sis) { - return add_swap_extent(sis, sis->max, 0); + loff_t isize = i_size_read(bdev_file_inode(file)); + + return add_swap_extent(sis, div_u64(isize, PAGE_SIZE), 0); } const struct file_operations def_blk_fops = { From ca5c4c76b81d46c64f5ca2c460deac5dcc04b437 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:24 +0200 Subject: [PATCH 08/12] swap,iomap: simplify iomap_swapfile_iter add_swap_extent already coalesces multiple extents, no need to duplicate that in the caller. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal --- fs/iomap/swapfile.c | 104 +++++++++++++------------------------------- 1 file changed, 31 insertions(+), 73 deletions(-) diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index cf354fdfb7c3..a4e0ca462cc4 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -6,57 +6,32 @@ #include #include -/* Swapfile activation */ - -struct iomap_swapfile_info { - struct iomap iomap; /* accumulated iomap */ - struct swap_info_struct *sis; - unsigned long nr_pages; /* number of pages collected */ - struct file *file; -}; - -/* - * Collect physical extents for this swap file. Physical extents reported to - * the swap code must be trimmed to align to a page boundary. The logical - * offset within the file is irrelevant since the swapfile code maps logical - * page numbers of the swap device to the physical page-aligned extents. 
- */ -static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) -{ - struct iomap *iomap = &isi->iomap; - uint64_t first_ppage; - uint64_t next_ppage; - - /* - * Round the start up and the end down so that the physical - * extent aligns to a page boundary. - */ - first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; - next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> - PAGE_SHIFT; - return add_swap_extent(isi->sis, next_ppage - first_ppage, first_ppage); -} - -static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) +static int iomap_swapfile_fail(struct file *file, const char *str) { char *buf, *p = ERR_PTR(-ENOMEM); buf = kmalloc(PATH_MAX, GFP_KERNEL); if (buf) - p = file_path(isi->file, buf, PATH_MAX); + p = file_path(file, buf, PATH_MAX); pr_err("swapon: file %s %s\n", IS_ERR(p) ? "" : p, str); kfree(buf); return -EINVAL; } /* - * Accumulate iomaps for this swap file. We have to accumulate iomaps because - * swap only cares about contiguous page-aligned physical extents and makes no - * distinction between written and unwritten extents. + * Report physical extents for this swap file. Physical extents reported to the + * swap code must be trimmed to align to a page boundary. The logical offset + * within the file is irrelevant since the swapfile code maps logical page + * numbers of the swap device to the physical page-aligned extents. */ -static int iomap_swapfile_iter(struct iomap_iter *iter, - struct iomap *iomap, struct iomap_swapfile_info *isi) +static int iomap_swapfile_iter(struct iomap_iter *iter, struct file *file, + struct swap_info_struct *sis) { + struct iomap *iomap = &iter->iomap; + uint64_t first_ppage; + uint64_t next_ppage; + int error; + switch (iomap->type) { case IOMAP_MAPPED: case IOMAP_UNWRITTEN: @@ -64,35 +39,31 @@ static int iomap_swapfile_iter(struct iomap_iter *iter, break; case IOMAP_INLINE: /* No inline data. 
*/ - return iomap_swapfile_fail(isi, "is inline"); + return iomap_swapfile_fail(file, "is inline"); default: - return iomap_swapfile_fail(isi, "has unallocated extents"); + return iomap_swapfile_fail(file, "has unallocated extents"); } /* No uncommitted metadata or shared blocks. */ if (iomap->flags & IOMAP_F_DIRTY) - return iomap_swapfile_fail(isi, "is not committed"); + return iomap_swapfile_fail(file, "is not committed"); if (iomap->flags & IOMAP_F_SHARED) - return iomap_swapfile_fail(isi, "has shared extents"); + return iomap_swapfile_fail(file, "has shared extents"); /* Only one bdev per swap file. */ - if (iomap->bdev != isi->sis->bdev) - return iomap_swapfile_fail(isi, "outside the main device"); - - if (isi->iomap.length == 0) { - /* No accumulated extent, so just store it. */ - memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); - } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) { - /* Append this to the accumulated extent. */ - isi->iomap.length += iomap->length; - } else { - /* Otherwise, add the retained iomap and store this one. */ - int error = iomap_swapfile_add_extent(isi); - if (error) - return error; - memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); - } + if (iomap->bdev != sis->bdev) + return iomap_swapfile_fail(file, "outside the main device"); + /* + * Round the start up and the end down so that the physical extent + * aligns to a page boundary. 
+ */ + first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> + PAGE_SHIFT; + error = add_swap_extent(sis, next_ppage - first_ppage, first_ppage); + if (error) + return error; return iomap_iter_advance_full(iter); } @@ -110,10 +81,6 @@ int iomap_swap_activate(struct file *file, struct swap_info_struct *sis, .len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE), .flags = IOMAP_REPORT, }; - struct iomap_swapfile_info isi = { - .sis = sis, - .file = file, - }; int ret; /* @@ -125,16 +92,7 @@ int iomap_swap_activate(struct file *file, struct swap_info_struct *sis, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi); - if (ret < 0) - return ret; - - if (isi.iomap.length) { - ret = iomap_swapfile_add_extent(&isi); - if (ret) - return ret; - } - - return 0; + iter.status = iomap_swapfile_iter(&iter, file, sis); + return ret; } EXPORT_SYMBOL_GPL(iomap_swap_activate); From e4852636c2e05ae29c8dc5cc5a65a6115e790745 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:25 +0200 Subject: [PATCH 09/12] swap: push down setting sis->bdev into ->swap_activate Only the file operation method knows what block device we'll swap to. So move setting sis->bdev and the zoned block device check down into ->swap_activate; the SWP_BLKDEV flag is still set by swapon itself. 
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal --- block/fops.c | 9 ++++++++- fs/btrfs/inode.c | 7 ++++--- fs/f2fs/data.c | 3 ++- fs/iomap/swapfile.c | 7 ++----- fs/nfs/file.c | 2 +- fs/smb/client/file.c | 2 +- fs/xfs/xfs_file.c | 6 ------ include/linux/swap.h | 4 ++-- mm/page_io.c | 3 +-- mm/swapfile.c | 38 ++++++++++++-------------------------- 10 files changed, 33 insertions(+), 48 deletions(-) diff --git a/block/fops.c b/block/fops.c index 067e46299666..da09ce3f072f 100644 --- a/block/fops.c +++ b/block/fops.c @@ -951,9 +951,16 @@ static int blkdev_mmap_prepare(struct vm_area_desc *desc) static int blkdev_swap_activate(struct file *file, struct swap_info_struct *sis) { + struct block_device *bdev = I_BDEV(file->f_mapping->host); loff_t isize = i_size_read(bdev_file_inode(file)); - return add_swap_extent(sis, div_u64(isize, PAGE_SIZE), 0); + /* + * The swap code performs arbitrary overwrites, which are not supported + * on zones with sequential write constraints. + */ + if (bdev_is_zoned(bdev)) + return -EINVAL; + return add_swap_extent(sis, div_u64(isize, PAGE_SIZE), bdev, 0); } const struct file_operations def_blk_fops = { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ee0a7947706a..84003c520530 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10201,6 +10201,7 @@ static void btrfs_free_swapfile_pins(struct inode *inode) } struct btrfs_swap_info { + struct btrfs_device *device; u64 start; u64 block_start; u64 block_len; @@ -10214,7 +10215,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; - return add_swap_extent(sis, next_ppage - first_ppage, first_ppage); + return add_swap_extent(sis, next_ppage - first_ppage, bsi->device->bdev, + first_ppage); } void btrfs_swap_deactivate(struct file *file) @@ -10503,6 +10505,7 @@ int btrfs_swap_activate(struct file *file, struct 
swap_info_struct *sis) bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; + bsi.device = device; } if (fatal_signal_pending(current)) { @@ -10533,8 +10536,6 @@ int btrfs_swap_activate(struct file *file, struct swap_info_struct *sis) up_write(&BTRFS_I(inode)->i_mmap_lock); btrfs_free_backref_share_ctx(backref_ctx); btrfs_free_path(path); - if (!ret && device) - sis->bdev = device->bdev; return ret; } #endif diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8bcf630df557..8d116ff517c9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4326,7 +4326,8 @@ static int check_swap_activate(struct swap_info_struct *sis, /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ - ret = add_swap_extent(sis, nr_pblocks, pblock); + ret = add_swap_extent(sis, nr_pblocks, inode->i_sb->s_bdev, + pblock); if (ret < 0) goto out; cur_lblock += nr_pblocks; diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index a4e0ca462cc4..862b4c02a8bd 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -50,10 +50,6 @@ static int iomap_swapfile_iter(struct iomap_iter *iter, struct file *file, if (iomap->flags & IOMAP_F_SHARED) return iomap_swapfile_fail(file, "has shared extents"); - /* Only one bdev per swap file. */ - if (iomap->bdev != sis->bdev) - return iomap_swapfile_fail(file, "outside the main device"); - /* * Round the start up and the end down so that the physical extent * aligns to a page boundary. 
@@ -61,7 +57,8 @@ static int iomap_swapfile_iter(struct iomap_iter *iter, struct file *file, first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> PAGE_SHIFT; - error = add_swap_extent(sis, next_ppage - first_ppage, first_ppage); + error = add_swap_extent(sis, next_ppage - first_ppage, iomap->bdev, + first_ppage); if (error) return error; return iomap_iter_advance_full(iter); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2bc55d9d71e1..10ab2a923835 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -588,7 +588,7 @@ int nfs_swap_activate(struct file *file, struct swap_info_struct *sis) ret = rpc_clnt_swap_activate(clnt); if (ret) return ret; - ret = add_swap_extent(sis, sis->max, 0); + ret = add_swap_extent(sis, sis->max, NULL, 0); if (ret < 0) { rpc_clnt_swap_deactivate(clnt); return ret; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 84459f87907e..e1bbc65ce7f3 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -3327,7 +3327,7 @@ int cifs_swap_activate(struct file *swap_file, struct swap_info_struct *sis) */ sis->flags |= SWP_FS_OPS; - return add_swap_extent(sis, sis->max, 0); + return add_swap_extent(sis, sis->max, NULL, 0); } void cifs_swap_deactivate(struct file *file) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 41f7e19bd31f..74128ebf7161 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -2116,12 +2116,6 @@ xfs_file_swap_activate( */ xfs_inodegc_flush(ip->i_mount); - /* - * Direct the swap code to the correct block device when this file - * sits on the RT device. 
- */ - sis->bdev = xfs_inode_buftarg(ip)->bt_bdev; - return iomap_swap_activate(file, sis, &xfs_read_iomap_ops); } diff --git a/include/linux/swap.h b/include/linux/swap.h index 657779485ae4..b1cbb67ddd8e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -404,7 +404,7 @@ extern void __meminit kswapd_stop(int nid); #ifdef CONFIG_SWAP int add_swap_extent(struct swap_info_struct *sis, unsigned long nr_pages, - sector_t start_block); + struct block_device *bdev, sector_t start_block); int generic_swap_activate(struct file *swap_file, struct swap_info_struct *sis); static inline unsigned long total_swapcache_pages(void) @@ -528,7 +528,7 @@ static inline bool folio_free_swap(struct folio *folio) static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, - sector_t start_block) + struct block_device *bdev, sector_t start_block) { return -EINVAL; } diff --git a/mm/page_io.c b/mm/page_io.c index 3e1c12649448..2ab8994ed1c2 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -132,7 +132,7 @@ int generic_swap_activate(struct file *swap_file, struct swap_info_struct *sis) /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ - ret = add_swap_extent(sis, 1, + ret = add_swap_extent(sis, 1, inode->i_sb->s_bdev, first_block >> (PAGE_SHIFT - blkbits)); if (ret < 0) return ret; @@ -141,7 +141,6 @@ int generic_swap_activate(struct file *swap_file, struct swap_info_struct *sis) continue; } return 0; - bad_bmap: pr_err("swapon: swapfile has holes\n"); return -EINVAL; diff --git a/mm/swapfile.c b/mm/swapfile.c index fbf11c8c5c69..2c9d2af736c4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2707,7 +2707,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis, */ int add_swap_extent(struct swap_info_struct *sis, unsigned long nr_pages, - sector_t start_block) + struct block_device *bdev, sector_t start_block) { struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct 
swap_extent *se; @@ -2718,6 +2718,12 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long nr_pages, return 0; nr_pages = min(nr_pages, sis->max - sis->pages); + /* Only one bdev per swap file for now. */ + if (!sis->bdev) + sis->bdev = bdev; + else if (bdev != sis->bdev) + return -EINVAL; + /* * place the new node at the right most since the * function is called in ascending page order. @@ -2793,6 +2799,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, sis->flags |= SWP_ACTIVATED; if (sis->flags & SWP_FS_OPS) error = sio_pool_init(); + else if (WARN_ON_ONCE(!sis->bdev)) + error = -EINVAL; if (error) destroy_swap_extents(sis, swap_file); return error; @@ -3224,26 +3232,6 @@ static struct swap_info_struct *alloc_swap_info(void) return p; } -static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) -{ - if (S_ISBLK(inode->i_mode)) { - si->bdev = I_BDEV(inode); - /* - * Zoned block devices contain zones that have a sequential - * write only restriction. Hence zoned block devices are not - * suitable for swapping. Disallow them here. - */ - if (bdev_is_zoned(si->bdev)) - return -EINVAL; - si->flags |= SWP_BLKDEV; - } else if (S_ISREG(inode->i_mode)) { - si->bdev = inode->i_sb->s_bdev; - } - - return 0; -} - - /* * Find out how many pages are allowed for a single swap device. 
There * are two limiting factors: @@ -3500,16 +3488,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) dentry = swap_file->f_path.dentry; inode = mapping->host; - error = claim_swapfile(si, inode); - if (unlikely(error)) - goto bad_swap; - inode_lock(inode); if (d_unlinked(dentry) || cant_mount(dentry)) { error = -ENOENT; goto bad_swap_unlock_inode; } - if (!S_ISBLK(inode->i_mode) && !S_ISREG(inode->i_mode)) { + if (S_ISBLK(inode->i_mode)) { + si->flags |= SWP_BLKDEV; + } else if (!S_ISREG(inode->i_mode)) { error = -EINVAL; goto bad_swap_unlock_inode; } From 295326b0ce8b857a6a1fe6c4dda9eea8d4ed8afa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:26 +0200 Subject: [PATCH 10/12] swap: add a swap_activate_fs_ops helper Add a helper abstracting away the low-level details of enabling fs_ops-based swapping. This prepares for taking swap_info_struct private. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: "Darrick J. 
Wong" Acked-by: Chris Li --- fs/nfs/file.c | 4 +--- fs/smb/client/file.c | 3 +-- include/linux/swap.h | 5 +++++ mm/swapfile.c | 7 +++++++ 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 10ab2a923835..ce4d860c4e7a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -588,7 +588,7 @@ int nfs_swap_activate(struct file *file, struct swap_info_struct *sis) ret = rpc_clnt_swap_activate(clnt); if (ret) return ret; - ret = add_swap_extent(sis, sis->max, NULL, 0); + ret = swap_activate_fs_ops(sis); if (ret < 0) { rpc_clnt_swap_deactivate(clnt); return ret; @@ -596,8 +596,6 @@ int nfs_swap_activate(struct file *file, struct swap_info_struct *sis) if (cl->rpc_ops->enable_swap) cl->rpc_ops->enable_swap(inode); - - sis->flags |= SWP_FS_OPS; return 0; } EXPORT_SYMBOL_GPL(nfs_swap_activate); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index e1bbc65ce7f3..e11065be1e64 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -3326,8 +3326,7 @@ int cifs_swap_activate(struct file *swap_file, struct swap_info_struct *sis) * from reading or writing the file */ - sis->flags |= SWP_FS_OPS; - return add_swap_extent(sis, sis->max, NULL, 0); + return swap_activate_fs_ops(sis); } void cifs_swap_deactivate(struct file *file) diff --git a/include/linux/swap.h b/include/linux/swap.h index b1cbb67ddd8e..916889738f08 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -406,6 +406,7 @@ extern void __meminit kswapd_stop(int nid); int add_swap_extent(struct swap_info_struct *sis, unsigned long nr_pages, struct block_device *bdev, sector_t start_block); int generic_swap_activate(struct file *swap_file, struct swap_info_struct *sis); +int swap_activate_fs_ops(struct swap_info_struct *sis); static inline unsigned long total_swapcache_pages(void) { @@ -532,6 +533,10 @@ static inline int add_swap_extent(struct swap_info_struct *sis, { return -EINVAL; } +static inline int swap_activate_fs_ops(struct swap_info_struct *sis) +{ + 
return -EINVAL; +} #endif /* CONFIG_SWAP */ #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) diff --git a/mm/swapfile.c b/mm/swapfile.c index 2c9d2af736c4..26852c2ad36e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2757,6 +2757,13 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long nr_pages, } EXPORT_SYMBOL_GPL(add_swap_extent); +int swap_activate_fs_ops(struct swap_info_struct *sis) +{ + sis->flags |= SWP_FS_OPS; + return add_swap_extent(sis, sis->max, NULL, 0); +} +EXPORT_SYMBOL_GPL(swap_activate_fs_ops); + /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is From 8be1bd92107ea4c24cb9bd34f2d01dbcfdcdf9fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:27 +0200 Subject: [PATCH 11/12] swap: move struct swap_extent to swapfile.c struct swap_extent is only used inside of mm/swapfile.c, so move it there. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: "Darrick J. Wong" Acked-by: Chris Li --- include/linux/swap.h | 15 --------------- mm/swapfile.c | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 916889738f08..95237ee065c2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -178,21 +178,6 @@ struct sysinfo; struct writeback_control; struct zone; -/* - * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of - * disk blocks. A rbtree of swap extents maps the entire swapfile (Where the - * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart - * from setup, they're handled identically. - * - * We always assume that blocks are of size PAGE_SIZE. - */ -struct swap_extent { - struct rb_node rb_node; - pgoff_t start_page; - pgoff_t nr_pages; - sector_t start_block; -}; - /* * Max bad pages in the new format.. 
*/ diff --git a/mm/swapfile.c b/mm/swapfile.c index 26852c2ad36e..c0479533f9ef 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -260,6 +260,21 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, return ret; } +/* + * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of + * disk blocks. A rbtree of swap extents maps the entire swapfile (Where the + * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart + * from setup, they're handled identically. + * + * We always assume that blocks are of size PAGE_SIZE. + */ +struct swap_extent { + struct rb_node rb_node; + pgoff_t start_page; + pgoff_t nr_pages; + sector_t start_block; +}; + static inline struct swap_extent *first_se(struct swap_info_struct *sis) { struct rb_node *rb = rb_first(&sis->swap_extent_root); From 31e57ca61944ae49d8be255d04d9ebbe00a82de2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 May 2026 07:35:28 +0200 Subject: [PATCH 12/12] swap: move swap_info_struct to mm/swap.h swap_info_struct is now internal to the MM subsystem, so remove it from the public header. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: "Darrick J. Wong" Acked-by: Chris Li --- include/linux/swap.h | 98 +------------------------------------------- mm/swap.h | 92 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 96 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 95237ee065c2..31eef9b74949 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -16,9 +16,9 @@ #include #include -struct notifier_block; - struct bio; +struct notifier_block; +struct swap_info_struct; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff @@ -178,29 +178,6 @@ struct sysinfo; struct writeback_control; struct zone; -/* - * Max bad pages in the new format.. 
- */ -#define MAX_SWAP_BADPAGES \ - ((offsetof(union swap_header, magic.magic) - \ - offsetof(union swap_header, info.badpages)) / sizeof(int)) - -enum { - SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ - SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ - SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ - SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ - SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ - SWP_BLKDEV = (1 << 6), /* its a block device */ - SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ - SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ - SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ - SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ - SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ - SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ - /* add others here before... */ -}; - #define SWAP_CLUSTER_MAX 32UL #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX @@ -219,56 +196,6 @@ enum { #define SWAP_NR_ORDERS 1 #endif -/* - * We keep using same cluster for rotational device so IO will be sequential. - * The purpose is to optimize SWAP throughput on these device. - */ -struct swap_sequential_cluster { - unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ -}; - -/* - * The in-memory structure used to track swap areas. - */ -struct swap_info_struct { - struct percpu_ref users; /* indicate and keep swap device valid. */ - unsigned long flags; /* SWP_USED etc: see above */ - signed short prio; /* swap priority of this type */ - struct plist_node list; /* entry in swap_active_head */ - signed char type; /* strange name for an index */ - unsigned int max; /* size of this swap device */ - unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ - struct swap_cluster_info *cluster_info; /* cluster info. 
Only for SSD */ - struct list_head free_clusters; /* free clusters list */ - struct list_head full_clusters; /* full clusters list */ - struct list_head nonfull_clusters[SWAP_NR_ORDERS]; - /* list of cluster that contains at least one free slot */ - struct list_head frag_clusters[SWAP_NR_ORDERS]; - /* list of cluster that are fragmented or contented */ - unsigned int pages; /* total of usable pages of swap */ - atomic_long_t inuse_pages; /* number of those currently in use */ - struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ - spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ - struct rb_root swap_extent_root;/* root of the swap extent rbtree */ - struct block_device *bdev; /* swap device or bdev of swap file */ - struct file *swap_file; /* seldom referenced */ - struct completion comp; /* seldom referenced */ - spinlock_t lock; /* - * protect map scan related fields like - * inuse_pages and all cluster lists. - * Other fields are only changed - * at swapon/swapoff, so are protected - * by swap_lock. changing flags need - * hold this lock and swap_lock. If - * both locks need hold, hold swap_lock - * first. 
- */ - struct work_struct discard_work; /* discard worker */ - struct work_struct reclaim_work; /* reclaim worker */ - struct list_head discard_clusters; /* discard clusters list */ - struct plist_node avail_list; /* entry in swap_avail_head */ -}; - static inline swp_entry_t page_swap_entry(struct page *page) { struct folio *folio = page_folio(page); @@ -423,10 +350,7 @@ int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); -extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -struct backing_dev_info; -extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); /* @@ -452,20 +376,7 @@ bool folio_free_swap(struct folio *folio); swp_entry_t swap_alloc_hibernation_slot(int type); void swap_free_hibernation_slot(swp_entry_t entry); -static inline void put_swap_device(struct swap_info_struct *si) -{ - percpu_ref_put(&si->users); -} - #else /* CONFIG_SWAP */ -static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) -{ - return NULL; -} - -static inline void put_swap_device(struct swap_info_struct *si) -{ -} #define get_nr_swap_pages() 0L #define total_swap_pages 0L @@ -497,11 +408,6 @@ static inline int __swap_count(swp_entry_t entry) return 0; } -static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) -{ - return false; -} - static inline int swp_swapcount(swp_entry_t entry) { return 0; diff --git a/mm/swap.h b/mm/swap.h index a77016f2423b..70974495bf15 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -8,6 +8,79 @@ struct swap_iocb; extern int page_cluster; +/* + * We keep using same cluster for rotational device so IO will be sequential. + * The purpose is to optimize SWAP throughput on these device. 
+ */ +struct swap_sequential_cluster { + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ +}; + +/* + * The in-memory structure used to track swap areas. + */ +struct swap_info_struct { + struct percpu_ref users; /* indicate and keep swap device valid. */ + unsigned long flags; /* SWP_USED etc: see below */ + signed short prio; /* swap priority of this type */ + struct plist_node list; /* entry in swap_active_head */ + signed char type; /* strange name for an index */ + unsigned int max; /* size of this swap device */ + unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ + struct list_head free_clusters; /* free clusters list */ + struct list_head full_clusters; /* full clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; + /* list of cluster that are fragmented or contended */ + unsigned int pages; /* total of usable pages of swap */ + atomic_long_t inuse_pages; /* number of those currently in use */ + struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ + spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ + struct rb_root swap_extent_root;/* root of the swap extent rbtree */ + struct block_device *bdev; /* swap device or bdev of swap file */ + struct file *swap_file; /* seldom referenced */ + struct completion comp; /* seldom referenced */ + spinlock_t lock; /* + * protect map scan related fields like + * inuse_pages and all cluster lists. + * Other fields are only changed + * at swapon/swapoff, so are protected + * by swap_lock. changing flags need + * hold this lock and swap_lock. If + * both locks need hold, hold swap_lock + * first. 
+ */ + struct work_struct discard_work; /* discard worker */ + struct work_struct reclaim_work; /* reclaim worker */ + struct list_head discard_clusters; /* discard clusters list */ + struct plist_node avail_list; /* entry in swap_avail_head */ +}; + +/* + * Max bad pages in the new format.. + */ +#define MAX_SWAP_BADPAGES \ + ((offsetof(union swap_header, magic.magic) - \ + offsetof(union swap_header, info.badpages)) / sizeof(int)) + +enum { + SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ + SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ + SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ + SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ + SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ + SWP_BLKDEV = (1 << 6), /* its a block device */ + SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ + SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ + SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ + SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ + SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ + SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ + /* add others here before... 
*/ +}; + #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_order(order) (order) @@ -352,6 +425,13 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) return i; } +bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); +struct swap_info_struct *get_swap_device(swp_entry_t entry); +static inline void put_swap_device(struct swap_info_struct *si) +{ + percpu_ref_put(&si->users); +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline struct swap_cluster_info *swap_cluster_lock( @@ -498,5 +578,17 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { return 0; } +static inline bool swap_entry_swapped(struct swap_info_struct *si, + swp_entry_t entry) +{ + return false; +} +static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) +{ + return NULL; +} +static inline void put_swap_device(struct swap_info_struct *si) +{ +} #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */