diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index da982ca7e4137..2a78037665b79 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -55,7 +55,6 @@ The following address space operations can be wrapped easily: * ``readahead`` * ``writepages`` * ``bmap`` - * ``swap_activate`` ``struct iomap_write_ops`` -------------------------- @@ -747,7 +746,7 @@ function. Swap File Activation ==================== -The ``iomap_swapfile_activate`` function finds all the base-page aligned +The ``iomap_swap_activate`` function finds all the base-page aligned regions in a file and sets them up as swap space. The file will be ``fsync()``'d before activation. ``IOMAP_REPORT`` will be passed as the ``flags`` argument to diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 8421ea21bd35e..e79d72a12273f 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -264,9 +264,6 @@ prototypes:: int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_folio)(struct address_space *, struct folio *); - int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) - int (*swap_deactivate)(struct file *); - int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); locking rules: All except dirty_folio and free_folio may block @@ -289,9 +286,6 @@ migrate_folio: yes (both) launder_folio: yes is_partially_uptodate: yes error_remove_folio: yes -swap_activate: no -swap_deactivate: no -swap_rw: yes, unlocks ====================== ======================== ========= =============== ->write_begin(), ->write_end() and ->read_folio() may be called from @@ -350,19 +344,6 @@ cleaned, or an error value if not. 
Note that in order to prevent the folio getting mapped back in and redirtied, it needs to be kept locked across the entire operation. -->swap_activate() will be called to prepare the given file for swap. It -should perform any validation and preparation necessary to ensure that -writes can be performed with minimal memory allocation. It should call -add_swap_extent(), or the helper iomap_swapfile_activate(), and return -the number of extents added. If IO should be submitted through -->swap_rw(), it should set SWP_FS_OPS, otherwise IO will be submitted -directly to the block device ``sis->bdev``. - -->swap_deactivate() will be called in the sys_swapoff() -path after ->swap_activate() returned success. - -->swap_rw will be called for swap IO if SWP_FS_OPS was set by ->swap_activate(). - file_lock_operations ==================== @@ -503,6 +484,9 @@ prototypes:: struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); + int (*swap_activate)(struct file *file, struct swap_info_struct *sis); + void (*swap_deactivate)(struct file *); + int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); locking rules: All may block. @@ -555,6 +539,19 @@ used. To block changes to file contents via a memory mapping during the operation, the filesystem must take mapping->invalidate_lock to coordinate with ->page_mkwrite. +->swap_activate() is called to prepare the given file for swap. It should +perform any validation and preparation necessary to ensure that writes can be +performed with minimal memory allocation. It should call add_swap_extent(), +or the helper iomap_swap_activate(), and return 0 on success or a negative errno. +If IO should be submitted through ->swap_rw(), the file system must set +SWP_FS_OPS from ->swap_activate(), otherwise IO will be submitted directly to +the block device ``sis->bdev``. 
+ +->swap_deactivate() is called from the swapoff path to disable a swapfile +successfully activated using ->swap_activate(). + +->swap_rw will be called for swap IO if SWP_FS_OPS was set by ->swap_activate(). + dquot_operations ================ diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 7c753148af888..1624c1ee82d67 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -774,9 +774,6 @@ cache in your filesystem. The following members are defined: size_t count); void (*is_dirty_writeback)(struct folio *, bool *, bool *); int (*error_remove_folio)(struct mapping *mapping, struct folio *); - int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) - int (*swap_deactivate)(struct file *); - int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; ``read_folio`` @@ -970,23 +967,6 @@ cache in your filesystem. The following members are defined: Setting this implies you deal with pages going away under you, unless you have them locked or reference counts increased. -``swap_activate`` - - Called to prepare the given file for swap. It should perform - any validation and preparation necessary to ensure that writes - can be performed with minimal memory allocation. It should call - add_swap_extent(), or the helper iomap_swapfile_activate(), and - return the number of extents added. If IO should be submitted - through ->swap_rw(), it should set SWP_FS_OPS, otherwise IO will - be submitted directly to the block device ``sis->bdev``. - -``swap_deactivate`` - Called during swapoff on files where swap_activate was - successful. - -``swap_rw`` - Called to read or write swap pages when SWP_FS_OPS is set. - The File Object =============== @@ -1046,6 +1026,9 @@ This describes how the VFS can manipulate an open file. 
As of kernel int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); int (*mmap_prepare)(struct vm_area_desc *); + int (*swap_activate)(struct file *file, struct swap_info_struct *sis); + void (*swap_deactivate)(struct file *); + int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; Again, all methods are called without any locks being held, unless @@ -1175,6 +1158,23 @@ otherwise noted. this can be specified by the vm_area_desc->action field and related parameters. +``swap_activate`` + + Called to prepare the given file for swap. It should perform + any validation and preparation necessary to ensure that writes + can be performed with minimal memory allocation. It should call + add_swap_extent(), or the helper iomap_swap_activate(), and + return 0 on success or a negative errno. If IO should be submitted + through ->swap_rw(), it should set SWP_FS_OPS, otherwise IO will + be submitted directly to the block device ``sis->bdev``. + +``swap_deactivate`` + Called during swapoff on files where swap_activate was + successful. + +``swap_rw`` + Called to read or write swap pages when SWP_FS_OPS is set. + Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special diff --git a/block/fops.c b/block/fops.c index bb6642b45937c..da09ce3f072f5 100644 --- a/block/fops.c +++ b/block/fops.c @@ -949,6 +949,20 @@ static int blkdev_mmap_prepare(struct vm_area_desc *desc) return generic_file_mmap_prepare(desc); } +static int blkdev_swap_activate(struct file *file, struct swap_info_struct *sis) +{ + struct block_device *bdev = I_BDEV(file->f_mapping->host); + loff_t isize = i_size_read(bdev_file_inode(file)); + + /* + * The swap code performs arbitrary overwrites, which are not supported + * on zones with sequential write constraints. 
+ */ + if (bdev_is_zoned(bdev)) + return -EINVAL; + return add_swap_extent(sis, div_u64(isize, PAGE_SIZE), bdev, 0); +} + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_release, @@ -965,6 +979,7 @@ const struct file_operations def_blk_fops = { .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, + .swap_activate = blkdev_swap_activate, .uring_cmd = blkdev_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC, }; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 55c272fe5d92a..f527126882d64 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -670,4 +670,7 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, const struct btrfs_file_extent *file_extent, int type); +int btrfs_swap_activate(struct file *file, struct swap_info_struct *sis); +void btrfs_swap_deactivate(struct file *file); + #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index cf1cb5c4db757..165b8da1d7db4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3867,6 +3867,10 @@ const struct file_operations btrfs_file_operations = { .uring_cmd = btrfs_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, .setlease = generic_setlease, +#ifdef CONFIG_SWAP + .swap_activate = btrfs_swap_activate, + .swap_deactivate = btrfs_swap_deactivate, +#endif }; int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 906d5c21ebc47..84003c5205304 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10201,57 +10201,25 @@ static void btrfs_free_swapfile_pins(struct inode *inode) } struct btrfs_swap_info { + struct btrfs_device *device; u64 start; u64 block_start; u64 block_len; - u64 lowest_ppage; - u64 highest_ppage; - unsigned long nr_pages; - int nr_extents; }; static int btrfs_add_swap_extent(struct swap_info_struct *sis, struct btrfs_swap_info *bsi) { - unsigned long nr_pages; - unsigned 
long max_pages; - u64 first_ppage, first_ppage_reported, next_ppage; - int ret; + u64 first_ppage, next_ppage; - /* - * Our swapfile may have had its size extended after the swap header was - * written. In that case activating the swapfile should not go beyond - * the max size set in the swap header. - */ - if (bsi->nr_pages >= sis->max) - return 0; - - max_pages = sis->max - bsi->nr_pages; first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; - if (first_ppage >= next_ppage) - return 0; - nr_pages = next_ppage - first_ppage; - nr_pages = min(nr_pages, max_pages); - - first_ppage_reported = first_ppage; - if (bsi->start == 0) - first_ppage_reported++; - if (bsi->lowest_ppage > first_ppage_reported) - bsi->lowest_ppage = first_ppage_reported; - if (bsi->highest_ppage < (next_ppage - 1)) - bsi->highest_ppage = next_ppage - 1; - - ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); - if (ret < 0) - return ret; - bsi->nr_extents += ret; - bsi->nr_pages += nr_pages; - return 0; + return add_swap_extent(sis, next_ppage - first_ppage, bsi->device->bdev, + first_ppage); } -static void btrfs_swap_deactivate(struct file *file) +void btrfs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); @@ -10259,8 +10227,7 @@ static void btrfs_swap_deactivate(struct file *file) atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); } -static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) +int btrfs_swap_activate(struct file *file, struct swap_info_struct *sis) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -10269,9 +10236,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct extent_state *cached_state = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; - struct btrfs_swap_info bsi = { - .lowest_ppage = (sector_t)-1ULL, - 
}; + struct btrfs_swap_info bsi = {}; struct btrfs_backref_share_check_ctx *backref_ctx = NULL; struct btrfs_path *path = NULL; int ret = 0; @@ -10540,6 +10505,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; + bsi.device = device; } if (fatal_signal_pending(current)) { @@ -10570,25 +10536,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, up_write(&BTRFS_I(inode)->i_mmap_lock); btrfs_free_backref_share_ctx(backref_ctx); btrfs_free_path(path); - if (ret) - return ret; - - if (device) - sis->bdev = device->bdev; - *span = bsi.highest_ppage - bsi.lowest_ppage + 1; - sis->max = bsi.nr_pages; - sis->pages = bsi.nr_pages - 1; - return bsi.nr_extents; -} -#else -static void btrfs_swap_deactivate(struct file *file) -{ -} - -static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) -{ - return -EOPNOTSUPP; + return ret; } #endif @@ -10736,8 +10684,6 @@ static const struct address_space_operations btrfs_aops = { .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, .error_remove_folio = generic_error_remove_folio, - .swap_activate = btrfs_swap_activate, - .swap_deactivate = btrfs_swap_deactivate, }; static const struct inode_operations btrfs_file_inode_operations = { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index eb1a323962b10..fad3ed05c02a9 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -971,6 +971,11 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, offset, maxbytes); } +static int ext4_swap_activate(struct file *file, struct swap_info_struct *sis) +{ + return iomap_swap_activate(file, sis, &ext4_iomap_report_ops); +} + const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, @@ -992,6 +997,7 @@ const struct file_operations ext4_file_operations = { 
FOP_DIO_PARALLEL_WRITE | FOP_DONTCACHE, .setlease = generic_setlease, + .swap_activate = ext4_swap_activate, }; const struct inode_operations ext4_file_inode_operations = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c2c2d6ac7f3d1..efbb2ddad3630 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3939,13 +3939,6 @@ static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) return block_dirty_folio(mapping, folio); } -static int ext4_iomap_swap_activate(struct swap_info_struct *sis, - struct file *file, sector_t *span) -{ - return iomap_swapfile_activate(sis, file, span, - &ext4_iomap_report_ops); -} - static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, @@ -3959,7 +3952,6 @@ static const struct address_space_operations ext4_aops = { .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { @@ -3975,7 +3967,6 @@ static const struct address_space_operations ext4_journalled_aops = { .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { @@ -3991,14 +3982,12 @@ static const struct address_space_operations ext4_da_aops = { .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, - .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) diff --git a/fs/f2fs/data.c 
b/fs/f2fs/data.c index 8d4f1e75dee3e..8d116ff517c99 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4249,7 +4249,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, } static int check_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) + struct file *swap_file) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; @@ -4257,9 +4257,6 @@ static int check_swap_activate(struct swap_info_struct *sis, block_t cur_lblock; block_t last_lblock; block_t pblock; - block_t lowest_pblock = -1; - block_t highest_pblock = 0; - int nr_extents = 0; unsigned int nr_pblocks; unsigned int blks_per_sec = BLKS_PER_SEC(sbi); unsigned int not_aligned = 0; @@ -4272,7 +4269,7 @@ static int check_swap_activate(struct swap_info_struct *sis, cur_lblock = 0; last_lblock = F2FS_BYTES_TO_BLK(i_size_read(inode)); - while (cur_lblock < last_lblock && cur_lblock < sis->max) { + while (cur_lblock < last_lblock) { struct f2fs_map_blocks map; bool last_extent = false; retry: @@ -4307,8 +4304,6 @@ static int check_swap_activate(struct swap_info_struct *sis, not_aligned++; nr_pblocks = roundup(nr_pblocks, blks_per_sec); - if (cur_lblock + nr_pblocks > sis->max) - nr_pblocks -= blks_per_sec; /* this extent is last one */ if (!nr_pblocks) { @@ -4328,31 +4323,15 @@ static int check_swap_activate(struct swap_info_struct *sis, goto retry; } - if (cur_lblock + nr_pblocks >= sis->max) - nr_pblocks = sis->max - cur_lblock; - - if (cur_lblock) { /* exclude the header page */ - if (pblock < lowest_pblock) - lowest_pblock = pblock; - if (pblock + nr_pblocks - 1 > highest_pblock) - highest_pblock = pblock + nr_pblocks - 1; - } - /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ - ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock); + ret = add_swap_extent(sis, nr_pblocks, inode->i_sb->s_bdev, + pblock); if (ret < 0) goto out; - nr_extents += ret; cur_lblock += nr_pblocks; } - ret = 
nr_extents; - *span = 1 + highest_pblock - lowest_pblock; - if (cur_lblock == 0) - cur_lblock = 1; /* force Empty message */ - sis->max = cur_lblock; - sis->pages = cur_lblock - 1; out: if (not_aligned) f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)", @@ -4360,8 +4339,7 @@ static int check_swap_activate(struct swap_info_struct *sis, return ret; } -static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) +int f2fs_swap_activate(struct file *file, struct swap_info_struct *sis) { struct inode *inode = file_inode(file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -4391,33 +4369,23 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, f2fs_precache_extents(inode); - ret = check_swap_activate(sis, file, span); + ret = check_swap_activate(sis, file); if (ret < 0) return ret; stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); f2fs_update_time(sbi, REQ_TIME); - return ret; + return 0; } -static void f2fs_swap_deactivate(struct file *file) +void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } -#else -static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) -{ - return -EOPNOTSUPP; -} - -static void f2fs_swap_deactivate(struct file *file) -{ -} #endif const struct address_space_operations f2fs_dblock_aops = { @@ -4431,8 +4399,6 @@ const struct address_space_operations f2fs_dblock_aops = { .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, .bmap = f2fs_bmap, - .swap_activate = f2fs_swap_activate, - .swap_deactivate = f2fs_swap_deactivate, }; void f2fs_clear_page_cache_dirty_tag(struct folio *folio) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 91f506e7c9cfb..93e9709f26fa2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4195,6 +4195,8 
@@ int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); +int f2fs_swap_activate(struct file *file, struct swap_info_struct *sis); +void f2fs_swap_deactivate(struct file *file); extern const struct iomap_ops f2fs_iomap_ops; /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fb12c5c9affda..aa91d5fff1cf7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -5488,4 +5488,8 @@ const struct file_operations f2fs_file_operations = { .fadvise = f2fs_file_fadvise, .fop_flags = FOP_BUFFER_RASYNC, .setlease = generic_setlease, +#ifdef CONFIG_SWAP + .swap_activate = f2fs_swap_activate, + .swap_deactivate = f2fs_swap_deactivate, +#endif }; diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 0db77c449467a..862b4c02a8bd1 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -6,94 +6,32 @@ #include #include -/* Swapfile activation */ - -struct iomap_swapfile_info { - struct iomap iomap; /* accumulated iomap */ - struct swap_info_struct *sis; - uint64_t lowest_ppage; /* lowest physical addr seen (pages) */ - uint64_t highest_ppage; /* highest physical addr seen (pages) */ - unsigned long nr_pages; /* number of pages collected */ - int nr_extents; /* extent count */ - struct file *file; -}; - -/* - * Collect physical extents for this swap file. Physical extents reported to - * the swap code must be trimmed to align to a page boundary. The logical - * offset within the file is irrelevant since the swapfile code maps logical - * page numbers of the swap device to the physical page-aligned extents. 
- */ -static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) -{ - struct iomap *iomap = &isi->iomap; - unsigned long nr_pages; - unsigned long max_pages; - uint64_t first_ppage; - uint64_t first_ppage_reported; - uint64_t next_ppage; - int error; - - if (unlikely(isi->nr_pages >= isi->sis->max)) - return 0; - max_pages = isi->sis->max - isi->nr_pages; - - /* - * Round the start up and the end down so that the physical - * extent aligns to a page boundary. - */ - first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; - next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> - PAGE_SHIFT; - - /* Skip too-short physical extents. */ - if (first_ppage >= next_ppage) - return 0; - nr_pages = next_ppage - first_ppage; - nr_pages = min(nr_pages, max_pages); - - /* - * Calculate how much swap space we're adding; the first page contains - * the swap header and doesn't count. The mm still wants that first - * page fed to add_swap_extent, however. - */ - first_ppage_reported = first_ppage; - if (iomap->offset == 0) - first_ppage_reported++; - if (isi->lowest_ppage > first_ppage_reported) - isi->lowest_ppage = first_ppage_reported; - if (isi->highest_ppage < (next_ppage - 1)) - isi->highest_ppage = next_ppage - 1; - - /* Add extent, set up for the next call. */ - error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage); - if (error < 0) - return error; - isi->nr_extents += error; - isi->nr_pages += nr_pages; - return 0; -} - -static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) +static int iomap_swapfile_fail(struct file *file, const char *str) { char *buf, *p = ERR_PTR(-ENOMEM); buf = kmalloc(PATH_MAX, GFP_KERNEL); if (buf) - p = file_path(isi->file, buf, PATH_MAX); + p = file_path(file, buf, PATH_MAX); pr_err("swapon: file %s %s\n", IS_ERR(p) ? "" : p, str); kfree(buf); return -EINVAL; } /* - * Accumulate iomaps for this swap file. 
We have to accumulate iomaps because - * swap only cares about contiguous page-aligned physical extents and makes no - * distinction between written and unwritten extents. + * Report physical extents for this swap file. Physical extents reported to the + * swap code must be trimmed to align to a page boundary. The logical offset + * within the file is irrelevant since the swapfile code maps logical page + * numbers of the swap device to the physical page-aligned extents. */ -static int iomap_swapfile_iter(struct iomap_iter *iter, - struct iomap *iomap, struct iomap_swapfile_info *isi) +static int iomap_swapfile_iter(struct iomap_iter *iter, struct file *file, + struct swap_info_struct *sis) { + struct iomap *iomap = &iter->iomap; + uint64_t first_ppage; + uint64_t next_ppage; + int error; + switch (iomap->type) { case IOMAP_MAPPED: case IOMAP_UNWRITTEN: @@ -101,35 +39,28 @@ static int iomap_swapfile_iter(struct iomap_iter *iter, break; case IOMAP_INLINE: /* No inline data. */ - return iomap_swapfile_fail(isi, "is inline"); + return iomap_swapfile_fail(file, "is inline"); default: - return iomap_swapfile_fail(isi, "has unallocated extents"); + return iomap_swapfile_fail(file, "has unallocated extents"); } /* No uncommitted metadata or shared blocks. */ if (iomap->flags & IOMAP_F_DIRTY) - return iomap_swapfile_fail(isi, "is not committed"); + return iomap_swapfile_fail(file, "is not committed"); if (iomap->flags & IOMAP_F_SHARED) - return iomap_swapfile_fail(isi, "has shared extents"); - - /* Only one bdev per swap file. */ - if (iomap->bdev != isi->sis->bdev) - return iomap_swapfile_fail(isi, "outside the main device"); - - if (isi->iomap.length == 0) { - /* No accumulated extent, so just store it. */ - memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); - } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) { - /* Append this to the accumulated extent. 
*/ - isi->iomap.length += iomap->length; - } else { - /* Otherwise, add the retained iomap and store this one. */ - int error = iomap_swapfile_add_extent(isi); - if (error) - return error; - memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); - } + return iomap_swapfile_fail(file, "has shared extents"); + /* + * Round the start up and the end down so that the physical extent + * aligns to a page boundary. + */ + first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> + PAGE_SHIFT; + error = add_swap_extent(sis, next_ppage - first_ppage, iomap->bdev, + first_ppage); + if (error) + return error; return iomap_iter_advance_full(iter); } @@ -137,56 +68,28 @@ static int iomap_swapfile_iter(struct iomap_iter *iter, * Iterate a swap file's iomaps to construct physical extents that can be * passed to the swapfile subsystem. */ -int iomap_swapfile_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *pagespan, +int iomap_swap_activate(struct file *file, struct swap_info_struct *sis, const struct iomap_ops *ops) { - struct inode *inode = swap_file->f_mapping->host; + struct inode *inode = file->f_mapping->host; struct iomap_iter iter = { .inode = inode, .pos = 0, .len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE), .flags = IOMAP_REPORT, }; - struct iomap_swapfile_info isi = { - .sis = sis, - .lowest_ppage = (sector_t)-1ULL, - .file = swap_file, - }; int ret; /* * Persist all file mapping metadata so that we won't have any * IOMAP_F_DIRTY iomaps. 
*/ - ret = vfs_fsync(swap_file, 1); + ret = vfs_fsync(file, 1); if (ret) return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi); - if (ret < 0) - return ret; - - if (isi.iomap.length) { - ret = iomap_swapfile_add_extent(&isi); - if (ret) - return ret; - } - - /* - * If this swapfile doesn't contain even a single page-aligned - * contiguous range of blocks, reject this useless swapfile to - * prevent confusion later on. - */ - if (isi.nr_pages == 0) { - pr_warn("swapon: Cannot find a single usable page in file.\n"); - return -EINVAL; - } - - *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; - sis->max = isi.nr_pages; - sis->pages = isi.nr_pages - 1; - return isi.nr_extents; + iter.status = iomap_swapfile_iter(&iter, file, sis); + return ret; } -EXPORT_SYMBOL_GPL(iomap_swapfile_activate); +EXPORT_SYMBOL_GPL(iomap_swap_activate); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 48d89716193a7..e92a4c8f8f778 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -164,6 +164,7 @@ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) return ret; return 0; } +EXPORT_SYMBOL_GPL(nfs_swap_rw); static void nfs_direct_release_pages(struct page **pages, unsigned int npages) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 25048a3c23643..ce4d860c4e7a8 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -567,8 +567,7 @@ static int nfs_launder_folio(struct folio *folio) return ret; } -static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) +int nfs_swap_activate(struct file *file, struct swap_info_struct *sis) { unsigned long blocks; long long isize; @@ -589,22 +588,19 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = rpc_clnt_swap_activate(clnt); if (ret) return ret; - ret = add_swap_extent(sis, 0, sis->max, 0); + ret = swap_activate_fs_ops(sis); if (ret < 0) { rpc_clnt_swap_deactivate(clnt); return ret; } - *span = sis->pages; - if 
(cl->rpc_ops->enable_swap) cl->rpc_ops->enable_swap(inode); - - sis->flags |= SWP_FS_OPS; - return ret; + return 0; } +EXPORT_SYMBOL_GPL(nfs_swap_activate); -static void nfs_swap_deactivate(struct file *file) +void nfs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); struct rpc_clnt *clnt = NFS_CLIENT(inode); @@ -614,6 +610,7 @@ static void nfs_swap_deactivate(struct file *file) if (cl->rpc_ops->disable_swap) cl->rpc_ops->disable_swap(file_inode(file)); } +EXPORT_SYMBOL_GPL(nfs_swap_deactivate); const struct address_space_operations nfs_file_aops = { .read_folio = nfs_read_folio, @@ -628,9 +625,6 @@ const struct address_space_operations nfs_file_aops = { .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_folio = generic_error_remove_folio, - .swap_activate = nfs_swap_activate, - .swap_deactivate = nfs_swap_deactivate, - .swap_rw = nfs_swap_rw, }; /* @@ -963,6 +957,9 @@ const struct file_operations nfs_file_operations = { .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, + .swap_activate = nfs_swap_activate, + .swap_deactivate = nfs_swap_deactivate, + .swap_rw = nfs_swap_rw, .fop_flags = FOP_DONTCACHE, }; EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index be40e126c5394..eb1a8dbab55ae 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -455,5 +455,8 @@ const struct file_operations nfs4_file_operations = { #else .llseek = nfs_file_llseek, #endif + .swap_activate = nfs_swap_activate, + .swap_deactivate = nfs_swap_deactivate, + .swap_rw = nfs_swap_rw, .fop_flags = FOP_DONTCACHE, }; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 1fbf832ad1654..a94f5f6757904 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -270,13 +270,6 @@ static int ntfs_writepages(struct address_space *mapping, return iomap_writepages(&wpc); } -static int ntfs_swap_activate(struct swap_info_struct *sis, - struct 
file *swap_file, sector_t *span) -{ - return iomap_swapfile_activate(sis, swap_file, span, - &ntfs_read_iomap_ops); -} - const struct address_space_operations ntfs_aops = { .read_folio = ntfs_read_folio, .readahead = ntfs_readahead, @@ -288,7 +281,6 @@ const struct address_space_operations ntfs_aops = { .error_remove_folio = generic_error_remove_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, - .swap_activate = ntfs_swap_activate, }; const struct address_space_operations ntfs_mft_aops = { diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index e8bea22b81a75..0dcf8479362a4 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1114,6 +1114,11 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t offset, loff_t le return err; } +static int ntfs_swap_activate(struct file *file, struct swap_info_struct *sis) +{ + return iomap_swap_activate(file, sis, &ntfs_read_iomap_ops); +} + const struct file_operations ntfs_file_ops = { .llseek = ntfs_file_llseek, .read_iter = ntfs_file_read_iter, @@ -1130,6 +1135,7 @@ const struct file_operations ntfs_file_ops = { #endif .fallocate = ntfs_fallocate, .setlease = generic_setlease, + .swap_activate = ntfs_swap_activate, }; const struct inode_operations ntfs_file_inode_ops = { diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 9f76b0347fa9d..f0d8a3a460745 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1577,6 +1577,9 @@ const struct file_operations cifs_file_ops = { .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = cifs_swap_activate, + .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_file_strict_ops = { @@ -1597,6 +1600,9 @@ const struct file_operations cifs_file_strict_ops = { .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = cifs_swap_activate, + 
.swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_file_direct_ops = { @@ -1617,6 +1623,9 @@ const struct file_operations cifs_file_direct_ops = { .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = cifs_swap_activate, + .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_file_nobrl_ops = { @@ -1635,6 +1644,9 @@ const struct file_operations cifs_file_nobrl_ops = { .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = cifs_swap_activate, + .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_file_strict_nobrl_ops = { @@ -1653,6 +1665,9 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = cifs_swap_activate, + .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_file_direct_nobrl_ops = { @@ -1671,6 +1686,9 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, + .swap_activate = cifs_swap_activate, + .swap_deactivate = cifs_swap_deactivate, + .swap_rw = cifs_swap_rw, }; const struct file_operations cifs_dir_ops = { diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index c455b15f27782..1e5b9fce84f92 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -115,6 +115,9 @@ int cifs_file_mmap_prepare(struct vm_area_desc *desc); int cifs_file_strict_mmap_prepare(struct vm_area_desc *desc); extern const struct file_operations cifs_dir_ops; int cifs_readdir(struct file *file, struct dir_context *ctx); +int cifs_swap_activate(struct file *swap_file, struct swap_info_struct *sis); +void cifs_swap_deactivate(struct file *file); 
+int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter); /* Functions related to dir entries */ extern const struct dentry_operations cifs_dentry_ops; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 664a2c2230890..e11065be1e64e 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -3286,8 +3286,7 @@ void cifs_oplock_break(struct work_struct *work) cifs_done_oplock_break(cinode); } -static int cifs_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) +int cifs_swap_activate(struct file *swap_file, struct swap_info_struct *sis) { struct cifsFileInfo *cfile = swap_file->private_data; struct inode *inode = swap_file->f_mapping->host; @@ -3296,7 +3295,7 @@ static int cifs_swap_activate(struct swap_info_struct *sis, cifs_dbg(FYI, "swap activate\n"); - if (!swap_file->f_mapping->a_ops->swap_rw) + if (!swap_file->f_op->swap_rw) /* Cannot support swap */ return -EINVAL; @@ -3308,7 +3307,6 @@ static int cifs_swap_activate(struct swap_info_struct *sis, pr_warn("swap activate: swapfile has holes\n"); return -EINVAL; } - *span = sis->pages; pr_warn_once("Swap support over SMB3 is experimental\n"); @@ -3328,11 +3326,10 @@ static int cifs_swap_activate(struct swap_info_struct *sis, * from reading or writing the file */ - sis->flags |= SWP_FS_OPS; - return add_swap_extent(sis, 0, sis->max, 0); + return swap_activate_fs_ops(sis); } -static void cifs_swap_deactivate(struct file *file) +void cifs_swap_deactivate(struct file *file) { struct cifsFileInfo *cfile = file->private_data; @@ -3353,7 +3350,7 @@ static void cifs_swap_deactivate(struct file *file) * * Perform IO to the swap-file. This is much like direct IO. 
*/ -static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) +int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { ssize_t ret; @@ -3379,9 +3376,6 @@ const struct address_space_operations cifs_addr_ops = { * TODO: investigate and if useful we could add an is_dirty_writeback * helper if needed */ - .swap_activate = cifs_swap_activate, - .swap_deactivate = cifs_swap_deactivate, - .swap_rw = cifs_swap_rw, }; /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f279055fcea03..7488fc6a7b78a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -796,52 +796,6 @@ xfs_vm_readahead( iomap_readahead(&xfs_read_iomap_ops, &ctx, NULL); } -static int -xfs_vm_swap_activate( - struct swap_info_struct *sis, - struct file *swap_file, - sector_t *span) -{ - struct xfs_inode *ip = XFS_I(file_inode(swap_file)); - - if (xfs_is_zoned_inode(ip)) - return -EINVAL; - - /* - * Swap file activation can race against concurrent shared extent - * removal in files that have been cloned. If this happens, - * iomap_swapfile_iter() can fail because it encountered a shared - * extent even though an operation is in progress to remove those - * shared extents. - * - * This race becomes problematic when we defer extent removal - * operations beyond the end of a syscall (i.e. use async background - * processing algorithms). Users think the extents are no longer - * shared, but iomap_swapfile_iter() still sees them as shared - * because the refcountbt entries for the extents being removed have - * not yet been updated. Hence the swapon call fails unexpectedly. - * - * The race condition is currently most obvious from the unlink() - * operation as extent removal is deferred until after the last - * reference to the inode goes away. We then process the extent - * removal asynchronously, hence triggers the "syscall completed but - * work not done" condition mentioned above. 
To close this race - * window, we need to flush any pending inodegc operations to ensure - * they have updated the refcountbt records before we try to map the - * swapfile. - */ - xfs_inodegc_flush(ip->i_mount); - - /* - * Direct the swap code to the correct block device when this file - * sits on the RT device. - */ - sis->bdev = xfs_inode_buftarg(ip)->bt_bdev; - - return iomap_swapfile_activate(sis, swap_file, span, - &xfs_read_iomap_ops); -} - const struct address_space_operations xfs_address_space_operations = { .read_folio = xfs_vm_read_folio, .readahead = xfs_vm_readahead, @@ -853,11 +807,9 @@ const struct address_space_operations xfs_address_space_operations = { .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = xfs_vm_swap_activate, }; const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .dirty_folio = noop_dirty_folio, - .swap_activate = xfs_vm_swap_activate, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 845a97c9b0630..74128ebf71617 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -2081,6 +2081,44 @@ xfs_file_mmap_prepare( return 0; } +static int +xfs_file_swap_activate( + struct file *file, + struct swap_info_struct *sis) +{ + struct xfs_inode *ip = XFS_I(file_inode(file)); + + if (xfs_is_zoned_inode(ip)) + return -EINVAL; + + /* + * Swap file activation can race against concurrent shared extent + * removal in files that have been cloned. If this happens, + * iomap_swapfile_iter() can fail because it encountered a shared + * extent even though an operation is in progress to remove those + * shared extents. + * + * This race becomes problematic when we defer extent removal + * operations beyond the end of a syscall (i.e. use async background + * processing algorithms). 
Users think the extents are no longer + * shared, but iomap_swapfile_iter() still sees them as shared + * because the refcountbt entries for the extents being removed have + * not yet been updated. Hence the swapon call fails unexpectedly. + * + * The race condition is currently most obvious from the unlink() + * operation as extent removal is deferred until after the last + * reference to the inode goes away. We then process the extent + * removal asynchronously, hence triggers the "syscall completed but + * work not done" condition mentioned above. To close this race + * window, we need to flush any pending inodegc operations to ensure + * they have updated the refcountbt records before we try to map the + * swapfile. + */ + xfs_inodegc_flush(ip->i_mount); + + return iomap_swap_activate(file, sis, &xfs_read_iomap_ops); +} + const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read_iter = xfs_file_read_iter, @@ -2104,6 +2142,7 @@ const struct file_operations xfs_file_operations = { FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE | FOP_DONTCACHE, .setlease = generic_setlease, + .swap_activate = xfs_file_swap_activate, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 5ada33f70bb47..2c817917a13d5 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -167,21 +167,6 @@ static int zonefs_writepages(struct address_space *mapping, return iomap_writepages(&wpc); } -static int zonefs_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) -{ - struct inode *inode = file_inode(swap_file); - - if (zonefs_inode_is_seq(inode)) { - zonefs_err(inode->i_sb, - "swap file: not a conventional zone file\n"); - return -EINVAL; - } - - return iomap_swapfile_activate(sis, swap_file, span, - &zonefs_read_iomap_ops); -} - const struct address_space_operations zonefs_file_aops = { .read_folio = zonefs_read_folio, .readahead = zonefs_readahead, @@ -192,7 +177,6 @@ const 
struct address_space_operations zonefs_file_aops = { .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = zonefs_swap_activate, }; int zonefs_file_truncate(struct inode *inode, loff_t isize) @@ -859,6 +843,19 @@ static int zonefs_file_release(struct inode *inode, struct file *file) return 0; } +static int zonefs_swap_activate(struct file *file, struct swap_info_struct *sis) +{ + struct inode *inode = file_inode(file); + + if (zonefs_inode_is_seq(inode)) { + zonefs_err(inode->i_sb, + "swap file: not a conventional zone file\n"); + return -EINVAL; + } + + return iomap_swap_activate(file, sis, &zonefs_read_iomap_ops); +} + const struct file_operations zonefs_file_operations = { .open = zonefs_file_open, .release = zonefs_file_release, @@ -870,4 +867,5 @@ const struct file_operations zonefs_file_operations = { .splice_read = zonefs_file_splice_read, .splice_write = iter_file_splice_write, .iopoll = iocb_bio_iopoll, + .swap_activate = zonefs_swap_activate, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 11559c513dfbb..7564cef5405d4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -433,12 +433,6 @@ struct address_space_operations { size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); int (*error_remove_folio)(struct address_space *, struct folio *); - - /* swapfile support */ - int (*swap_activate)(struct swap_info_struct *sis, struct file *file, - sector_t *span); - void (*swap_deactivate)(struct file *file); - int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; extern const struct address_space_operations empty_aops; @@ -1967,6 +1961,11 @@ struct file_operations { int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); int (*mmap_prepare)(struct vm_area_desc *); + + /* swapfile support */ + int (*swap_activate)(struct file *file, struct swap_info_struct 
*sis); + void (*swap_deactivate)(struct file *file); + int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); } __randomize_layout; /* Supports async buffered reads */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 2c5685adf3a97..3fd582d375b60 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -603,11 +603,10 @@ void iomap_dio_bio_end_io(struct bio *bio); struct file; struct swap_info_struct; -int iomap_swapfile_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *pagespan, +int iomap_swap_activate(struct file *file, struct swap_info_struct *sis, const struct iomap_ops *ops); #else -# define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) +# define iomap_swap_activate(file, sis, ops) (-EIO) #endif /* CONFIG_SWAP */ extern struct bio_set iomap_ioend_bioset; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4623262da3c09..9746212a085e7 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -538,6 +538,9 @@ extern __be32 root_nfs_parse_addr(char *name); /*__init*/ /* * linux/fs/nfs/file.c */ +int nfs_swap_activate(struct file *file, struct swap_info_struct *sis); +void nfs_swap_deactivate(struct file *file); + extern const struct file_operations nfs_file_operations; #if IS_ENABLED(CONFIG_NFS_V4) extern const struct file_operations nfs4_file_operations; diff --git a/include/linux/swap.h b/include/linux/swap.h index 7a09df6977a5f..31eef9b749497 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -16,9 +16,9 @@ #include #include -struct notifier_block; - struct bio; +struct notifier_block; +struct swap_info_struct; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff @@ -178,44 +178,6 @@ struct sysinfo; struct writeback_control; struct zone; -/* - * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of - * disk blocks. 
A rbtree of swap extents maps the entire swapfile (Where the - * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart - * from setup, they're handled identically. - * - * We always assume that blocks are of size PAGE_SIZE. - */ -struct swap_extent { - struct rb_node rb_node; - pgoff_t start_page; - pgoff_t nr_pages; - sector_t start_block; -}; - -/* - * Max bad pages in the new format.. - */ -#define MAX_SWAP_BADPAGES \ - ((offsetof(union swap_header, magic.magic) - \ - offsetof(union swap_header, info.badpages)) / sizeof(int)) - -enum { - SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ - SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ - SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ - SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ - SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ - SWP_BLKDEV = (1 << 6), /* its a block device */ - SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ - SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ - SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ - SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ - SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ - SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ - /* add others here before... */ -}; - #define SWAP_CLUSTER_MAX 32UL #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX @@ -234,56 +196,6 @@ enum { #define SWAP_NR_ORDERS 1 #endif -/* - * We keep using same cluster for rotational device so IO will be sequential. - * The purpose is to optimize SWAP throughput on these device. - */ -struct swap_sequential_cluster { - unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ -}; - -/* - * The in-memory structure used to track swap areas. - */ -struct swap_info_struct { - struct percpu_ref users; /* indicate and keep swap device valid. 
*/ - unsigned long flags; /* SWP_USED etc: see above */ - signed short prio; /* swap priority of this type */ - struct plist_node list; /* entry in swap_active_head */ - signed char type; /* strange name for an index */ - unsigned int max; /* size of this swap device */ - unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ - struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ - struct list_head free_clusters; /* free clusters list */ - struct list_head full_clusters; /* full clusters list */ - struct list_head nonfull_clusters[SWAP_NR_ORDERS]; - /* list of cluster that contains at least one free slot */ - struct list_head frag_clusters[SWAP_NR_ORDERS]; - /* list of cluster that are fragmented or contented */ - unsigned int pages; /* total of usable pages of swap */ - atomic_long_t inuse_pages; /* number of those currently in use */ - struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ - spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ - struct rb_root swap_extent_root;/* root of the swap extent rbtree */ - struct block_device *bdev; /* swap device or bdev of swap file */ - struct file *swap_file; /* seldom referenced */ - struct completion comp; /* seldom referenced */ - spinlock_t lock; /* - * protect map scan related fields like - * inuse_pages and all cluster lists. - * Other fields are only changed - * at swapon/swapoff, so are protected - * by swap_lock. changing flags need - * hold this lock and swap_lock. If - * both locks need hold, hold swap_lock - * first. 
- */ - struct work_struct discard_work; /* discard worker */ - struct work_struct reclaim_work; /* reclaim worker */ - struct list_head discard_clusters; /* discard clusters list */ - struct plist_node avail_list; /* entry in swap_avail_head */ -}; - static inline swp_entry_t page_swap_entry(struct page *page) { struct folio *folio = page_folio(page); @@ -403,10 +315,10 @@ extern void __meminit kswapd_stop(int nid); #ifdef CONFIG_SWAP -int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, - unsigned long nr_pages, sector_t start_block); -int generic_swapfile_activate(struct swap_info_struct *, struct file *, - sector_t *); +int add_swap_extent(struct swap_info_struct *sis, unsigned long nr_pages, + struct block_device *bdev, sector_t start_block); +int generic_swap_activate(struct file *swap_file, struct swap_info_struct *sis); +int swap_activate_fs_ops(struct swap_info_struct *sis); static inline unsigned long total_swapcache_pages(void) { @@ -438,10 +350,7 @@ int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); -extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -struct backing_dev_info; -extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); /* @@ -467,20 +376,7 @@ bool folio_free_swap(struct folio *folio); swp_entry_t swap_alloc_hibernation_slot(int type); void swap_free_hibernation_slot(swp_entry_t entry); -static inline void put_swap_device(struct swap_info_struct *si) -{ - percpu_ref_put(&si->users); -} - #else /* CONFIG_SWAP */ -static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) -{ - return NULL; -} - -static inline void put_swap_device(struct swap_info_struct *si) -{ -} #define get_nr_swap_pages() 0L #define total_swap_pages 0L @@ -512,11 +408,6 @@ static 
inline int __swap_count(swp_entry_t entry) return 0; } -static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) -{ - return false; -} - static inline int swp_swapcount(swp_entry_t entry) { return 0; @@ -528,8 +419,12 @@ static inline bool folio_free_swap(struct folio *folio) } static inline int add_swap_extent(struct swap_info_struct *sis, - unsigned long start_page, - unsigned long nr_pages, sector_t start_block) + unsigned long nr_pages, struct block_device *bdev, + sector_t start_block) +{ + return -EINVAL; +} +static inline int swap_activate_fs_ops(struct swap_info_struct *sis) { return -EINVAL; } diff --git a/mm/page_io.c b/mm/page_io.c index 70cea9e24d2fd..2ab8994ed1c29 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -75,20 +75,14 @@ static void end_swap_bio_read(struct bio *bio) bio_put(bio); } -int generic_swapfile_activate(struct swap_info_struct *sis, - struct file *swap_file, - sector_t *span) +int generic_swap_activate(struct file *swap_file, struct swap_info_struct *sis) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; unsigned blocks_per_page; - unsigned long page_no; unsigned blkbits; sector_t probe_block; sector_t last_block; - sector_t lowest_block = -1; - sector_t highest_block = 0; - int nr_extents = 0; int ret; blkbits = inode->i_blkbits; @@ -99,10 +93,8 @@ int generic_swapfile_activate(struct swap_info_struct *sis, * to be very smart.
*/ probe_block = 0; - page_no = 0; last_block = i_size_read(inode) >> blkbits; - while ((probe_block + blocks_per_page) <= last_block && - page_no < sis->max) { + while ((probe_block + blocks_per_page) <= last_block && sis->pages < sis->max) { unsigned block_in_page; sector_t first_block; @@ -137,38 +129,21 @@ int generic_swapfile_activate(struct swap_info_struct *sis, } } - first_block >>= (PAGE_SHIFT - blkbits); - if (page_no) { /* exclude the header page */ - if (first_block < lowest_block) - lowest_block = first_block; - if (first_block > highest_block) - highest_block = first_block; - } - /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ - ret = add_swap_extent(sis, page_no, 1, first_block); + ret = add_swap_extent(sis, 1, inode->i_sb->s_bdev, + first_block >> (PAGE_SHIFT - blkbits)); if (ret < 0) - goto out; - nr_extents += ret; - page_no++; + return ret; probe_block += blocks_per_page; reprobe: continue; } - ret = nr_extents; - *span = 1 + highest_block - lowest_block; - if (page_no == 0) - page_no = 1; /* force Empty message */ - sis->max = page_no; - sis->pages = page_no - 1; -out: - return ret; + return 0; bad_bmap: pr_err("swapon: swapfile has holes\n"); - ret = -EINVAL; - goto out; + return -EINVAL; } static bool is_folio_zero_filled(struct folio *folio) @@ -474,11 +449,10 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) void swap_write_unplug(struct swap_iocb *sio) { struct iov_iter from; - struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len); - ret = mapping->a_ops->swap_rw(&sio->iocb, &from); + ret = sio->iocb.ki_filp->f_op->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_write_complete(&sio->iocb, ret); } @@ -663,11 +637,10 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) void __swap_read_unplug(struct swap_iocb *sio) { struct iov_iter from; - struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret;
iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len); - ret = mapping->a_ops->swap_rw(&sio->iocb, &from); + ret = sio->iocb.ki_filp->f_op->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_read_complete(&sio->iocb, ret); } diff --git a/mm/swap.h b/mm/swap.h index a77016f2423b9..70974495bf15a 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -8,6 +8,79 @@ struct swap_iocb; extern int page_cluster; +/* + * We keep using same cluster for rotational device so IO will be sequential. + * The purpose is to optimize SWAP throughput on these device. + */ +struct swap_sequential_cluster { + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ +}; + +/* + * The in-memory structure used to track swap areas. + */ +struct swap_info_struct { + struct percpu_ref users; /* indicate and keep swap device valid. */ + unsigned long flags; /* SWP_USED etc: see above */ + signed short prio; /* swap priority of this type */ + struct plist_node list; /* entry in swap_active_head */ + signed char type; /* strange name for an index */ + unsigned int max; /* size of this swap device */ + unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ + struct swap_cluster_info *cluster_info; /* cluster info. 
Only for SSD */ + struct list_head free_clusters; /* free clusters list */ + struct list_head full_clusters; /* full clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; + /* list of cluster that are fragmented or contented */ + unsigned int pages; /* total of usable pages of swap */ + atomic_long_t inuse_pages; /* number of those currently in use */ + struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ + spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ + struct rb_root swap_extent_root;/* root of the swap extent rbtree */ + struct block_device *bdev; /* swap device or bdev of swap file */ + struct file *swap_file; /* seldom referenced */ + struct completion comp; /* seldom referenced */ + spinlock_t lock; /* + * protect map scan related fields like + * inuse_pages and all cluster lists. + * Other fields are only changed + * at swapon/swapoff, so are protected + * by swap_lock. changing flags need + * hold this lock and swap_lock. If + * both locks need hold, hold swap_lock + * first. + */ + struct work_struct discard_work; /* discard worker */ + struct work_struct reclaim_work; /* reclaim worker */ + struct list_head discard_clusters; /* discard clusters list */ + struct plist_node avail_list; /* entry in swap_avail_head */ +}; + +/* + * Max bad pages in the new format.. + */ +#define MAX_SWAP_BADPAGES \ + ((offsetof(union swap_header, magic.magic) - \ + offsetof(union swap_header, info.badpages)) / sizeof(int)) + +enum { + SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ + SWP_WRITEOK = (1 << 1), /* ok to write to this swap? 
*/ + SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ + SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ + SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ + SWP_BLKDEV = (1 << 6), /* its a block device */ + SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ + SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ + SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ + SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ + SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ + SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ + /* add others here before... */ +}; + #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_order(order) (order) @@ -352,6 +425,13 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) return i; } +bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); +struct swap_info_struct *get_swap_device(swp_entry_t entry); +static inline void put_swap_device(struct swap_info_struct *si) +{ + percpu_ref_put(&si->users); +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline struct swap_cluster_info *swap_cluster_lock( @@ -498,5 +578,17 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { return 0; } +static inline bool swap_entry_swapped(struct swap_info_struct *si, + swp_entry_t entry) +{ + return false; +} +static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) +{ + return NULL; +} +static inline void put_swap_device(struct swap_info_struct *si) +{ +} #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 9174f1eeffb09..c0479533f9ef2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -260,6 +260,21 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, return ret; } +/* + * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of + * disk blocks. 
A rbtree of swap extents maps the entire swapfile (Where the + * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart + * from setup, they're handled identically. + * + * We always assume that blocks are of size PAGE_SIZE. + */ +struct swap_extent { + struct rb_node rb_node; + pgoff_t start_page; + pgoff_t nr_pages; + sector_t start_block; +}; + static inline struct swap_extent *first_se(struct swap_info_struct *sis) { struct rb_node *rb = rb_first(&sis->swap_extent_root); @@ -2692,11 +2707,9 @@ static void destroy_swap_extents(struct swap_info_struct *sis, } if (sis->flags & SWP_ACTIVATED) { - struct address_space *mapping = swap_file->f_mapping; - sis->flags &= ~SWP_ACTIVATED; - if (mapping->a_ops->swap_deactivate) - mapping->a_ops->swap_deactivate(swap_file); + if (swap_file->f_op->swap_deactivate) + swap_file->f_op->swap_deactivate(swap_file); } } @@ -2704,15 +2717,27 @@ static void destroy_swap_extents(struct swap_info_struct *sis, * Add a block range (and the corresponding page range) into this swapdev's * extent tree. * - * This function rather assumes that it is called in ascending page order. + * Note that start_block is in units of PAGE_SIZE and not actually in block + * layer sectors as the sector_t would suggest. */ int -add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, - unsigned long nr_pages, sector_t start_block) +add_swap_extent(struct swap_info_struct *sis, unsigned long nr_pages, + struct block_device *bdev, sector_t start_block) { struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct swap_extent *se; - struct swap_extent *new_se; + + if (!nr_pages) + return 0; + if (unlikely(sis->pages >= sis->max)) + return 0; + nr_pages = min(nr_pages, sis->max - sis->pages); + + /* Only one bdev per swap file for now. 
*/ + if (!sis->bdev) + sis->bdev = bdev; + else if (bdev != sis->bdev) + return -EINVAL; /* * place the new node at the right most since the @@ -2725,28 +2750,35 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, if (parent) { se = rb_entry(parent, struct swap_extent, rb_node); - BUG_ON(se->start_page + se->nr_pages != start_page); - if (se->start_block + se->nr_pages == start_block) { - /* Merge it */ - se->nr_pages += nr_pages; - return 0; - } + if (WARN_ON_ONCE(se->start_page + se->nr_pages != sis->pages)) + return -EINVAL; + if (se->start_block + se->nr_pages == start_block) + goto add; } /* No merge, insert a new extent. */ - new_se = kmalloc_obj(*se); - if (new_se == NULL) + se = kzalloc_obj(*se); + if (!se) return -ENOMEM; - new_se->start_page = start_page; - new_se->nr_pages = nr_pages; - new_se->start_block = start_block; - - rb_link_node(&new_se->rb_node, parent, link); - rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); - return 1; + rb_link_node(&se->rb_node, parent, link); + rb_insert_color(&se->rb_node, &sis->swap_extent_root); + + se->start_page = sis->pages; + se->start_block = start_block; +add: + se->nr_pages += nr_pages; + sis->pages += nr_pages; + return 0; } EXPORT_SYMBOL_GPL(add_swap_extent); +int swap_activate_fs_ops(struct swap_info_struct *sis) +{ + sis->flags |= SWP_FS_OPS; + return add_swap_extent(sis, sis->max, NULL, 0); +} +EXPORT_SYMBOL_GPL(swap_activate_fs_ops); + /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is @@ -2775,32 +2807,25 @@ EXPORT_SYMBOL_GPL(add_swap_extent); * extents in the rbtree. - akpm. 
*/ static int setup_swap_extents(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) + struct file *swap_file) { - struct address_space *mapping = swap_file->f_mapping; - struct inode *inode = mapping->host; - int ret; + int ret, error = 0; - if (S_ISBLK(inode->i_mode)) { - ret = add_swap_extent(sis, 0, sis->max, 0); - *span = sis->pages; + if (swap_file->f_op->swap_activate) + ret = swap_file->f_op->swap_activate(swap_file, sis); + else + ret = generic_swap_activate(swap_file, sis); + if (ret < 0) return ret; - } - if (mapping->a_ops->swap_activate) { - ret = mapping->a_ops->swap_activate(sis, swap_file, span); - if (ret < 0) - return ret; - sis->flags |= SWP_ACTIVATED; - if ((sis->flags & SWP_FS_OPS) && - sio_pool_init() != 0) { - destroy_swap_extents(sis, swap_file); - return -ENOMEM; - } - return ret; - } - - return generic_swapfile_activate(sis, swap_file, span); + sis->flags |= SWP_ACTIVATED; + if (sis->flags & SWP_FS_OPS) + error = sio_pool_init(); + else if (WARN_ON_ONCE(!sis->bdev)) + error = -EINVAL; + if (error) + destroy_swap_extents(sis, swap_file); + return error; } static void _enable_swap_info(struct swap_info_struct *si) @@ -3229,26 +3254,6 @@ static struct swap_info_struct *alloc_swap_info(void) return p; } -static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) -{ - if (S_ISBLK(inode->i_mode)) { - si->bdev = I_BDEV(inode); - /* - * Zoned block devices contain zones that have a sequential - * write only restriction. Hence zoned block devices are not - * suitable for swapping. Disallow them here. - */ - if (bdev_is_zoned(si->bdev)) - return -EINVAL; - si->flags |= SWP_BLKDEV; - } else if (S_ISREG(inode->i_mode)) { - si->bdev = inode->i_sb->s_bdev; - } - - return 0; -} - - /* * Find out how many pages are allowed for a single swap device. 
There * are two limiting factors: @@ -3350,10 +3355,9 @@ static unsigned long read_swap_header(struct swap_info_struct *si, } static int setup_swap_clusters_info(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned long maxpages) + union swap_header *swap_header) { - unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + unsigned long nr_clusters = DIV_ROUND_UP(si->max, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; int err = -ENOMEM; unsigned long i; @@ -3395,7 +3399,7 @@ static int setup_swap_clusters_info(struct swap_info_struct *si, if (err) goto err; } - for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { + for (i = si->max; i < round_up(si->max, SWAPFILE_CLUSTER); i++) { err = swap_cluster_setup_bad_slot(si, cluster_info, i, true); if (err) goto err; @@ -3425,10 +3429,44 @@ static int setup_swap_clusters_info(struct swap_info_struct *si, si->cluster_info = cluster_info; return 0; err: - free_swap_cluster_info(cluster_info, maxpages); + free_swap_cluster_info(cluster_info, si->max); return err; } +static void swap_print_info(struct swap_info_struct *si, const char *name) +{ + unsigned int nr_extents = 0; + u64 lowest_ppage = (u64)-1; + u64 highest_ppage = 0; + struct swap_extent *se; + + /* + * Calculate how much swap space we're adding; the first page contains + * the swap header and doesn't count. + */ + for (se = first_se(si); se; se = next_se(se)) { + u64 first_ppage = se->start_block; + u64 next_ppage = se->start_block + se->nr_pages; + + if (se->start_page == 0) + first_ppage++; + + if (lowest_ppage > first_ppage) + lowest_ppage = first_ppage; + if (highest_ppage < next_ppage - 1) + highest_ppage = next_ppage - 1; + nr_extents++; + } + + pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", + K(si->pages), name, si->prio, nr_extents, + K(highest_ppage - lowest_ppage), + (si->flags & SWP_SOLIDSTATE) ? "SS" : "", + (si->flags & SWP_DISCARDABLE) ? 
"D" : "", + (si->flags & SWP_AREA_DISCARD) ? "s" : "", + (si->flags & SWP_PAGE_DISCARD) ? "c" : ""); +} + SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *si; @@ -3438,9 +3476,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) int prio; int error; union swap_header *swap_header; - int nr_extents; - sector_t span; - unsigned long maxpages; struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; @@ -3475,15 +3510,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) dentry = swap_file->f_path.dentry; inode = mapping->host; - error = claim_swapfile(si, inode); - if (unlikely(error)) - goto bad_swap; - inode_lock(inode); if (d_unlinked(dentry) || cant_mount(dentry)) { error = -ENOENT; goto bad_swap_unlock_inode; } + if (S_ISBLK(inode->i_mode)) { + si->flags |= SWP_BLKDEV; + } else if (!S_ISREG(inode->i_mode)) { + error = -EINVAL; + goto bad_swap_unlock_inode; + } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; @@ -3512,33 +3549,31 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } swap_header = kmap_local_folio(folio, 0); - maxpages = read_swap_header(si, swap_header, inode); - if (unlikely(!maxpages)) { + si->pages = 0; + si->max = read_swap_header(si, swap_header, inode); + if (unlikely(!si->max)) { error = -EINVAL; goto bad_swap_unlock_inode; } - si->max = maxpages; - si->pages = maxpages - 1; - nr_extents = setup_swap_extents(si, swap_file, &span); - if (nr_extents < 0) { - error = nr_extents; + error = setup_swap_extents(si, swap_file); + if (error < 0) goto bad_swap_unlock_inode; - } - if (si->pages != si->max - 1) { - pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max); + if (si->pages != si->max) { + pr_err("swap:%u != (max:%u)\n", si->pages, si->max); error = -EINVAL; goto bad_swap_unlock_inode; } - maxpages = si->max; + /* Remove the first page containing the
swap header. */ + si->pages--; /* Set up the swap cluster info */ - error = setup_swap_clusters_info(si, swap_header, maxpages); + error = setup_swap_clusters_info(si, swap_header); if (error) goto bad_swap_unlock_inode; - error = swap_cgroup_swapon(si->type, maxpages); + error = swap_cgroup_swapon(si->type, si->max); if (error) goto bad_swap_unlock_inode; @@ -3546,7 +3581,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might * be above MAX_PAGE_ORDER incase of a large swap file. */ - si->zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), + si->zeromap = kvmalloc_array(BITS_TO_LONGS(si->max), sizeof(long), GFP_KERNEL | __GFP_ZERO); if (!si->zeromap) { error = -ENOMEM; @@ -3597,7 +3632,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } - error = zswap_swapon(si->type, maxpages); + error = zswap_swapon(si->type, si->max); if (error) goto bad_swap_unlock_inode; @@ -3629,13 +3664,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) /* Sets SWP_WRITEOK, resurrect the percpu ref, expose the swap device */ enable_swap_info(si); - pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", - K(si->pages), name->name, si->prio, nr_extents, - K((unsigned long long)span), - (si->flags & SWP_SOLIDSTATE) ? "SS" : "", - (si->flags & SWP_DISCARDABLE) ? "D" : "", - (si->flags & SWP_AREA_DISCARD) ? "s" : "", - (si->flags & SWP_PAGE_DISCARD) ? "c" : ""); + swap_print_info(si, name->name); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event);