diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4d0f545fb3ec5..2de1b7bffbe90 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2792,7 +2792,6 @@ Kernel parameters "number of CPUs in system - 1". managed_irq - Isolate from being targeted by managed interrupts which have an interrupt mask containing isolated CPUs. The affinity of managed interrupts is @@ -2815,6 +2814,35 @@ Kernel parameters housekeeping CPUs has no influence on those queues. + io_queue + Applicable to managed IRQs only. Restrict + multiqueue hardware queue allocation to online + housekeeping CPUs. This guarantees that all + managed hardware completion interrupts are routed + exclusively to housekeeping cores, shielding + isolated CPUs from I/O interruptions even if they + initiated the request. + + The io_queue configuration takes precedence over + managed_irq. When io_queue is used, managed_irq + placement constraints have no effect. + + Note: Using io_queue restricts the number of + allocated hardware queues to match the number of + housekeeping CPUs. This prevents MSI-X vector + exhaustion and forces isolated CPUs to share + submission queues. + + Note: Offlining housekeeping CPUs which serve + isolated CPUs will fail. The isolated CPUs must + be offlined before offlining the housekeeping + CPUs. + + Note: When I/O is submitted by an application on + an isolated CPU, the hardware completion + interrupt is handled entirely by a housekeeping + CPU. + The format of is described above. iucv= [HW,NET] diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 705da074ad6c7..f953714d190c2 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -22,7 +22,11 @@ static unsigned int blk_mq_num_queues(const struct cpumask *mask, { unsigned int num; - num = cpumask_weight(mask); + if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) + num = cpumask_weight_and(mask, housekeeping_cpumask(HK_TYPE_IO_QUEUE)); + else + num = cpumask_weight(mask); + return min_not_zero(num, max_queues); } @@ -33,7 +37,8 @@ static unsigned int blk_mq_num_queues(const struct cpumask *mask, * ignored. * * Calculates the number of queues to be used for a multiqueue - * device based on the number of possible CPUs. + * device based on the number of possible CPUs. This helper + * takes isolcpus settings into account. */ unsigned int blk_mq_num_possible_queues(unsigned int max_queues) { @@ -48,7 +53,8 @@ EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues); * ignored. * * Calculates the number of queues to be used for a multiqueue - * device based on the number of online CPUs. + * device based on the number of online CPUs. This helper + * takes isolcpus settings into account. */ unsigned int blk_mq_num_online_queues(unsigned int max_queues) { @@ -56,23 +62,139 @@ unsigned int blk_mq_num_online_queues(unsigned int max_queues) } EXPORT_SYMBOL_GPL(blk_mq_num_online_queues); +static bool blk_mq_validate(struct blk_mq_queue_map *qmap, + const unsigned long *active_hctx, + const struct cpumask *online_mask) +{ + /* + * Verify if the mapping is usable when housekeeping + * configuration is enabled + */ + for (int queue = 0; queue < qmap->nr_queues; queue++) { + int cpu; + + if (test_bit(queue, active_hctx)) { + /* + * This hctx has at least one online CPU thus it + * is able to serve any assigned isolated CPU. 
+ */ + continue; + } + + /* + * There is no housekeeping online CPU for this hctx, all + * good as long as all non-housekeeping CPUs are also + * offline. + */ + for_each_cpu(cpu, online_mask) { + if (qmap->mq_map[cpu] != qmap->queue_offset + queue) + continue; + + pr_warn("Unable to create a usable CPU-to-queue mapping with the given constraints\n"); + return false; + } + } + + return true; +} + +static void blk_mq_map_fallback(struct blk_mq_queue_map *qmap) +{ + unsigned int cpu; + + /* + * Map all CPUs to the first hctx of this specific map to ensure + * at least one online CPU is serving it, respecting the map's + * boundaries so secondary maps do not route into the default map. + */ + for_each_possible_cpu(cpu) + qmap->mq_map[cpu] = qmap->queue_offset; +} + void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { - const struct cpumask *masks; + struct cpumask *masks; + const struct cpumask *constraint; unsigned int queue, cpu, nr_masks; + unsigned long *active_hctx; + cpumask_var_t online_mask; - masks = group_cpus_evenly(qmap->nr_queues, &nr_masks); - if (!masks) { - for_each_possible_cpu(cpu) - qmap->mq_map[cpu] = qmap->queue_offset; - return; - } + active_hctx = bitmap_zalloc(qmap->nr_queues, GFP_KERNEL); + if (!active_hctx) + goto fallback; - for (queue = 0; queue < qmap->nr_queues; queue++) { - for_each_cpu(cpu, &masks[queue % nr_masks]) + if (!alloc_cpumask_var(&online_mask, GFP_KERNEL)) + goto free_fallback_hctx; + + /* + * Snapshot online CPUs to prevent TOCTOU races between the + * mapping phase and the validation phase. + */ + cpumask_copy(online_mask, cpu_online_mask); + + if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) + constraint = housekeeping_cpumask(HK_TYPE_IO_QUEUE); + else + constraint = cpu_possible_mask; + + /* Map CPUs to the hardware contexts (hctx) */ + masks = group_mask_cpus_evenly(qmap->nr_queues, constraint, &nr_masks); + if (!masks) + goto free_fallback; + + /* + * Iterate directly over the generated CPU masks. + * Calculate the final, highest hardware queue index that maps to this + * mask. This skips all intermediate overwrites and safely evaluates + * active_hctx only for queues that survive the mapping. + */ + for (unsigned int idx = 0; idx < nr_masks; idx++) { + bool active = false; + queue = qmap->nr_queues - 1 - + ((qmap->nr_queues - 1 - idx) % nr_masks); + + for_each_cpu(cpu, &masks[idx]) { qmap->mq_map[cpu] = qmap->queue_offset + queue; + + if (!active && cpumask_test_cpu(cpu, online_mask)) { + __set_bit(queue, active_hctx); + active = true; + } + } + } + + /* + * If all CPUs in the generated masks are offline, the active_hctx + * bitmap will be empty. Attempting to route unassigned CPUs to an + * empty bitmap will map them out-of-bounds. Fall back instead. 
+ */ + if (bitmap_empty(active_hctx, qmap->nr_queues)) + goto free_fallback; + + /* Map any unassigned CPU evenly to the hardware contexts (hctx) */ + queue = find_first_bit(active_hctx, qmap->nr_queues); + for_each_cpu_andnot(cpu, cpu_possible_mask, constraint) { + qmap->mq_map[cpu] = qmap->queue_offset + queue; + queue = find_next_bit_wrap(active_hctx, qmap->nr_queues, queue + 1); } + + if (!blk_mq_validate(qmap, active_hctx, online_mask)) + goto free_fallback; + kfree(masks); + free_cpumask_var(online_mask); + bitmap_free(active_hctx); + + return; + +free_fallback: + kfree(masks); + free_cpumask_var(online_mask); +free_fallback_hctx: + bitmap_free(active_hctx); + +fallback: + blk_mq_map_fallback(qmap); } EXPORT_SYMBOL_GPL(blk_mq_map_queues); @@ -109,24 +231,92 @@ void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, struct device *dev, unsigned int offset) { - const struct cpumask *mask; + cpumask_var_t mask, online_mask; + const struct cpumask *constraint; + unsigned long *active_hctx; unsigned int queue, cpu; if (!dev->bus->irq_get_affinity) + goto map_software; + + active_hctx = bitmap_zalloc(qmap->nr_queues, GFP_KERNEL); + if (!active_hctx) + goto fallback; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + bitmap_free(active_hctx); goto fallback; + } + + if (!alloc_cpumask_var(&online_mask, GFP_KERNEL)) + goto free_fallback_mask; + if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) + constraint = housekeeping_cpumask(HK_TYPE_IO_QUEUE); + else + constraint = cpu_possible_mask; + + /* + * Snapshot online CPUs to prevent TOCTOU races between the + * mapping phase and the validation phase. + */ + cpumask_copy(online_mask, cpu_online_mask); + + /* Map CPUs to the hardware contexts (hctx) */ for (queue = 0; queue < qmap->nr_queues; queue++) { - mask = dev->bus->irq_get_affinity(dev, queue + offset); - if (!mask) - goto fallback; + const struct cpumask *affinity_mask; + bool active = false; + + affinity_mask = dev->bus->irq_get_affinity(dev, offset + queue); + if (!affinity_mask) + goto free_fallback; - for_each_cpu(cpu, mask) + for_each_cpu(cpu, affinity_mask) { qmap->mq_map[cpu] = qmap->queue_offset + queue; + + cpumask_set_cpu(cpu, mask); + if (!active && cpumask_test_cpu(cpu, online_mask) && + cpumask_test_cpu(cpu, constraint)) { + __set_bit(queue, active_hctx); + active = true; + } + } + } + + /* + * If all CPUs assigned to this map are offline, the bitmap will + * be empty. Fall back instead of routing out of bounds. 
+ */ + if (bitmap_empty(active_hctx, qmap->nr_queues)) + goto free_fallback; + + /* Map any unassigned CPU evenly to the hardware contexts (hctx) */ + queue = find_first_bit(active_hctx, qmap->nr_queues); + for_each_cpu_andnot(cpu, cpu_possible_mask, mask) { + qmap->mq_map[cpu] = qmap->queue_offset + queue; + queue = find_next_bit_wrap(active_hctx, qmap->nr_queues, queue + 1); } + if (!blk_mq_validate(qmap, active_hctx, online_mask)) + goto free_fallback; + + bitmap_free(active_hctx); + free_cpumask_var(mask); + free_cpumask_var(online_mask); + return; +free_fallback: + free_cpumask_var(online_mask); +free_fallback_mask: + bitmap_free(active_hctx); + free_cpumask_var(mask); + fallback: + blk_mq_map_fallback(qmap); + return; + +map_software: blk_mq_map_queues(qmap); } EXPORT_SYMBOL_GPL(blk_mq_map_hw_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c5c16cce4f8f..afe0c0bf7e8ac 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3720,6 +3720,57 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) return data.has_rq; } +static bool blk_mq_hctx_can_offline_hk_cpu(struct blk_mq_hw_ctx *hctx, + unsigned int this_cpu) +{ + const struct cpumask *hk_mask = housekeeping_cpumask(HK_TYPE_IO_QUEUE); + struct gendisk *disk; + int cpu, fallback_isolated_cpu = -1; + + /* + * If the CPU being offlined is not a housekeeping CPU, + * offlining it will not strand isolated CPUs. Allow it. + */ + if (!cpumask_test_cpu(this_cpu, hk_mask)) + return true; + /* + * Iterate over all online CPUs and manually check their mapping. + * We cannot use hctx->cpumask here because blk_mq_map_swqueue() + * intentionally strips isolated CPUs from it to prevent kworker + * routing. + */ + for_each_online_cpu(cpu) { + struct blk_mq_hw_ctx *h; + + if (cpu == this_cpu) + continue; + + h = blk_mq_map_queue_type(hctx->queue, hctx->type, cpu); + if (h != hctx) + continue; + + if (cpumask_test_cpu(cpu, hk_mask)) + return true; + + if (fallback_isolated_cpu == -1) + fallback_isolated_cpu = cpu; + } + + if (fallback_isolated_cpu != -1) { + /* + * Use READ_ONCE() to prevent compiler double-fetch TOCTOU + * issues if the disk is removed concurrently. + */ + disk = READ_ONCE(hctx->queue->disk); + pr_warn("%s: trying to offline hctx%d but online isolated CPU %d is still mapped to it\n", + disk ? 
disk->disk_name : "?", hctx->queue_num, + fallback_isolated_cpu); + return false; + } + + return true; +} + static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, unsigned int this_cpu) { @@ -3752,6 +3803,11 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) struct blk_mq_hw_ctx, cpuhp_online); int ret = 0; + if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) { + if (!blk_mq_hctx_can_offline_hk_cpu(hctx, cpu)) + return -EINVAL; + } + if (!hctx->nr_ctx || blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0; diff --git a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c index 9bd3f5b868bcd..ec165b57182d3 100644 --- a/drivers/scsi/aacraid/comminit.c +++ b/drivers/scsi/aacraid/comminit.c @@ -469,8 +469,7 @@ void aac_define_int_mode(struct aac_dev *dev) } /* Don't bother allocating more MSI-X vectors than cpus */ - msi_count = min(dev->max_msix, - (unsigned int)num_online_cpus()); + msi_count = blk_mq_num_online_queues(dev->max_msix); dev->max_msix = msi_count; diff --git a/include/linux/group_cpus.h b/include/linux/group_cpus.h index 9d4e5ab6c314b..defab4123a82f 100644 --- a/include/linux/group_cpus.h +++ b/include/linux/group_cpus.h @@ -10,5 +10,8 @@ #include struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks); +struct cpumask *group_mask_cpus_evenly(unsigned int numgrps, + const struct cpumask *mask, + unsigned int *nummasks); #endif diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index cf0fd03dd7a24..30cb9a44365eb 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -18,6 +18,7 @@ enum hk_type { HK_TYPE_MANAGED_IRQ, /* Inverse of boot-time nohz_full= or isolcpus=nohz arguments */ HK_TYPE_KERNEL_NOISE, + HK_TYPE_IO_QUEUE, HK_TYPE_MAX, /* diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 78f2418a89252..1d39dce685c7f 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -8,6 +8,7 @@ #include #include #include +#include static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) { @@ -25,8 +26,10 @@ static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) struct irq_affinity_desc * irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) { - unsigned int affvecs, curvec, usedvecs, i; + unsigned int affvecs, curvec, usedvecs, i, j; struct irq_affinity_desc *masks = NULL; + const struct cpumask *hk_mask = housekeeping_cpumask(HK_TYPE_IO_QUEUE); + bool hk_enabled = housekeeping_enabled(HK_TYPE_IO_QUEUE); /* * Determine the number of vectors which need interrupt affinities @@ -70,19 +73,29 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) */ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { unsigned int nr_masks, this_vecs = affd->set_size[i]; - struct cpumask *result = group_cpus_evenly(this_vecs, &nr_masks); + struct cpumask *result; + const struct cpumask *mask; + if (hk_enabled) + mask = hk_mask; + else + mask = cpu_possible_mask; + + result = group_mask_cpus_evenly(this_vecs, mask, + &nr_masks); if (!result) { kfree(masks); return NULL; } - - for (int j = 0; j < nr_masks; j++) + for (j = 0; j < nr_masks; j++) cpumask_copy(&masks[curvec + j].mask, &result[j]); + for (j = nr_masks; j < this_vecs; j++) + cpumask_copy(&masks[curvec + j].mask, irq_default_affinity); + kfree(result); - curvec += nr_masks; - usedvecs += nr_masks; + curvec += this_vecs; + usedvecs += this_vecs; } /* Fill out vectors at the end that don't need affinity */ @@ -115,10 
+128,14 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, if (resv > minvec) return 0; - if (affd->calc_sets) + if (affd->calc_sets) { set_vecs = maxvec - resv; - else - set_vecs = cpumask_weight(cpu_possible_mask); + } else { + if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) + set_vecs = cpumask_weight(housekeeping_cpumask(HK_TYPE_IO_QUEUE)); + else + set_vecs = cpumask_weight(cpu_possible_mask); + } return resv + min(set_vecs, maxvec - resv); } diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index ef152d401fe20..3406e3024fd43 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -16,6 +16,7 @@ enum hk_flags { HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE), + HK_FLAG_IO_QUEUE = BIT(HK_TYPE_IO_QUEUE), }; DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); @@ -340,6 +341,12 @@ static int __init housekeeping_isolcpus_setup(char *str) continue; } + if (!strncmp(str, "io_queue,", 9)) { + str += 9; + flags |= HK_FLAG_IO_QUEUE; + continue; + } + /* * Skip unknown sub-parameter and validate that it is not * containing an invalid character. diff --git a/lib/group_cpus.c b/lib/group_cpus.c index e6e18d7a49bba..2552ccea743e1 100644 --- a/lib/group_cpus.c +++ b/lib/group_cpus.c @@ -9,8 +9,6 @@ #include #include -#ifdef CONFIG_SMP - static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, unsigned int cpus_per_grp) { @@ -564,22 +562,110 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) *nummasks = min(nr_present + nr_others, numgrps); return masks; } -#else /* CONFIG_SMP */ -struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) +EXPORT_SYMBOL_GPL(group_cpus_evenly); + +/** + * group_mask_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality + * @numgrps: number of cpumasks to create + * @mask: CPUs to consider for the grouping + * @nummasks: number of initialized cpumasks + * + * Return: cpumask array if successful, NULL otherwise. Only the CPUs + * marked in the mask will be considered for the grouping. And each + * element includes CPUs assigned to this group. nummasks contains the + * number of initialized masks which can be less than numgrps. + * + * Try to put close CPUs from viewpoint of CPU and NUMA locality into + * the same group. + * + * We guarantee in the resulting grouping that all CPUs specified in the + * provided mask are covered, and no same CPU is assigned to multiple + * groups. + */ +struct cpumask *group_mask_cpus_evenly(unsigned int numgrps, + const struct cpumask *mask, + unsigned int *nummasks) { - struct cpumask *masks; + unsigned int curgrp = 0, nr_present = 0, nr_others = 0; + cpumask_var_t *node_to_cpumask; + cpumask_var_t nmsk, local_mask, npresmsk; + int ret = -ENOMEM; + struct cpumask *masks = NULL; if (numgrps == 0) return NULL; + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + if (!zalloc_cpumask_var(&local_mask, GFP_KERNEL)) + goto fail_nmsk; + + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto fail_local_mask; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto fail_npresmsk; + masks = kzalloc_objs(*masks, numgrps); if (!masks) - return NULL; + goto fail_node_to_cpumask; + + build_node_to_cpumask(node_to_cpumask); + + /* + * Create a stable snapshot of the mask. The grouping algorithm + * requires the CPU count to remain constant across its multiple + * passes. 
This prevents allocation failures if the caller passes a + * dynamic mask (e.g., cpu_online_mask) that changes concurrently. + */ + cpumask_copy(local_mask, data_race(mask)); - /* assign all CPUs(cpu 0) to the 1st group only */ - cpumask_copy(&masks[0], cpu_possible_mask); - *nummasks = 1; + /* + * Grouping present CPUs first. We intersect the provided mask with + * cpu_present_mask to ensure that we prioritise physically + * available CPUs for the initial distribution. + */ + cpumask_and(npresmsk, local_mask, data_race(cpu_present_mask)); + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + npresmsk, nmsk, masks); + if (ret < 0) + goto fail_node_to_cpumask; + nr_present = ret; + + /* + * Allocate non-present CPUs starting from the next group to be + * handled. If the grouping of present CPUs already exhausted the + * group space, assign the non-present CPUs to the already + * allocated out groups. + */ + if (nr_present >= numgrps) + curgrp = 0; + else + curgrp = nr_present; + cpumask_andnot(npresmsk, local_mask, npresmsk); + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + npresmsk, nmsk, masks); + if (ret >= 0) + nr_others = ret; + +fail_node_to_cpumask: + free_node_to_cpumask(node_to_cpumask); + +fail_npresmsk: + free_cpumask_var(npresmsk); + +fail_local_mask: + free_cpumask_var(local_mask); + +fail_nmsk: + free_cpumask_var(nmsk); + if (ret < 0) { + kfree(masks); + return NULL; + } + *nummasks = min(nr_present + nr_others, numgrps); return masks; } -#endif /* CONFIG_SMP */ -EXPORT_SYMBOL_GPL(group_cpus_evenly); +EXPORT_SYMBOL_GPL(group_mask_cpus_evenly);
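
Illustrative usage (not part of the patch): the new flag is selected through the
existing isolcpus= boot parameter. For example, on a hypothetical 8-CPU system:

    isolcpus=io_queue,managed_irq,2-7

CPUs 2-7 are isolated; hardware queue allocation and the managed completion
interrupts are then confined to the housekeeping CPUs 0-1, with io_queue taking
precedence over managed_irq as described in the kernel-parameters.txt hunk above.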