30 changes: 29 additions & 1 deletion Documentation/admin-guide/kernel-parameters.txt
@@ -2792,7 +2792,6 @@ Kernel parameters
"number of CPUs in system - 1".

managed_irq

Isolate from being targeted by managed interrupts
which have an interrupt mask containing isolated
CPUs. The affinity of managed interrupts is
@@ -2815,6 +2814,35 @@ Kernel parameters
housekeeping CPUs has no influence on those
queues.

io_queue
Applicable to managed IRQs only. Restrict
multiqueue hardware queue allocation to online
housekeeping CPUs. This guarantees that all
managed hardware completion interrupts are routed
exclusively to housekeeping cores, shielding
isolated CPUs from I/O interrupts even when they
initiated the request.

The io_queue configuration takes precedence over
managed_irq. When io_queue is used, managed_irq
placement constraints have no effect.
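
Specifying both, e.g.
isolcpus=managed_irq,io_queue,<cpu-list>, thus
behaves as if only io_queue were given.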

Note: Using io_queue restricts the number of
allocated hardware queues to the number of
housekeeping CPUs. This avoids exhausting MSI-X
vectors, but it also means isolated CPUs must
share the housekeeping CPUs' submission queues.
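
For example, a system with 32 possible CPUs of
which 28 are isolated allocates at most four
hardware queues per device, regardless of how
many queues the hardware supports.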

Note: Offlining a housekeeping CPU that serves
isolated CPUs will fail. The isolated CPUs must
be offlined before offlining the housekeeping
CPUs.
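
For example, with housekeeping CPUs 0-1 and
isolated CPUs 2-7, CPU 0 can only be offlined
once the isolated CPUs mapped to its hardware
queues have been offlined.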

Note: When I/O is submitted by an application on
an isolated CPU, the hardware completion
interrupt is handled entirely by a housekeeping
CPU.

The format of <cpu-list> is described above.
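
Example: "isolcpus=io_queue,2-7" on an 8-CPU
machine confines all hardware queues and their
completion interrupts to housekeeping CPUs 0-1.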

iucv= [HW,NET]
224 changes: 207 additions & 17 deletions block/blk-mq-cpumap.c
@@ -22,7 +22,11 @@ static unsigned int blk_mq_num_queues(const struct cpumask *mask,
{
unsigned int num;

-num = cpumask_weight(mask);
if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
num = cpumask_weight_and(mask, housekeeping_cpumask(HK_TYPE_IO_QUEUE));
else
num = cpumask_weight(mask);
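/*
* e.g. with isolcpus=io_queue,2-7 on an 8-CPU machine, mask may
* span all eight CPUs but only housekeeping CPUs 0-1 are counted,
* so at most two queues are requested.
*/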

return min_not_zero(num, max_queues);
}

@@ -33,7 +37,8 @@ static unsigned int blk_mq_num_queues(const struct cpumask *mask,
* ignored.
*
* Calculates the number of queues to be used for a multiqueue
-* device based on the number of possible CPUs.
* device based on the number of possible CPUs. This helper
* takes isolcpus settings into account.
*/
unsigned int blk_mq_num_possible_queues(unsigned int max_queues)
{
@@ -48,31 +53,148 @@ EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues);
* ignored.
*
* Calculates the number of queues to be used for a multiqueue
-* device based on the number of online CPUs.
* device based on the number of online CPUs. This helper
* takes isolcpus settings into account.
*/
unsigned int blk_mq_num_online_queues(unsigned int max_queues)
{
return blk_mq_num_queues(cpu_online_mask, max_queues);
}
EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
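
/*
* Illustrative (hypothetical driver code): capping the queue count
* via blk_mq_num_online_queues(pci_msix_vec_count(pdev)) sizes a
* tag set so that it already respects isolcpus=io_queue.
*/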

static bool blk_mq_validate(struct blk_mq_queue_map *qmap,
const unsigned long *active_hctx,
const struct cpumask *online_mask)
{
/*
* Verify that the mapping is usable when a housekeeping
* configuration is enabled.
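*
* Returns false when some online CPU is mapped to a hctx that no
* online (housekeeping) CPU serves, since such I/O could never
* be completed.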
*/
for (int queue = 0; queue < qmap->nr_queues; queue++) {
int cpu;

if (test_bit(queue, active_hctx)) {
/*
* This hctx has at least one online CPU and is thus
* able to serve any assigned isolated CPU.
*/
continue;
}

/*
* No online housekeeping CPU serves this hctx. That is
* acceptable only if no online CPU at all is mapped to
* this hctx.
*/
for_each_cpu(cpu, online_mask) {
if (qmap->mq_map[cpu] != qmap->queue_offset + queue)
continue;

pr_warn("Unable to create a usable CPU-to-queue mapping with the given constraints\n");
return false;
}
}

return true;
}

static void blk_mq_map_fallback(struct blk_mq_queue_map *qmap)
{
unsigned int cpu;

/*
* Map all CPUs to the first hctx of this specific map to ensure
* at least one online CPU is serving it, respecting the map's
* boundaries so secondary maps do not route into the default map.
*/
for_each_possible_cpu(cpu)
qmap->mq_map[cpu] = qmap->queue_offset;
}

void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
{
-const struct cpumask *masks;
struct cpumask *masks;
const struct cpumask *constraint;
unsigned int queue, cpu, nr_masks;
unsigned long *active_hctx;
cpumask_var_t online_mask;

-masks = group_cpus_evenly(qmap->nr_queues, &nr_masks);
-if (!masks) {
-for_each_possible_cpu(cpu)
-qmap->mq_map[cpu] = qmap->queue_offset;
-return;
-}
active_hctx = bitmap_zalloc(qmap->nr_queues, GFP_KERNEL);
if (!active_hctx)
goto fallback;

-for (queue = 0; queue < qmap->nr_queues; queue++) {
-for_each_cpu(cpu, &masks[queue % nr_masks])
if (!alloc_cpumask_var(&online_mask, GFP_KERNEL))
goto free_fallback_hctx;

/*
* Snapshot online CPUs to prevent TOCTOU races between the
* mapping phase and the validation phase.
*/
cpumask_copy(online_mask, cpu_online_mask);

if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
constraint = housekeeping_cpumask(HK_TYPE_IO_QUEUE);
else
constraint = cpu_possible_mask;

/* Map CPUs to the hardware contexts (hctx) */
masks = group_mask_cpus_evenly(qmap->nr_queues, constraint, &nr_masks);
if (!masks)
goto free_fallback;

/*
* Iterate directly over the generated CPU masks.
* Calculate the final, highest hardware queue index that maps to this
* mask. This skips all intermediate overwrites and safely evaluates
* active_hctx only for queues that survive the mapping.
*/
for (unsigned int idx = 0; idx < nr_masks; idx++) {
bool active = false;
queue = qmap->nr_queues - 1 -
((qmap->nr_queues - 1 - idx) % nr_masks);
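/*
* e.g. nr_queues = 8, nr_masks = 3: idx 0 -> queue 6, idx 1 ->
* queue 7, idx 2 -> queue 5, i.e. the last queue the old
* "queue % nr_masks" loop would have written for each mask.
*/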

for_each_cpu(cpu, &masks[idx]) {
qmap->mq_map[cpu] = qmap->queue_offset + queue;

if (!active && cpumask_test_cpu(cpu, online_mask)) {
__set_bit(queue, active_hctx);
active = true;
}
}
}

/*
* If all CPUs in the generated masks are offline, the active_hctx
* bitmap will be empty and find_first_bit() will return nr_queues,
* mapping any unassigned CPU out of bounds. Fall back instead.
*/
if (bitmap_empty(active_hctx, qmap->nr_queues))
goto free_fallback;

/* Map any unassigned CPU evenly to the hardware contexts (hctx) */
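/*
* e.g. with bits 1 and 3 set in active_hctx, isolated CPUs are
* assigned alternately to queue 1 and queue 3, wrapping around
* via find_next_bit_wrap().
*/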
queue = find_first_bit(active_hctx, qmap->nr_queues);
for_each_cpu_andnot(cpu, cpu_possible_mask, constraint) {
qmap->mq_map[cpu] = qmap->queue_offset + queue;
queue = find_next_bit_wrap(active_hctx, qmap->nr_queues, queue + 1);
}

if (!blk_mq_validate(qmap, active_hctx, online_mask))
goto free_fallback;

kfree(masks);
free_cpumask_var(online_mask);
bitmap_free(active_hctx);

return;

free_fallback:
kfree(masks);
free_cpumask_var(online_mask);
free_fallback_hctx:
bitmap_free(active_hctx);

fallback:
blk_mq_map_fallback(qmap);
}
EXPORT_SYMBOL_GPL(blk_mq_map_queues);

@@ -109,24 +231,92 @@ void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
struct device *dev, unsigned int offset)

{
-const struct cpumask *mask;
cpumask_var_t mask, online_mask;
const struct cpumask *constraint;
unsigned long *active_hctx;
unsigned int queue, cpu;

if (!dev->bus->irq_get_affinity)
goto map_software;

active_hctx = bitmap_zalloc(qmap->nr_queues, GFP_KERNEL);
if (!active_hctx)
goto fallback;

if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
bitmap_free(active_hctx);
goto fallback;
}

if (!alloc_cpumask_var(&online_mask, GFP_KERNEL))
goto free_fallback_mask;

if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
constraint = housekeeping_cpumask(HK_TYPE_IO_QUEUE);
else
constraint = cpu_possible_mask;

/*
* Snapshot online CPUs to prevent TOCTOU races between the
* mapping phase and the validation phase.
*/
cpumask_copy(online_mask, cpu_online_mask);

/* Map CPUs to the hardware contexts (hctx) */
for (queue = 0; queue < qmap->nr_queues; queue++) {
-mask = dev->bus->irq_get_affinity(dev, queue + offset);
-if (!mask)
-goto fallback;
const struct cpumask *affinity_mask;
bool active = false;

affinity_mask = dev->bus->irq_get_affinity(dev, offset + queue);
if (!affinity_mask)
goto free_fallback;

-for_each_cpu(cpu, mask)
for_each_cpu(cpu, affinity_mask) {
qmap->mq_map[cpu] = qmap->queue_offset + queue;

cpumask_set_cpu(cpu, mask);
if (!active && cpumask_test_cpu(cpu, online_mask) &&
cpumask_test_cpu(cpu, constraint)) {
__set_bit(queue, active_hctx);
active = true;
}
}
}

/*
* If all CPUs assigned to this map are offline, the bitmap will
* be empty. Fall back instead of routing out of bounds.
*/
if (bitmap_empty(active_hctx, qmap->nr_queues))
goto free_fallback;

/* Map any unassigned CPU evenly to the hardware contexts (hctx) */
queue = find_first_bit(active_hctx, qmap->nr_queues);
for_each_cpu_andnot(cpu, cpu_possible_mask, mask) {
qmap->mq_map[cpu] = qmap->queue_offset + queue;
queue = find_next_bit_wrap(active_hctx, qmap->nr_queues, queue + 1);
}

if (!blk_mq_validate(qmap, active_hctx, online_mask))
goto free_fallback;

bitmap_free(active_hctx);
free_cpumask_var(mask);
free_cpumask_var(online_mask);

return;

free_fallback:
free_cpumask_var(online_mask);
free_fallback_mask:
bitmap_free(active_hctx);
free_cpumask_var(mask);

fallback:
blk_mq_map_fallback(qmap);
return;

map_software:
blk_mq_map_queues(qmap);
}
EXPORT_SYMBOL_GPL(blk_mq_map_hw_queues);
56 changes: 56 additions & 0 deletions block/blk-mq.c
@@ -3720,6 +3720,57 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
return data.has_rq;
}

static bool blk_mq_hctx_can_offline_hk_cpu(struct blk_mq_hw_ctx *hctx,
unsigned int this_cpu)
{
const struct cpumask *hk_mask = housekeeping_cpumask(HK_TYPE_IO_QUEUE);
struct gendisk *disk;
int cpu, fallback_isolated_cpu = -1;

/*
* If the CPU being offlined is not a housekeeping CPU,
* offlining it will not strand isolated CPUs. Allow it.
*/
if (!cpumask_test_cpu(this_cpu, hk_mask))
return true;
/*
* Iterate over all online CPUs and manually check their mapping.
* We cannot use hctx->cpumask here because blk_mq_map_swqueue()
* intentionally strips isolated CPUs from it to keep kworkers
* off isolated CPUs.
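*
* For example, if housekeeping CPU 0 and isolated CPU 2 both map
* to this hctx, hctx->cpumask contains only CPU 0, even though
* offlining CPU 0 would strand CPU 2's outstanding I/O.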
*/
for_each_online_cpu(cpu) {
struct blk_mq_hw_ctx *h;

if (cpu == this_cpu)
continue;

h = blk_mq_map_queue_type(hctx->queue, hctx->type, cpu);
if (h != hctx)
continue;

if (cpumask_test_cpu(cpu, hk_mask))
return true;

if (fallback_isolated_cpu == -1)
fallback_isolated_cpu = cpu;
}

if (fallback_isolated_cpu != -1) {
/*
* Use READ_ONCE() to prevent compiler double-fetch TOCTOU
* issues if the disk is removed concurrently.
*/
disk = READ_ONCE(hctx->queue->disk);
pr_warn("%s: trying to offline hctx%d but online isolated CPU %d is still mapped to it\n",
disk ? disk->disk_name : "?", hctx->queue_num,
fallback_isolated_cpu);
return false;
}

return true;
}

static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx,
unsigned int this_cpu)
{
@@ -3752,6 +3803,11 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
struct blk_mq_hw_ctx, cpuhp_online);
int ret = 0;

if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
if (!blk_mq_hctx_can_offline_hk_cpu(hctx, cpu))
return -EINVAL;
}

if (!hctx->nr_ctx || blk_mq_hctx_has_online_cpu(hctx, cpu))
return 0;
