summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLong Li <longli@microsoft.com>2018-11-05 23:00:00 -0500
committerThomas Gleixner <tglx@linutronix.de>2018-11-06 17:20:13 -0500
commite8da8794a7fd9eef1ec9a07f0d4897c68581c72b (patch)
tree18623b21c6ba79b465ed80453ad33cbe1c12e95e
parent6da4b3ab9a6e9b1b5f90322ab3fa3a7dd18edb19 (diff)
genirq/matrix: Improve target CPU selection for managed interrupts.
On large systems with multiple devices of the same class (e.g. NVMe disks, using managed interrupts), the kernel can affinitize these interrupts to a small subset of CPUs instead of spreading them out evenly.

irq_matrix_alloc_managed() tries to select the CPU in the supplied cpumask of possible target CPUs which has the lowest number of interrupt vectors allocated.

This is done by searching for the CPU with the highest number of available vectors. While this is correct for non-managed interrupts, it can select the wrong CPU for managed interrupts. Under certain constellations this results in affinitizing the managed interrupts of several devices to a single CPU in a set.

The bookkeeping of available vectors works the following way:

 1) Non-managed interrupts:

    The available count is decremented when the interrupt is actually requested by the device driver and a vector is assigned. It is incremented when the interrupt and the vector are freed.

 2) Managed interrupts:

    Managed interrupts guarantee vector reservation when the MSI/MSI-X functionality of a device is enabled, which is achieved by reserving vectors in the bitmaps of the possible target CPUs. This reservation decrements the available count on each possible target CPU.

    When the interrupt is requested by the device driver, a vector is allocated from the reserved region. The operation is reversed when the interrupt is freed by the device driver. Neither of these operations affects the available count.

    The reservation persists up to the point where the MSI/MSI-X functionality is disabled, and only this operation increments the available count again.

For non-managed interrupts the available count is the correct selection criterion because the guaranteed reservations need to be taken into account.
Using the allocated counter could lead to a failing allocation in the following situation (total vector space of 10 assumed):

                     CPU0   CPU1
  available:            2      0
  allocated:            5      3   <--- CPU1 is selected, but available space = 0
  managed reserved:     3      7

whereas the available count yields the correct result (CPU0).

For managed interrupts the available count is not the appropriate selection criterion because, as explained above, the available count is not affected by the actual vector allocation.

The following example illustrates that. Total vector space of 10 assumed. The starting point is:

                     CPU0   CPU1
  available:            5      4
  allocated:            2      3
  managed reserved:     3      3

Allocating vectors for three non-managed interrupts will result in affinitizing the first two to CPU0 and the third one to CPU1 because the available count is adjusted with each allocation:

                     CPU0   CPU1
  available:            5      4   <- Select CPU0 for 1st allocation
  --> allocated:        3      3

  available:            4      4   <- Select CPU0 for 2nd allocation
  --> allocated:        4      3

  available:            3      4   <- Select CPU1 for 3rd allocation
  --> allocated:        4      4

But the allocation of three managed interrupts starting from the same point will affinitize all of them to CPU0 because the available count is not affected by the allocation (see above). So the end result is:

                     CPU0   CPU1
  available:            5      4
  allocated:            5      3

Introduce a "managed_allocated" field in struct cpumap to track the vector allocation for managed interrupts separately. Use this information to select the target CPU when a vector is allocated for a managed interrupt, which results in more evenly distributed vector assignments. The above example results in the following allocations:

                     CPU0   CPU1
  managed_allocated:    0      0   <- Select CPU0 for 1st allocation
  --> allocated:        3      3

  managed_allocated:    1      0   <- Select CPU1 for 2nd allocation
  --> allocated:        3      4

  managed_allocated:    1      1   <- Select CPU0 for 3rd allocation
  --> allocated:        4      4

The allocation of non-managed interrupts is not affected by this change and still evaluates the available count.
The overall distribution of interrupt vectors for both types of interrupts might still not be perfectly even, depending on the number of non-managed and managed interrupts in a system, but due to the reservation guarantee for managed interrupts this cannot be avoided.

Expose the new field in debugfs as well.

[ tglx: Clarified the background of the problem in the changelog and described it independently of NVMe ]

Signed-off-by: Long Li <longli@microsoft.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Michael Kelley <mikelley@microsoft.com>
Link: https://lkml.kernel.org/r/20181106040000.27316-1-longli@linuxonhyperv.com
-rw-r--r--kernel/irq/matrix.c34
1 files changed, 30 insertions, 4 deletions
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 1f0985adf193..30cc217b8631 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -14,6 +14,7 @@ struct cpumap {
14 unsigned int available; 14 unsigned int available;
15 unsigned int allocated; 15 unsigned int allocated;
16 unsigned int managed; 16 unsigned int managed;
17 unsigned int managed_allocated;
17 bool initialized; 18 bool initialized;
18 bool online; 19 bool online;
19 unsigned long alloc_map[IRQ_MATRIX_SIZE]; 20 unsigned long alloc_map[IRQ_MATRIX_SIZE];
@@ -145,6 +146,27 @@ static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
145 return best_cpu; 146 return best_cpu;
146} 147}
147 148
149/* Find the best CPU which has the lowest number of managed IRQs allocated */
150static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m,
151 const struct cpumask *msk)
152{
153 unsigned int cpu, best_cpu, allocated = UINT_MAX;
154 struct cpumap *cm;
155
156 best_cpu = UINT_MAX;
157
158 for_each_cpu(cpu, msk) {
159 cm = per_cpu_ptr(m->maps, cpu);
160
161 if (!cm->online || cm->managed_allocated > allocated)
162 continue;
163
164 best_cpu = cpu;
165 allocated = cm->managed_allocated;
166 }
167 return best_cpu;
168}
169
148/** 170/**
149 * irq_matrix_assign_system - Assign system wide entry in the matrix 171 * irq_matrix_assign_system - Assign system wide entry in the matrix
150 * @m: Matrix pointer 172 * @m: Matrix pointer
@@ -269,7 +291,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
269 if (cpumask_empty(msk)) 291 if (cpumask_empty(msk))
270 return -EINVAL; 292 return -EINVAL;
271 293
272 cpu = matrix_find_best_cpu(m, msk); 294 cpu = matrix_find_best_cpu_managed(m, msk);
273 if (cpu == UINT_MAX) 295 if (cpu == UINT_MAX)
274 return -ENOSPC; 296 return -ENOSPC;
275 297
@@ -282,6 +304,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
282 return -ENOSPC; 304 return -ENOSPC;
283 set_bit(bit, cm->alloc_map); 305 set_bit(bit, cm->alloc_map);
284 cm->allocated++; 306 cm->allocated++;
307 cm->managed_allocated++;
285 m->total_allocated++; 308 m->total_allocated++;
286 *mapped_cpu = cpu; 309 *mapped_cpu = cpu;
287 trace_irq_matrix_alloc_managed(bit, cpu, m, cm); 310 trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
@@ -395,6 +418,8 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
395 418
396 clear_bit(bit, cm->alloc_map); 419 clear_bit(bit, cm->alloc_map);
397 cm->allocated--; 420 cm->allocated--;
421 if(managed)
422 cm->managed_allocated--;
398 423
399 if (cm->online) 424 if (cm->online)
400 m->total_allocated--; 425 m->total_allocated--;
@@ -464,13 +489,14 @@ void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
464 seq_printf(sf, "Total allocated: %6u\n", m->total_allocated); 489 seq_printf(sf, "Total allocated: %6u\n", m->total_allocated);
465 seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits, 490 seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
466 m->system_map); 491 m->system_map);
467 seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " "); 492 seq_printf(sf, "%*s| CPU | avl | man | mac | act | vectors\n", ind, " ");
468 cpus_read_lock(); 493 cpus_read_lock();
469 for_each_online_cpu(cpu) { 494 for_each_online_cpu(cpu) {
470 struct cpumap *cm = per_cpu_ptr(m->maps, cpu); 495 struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
471 496
472 seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ", 497 seq_printf(sf, "%*s %4d %4u %4u %4u %4u %*pbl\n", ind, " ",
473 cpu, cm->available, cm->managed, cm->allocated, 498 cpu, cm->available, cm->managed,
499 cm->managed_allocated, cm->allocated,
474 m->matrix_bits, cm->alloc_map); 500 m->matrix_bits, cm->alloc_map);
475 } 501 }
476 cpus_read_unlock(); 502 cpus_read_unlock();