about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Yinghai Lu <yinghai@kernel.org> 2008-12-11 03:15:01 -0500
committer Ingo Molnar <mingo@elte.hu> 2008-12-16 18:14:01 -0500
commit 48a1b10aff588833b73994704c47bbd0deb73e9c (patch)
tree deb3c7b486346c3afa54014b3c3516344c2708f2
parent 13bd41bc227a48d6cf8992a3286bf6eba3c71a0c (diff)
x86, sparseirq: move irq_desc according to smp_affinity, v7
Impact: improve NUMA handling by migrating irq_desc on smp_affinity changes

If CONFIG_NUMA_MIGRATE_IRQ_DESC is set:

- make irq_desc go with the affinity, aka irq_desc moving etc.
- call move_irq_desc() in irq_complete_move()
- legacy irq_descs are not moved, because they are allocated via a static array

For logical apic mode, we need to add move_desc_in_progress_in_same_domain, otherwise the irq_desc will not be moved ==> it could also need two phases to get the irq_desc moved.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- arch/x86/Kconfig 9
-rw-r--r-- arch/x86/kernel/io_apic.c 142
-rw-r--r-- include/linux/irq.h 10
-rw-r--r-- kernel/irq/Makefile 1
-rw-r--r-- kernel/irq/chip.c 12
-rw-r--r-- kernel/irq/handle.c 15
-rw-r--r-- kernel/irq/internals.h 5
-rw-r--r-- kernel/irq/numa_migrate.c 127
8 files changed, 313 insertions, 8 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8943c13502c6..29073532f94c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -248,6 +248,15 @@ config SPARSE_IRQ
248 248
249 If you don't know what to do here, say Y. 249 If you don't know what to do here, say Y.
250 250
251config NUMA_MIGRATE_IRQ_DESC
252 bool "Move irq desc when changing irq smp_affinity"
253 depends on SPARSE_IRQ && SMP
254 default n
255 help
256 This enables moving irq_desc to the cpu/node on which the irq will be handled.
257
258 If you don't know what to do here, say N.
259
251config X86_FIND_SMP_CONFIG 260config X86_FIND_SMP_CONFIG
252 def_bool y 261 def_bool y
253 depends on X86_MPPARSE || X86_VOYAGER 262 depends on X86_MPPARSE || X86_VOYAGER
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index a1a2e070f31a..bfe1245b1a3e 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -141,6 +141,9 @@ struct irq_cfg {
141 unsigned move_cleanup_count; 141 unsigned move_cleanup_count;
142 u8 vector; 142 u8 vector;
143 u8 move_in_progress : 1; 143 u8 move_in_progress : 1;
144#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
145 u8 move_desc_pending : 1;
146#endif
144}; 147};
145 148
146/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 149/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -223,6 +226,121 @@ void arch_init_chip_data(struct irq_desc *desc, int cpu)
223 } 226 }
224} 227}
225 228
229#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
230
231static void
232init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
233{
234 struct irq_pin_list *old_entry, *head, *tail, *entry;
235
236 cfg->irq_2_pin = NULL;
237 old_entry = old_cfg->irq_2_pin;
238 if (!old_entry)
239 return;
240
241 entry = get_one_free_irq_2_pin(cpu);
242 if (!entry)
243 return;
244
245 entry->apic = old_entry->apic;
246 entry->pin = old_entry->pin;
247 head = entry;
248 tail = entry;
249 old_entry = old_entry->next;
250 while (old_entry) {
251 entry = get_one_free_irq_2_pin(cpu);
252 if (!entry) {
253 entry = head;
254 while (entry) {
255 head = entry->next;
256 kfree(entry);
257 entry = head;
258 }
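259 /* allocation failed, partial copy freed; intent: still use the old one. NOTE(review): cfg->irq_2_pin was set to NULL above and is not restored here - confirm */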
260 return;
261 }
262 entry->apic = old_entry->apic;
263 entry->pin = old_entry->pin;
264 tail->next = entry;
265 tail = entry;
266 old_entry = old_entry->next;
267 }
268
269 tail->next = NULL;
270 cfg->irq_2_pin = head;
271}
272
273static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
274{
275 struct irq_pin_list *entry, *next;
276
277 if (old_cfg->irq_2_pin == cfg->irq_2_pin)
278 return;
279
280 entry = old_cfg->irq_2_pin;
281
282 while (entry) {
283 next = entry->next;
284 kfree(entry);
285 entry = next;
286 }
287 old_cfg->irq_2_pin = NULL;
288}
289
290void arch_init_copy_chip_data(struct irq_desc *old_desc,
291 struct irq_desc *desc, int cpu)
292{
293 struct irq_cfg *cfg;
294 struct irq_cfg *old_cfg;
295
296 cfg = get_one_free_irq_cfg(cpu);
297
298 if (!cfg)
299 return;
300
301 desc->chip_data = cfg;
302
303 old_cfg = old_desc->chip_data;
304
305 memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
306
307 init_copy_irq_2_pin(old_cfg, cfg, cpu);
308}
309
310static void free_irq_cfg(struct irq_cfg *old_cfg)
311{
312 kfree(old_cfg);
313}
314
315void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
316{
317 struct irq_cfg *old_cfg, *cfg;
318
319 old_cfg = old_desc->chip_data;
320 cfg = desc->chip_data;
321
322 if (old_cfg == cfg)
323 return;
324
325 if (old_cfg) {
326 free_irq_2_pin(old_cfg, cfg);
327 free_irq_cfg(old_cfg);
328 old_desc->chip_data = NULL;
329 }
330}
331
332static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
333{
334 struct irq_cfg *cfg = desc->chip_data;
335
336 if (!cfg->move_in_progress) {
337 /* move_in_progress is clear: it means that the domain is not changed */
338 if (!cpus_intersects(desc->affinity, mask))
339 cfg->move_desc_pending = 1;
340 }
341}
342#endif
343
226#else 344#else
227static struct irq_cfg *irq_cfg(unsigned int irq) 345static struct irq_cfg *irq_cfg(unsigned int irq)
228{ 346{
@@ -231,9 +349,11 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
231 349
232#endif 350#endif
233 351
352#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
234static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask) 353static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
235{ 354{
236} 355}
356#endif
237 357
238struct io_apic { 358struct io_apic {
239 unsigned int index; 359 unsigned int index;
@@ -2346,14 +2466,34 @@ static void irq_complete_move(struct irq_desc **descp)
2346 struct irq_cfg *cfg = desc->chip_data; 2466 struct irq_cfg *cfg = desc->chip_data;
2347 unsigned vector, me; 2467 unsigned vector, me;
2348 2468
2349 if (likely(!cfg->move_in_progress)) 2469 if (likely(!cfg->move_in_progress)) {
2470#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2471 if (likely(!cfg->move_desc_pending))
2472 return;
2473
2474 /* domain has not changed, but affinity has changed */
2475 me = smp_processor_id();
2476 if (cpu_isset(me, desc->affinity)) {
2477 *descp = desc = move_irq_desc(desc, me);
2478 /* get the new one */
2479 cfg = desc->chip_data;
2480 cfg->move_desc_pending = 0;
2481 }
2482#endif
2350 return; 2483 return;
2484 }
2351 2485
2352 vector = ~get_irq_regs()->orig_ax; 2486 vector = ~get_irq_regs()->orig_ax;
2353 me = smp_processor_id(); 2487 me = smp_processor_id();
2354 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { 2488 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
2355 cpumask_t cleanup_mask; 2489 cpumask_t cleanup_mask;
2356 2490
2491#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2492 *descp = desc = move_irq_desc(desc, me);
2493 /* get the new one */
2494 cfg = desc->chip_data;
2495#endif
2496
2357 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 2497 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
2358 cfg->move_cleanup_count = cpus_weight(cleanup_mask); 2498 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
2359 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 2499 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index b5749db3e5a1..36a015746788 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -227,6 +227,16 @@ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
227 227
228#endif 228#endif
229 229
230static inline struct irq_desc *
231irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
232{
233#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
234 return irq_to_desc(irq);
235#else
236 return desc;
237#endif
238}
239
230/* 240/*
231 * Migration helpers for obsolete names, they will go away: 241 * Migration helpers for obsolete names, they will go away:
232 */ 242 */
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 681c52dbfe22..4dd5b1edac98 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,3 +3,4 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 8e4fce4a1b1f..de210f4b7a92 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -353,6 +353,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
353 353
354 spin_lock(&desc->lock); 354 spin_lock(&desc->lock);
355 mask_ack_irq(desc, irq); 355 mask_ack_irq(desc, irq);
356 desc = irq_remap_to_desc(irq, desc);
356 357
357 if (unlikely(desc->status & IRQ_INPROGRESS)) 358 if (unlikely(desc->status & IRQ_INPROGRESS))
358 goto out_unlock; 359 goto out_unlock;
@@ -430,6 +431,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
430 desc->status &= ~IRQ_INPROGRESS; 431 desc->status &= ~IRQ_INPROGRESS;
431out: 432out:
432 desc->chip->eoi(irq); 433 desc->chip->eoi(irq);
434 desc = irq_remap_to_desc(irq, desc);
433 435
434 spin_unlock(&desc->lock); 436 spin_unlock(&desc->lock);
435} 437}
@@ -466,12 +468,14 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
466 !desc->action)) { 468 !desc->action)) {
467 desc->status |= (IRQ_PENDING | IRQ_MASKED); 469 desc->status |= (IRQ_PENDING | IRQ_MASKED);
468 mask_ack_irq(desc, irq); 470 mask_ack_irq(desc, irq);
471 desc = irq_remap_to_desc(irq, desc);
469 goto out_unlock; 472 goto out_unlock;
470 } 473 }
471 kstat_incr_irqs_this_cpu(irq, desc); 474 kstat_incr_irqs_this_cpu(irq, desc);
472 475
473 /* Start handling the irq */ 476 /* Start handling the irq */
474 desc->chip->ack(irq); 477 desc->chip->ack(irq);
478 desc = irq_remap_to_desc(irq, desc);
475 479
476 /* Mark the IRQ currently in progress.*/ 480 /* Mark the IRQ currently in progress.*/
477 desc->status |= IRQ_INPROGRESS; 481 desc->status |= IRQ_INPROGRESS;
@@ -532,8 +536,10 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
532 if (!noirqdebug) 536 if (!noirqdebug)
533 note_interrupt(irq, desc, action_ret); 537 note_interrupt(irq, desc, action_ret);
534 538
535 if (desc->chip->eoi) 539 if (desc->chip->eoi) {
536 desc->chip->eoi(irq); 540 desc->chip->eoi(irq);
541 desc = irq_remap_to_desc(irq, desc);
542 }
537} 543}
538 544
539void 545void
@@ -568,8 +574,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
568 574
569 /* Uninstall? */ 575 /* Uninstall? */
570 if (handle == handle_bad_irq) { 576 if (handle == handle_bad_irq) {
571 if (desc->chip != &no_irq_chip) 577 if (desc->chip != &no_irq_chip) {
572 mask_ack_irq(desc, irq); 578 mask_ack_irq(desc, irq);
579 desc = irq_remap_to_desc(irq, desc);
580 }
573 desc->status |= IRQ_DISABLED; 581 desc->status |= IRQ_DISABLED;
574 desc->depth = 1; 582 desc->depth = 1;
575 } 583 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 8aa09547f5ef..f1a23069c20a 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -23,7 +23,7 @@
23/* 23/*
24 * lockdep: we want to handle all irq_desc locks as a single lock-class: 24 * lockdep: we want to handle all irq_desc locks as a single lock-class:
25 */ 25 */
26static struct lock_class_key irq_desc_lock_class; 26struct lock_class_key irq_desc_lock_class;
27 27
28/** 28/**
29 * handle_bad_irq - handle spurious and unhandled irqs 29 * handle_bad_irq - handle spurious and unhandled irqs
@@ -73,7 +73,7 @@ static struct irq_desc irq_desc_init = {
73#endif 73#endif
74}; 74};
75 75
76static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 76void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
77{ 77{
78 unsigned long bytes; 78 unsigned long bytes;
79 char *ptr; 79 char *ptr;
@@ -113,7 +113,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
113/* 113/*
114 * Protect the sparse_irqs: 114 * Protect the sparse_irqs:
115 */ 115 */
116static DEFINE_SPINLOCK(sparse_irq_lock); 116DEFINE_SPINLOCK(sparse_irq_lock);
117 117
118struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; 118struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
119 119
@@ -337,8 +337,11 @@ unsigned int __do_IRQ(unsigned int irq)
337 /* 337 /*
338 * No locking required for CPU-local interrupts: 338 * No locking required for CPU-local interrupts:
339 */ 339 */
340 if (desc->chip->ack) 340 if (desc->chip->ack) {
341 desc->chip->ack(irq); 341 desc->chip->ack(irq);
342 /* get new one */
343 desc = irq_remap_to_desc(irq, desc);
344 }
342 if (likely(!(desc->status & IRQ_DISABLED))) { 345 if (likely(!(desc->status & IRQ_DISABLED))) {
343 action_ret = handle_IRQ_event(irq, desc->action); 346 action_ret = handle_IRQ_event(irq, desc->action);
344 if (!noirqdebug) 347 if (!noirqdebug)
@@ -349,8 +352,10 @@ unsigned int __do_IRQ(unsigned int irq)
349 } 352 }
350 353
351 spin_lock(&desc->lock); 354 spin_lock(&desc->lock);
352 if (desc->chip->ack) 355 if (desc->chip->ack) {
353 desc->chip->ack(irq); 356 desc->chip->ack(irq);
357 desc = irq_remap_to_desc(irq, desc);
358 }
354 /* 359 /*
355 * REPLAY is when Linux resends an IRQ that was dropped earlier 360 * REPLAY is when Linux resends an IRQ that was dropped earlier
356 * WAITING is used by probe to mark irqs that are being tested 361 * WAITING is used by probe to mark irqs that are being tested
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 64c1c7253dae..e6d0a43cc125 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -13,6 +13,11 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags); 14 unsigned long flags);
15 15
16extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
18extern spinlock_t sparse_irq_lock;
19extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
20
16#ifdef CONFIG_PROC_FS 21#ifdef CONFIG_PROC_FS
17extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 22extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
18extern void register_handler_proc(unsigned int irq, struct irqaction *action); 23extern void register_handler_proc(unsigned int irq, struct irqaction *action);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
new file mode 100644
index 000000000000..0178e2296990
--- /dev/null
+++ b/kernel/irq/numa_migrate.c
@@ -0,0 +1,127 @@
1/*
2 * linux/kernel/irq/numa_migrate.c
3 *
4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
6 *
7 * This file contains the code that moves an irq_desc to the NUMA node of a given cpu.
8 *
9 * Detailed information is available in Documentation/DocBook/genericirq
10 *
11 */
12
13#include <linux/irq.h>
14#include <linux/module.h>
15#include <linux/random.h>
16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h>
18
19#include "internals.h"
20
21static void init_copy_kstat_irqs(struct irq_desc *old_desc,
22 struct irq_desc *desc,
23 int cpu, int nr)
24{
25 unsigned long bytes;
26
27 init_kstat_irqs(desc, cpu, nr);
28
29 if (desc->kstat_irqs != old_desc->kstat_irqs) {
30 /* Compute how many bytes of kstat_irqs counters to copy from the old desc */
31 bytes = nr * sizeof(unsigned int);
32
33 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
34 }
35}
36
37static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
38{
39 if (old_desc->kstat_irqs == desc->kstat_irqs)
40 return;
41
42 kfree(old_desc->kstat_irqs);
43 old_desc->kstat_irqs = NULL;
44}
45
46static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
47 struct irq_desc *desc, int cpu)
48{
49 memcpy(desc, old_desc, sizeof(struct irq_desc));
50 desc->cpu = cpu;
51 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
52 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
53 arch_init_copy_chip_data(old_desc, desc, cpu);
54}
55
56static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
57{
58 free_kstat_irqs(old_desc, desc);
59 arch_free_chip_data(old_desc, desc);
60}
61
62static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
63 int cpu)
64{
65 struct irq_desc *desc;
66 unsigned int irq;
67 unsigned long flags;
68 int node;
69
70 irq = old_desc->irq;
71
72 spin_lock_irqsave(&sparse_irq_lock, flags);
73
74 /* We have to check it to avoid races with another CPU */
75 desc = irq_desc_ptrs[irq];
76
77 if (desc && old_desc != desc)
78 goto out_unlock;
79
80 node = cpu_to_node(cpu);
81 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
82 printk(KERN_DEBUG " move irq_desc for %d to cpu %d node %d\n",
83 irq, cpu, node);
84 if (!desc) {
85 printk(KERN_ERR "can not get new irq_desc for moving\n");
86 /* still use old one */
87 desc = old_desc;
88 goto out_unlock;
89 }
90 init_copy_one_irq_desc(irq, old_desc, desc, cpu);
91
92 irq_desc_ptrs[irq] = desc;
93
94 /* free the old one */
95 free_one_irq_desc(old_desc, desc);
96 kfree(old_desc);
97
98out_unlock:
99 spin_unlock_irqrestore(&sparse_irq_lock, flags);
100
101 return desc;
102}
103
104struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
105{
106 int old_cpu;
107 int node, old_node;
108
109 /* those are all static, do not move them */
110 if (desc->irq < NR_IRQS_LEGACY)
111 return desc;
112
113 old_cpu = desc->cpu;
114 printk(KERN_DEBUG
115 "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
116 if (old_cpu != cpu) {
117 node = cpu_to_node(cpu);
118 old_node = cpu_to_node(old_cpu);
119 if (old_node != node)
120 desc = __real_move_irq_desc(desc, cpu);
121 else
122 desc->cpu = cpu;
123 }
124
125 return desc;
126}
127