From ced5b697a76d325e7a7ac7d382dbbb632c765093 Mon Sep 17 00:00:00 2001 From: Brandon Philips Date: Wed, 10 Feb 2010 01:20:06 -0800 Subject: x86: Avoid race condition in pci_enable_msix() Keep chip_data in create_irq_nr and destroy_irq. When two drivers are setting up MSI-X at the same time via pci_enable_msix() there is a race. See this dmesg excerpt: [ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X [ 85.170611] alloc irq_desc for 99 on node -1 [ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X [ 85.170614] alloc kstat_irqs on node -1 [ 85.170616] alloc irq_2_iommu on node -1 [ 85.170617] alloc irq_desc for 100 on node -1 [ 85.170619] alloc kstat_irqs on node -1 [ 85.170621] alloc irq_2_iommu on node -1 [ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X [ 85.170626] alloc irq_desc for 101 on node -1 [ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X [ 85.170630] alloc kstat_irqs on node -1 [ 85.170631] alloc irq_2_iommu on node -1 [ 85.170635] alloc irq_desc for 102 on node -1 [ 85.170636] alloc kstat_irqs on node -1 [ 85.170639] alloc irq_2_iommu on node -1 [ 85.170646] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 As you can see igb and ixgbe are both alternating on create_irq_nr() via pci_enable_msix() in their probe function. ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe chooses irq_desc_ptrs[102] and exits the loop, drops vector_lock and calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data = NULL via dynamic_irq_init(). igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[] via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this: cfg_new = irq_desc_ptrs[102]->chip_data; if (cfg_new->vector != 0) continue; This hits the NULL deref. Another possible race exists via pci_disable_msix() in a driver or in a number of error paths that call free_msi_irqs(): destroy_irq() dynamic_irq_cleanup() which sets desc->chip_data = NULL ...race window... 
desc->chip_data = cfg; Remove the save and restore code for cfg in create_irq_nr() and destroy_irq() and take the desc->lock when checking the irq_cfg. Reported-and-analyzed-by: Brandon Philips Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-3-git-send-email-yinghai@kernel.org> Signed-off-by: Brandon Philips Cc: stable@kernel.org Signed-off-by: H. Peter Anvin --- kernel/irq/chip.c | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ecc3fa28f666..d70394f12ee9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -18,11 +18,7 @@ #include "internals.h" -/** - * dynamic_irq_init - initialize a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_init(unsigned int irq) +static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) { struct irq_desc *desc; unsigned long flags; @@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq) desc->depth = 1; desc->msi_desc = NULL; desc->handler_data = NULL; - desc->chip_data = NULL; + if (!keep_chip_data) + desc->chip_data = NULL; desc->action = NULL; desc->irq_count = 0; desc->irqs_unhandled = 0; @@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq) } /** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * dynamic_irq_init - initialize a dynamically allocated irq * @irq: irq number to initialize */ -void dynamic_irq_cleanup(unsigned int irq) +void dynamic_irq_init(unsigned int irq) +{ + dynamic_irq_init_x(irq, false); +} + +/** + * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq + * @irq: irq number to initialize + * + * does not set irq_to_desc(irq)->chip_data to NULL + */ +void dynamic_irq_init_keep_chip_data(unsigned int irq) +{ + dynamic_irq_init_x(irq, true); +} + +static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) { struct irq_desc *desc = irq_to_desc(irq); unsigned 
long flags; @@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq) } desc->msi_desc = NULL; desc->handler_data = NULL; - desc->chip_data = NULL; + if (!keep_chip_data) + desc->chip_data = NULL; desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; desc->name = NULL; @@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ + dynamic_irq_cleanup_x(irq, false); +} + +/** + * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq + * @irq: irq number to initialize + * + * does not set irq_to_desc(irq)->chip_data to NULL + */ +void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) +{ + dynamic_irq_cleanup_x(irq, true); +} + /** * set_irq_chip - set the irq chip for an irq -- cgit v1.2.2 From febcb0c59ac19fef2081a30e371e7af3619b5e91 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:32 -0800 Subject: irq: Remove unnecessary bootmem code mem_init is moved early already. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-29-git-send-email-yinghai@kernel.org> Signed-off-by: H. 
Peter Anvin --- kernel/irq/handle.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 814940e7f485..0e823c0d1c9c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include "internals.h" @@ -87,12 +86,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) { void *ptr; - if (slab_is_available()) - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), - GFP_ATOMIC, node); - else - ptr = alloc_bootmem_node(NODE_DATA(node), - nr * sizeof(*desc->kstat_irqs)); + ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), + GFP_ATOMIC, node); /* * don't overwite if can not get new one @@ -219,10 +214,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) if (desc) goto out_unlock; - if (slab_is_available()) - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - else - desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); if (!desc) { -- cgit v1.2.2 From 99558f0bbe68cb09799ec38adbaa3f3b2dc7ba63 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:34 -0800 Subject: sparseirq: Change irq_desc_ptrs to static Add replace_irq_desc() instead of poking at the array directly. -v2: remove unneeded boundary check in replace_irq_desc Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-31-git-send-email-yinghai@kernel.org> Signed-off-by: H. 
Peter Anvin --- kernel/irq/handle.c | 7 ++++++- kernel/irq/internals.h | 6 +----- kernel/irq/numa_migrate.c | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 0e823c0d1c9c..266f7986aa08 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -127,7 +127,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) */ DEFINE_RAW_SPINLOCK(sparse_irq_lock); -struct irq_desc **irq_desc_ptrs __read_mostly; +static struct irq_desc **irq_desc_ptrs __read_mostly; static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... NR_IRQS_LEGACY-1] = { @@ -192,6 +192,11 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } +void replace_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + irq_desc_ptrs[irq] = desc; +} + struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b2821f070a3d..c63f3bc88f0b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc); extern raw_spinlock_t sparse_irq_lock; #ifdef CONFIG_SPARSE_IRQ -/* irq_desc_ptrs allocated at boot time */ -extern struct irq_desc **irq_desc_ptrs; -#else -/* irq_desc_ptrs is a fixed size array */ -extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; +void replace_irq_desc(unsigned int irq, struct irq_desc *desc); #endif #ifdef CONFIG_PROC_FS diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 26bac9d8f860..963559dbd858 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -70,7 +70,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ - desc = irq_desc_ptrs[irq]; + desc = irq_to_desc(irq); if (desc && old_desc != desc) 
goto out_unlock; @@ -90,7 +90,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, goto out_unlock; } - irq_desc_ptrs[irq] = desc; + replace_irq_desc(irq, desc); raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); /* free the old one */ -- cgit v1.2.2 From b5eb78f76ddfa7caf4340cf6893b032f45d8114a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:35 -0800 Subject: sparseirq: Use radix_tree instead of ptrs array Use radix_tree irq_desc_tree instead of irq_desc_ptrs. -v2: use radix_tree_lookup_slot and radix_tree_replace_slot, as suggested by Eric and Cyrill Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-32-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- kernel/irq/handle.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 266f7986aa08..76d5a671bfe1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "internals.h" @@ -127,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) */ DEFINE_RAW_SPINLOCK(sparse_irq_lock); -static struct irq_desc **irq_desc_ptrs __read_mostly; +static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); + +static void set_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + radix_tree_insert(&irq_desc_tree, irq, desc); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return radix_tree_lookup(&irq_desc_tree, irq); +} + +void replace_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + void **ptr; + + ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); + if (ptr) + radix_tree_replace_slot(ptr, desc); +} static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... 
NR_IRQS_LEGACY-1] = { @@ -159,9 +179,6 @@ int __init early_irq_init(void) legacy_count = ARRAY_SIZE(irq_desc_legacy); node = first_online_node; - /* allocate irq_desc_ptrs array based on nr_irqs */ - irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); - /* allocate based on nr_cpu_ids */ kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * sizeof(int), GFP_NOWAIT, node); @@ -175,28 +192,12 @@ int __init early_irq_init(void) lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); alloc_desc_masks(&desc[i], node, true); init_desc_masks(&desc[i]); - irq_desc_ptrs[i] = desc + i; + set_irq_desc(i, &desc[i]); } - for (i = legacy_count; i < nr_irqs; i++) - irq_desc_ptrs[i] = NULL; - return arch_early_irq_init(); } -struct irq_desc *irq_to_desc(unsigned int irq) -{ - if (irq_desc_ptrs && irq < nr_irqs) - return irq_desc_ptrs[irq]; - - return NULL; -} - -void replace_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - irq_desc_ptrs[irq] = desc; -} - struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; @@ -208,14 +209,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) return NULL; } - desc = irq_desc_ptrs[irq]; + desc = irq_to_desc(irq); if (desc) return desc; raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ - desc = irq_desc_ptrs[irq]; + desc = irq_to_desc(irq); if (desc) goto out_unlock; @@ -228,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) } init_one_irq_desc(irq, desc, node); - irq_desc_ptrs[irq] = desc; + set_irq_desc(irq, desc); out_unlock: raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); -- cgit v1.2.2