aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/apic
diff options
context:
space:
mode:
authorAlexander Gordeev <agordeev@redhat.com>2012-11-19 10:01:29 -0500
committerIngo Molnar <mingo@kernel.org>2013-01-24 11:25:12 -0500
commit51906e779f2b13b38f8153774c4c7163d412ffd9 (patch)
tree970633752f6a5cea156226cd31457289ba16f1c5 /arch/x86/kernel/apic
parent4cca6ea04d31c22a7d0436949c072b27bde41f86 (diff)
x86/MSI: Support multiple MSIs in presence of IRQ remapping
The MSI specification has several constraints in comparison with MSI-X, most notable of them is the inability to configure MSIs independently. As a result, it is impossible to dispatch interrupts from different queues to different CPUs. This largely devalues the support of multiple MSIs in SMP systems. Also, a necessity to allocate a contiguous block of vector numbers for devices capable of multiple MSIs might cause a considerable pressure on x86 interrupt vector allocator and could lead to fragmentation of the interrupt vectors space. This patch overcomes both drawbacks in presence of IRQ remapping and lets devices take advantage of multiple queues and per-IRQ affinity assignments. Signed-off-by: Alexander Gordeev <agordeev@redhat.com> Cc: Bjorn Helgaas <bhelgaas@google.com> Cc: Suresh Siddha <suresh.b.siddha@intel.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Jeff Garzik <jgarzik@pobox.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/c8bd86ff56b5fc118257436768aaa04489ac0a4c.1353324359.git.agordeev@redhat.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/kernel/apic')
-rw-r--r--arch/x86/kernel/apic/io_apic.c165
1 files changed, 133 insertions, 32 deletions
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index b739d398bb29..2016f9dabd72 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -300,9 +300,9 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
300 return cfg; 300 return cfg;
301} 301}
302 302
303static int alloc_irq_from(unsigned int from, int node) 303static int alloc_irqs_from(unsigned int from, unsigned int count, int node)
304{ 304{
305 return irq_alloc_desc_from(from, node); 305 return irq_alloc_descs_from(from, count, node);
306} 306}
307 307
308static void free_irq_at(unsigned int at, struct irq_cfg *cfg) 308static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
@@ -2982,37 +2982,58 @@ device_initcall(ioapic_init_ops);
2982/* 2982/*
2983 * Dynamic irq allocate and deallocation 2983 * Dynamic irq allocate and deallocation
2984 */ 2984 */
2985unsigned int create_irq_nr(unsigned int from, int node) 2985unsigned int __create_irqs(unsigned int from, unsigned int count, int node)
2986{ 2986{
2987 struct irq_cfg *cfg; 2987 struct irq_cfg **cfg;
2988 unsigned long flags; 2988 unsigned long flags;
2989 unsigned int ret = 0; 2989 int irq, i;
2990 int irq;
2991 2990
2992 if (from < nr_irqs_gsi) 2991 if (from < nr_irqs_gsi)
2993 from = nr_irqs_gsi; 2992 from = nr_irqs_gsi;
2994 2993
2995 irq = alloc_irq_from(from, node); 2994 cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node);
2996 if (irq < 0) 2995 if (!cfg)
2997 return 0;
2998 cfg = alloc_irq_cfg(irq, node);
2999 if (!cfg) {
3000 free_irq_at(irq, NULL);
3001 return 0; 2996 return 0;
2997
2998 irq = alloc_irqs_from(from, count, node);
2999 if (irq < 0)
3000 goto out_cfgs;
3001
3002 for (i = 0; i < count; i++) {
3003 cfg[i] = alloc_irq_cfg(irq + i, node);
3004 if (!cfg[i])
3005 goto out_irqs;
3002 } 3006 }
3003 3007
3004 raw_spin_lock_irqsave(&vector_lock, flags); 3008 raw_spin_lock_irqsave(&vector_lock, flags);
3005 if (!__assign_irq_vector(irq, cfg, apic->target_cpus())) 3009 for (i = 0; i < count; i++)
3006 ret = irq; 3010 if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus()))
3011 goto out_vecs;
3007 raw_spin_unlock_irqrestore(&vector_lock, flags); 3012 raw_spin_unlock_irqrestore(&vector_lock, flags);
3008 3013
3009 if (ret) { 3014 for (i = 0; i < count; i++) {
3010 irq_set_chip_data(irq, cfg); 3015 irq_set_chip_data(irq + i, cfg[i]);
3011 irq_clear_status_flags(irq, IRQ_NOREQUEST); 3016 irq_clear_status_flags(irq + i, IRQ_NOREQUEST);
3012 } else {
3013 free_irq_at(irq, cfg);
3014 } 3017 }
3015 return ret; 3018
3019 kfree(cfg);
3020 return irq;
3021
3022out_vecs:
3023 for (i--; i >= 0; i--)
3024 __clear_irq_vector(irq + i, cfg[i]);
3025 raw_spin_unlock_irqrestore(&vector_lock, flags);
3026out_irqs:
3027 for (i = 0; i < count; i++)
3028 free_irq_at(irq + i, cfg[i]);
3029out_cfgs:
3030 kfree(cfg);
3031 return 0;
3032}
3033
3034unsigned int create_irq_nr(unsigned int from, int node)
3035{
3036 return __create_irqs(from, 1, node);
3016} 3037}
3017 3038
3018int create_irq(void) 3039int create_irq(void)
@@ -3045,6 +3066,14 @@ void destroy_irq(unsigned int irq)
3045 free_irq_at(irq, cfg); 3066 free_irq_at(irq, cfg);
3046} 3067}
3047 3068
3069static inline void destroy_irqs(unsigned int irq, unsigned int count)
3070{
3071 unsigned int i;
3072
3073 for (i = 0; i < count; i++)
3074 destroy_irq(irq + i);
3075}
3076
3048/* 3077/*
3049 * MSI message composition 3078 * MSI message composition
3050 */ 3079 */
@@ -3071,7 +3100,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3071 3100
3072 if (irq_remapped(cfg)) { 3101 if (irq_remapped(cfg)) {
3073 compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); 3102 compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
3074 return err; 3103 return 0;
3075 } 3104 }
3076 3105
3077 if (x2apic_enabled()) 3106 if (x2apic_enabled())
@@ -3098,7 +3127,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3098 MSI_DATA_DELIVERY_LOWPRI) | 3127 MSI_DATA_DELIVERY_LOWPRI) |
3099 MSI_DATA_VECTOR(cfg->vector); 3128 MSI_DATA_VECTOR(cfg->vector);
3100 3129
3101 return err; 3130 return 0;
3102} 3131}
3103 3132
3104static int 3133static int
@@ -3136,18 +3165,26 @@ static struct irq_chip msi_chip = {
3136 .irq_retrigger = ioapic_retrigger_irq, 3165 .irq_retrigger = ioapic_retrigger_irq,
3137}; 3166};
3138 3167
3139static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3168static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
3169 unsigned int irq_base, unsigned int irq_offset)
3140{ 3170{
3141 struct irq_chip *chip = &msi_chip; 3171 struct irq_chip *chip = &msi_chip;
3142 struct msi_msg msg; 3172 struct msi_msg msg;
3173 unsigned int irq = irq_base + irq_offset;
3143 int ret; 3174 int ret;
3144 3175
3145 ret = msi_compose_msg(dev, irq, &msg, -1); 3176 ret = msi_compose_msg(dev, irq, &msg, -1);
3146 if (ret < 0) 3177 if (ret < 0)
3147 return ret; 3178 return ret;
3148 3179
3149 irq_set_msi_desc(irq, msidesc); 3180 irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
3150 write_msi_msg(irq, &msg); 3181
3182 /*
3183 * MSI-X message is written per-IRQ, the offset is always 0.
3184 * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
3185 */
3186 if (!irq_offset)
3187 write_msi_msg(irq, &msg);
3151 3188
3152 if (irq_remapped(irq_get_chip_data(irq))) { 3189 if (irq_remapped(irq_get_chip_data(irq))) {
3153 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 3190 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
@@ -3161,23 +3198,19 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3161 return 0; 3198 return 0;
3162} 3199}
3163 3200
3164int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 3201int setup_msix_irqs(struct pci_dev *dev, int nvec)
3165{ 3202{
3166 int node, ret, sub_handle, index = 0; 3203 int node, ret, sub_handle, index = 0;
3167 unsigned int irq, irq_want; 3204 unsigned int irq, irq_want;
3168 struct msi_desc *msidesc; 3205 struct msi_desc *msidesc;
3169 3206
3170 /* x86 doesn't support multiple MSI yet */
3171 if (type == PCI_CAP_ID_MSI && nvec > 1)
3172 return 1;
3173
3174 node = dev_to_node(&dev->dev); 3207 node = dev_to_node(&dev->dev);
3175 irq_want = nr_irqs_gsi; 3208 irq_want = nr_irqs_gsi;
3176 sub_handle = 0; 3209 sub_handle = 0;
3177 list_for_each_entry(msidesc, &dev->msi_list, list) { 3210 list_for_each_entry(msidesc, &dev->msi_list, list) {
3178 irq = create_irq_nr(irq_want, node); 3211 irq = create_irq_nr(irq_want, node);
3179 if (irq == 0) 3212 if (irq == 0)
3180 return -1; 3213 return -ENOSPC;
3181 irq_want = irq + 1; 3214 irq_want = irq + 1;
3182 if (!irq_remapping_enabled) 3215 if (!irq_remapping_enabled)
3183 goto no_ir; 3216 goto no_ir;
@@ -3199,7 +3232,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3199 goto error; 3232 goto error;
3200 } 3233 }
3201no_ir: 3234no_ir:
3202 ret = setup_msi_irq(dev, msidesc, irq); 3235 ret = setup_msi_irq(dev, msidesc, irq, 0);
3203 if (ret < 0) 3236 if (ret < 0)
3204 goto error; 3237 goto error;
3205 sub_handle++; 3238 sub_handle++;
@@ -3211,6 +3244,74 @@ error:
3211 return ret; 3244 return ret;
3212} 3245}
3213 3246
3247int setup_msi_irqs(struct pci_dev *dev, int nvec)
3248{
3249 int node, ret, sub_handle, index = 0;
3250 unsigned int irq;
3251 struct msi_desc *msidesc;
3252
3253 if (nvec > 1 && !irq_remapping_enabled)
3254 return 1;
3255
3256 nvec = __roundup_pow_of_two(nvec);
3257
3258 WARN_ON(!list_is_singular(&dev->msi_list));
3259 msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
3260 WARN_ON(msidesc->irq);
3261 WARN_ON(msidesc->msi_attrib.multiple);
3262
3263 node = dev_to_node(&dev->dev);
3264 irq = __create_irqs(nr_irqs_gsi, nvec, node);
3265 if (irq == 0)
3266 return -ENOSPC;
3267
3268 if (!irq_remapping_enabled) {
3269 ret = setup_msi_irq(dev, msidesc, irq, 0);
3270 if (ret < 0)
3271 goto error;
3272 return 0;
3273 }
3274
3275 msidesc->msi_attrib.multiple = ilog2(nvec);
3276 for (sub_handle = 0; sub_handle < nvec; sub_handle++) {
3277 if (!sub_handle) {
3278 index = msi_alloc_remapped_irq(dev, irq, nvec);
3279 if (index < 0) {
3280 ret = index;
3281 goto error;
3282 }
3283 } else {
3284 ret = msi_setup_remapped_irq(dev, irq + sub_handle,
3285 index, sub_handle);
3286 if (ret < 0)
3287 goto error;
3288 }
3289 ret = setup_msi_irq(dev, msidesc, irq, sub_handle);
3290 if (ret < 0)
3291 goto error;
3292 }
3293 return 0;
3294
3295error:
3296 destroy_irqs(irq, nvec);
3297
3298 /*
3299 * Restore altered MSI descriptor fields and prevent just destroyed
3300 * IRQs from tearing down again in default_teardown_msi_irqs()
3301 */
3302 msidesc->irq = 0;
3303 msidesc->msi_attrib.multiple = 0;
3304
3305 return ret;
3306}
3307
3308int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3309{
3310 if (type == PCI_CAP_ID_MSI)
3311 return setup_msi_irqs(dev, nvec);
3312 return setup_msix_irqs(dev, nvec);
3313}
3314
3214void native_teardown_msi_irq(unsigned int irq) 3315void native_teardown_msi_irq(unsigned int irq)
3215{ 3316{
3216 destroy_irq(irq); 3317 destroy_irq(irq);