author     Anton Blanchard <anton@samba.org>  2012-06-07 14:14:48 -0400
committer  Benjamin Herrenschmidt <benh@kernel.crashing.org>  2012-07-03 00:14:48 -0400
commit     b4c3a8729ae57b4f84d661e16a192f828eca1d03 (patch)
tree       03ff960dc63b7c60ed54cbf88f98c8b6df1823ec
parent     d362213722c8875b40d712796392682968ce685e (diff)
powerpc/iommu: Implement IOMMU pools to improve multiqueue adapter performance
At the moment all queues in a multiqueue adapter will serialise
against the IOMMU table lock. This is proving to be a big issue,
especially with 10Gbit ethernet.

This patch creates 4 pools and tries to spread the load across
them. If the table is under 1GB in size we revert back to the
original behaviour of 1 pool and 1 largealloc pool.

We create a hash to map CPUs to pools. Since we prefer interrupts to
be affinitised to primary CPUs, without some form of hashing we are
very likely to end up using the same pool. As an example, POWER7 has
4 way SMT and with 4 pools all primary threads will map to the same
pool.

The largealloc pool is reduced from 1/2 to 1/4 of the space to
partially offset the overhead of breaking the table up into pools.

Some performance numbers were obtained with a Chelsio T3 adapter on
two POWER7 boxes, running a 100 session TCP round robin test.

Performance improved 69% with this patch applied.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
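[As a standalone userspace sketch, not part of the patch: the program below shows why the hash matters. It reimplements the formula of the 2012-era hash_32() from <linux/hash.h> and compares it against naively masking the CPU number, for the primary SMT threads of a 4-way SMT machine (the CPU numbering 0, 4, 8, 12 is an assumption for illustration). The naive mapping puts every primary thread in pool 0; the hashed mapping does not.]

#include <stdio.h>
#include <stdint.h>

#define IOMMU_POOL_HASHBITS	2
#define IOMMU_NR_POOLS		(1 << IOMMU_POOL_HASHBITS)

/* Same formula as the 2012-era hash_32() in <linux/hash.h> */
static uint32_t hash_32(uint32_t val, unsigned int bits)
{
	uint32_t hash = val * 0x9e370001U;	/* GOLDEN_RATIO_PRIME_32 */

	/* High bits are more random, so use them */
	return hash >> (32 - bits);
}

int main(void)
{
	unsigned int cpu;

	/* Assumed primary SMT threads on a 4-way SMT box: CPUs 0, 4, 8, 12 */
	for (cpu = 0; cpu < 16; cpu += 4)
		printf("cpu %2u: naive pool %u, hashed pool %u\n", cpu,
		       cpu & (IOMMU_NR_POOLS - 1),
		       hash_32(cpu, IOMMU_POOL_HASHBITS));
	return 0;
}

[The naive column is 0 for all four CPUs, while the hashed column spreads them across several pools.]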
-rw-r--r--  arch/powerpc/include/asm/iommu.h     18
-rw-r--r--  arch/powerpc/kernel/iommu.c          148
-rw-r--r--  arch/powerpc/platforms/cell/iommu.c  1
3 files changed, 128 insertions, 39 deletions
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 957a83f43646..cbfe678e3dbe 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -53,6 +53,16 @@ static __inline__ __attribute_const__ int get_iommu_order(unsigned long size)
  */
 #define IOMAP_MAX_ORDER		13
 
+#define IOMMU_POOL_HASHBITS	2
+#define IOMMU_NR_POOLS		(1 << IOMMU_POOL_HASHBITS)
+
+struct iommu_pool {
+	unsigned long start;
+	unsigned long end;
+	unsigned long hint;
+	spinlock_t lock;
+} ____cacheline_aligned_in_smp;
+
 struct iommu_table {
 	unsigned long  it_busno;	/* Bus number this table belongs to */
 	unsigned long  it_size;		/* Size of iommu table in entries */
@@ -61,10 +71,10 @@ struct iommu_table {
 	unsigned long  it_index;	/* which iommu table this is */
 	unsigned long  it_type;		/* type: PCI or Virtual Bus */
 	unsigned long  it_blocksize;	/* Entries in each block (cacheline) */
-	unsigned long  it_hint;		/* Hint for next alloc */
-	unsigned long  it_largehint;	/* Hint for large allocs */
-	unsigned long  it_halfpoint;	/* Breaking point for small/large allocs */
-	spinlock_t     it_lock;		/* Protects it_map */
+	unsigned long  poolsize;
+	unsigned long  nr_pools;
+	struct iommu_pool large_pool;
+	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;		/* A simple allocation bitmap for now */
 };
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 70a212cec587..7bc94da1a837 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -33,6 +33,7 @@
 #include <linux/bitmap.h>
 #include <linux/iommu-helper.h>
 #include <linux/crash_dump.h>
+#include <linux/hash.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -58,6 +59,26 @@ static int __init setup_iommu(char *str)
 
 __setup("iommu=", setup_iommu);
 
+static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
+
+/*
+ * We precalculate the hash to avoid doing it on every allocation.
+ *
+ * The hash is important to spread CPUs across all the pools. For example,
+ * on a POWER7 with 4 way SMT we want interrupts on the primary threads and
+ * with 4 pools all primary threads would map to the same pool.
+ */
+static int __init setup_iommu_pool_hash(void)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i)
+		per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
+
+	return 0;
+}
+subsys_initcall(setup_iommu_pool_hash);
+
 static unsigned long iommu_range_alloc(struct device *dev,
 				       struct iommu_table *tbl,
 				       unsigned long npages,
@@ -72,6 +93,8 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	unsigned long align_mask;
 	unsigned long boundary_size;
 	unsigned long flags;
+	unsigned int pool_nr;
+	struct iommu_pool *pool;
 
 	align_mask = 0xffffffffffffffffl >> (64 - align_order);
 
@@ -84,38 +107,46 @@ static unsigned long iommu_range_alloc(struct device *dev,
 		return DMA_ERROR_CODE;
 	}
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
+	/*
+	 * We don't need to disable preemption here because any CPU can
+	 * safely use any IOMMU pool.
+	 */
+	pool_nr = __raw_get_cpu_var(iommu_pool_hash) & (tbl->nr_pools - 1);
 
-	if (handle && *handle)
-		start = *handle;
+	if (largealloc)
+		pool = &(tbl->large_pool);
 	else
-		start = largealloc ? tbl->it_largehint : tbl->it_hint;
+		pool = &(tbl->pools[pool_nr]);
 
-	/* Use only half of the table for small allocs (15 pages or less) */
-	limit = largealloc ? tbl->it_size : tbl->it_halfpoint;
+	spin_lock_irqsave(&(pool->lock), flags);
+
+again:
+	if ((pass == 0) && handle && *handle)
+		start = *handle;
+	else
+		start = pool->hint;
 
-	if (largealloc && start < tbl->it_halfpoint)
-		start = tbl->it_halfpoint;
+	limit = pool->end;
 
 	/* The case below can happen if we have a small segment appended
 	 * to a large, or when the previous alloc was at the very end of
 	 * the available space. If so, go back to the initial start.
 	 */
 	if (start >= limit)
-		start = largealloc ? tbl->it_largehint : tbl->it_hint;
-
- again:
+		start = pool->start;
 
 	if (limit + tbl->it_offset > mask) {
 		limit = mask - tbl->it_offset + 1;
 		/* If we're constrained on address range, first try
 		 * at the masked hint to avoid O(n) search complexity,
-		 * but on second pass, start at 0.
+		 * but on second pass, start at 0 in pool 0.
 		 */
-		if ((start & mask) >= limit || pass > 0)
-			start = 0;
-		else
+		if ((start & mask) >= limit || pass > 0) {
+			pool = &(tbl->pools[0]);
+			start = pool->start;
+		} else {
 			start &= mask;
+		}
 	}
 
 	if (dev)
@@ -129,17 +160,25 @@ static unsigned long iommu_range_alloc(struct device *dev,
 			     tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT,
 			     align_mask);
 	if (n == -1) {
-		if (likely(pass < 2)) {
-			/* First failure, just rescan the half of the table.
-			 * Second failure, rescan the other half of the table.
-			 */
-			start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
-			limit = pass ? tbl->it_size : limit;
+		if (likely(pass == 0)) {
+			/* First try the pool from the start */
+			pool->hint = pool->start;
 			pass++;
 			goto again;
+
+		} else if (pass <= tbl->nr_pools) {
+			/* Now try scanning all the other pools */
+			spin_unlock(&(pool->lock));
+			pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
+			pool = &tbl->pools[pool_nr];
+			spin_lock(&(pool->lock));
+			pool->hint = pool->start;
+			pass++;
+			goto again;
+
 		} else {
-			/* Third failure, give up */
-			spin_unlock_irqrestore(&(tbl->it_lock), flags);
+			/* Give up */
+			spin_unlock_irqrestore(&(pool->lock), flags);
 			return DMA_ERROR_CODE;
 		}
 	}
@@ -149,10 +188,10 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	/* Bump the hint to a new block for small allocs. */
 	if (largealloc) {
 		/* Don't bump to new block to avoid fragmentation */
-		tbl->it_largehint = end;
+		pool->hint = end;
 	} else {
 		/* Overflow will be taken care of at the next allocation */
-		tbl->it_hint = (end + tbl->it_blocksize - 1) &
+		pool->hint = (end + tbl->it_blocksize - 1) &
 				~(tbl->it_blocksize - 1);
 	}
 
@@ -160,7 +199,8 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	if (handle)
 		*handle = end;
 
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+	spin_unlock_irqrestore(&(pool->lock), flags);
+
 	return n;
 }
 
@@ -235,23 +275,45 @@ static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
 	return true;
 }
 
+static struct iommu_pool *get_pool(struct iommu_table *tbl,
+				   unsigned long entry)
+{
+	struct iommu_pool *p;
+	unsigned long largepool_start = tbl->large_pool.start;
+
+	/* The large pool is the last pool at the top of the table */
+	if (entry >= largepool_start) {
+		p = &tbl->large_pool;
+	} else {
+		unsigned int pool_nr = entry / tbl->poolsize;
+
+		BUG_ON(pool_nr > tbl->nr_pools);
+		p = &tbl->pools[pool_nr];
+	}
+
+	return p;
+}
+
 static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 			 unsigned int npages)
 {
 	unsigned long entry, free_entry;
 	unsigned long flags;
+	struct iommu_pool *pool;
 
 	entry = dma_addr >> IOMMU_PAGE_SHIFT;
 	free_entry = entry - tbl->it_offset;
 
+	pool = get_pool(tbl, free_entry);
+
 	if (!iommu_free_check(tbl, dma_addr, npages))
 		return;
 
 	ppc_md.tce_free(tbl, entry, npages);
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
+	spin_lock_irqsave(&(pool->lock), flags);
 	bitmap_clear(tbl->it_map, free_entry, npages);
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
+	spin_unlock_irqrestore(&(pool->lock), flags);
 }
 
 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -493,9 +555,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	unsigned long sz;
 	static int welcomed = 0;
 	struct page *page;
-
-	/* Set aside 1/4 of the table for large allocations. */
-	tbl->it_halfpoint = tbl->it_size * 3 / 4;
+	unsigned int i;
+	struct iommu_pool *p;
 
 	/* number of bytes needed for the bitmap */
 	sz = (tbl->it_size + 7) >> 3;
@@ -514,9 +575,28 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
 
-	tbl->it_hint = 0;
-	tbl->it_largehint = tbl->it_halfpoint;
-	spin_lock_init(&tbl->it_lock);
+	/* We only split the IOMMU table if we have 1GB or more of space */
+	if ((tbl->it_size << IOMMU_PAGE_SHIFT) >= (1UL * 1024 * 1024 * 1024))
+		tbl->nr_pools = IOMMU_NR_POOLS;
+	else
+		tbl->nr_pools = 1;
+
+	/* We reserve the top 1/4 of the table for large allocations */
+	tbl->poolsize = (tbl->it_size * 3 / 4) / IOMMU_NR_POOLS;
+
+	for (i = 0; i < IOMMU_NR_POOLS; i++) {
+		p = &tbl->pools[i];
+		spin_lock_init(&(p->lock));
+		p->start = tbl->poolsize * i;
+		p->hint = p->start;
+		p->end = p->start + tbl->poolsize;
+	}
+
+	p = &tbl->large_pool;
+	spin_lock_init(&(p->lock));
+	p->start = tbl->poolsize * i;
+	p->hint = p->start;
+	p->end = tbl->it_size;
 
 	iommu_table_clear(tbl);
 
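[For concreteness, a userspace sketch of the pool layout that iommu_init_table() above sets up; the 1GB window and 4K IOMMU page size are assumed values, not taken from the patch. With it_size = 262144 entries, the bottom 3/4 of the table is carved into four small pools of 49152 entries and the top 1/4 becomes the large pool, which is exactly the boundary arithmetic get_pool() inverts with entry / poolsize.]

#include <stdio.h>

int main(void)
{
	unsigned long it_size = 262144;	/* assumed: 1GB window / 4K IOMMU pages */
	unsigned long poolsize = (it_size * 3 / 4) / 4;	/* IOMMU_NR_POOLS = 4 */
	unsigned int i;

	for (i = 0; i < 4; i++)
		printf("pool %u:     [%6lu, %6lu)\n", i,
		       poolsize * i, poolsize * (i + 1));
	printf("large pool: [%6lu, %6lu)\n", 4 * poolsize, it_size);
	return 0;
}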
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index b9f509a34c01..c264969c9319 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -518,7 +518,6 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	__set_bit(0, window->table.it_map);
 	tce_build_cell(&window->table, window->table.it_offset, 1,
 		       (unsigned long)iommu->pad_page, DMA_TO_DEVICE, NULL);
-	window->table.it_hint = window->table.it_blocksize;
 
 	return window;
 }