aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2011-07-23 14:44:25 -0400
committerJens Axboe <jaxboe@fusionio.com>2011-07-23 14:44:25 -0400
commit5757a6d76cdf6dda2a492c09b985c015e86779b1 (patch)
tree6356a6353639eb473dd917a1b2062f9e7e20de22
parentef3230880abd36553ab442363d3c9a0661f00769 (diff)
block: strict rq_affinity
Some systems benefit from completions always being steered to the strict requester cpu rather than the looser "per-socket" steering that blk_cpu_to_group() attempts by default. This is because the first CPU in the group mask ends up being completely overloaded with work, while the others (including the original submitter) have power left to spare. Allow the strict mode to be set by writing '2' to the sysfs control file. This is identical to the scheme used for the nomerges file, where '2' is a more aggressive setting than just being turned on. echo 2 > /sys/block/<bdev>/queue/rq_affinity Cc: Christoph Hellwig <hch@infradead.org> Cc: Roland Dreier <roland@purestorage.com> Tested-by: Dave Jiang <dave.jiang@intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
-rw-r--r--Documentation/block/queue-sysfs.txt10
-rw-r--r--block/blk-core.c6
-rw-r--r--block/blk-softirq.c11
-rw-r--r--block/blk-sysfs.c13
-rw-r--r--include/linux/blkdev.h3
5 files changed, 27 insertions, 16 deletions
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index f65274081c8d..d8147b336c35 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -45,9 +45,13 @@ device.
45 45
46rq_affinity (RW) 46rq_affinity (RW)
47---------------- 47----------------
48If this option is enabled, the block layer will migrate request completions 48If this option is '1', the block layer will migrate request completions to the
49to the CPU that originally submitted the request. For some workloads 49cpu "group" that originally submitted the request. For some workloads this
50this provides a significant reduction in CPU cycles due to caching effects. 50provides a significant reduction in CPU cycles due to caching effects.
51
52For storage configurations that need to maximize distribution of completion
53processing, setting this option to '2' forces the completion to run on the
54requesting cpu (bypassing the "group" aggregation logic).
51 55
52scheduler (RW) 56scheduler (RW)
53-------------- 57--------------
diff --git a/block/blk-core.c b/block/blk-core.c
index a56485292062..b3228255304d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1279,10 +1279,8 @@ get_rq:
1279 init_request_from_bio(req, bio); 1279 init_request_from_bio(req, bio);
1280 1280
1281 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || 1281 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1282 bio_flagged(bio, BIO_CPU_AFFINE)) { 1282 bio_flagged(bio, BIO_CPU_AFFINE))
1283 req->cpu = blk_cpu_to_group(get_cpu()); 1283 req->cpu = smp_processor_id();
1284 put_cpu();
1285 }
1286 1284
1287 plug = current->plug; 1285 plug = current->plug;
1288 if (plug) { 1286 if (plug) {
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ee9c21602228..475fab809a80 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -103,22 +103,25 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
103 103
104void __blk_complete_request(struct request *req) 104void __blk_complete_request(struct request *req)
105{ 105{
106 int ccpu, cpu, group_cpu = NR_CPUS;
106 struct request_queue *q = req->q; 107 struct request_queue *q = req->q;
107 unsigned long flags; 108 unsigned long flags;
108 int ccpu, cpu, group_cpu;
109 109
110 BUG_ON(!q->softirq_done_fn); 110 BUG_ON(!q->softirq_done_fn);
111 111
112 local_irq_save(flags); 112 local_irq_save(flags);
113 cpu = smp_processor_id(); 113 cpu = smp_processor_id();
114 group_cpu = blk_cpu_to_group(cpu);
115 114
116 /* 115 /*
117 * Select completion CPU 116 * Select completion CPU
118 */ 117 */
119 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) 118 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
120 ccpu = req->cpu; 119 ccpu = req->cpu;
121 else 120 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
121 ccpu = blk_cpu_to_group(ccpu);
122 group_cpu = blk_cpu_to_group(cpu);
123 }
124 } else
122 ccpu = cpu; 125 ccpu = cpu;
123 126
124 if (ccpu == cpu || ccpu == group_cpu) { 127 if (ccpu == cpu || ccpu == group_cpu) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d935bd859c87..0ee17b5e7fb6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -244,8 +244,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
244static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) 244static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
245{ 245{
246 bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); 246 bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
247 bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);
247 248
248 return queue_var_show(set, page); 249 return queue_var_show(set << force, page);
249} 250}
250 251
251static ssize_t 252static ssize_t
@@ -257,10 +258,14 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
257 258
258 ret = queue_var_store(&val, page, count); 259 ret = queue_var_store(&val, page, count);
259 spin_lock_irq(q->queue_lock); 260 spin_lock_irq(q->queue_lock);
260 if (val) 261 if (val) {
261 queue_flag_set(QUEUE_FLAG_SAME_COMP, q); 262 queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
262 else 263 if (val == 2)
263 queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); 264 queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
265 } else {
266 queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
267 queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
268 }
264 spin_unlock_irq(q->queue_lock); 269 spin_unlock_irq(q->queue_lock);
265#endif 270#endif
266 return ret; 271 return ret;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c0cd9a2f22ef..0e67c45b3bc9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -392,7 +392,7 @@ struct request_queue {
392#define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */ 392#define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */
393#define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ 393#define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */
394#define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ 394#define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */
395#define QUEUE_FLAG_SAME_COMP 9 /* force complete on same CPU */ 395#define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */
396#define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */ 396#define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */
397#define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */ 397#define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */
398#define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */ 398#define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */
@@ -402,6 +402,7 @@ struct request_queue {
402#define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ 402#define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */
403#define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ 403#define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */
404#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ 404#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */
405#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */
405 406
406#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 407#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
407 (1 << QUEUE_FLAG_STACKABLE) | \ 408 (1 << QUEUE_FLAG_STACKABLE) | \