author     Jens Axboe <jens.axboe@oracle.com>    2008-09-13 14:26:01 -0400
committer  Jens Axboe <jens.axboe@oracle.com>    2008-10-09 02:56:09 -0400
commit     c7c22e4d5c1fdebfac4dba76de7d0338c2b0d832 (patch)
tree       ecc3d2517b3471ccc35d4cb4e3b48d4b57205061
parent     18887ad910e56066233a07fd3cfb2fa11338b782 (diff)
block: add support for IO CPU affinity
This patch adds support for controlling the IO completion CPU of either all requests on a queue, or on a per-request basis. We export a sysfs variable (rq_affinity) which, if set, migrates completions of requests to the CPU that originally submitted them. A bio helper (bio_set_completion_cpu()) is also added, so that queuers can ask for completion on a specific CPU.

In testing, this has been shown to cut the system time by as much as 20-40% on synthetic workloads where CPU affinity is desired.

This requires a little help from the architecture, so it will only work as designed for archs that are using the new generic smp helper infrastructure.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
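As an illustrative sketch only (not part of the patch itself): a submitter that wants its bio completed back on the CPU it is running on could tag the bio before submission roughly as below. Only bio_set_completion_cpu() and BIO_CPU_AFFINE come from this patch; bio_alloc(), bio_add_page() and submit_bio() are the existing bio API, nr_vecs and rw are placeholders, and the caller is assumed to run where the submitting CPU is stable (e.g. with preemption disabled).

    struct bio *bio;

    bio = bio_alloc(GFP_NOIO, nr_vecs);
    /* ... set bi_bdev/bi_sector and add pages with bio_add_page() ... */

    /* ask for completion affinity to the submitting CPU */
    bio_set_completion_cpu(bio, smp_processor_id());
    bio->bi_flags |= 1 << BIO_CPU_AFFINE;

    submit_bio(rw, bio);

The queue-wide behaviour is toggled from userspace through the new sysfs attribute added below, /sys/block/<dev>/queue/rq_affinity.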
-rw-r--r--   block/blk-core.c            46
-rw-r--r--   block/blk-settings.c         2
-rw-r--r--   block/blk-softirq.c        126
-rw-r--r--   block/blk-sysfs.c           31
-rw-r--r--   block/blk.h                 12
-rw-r--r--   fs/bio.c                     1
-rw-r--r--   include/linux/bio.h         11
-rw-r--r--   include/linux/blkdev.h       5
-rw-r--r--   include/linux/elevator.h     8
9 files changed, 182 insertions, 60 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 9c6f818d0c33..5484838f46e7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -110,7 +110,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
         memset(rq, 0, sizeof(*rq));
 
         INIT_LIST_HEAD(&rq->queuelist);
-        INIT_LIST_HEAD(&rq->donelist);
+        rq->cpu = -1;
         rq->q = q;
         rq->sector = rq->hard_sector = (sector_t) -1;
         INIT_HLIST_NODE(&rq->hash);
@@ -322,6 +322,21 @@ void blk_unplug(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_unplug);
 
+static void blk_invoke_request_fn(struct request_queue *q)
+{
+        /*
+         * one level of recursion is ok and is much faster than kicking
+         * the unplug handling
+         */
+        if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
+                q->request_fn(q);
+                queue_flag_clear(QUEUE_FLAG_REENTER, q);
+        } else {
+                queue_flag_set(QUEUE_FLAG_PLUGGED, q);
+                kblockd_schedule_work(q, &q->unplug_work);
+        }
+}
+
 /**
  * blk_start_queue - restart a previously stopped queue
  * @q: The &struct request_queue in question
@@ -336,18 +351,7 @@ void blk_start_queue(struct request_queue *q)
         WARN_ON(!irqs_disabled());
 
         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-
-        /*
-         * one level of recursion is ok and is much faster than kicking
-         * the unplug handling
-         */
-        if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
-                q->request_fn(q);
-                queue_flag_clear(QUEUE_FLAG_REENTER, q);
-        } else {
-                blk_plug_device(q);
-                kblockd_schedule_work(q, &q->unplug_work);
-        }
+        blk_invoke_request_fn(q);
 }
 EXPORT_SYMBOL(blk_start_queue);
 
353 357
@@ -405,15 +409,8 @@ void __blk_run_queue(struct request_queue *q)
          * Only recurse once to avoid overrunning the stack, let the unplug
          * handling reinvoke the handler shortly if we already got there.
          */
-        if (!elv_queue_empty(q)) {
-                if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
-                        q->request_fn(q);
-                        queue_flag_clear(QUEUE_FLAG_REENTER, q);
-                } else {
-                        blk_plug_device(q);
-                        kblockd_schedule_work(q, &q->unplug_work);
-                }
-        }
+        if (!elv_queue_empty(q))
+                blk_invoke_request_fn(q);
 }
 EXPORT_SYMBOL(__blk_run_queue);
 
@@ -1056,6 +1053,7 @@ EXPORT_SYMBOL(blk_put_request);
 
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
+        req->cpu = bio->bi_comp_cpu;
         req->cmd_type = REQ_TYPE_FS;
 
         /*
@@ -1198,13 +1196,15 @@ get_rq:
         init_request_from_bio(req, bio);
 
         spin_lock_irq(q->queue_lock);
+        if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
+            bio_flagged(bio, BIO_CPU_AFFINE))
+                req->cpu = blk_cpu_to_group(smp_processor_id());
         if (elv_queue_empty(q))
                 blk_plug_device(q);
         add_request(q, req);
 out:
         if (sync)
                 __generic_unplug_device(q);
-
         spin_unlock_irq(q->queue_lock);
         return 0;
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d70692badcdb..a60e959a12c4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -443,7 +443,7 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 }
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
-static int __init blk_settings_init(void)
+int __init blk_settings_init(void)
 {
         blk_max_low_pfn = max_low_pfn - 1;
         blk_max_pfn = max_pfn - 1;
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 9e1c43bff662..3a1af551191e 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -13,6 +13,70 @@
 
 static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
 
+/*
+ * Softirq action handler - move entries to local list and loop over them
+ * while passing them to the queue registered handler.
+ */
+static void blk_done_softirq(struct softirq_action *h)
+{
+        struct list_head *cpu_list, local_list;
+
+        local_irq_disable();
+        cpu_list = &__get_cpu_var(blk_cpu_done);
+        list_replace_init(cpu_list, &local_list);
+        local_irq_enable();
+
+        while (!list_empty(&local_list)) {
+                struct request *rq;
+
+                rq = list_entry(local_list.next, struct request, csd.list);
+                list_del_init(&rq->csd.list);
+                rq->q->softirq_done_fn(rq);
+        }
+}
+
+#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+static void trigger_softirq(void *data)
+{
+        struct request *rq = data;
+        unsigned long flags;
+        struct list_head *list;
+
+        local_irq_save(flags);
+        list = &__get_cpu_var(blk_cpu_done);
+        list_add_tail(&rq->csd.list, list);
+
+        if (list->next == &rq->csd.list)
+                raise_softirq_irqoff(BLOCK_SOFTIRQ);
+
+        local_irq_restore(flags);
+}
+
+/*
+ * Setup and invoke a run of 'trigger_softirq' on the given cpu.
+ */
+static int raise_blk_irq(int cpu, struct request *rq)
+{
+        if (cpu_online(cpu)) {
+                struct call_single_data *data = &rq->csd;
+
+                data->func = trigger_softirq;
+                data->info = rq;
+                data->flags = 0;
+
+                __smp_call_function_single(cpu, data);
+                return 0;
+        }
+
+        return 1;
+}
+#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
+static int raise_blk_irq(int cpu, struct request *rq)
+{
+        return 1;
+}
+#endif
+
 static int __cpuinit blk_cpu_notify(struct notifier_block *self,
                                     unsigned long action, void *hcpu)
 {
@@ -33,33 +97,10 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self,
         return NOTIFY_OK;
 }
 
-
-static struct notifier_block blk_cpu_notifier __cpuinitdata = {
+static struct notifier_block __cpuinitdata blk_cpu_notifier = {
         .notifier_call  = blk_cpu_notify,
 };
 
-/*
- * splice the completion data to a local structure and hand off to
- * process_completion_queue() to complete the requests
- */
-static void blk_done_softirq(struct softirq_action *h)
-{
-        struct list_head *cpu_list, local_list;
-
-        local_irq_disable();
-        cpu_list = &__get_cpu_var(blk_cpu_done);
-        list_replace_init(cpu_list, &local_list);
-        local_irq_enable();
-
-        while (!list_empty(&local_list)) {
-                struct request *rq;
-
-                rq = list_entry(local_list.next, struct request, donelist);
-                list_del_init(&rq->donelist);
-                rq->q->softirq_done_fn(rq);
-        }
-}
-
 /**
  * blk_complete_request - end I/O on a request
  * @req: the request being processed
@@ -71,25 +112,48 @@ static void blk_done_softirq(struct softirq_action *h)
  * through a softirq handler. The user must have registered a completion
  * callback through blk_queue_softirq_done().
  **/
-
 void blk_complete_request(struct request *req)
 {
-        struct list_head *cpu_list;
+        struct request_queue *q = req->q;
         unsigned long flags;
+        int ccpu, cpu, group_cpu;
 
-        BUG_ON(!req->q->softirq_done_fn);
+        BUG_ON(!q->softirq_done_fn);
 
         local_irq_save(flags);
+        cpu = smp_processor_id();
+        group_cpu = blk_cpu_to_group(cpu);
 
-        cpu_list = &__get_cpu_var(blk_cpu_done);
-        list_add_tail(&req->donelist, cpu_list);
-        raise_softirq_irqoff(BLOCK_SOFTIRQ);
+        /*
+         * Select completion CPU
+         */
+        if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
+                ccpu = req->cpu;
+        else
+                ccpu = cpu;
+
+        if (ccpu == cpu || ccpu == group_cpu) {
+                struct list_head *list;
+do_local:
+                list = &__get_cpu_var(blk_cpu_done);
+                list_add_tail(&req->csd.list, list);
+
+                /*
+                 * if the list only contains our just added request,
+                 * signal a raise of the softirq. If there are already
+                 * entries there, someone already raised the irq but it
+                 * hasn't run yet.
+                 */
+                if (list->next == &req->csd.list)
+                        raise_softirq_irqoff(BLOCK_SOFTIRQ);
+        } else if (raise_blk_irq(ccpu, req))
+                goto do_local;
 
         local_irq_restore(flags);
 }
 EXPORT_SYMBOL(blk_complete_request);
 
-int __init blk_softirq_init(void)
+__init int blk_softirq_init(void)
 {
         int i;
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b9a6ed166649..21e275d7eed9 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -156,6 +156,30 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
         return ret;
 }
 
+static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
+{
+        unsigned int set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
+
+        return queue_var_show(set != 0, page);
+}
+
+static ssize_t
+queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
+{
+        ssize_t ret = -EINVAL;
+#if defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+        unsigned long val;
+
+        ret = queue_var_store(&val, page, count);
+        spin_lock_irq(q->queue_lock);
+        if (val)
+                queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
+        else
+                queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
+        spin_unlock_irq(q->queue_lock);
+#endif
+        return ret;
+}
 
 static struct queue_sysfs_entry queue_requests_entry = {
         .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -197,6 +221,12 @@ static struct queue_sysfs_entry queue_nomerges_entry = {
         .store = queue_nomerges_store,
 };
 
+static struct queue_sysfs_entry queue_rq_affinity_entry = {
+        .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
+        .show = queue_rq_affinity_show,
+        .store = queue_rq_affinity_store,
+};
+
 static struct attribute *default_attrs[] = {
         &queue_requests_entry.attr,
         &queue_ra_entry.attr,
@@ -205,6 +235,7 @@ static struct attribute *default_attrs[] = {
         &queue_iosched_entry.attr,
         &queue_hw_sector_size_entry.attr,
         &queue_nomerges_entry.attr,
+        &queue_rq_affinity_entry.attr,
         NULL,
 };
 
diff --git a/block/blk.h b/block/blk.h
index c79f30e1df52..de74254cb916 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -59,4 +59,16 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
 
 #endif /* BLK_DEV_INTEGRITY */
 
+static inline int blk_cpu_to_group(int cpu)
+{
+#ifdef CONFIG_SCHED_MC
+        cpumask_t mask = cpu_coregroup_map(cpu);
+        return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
+        return first_cpu(per_cpu(cpu_sibling_map, cpu));
+#else
+        return cpu;
+#endif
+}
+
 #endif
diff --git a/fs/bio.c b/fs/bio.c
index bee4deca774a..6a637b5c24b5 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -111,6 +111,7 @@ void bio_init(struct bio *bio)
 {
         memset(bio, 0, sizeof(*bio));
         bio->bi_flags = 1 << BIO_UPTODATE;
+        bio->bi_comp_cpu = -1;
         atomic_set(&bio->bi_cnt, 1);
 }
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2c0c09034fd2..13aba20edb2d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -81,6 +81,8 @@ struct bio {
 
         unsigned int            bi_max_vecs;    /* max bvl_vecs we can hold */
 
+        unsigned int            bi_comp_cpu;    /* completion CPU */
+
         struct bio_vec          *bi_io_vec;     /* the actual vec list */
 
         bio_end_io_t            *bi_end_io;
@@ -105,6 +107,7 @@ struct bio {
 #define BIO_BOUNCED     5       /* bio is a bounce bio */
 #define BIO_USER_MAPPED 6       /* contains user pages */
 #define BIO_EOPNOTSUPP  7       /* not supported */
+#define BIO_CPU_AFFINE  8       /* complete bio on same CPU as submitted */
 #define bio_flagged(bio, flag)  ((bio)->bi_flags & (1 << (flag)))
 
 /*
@@ -343,6 +346,14 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 /*
+ * Allow queuer to specify a completion CPU for this bio
+ */
+static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
+{
+        bio->bi_comp_cpu = cpu;
+}
+
+/*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.
  * These memory pools in turn all allocate from the bio_slab
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 10aa46c8f170..93204bf7b297 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include <linux/bsg.h>
+#include <linux/smp.h>
 
 #include <asm/scatterlist.h>
 
@@ -139,7 +140,8 @@ enum rq_flag_bits {
  */
 struct request {
         struct list_head queuelist;
-        struct list_head donelist;
+        struct call_single_data csd;
+        int cpu;
 
         struct request_queue *q;
 
@@ -420,6 +422,7 @@ struct request_queue
 #define QUEUE_FLAG_ELVSWITCH    8       /* don't use elevator, just do FIFO */
 #define QUEUE_FLAG_BIDI         9       /* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES    10       /* disable merge attempts */
+#define QUEUE_FLAG_SAME_COMP   11       /* force complete on same CPU */
 
 static inline int queue_is_locked(struct request_queue *q)
 {
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 639624b55fbe..bb791c311a56 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -173,15 +173,15 @@ enum {
 #define rb_entry_rq(node)       rb_entry((node), struct request, rb_node)
 
 /*
- * Hack to reuse the donelist list_head as the fifo time holder while
+ * Hack to reuse the csd.list list_head as the fifo time holder while
  * the request is in the io scheduler. Saves an unsigned long in rq.
  */
-#define rq_fifo_time(rq)        ((unsigned long) (rq)->donelist.next)
-#define rq_set_fifo_time(rq,exp)        ((rq)->donelist.next = (void *) (exp))
+#define rq_fifo_time(rq)        ((unsigned long) (rq)->csd.list.next)
+#define rq_set_fifo_time(rq,exp)        ((rq)->csd.list.next = (void *) (exp))
 #define rq_entry_fifo(ptr)      list_entry((ptr), struct request, queuelist)
 #define rq_fifo_clear(rq)       do {            \
         list_del_init(&(rq)->queuelist);        \
-        INIT_LIST_HEAD(&(rq)->donelist);        \
+        INIT_LIST_HEAD(&(rq)->csd.list);        \
         } while (0)
 
 /*
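For context, a sketch of the driver-side completion path that this change steers, using a hypothetical driver "mydrv" (not part of the patch). blk_queue_softirq_done() and blk_complete_request() are the real hooks named in the blk_complete_request() kerneldoc above; blk_end_request() and the request fields are the existing 2.6.27-era API, and the error handling is purely illustrative.

    static void mydrv_softirq_done(struct request *rq)
    {
            /*
             * Runs from BLOCK_SOFTIRQ. With rq_affinity enabled (or a
             * BIO_CPU_AFFINE bio), the softirq is now raised on the CPU
             * group that submitted the request rather than the one that
             * took the hardware interrupt.
             */
            blk_end_request(rq, rq->errors ? -EIO : 0, rq->hard_nr_sectors << 9);
    }

    /* at probe time, after the queue has been set up */
    blk_queue_softirq_done(q, mydrv_softirq_done);

    /* from the hardware interrupt handler, once the device signals completion */
    blk_complete_request(rq);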