author    Jens Axboe <axboe@kernel.dk>    2013-10-24 04:20:05 -0400
committer Jens Axboe <axboe@kernel.dk>    2013-10-25 06:56:00 -0400
commit    320ae51feed5c2f13664aa05a76bec198967e04d (patch)
tree      ad37ccbcc5ddb1c9c19e48965bf8fec1b05217dc /block/blk-core.c
parent    1dddc01af0d42b21058e0cb9c1ca9e8d5204d9b0 (diff)
blk-mq: new multi-queue block IO queueing mechanism
Linux currently has two models for block devices:

- The classic request_fn based approach, where drivers use struct request
  units for IO. The block layer provides various helper functionalities to
  let drivers share code, things like tag management, timeout handling,
  queueing, etc.

- The "stacked" approach, where a driver squeezes in between the block
  layer and IO submitter. Since this bypasses the IO stack, drivers
  generally have to manage everything themselves.

With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates back
to when both SMP and high IOPS were rare. It has problems with scaling to
bigger machines, and runs into scaling issues even on smaller machines
when you have IOPS in the hundreds of thousands per device.

The stacked approach is then most often selected as the model for the
driver. But this means that everybody has to re-invent everything, and
along with that we get all the problems again that the shared approach
solved.

This commit introduces blk-mq, block multi queue support. The design is
centered around per-cpu queues for queueing IO, which then funnel down
into x number of hardware submission queues. We might have a 1:1 mapping
between the two, or it might be an N:M mapping. That all depends on what
the hardware supports.

blk-mq provides various helper functions, which include:

- Scalable support for request tagging. Most devices need to be able to
  uniquely identify a request both in the driver and to the hardware. The
  tagging uses per-cpu caches for freed tags, to enable cache hot reuse.

- Timeout handling without tracking requests on a per-device basis.
  Basically the driver should be able to get a notification, if a request
  happens to fail.

- Optional support for non 1:1 mappings between issue and submission
  queues. blk-mq can redirect IO completions to the desired location.

- Support for per-request payloads. Drivers almost always need to
  associate a request structure with some driver private command
  structure. Drivers can tell blk-mq this at init time, and then any
  request handed to the driver will have the required size of memory
  associated with it.

- Support for merging of IO, and plugging. The stacked model gets neither
  of these. Even for high IOPS devices, merging sequential IO reduces
  per-command overhead and thus increases bandwidth.

For now, this is provided as a potential 3rd queueing model, with the
hope being that, as it matures, it can replace both the classic and
stacked model. That would get us back to having just 1 real model for
block devices, leaving the stacked approach to dm/md devices (as it was
originally intended).

Contributions in this patch from the following people:

Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>

Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'block/blk-core.c')
-rw-r--r--  block/blk-core.c  142
1 file changed, 84 insertions(+), 58 deletions(-)
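As a rough caller-side sketch (not part of this patch; the wrapper and blk-mq helper names are taken from the diff below, but the surrounding function is invented for illustration), existing users of the request allocation helpers do not need to know which queueing model backs a queue, because blk_get_request() and blk_put_request() now dispatch on q->mq_ops:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Illustrative only: a hypothetical caller, not code from this patch. */
static int example_issue_request(struct request_queue *q)
{
        struct request *rq;

        /* Routes to blk_mq_alloc_request() when q->mq_ops is set. */
        rq = blk_get_request(q, READ, GFP_KERNEL);
        if (!rq)
                return -ENOMEM;

        /* ... driver-specific setup of the request would go here ... */

        /* Routes to blk_mq_free_request() when q->mq_ops is set. */
        blk_put_request(rq);
        return 0;
}

A multi-queue driver gets q->mq_ops set at init time through the blk-mq registration interface added elsewhere in this patch series; the wrappers in this file only key off that pointer.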
diff --git a/block/blk-core.c b/block/blk-core.c
index 18faa7e81d3b..3bb9e9f7f87e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -16,6 +16,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
 #include <linux/kernel_stat.h>
@@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida);
 /*
  * For the allocated request tables
  */
-static struct kmem_cache *request_cachep;
+struct kmem_cache *request_cachep = NULL;
 
 /*
  * For queue allocation
@@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep;
  */
 static struct workqueue_struct *kblockd_workqueue;
 
-static void drive_stat_acct(struct request *rq, int new_io)
-{
-        struct hd_struct *part;
-        int rw = rq_data_dir(rq);
-        int cpu;
-
-        if (!blk_do_io_stat(rq))
-                return;
-
-        cpu = part_stat_lock();
-
-        if (!new_io) {
-                part = rq->part;
-                part_stat_inc(cpu, part, merges[rw]);
-        } else {
-                part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
-                if (!hd_struct_try_get(part)) {
-                        /*
-                         * The partition is already being removed,
-                         * the request will be accounted on the disk only
-                         *
-                         * We take a reference on disk->part0 although that
-                         * partition will never be deleted, so we can treat
-                         * it as any other partition.
-                         */
-                        part = &rq->rq_disk->part0;
-                        hd_struct_get(part);
-                }
-                part_round_stats(cpu, part);
-                part_inc_in_flight(part, rw);
-                rq->part = part;
-        }
-
-        part_stat_unlock();
-}
-
 void blk_queue_congestion_threshold(struct request_queue *q)
 {
         int nr;
@@ -594,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
         if (!q)
                 return NULL;
 
+        if (percpu_counter_init(&q->mq_usage_counter, 0))
+                goto fail_q;
+
         q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
         if (q->id < 0)
-                goto fail_q;
+                goto fail_c;
 
         q->backing_dev_info.ra_pages =
                         (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -643,6 +611,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
         q->bypass_depth = 1;
         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
 
+        init_waitqueue_head(&q->mq_freeze_wq);
+
         if (blkcg_init_queue(q))
                 goto fail_id;
 
@@ -650,6 +620,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 fail_id:
         ida_simple_remove(&blk_queue_ida, q->id);
+fail_c:
+        percpu_counter_destroy(&q->mq_usage_counter);
 fail_q:
         kmem_cache_free(blk_requestq_cachep, q);
         return NULL;
@@ -1108,7 +1080,8 @@ retry:
         goto retry;
 }
 
-struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+static struct request *blk_old_get_request(struct request_queue *q, int rw,
+                                           gfp_t gfp_mask)
 {
         struct request *rq;
 
@@ -1125,6 +1098,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 
         return rq;
 }
+
+struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+{
+        if (q->mq_ops)
+                return blk_mq_alloc_request(q, rw, gfp_mask);
+        else
+                return blk_old_get_request(q, rw, gfp_mask);
+}
 EXPORT_SYMBOL(blk_get_request);
 
 /**
@@ -1210,7 +1191,7 @@ EXPORT_SYMBOL(blk_requeue_request);
 static void add_acct_request(struct request_queue *q, struct request *rq,
                              int where)
 {
-        drive_stat_acct(rq, 1);
+        blk_account_io_start(rq, true);
         __elv_add_request(q, rq, where);
 }
 
@@ -1299,12 +1280,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request);
 
 void blk_put_request(struct request *req)
 {
-        unsigned long flags;
         struct request_queue *q = req->q;
 
-        spin_lock_irqsave(q->queue_lock, flags);
-        __blk_put_request(q, req);
-        spin_unlock_irqrestore(q->queue_lock, flags);
+        if (q->mq_ops)
+                blk_mq_free_request(req);
+        else {
+                unsigned long flags;
+
+                spin_lock_irqsave(q->queue_lock, flags);
+                __blk_put_request(q, req);
+                spin_unlock_irqrestore(q->queue_lock, flags);
+        }
 }
 EXPORT_SYMBOL(blk_put_request);
 
@@ -1340,8 +1326,8 @@ void blk_add_request_payload(struct request *rq, struct page *page,
 }
 EXPORT_SYMBOL_GPL(blk_add_request_payload);
 
-static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
-                                   struct bio *bio)
+bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+                            struct bio *bio)
 {
         const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 
@@ -1358,12 +1344,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
         req->__data_len += bio->bi_size;
         req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
-        drive_stat_acct(req, 0);
+        blk_account_io_start(req, false);
         return true;
 }
 
-static bool bio_attempt_front_merge(struct request_queue *q,
-                                    struct request *req, struct bio *bio)
+bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
+                             struct bio *bio)
 {
         const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 
@@ -1388,12 +1374,12 @@ static bool bio_attempt_front_merge(struct request_queue *q,
         req->__data_len += bio->bi_size;
         req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
-        drive_stat_acct(req, 0);
+        blk_account_io_start(req, false);
         return true;
 }
 
 /**
- * attempt_plug_merge - try to merge with %current's plugged list
+ * blk_attempt_plug_merge - try to merge with %current's plugged list
  * @q: request_queue new bio is being queued at
  * @bio: new bio being queued
  * @request_count: out parameter for number of traversed plugged requests
@@ -1409,8 +1395,8 @@ static bool bio_attempt_front_merge(struct request_queue *q,
  * reliable access to the elevator outside queue lock. Only check basic
  * merging parameters without querying the elevator.
  */
-static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
-                               unsigned int *request_count)
+bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
+                            unsigned int *request_count)
 {
         struct blk_plug *plug;
         struct request *rq;
@@ -1489,7 +1475,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
          * Check if we can merge with the plugged list before grabbing
          * any locks.
          */
-        if (attempt_plug_merge(q, bio, &request_count))
+        if (blk_attempt_plug_merge(q, bio, &request_count))
                 return;
 
         spin_lock_irq(q->queue_lock);
@@ -1557,7 +1543,7 @@ get_rq:
                         }
                 }
                 list_add_tail(&req->queuelist, &plug->list);
-                drive_stat_acct(req, 1);
+                blk_account_io_start(req, true);
         } else {
                 spin_lock_irq(q->queue_lock);
                 add_acct_request(q, req, where);
@@ -2011,7 +1997,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
 
-static void blk_account_io_completion(struct request *req, unsigned int bytes)
+void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
         if (blk_do_io_stat(req)) {
                 const int rw = rq_data_dir(req);
@@ -2025,7 +2011,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
         }
 }
 
-static void blk_account_io_done(struct request *req)
+void blk_account_io_done(struct request *req)
 {
         /*
          * Account IO completion. flush_rq isn't accounted as a
@@ -2073,6 +2059,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q,
 }
 #endif
 
+void blk_account_io_start(struct request *rq, bool new_io)
+{
+        struct hd_struct *part;
+        int rw = rq_data_dir(rq);
+        int cpu;
+
+        if (!blk_do_io_stat(rq))
+                return;
+
+        cpu = part_stat_lock();
+
+        if (!new_io) {
+                part = rq->part;
+                part_stat_inc(cpu, part, merges[rw]);
+        } else {
+                part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+                if (!hd_struct_try_get(part)) {
+                        /*
+                         * The partition is already being removed,
+                         * the request will be accounted on the disk only
+                         *
+                         * We take a reference on disk->part0 although that
+                         * partition will never be deleted, so we can treat
+                         * it as any other partition.
+                         */
+                        part = &rq->rq_disk->part0;
+                        hd_struct_get(part);
+                }
+                part_round_stats(cpu, part);
+                part_inc_in_flight(part, rw);
+                rq->part = part;
+        }
+
+        part_stat_unlock();
+}
+
 /**
  * blk_peek_request - peek at the top of a request queue
  * @q: request queue to peek at
@@ -2448,7 +2470,6 @@ static void blk_finish_request(struct request *req, int error)
         if (req->cmd_flags & REQ_DONTPREP)
                 blk_unprep_request(req);
 
-
         blk_account_io_done(req);
 
         if (req->end_io)
@@ -2870,6 +2891,7 @@ void blk_start_plug(struct blk_plug *plug)
 
         plug->magic = PLUG_MAGIC;
         INIT_LIST_HEAD(&plug->list);
+        INIT_LIST_HEAD(&plug->mq_list);
         INIT_LIST_HEAD(&plug->cb_list);
 
         /*
@@ -2967,6 +2989,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
         BUG_ON(plug->magic != PLUG_MAGIC);
 
         flush_plug_callbacks(plug, from_schedule);
+
+        if (!list_empty(&plug->mq_list))
+                blk_mq_flush_plug_list(plug, from_schedule);
+
         if (list_empty(&plug->list))
                 return;
 