author     Jens Axboe <axboe@kernel.dk>   2013-11-08 11:08:12 -0500
committer  Jens Axboe <axboe@kernel.dk>   2013-11-08 11:08:12 -0500
commit     e37459b8e2c7db6735e39e019e448b76e5e77647 (patch)
tree       a3f0944db87a8ae0d41e5acbbbabc1e7ef534d1b /block
parent     c7d1ba417c7cb7297d14dd47a390ec90ce548d5c (diff)
parent     e7e245000110a7794de8f925b9edc06a9c852f80 (diff)
Merge branch 'blk-mq/core' into for-3.13/core
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
block/blk-timeout.c
Diffstat (limited to 'block')

 -rw-r--r--  block/Makefile         |    5
 -rw-r--r--  block/blk-core.c       |  157
 -rw-r--r--  block/blk-exec.c       |   14
 -rw-r--r--  block/blk-flush.c      |  154
 -rw-r--r--  block/blk-merge.c      |   17
 -rw-r--r--  block/blk-mq-cpu.c     |   93
 -rw-r--r--  block/blk-mq-cpumap.c  |  108
 -rw-r--r--  block/blk-mq-sysfs.c   |  384
 -rw-r--r--  block/blk-mq-tag.c     |  204
 -rw-r--r--  block/blk-mq-tag.h     |   27
 -rw-r--r--  block/blk-mq.c         | 1500
 -rw-r--r--  block/blk-mq.h         |   52
 -rw-r--r--  block/blk-sysfs.c      |   13
 -rw-r--r--  block/blk-timeout.c    |   74
 -rw-r--r--  block/blk.h            |   17

 15 files changed, 2701 insertions, 118 deletions
diff --git a/block/Makefile b/block/Makefile
index 671a83d063a5..20645e88fb57 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,8 +5,9 @@ | |||
5 | obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ | 5 | obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ |
6 | blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ | 6 | blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ |
7 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ | 7 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ |
8 | blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ | 8 | blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ |
9 | partition-generic.o partitions/ | 9 | blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ |
10 | genhd.o scsi_ioctl.o partition-generic.o partitions/ | ||
10 | 11 | ||
11 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o | 12 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o |
12 | obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o | 13 | obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o |
diff --git a/block/blk-core.c b/block/blk-core.c
index 25f13479f552..8bdd0121212a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/bio.h> | 17 | #include <linux/bio.h> |
18 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
19 | #include <linux/blk-mq.h> | ||
19 | #include <linux/highmem.h> | 20 | #include <linux/highmem.h> |
20 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
21 | #include <linux/kernel_stat.h> | 22 | #include <linux/kernel_stat.h> |
@@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida); | |||
48 | /* | 49 | /* |
49 | * For the allocated request tables | 50 | * For the allocated request tables |
50 | */ | 51 | */ |
51 | static struct kmem_cache *request_cachep; | 52 | struct kmem_cache *request_cachep = NULL; |
52 | 53 | ||
53 | /* | 54 | /* |
54 | * For queue allocation | 55 | * For queue allocation |
@@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep; | |||
60 | */ | 61 | */ |
61 | static struct workqueue_struct *kblockd_workqueue; | 62 | static struct workqueue_struct *kblockd_workqueue; |
62 | 63 | ||
63 | static void drive_stat_acct(struct request *rq, int new_io) | ||
64 | { | ||
65 | struct hd_struct *part; | ||
66 | int rw = rq_data_dir(rq); | ||
67 | int cpu; | ||
68 | |||
69 | if (!blk_do_io_stat(rq)) | ||
70 | return; | ||
71 | |||
72 | cpu = part_stat_lock(); | ||
73 | |||
74 | if (!new_io) { | ||
75 | part = rq->part; | ||
76 | part_stat_inc(cpu, part, merges[rw]); | ||
77 | } else { | ||
78 | part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); | ||
79 | if (!hd_struct_try_get(part)) { | ||
80 | /* | ||
81 | * The partition is already being removed, | ||
82 | * the request will be accounted on the disk only | ||
83 | * | ||
84 | * We take a reference on disk->part0 although that | ||
85 | * partition will never be deleted, so we can treat | ||
86 | * it as any other partition. | ||
87 | */ | ||
88 | part = &rq->rq_disk->part0; | ||
89 | hd_struct_get(part); | ||
90 | } | ||
91 | part_round_stats(cpu, part); | ||
92 | part_inc_in_flight(part, rw); | ||
93 | rq->part = part; | ||
94 | } | ||
95 | |||
96 | part_stat_unlock(); | ||
97 | } | ||
98 | |||
99 | void blk_queue_congestion_threshold(struct request_queue *q) | 64 | void blk_queue_congestion_threshold(struct request_queue *q) |
100 | { | 65 | { |
101 | int nr; | 66 | int nr; |
@@ -145,7 +110,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) | |||
145 | rq->cmd = rq->__cmd; | 110 | rq->cmd = rq->__cmd; |
146 | rq->cmd_len = BLK_MAX_CDB; | 111 | rq->cmd_len = BLK_MAX_CDB; |
147 | rq->tag = -1; | 112 | rq->tag = -1; |
148 | rq->ref_count = 1; | ||
149 | rq->start_time = jiffies; | 113 | rq->start_time = jiffies; |
150 | set_start_time_ns(rq); | 114 | set_start_time_ns(rq); |
151 | rq->part = NULL; | 115 | rq->part = NULL; |
@@ -174,9 +138,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg) | |||
174 | { | 138 | { |
175 | int bit; | 139 | int bit; |
176 | 140 | ||
177 | printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, | 141 | printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg, |
178 | rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, | 142 | rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, |
179 | rq->cmd_flags); | 143 | (unsigned long long) rq->cmd_flags); |
180 | 144 | ||
181 | printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", | 145 | printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", |
182 | (unsigned long long)blk_rq_pos(rq), | 146 | (unsigned long long)blk_rq_pos(rq), |
@@ -595,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
595 | if (!q) | 559 | if (!q) |
596 | return NULL; | 560 | return NULL; |
597 | 561 | ||
562 | if (percpu_counter_init(&q->mq_usage_counter, 0)) | ||
563 | goto fail_q; | ||
564 | |||
598 | q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); | 565 | q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); |
599 | if (q->id < 0) | 566 | if (q->id < 0) |
600 | goto fail_q; | 567 | goto fail_c; |
601 | 568 | ||
602 | q->backing_dev_info.ra_pages = | 569 | q->backing_dev_info.ra_pages = |
603 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 570 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
@@ -644,6 +611,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
644 | q->bypass_depth = 1; | 611 | q->bypass_depth = 1; |
645 | __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); | 612 | __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); |
646 | 613 | ||
614 | init_waitqueue_head(&q->mq_freeze_wq); | ||
615 | |||
647 | if (blkcg_init_queue(q)) | 616 | if (blkcg_init_queue(q)) |
648 | goto fail_bdi; | 617 | goto fail_bdi; |
649 | 618 | ||
@@ -653,6 +622,8 @@ fail_bdi: | |||
653 | bdi_destroy(&q->backing_dev_info); | 622 | bdi_destroy(&q->backing_dev_info); |
654 | fail_id: | 623 | fail_id: |
655 | ida_simple_remove(&blk_queue_ida, q->id); | 624 | ida_simple_remove(&blk_queue_ida, q->id); |
625 | fail_c: | ||
626 | percpu_counter_destroy(&q->mq_usage_counter); | ||
656 | fail_q: | 627 | fail_q: |
657 | kmem_cache_free(blk_requestq_cachep, q); | 628 | kmem_cache_free(blk_requestq_cachep, q); |
658 | return NULL; | 629 | return NULL; |
@@ -1119,7 +1090,8 @@ retry: | |||
1119 | goto retry; | 1090 | goto retry; |
1120 | } | 1091 | } |
1121 | 1092 | ||
1122 | struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) | 1093 | static struct request *blk_old_get_request(struct request_queue *q, int rw, |
1094 | gfp_t gfp_mask) | ||
1123 | { | 1095 | { |
1124 | struct request *rq; | 1096 | struct request *rq; |
1125 | 1097 | ||
@@ -1136,6 +1108,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) | |||
1136 | 1108 | ||
1137 | return rq; | 1109 | return rq; |
1138 | } | 1110 | } |
1111 | |||
1112 | struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) | ||
1113 | { | ||
1114 | if (q->mq_ops) | ||
1115 | return blk_mq_alloc_request(q, rw, gfp_mask, false); | ||
1116 | else | ||
1117 | return blk_old_get_request(q, rw, gfp_mask); | ||
1118 | } | ||
1139 | EXPORT_SYMBOL(blk_get_request); | 1119 | EXPORT_SYMBOL(blk_get_request); |
1140 | 1120 | ||
1141 | /** | 1121 | /** |
@@ -1221,7 +1201,7 @@ EXPORT_SYMBOL(blk_requeue_request); | |||
1221 | static void add_acct_request(struct request_queue *q, struct request *rq, | 1201 | static void add_acct_request(struct request_queue *q, struct request *rq, |
1222 | int where) | 1202 | int where) |
1223 | { | 1203 | { |
1224 | drive_stat_acct(rq, 1); | 1204 | blk_account_io_start(rq, true); |
1225 | __elv_add_request(q, rq, where); | 1205 | __elv_add_request(q, rq, where); |
1226 | } | 1206 | } |
1227 | 1207 | ||
@@ -1282,8 +1262,6 @@ void __blk_put_request(struct request_queue *q, struct request *req) | |||
1282 | { | 1262 | { |
1283 | if (unlikely(!q)) | 1263 | if (unlikely(!q)) |
1284 | return; | 1264 | return; |
1285 | if (unlikely(--req->ref_count)) | ||
1286 | return; | ||
1287 | 1265 | ||
1288 | blk_pm_put_request(req); | 1266 | blk_pm_put_request(req); |
1289 | 1267 | ||
@@ -1312,12 +1290,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request); | |||
1312 | 1290 | ||
1313 | void blk_put_request(struct request *req) | 1291 | void blk_put_request(struct request *req) |
1314 | { | 1292 | { |
1315 | unsigned long flags; | ||
1316 | struct request_queue *q = req->q; | 1293 | struct request_queue *q = req->q; |
1317 | 1294 | ||
1318 | spin_lock_irqsave(q->queue_lock, flags); | 1295 | if (q->mq_ops) |
1319 | __blk_put_request(q, req); | 1296 | blk_mq_free_request(req); |
1320 | spin_unlock_irqrestore(q->queue_lock, flags); | 1297 | else { |
1298 | unsigned long flags; | ||
1299 | |||
1300 | spin_lock_irqsave(q->queue_lock, flags); | ||
1301 | __blk_put_request(q, req); | ||
1302 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
1303 | } | ||
1321 | } | 1304 | } |
1322 | EXPORT_SYMBOL(blk_put_request); | 1305 | EXPORT_SYMBOL(blk_put_request); |
1323 | 1306 | ||
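The two hunks above make request allocation and freeing transparent to callers: blk_get_request() and blk_put_request() now branch on q->mq_ops internally. A minimal, hypothetical caller-side sketch (illustration only, not part of this commit):

```c
#include <linux/blkdev.h>

/* Hypothetical caller (illustration only): the same code now works whether
 * q is a blk-mq queue or a legacy request_fn queue, because the allocation
 * and free paths shown above pick the right implementation via q->mq_ops.
 */
static int example_alloc_and_free(struct request_queue *q)
{
	struct request *rq;

	rq = blk_get_request(q, READ, GFP_KERNEL);	/* may sleep */
	if (!rq)
		return -ENOMEM;

	/* ... set up and use rq here ... */

	blk_put_request(rq);
	return 0;
}
```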
@@ -1353,8 +1336,8 @@ void blk_add_request_payload(struct request *rq, struct page *page, | |||
1353 | } | 1336 | } |
1354 | EXPORT_SYMBOL_GPL(blk_add_request_payload); | 1337 | EXPORT_SYMBOL_GPL(blk_add_request_payload); |
1355 | 1338 | ||
1356 | static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | 1339 | bool bio_attempt_back_merge(struct request_queue *q, struct request *req, |
1357 | struct bio *bio) | 1340 | struct bio *bio) |
1358 | { | 1341 | { |
1359 | const int ff = bio->bi_rw & REQ_FAILFAST_MASK; | 1342 | const int ff = bio->bi_rw & REQ_FAILFAST_MASK; |
1360 | 1343 | ||
@@ -1371,12 +1354,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | |||
1371 | req->__data_len += bio->bi_size; | 1354 | req->__data_len += bio->bi_size; |
1372 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | 1355 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); |
1373 | 1356 | ||
1374 | drive_stat_acct(req, 0); | 1357 | blk_account_io_start(req, false); |
1375 | return true; | 1358 | return true; |
1376 | } | 1359 | } |
1377 | 1360 | ||
1378 | static bool bio_attempt_front_merge(struct request_queue *q, | 1361 | bool bio_attempt_front_merge(struct request_queue *q, struct request *req, |
1379 | struct request *req, struct bio *bio) | 1362 | struct bio *bio) |
1380 | { | 1363 | { |
1381 | const int ff = bio->bi_rw & REQ_FAILFAST_MASK; | 1364 | const int ff = bio->bi_rw & REQ_FAILFAST_MASK; |
1382 | 1365 | ||
@@ -1401,12 +1384,12 @@ static bool bio_attempt_front_merge(struct request_queue *q, | |||
1401 | req->__data_len += bio->bi_size; | 1384 | req->__data_len += bio->bi_size; |
1402 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | 1385 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); |
1403 | 1386 | ||
1404 | drive_stat_acct(req, 0); | 1387 | blk_account_io_start(req, false); |
1405 | return true; | 1388 | return true; |
1406 | } | 1389 | } |
1407 | 1390 | ||
1408 | /** | 1391 | /** |
1409 | * attempt_plug_merge - try to merge with %current's plugged list | 1392 | * blk_attempt_plug_merge - try to merge with %current's plugged list |
1410 | * @q: request_queue new bio is being queued at | 1393 | * @q: request_queue new bio is being queued at |
1411 | * @bio: new bio being queued | 1394 | * @bio: new bio being queued |
1412 | * @request_count: out parameter for number of traversed plugged requests | 1395 | * @request_count: out parameter for number of traversed plugged requests |
@@ -1422,12 +1405,13 @@ static bool bio_attempt_front_merge(struct request_queue *q, | |||
1422 | * reliable access to the elevator outside queue lock. Only check basic | 1405 | * reliable access to the elevator outside queue lock. Only check basic |
1423 | * merging parameters without querying the elevator. | 1406 | * merging parameters without querying the elevator. |
1424 | */ | 1407 | */ |
1425 | static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, | 1408 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, |
1426 | unsigned int *request_count) | 1409 | unsigned int *request_count) |
1427 | { | 1410 | { |
1428 | struct blk_plug *plug; | 1411 | struct blk_plug *plug; |
1429 | struct request *rq; | 1412 | struct request *rq; |
1430 | bool ret = false; | 1413 | bool ret = false; |
1414 | struct list_head *plug_list; | ||
1431 | 1415 | ||
1432 | if (blk_queue_nomerges(q)) | 1416 | if (blk_queue_nomerges(q)) |
1433 | goto out; | 1417 | goto out; |
@@ -1437,7 +1421,12 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, | |||
1437 | goto out; | 1421 | goto out; |
1438 | *request_count = 0; | 1422 | *request_count = 0; |
1439 | 1423 | ||
1440 | list_for_each_entry_reverse(rq, &plug->list, queuelist) { | 1424 | if (q->mq_ops) |
1425 | plug_list = &plug->mq_list; | ||
1426 | else | ||
1427 | plug_list = &plug->list; | ||
1428 | |||
1429 | list_for_each_entry_reverse(rq, plug_list, queuelist) { | ||
1441 | int el_ret; | 1430 | int el_ret; |
1442 | 1431 | ||
1443 | if (rq->q == q) | 1432 | if (rq->q == q) |
@@ -1505,7 +1494,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) | |||
1505 | * Check if we can merge with the plugged list before grabbing | 1494 | * Check if we can merge with the plugged list before grabbing |
1506 | * any locks. | 1495 | * any locks. |
1507 | */ | 1496 | */ |
1508 | if (attempt_plug_merge(q, bio, &request_count)) | 1497 | if (blk_attempt_plug_merge(q, bio, &request_count)) |
1509 | return; | 1498 | return; |
1510 | 1499 | ||
1511 | spin_lock_irq(q->queue_lock); | 1500 | spin_lock_irq(q->queue_lock); |
@@ -1573,7 +1562,7 @@ get_rq: | |||
1573 | } | 1562 | } |
1574 | } | 1563 | } |
1575 | list_add_tail(&req->queuelist, &plug->list); | 1564 | list_add_tail(&req->queuelist, &plug->list); |
1576 | drive_stat_acct(req, 1); | 1565 | blk_account_io_start(req, true); |
1577 | } else { | 1566 | } else { |
1578 | spin_lock_irq(q->queue_lock); | 1567 | spin_lock_irq(q->queue_lock); |
1579 | add_acct_request(q, req, where); | 1568 | add_acct_request(q, req, where); |
@@ -2027,7 +2016,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) | |||
2027 | } | 2016 | } |
2028 | EXPORT_SYMBOL_GPL(blk_rq_err_bytes); | 2017 | EXPORT_SYMBOL_GPL(blk_rq_err_bytes); |
2029 | 2018 | ||
2030 | static void blk_account_io_completion(struct request *req, unsigned int bytes) | 2019 | void blk_account_io_completion(struct request *req, unsigned int bytes) |
2031 | { | 2020 | { |
2032 | if (blk_do_io_stat(req)) { | 2021 | if (blk_do_io_stat(req)) { |
2033 | const int rw = rq_data_dir(req); | 2022 | const int rw = rq_data_dir(req); |
@@ -2041,7 +2030,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) | |||
2041 | } | 2030 | } |
2042 | } | 2031 | } |
2043 | 2032 | ||
2044 | static void blk_account_io_done(struct request *req) | 2033 | void blk_account_io_done(struct request *req) |
2045 | { | 2034 | { |
2046 | /* | 2035 | /* |
2047 | * Account IO completion. flush_rq isn't accounted as a | 2036 | * Account IO completion. flush_rq isn't accounted as a |
@@ -2089,6 +2078,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q, | |||
2089 | } | 2078 | } |
2090 | #endif | 2079 | #endif |
2091 | 2080 | ||
2081 | void blk_account_io_start(struct request *rq, bool new_io) | ||
2082 | { | ||
2083 | struct hd_struct *part; | ||
2084 | int rw = rq_data_dir(rq); | ||
2085 | int cpu; | ||
2086 | |||
2087 | if (!blk_do_io_stat(rq)) | ||
2088 | return; | ||
2089 | |||
2090 | cpu = part_stat_lock(); | ||
2091 | |||
2092 | if (!new_io) { | ||
2093 | part = rq->part; | ||
2094 | part_stat_inc(cpu, part, merges[rw]); | ||
2095 | } else { | ||
2096 | part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); | ||
2097 | if (!hd_struct_try_get(part)) { | ||
2098 | /* | ||
2099 | * The partition is already being removed, | ||
2100 | * the request will be accounted on the disk only | ||
2101 | * | ||
2102 | * We take a reference on disk->part0 although that | ||
2103 | * partition will never be deleted, so we can treat | ||
2104 | * it as any other partition. | ||
2105 | */ | ||
2106 | part = &rq->rq_disk->part0; | ||
2107 | hd_struct_get(part); | ||
2108 | } | ||
2109 | part_round_stats(cpu, part); | ||
2110 | part_inc_in_flight(part, rw); | ||
2111 | rq->part = part; | ||
2112 | } | ||
2113 | |||
2114 | part_stat_unlock(); | ||
2115 | } | ||
2116 | |||
2092 | /** | 2117 | /** |
2093 | * blk_peek_request - peek at the top of a request queue | 2118 | * blk_peek_request - peek at the top of a request queue |
2094 | * @q: request queue to peek at | 2119 | * @q: request queue to peek at |
@@ -2465,7 +2490,6 @@ static void blk_finish_request(struct request *req, int error) | |||
2465 | if (req->cmd_flags & REQ_DONTPREP) | 2490 | if (req->cmd_flags & REQ_DONTPREP) |
2466 | blk_unprep_request(req); | 2491 | blk_unprep_request(req); |
2467 | 2492 | ||
2468 | |||
2469 | blk_account_io_done(req); | 2493 | blk_account_io_done(req); |
2470 | 2494 | ||
2471 | if (req->end_io) | 2495 | if (req->end_io) |
@@ -2887,6 +2911,7 @@ void blk_start_plug(struct blk_plug *plug) | |||
2887 | 2911 | ||
2888 | plug->magic = PLUG_MAGIC; | 2912 | plug->magic = PLUG_MAGIC; |
2889 | INIT_LIST_HEAD(&plug->list); | 2913 | INIT_LIST_HEAD(&plug->list); |
2914 | INIT_LIST_HEAD(&plug->mq_list); | ||
2890 | INIT_LIST_HEAD(&plug->cb_list); | 2915 | INIT_LIST_HEAD(&plug->cb_list); |
2891 | 2916 | ||
2892 | /* | 2917 | /* |
@@ -2984,6 +3009,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) | |||
2984 | BUG_ON(plug->magic != PLUG_MAGIC); | 3009 | BUG_ON(plug->magic != PLUG_MAGIC); |
2985 | 3010 | ||
2986 | flush_plug_callbacks(plug, from_schedule); | 3011 | flush_plug_callbacks(plug, from_schedule); |
3012 | |||
3013 | if (!list_empty(&plug->mq_list)) | ||
3014 | blk_mq_flush_plug_list(plug, from_schedule); | ||
3015 | |||
2987 | if (list_empty(&plug->list)) | 3016 | if (list_empty(&plug->list)) |
2988 | return; | 3017 | return; |
2989 | 3018 | ||
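For context, the plugging interface itself is unchanged for submitters; blk-mq requests simply collect on the new plug->mq_list and are drained through blk_mq_flush_plug_list() in the hunk above. A hedged usage sketch (not from this commit):

```c
#include <linux/blkdev.h>
#include <linux/bio.h>

/* Hedged sketch (not part of this commit): callers keep using the same plug
 * API; whether a request lands on plug->list or plug->mq_list is decided
 * inside the block layer.
 */
static void example_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(READ, bios[i]);	/* may be held back on the plug */
	blk_finish_plug(&plug);			/* runs blk_flush_plug_list() */
}
```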
diff --git a/block/blk-exec.c b/block/blk-exec.c
index ae4f27d7944e..c3edf9dff566 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/module.h> | 5 | #include <linux/module.h> |
6 | #include <linux/bio.h> | 6 | #include <linux/bio.h> |
7 | #include <linux/blkdev.h> | 7 | #include <linux/blkdev.h> |
8 | #include <linux/blk-mq.h> | ||
8 | #include <linux/sched/sysctl.h> | 9 | #include <linux/sched/sysctl.h> |
9 | 10 | ||
10 | #include "blk.h" | 11 | #include "blk.h" |
@@ -24,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error) | |||
24 | struct completion *waiting = rq->end_io_data; | 25 | struct completion *waiting = rq->end_io_data; |
25 | 26 | ||
26 | rq->end_io_data = NULL; | 27 | rq->end_io_data = NULL; |
27 | __blk_put_request(rq->q, rq); | ||
28 | 28 | ||
29 | /* | 29 | /* |
30 | * complete last, if this is a stack request the process (and thus | 30 | * complete last, if this is a stack request the process (and thus |
@@ -59,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, | |||
59 | 59 | ||
60 | rq->rq_disk = bd_disk; | 60 | rq->rq_disk = bd_disk; |
61 | rq->end_io = done; | 61 | rq->end_io = done; |
62 | |||
63 | if (q->mq_ops) { | ||
64 | blk_mq_insert_request(q, rq, true); | ||
65 | return; | ||
66 | } | ||
67 | |||
62 | /* | 68 | /* |
63 | * need to check this before __blk_run_queue(), because rq can | 69 | * need to check this before __blk_run_queue(), because rq can |
64 | * be freed before that returns. | 70 | * be freed before that returns. |
@@ -103,12 +109,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, | |||
103 | int err = 0; | 109 | int err = 0; |
104 | unsigned long hang_check; | 110 | unsigned long hang_check; |
105 | 111 | ||
106 | /* | ||
107 | * we need an extra reference to the request, so we can look at | ||
108 | * it after io completion | ||
109 | */ | ||
110 | rq->ref_count++; | ||
111 | |||
112 | if (!rq->sense) { | 112 | if (!rq->sense) { |
113 | memset(sense, 0, sizeof(sense)); | 113 | memset(sense, 0, sizeof(sense)); |
114 | rq->sense = sense; | 114 | rq->sense = sense; |
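A hedged sketch of a driver firing an already prepared request through this path (illustration only; example_fire is hypothetical). With the hunk above, blk_execute_rq_nowait() routes the request through blk_mq_insert_request() when q->mq_ops is set, instead of placing it on the legacy queue_head:

```c
#include <linux/blkdev.h>

/* Hedged sketch, not part of this commit: rq is assumed to be already
 * allocated and prepared; done is the completion callback.
 */
static void example_fire(struct request_queue *q, struct gendisk *disk,
			 struct request *rq, rq_end_io_fn *done)
{
	blk_execute_rq_nowait(q, disk, rq, 0 /* at_head */, done);
}
```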
diff --git a/block/blk-flush.c b/block/blk-flush.c
index cc2b827a853c..331e627301ea 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -69,8 +69,10 @@ | |||
69 | #include <linux/bio.h> | 69 | #include <linux/bio.h> |
70 | #include <linux/blkdev.h> | 70 | #include <linux/blkdev.h> |
71 | #include <linux/gfp.h> | 71 | #include <linux/gfp.h> |
72 | #include <linux/blk-mq.h> | ||
72 | 73 | ||
73 | #include "blk.h" | 74 | #include "blk.h" |
75 | #include "blk-mq.h" | ||
74 | 76 | ||
75 | /* FLUSH/FUA sequences */ | 77 | /* FLUSH/FUA sequences */ |
76 | enum { | 78 | enum { |
@@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq) | |||
124 | /* make @rq a normal request */ | 126 | /* make @rq a normal request */ |
125 | rq->cmd_flags &= ~REQ_FLUSH_SEQ; | 127 | rq->cmd_flags &= ~REQ_FLUSH_SEQ; |
126 | rq->end_io = rq->flush.saved_end_io; | 128 | rq->end_io = rq->flush.saved_end_io; |
129 | |||
130 | blk_clear_rq_complete(rq); | ||
131 | } | ||
132 | |||
133 | static void mq_flush_data_run(struct work_struct *work) | ||
134 | { | ||
135 | struct request *rq; | ||
136 | |||
137 | rq = container_of(work, struct request, mq_flush_data); | ||
138 | |||
139 | memset(&rq->csd, 0, sizeof(rq->csd)); | ||
140 | blk_mq_run_request(rq, true, false); | ||
141 | } | ||
142 | |||
143 | static void blk_mq_flush_data_insert(struct request *rq) | ||
144 | { | ||
145 | INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); | ||
146 | kblockd_schedule_work(rq->q, &rq->mq_flush_data); | ||
127 | } | 147 | } |
128 | 148 | ||
129 | /** | 149 | /** |
@@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq) | |||
136 | * completion and trigger the next step. | 156 | * completion and trigger the next step. |
137 | * | 157 | * |
138 | * CONTEXT: | 158 | * CONTEXT: |
139 | * spin_lock_irq(q->queue_lock) | 159 | * spin_lock_irq(q->queue_lock or q->mq_flush_lock) |
140 | * | 160 | * |
141 | * RETURNS: | 161 | * RETURNS: |
142 | * %true if requests were added to the dispatch queue, %false otherwise. | 162 | * %true if requests were added to the dispatch queue, %false otherwise. |
@@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, | |||
146 | { | 166 | { |
147 | struct request_queue *q = rq->q; | 167 | struct request_queue *q = rq->q; |
148 | struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; | 168 | struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; |
149 | bool queued = false; | 169 | bool queued = false, kicked; |
150 | 170 | ||
151 | BUG_ON(rq->flush.seq & seq); | 171 | BUG_ON(rq->flush.seq & seq); |
152 | rq->flush.seq |= seq; | 172 | rq->flush.seq |= seq; |
@@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, | |||
167 | 187 | ||
168 | case REQ_FSEQ_DATA: | 188 | case REQ_FSEQ_DATA: |
169 | list_move_tail(&rq->flush.list, &q->flush_data_in_flight); | 189 | list_move_tail(&rq->flush.list, &q->flush_data_in_flight); |
170 | list_add(&rq->queuelist, &q->queue_head); | 190 | if (q->mq_ops) |
171 | queued = true; | 191 | blk_mq_flush_data_insert(rq); |
192 | else { | ||
193 | list_add(&rq->queuelist, &q->queue_head); | ||
194 | queued = true; | ||
195 | } | ||
172 | break; | 196 | break; |
173 | 197 | ||
174 | case REQ_FSEQ_DONE: | 198 | case REQ_FSEQ_DONE: |
@@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, | |||
181 | BUG_ON(!list_empty(&rq->queuelist)); | 205 | BUG_ON(!list_empty(&rq->queuelist)); |
182 | list_del_init(&rq->flush.list); | 206 | list_del_init(&rq->flush.list); |
183 | blk_flush_restore_request(rq); | 207 | blk_flush_restore_request(rq); |
184 | __blk_end_request_all(rq, error); | 208 | if (q->mq_ops) |
209 | blk_mq_end_io(rq, error); | ||
210 | else | ||
211 | __blk_end_request_all(rq, error); | ||
185 | break; | 212 | break; |
186 | 213 | ||
187 | default: | 214 | default: |
188 | BUG(); | 215 | BUG(); |
189 | } | 216 | } |
190 | 217 | ||
191 | return blk_kick_flush(q) | queued; | 218 | kicked = blk_kick_flush(q); |
219 | /* blk_mq_run_flush will run queue */ | ||
220 | if (q->mq_ops) | ||
221 | return queued; | ||
222 | return kicked | queued; | ||
192 | } | 223 | } |
193 | 224 | ||
194 | static void flush_end_io(struct request *flush_rq, int error) | 225 | static void flush_end_io(struct request *flush_rq, int error) |
195 | { | 226 | { |
196 | struct request_queue *q = flush_rq->q; | 227 | struct request_queue *q = flush_rq->q; |
197 | struct list_head *running = &q->flush_queue[q->flush_running_idx]; | 228 | struct list_head *running; |
198 | bool queued = false; | 229 | bool queued = false; |
199 | struct request *rq, *n; | 230 | struct request *rq, *n; |
231 | unsigned long flags = 0; | ||
200 | 232 | ||
233 | if (q->mq_ops) { | ||
234 | blk_mq_free_request(flush_rq); | ||
235 | spin_lock_irqsave(&q->mq_flush_lock, flags); | ||
236 | } | ||
237 | running = &q->flush_queue[q->flush_running_idx]; | ||
201 | BUG_ON(q->flush_pending_idx == q->flush_running_idx); | 238 | BUG_ON(q->flush_pending_idx == q->flush_running_idx); |
202 | 239 | ||
203 | /* account completion of the flush request */ | 240 | /* account completion of the flush request */ |
204 | q->flush_running_idx ^= 1; | 241 | q->flush_running_idx ^= 1; |
205 | elv_completed_request(q, flush_rq); | 242 | |
243 | if (!q->mq_ops) | ||
244 | elv_completed_request(q, flush_rq); | ||
206 | 245 | ||
207 | /* and push the waiting requests to the next stage */ | 246 | /* and push the waiting requests to the next stage */ |
208 | list_for_each_entry_safe(rq, n, running, flush.list) { | 247 | list_for_each_entry_safe(rq, n, running, flush.list) { |
@@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error) | |||
223 | * directly into request_fn may confuse the driver. Always use | 262 | * directly into request_fn may confuse the driver. Always use |
224 | * kblockd. | 263 | * kblockd. |
225 | */ | 264 | */ |
226 | if (queued || q->flush_queue_delayed) | 265 | if (queued || q->flush_queue_delayed) { |
227 | blk_run_queue_async(q); | 266 | if (!q->mq_ops) |
267 | blk_run_queue_async(q); | ||
268 | else | ||
269 | /* | ||
270 | * This can be optimized to only run queues with requests | ||
271 | * queued if necessary. | ||
272 | */ | ||
273 | blk_mq_run_queues(q, true); | ||
274 | } | ||
228 | q->flush_queue_delayed = 0; | 275 | q->flush_queue_delayed = 0; |
276 | if (q->mq_ops) | ||
277 | spin_unlock_irqrestore(&q->mq_flush_lock, flags); | ||
278 | } | ||
279 | |||
280 | static void mq_flush_work(struct work_struct *work) | ||
281 | { | ||
282 | struct request_queue *q; | ||
283 | struct request *rq; | ||
284 | |||
285 | q = container_of(work, struct request_queue, mq_flush_work); | ||
286 | |||
287 | /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ | ||
288 | rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, | ||
289 | __GFP_WAIT|GFP_ATOMIC, true); | ||
290 | rq->cmd_type = REQ_TYPE_FS; | ||
291 | rq->end_io = flush_end_io; | ||
292 | |||
293 | blk_mq_run_request(rq, true, false); | ||
294 | } | ||
295 | |||
296 | /* | ||
297 | * We can't directly use q->flush_rq, because it doesn't have tag and is not in | ||
298 | * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, | ||
299 | * so offload the work to workqueue. | ||
300 | * | ||
301 | * Note: we assume a flush request finished in any hardware queue will flush | ||
302 | * the whole disk cache. | ||
303 | */ | ||
304 | static void mq_run_flush(struct request_queue *q) | ||
305 | { | ||
306 | kblockd_schedule_work(q, &q->mq_flush_work); | ||
229 | } | 307 | } |
230 | 308 | ||
231 | /** | 309 | /** |
@@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error) | |||
236 | * Please read the comment at the top of this file for more info. | 314 | * Please read the comment at the top of this file for more info. |
237 | * | 315 | * |
238 | * CONTEXT: | 316 | * CONTEXT: |
239 | * spin_lock_irq(q->queue_lock) | 317 | * spin_lock_irq(q->queue_lock or q->mq_flush_lock) |
240 | * | 318 | * |
241 | * RETURNS: | 319 | * RETURNS: |
242 | * %true if flush was issued, %false otherwise. | 320 | * %true if flush was issued, %false otherwise. |
@@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q) | |||
261 | * Issue flush and toggle pending_idx. This makes pending_idx | 339 | * Issue flush and toggle pending_idx. This makes pending_idx |
262 | * different from running_idx, which means flush is in flight. | 340 | * different from running_idx, which means flush is in flight. |
263 | */ | 341 | */ |
342 | q->flush_pending_idx ^= 1; | ||
343 | if (q->mq_ops) { | ||
344 | mq_run_flush(q); | ||
345 | return true; | ||
346 | } | ||
347 | |||
264 | blk_rq_init(q, &q->flush_rq); | 348 | blk_rq_init(q, &q->flush_rq); |
265 | q->flush_rq.cmd_type = REQ_TYPE_FS; | 349 | q->flush_rq.cmd_type = REQ_TYPE_FS; |
266 | q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; | 350 | q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; |
267 | q->flush_rq.rq_disk = first_rq->rq_disk; | 351 | q->flush_rq.rq_disk = first_rq->rq_disk; |
268 | q->flush_rq.end_io = flush_end_io; | 352 | q->flush_rq.end_io = flush_end_io; |
269 | 353 | ||
270 | q->flush_pending_idx ^= 1; | ||
271 | list_add_tail(&q->flush_rq.queuelist, &q->queue_head); | 354 | list_add_tail(&q->flush_rq.queuelist, &q->queue_head); |
272 | return true; | 355 | return true; |
273 | } | 356 | } |
@@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error) | |||
284 | blk_run_queue_async(q); | 367 | blk_run_queue_async(q); |
285 | } | 368 | } |
286 | 369 | ||
370 | static void mq_flush_data_end_io(struct request *rq, int error) | ||
371 | { | ||
372 | struct request_queue *q = rq->q; | ||
373 | struct blk_mq_hw_ctx *hctx; | ||
374 | struct blk_mq_ctx *ctx; | ||
375 | unsigned long flags; | ||
376 | |||
377 | ctx = rq->mq_ctx; | ||
378 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
379 | |||
380 | /* | ||
381 | * After populating an empty queue, kick it to avoid stall. Read | ||
382 | * the comment in flush_end_io(). | ||
383 | */ | ||
384 | spin_lock_irqsave(&q->mq_flush_lock, flags); | ||
385 | if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) | ||
386 | blk_mq_run_hw_queue(hctx, true); | ||
387 | spin_unlock_irqrestore(&q->mq_flush_lock, flags); | ||
388 | } | ||
389 | |||
287 | /** | 390 | /** |
288 | * blk_insert_flush - insert a new FLUSH/FUA request | 391 | * blk_insert_flush - insert a new FLUSH/FUA request |
289 | * @rq: request to insert | 392 | * @rq: request to insert |
290 | * | 393 | * |
291 | * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. | 394 | * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. |
395 | * or __blk_mq_run_hw_queue() to dispatch request. | ||
292 | * @rq is being submitted. Analyze what needs to be done and put it on the | 396 | * @rq is being submitted. Analyze what needs to be done and put it on the |
293 | * right queue. | 397 | * right queue. |
294 | * | 398 | * |
295 | * CONTEXT: | 399 | * CONTEXT: |
296 | * spin_lock_irq(q->queue_lock) | 400 | * spin_lock_irq(q->queue_lock) in !mq case |
297 | */ | 401 | */ |
298 | void blk_insert_flush(struct request *rq) | 402 | void blk_insert_flush(struct request *rq) |
299 | { | 403 | { |
@@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq) | |||
316 | * complete the request. | 420 | * complete the request. |
317 | */ | 421 | */ |
318 | if (!policy) { | 422 | if (!policy) { |
319 | __blk_end_bidi_request(rq, 0, 0, 0); | 423 | if (q->mq_ops) |
424 | blk_mq_end_io(rq, 0); | ||
425 | else | ||
426 | __blk_end_bidi_request(rq, 0, 0, 0); | ||
320 | return; | 427 | return; |
321 | } | 428 | } |
322 | 429 | ||
@@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq) | |||
329 | */ | 436 | */ |
330 | if ((policy & REQ_FSEQ_DATA) && | 437 | if ((policy & REQ_FSEQ_DATA) && |
331 | !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { | 438 | !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { |
332 | list_add_tail(&rq->queuelist, &q->queue_head); | 439 | if (q->mq_ops) { |
440 | blk_mq_run_request(rq, false, true); | ||
441 | } else | ||
442 | list_add_tail(&rq->queuelist, &q->queue_head); | ||
333 | return; | 443 | return; |
334 | } | 444 | } |
335 | 445 | ||
@@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq) | |||
341 | INIT_LIST_HEAD(&rq->flush.list); | 451 | INIT_LIST_HEAD(&rq->flush.list); |
342 | rq->cmd_flags |= REQ_FLUSH_SEQ; | 452 | rq->cmd_flags |= REQ_FLUSH_SEQ; |
343 | rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ | 453 | rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ |
454 | if (q->mq_ops) { | ||
455 | rq->end_io = mq_flush_data_end_io; | ||
456 | |||
457 | spin_lock_irq(&q->mq_flush_lock); | ||
458 | blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); | ||
459 | spin_unlock_irq(&q->mq_flush_lock); | ||
460 | return; | ||
461 | } | ||
344 | rq->end_io = flush_data_end_io; | 462 | rq->end_io = flush_data_end_io; |
345 | 463 | ||
346 | blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); | 464 | blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); |
@@ -453,3 +571,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, | |||
453 | return ret; | 571 | return ret; |
454 | } | 572 | } |
455 | EXPORT_SYMBOL(blkdev_issue_flush); | 573 | EXPORT_SYMBOL(blkdev_issue_flush); |
574 | |||
575 | void blk_mq_init_flush(struct request_queue *q) | ||
576 | { | ||
577 | spin_lock_init(&q->mq_flush_lock); | ||
578 | INIT_WORK(&q->mq_flush_work, mq_flush_work); | ||
579 | } | ||
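To make the REQ_FSEQ_ACTIONS & ~policy masking in blk_insert_flush() easier to follow, here is a paraphrased sketch of how the policy bits are chosen; the real helper (blk_flush_policy() in blk-flush.c) is not part of this diff:

```c
#include <linux/blkdev.h>

/* Paraphrased sketch, not part of this diff: which flush-sequence steps a
 * request needs.  fflags are the queue's flush capabilities; rq->cmd_flags
 * carry what the submitter asked for.
 */
static unsigned int example_flush_policy(unsigned int fflags, struct request *rq)
{
	unsigned int policy = 0;

	if (blk_rq_sectors(rq))
		policy |= REQ_FSEQ_DATA;		/* there is data to write */

	if (fflags & REQ_FLUSH) {			/* device has a volatile cache */
		if (rq->cmd_flags & REQ_FLUSH)
			policy |= REQ_FSEQ_PREFLUSH;
		if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
			policy |= REQ_FSEQ_POSTFLUSH;
	}
	return policy;
}
```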
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5f2448253797..1ffc58977835 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -308,6 +308,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, | |||
308 | return ll_new_hw_segment(q, req, bio); | 308 | return ll_new_hw_segment(q, req, bio); |
309 | } | 309 | } |
310 | 310 | ||
311 | /* | ||
312 | * blk-mq uses req->special to carry normal driver per-request payload, it | ||
313 | * does not indicate a prepared command that we cannot merge with. | ||
314 | */ | ||
315 | static bool req_no_special_merge(struct request *req) | ||
316 | { | ||
317 | struct request_queue *q = req->q; | ||
318 | |||
319 | return !q->mq_ops && req->special; | ||
320 | } | ||
321 | |||
311 | static int ll_merge_requests_fn(struct request_queue *q, struct request *req, | 322 | static int ll_merge_requests_fn(struct request_queue *q, struct request *req, |
312 | struct request *next) | 323 | struct request *next) |
313 | { | 324 | { |
@@ -319,7 +330,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, | |||
319 | * First check if the either of the requests are re-queued | 330 | * First check if the either of the requests are re-queued |
320 | * requests. Can't merge them if they are. | 331 | * requests. Can't merge them if they are. |
321 | */ | 332 | */ |
322 | if (req->special || next->special) | 333 | if (req_no_special_merge(req) || req_no_special_merge(next)) |
323 | return 0; | 334 | return 0; |
324 | 335 | ||
325 | /* | 336 | /* |
@@ -416,7 +427,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, | |||
416 | 427 | ||
417 | if (rq_data_dir(req) != rq_data_dir(next) | 428 | if (rq_data_dir(req) != rq_data_dir(next) |
418 | || req->rq_disk != next->rq_disk | 429 | || req->rq_disk != next->rq_disk |
419 | || next->special) | 430 | || req_no_special_merge(next)) |
420 | return 0; | 431 | return 0; |
421 | 432 | ||
422 | if (req->cmd_flags & REQ_WRITE_SAME && | 433 | if (req->cmd_flags & REQ_WRITE_SAME && |
@@ -515,7 +526,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) | |||
515 | return false; | 526 | return false; |
516 | 527 | ||
517 | /* must be same device and not a special request */ | 528 | /* must be same device and not a special request */ |
518 | if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) | 529 | if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) |
519 | return false; | 530 | return false; |
520 | 531 | ||
521 | /* only merge integrity protected bio into ditto rq */ | 532 | /* only merge integrity protected bio into ditto rq */ |
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
new file mode 100644
index 000000000000..f8ea39d7ae54
--- /dev/null
+++ b/block/blk-mq-cpu.c
@@ -0,0 +1,93 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/blkdev.h> | ||
5 | #include <linux/list.h> | ||
6 | #include <linux/llist.h> | ||
7 | #include <linux/smp.h> | ||
8 | #include <linux/cpu.h> | ||
9 | |||
10 | #include <linux/blk-mq.h> | ||
11 | #include "blk-mq.h" | ||
12 | |||
13 | static LIST_HEAD(blk_mq_cpu_notify_list); | ||
14 | static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); | ||
15 | |||
16 | static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self, | ||
17 | unsigned long action, void *hcpu) | ||
18 | { | ||
19 | unsigned int cpu = (unsigned long) hcpu; | ||
20 | struct blk_mq_cpu_notifier *notify; | ||
21 | |||
22 | spin_lock(&blk_mq_cpu_notify_lock); | ||
23 | |||
24 | list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) | ||
25 | notify->notify(notify->data, action, cpu); | ||
26 | |||
27 | spin_unlock(&blk_mq_cpu_notify_lock); | ||
28 | return NOTIFY_OK; | ||
29 | } | ||
30 | |||
31 | static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action, | ||
32 | unsigned int cpu) | ||
33 | { | ||
34 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | ||
35 | /* | ||
36 | * If the CPU goes away, ensure that we run any pending | ||
37 | * completions. | ||
38 | */ | ||
39 | struct llist_node *node; | ||
40 | struct request *rq; | ||
41 | |||
42 | local_irq_disable(); | ||
43 | |||
44 | node = llist_del_all(&per_cpu(ipi_lists, cpu)); | ||
45 | while (node) { | ||
46 | struct llist_node *next = node->next; | ||
47 | |||
48 | rq = llist_entry(node, struct request, ll_list); | ||
49 | __blk_mq_end_io(rq, rq->errors); | ||
50 | node = next; | ||
51 | } | ||
52 | |||
53 | local_irq_enable(); | ||
54 | } | ||
55 | } | ||
56 | |||
57 | static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = { | ||
58 | .notifier_call = blk_mq_main_cpu_notify, | ||
59 | }; | ||
60 | |||
61 | void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) | ||
62 | { | ||
63 | BUG_ON(!notifier->notify); | ||
64 | |||
65 | spin_lock(&blk_mq_cpu_notify_lock); | ||
66 | list_add_tail(¬ifier->list, &blk_mq_cpu_notify_list); | ||
67 | spin_unlock(&blk_mq_cpu_notify_lock); | ||
68 | } | ||
69 | |||
70 | void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) | ||
71 | { | ||
72 | spin_lock(&blk_mq_cpu_notify_lock); | ||
73 | list_del(¬ifier->list); | ||
74 | spin_unlock(&blk_mq_cpu_notify_lock); | ||
75 | } | ||
76 | |||
77 | void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, | ||
78 | void (*fn)(void *, unsigned long, unsigned int), | ||
79 | void *data) | ||
80 | { | ||
81 | notifier->notify = fn; | ||
82 | notifier->data = data; | ||
83 | } | ||
84 | |||
85 | static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = { | ||
86 | .notify = blk_mq_cpu_notify, | ||
87 | }; | ||
88 | |||
89 | void __init blk_mq_cpu_init(void) | ||
90 | { | ||
91 | register_hotcpu_notifier(&blk_mq_main_cpu_notifier); | ||
92 | blk_mq_register_cpu_notifier(&cpu_notifier); | ||
93 | } | ||
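A hedged usage sketch of the notifier interface defined above (illustration only; the example_* names are hypothetical, and struct blk_mq_cpu_notifier is assumed to be declared in block/blk-mq.h):

```c
#include <linux/kernel.h>
#include <linux/cpu.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"

/* Hypothetical consumer (illustration only): once registered on the chain
 * implemented above, this runs for every CPU hotplug action.
 */
static void example_cpu_notify(void *data, unsigned long action,
			       unsigned int cpu)
{
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
		pr_info("example: CPU%u went offline\n", cpu);
}

static struct blk_mq_cpu_notifier example_notifier;

static void example_register(void *data)
{
	blk_mq_init_cpu_notifier(&example_notifier, example_cpu_notify, data);
	blk_mq_register_cpu_notifier(&example_notifier);
}
```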
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
new file mode 100644
index 000000000000..f8721278601c
--- /dev/null
+++ b/block/blk-mq-cpumap.c
@@ -0,0 +1,108 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/threads.h> | ||
3 | #include <linux/module.h> | ||
4 | #include <linux/mm.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/cpu.h> | ||
7 | |||
8 | #include <linux/blk-mq.h> | ||
9 | #include "blk.h" | ||
10 | #include "blk-mq.h" | ||
11 | |||
12 | static void show_map(unsigned int *map, unsigned int nr) | ||
13 | { | ||
14 | int i; | ||
15 | |||
16 | pr_info("blk-mq: CPU -> queue map\n"); | ||
17 | for_each_online_cpu(i) | ||
18 | pr_info(" CPU%2u -> Queue %u\n", i, map[i]); | ||
19 | } | ||
20 | |||
21 | static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, | ||
22 | const int cpu) | ||
23 | { | ||
24 | return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); | ||
25 | } | ||
26 | |||
27 | static int get_first_sibling(unsigned int cpu) | ||
28 | { | ||
29 | unsigned int ret; | ||
30 | |||
31 | ret = cpumask_first(topology_thread_cpumask(cpu)); | ||
32 | if (ret < nr_cpu_ids) | ||
33 | return ret; | ||
34 | |||
35 | return cpu; | ||
36 | } | ||
37 | |||
38 | int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) | ||
39 | { | ||
40 | unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; | ||
41 | cpumask_var_t cpus; | ||
42 | |||
43 | if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) | ||
44 | return 1; | ||
45 | |||
46 | cpumask_clear(cpus); | ||
47 | nr_cpus = nr_uniq_cpus = 0; | ||
48 | for_each_online_cpu(i) { | ||
49 | nr_cpus++; | ||
50 | first_sibling = get_first_sibling(i); | ||
51 | if (!cpumask_test_cpu(first_sibling, cpus)) | ||
52 | nr_uniq_cpus++; | ||
53 | cpumask_set_cpu(i, cpus); | ||
54 | } | ||
55 | |||
56 | queue = 0; | ||
57 | for_each_possible_cpu(i) { | ||
58 | if (!cpu_online(i)) { | ||
59 | map[i] = 0; | ||
60 | continue; | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * Easy case - we have equal or more hardware queues. Or | ||
65 | * there are no thread siblings to take into account. Do | ||
66 | * 1:1 if enough, or sequential mapping if less. | ||
67 | */ | ||
68 | if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { | ||
69 | map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); | ||
70 | queue++; | ||
71 | continue; | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * Less then nr_cpus queues, and we have some number of | ||
76 | * threads per cores. Map sibling threads to the same | ||
77 | * queue. | ||
78 | */ | ||
79 | first_sibling = get_first_sibling(i); | ||
80 | if (first_sibling == i) { | ||
81 | map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, | ||
82 | queue); | ||
83 | queue++; | ||
84 | } else | ||
85 | map[i] = map[first_sibling]; | ||
86 | } | ||
87 | |||
88 | show_map(map, nr_cpus); | ||
89 | free_cpumask_var(cpus); | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) | ||
94 | { | ||
95 | unsigned int *map; | ||
96 | |||
97 | /* If cpus are offline, map them to first hctx */ | ||
98 | map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, | ||
99 | reg->numa_node); | ||
100 | if (!map) | ||
101 | return NULL; | ||
102 | |||
103 | if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) | ||
104 | return map; | ||
105 | |||
106 | kfree(map); | ||
107 | return NULL; | ||
108 | } | ||
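A small worked example of the mapping rule above (plain userspace C, illustration only): with 8 online CPUs and 2 hardware queues, cpu_to_queue_index() groups each block of ceil(8/2) = 4 CPUs onto one queue, so CPUs 0-3 map to queue 0 and CPUs 4-7 to queue 1.

```c
#include <stdio.h>

/* Same arithmetic as cpu_to_queue_index() above, runnable in userspace. */
static unsigned int cpu_to_queue_index(unsigned int nr_cpus,
				       unsigned int nr_queues,
				       unsigned int cpu)
{
	return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
}

int main(void)
{
	unsigned int cpu;

	for (cpu = 0; cpu < 8; cpu++)
		printf("CPU%u -> queue %u\n", cpu, cpu_to_queue_index(8, 2, cpu));
	return 0;
}
```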
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
new file mode 100644
index 000000000000..ba6cf8e9aa0a
--- /dev/null
+++ b/block/blk-mq-sysfs.c
@@ -0,0 +1,384 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/backing-dev.h> | ||
4 | #include <linux/bio.h> | ||
5 | #include <linux/blkdev.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/workqueue.h> | ||
10 | #include <linux/smp.h> | ||
11 | |||
12 | #include <linux/blk-mq.h> | ||
13 | #include "blk-mq.h" | ||
14 | #include "blk-mq-tag.h" | ||
15 | |||
16 | static void blk_mq_sysfs_release(struct kobject *kobj) | ||
17 | { | ||
18 | } | ||
19 | |||
20 | struct blk_mq_ctx_sysfs_entry { | ||
21 | struct attribute attr; | ||
22 | ssize_t (*show)(struct blk_mq_ctx *, char *); | ||
23 | ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); | ||
24 | }; | ||
25 | |||
26 | struct blk_mq_hw_ctx_sysfs_entry { | ||
27 | struct attribute attr; | ||
28 | ssize_t (*show)(struct blk_mq_hw_ctx *, char *); | ||
29 | ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); | ||
30 | }; | ||
31 | |||
32 | static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, | ||
33 | char *page) | ||
34 | { | ||
35 | struct blk_mq_ctx_sysfs_entry *entry; | ||
36 | struct blk_mq_ctx *ctx; | ||
37 | struct request_queue *q; | ||
38 | ssize_t res; | ||
39 | |||
40 | entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); | ||
41 | ctx = container_of(kobj, struct blk_mq_ctx, kobj); | ||
42 | q = ctx->queue; | ||
43 | |||
44 | if (!entry->show) | ||
45 | return -EIO; | ||
46 | |||
47 | res = -ENOENT; | ||
48 | mutex_lock(&q->sysfs_lock); | ||
49 | if (!blk_queue_dying(q)) | ||
50 | res = entry->show(ctx, page); | ||
51 | mutex_unlock(&q->sysfs_lock); | ||
52 | return res; | ||
53 | } | ||
54 | |||
55 | static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, | ||
56 | const char *page, size_t length) | ||
57 | { | ||
58 | struct blk_mq_ctx_sysfs_entry *entry; | ||
59 | struct blk_mq_ctx *ctx; | ||
60 | struct request_queue *q; | ||
61 | ssize_t res; | ||
62 | |||
63 | entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); | ||
64 | ctx = container_of(kobj, struct blk_mq_ctx, kobj); | ||
65 | q = ctx->queue; | ||
66 | |||
67 | if (!entry->store) | ||
68 | return -EIO; | ||
69 | |||
70 | res = -ENOENT; | ||
71 | mutex_lock(&q->sysfs_lock); | ||
72 | if (!blk_queue_dying(q)) | ||
73 | res = entry->store(ctx, page, length); | ||
74 | mutex_unlock(&q->sysfs_lock); | ||
75 | return res; | ||
76 | } | ||
77 | |||
78 | static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, | ||
79 | struct attribute *attr, char *page) | ||
80 | { | ||
81 | struct blk_mq_hw_ctx_sysfs_entry *entry; | ||
82 | struct blk_mq_hw_ctx *hctx; | ||
83 | struct request_queue *q; | ||
84 | ssize_t res; | ||
85 | |||
86 | entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); | ||
87 | hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); | ||
88 | q = hctx->queue; | ||
89 | |||
90 | if (!entry->show) | ||
91 | return -EIO; | ||
92 | |||
93 | res = -ENOENT; | ||
94 | mutex_lock(&q->sysfs_lock); | ||
95 | if (!blk_queue_dying(q)) | ||
96 | res = entry->show(hctx, page); | ||
97 | mutex_unlock(&q->sysfs_lock); | ||
98 | return res; | ||
99 | } | ||
100 | |||
101 | static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, | ||
102 | struct attribute *attr, const char *page, | ||
103 | size_t length) | ||
104 | { | ||
105 | struct blk_mq_hw_ctx_sysfs_entry *entry; | ||
106 | struct blk_mq_hw_ctx *hctx; | ||
107 | struct request_queue *q; | ||
108 | ssize_t res; | ||
109 | |||
110 | entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); | ||
111 | hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); | ||
112 | q = hctx->queue; | ||
113 | |||
114 | if (!entry->store) | ||
115 | return -EIO; | ||
116 | |||
117 | res = -ENOENT; | ||
118 | mutex_lock(&q->sysfs_lock); | ||
119 | if (!blk_queue_dying(q)) | ||
120 | res = entry->store(hctx, page, length); | ||
121 | mutex_unlock(&q->sysfs_lock); | ||
122 | return res; | ||
123 | } | ||
124 | |||
125 | static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) | ||
126 | { | ||
127 | return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1], | ||
128 | ctx->rq_dispatched[0]); | ||
129 | } | ||
130 | |||
131 | static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) | ||
132 | { | ||
133 | return sprintf(page, "%lu\n", ctx->rq_merged); | ||
134 | } | ||
135 | |||
136 | static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) | ||
137 | { | ||
138 | return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], | ||
139 | ctx->rq_completed[0]); | ||
140 | } | ||
141 | |||
142 | static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) | ||
143 | { | ||
144 | char *start_page = page; | ||
145 | struct request *rq; | ||
146 | |||
147 | page += sprintf(page, "%s:\n", msg); | ||
148 | |||
149 | list_for_each_entry(rq, list, queuelist) | ||
150 | page += sprintf(page, "\t%p\n", rq); | ||
151 | |||
152 | return page - start_page; | ||
153 | } | ||
154 | |||
155 | static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) | ||
156 | { | ||
157 | ssize_t ret; | ||
158 | |||
159 | spin_lock(&ctx->lock); | ||
160 | ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); | ||
161 | spin_unlock(&ctx->lock); | ||
162 | |||
163 | return ret; | ||
164 | } | ||
165 | |||
166 | static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, | ||
167 | char *page) | ||
168 | { | ||
169 | return sprintf(page, "%lu\n", hctx->queued); | ||
170 | } | ||
171 | |||
172 | static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) | ||
173 | { | ||
174 | return sprintf(page, "%lu\n", hctx->run); | ||
175 | } | ||
176 | |||
177 | static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, | ||
178 | char *page) | ||
179 | { | ||
180 | char *start_page = page; | ||
181 | int i; | ||
182 | |||
183 | page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); | ||
184 | |||
185 | for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { | ||
186 | unsigned long d = 1U << (i - 1); | ||
187 | |||
188 | page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); | ||
189 | } | ||
190 | |||
191 | return page - start_page; | ||
192 | } | ||
193 | |||
194 | static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, | ||
195 | char *page) | ||
196 | { | ||
197 | ssize_t ret; | ||
198 | |||
199 | spin_lock(&hctx->lock); | ||
200 | ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); | ||
201 | spin_unlock(&hctx->lock); | ||
202 | |||
203 | return ret; | ||
204 | } | ||
205 | |||
206 | static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) | ||
207 | { | ||
208 | ssize_t ret; | ||
209 | |||
210 | spin_lock(&hctx->lock); | ||
211 | ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); | ||
212 | spin_unlock(&hctx->lock); | ||
213 | |||
214 | return ret; | ||
215 | } | ||
216 | |||
217 | static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, | ||
218 | const char *page, size_t len) | ||
219 | { | ||
220 | struct blk_mq_ctx *ctx; | ||
221 | unsigned long ret; | ||
222 | unsigned int i; | ||
223 | |||
224 | if (kstrtoul(page, 10, &ret)) { | ||
225 | pr_err("blk-mq-sysfs: invalid input '%s'\n", page); | ||
226 | return -EINVAL; | ||
227 | } | ||
228 | |||
229 | spin_lock(&hctx->lock); | ||
230 | if (ret) | ||
231 | hctx->flags |= BLK_MQ_F_SHOULD_IPI; | ||
232 | else | ||
233 | hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; | ||
234 | spin_unlock(&hctx->lock); | ||
235 | |||
236 | hctx_for_each_ctx(hctx, ctx, i) | ||
237 | ctx->ipi_redirect = !!ret; | ||
238 | |||
239 | return len; | ||
240 | } | ||
241 | |||
242 | static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) | ||
243 | { | ||
244 | return blk_mq_tag_sysfs_show(hctx->tags, page); | ||
245 | } | ||
246 | |||
247 | static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { | ||
248 | .attr = {.name = "dispatched", .mode = S_IRUGO }, | ||
249 | .show = blk_mq_sysfs_dispatched_show, | ||
250 | }; | ||
251 | static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { | ||
252 | .attr = {.name = "merged", .mode = S_IRUGO }, | ||
253 | .show = blk_mq_sysfs_merged_show, | ||
254 | }; | ||
255 | static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { | ||
256 | .attr = {.name = "completed", .mode = S_IRUGO }, | ||
257 | .show = blk_mq_sysfs_completed_show, | ||
258 | }; | ||
259 | static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { | ||
260 | .attr = {.name = "rq_list", .mode = S_IRUGO }, | ||
261 | .show = blk_mq_sysfs_rq_list_show, | ||
262 | }; | ||
263 | |||
264 | static struct attribute *default_ctx_attrs[] = { | ||
265 | &blk_mq_sysfs_dispatched.attr, | ||
266 | &blk_mq_sysfs_merged.attr, | ||
267 | &blk_mq_sysfs_completed.attr, | ||
268 | &blk_mq_sysfs_rq_list.attr, | ||
269 | NULL, | ||
270 | }; | ||
271 | |||
272 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { | ||
273 | .attr = {.name = "queued", .mode = S_IRUGO }, | ||
274 | .show = blk_mq_hw_sysfs_queued_show, | ||
275 | }; | ||
276 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { | ||
277 | .attr = {.name = "run", .mode = S_IRUGO }, | ||
278 | .show = blk_mq_hw_sysfs_run_show, | ||
279 | }; | ||
280 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { | ||
281 | .attr = {.name = "dispatched", .mode = S_IRUGO }, | ||
282 | .show = blk_mq_hw_sysfs_dispatched_show, | ||
283 | }; | ||
284 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { | ||
285 | .attr = {.name = "pending", .mode = S_IRUGO }, | ||
286 | .show = blk_mq_hw_sysfs_rq_list_show, | ||
287 | }; | ||
288 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { | ||
289 | .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, | ||
290 | .show = blk_mq_hw_sysfs_ipi_show, | ||
291 | .store = blk_mq_hw_sysfs_ipi_store, | ||
292 | }; | ||
293 | static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { | ||
294 | .attr = {.name = "tags", .mode = S_IRUGO }, | ||
295 | .show = blk_mq_hw_sysfs_tags_show, | ||
296 | }; | ||
297 | |||
298 | static struct attribute *default_hw_ctx_attrs[] = { | ||
299 | &blk_mq_hw_sysfs_queued.attr, | ||
300 | &blk_mq_hw_sysfs_run.attr, | ||
301 | &blk_mq_hw_sysfs_dispatched.attr, | ||
302 | &blk_mq_hw_sysfs_pending.attr, | ||
303 | &blk_mq_hw_sysfs_ipi.attr, | ||
304 | &blk_mq_hw_sysfs_tags.attr, | ||
305 | NULL, | ||
306 | }; | ||
307 | |||
308 | static const struct sysfs_ops blk_mq_sysfs_ops = { | ||
309 | .show = blk_mq_sysfs_show, | ||
310 | .store = blk_mq_sysfs_store, | ||
311 | }; | ||
312 | |||
313 | static const struct sysfs_ops blk_mq_hw_sysfs_ops = { | ||
314 | .show = blk_mq_hw_sysfs_show, | ||
315 | .store = blk_mq_hw_sysfs_store, | ||
316 | }; | ||
317 | |||
318 | static struct kobj_type blk_mq_ktype = { | ||
319 | .sysfs_ops = &blk_mq_sysfs_ops, | ||
320 | .release = blk_mq_sysfs_release, | ||
321 | }; | ||
322 | |||
323 | static struct kobj_type blk_mq_ctx_ktype = { | ||
324 | .sysfs_ops = &blk_mq_sysfs_ops, | ||
325 | .default_attrs = default_ctx_attrs, | ||
326 | .release = blk_mq_sysfs_release, | ||
327 | }; | ||
328 | |||
329 | static struct kobj_type blk_mq_hw_ktype = { | ||
330 | .sysfs_ops = &blk_mq_hw_sysfs_ops, | ||
331 | .default_attrs = default_hw_ctx_attrs, | ||
332 | .release = blk_mq_sysfs_release, | ||
333 | }; | ||
334 | |||
335 | void blk_mq_unregister_disk(struct gendisk *disk) | ||
336 | { | ||
337 | struct request_queue *q = disk->queue; | ||
338 | |||
339 | kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); | ||
340 | kobject_del(&q->mq_kobj); | ||
341 | |||
342 | kobject_put(&disk_to_dev(disk)->kobj); | ||
343 | } | ||
344 | |||
345 | int blk_mq_register_disk(struct gendisk *disk) | ||
346 | { | ||
347 | struct device *dev = disk_to_dev(disk); | ||
348 | struct request_queue *q = disk->queue; | ||
349 | struct blk_mq_hw_ctx *hctx; | ||
350 | struct blk_mq_ctx *ctx; | ||
351 | int ret, i, j; | ||
352 | |||
353 | kobject_init(&q->mq_kobj, &blk_mq_ktype); | ||
354 | |||
355 | ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); | ||
356 | if (ret < 0) | ||
357 | return ret; | ||
358 | |||
359 | kobject_uevent(&q->mq_kobj, KOBJ_ADD); | ||
360 | |||
361 | queue_for_each_hw_ctx(q, hctx, i) { | ||
362 | kobject_init(&hctx->kobj, &blk_mq_hw_ktype); | ||
363 | ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i); | ||
364 | if (ret) | ||
365 | break; | ||
366 | |||
367 | if (!hctx->nr_ctx) | ||
368 | continue; | ||
369 | |||
370 | hctx_for_each_ctx(hctx, ctx, j) { | ||
371 | kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); | ||
372 | ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); | ||
373 | if (ret) | ||
374 | break; | ||
375 | } | ||
376 | } | ||
377 | |||
378 | if (ret) { | ||
379 | blk_mq_unregister_disk(disk); | ||
380 | return ret; | ||
381 | } | ||
382 | |||
383 | return 0; | ||
384 | } | ||
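blk_mq_register_disk() above builds a small kobject hierarchy under the disk's device node: an "mq" directory per queue, one numbered directory per hardware context, and a "cpu<N>" directory per software context, each populated with the read-only attributes defined earlier in this file (queued, run, dispatched, pending, ipi_redirect and tags at the hctx level; dispatched, merged, completed and rq_list at the ctx level). The sketch below is a userspace reader for that tree and is not part of the patch; the /sys/block/<disk>/mq/<hctx>/ path layout is inferred from the kobject_add() calls above, and the default device name "vda" is only an example.

#include <dirent.h>
#include <stdio.h>

/* Print one attribute file, if it exists, from a hardware-queue directory. */
static void dump_attr(const char *dir, const char *name)
{
	char path[768], line[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/%s", dir, name);
	f = fopen(path, "r");
	if (!f)
		return;
	printf("  %s:\n", name);
	while (fgets(line, sizeof(line), f))
		printf("    %s", line);
	fclose(f);
}

int main(int argc, char **argv)
{
	const char *disk = argc > 1 ? argv[1] : "vda";	/* example device name */
	char base[256], hctx_dir[512];
	struct dirent *de;
	DIR *d;

	snprintf(base, sizeof(base), "/sys/block/%s/mq", disk);
	d = opendir(base);
	if (!d) {
		perror(base);
		return 1;
	}

	/* Each entry under mq/ is one hardware context, named by its index. */
	while ((de = readdir(d)) != NULL) {
		if (de->d_name[0] == '.')
			continue;
		snprintf(hctx_dir, sizeof(hctx_dir), "%s/%s", base, de->d_name);
		printf("hardware queue %s\n", de->d_name);
		dump_attr(hctx_dir, "queued");
		dump_attr(hctx_dir, "run");
		dump_attr(hctx_dir, "dispatched");
		dump_attr(hctx_dir, "tags");
	}
	closedir(d);
	return 0;
}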
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c new file mode 100644 index 000000000000..d64a02fb1f73 --- /dev/null +++ b/block/blk-mq-tag.c | |||
@@ -0,0 +1,204 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/percpu_ida.h> | ||
4 | |||
5 | #include <linux/blk-mq.h> | ||
6 | #include "blk.h" | ||
7 | #include "blk-mq.h" | ||
8 | #include "blk-mq-tag.h" | ||
9 | |||
10 | /* | ||
11 | * Per tagged queue (tag address space) map | ||
12 | */ | ||
13 | struct blk_mq_tags { | ||
14 | unsigned int nr_tags; | ||
15 | unsigned int nr_reserved_tags; | ||
16 | unsigned int nr_batch_move; | ||
17 | unsigned int nr_max_cache; | ||
18 | |||
19 | struct percpu_ida free_tags; | ||
20 | struct percpu_ida reserved_tags; | ||
21 | }; | ||
22 | |||
23 | void blk_mq_wait_for_tags(struct blk_mq_tags *tags) | ||
24 | { | ||
25 | int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); | ||
26 | blk_mq_put_tag(tags, tag); | ||
27 | } | ||
28 | |||
29 | bool blk_mq_has_free_tags(struct blk_mq_tags *tags) | ||
30 | { | ||
31 | return !tags || | ||
32 | percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; | ||
33 | } | ||
34 | |||
35 | static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) | ||
36 | { | ||
37 | int tag; | ||
38 | |||
39 | tag = percpu_ida_alloc(&tags->free_tags, gfp); | ||
40 | if (tag < 0) | ||
41 | return BLK_MQ_TAG_FAIL; | ||
42 | return tag + tags->nr_reserved_tags; | ||
43 | } | ||
44 | |||
45 | static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, | ||
46 | gfp_t gfp) | ||
47 | { | ||
48 | int tag; | ||
49 | |||
50 | if (unlikely(!tags->nr_reserved_tags)) { | ||
51 | WARN_ON_ONCE(1); | ||
52 | return BLK_MQ_TAG_FAIL; | ||
53 | } | ||
54 | |||
55 | tag = percpu_ida_alloc(&tags->reserved_tags, gfp); | ||
56 | if (tag < 0) | ||
57 | return BLK_MQ_TAG_FAIL; | ||
58 | return tag; | ||
59 | } | ||
60 | |||
61 | unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) | ||
62 | { | ||
63 | if (!reserved) | ||
64 | return __blk_mq_get_tag(tags, gfp); | ||
65 | |||
66 | return __blk_mq_get_reserved_tag(tags, gfp); | ||
67 | } | ||
68 | |||
69 | static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) | ||
70 | { | ||
71 | BUG_ON(tag >= tags->nr_tags); | ||
72 | |||
73 | percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); | ||
74 | } | ||
75 | |||
76 | static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, | ||
77 | unsigned int tag) | ||
78 | { | ||
79 | BUG_ON(tag >= tags->nr_reserved_tags); | ||
80 | |||
81 | percpu_ida_free(&tags->reserved_tags, tag); | ||
82 | } | ||
83 | |||
84 | void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) | ||
85 | { | ||
86 | if (tag >= tags->nr_reserved_tags) | ||
87 | __blk_mq_put_tag(tags, tag); | ||
88 | else | ||
89 | __blk_mq_put_reserved_tag(tags, tag); | ||
90 | } | ||
91 | |||
92 | static int __blk_mq_tag_iter(unsigned id, void *data) | ||
93 | { | ||
94 | unsigned long *tag_map = data; | ||
95 | __set_bit(id, tag_map); | ||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, | ||
100 | void (*fn)(void *, unsigned long *), void *data) | ||
101 | { | ||
102 | unsigned long *tag_map; | ||
103 | size_t map_size; | ||
104 | |||
105 | map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG; | ||
106 | tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC); | ||
107 | if (!tag_map) | ||
108 | return; | ||
109 | |||
110 | percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); | ||
111 | if (tags->nr_reserved_tags) | ||
112 | percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, | ||
113 | tag_map); | ||
114 | |||
115 | fn(data, tag_map); | ||
116 | kfree(tag_map); | ||
117 | } | ||
118 | |||
119 | struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, | ||
120 | unsigned int reserved_tags, int node) | ||
121 | { | ||
122 | unsigned int nr_tags, nr_cache; | ||
123 | struct blk_mq_tags *tags; | ||
124 | int ret; | ||
125 | |||
126 | if (total_tags > BLK_MQ_TAG_MAX) { | ||
127 | pr_err("blk-mq: tag depth too large\n"); | ||
128 | return NULL; | ||
129 | } | ||
130 | |||
131 | tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); | ||
132 | if (!tags) | ||
133 | return NULL; | ||
134 | |||
135 | nr_tags = total_tags - reserved_tags; | ||
136 | nr_cache = nr_tags / num_possible_cpus(); | ||
137 | |||
138 | if (nr_cache < BLK_MQ_TAG_CACHE_MIN) | ||
139 | nr_cache = BLK_MQ_TAG_CACHE_MIN; | ||
140 | else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) | ||
141 | nr_cache = BLK_MQ_TAG_CACHE_MAX; | ||
142 | |||
143 | tags->nr_tags = total_tags; | ||
144 | tags->nr_reserved_tags = reserved_tags; | ||
145 | tags->nr_max_cache = nr_cache; | ||
146 | tags->nr_batch_move = max(1u, nr_cache / 2); | ||
147 | |||
148 | ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - | ||
149 | tags->nr_reserved_tags, | ||
150 | tags->nr_max_cache, | ||
151 | tags->nr_batch_move); | ||
152 | if (ret) | ||
153 | goto err_free_tags; | ||
154 | |||
155 | if (reserved_tags) { | ||
156 | /* | ||
157 | * With max_cache and batch set to 1, the allocator falls back to | ||
158 | * no caching. It's fine if reserved tag allocation is slow. | ||
159 | */ | ||
160 | ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, | ||
161 | 1, 1); | ||
162 | if (ret) | ||
163 | goto err_reserved_tags; | ||
164 | } | ||
165 | |||
166 | return tags; | ||
167 | |||
168 | err_reserved_tags: | ||
169 | percpu_ida_destroy(&tags->free_tags); | ||
170 | err_free_tags: | ||
171 | kfree(tags); | ||
172 | return NULL; | ||
173 | } | ||
174 | |||
175 | void blk_mq_free_tags(struct blk_mq_tags *tags) | ||
176 | { | ||
177 | percpu_ida_destroy(&tags->free_tags); | ||
178 | percpu_ida_destroy(&tags->reserved_tags); | ||
179 | kfree(tags); | ||
180 | } | ||
181 | |||
182 | ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) | ||
183 | { | ||
184 | char *orig_page = page; | ||
185 | int cpu; | ||
186 | |||
187 | if (!tags) | ||
188 | return 0; | ||
189 | |||
190 | page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," | ||
191 | " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, | ||
192 | tags->nr_batch_move, tags->nr_max_cache); | ||
193 | |||
194 | page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", | ||
195 | percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), | ||
196 | percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); | ||
197 | |||
198 | for_each_possible_cpu(cpu) { | ||
199 | page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, | ||
200 | percpu_ida_free_tags(&tags->free_tags, cpu)); | ||
201 | } | ||
202 | |||
203 | return page - orig_page; | ||
204 | } | ||
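The allocator above keeps two percpu_ida pools and encodes which pool a tag came from in the tag value itself: reserved tags occupy [0, nr_reserved_tags) and normal tags are handed out offset by nr_reserved_tags, so blk_mq_put_tag() only needs a threshold comparison to route a free back to the right pool. The standalone sketch below models just that numbering scheme; it is not the kernel code, a plain LIFO stack stands in for percpu_ida, and the depths are made-up example values.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct pool {
	unsigned int *ids;
	unsigned int nr;		/* number of ids currently free */
};

static void pool_init(struct pool *p, unsigned int depth)
{
	p->ids = malloc(depth * sizeof(*p->ids));
	if (!p->ids)
		exit(1);
	p->nr = depth;
	for (unsigned int i = 0; i < depth; i++)
		p->ids[i] = i;
}

static int pool_get(struct pool *p)
{
	return p->nr ? (int)p->ids[--p->nr] : -1;
}

static void pool_put(struct pool *p, unsigned int id)
{
	p->ids[p->nr++] = id;
}

struct tags {
	unsigned int nr_reserved;
	struct pool free_tags;		/* normal tags, returned offset by nr_reserved */
	struct pool reserved_tags;	/* reserved tags, values 0..nr_reserved-1 */
};

/* Mirrors the split in blk_mq_get_tag(): normal tags are shifted up. */
static int get_tag(struct tags *t, int reserved)
{
	int tag;

	if (reserved)
		return pool_get(&t->reserved_tags);
	tag = pool_get(&t->free_tags);
	return tag < 0 ? -1 : tag + (int)t->nr_reserved;
}

/* Mirrors blk_mq_put_tag(): the threshold decides which pool gets the tag. */
static void put_tag(struct tags *t, unsigned int tag)
{
	if (tag >= t->nr_reserved)
		pool_put(&t->free_tags, tag - t->nr_reserved);
	else
		pool_put(&t->reserved_tags, tag);
}

int main(void)
{
	struct tags t = { .nr_reserved = 1 };	/* example: depth 5, 1 reserved */

	pool_init(&t.free_tags, 4);
	pool_init(&t.reserved_tags, 1);

	int normal = get_tag(&t, 0), rsvd = get_tag(&t, 1);

	printf("normal tag %d, reserved tag %d\n", normal, rsvd);
	assert(normal >= 1 && rsvd == 0);
	put_tag(&t, (unsigned int)normal);
	put_tag(&t, (unsigned int)rsvd);
	return 0;
}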
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h new file mode 100644 index 000000000000..947ba2c6148e --- /dev/null +++ b/block/blk-mq-tag.h | |||
@@ -0,0 +1,27 @@ | |||
1 | #ifndef INT_BLK_MQ_TAG_H | ||
2 | #define INT_BLK_MQ_TAG_H | ||
3 | |||
4 | struct blk_mq_tags; | ||
5 | |||
6 | extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); | ||
7 | extern void blk_mq_free_tags(struct blk_mq_tags *tags); | ||
8 | |||
9 | extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); | ||
10 | extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); | ||
11 | extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); | ||
12 | extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); | ||
13 | extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); | ||
14 | extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); | ||
15 | |||
16 | enum { | ||
17 | BLK_MQ_TAG_CACHE_MIN = 1, | ||
18 | BLK_MQ_TAG_CACHE_MAX = 64, | ||
19 | }; | ||
20 | |||
21 | enum { | ||
22 | BLK_MQ_TAG_FAIL = -1U, | ||
23 | BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, | ||
24 | BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, | ||
25 | }; | ||
26 | |||
27 | #endif | ||
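A note on the second enum: BLK_MQ_TAG_FAIL is defined as -1U, i.e. the largest unsigned int, which keeps it outside the valid tag range ending at BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1 and lets blk_mq_get_tag() signal failure without a separate flag. The tiny standalone check below, not part of the patch, simply prints what that arithmetic evaluates to.

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned int fail = -1U;	/* mirrors BLK_MQ_TAG_FAIL */
	unsigned int max = fail - 1;	/* mirrors BLK_MQ_TAG_MAX */

	printf("FAIL=%u (UINT_MAX=%u), MAX=%u\n", fail, UINT_MAX, max);
	return 0;
}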
diff --git a/block/blk-mq.c b/block/blk-mq.c new file mode 100644 index 000000000000..88d4e864d4c0 --- /dev/null +++ b/block/blk-mq.c | |||
@@ -0,0 +1,1500 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/backing-dev.h> | ||
4 | #include <linux/bio.h> | ||
5 | #include <linux/blkdev.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/workqueue.h> | ||
10 | #include <linux/smp.h> | ||
11 | #include <linux/llist.h> | ||
12 | #include <linux/list_sort.h> | ||
13 | #include <linux/cpu.h> | ||
14 | #include <linux/cache.h> | ||
15 | #include <linux/sched/sysctl.h> | ||
16 | #include <linux/delay.h> | ||
17 | |||
18 | #include <trace/events/block.h> | ||
19 | |||
20 | #include <linux/blk-mq.h> | ||
21 | #include "blk.h" | ||
22 | #include "blk-mq.h" | ||
23 | #include "blk-mq-tag.h" | ||
24 | |||
25 | static DEFINE_MUTEX(all_q_mutex); | ||
26 | static LIST_HEAD(all_q_list); | ||
27 | |||
28 | static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); | ||
29 | |||
30 | DEFINE_PER_CPU(struct llist_head, ipi_lists); | ||
31 | |||
32 | static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, | ||
33 | unsigned int cpu) | ||
34 | { | ||
35 | return per_cpu_ptr(q->queue_ctx, cpu); | ||
36 | } | ||
37 | |||
38 | /* | ||
39 | * This assumes per-cpu software queues. They could be per-node | ||
40 | * as well, for instance. For now this is hardcoded as-is. Note that we don't | ||
41 | * care about preemption, since we know the ctx's are persistent. This does | ||
42 | * mean that we can't rely on ctx always matching the currently running CPU. | ||
43 | */ | ||
44 | static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) | ||
45 | { | ||
46 | return __blk_mq_get_ctx(q, get_cpu()); | ||
47 | } | ||
48 | |||
49 | static void blk_mq_put_ctx(struct blk_mq_ctx *ctx) | ||
50 | { | ||
51 | put_cpu(); | ||
52 | } | ||
53 | |||
54 | /* | ||
55 | * Check if any of the ctx's have pending work in this hardware queue | ||
56 | */ | ||
57 | static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) | ||
58 | { | ||
59 | unsigned int i; | ||
60 | |||
61 | for (i = 0; i < hctx->nr_ctx_map; i++) | ||
62 | if (hctx->ctx_map[i]) | ||
63 | return true; | ||
64 | |||
65 | return false; | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * Mark this ctx as having pending work in this hardware queue | ||
70 | */ | ||
71 | static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, | ||
72 | struct blk_mq_ctx *ctx) | ||
73 | { | ||
74 | if (!test_bit(ctx->index_hw, hctx->ctx_map)) | ||
75 | set_bit(ctx->index_hw, hctx->ctx_map); | ||
76 | } | ||
77 | |||
78 | static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp, | ||
79 | bool reserved) | ||
80 | { | ||
81 | struct request *rq; | ||
82 | unsigned int tag; | ||
83 | |||
84 | tag = blk_mq_get_tag(hctx->tags, gfp, reserved); | ||
85 | if (tag != BLK_MQ_TAG_FAIL) { | ||
86 | rq = hctx->rqs[tag]; | ||
87 | rq->tag = tag; | ||
88 | |||
89 | return rq; | ||
90 | } | ||
91 | |||
92 | return NULL; | ||
93 | } | ||
94 | |||
95 | static int blk_mq_queue_enter(struct request_queue *q) | ||
96 | { | ||
97 | int ret; | ||
98 | |||
99 | __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); | ||
100 | smp_wmb(); | ||
101 | /* we have problems freezing the queue if it's initializing */ | ||
102 | if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) | ||
103 | return 0; | ||
104 | |||
105 | __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); | ||
106 | |||
107 | spin_lock_irq(q->queue_lock); | ||
108 | ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq, | ||
109 | !blk_queue_bypass(q), *q->queue_lock); | ||
110 | /* inc usage with the lock held to avoid freeze_queue running here */ | ||
111 | if (!ret) | ||
112 | __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); | ||
113 | spin_unlock_irq(q->queue_lock); | ||
114 | |||
115 | return ret; | ||
116 | } | ||
117 | |||
118 | static void blk_mq_queue_exit(struct request_queue *q) | ||
119 | { | ||
120 | __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * Guarantee no request is in use, so we can change any data structure of | ||
125 | * the queue afterward. | ||
126 | */ | ||
127 | static void blk_mq_freeze_queue(struct request_queue *q) | ||
128 | { | ||
129 | bool drain; | ||
130 | |||
131 | spin_lock_irq(q->queue_lock); | ||
132 | drain = !q->bypass_depth++; | ||
133 | queue_flag_set(QUEUE_FLAG_BYPASS, q); | ||
134 | spin_unlock_irq(q->queue_lock); | ||
135 | |||
136 | if (!drain) | ||
137 | return; | ||
138 | |||
139 | while (true) { | ||
140 | s64 count; | ||
141 | |||
142 | spin_lock_irq(q->queue_lock); | ||
143 | count = percpu_counter_sum(&q->mq_usage_counter); | ||
144 | spin_unlock_irq(q->queue_lock); | ||
145 | |||
146 | if (count == 0) | ||
147 | break; | ||
148 | blk_mq_run_queues(q, false); | ||
149 | msleep(10); | ||
150 | } | ||
151 | } | ||
152 | |||
153 | static void blk_mq_unfreeze_queue(struct request_queue *q) | ||
154 | { | ||
155 | bool wake = false; | ||
156 | |||
157 | spin_lock_irq(q->queue_lock); | ||
158 | if (!--q->bypass_depth) { | ||
159 | queue_flag_clear(QUEUE_FLAG_BYPASS, q); | ||
160 | wake = true; | ||
161 | } | ||
162 | WARN_ON_ONCE(q->bypass_depth < 0); | ||
163 | spin_unlock_irq(q->queue_lock); | ||
164 | if (wake) | ||
165 | wake_up_all(&q->mq_freeze_wq); | ||
166 | } | ||
167 | |||
168 | bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) | ||
169 | { | ||
170 | return blk_mq_has_free_tags(hctx->tags); | ||
171 | } | ||
172 | EXPORT_SYMBOL(blk_mq_can_queue); | ||
173 | |||
174 | static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq, | ||
175 | unsigned int rw_flags) | ||
176 | { | ||
177 | rq->mq_ctx = ctx; | ||
178 | rq->cmd_flags = rw_flags; | ||
179 | ctx->rq_dispatched[rw_is_sync(rw_flags)]++; | ||
180 | } | ||
181 | |||
182 | static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, | ||
183 | gfp_t gfp, bool reserved) | ||
184 | { | ||
185 | return blk_mq_alloc_rq(hctx, gfp, reserved); | ||
186 | } | ||
187 | |||
188 | static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, | ||
189 | int rw, gfp_t gfp, | ||
190 | bool reserved) | ||
191 | { | ||
192 | struct request *rq; | ||
193 | |||
194 | do { | ||
195 | struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); | ||
196 | struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
197 | |||
198 | rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); | ||
199 | if (rq) { | ||
200 | blk_mq_rq_ctx_init(ctx, rq, rw); | ||
201 | break; | ||
202 | } else if (!(gfp & __GFP_WAIT)) | ||
203 | break; | ||
204 | |||
205 | blk_mq_put_ctx(ctx); | ||
206 | __blk_mq_run_hw_queue(hctx); | ||
207 | blk_mq_wait_for_tags(hctx->tags); | ||
208 | } while (1); | ||
209 | |||
210 | return rq; | ||
211 | } | ||
212 | |||
213 | struct request *blk_mq_alloc_request(struct request_queue *q, int rw, | ||
214 | gfp_t gfp, bool reserved) | ||
215 | { | ||
216 | struct request *rq; | ||
217 | |||
218 | if (blk_mq_queue_enter(q)) | ||
219 | return NULL; | ||
220 | |||
221 | rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); | ||
222 | blk_mq_put_ctx(rq->mq_ctx); | ||
223 | return rq; | ||
224 | } | ||
225 | |||
226 | struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, | ||
227 | gfp_t gfp) | ||
228 | { | ||
229 | struct request *rq; | ||
230 | |||
231 | if (blk_mq_queue_enter(q)) | ||
232 | return NULL; | ||
233 | |||
234 | rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); | ||
235 | blk_mq_put_ctx(rq->mq_ctx); | ||
236 | return rq; | ||
237 | } | ||
238 | EXPORT_SYMBOL(blk_mq_alloc_reserved_request); | ||
239 | |||
240 | /* | ||
241 | * Re-init and set pdu, if we have it | ||
242 | */ | ||
243 | static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) | ||
244 | { | ||
245 | blk_rq_init(hctx->queue, rq); | ||
246 | |||
247 | if (hctx->cmd_size) | ||
248 | rq->special = blk_mq_rq_to_pdu(rq); | ||
249 | } | ||
250 | |||
251 | static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, | ||
252 | struct blk_mq_ctx *ctx, struct request *rq) | ||
253 | { | ||
254 | const int tag = rq->tag; | ||
255 | struct request_queue *q = rq->q; | ||
256 | |||
257 | blk_mq_rq_init(hctx, rq); | ||
258 | blk_mq_put_tag(hctx->tags, tag); | ||
259 | |||
260 | blk_mq_queue_exit(q); | ||
261 | } | ||
262 | |||
263 | void blk_mq_free_request(struct request *rq) | ||
264 | { | ||
265 | struct blk_mq_ctx *ctx = rq->mq_ctx; | ||
266 | struct blk_mq_hw_ctx *hctx; | ||
267 | struct request_queue *q = rq->q; | ||
268 | |||
269 | ctx->rq_completed[rq_is_sync(rq)]++; | ||
270 | |||
271 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
272 | __blk_mq_free_request(hctx, ctx, rq); | ||
273 | } | ||
274 | |||
275 | static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) | ||
276 | { | ||
277 | if (error) | ||
278 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
279 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
280 | error = -EIO; | ||
281 | |||
282 | if (unlikely(rq->cmd_flags & REQ_QUIET)) | ||
283 | set_bit(BIO_QUIET, &bio->bi_flags); | ||
284 | |||
285 | /* don't actually finish bio if it's part of flush sequence */ | ||
286 | if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) | ||
287 | bio_endio(bio, error); | ||
288 | } | ||
289 | |||
290 | void blk_mq_complete_request(struct request *rq, int error) | ||
291 | { | ||
292 | struct bio *bio = rq->bio; | ||
293 | unsigned int bytes = 0; | ||
294 | |||
295 | trace_block_rq_complete(rq->q, rq); | ||
296 | |||
297 | while (bio) { | ||
298 | struct bio *next = bio->bi_next; | ||
299 | |||
300 | bio->bi_next = NULL; | ||
301 | bytes += bio->bi_size; | ||
302 | blk_mq_bio_endio(rq, bio, error); | ||
303 | bio = next; | ||
304 | } | ||
305 | |||
306 | blk_account_io_completion(rq, bytes); | ||
307 | |||
308 | if (rq->end_io) | ||
309 | rq->end_io(rq, error); | ||
310 | else | ||
311 | blk_mq_free_request(rq); | ||
312 | |||
313 | blk_account_io_done(rq); | ||
314 | } | ||
315 | |||
316 | void __blk_mq_end_io(struct request *rq, int error) | ||
317 | { | ||
318 | if (!blk_mark_rq_complete(rq)) | ||
319 | blk_mq_complete_request(rq, error); | ||
320 | } | ||
321 | |||
322 | #if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) | ||
323 | |||
324 | /* | ||
325 | * Called with interrupts disabled. | ||
326 | */ | ||
327 | static void ipi_end_io(void *data) | ||
328 | { | ||
329 | struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id()); | ||
330 | struct llist_node *entry, *next; | ||
331 | struct request *rq; | ||
332 | |||
333 | entry = llist_del_all(list); | ||
334 | |||
335 | while (entry) { | ||
336 | next = entry->next; | ||
337 | rq = llist_entry(entry, struct request, ll_list); | ||
338 | __blk_mq_end_io(rq, rq->errors); | ||
339 | entry = next; | ||
340 | } | ||
341 | } | ||
342 | |||
343 | static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, | ||
344 | struct request *rq, const int error) | ||
345 | { | ||
346 | struct call_single_data *data = &rq->csd; | ||
347 | |||
348 | rq->errors = error; | ||
349 | rq->ll_list.next = NULL; | ||
350 | |||
351 | /* | ||
352 | * If the list is non-empty, an existing IPI must already | ||
353 | * be "in flight". If that is the case, we need not schedule | ||
354 | * a new one. | ||
355 | */ | ||
356 | if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) { | ||
357 | data->func = ipi_end_io; | ||
358 | data->flags = 0; | ||
359 | __smp_call_function_single(ctx->cpu, data, 0); | ||
360 | } | ||
361 | |||
362 | return true; | ||
363 | } | ||
364 | #else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ | ||
365 | static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, | ||
366 | struct request *rq, const int error) | ||
367 | { | ||
368 | return false; | ||
369 | } | ||
370 | #endif | ||
371 | |||
372 | /* | ||
373 | * End IO on this request on a multiqueue enabled driver. We'll either do | ||
374 | * it directly inline, or punt to a local IPI handler on the matching | ||
375 | * remote CPU. | ||
376 | */ | ||
377 | void blk_mq_end_io(struct request *rq, int error) | ||
378 | { | ||
379 | struct blk_mq_ctx *ctx = rq->mq_ctx; | ||
380 | int cpu; | ||
381 | |||
382 | if (!ctx->ipi_redirect) | ||
383 | return __blk_mq_end_io(rq, error); | ||
384 | |||
385 | cpu = get_cpu(); | ||
386 | |||
387 | if (cpu == ctx->cpu || !cpu_online(ctx->cpu) || | ||
388 | !ipi_remote_cpu(ctx, cpu, rq, error)) | ||
389 | __blk_mq_end_io(rq, error); | ||
390 | |||
391 | put_cpu(); | ||
392 | } | ||
393 | EXPORT_SYMBOL(blk_mq_end_io); | ||
394 | |||
395 | static void blk_mq_start_request(struct request *rq) | ||
396 | { | ||
397 | struct request_queue *q = rq->q; | ||
398 | |||
399 | trace_block_rq_issue(q, rq); | ||
400 | |||
401 | /* | ||
402 | * Just mark start time and set the started bit. Due to memory | ||
403 | * ordering, we know we'll see the correct deadline as long as | ||
404 | * REQ_ATOM_STARTED is seen. | ||
405 | */ | ||
406 | rq->deadline = jiffies + q->rq_timeout; | ||
407 | set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | ||
408 | } | ||
409 | |||
410 | static void blk_mq_requeue_request(struct request *rq) | ||
411 | { | ||
412 | struct request_queue *q = rq->q; | ||
413 | |||
414 | trace_block_rq_requeue(q, rq); | ||
415 | clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | ||
416 | } | ||
417 | |||
418 | struct blk_mq_timeout_data { | ||
419 | struct blk_mq_hw_ctx *hctx; | ||
420 | unsigned long *next; | ||
421 | unsigned int *next_set; | ||
422 | }; | ||
423 | |||
424 | static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) | ||
425 | { | ||
426 | struct blk_mq_timeout_data *data = __data; | ||
427 | struct blk_mq_hw_ctx *hctx = data->hctx; | ||
428 | unsigned int tag; | ||
429 | |||
430 | /* It may not be in flight yet (this is where | ||
431 | * the REQ_ATOM_STARTED flag comes in). The requests are | ||
432 | * statically allocated, so we know it's always safe to access the | ||
433 | * memory associated with a bit offset into ->rqs[]. | ||
434 | */ | ||
435 | tag = 0; | ||
436 | do { | ||
437 | struct request *rq; | ||
438 | |||
439 | tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); | ||
440 | if (tag >= hctx->queue_depth) | ||
441 | break; | ||
442 | |||
443 | rq = hctx->rqs[tag++]; | ||
444 | |||
445 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) | ||
446 | continue; | ||
447 | |||
448 | blk_rq_check_expired(rq, data->next, data->next_set); | ||
449 | } while (1); | ||
450 | } | ||
451 | |||
452 | static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, | ||
453 | unsigned long *next, | ||
454 | unsigned int *next_set) | ||
455 | { | ||
456 | struct blk_mq_timeout_data data = { | ||
457 | .hctx = hctx, | ||
458 | .next = next, | ||
459 | .next_set = next_set, | ||
460 | }; | ||
461 | |||
462 | /* | ||
463 | * Ask the tagging code to iterate busy requests, so we can | ||
464 | * check them for timeout. | ||
465 | */ | ||
466 | blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); | ||
467 | } | ||
468 | |||
469 | static void blk_mq_rq_timer(unsigned long data) | ||
470 | { | ||
471 | struct request_queue *q = (struct request_queue *) data; | ||
472 | struct blk_mq_hw_ctx *hctx; | ||
473 | unsigned long next = 0; | ||
474 | int i, next_set = 0; | ||
475 | |||
476 | queue_for_each_hw_ctx(q, hctx, i) | ||
477 | blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); | ||
478 | |||
479 | if (next_set) | ||
480 | mod_timer(&q->timeout, round_jiffies_up(next)); | ||
481 | } | ||
482 | |||
483 | /* | ||
484 | * Reverse check our software queue for entries that we could potentially | ||
485 | * merge with. Currently includes a hand-wavy stop count of 8, to not spend | ||
486 | * too much time checking for merges. | ||
487 | */ | ||
488 | static bool blk_mq_attempt_merge(struct request_queue *q, | ||
489 | struct blk_mq_ctx *ctx, struct bio *bio) | ||
490 | { | ||
491 | struct request *rq; | ||
492 | int checked = 8; | ||
493 | |||
494 | list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { | ||
495 | int el_ret; | ||
496 | |||
497 | if (!checked--) | ||
498 | break; | ||
499 | |||
500 | if (!blk_rq_merge_ok(rq, bio)) | ||
501 | continue; | ||
502 | |||
503 | el_ret = blk_try_merge(rq, bio); | ||
504 | if (el_ret == ELEVATOR_BACK_MERGE) { | ||
505 | if (bio_attempt_back_merge(q, rq, bio)) { | ||
506 | ctx->rq_merged++; | ||
507 | return true; | ||
508 | } | ||
509 | break; | ||
510 | } else if (el_ret == ELEVATOR_FRONT_MERGE) { | ||
511 | if (bio_attempt_front_merge(q, rq, bio)) { | ||
512 | ctx->rq_merged++; | ||
513 | return true; | ||
514 | } | ||
515 | break; | ||
516 | } | ||
517 | } | ||
518 | |||
519 | return false; | ||
520 | } | ||
521 | |||
522 | void blk_mq_add_timer(struct request *rq) | ||
523 | { | ||
524 | __blk_add_timer(rq, NULL); | ||
525 | } | ||
526 | |||
527 | /* | ||
528 | * Run this hardware queue, pulling any software queues mapped to it in. | ||
529 | * Note that this function currently has various problems around ordering | ||
530 | * of IO. In particular, we'd like FIFO behaviour on handling existing | ||
531 | * items on the hctx->dispatch list. Ignore that for now. | ||
532 | */ | ||
533 | static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | ||
534 | { | ||
535 | struct request_queue *q = hctx->queue; | ||
536 | struct blk_mq_ctx *ctx; | ||
537 | struct request *rq; | ||
538 | LIST_HEAD(rq_list); | ||
539 | int bit, queued; | ||
540 | |||
541 | if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) | ||
542 | return; | ||
543 | |||
544 | hctx->run++; | ||
545 | |||
546 | /* | ||
547 | * Touch any software queue that has pending entries. | ||
548 | */ | ||
549 | for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { | ||
550 | clear_bit(bit, hctx->ctx_map); | ||
551 | ctx = hctx->ctxs[bit]; | ||
552 | BUG_ON(bit != ctx->index_hw); | ||
553 | |||
554 | spin_lock(&ctx->lock); | ||
555 | list_splice_tail_init(&ctx->rq_list, &rq_list); | ||
556 | spin_unlock(&ctx->lock); | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * If we have previous entries on our dispatch list, grab them | ||
561 | * and stuff them at the front for more fair dispatch. | ||
562 | */ | ||
563 | if (!list_empty_careful(&hctx->dispatch)) { | ||
564 | spin_lock(&hctx->lock); | ||
565 | if (!list_empty(&hctx->dispatch)) | ||
566 | list_splice_init(&hctx->dispatch, &rq_list); | ||
567 | spin_unlock(&hctx->lock); | ||
568 | } | ||
569 | |||
570 | /* | ||
571 | * Delete and return all entries from our dispatch list | ||
572 | */ | ||
573 | queued = 0; | ||
574 | |||
575 | /* | ||
576 | * Now process all the entries, sending them to the driver. | ||
577 | */ | ||
578 | while (!list_empty(&rq_list)) { | ||
579 | int ret; | ||
580 | |||
581 | rq = list_first_entry(&rq_list, struct request, queuelist); | ||
582 | list_del_init(&rq->queuelist); | ||
583 | blk_mq_start_request(rq); | ||
584 | |||
585 | /* | ||
586 | * Last request in the series. Flag it as such; this | ||
587 | * enables drivers to know when IO should be kicked off, | ||
588 | * if they don't do it on a per-request basis. | ||
589 | * | ||
590 | * Note: the flag isn't the only condition on which drivers | ||
591 | * should kick off IO. If the drive is busy, the last | ||
592 | * request might not have the bit set. | ||
593 | */ | ||
594 | if (list_empty(&rq_list)) | ||
595 | rq->cmd_flags |= REQ_END; | ||
596 | |||
597 | ret = q->mq_ops->queue_rq(hctx, rq); | ||
598 | switch (ret) { | ||
599 | case BLK_MQ_RQ_QUEUE_OK: | ||
600 | queued++; | ||
601 | continue; | ||
602 | case BLK_MQ_RQ_QUEUE_BUSY: | ||
603 | /* | ||
604 | * FIXME: we should have a mechanism to stop the queue | ||
605 | * like blk_stop_queue, otherwise we will waste cpu | ||
606 | * time | ||
607 | */ | ||
608 | list_add(&rq->queuelist, &rq_list); | ||
609 | blk_mq_requeue_request(rq); | ||
610 | break; | ||
611 | default: | ||
612 | pr_err("blk-mq: bad return on queue: %d\n", ret); | ||
613 | rq->errors = -EIO; | ||
614 | case BLK_MQ_RQ_QUEUE_ERROR: | ||
615 | blk_mq_end_io(rq, rq->errors); | ||
616 | break; | ||
617 | } | ||
618 | |||
619 | if (ret == BLK_MQ_RQ_QUEUE_BUSY) | ||
620 | break; | ||
621 | } | ||
622 | |||
623 | if (!queued) | ||
624 | hctx->dispatched[0]++; | ||
625 | else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) | ||
626 | hctx->dispatched[ilog2(queued) + 1]++; | ||
627 | |||
628 | /* | ||
629 | * Any items that need requeuing? Stuff them into hctx->dispatch, | ||
630 | * that is where we will continue on next queue run. | ||
631 | */ | ||
632 | if (!list_empty(&rq_list)) { | ||
633 | spin_lock(&hctx->lock); | ||
634 | list_splice(&rq_list, &hctx->dispatch); | ||
635 | spin_unlock(&hctx->lock); | ||
636 | } | ||
637 | } | ||
638 | |||
639 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) | ||
640 | { | ||
641 | if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) | ||
642 | return; | ||
643 | |||
644 | if (!async) | ||
645 | __blk_mq_run_hw_queue(hctx); | ||
646 | else { | ||
647 | struct request_queue *q = hctx->queue; | ||
648 | |||
649 | kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); | ||
650 | } | ||
651 | } | ||
652 | |||
653 | void blk_mq_run_queues(struct request_queue *q, bool async) | ||
654 | { | ||
655 | struct blk_mq_hw_ctx *hctx; | ||
656 | int i; | ||
657 | |||
658 | queue_for_each_hw_ctx(q, hctx, i) { | ||
659 | if ((!blk_mq_hctx_has_pending(hctx) && | ||
660 | list_empty_careful(&hctx->dispatch)) || | ||
661 | test_bit(BLK_MQ_S_STOPPED, &hctx->state)) | ||
662 | continue; | ||
663 | |||
664 | blk_mq_run_hw_queue(hctx, async); | ||
665 | } | ||
666 | } | ||
667 | EXPORT_SYMBOL(blk_mq_run_queues); | ||
668 | |||
669 | void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) | ||
670 | { | ||
671 | cancel_delayed_work(&hctx->delayed_work); | ||
672 | set_bit(BLK_MQ_S_STOPPED, &hctx->state); | ||
673 | } | ||
674 | EXPORT_SYMBOL(blk_mq_stop_hw_queue); | ||
675 | |||
676 | void blk_mq_stop_hw_queues(struct request_queue *q) | ||
677 | { | ||
678 | struct blk_mq_hw_ctx *hctx; | ||
679 | int i; | ||
680 | |||
681 | queue_for_each_hw_ctx(q, hctx, i) | ||
682 | blk_mq_stop_hw_queue(hctx); | ||
683 | } | ||
684 | EXPORT_SYMBOL(blk_mq_stop_hw_queues); | ||
685 | |||
686 | void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) | ||
687 | { | ||
688 | clear_bit(BLK_MQ_S_STOPPED, &hctx->state); | ||
689 | __blk_mq_run_hw_queue(hctx); | ||
690 | } | ||
691 | EXPORT_SYMBOL(blk_mq_start_hw_queue); | ||
692 | |||
693 | void blk_mq_start_stopped_hw_queues(struct request_queue *q) | ||
694 | { | ||
695 | struct blk_mq_hw_ctx *hctx; | ||
696 | int i; | ||
697 | |||
698 | queue_for_each_hw_ctx(q, hctx, i) { | ||
699 | if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) | ||
700 | continue; | ||
701 | |||
702 | clear_bit(BLK_MQ_S_STOPPED, &hctx->state); | ||
703 | blk_mq_run_hw_queue(hctx, true); | ||
704 | } | ||
705 | } | ||
706 | EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); | ||
707 | |||
708 | static void blk_mq_work_fn(struct work_struct *work) | ||
709 | { | ||
710 | struct blk_mq_hw_ctx *hctx; | ||
711 | |||
712 | hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); | ||
713 | __blk_mq_run_hw_queue(hctx); | ||
714 | } | ||
715 | |||
716 | static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, | ||
717 | struct request *rq) | ||
718 | { | ||
719 | struct blk_mq_ctx *ctx = rq->mq_ctx; | ||
720 | |||
721 | list_add_tail(&rq->queuelist, &ctx->rq_list); | ||
722 | blk_mq_hctx_mark_pending(hctx, ctx); | ||
723 | |||
724 | /* | ||
725 | * We do this early, to ensure we are on the right CPU. | ||
726 | */ | ||
727 | blk_mq_add_timer(rq); | ||
728 | } | ||
729 | |||
730 | void blk_mq_insert_request(struct request_queue *q, struct request *rq, | ||
731 | bool run_queue) | ||
732 | { | ||
733 | struct blk_mq_hw_ctx *hctx; | ||
734 | struct blk_mq_ctx *ctx, *current_ctx; | ||
735 | |||
736 | ctx = rq->mq_ctx; | ||
737 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
738 | |||
739 | if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) { | ||
740 | blk_insert_flush(rq); | ||
741 | } else { | ||
742 | current_ctx = blk_mq_get_ctx(q); | ||
743 | |||
744 | if (!cpu_online(ctx->cpu)) { | ||
745 | ctx = current_ctx; | ||
746 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
747 | rq->mq_ctx = ctx; | ||
748 | } | ||
749 | spin_lock(&ctx->lock); | ||
750 | __blk_mq_insert_request(hctx, rq); | ||
751 | spin_unlock(&ctx->lock); | ||
752 | |||
753 | blk_mq_put_ctx(current_ctx); | ||
754 | } | ||
755 | |||
756 | if (run_queue) | ||
757 | __blk_mq_run_hw_queue(hctx); | ||
758 | } | ||
759 | EXPORT_SYMBOL(blk_mq_insert_request); | ||
760 | |||
761 | /* | ||
762 | * This is a special version of blk_mq_insert_request to bypass FLUSH request | ||
763 | * check. Should only be used internally. | ||
764 | */ | ||
765 | void blk_mq_run_request(struct request *rq, bool run_queue, bool async) | ||
766 | { | ||
767 | struct request_queue *q = rq->q; | ||
768 | struct blk_mq_hw_ctx *hctx; | ||
769 | struct blk_mq_ctx *ctx, *current_ctx; | ||
770 | |||
771 | current_ctx = blk_mq_get_ctx(q); | ||
772 | |||
773 | ctx = rq->mq_ctx; | ||
774 | if (!cpu_online(ctx->cpu)) { | ||
775 | ctx = current_ctx; | ||
776 | rq->mq_ctx = ctx; | ||
777 | } | ||
778 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
779 | |||
780 | /* ctx->cpu might be offline */ | ||
781 | spin_lock(&ctx->lock); | ||
782 | __blk_mq_insert_request(hctx, rq); | ||
783 | spin_unlock(&ctx->lock); | ||
784 | |||
785 | blk_mq_put_ctx(current_ctx); | ||
786 | |||
787 | if (run_queue) | ||
788 | blk_mq_run_hw_queue(hctx, async); | ||
789 | } | ||
790 | |||
791 | static void blk_mq_insert_requests(struct request_queue *q, | ||
792 | struct blk_mq_ctx *ctx, | ||
793 | struct list_head *list, | ||
794 | int depth, | ||
795 | bool from_schedule) | ||
796 | |||
797 | { | ||
798 | struct blk_mq_hw_ctx *hctx; | ||
799 | struct blk_mq_ctx *current_ctx; | ||
800 | |||
801 | trace_block_unplug(q, depth, !from_schedule); | ||
802 | |||
803 | current_ctx = blk_mq_get_ctx(q); | ||
804 | |||
805 | if (!cpu_online(ctx->cpu)) | ||
806 | ctx = current_ctx; | ||
807 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
808 | |||
809 | /* | ||
810 | * preemption doesn't flush plug list, so it's possible ctx->cpu is | ||
811 | * offline now | ||
812 | */ | ||
813 | spin_lock(&ctx->lock); | ||
814 | while (!list_empty(list)) { | ||
815 | struct request *rq; | ||
816 | |||
817 | rq = list_first_entry(list, struct request, queuelist); | ||
818 | list_del_init(&rq->queuelist); | ||
819 | rq->mq_ctx = ctx; | ||
820 | __blk_mq_insert_request(hctx, rq); | ||
821 | } | ||
822 | spin_unlock(&ctx->lock); | ||
823 | |||
824 | blk_mq_put_ctx(current_ctx); | ||
825 | |||
826 | blk_mq_run_hw_queue(hctx, from_schedule); | ||
827 | } | ||
828 | |||
829 | static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
830 | { | ||
831 | struct request *rqa = container_of(a, struct request, queuelist); | ||
832 | struct request *rqb = container_of(b, struct request, queuelist); | ||
833 | |||
834 | return !(rqa->mq_ctx < rqb->mq_ctx || | ||
835 | (rqa->mq_ctx == rqb->mq_ctx && | ||
836 | blk_rq_pos(rqa) < blk_rq_pos(rqb))); | ||
837 | } | ||
838 | |||
839 | void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) | ||
840 | { | ||
841 | struct blk_mq_ctx *this_ctx; | ||
842 | struct request_queue *this_q; | ||
843 | struct request *rq; | ||
844 | LIST_HEAD(list); | ||
845 | LIST_HEAD(ctx_list); | ||
846 | unsigned int depth; | ||
847 | |||
848 | list_splice_init(&plug->mq_list, &list); | ||
849 | |||
850 | list_sort(NULL, &list, plug_ctx_cmp); | ||
851 | |||
852 | this_q = NULL; | ||
853 | this_ctx = NULL; | ||
854 | depth = 0; | ||
855 | |||
856 | while (!list_empty(&list)) { | ||
857 | rq = list_entry_rq(list.next); | ||
858 | list_del_init(&rq->queuelist); | ||
859 | BUG_ON(!rq->q); | ||
860 | if (rq->mq_ctx != this_ctx) { | ||
861 | if (this_ctx) { | ||
862 | blk_mq_insert_requests(this_q, this_ctx, | ||
863 | &ctx_list, depth, | ||
864 | from_schedule); | ||
865 | } | ||
866 | |||
867 | this_ctx = rq->mq_ctx; | ||
868 | this_q = rq->q; | ||
869 | depth = 0; | ||
870 | } | ||
871 | |||
872 | depth++; | ||
873 | list_add_tail(&rq->queuelist, &ctx_list); | ||
874 | } | ||
875 | |||
876 | /* | ||
877 | * If 'this_ctx' is set, we know we have entries to complete | ||
878 | * on 'ctx_list'. Do those. | ||
879 | */ | ||
880 | if (this_ctx) { | ||
881 | blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, | ||
882 | from_schedule); | ||
883 | } | ||
884 | } | ||
885 | |||
886 | static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) | ||
887 | { | ||
888 | init_request_from_bio(rq, bio); | ||
889 | blk_account_io_start(rq, 1); | ||
890 | } | ||
891 | |||
892 | static void blk_mq_make_request(struct request_queue *q, struct bio *bio) | ||
893 | { | ||
894 | struct blk_mq_hw_ctx *hctx; | ||
895 | struct blk_mq_ctx *ctx; | ||
896 | const int is_sync = rw_is_sync(bio->bi_rw); | ||
897 | const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); | ||
898 | int rw = bio_data_dir(bio); | ||
899 | struct request *rq; | ||
900 | unsigned int use_plug, request_count = 0; | ||
901 | |||
902 | /* | ||
903 | * If we have multiple hardware queues, just go directly to | ||
904 | * one of those for sync IO. | ||
905 | */ | ||
906 | use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); | ||
907 | |||
908 | blk_queue_bounce(q, &bio); | ||
909 | |||
910 | if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) | ||
911 | return; | ||
912 | |||
913 | if (blk_mq_queue_enter(q)) { | ||
914 | bio_endio(bio, -EIO); | ||
915 | return; | ||
916 | } | ||
917 | |||
918 | ctx = blk_mq_get_ctx(q); | ||
919 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
920 | |||
921 | trace_block_getrq(q, bio, rw); | ||
922 | rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); | ||
923 | if (likely(rq)) | ||
924 | blk_mq_rq_ctx_init(ctx, rq, rw); | ||
925 | else { | ||
926 | blk_mq_put_ctx(ctx); | ||
927 | trace_block_sleeprq(q, bio, rw); | ||
928 | rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, | ||
929 | false); | ||
930 | ctx = rq->mq_ctx; | ||
931 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | ||
932 | } | ||
933 | |||
934 | hctx->queued++; | ||
935 | |||
936 | if (unlikely(is_flush_fua)) { | ||
937 | blk_mq_bio_to_request(rq, bio); | ||
938 | blk_mq_put_ctx(ctx); | ||
939 | blk_insert_flush(rq); | ||
940 | goto run_queue; | ||
941 | } | ||
942 | |||
943 | /* | ||
944 | * A task plug currently exists. Since this is completely lockless, | ||
945 | * utilize that to temporarily store requests until the task is | ||
946 | * either done or scheduled away. | ||
947 | */ | ||
948 | if (use_plug) { | ||
949 | struct blk_plug *plug = current->plug; | ||
950 | |||
951 | if (plug) { | ||
952 | blk_mq_bio_to_request(rq, bio); | ||
953 | if (list_empty(&plug->mq_list)) | ||
954 | trace_block_plug(q); | ||
955 | else if (request_count >= BLK_MAX_REQUEST_COUNT) { | ||
956 | blk_flush_plug_list(plug, false); | ||
957 | trace_block_plug(q); | ||
958 | } | ||
959 | list_add_tail(&rq->queuelist, &plug->mq_list); | ||
960 | blk_mq_put_ctx(ctx); | ||
961 | return; | ||
962 | } | ||
963 | } | ||
964 | |||
965 | spin_lock(&ctx->lock); | ||
966 | |||
967 | if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && | ||
968 | blk_mq_attempt_merge(q, ctx, bio)) | ||
969 | __blk_mq_free_request(hctx, ctx, rq); | ||
970 | else { | ||
971 | blk_mq_bio_to_request(rq, bio); | ||
972 | __blk_mq_insert_request(hctx, rq); | ||
973 | } | ||
974 | |||
975 | spin_unlock(&ctx->lock); | ||
976 | blk_mq_put_ctx(ctx); | ||
977 | |||
978 | /* | ||
979 | * For a SYNC request, send it to the hardware immediately. For an | ||
980 | * ASYNC request, just ensure that we run it later on. The latter | ||
981 | * allows for merging opportunities and more efficient dispatching. | ||
982 | */ | ||
983 | run_queue: | ||
984 | blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); | ||
985 | } | ||
986 | |||
987 | /* | ||
988 | * Default mapping to a software queue, since we use one per CPU. | ||
989 | */ | ||
990 | struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) | ||
991 | { | ||
992 | return q->queue_hw_ctx[q->mq_map[cpu]]; | ||
993 | } | ||
994 | EXPORT_SYMBOL(blk_mq_map_queue); | ||
995 | |||
996 | struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, | ||
997 | unsigned int hctx_index) | ||
998 | { | ||
999 | return kmalloc_node(sizeof(struct blk_mq_hw_ctx), | ||
1000 | GFP_KERNEL | __GFP_ZERO, reg->numa_node); | ||
1001 | } | ||
1002 | EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); | ||
1003 | |||
1004 | void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, | ||
1005 | unsigned int hctx_index) | ||
1006 | { | ||
1007 | kfree(hctx); | ||
1008 | } | ||
1009 | EXPORT_SYMBOL(blk_mq_free_single_hw_queue); | ||
1010 | |||
1011 | static void blk_mq_hctx_notify(void *data, unsigned long action, | ||
1012 | unsigned int cpu) | ||
1013 | { | ||
1014 | struct blk_mq_hw_ctx *hctx = data; | ||
1015 | struct blk_mq_ctx *ctx; | ||
1016 | LIST_HEAD(tmp); | ||
1017 | |||
1018 | if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) | ||
1019 | return; | ||
1020 | |||
1021 | /* | ||
1022 | * Move ctx entries to new CPU, if this one is going away. | ||
1023 | */ | ||
1024 | ctx = __blk_mq_get_ctx(hctx->queue, cpu); | ||
1025 | |||
1026 | spin_lock(&ctx->lock); | ||
1027 | if (!list_empty(&ctx->rq_list)) { | ||
1028 | list_splice_init(&ctx->rq_list, &tmp); | ||
1029 | clear_bit(ctx->index_hw, hctx->ctx_map); | ||
1030 | } | ||
1031 | spin_unlock(&ctx->lock); | ||
1032 | |||
1033 | if (list_empty(&tmp)) | ||
1034 | return; | ||
1035 | |||
1036 | ctx = blk_mq_get_ctx(hctx->queue); | ||
1037 | spin_lock(&ctx->lock); | ||
1038 | |||
1039 | while (!list_empty(&tmp)) { | ||
1040 | struct request *rq; | ||
1041 | |||
1042 | rq = list_first_entry(&tmp, struct request, queuelist); | ||
1043 | rq->mq_ctx = ctx; | ||
1044 | list_move_tail(&rq->queuelist, &ctx->rq_list); | ||
1045 | } | ||
1046 | |||
1047 | blk_mq_hctx_mark_pending(hctx, ctx); | ||
1048 | |||
1049 | spin_unlock(&ctx->lock); | ||
1050 | blk_mq_put_ctx(ctx); | ||
1051 | } | ||
1052 | |||
1053 | static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, | ||
1054 | void (*init)(void *, struct blk_mq_hw_ctx *, | ||
1055 | struct request *, unsigned int), | ||
1056 | void *data) | ||
1057 | { | ||
1058 | unsigned int i; | ||
1059 | |||
1060 | for (i = 0; i < hctx->queue_depth; i++) { | ||
1061 | struct request *rq = hctx->rqs[i]; | ||
1062 | |||
1063 | init(data, hctx, rq, i); | ||
1064 | } | ||
1065 | } | ||
1066 | |||
1067 | void blk_mq_init_commands(struct request_queue *q, | ||
1068 | void (*init)(void *, struct blk_mq_hw_ctx *, | ||
1069 | struct request *, unsigned int), | ||
1070 | void *data) | ||
1071 | { | ||
1072 | struct blk_mq_hw_ctx *hctx; | ||
1073 | unsigned int i; | ||
1074 | |||
1075 | queue_for_each_hw_ctx(q, hctx, i) | ||
1076 | blk_mq_init_hw_commands(hctx, init, data); | ||
1077 | } | ||
1078 | EXPORT_SYMBOL(blk_mq_init_commands); | ||
1079 | |||
1080 | static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) | ||
1081 | { | ||
1082 | struct page *page; | ||
1083 | |||
1084 | while (!list_empty(&hctx->page_list)) { | ||
1085 | page = list_first_entry(&hctx->page_list, struct page, list); | ||
1086 | list_del_init(&page->list); | ||
1087 | __free_pages(page, page->private); | ||
1088 | } | ||
1089 | |||
1090 | kfree(hctx->rqs); | ||
1091 | |||
1092 | if (hctx->tags) | ||
1093 | blk_mq_free_tags(hctx->tags); | ||
1094 | } | ||
1095 | |||
1096 | static size_t order_to_size(unsigned int order) | ||
1097 | { | ||
1098 | size_t ret = PAGE_SIZE; | ||
1099 | |||
1100 | while (order--) | ||
1101 | ret *= 2; | ||
1102 | |||
1103 | return ret; | ||
1104 | } | ||
1105 | |||
1106 | static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, | ||
1107 | unsigned int reserved_tags, int node) | ||
1108 | { | ||
1109 | unsigned int i, j, entries_per_page, max_order = 4; | ||
1110 | size_t rq_size, left; | ||
1111 | |||
1112 | INIT_LIST_HEAD(&hctx->page_list); | ||
1113 | |||
1114 | hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *), | ||
1115 | GFP_KERNEL, node); | ||
1116 | if (!hctx->rqs) | ||
1117 | return -ENOMEM; | ||
1118 | |||
1119 | /* | ||
1120 | * rq_size is the size of the request plus driver payload, rounded | ||
1121 | * to the cacheline size | ||
1122 | */ | ||
1123 | rq_size = round_up(sizeof(struct request) + hctx->cmd_size, | ||
1124 | cache_line_size()); | ||
1125 | left = rq_size * hctx->queue_depth; | ||
1126 | |||
1127 | for (i = 0; i < hctx->queue_depth;) { | ||
1128 | int this_order = max_order; | ||
1129 | struct page *page; | ||
1130 | int to_do; | ||
1131 | void *p; | ||
1132 | |||
1133 | while (left < order_to_size(this_order - 1) && this_order) | ||
1134 | this_order--; | ||
1135 | |||
1136 | do { | ||
1137 | page = alloc_pages_node(node, GFP_KERNEL, this_order); | ||
1138 | if (page) | ||
1139 | break; | ||
1140 | if (!this_order--) | ||
1141 | break; | ||
1142 | if (order_to_size(this_order) < rq_size) | ||
1143 | break; | ||
1144 | } while (1); | ||
1145 | |||
1146 | if (!page) | ||
1147 | break; | ||
1148 | |||
1149 | page->private = this_order; | ||
1150 | list_add_tail(&page->list, &hctx->page_list); | ||
1151 | |||
1152 | p = page_address(page); | ||
1153 | entries_per_page = order_to_size(this_order) / rq_size; | ||
1154 | to_do = min(entries_per_page, hctx->queue_depth - i); | ||
1155 | left -= to_do * rq_size; | ||
1156 | for (j = 0; j < to_do; j++) { | ||
1157 | hctx->rqs[i] = p; | ||
1158 | blk_mq_rq_init(hctx, hctx->rqs[i]); | ||
1159 | p += rq_size; | ||
1160 | i++; | ||
1161 | } | ||
1162 | } | ||
1163 | |||
1164 | if (i < (reserved_tags + BLK_MQ_TAG_MIN)) | ||
1165 | goto err_rq_map; | ||
1166 | else if (i != hctx->queue_depth) { | ||
1167 | hctx->queue_depth = i; | ||
1168 | pr_warn("%s: queue depth set to %u because of low memory\n", | ||
1169 | __func__, i); | ||
1170 | } | ||
1171 | |||
1172 | hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); | ||
1173 | if (!hctx->tags) { | ||
1174 | err_rq_map: | ||
1175 | blk_mq_free_rq_map(hctx); | ||
1176 | return -ENOMEM; | ||
1177 | } | ||
1178 | |||
1179 | return 0; | ||
1180 | } | ||
1181 | |||
1182 | static int blk_mq_init_hw_queues(struct request_queue *q, | ||
1183 | struct blk_mq_reg *reg, void *driver_data) | ||
1184 | { | ||
1185 | struct blk_mq_hw_ctx *hctx; | ||
1186 | unsigned int i, j; | ||
1187 | |||
1188 | /* | ||
1189 | * Initialize hardware queues | ||
1190 | */ | ||
1191 | queue_for_each_hw_ctx(q, hctx, i) { | ||
1192 | unsigned int num_maps; | ||
1193 | int node; | ||
1194 | |||
1195 | node = hctx->numa_node; | ||
1196 | if (node == NUMA_NO_NODE) | ||
1197 | node = hctx->numa_node = reg->numa_node; | ||
1198 | |||
1199 | INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); | ||
1200 | spin_lock_init(&hctx->lock); | ||
1201 | INIT_LIST_HEAD(&hctx->dispatch); | ||
1202 | hctx->queue = q; | ||
1203 | hctx->queue_num = i; | ||
1204 | hctx->flags = reg->flags; | ||
1205 | hctx->queue_depth = reg->queue_depth; | ||
1206 | hctx->cmd_size = reg->cmd_size; | ||
1207 | |||
1208 | blk_mq_init_cpu_notifier(&hctx->cpu_notifier, | ||
1209 | blk_mq_hctx_notify, hctx); | ||
1210 | blk_mq_register_cpu_notifier(&hctx->cpu_notifier); | ||
1211 | |||
1212 | if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) | ||
1213 | break; | ||
1214 | |||
1215 | /* | ||
1216 | * Allocate space for all possible CPUs to avoid allocation at | ||
1217 | * runtime | ||
1218 | */ | ||
1219 | hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), | ||
1220 | GFP_KERNEL, node); | ||
1221 | if (!hctx->ctxs) | ||
1222 | break; | ||
1223 | |||
1224 | num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; | ||
1225 | hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), | ||
1226 | GFP_KERNEL, node); | ||
1227 | if (!hctx->ctx_map) | ||
1228 | break; | ||
1229 | |||
1230 | hctx->nr_ctx_map = num_maps; | ||
1231 | hctx->nr_ctx = 0; | ||
1232 | |||
1233 | if (reg->ops->init_hctx && | ||
1234 | reg->ops->init_hctx(hctx, driver_data, i)) | ||
1235 | break; | ||
1236 | } | ||
1237 | |||
1238 | if (i == q->nr_hw_queues) | ||
1239 | return 0; | ||
1240 | |||
1241 | /* | ||
1242 | * Init failed | ||
1243 | */ | ||
1244 | queue_for_each_hw_ctx(q, hctx, j) { | ||
1245 | if (i == j) | ||
1246 | break; | ||
1247 | |||
1248 | if (reg->ops->exit_hctx) | ||
1249 | reg->ops->exit_hctx(hctx, j); | ||
1250 | |||
1251 | blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); | ||
1252 | blk_mq_free_rq_map(hctx); | ||
1253 | kfree(hctx->ctxs); | ||
1254 | } | ||
1255 | |||
1256 | return 1; | ||
1257 | } | ||
1258 | |||
1259 | static void blk_mq_init_cpu_queues(struct request_queue *q, | ||
1260 | unsigned int nr_hw_queues) | ||
1261 | { | ||
1262 | unsigned int i; | ||
1263 | |||
1264 | for_each_possible_cpu(i) { | ||
1265 | struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); | ||
1266 | struct blk_mq_hw_ctx *hctx; | ||
1267 | |||
1268 | memset(__ctx, 0, sizeof(*__ctx)); | ||
1269 | __ctx->cpu = i; | ||
1270 | spin_lock_init(&__ctx->lock); | ||
1271 | INIT_LIST_HEAD(&__ctx->rq_list); | ||
1272 | __ctx->queue = q; | ||
1273 | |||
1274 | /* If the cpu isn't online, the cpu is mapped to first hctx */ | ||
1275 | hctx = q->mq_ops->map_queue(q, i); | ||
1276 | hctx->nr_ctx++; | ||
1277 | |||
1278 | if (!cpu_online(i)) | ||
1279 | continue; | ||
1280 | |||
1281 | /* | ||
1282 | * Set local node, IFF we have more than one hw queue. If | ||
1283 | * not, we remain on the home node of the device | ||
1284 | */ | ||
1285 | if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) | ||
1286 | hctx->numa_node = cpu_to_node(i); | ||
1287 | } | ||
1288 | } | ||
1289 | |||
1290 | static void blk_mq_map_swqueue(struct request_queue *q) | ||
1291 | { | ||
1292 | unsigned int i; | ||
1293 | struct blk_mq_hw_ctx *hctx; | ||
1294 | struct blk_mq_ctx *ctx; | ||
1295 | |||
1296 | queue_for_each_hw_ctx(q, hctx, i) { | ||
1297 | hctx->nr_ctx = 0; | ||
1298 | } | ||
1299 | |||
1300 | /* | ||
1301 | * Map software to hardware queues | ||
1302 | */ | ||
1303 | queue_for_each_ctx(q, ctx, i) { | ||
1304 | /* If the cpu isn't online, the cpu is mapped to first hctx */ | ||
1305 | hctx = q->mq_ops->map_queue(q, i); | ||
1306 | ctx->index_hw = hctx->nr_ctx; | ||
1307 | hctx->ctxs[hctx->nr_ctx++] = ctx; | ||
1308 | } | ||
1309 | } | ||
1310 | |||
1311 | struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, | ||
1312 | void *driver_data) | ||
1313 | { | ||
1314 | struct blk_mq_hw_ctx **hctxs; | ||
1315 | struct blk_mq_ctx *ctx; | ||
1316 | struct request_queue *q; | ||
1317 | int i; | ||
1318 | |||
1319 | if (!reg->nr_hw_queues || | ||
1320 | !reg->ops->queue_rq || !reg->ops->map_queue || | ||
1321 | !reg->ops->alloc_hctx || !reg->ops->free_hctx) | ||
1322 | return ERR_PTR(-EINVAL); | ||
1323 | |||
1324 | if (!reg->queue_depth) | ||
1325 | reg->queue_depth = BLK_MQ_MAX_DEPTH; | ||
1326 | else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { | ||
1327 | pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); | ||
1328 | reg->queue_depth = BLK_MQ_MAX_DEPTH; | ||
1329 | } | ||
1330 | |||
1331 | /* | ||
1332 | * Set aside a tag for flush requests. It will only be used while | ||
1333 | * another flush request is in progress but outside the driver. | ||
1334 | * | ||
1335 | * TODO: only allocate if flushes are supported | ||
1336 | */ | ||
1337 | reg->queue_depth++; | ||
1338 | reg->reserved_tags++; | ||
1339 | |||
1340 | if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) | ||
1341 | return ERR_PTR(-EINVAL); | ||
1342 | |||
1343 | ctx = alloc_percpu(struct blk_mq_ctx); | ||
1344 | if (!ctx) | ||
1345 | return ERR_PTR(-ENOMEM); | ||
1346 | |||
1347 | hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, | ||
1348 | reg->numa_node); | ||
1349 | |||
1350 | if (!hctxs) | ||
1351 | goto err_percpu; | ||
1352 | |||
1353 | for (i = 0; i < reg->nr_hw_queues; i++) { | ||
1354 | hctxs[i] = reg->ops->alloc_hctx(reg, i); | ||
1355 | if (!hctxs[i]) | ||
1356 | goto err_hctxs; | ||
1357 | |||
1358 | hctxs[i]->numa_node = NUMA_NO_NODE; | ||
1359 | hctxs[i]->queue_num = i; | ||
1360 | } | ||
1361 | |||
1362 | q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); | ||
1363 | if (!q) | ||
1364 | goto err_hctxs; | ||
1365 | |||
1366 | q->mq_map = blk_mq_make_queue_map(reg); | ||
1367 | if (!q->mq_map) | ||
1368 | goto err_map; | ||
1369 | |||
1370 | setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); | ||
1371 | blk_queue_rq_timeout(q, 30000); | ||
1372 | |||
1373 | q->nr_queues = nr_cpu_ids; | ||
1374 | q->nr_hw_queues = reg->nr_hw_queues; | ||
1375 | |||
1376 | q->queue_ctx = ctx; | ||
1377 | q->queue_hw_ctx = hctxs; | ||
1378 | |||
1379 | q->mq_ops = reg->ops; | ||
1380 | |||
1381 | blk_queue_make_request(q, blk_mq_make_request); | ||
1382 | blk_queue_rq_timed_out(q, reg->ops->timeout); | ||
1383 | if (reg->timeout) | ||
1384 | blk_queue_rq_timeout(q, reg->timeout); | ||
1385 | |||
1386 | blk_mq_init_flush(q); | ||
1387 | blk_mq_init_cpu_queues(q, reg->nr_hw_queues); | ||
1388 | |||
1389 | if (blk_mq_init_hw_queues(q, reg, driver_data)) | ||
1390 | goto err_hw; | ||
1391 | |||
1392 | blk_mq_map_swqueue(q); | ||
1393 | |||
1394 | mutex_lock(&all_q_mutex); | ||
1395 | list_add_tail(&q->all_q_node, &all_q_list); | ||
1396 | mutex_unlock(&all_q_mutex); | ||
1397 | |||
1398 | return q; | ||
1399 | err_hw: | ||
1400 | kfree(q->mq_map); | ||
1401 | err_map: | ||
1402 | blk_cleanup_queue(q); | ||
1403 | err_hctxs: | ||
1404 | for (i = 0; i < reg->nr_hw_queues; i++) { | ||
1405 | if (!hctxs[i]) | ||
1406 | break; | ||
1407 | reg->ops->free_hctx(hctxs[i], i); | ||
1408 | } | ||
1409 | kfree(hctxs); | ||
1410 | err_percpu: | ||
1411 | free_percpu(ctx); | ||
1412 | return ERR_PTR(-ENOMEM); | ||
1413 | } | ||
1414 | EXPORT_SYMBOL(blk_mq_init_queue); | ||
1415 | |||
1416 | void blk_mq_free_queue(struct request_queue *q) | ||
1417 | { | ||
1418 | struct blk_mq_hw_ctx *hctx; | ||
1419 | int i; | ||
1420 | |||
1421 | queue_for_each_hw_ctx(q, hctx, i) { | ||
1422 | cancel_delayed_work_sync(&hctx->delayed_work); | ||
1423 | kfree(hctx->ctx_map); | ||
1424 | kfree(hctx->ctxs); | ||
1425 | blk_mq_free_rq_map(hctx); | ||
1426 | blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); | ||
1427 | if (q->mq_ops->exit_hctx) | ||
1428 | q->mq_ops->exit_hctx(hctx, i); | ||
1429 | q->mq_ops->free_hctx(hctx, i); | ||
1430 | } | ||
1431 | |||
1432 | free_percpu(q->queue_ctx); | ||
1433 | kfree(q->queue_hw_ctx); | ||
1434 | kfree(q->mq_map); | ||
1435 | |||
1436 | q->queue_ctx = NULL; | ||
1437 | q->queue_hw_ctx = NULL; | ||
1438 | q->mq_map = NULL; | ||
1439 | |||
1440 | mutex_lock(&all_q_mutex); | ||
1441 | list_del_init(&q->all_q_node); | ||
1442 | mutex_unlock(&all_q_mutex); | ||
1443 | } | ||
1444 | EXPORT_SYMBOL(blk_mq_free_queue); | ||
1445 | |||
1446 | /* Basically redo blk_mq_init_queue with queue frozen */ | ||
1447 | static void __cpuinit blk_mq_queue_reinit(struct request_queue *q) | ||
1448 | { | ||
1449 | blk_mq_freeze_queue(q); | ||
1450 | |||
1451 | blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); | ||
1452 | |||
1453 | /* | ||
1454 | * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe | ||
1455 | * we should change hctx numa_node according to new topology (this | ||
1456 | * involves freeing and re-allocating memory; is it worth doing?) | ||
1457 | */ | ||
1458 | |||
1459 | blk_mq_map_swqueue(q); | ||
1460 | |||
1461 | blk_mq_unfreeze_queue(q); | ||
1462 | } | ||
1463 | |||
1464 | static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb, | ||
1465 | unsigned long action, void *hcpu) | ||
1466 | { | ||
1467 | struct request_queue *q; | ||
1468 | |||
1469 | /* | ||
1470 | * Before the new mapping is established, a hotadded CPU might already | ||
1471 | * start handling requests. This doesn't break anything, as we map | ||
1472 | * offline CPUs to the first hardware queue. We will re-init the queues | ||
1473 | * below to get optimal settings. | ||
1474 | */ | ||
1475 | if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && | ||
1476 | action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) | ||
1477 | return NOTIFY_OK; | ||
1478 | |||
1479 | mutex_lock(&all_q_mutex); | ||
1480 | list_for_each_entry(q, &all_q_list, all_q_node) | ||
1481 | blk_mq_queue_reinit(q); | ||
1482 | mutex_unlock(&all_q_mutex); | ||
1483 | return NOTIFY_OK; | ||
1484 | } | ||
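The notifier filters on CPU_ONLINE/CPU_DEAD and their _FROZEN suspend-resume variants before rebuilding every registered queue. For comparison, the common kernel idiom expresses the same filter by masking off CPU_TASKS_FROZEN; a generic illustration of that shape (not taken from this commit) is:

/* Equivalent action filtering via the CPU_TASKS_FROZEN mask (illustration only). */
static int example_cpu_notify(struct notifier_block *nb, unsigned long action,
			      void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
	case CPU_DEAD:
		/* topology changed: remap software queues, as blk_mq_queue_reinit() does */
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}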
1485 | |||
1486 | static int __init blk_mq_init(void) | ||
1487 | { | ||
1488 | unsigned int i; | ||
1489 | |||
1490 | for_each_possible_cpu(i) | ||
1491 | init_llist_head(&per_cpu(ipi_lists, i)); | ||
1492 | |||
1493 | blk_mq_cpu_init(); | ||
1494 | |||
1495 | /* Must be called after percpu_counter_hotcpu_callback() */ | ||
1496 | hotcpu_notifier(blk_mq_queue_reinit_notify, -10); | ||
1497 | |||
1498 | return 0; | ||
1499 | } | ||
1500 | subsys_initcall(blk_mq_init); | ||
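blk_mq_init() prepares one lock-free llist per possible CPU (ipi_lists, declared in blk-mq.h below) before registering the hotplug notifier; the completion code that drains these lists lives earlier in this file and is presumably what the name refers to (cross-CPU completion via IPI). The generic per-CPU llist pattern, shown only as an illustration with hypothetical helpers, looks like:

/* Illustration of the per-CPU llist pattern, not the exact blk-mq code:
 * any CPU may push entries lock-free, the owning CPU drains them. */
struct my_item {
	struct llist_node node;
};

static void my_push(struct my_item *item, int target_cpu)
{
	llist_add(&item->node, &per_cpu(ipi_lists, target_cpu));
	/* ...then kick target_cpu (IPI or softirq) so it drains the list */
}

static void my_drain_local(void)
{
	struct llist_node *entries = llist_del_all(this_cpu_ptr(&ipi_lists));

	/* walk 'entries' and complete each item */
}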
diff --git a/block/blk-mq.h b/block/blk-mq.h new file mode 100644 index 000000000000..52bf1f96a2c2 --- /dev/null +++ b/block/blk-mq.h | |||
@@ -0,0 +1,52 @@ | |||
1 | #ifndef INT_BLK_MQ_H | ||
2 | #define INT_BLK_MQ_H | ||
3 | |||
4 | struct blk_mq_ctx { | ||
5 | struct { | ||
6 | spinlock_t lock; | ||
7 | struct list_head rq_list; | ||
8 | } ____cacheline_aligned_in_smp; | ||
9 | |||
10 | unsigned int cpu; | ||
11 | unsigned int index_hw; | ||
12 | unsigned int ipi_redirect; | ||
13 | |||
14 | /* incremented at dispatch time */ | ||
15 | unsigned long rq_dispatched[2]; | ||
16 | unsigned long rq_merged; | ||
17 | |||
18 | /* incremented at completion time */ | ||
19 | unsigned long ____cacheline_aligned_in_smp rq_completed[2]; | ||
20 | |||
21 | struct request_queue *queue; | ||
22 | struct kobject kobj; | ||
23 | }; | ||
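The anonymous struct holding the submission lock and rq_list gets its own cacheline, and the completion counters are pushed onto a separate cacheline as well, so that submit-side and completion-side writes do not false-share. The annotation idiom in isolation, as a generic illustration rather than anything from this commit:

/* Generic false-sharing avoidance: 'submitted' and 'completed' are written
 * on different paths, so keep them on separate cachelines. */
struct example_split_counters {
	struct {
		spinlock_t lock;
		unsigned long submitted;
	} ____cacheline_aligned_in_smp;

	unsigned long ____cacheline_aligned_in_smp completed;
};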
24 | |||
25 | void __blk_mq_end_io(struct request *rq, int error); | ||
26 | void blk_mq_complete_request(struct request *rq, int error); | ||
27 | void blk_mq_run_request(struct request *rq, bool run_queue, bool async); | ||
28 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); | ||
29 | void blk_mq_init_flush(struct request_queue *q); | ||
30 | |||
31 | /* | ||
32 | * CPU hotplug helpers | ||
33 | */ | ||
34 | struct blk_mq_cpu_notifier; | ||
35 | void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, | ||
36 | void (*fn)(void *, unsigned long, unsigned int), | ||
37 | void *data); | ||
38 | void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); | ||
39 | void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); | ||
40 | void blk_mq_cpu_init(void); | ||
41 | DECLARE_PER_CPU(struct llist_head, ipi_lists); | ||
42 | |||
43 | /* | ||
44 | * CPU -> queue mappings | ||
45 | */ | ||
46 | struct blk_mq_reg; | ||
47 | extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg); | ||
48 | extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); | ||
49 | |||
50 | void blk_mq_add_timer(struct request *rq); | ||
51 | |||
52 | #endif | ||
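blk_mq_make_queue_map() and blk_mq_update_queue_map() build and refresh the per-CPU array stored in q->mq_map by blk_mq_init_queue(); on submission that array selects the hardware context for the submitting CPU. The lookup itself is outside this excerpt, but given the fields set up in blk-mq.c its assumed shape is roughly:

/* Assumed lookup shape (illustration): the submitting CPU indexes mq_map,
 * which names the hardware context that services it. */
static struct blk_mq_hw_ctx *example_map_queue(struct request_queue *q,
					       unsigned int cpu)
{
	return q->queue_hw_ctx[q->mq_map[cpu]];
}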
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3aa5b195f4dd..4f8c4d90ec73 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/bio.h> | 7 | #include <linux/bio.h> |
8 | #include <linux/blkdev.h> | 8 | #include <linux/blkdev.h> |
9 | #include <linux/blktrace_api.h> | 9 | #include <linux/blktrace_api.h> |
10 | #include <linux/blk-mq.h> | ||
10 | 11 | ||
11 | #include "blk.h" | 12 | #include "blk.h" |
12 | #include "blk-cgroup.h" | 13 | #include "blk-cgroup.h" |
@@ -542,6 +543,11 @@ static void blk_release_queue(struct kobject *kobj) | |||
542 | if (q->queue_tags) | 543 | if (q->queue_tags) |
543 | __blk_queue_free_tags(q); | 544 | __blk_queue_free_tags(q); |
544 | 545 | ||
546 | percpu_counter_destroy(&q->mq_usage_counter); | ||
547 | |||
548 | if (q->mq_ops) | ||
549 | blk_mq_free_queue(q); | ||
550 | |||
545 | blk_trace_shutdown(q); | 551 | blk_trace_shutdown(q); |
546 | 552 | ||
547 | bdi_destroy(&q->backing_dev_info); | 553 | bdi_destroy(&q->backing_dev_info); |
@@ -575,6 +581,7 @@ int blk_register_queue(struct gendisk *disk) | |||
575 | * bypass from queue allocation. | 581 | * bypass from queue allocation. |
576 | */ | 582 | */ |
577 | blk_queue_bypass_end(q); | 583 | blk_queue_bypass_end(q); |
584 | queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); | ||
578 | 585 | ||
579 | ret = blk_trace_init_sysfs(dev); | 586 | ret = blk_trace_init_sysfs(dev); |
580 | if (ret) | 587 | if (ret) |
@@ -588,6 +595,9 @@ int blk_register_queue(struct gendisk *disk) | |||
588 | 595 | ||
589 | kobject_uevent(&q->kobj, KOBJ_ADD); | 596 | kobject_uevent(&q->kobj, KOBJ_ADD); |
590 | 597 | ||
598 | if (q->mq_ops) | ||
599 | blk_mq_register_disk(disk); | ||
600 | |||
591 | if (!q->request_fn) | 601 | if (!q->request_fn) |
592 | return 0; | 602 | return 0; |
593 | 603 | ||
@@ -610,6 +620,9 @@ void blk_unregister_queue(struct gendisk *disk) | |||
610 | if (WARN_ON(!q)) | 620 | if (WARN_ON(!q)) |
611 | return; | 621 | return; |
612 | 622 | ||
623 | if (q->mq_ops) | ||
624 | blk_mq_unregister_disk(disk); | ||
625 | |||
613 | if (q->request_fn) | 626 | if (q->request_fn) |
614 | elv_unregister_queue(q); | 627 | elv_unregister_queue(q); |
615 | 628 | ||
diff --git a/block/blk-timeout.c b/block/blk-timeout.c index abf725c655fc..bba81c9348e1 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/fault-inject.h> | 7 | #include <linux/fault-inject.h> |
8 | 8 | ||
9 | #include "blk.h" | 9 | #include "blk.h" |
10 | #include "blk-mq.h" | ||
10 | 11 | ||
11 | #ifdef CONFIG_FAIL_IO_TIMEOUT | 12 | #ifdef CONFIG_FAIL_IO_TIMEOUT |
12 | 13 | ||
@@ -88,10 +89,18 @@ static void blk_rq_timed_out(struct request *req) | |||
88 | ret = q->rq_timed_out_fn(req); | 89 | ret = q->rq_timed_out_fn(req); |
89 | switch (ret) { | 90 | switch (ret) { |
90 | case BLK_EH_HANDLED: | 91 | case BLK_EH_HANDLED: |
91 | __blk_complete_request(req); | 92 | /* Can we use req->errors here? */ |
93 | if (q->mq_ops) | ||
94 | blk_mq_complete_request(req, req->errors); | ||
95 | else | ||
96 | __blk_complete_request(req); | ||
92 | break; | 97 | break; |
93 | case BLK_EH_RESET_TIMER: | 98 | case BLK_EH_RESET_TIMER: |
94 | blk_add_timer(req); | 99 | if (q->mq_ops) |
100 | blk_mq_add_timer(req); | ||
101 | else | ||
102 | blk_add_timer(req); | ||
103 | |||
95 | blk_clear_rq_complete(req); | 104 | blk_clear_rq_complete(req); |
96 | break; | 105 | break; |
97 | case BLK_EH_NOT_HANDLED: | 106 | case BLK_EH_NOT_HANDLED: |
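The value switched on here is returned by the driver's rq_timed_out_fn; with blk-mq the BLK_EH_HANDLED and BLK_EH_RESET_TIMER branches now end in blk_mq_complete_request() and blk_mq_add_timer() instead of the legacy helpers. A hedged sketch of a driver-side handler producing the three outcomes (the my_* helpers are hypothetical):

/* Hypothetical rq_timed_out_fn: each BLK_EH_* return maps onto one branch
 * of the switch above. */
static enum blk_eh_timer_return my_rq_timed_out(struct request *req)
{
	if (my_device_reset_in_progress(req->q->queuedata))
		return BLK_EH_RESET_TIMER;	/* re-arm via blk_add_timer()/blk_mq_add_timer() */

	if (my_abort_command(req)) {
		req->errors = -ETIMEDOUT;	/* hedged: error reporting is driver-specific */
		return BLK_EH_HANDLED;		/* complete now, mq or legacy path as above */
	}

	return BLK_EH_NOT_HANDLED;		/* driver will complete the request itself later */
}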
@@ -108,6 +117,23 @@ static void blk_rq_timed_out(struct request *req) | |||
108 | } | 117 | } |
109 | } | 118 | } |
110 | 119 | ||
120 | void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, | ||
121 | unsigned int *next_set) | ||
122 | { | ||
123 | if (time_after_eq(jiffies, rq->deadline)) { | ||
124 | list_del_init(&rq->timeout_list); | ||
125 | |||
126 | /* | ||
127 | * Check if we raced with end io completion | ||
128 | */ | ||
129 | if (!blk_mark_rq_complete(rq)) | ||
130 | blk_rq_timed_out(rq); | ||
131 | } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { | ||
132 | *next_timeout = rq->deadline; | ||
133 | *next_set = 1; | ||
134 | } | ||
135 | } | ||
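Factoring the per-request check out of the timer loop lets any caller reuse the same earliest-deadline accumulation; the rewritten blk_rq_timed_out_timer() in the next hunk is the call site in this file, and the blk-mq timeout scan (outside this excerpt) is presumably another. Condensed, the calling pattern is:

/* Usage pattern: expire overdue requests and remember the soonest remaining
 * deadline so the queue timer can be re-armed for it. */
unsigned long next = 0;
unsigned int next_set = 0;
struct request *rq, *tmp;

list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
	blk_rq_check_expired(rq, &next, &next_set);

if (next_set)
	mod_timer(&q->timeout, round_jiffies_up(next));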
136 | |||
111 | void blk_rq_timed_out_timer(unsigned long data) | 137 | void blk_rq_timed_out_timer(unsigned long data) |
112 | { | 138 | { |
113 | struct request_queue *q = (struct request_queue *) data; | 139 | struct request_queue *q = (struct request_queue *) data; |
@@ -117,21 +143,8 @@ void blk_rq_timed_out_timer(unsigned long data) | |||
117 | 143 | ||
118 | spin_lock_irqsave(q->queue_lock, flags); | 144 | spin_lock_irqsave(q->queue_lock, flags); |
119 | 145 | ||
120 | list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) { | 146 | list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) |
121 | if (time_after_eq(jiffies, rq->deadline)) { | 147 | blk_rq_check_expired(rq, &next, &next_set); |
122 | list_del_init(&rq->timeout_list); | ||
123 | |||
124 | /* | ||
125 | * Check if we raced with end io completion | ||
126 | */ | ||
127 | if (blk_mark_rq_complete(rq)) | ||
128 | continue; | ||
129 | blk_rq_timed_out(rq); | ||
130 | } else if (!next_set || time_after(next, rq->deadline)) { | ||
131 | next = rq->deadline; | ||
132 | next_set = 1; | ||
133 | } | ||
134 | } | ||
135 | 148 | ||
136 | if (next_set) | 149 | if (next_set) |
137 | mod_timer(&q->timeout, round_jiffies_up(next)); | 150 | mod_timer(&q->timeout, round_jiffies_up(next)); |
@@ -157,15 +170,7 @@ void blk_abort_request(struct request *req) | |||
157 | } | 170 | } |
158 | EXPORT_SYMBOL_GPL(blk_abort_request); | 171 | EXPORT_SYMBOL_GPL(blk_abort_request); |
159 | 172 | ||
160 | /** | 173 | void __blk_add_timer(struct request *req, struct list_head *timeout_list) |
161 | * blk_add_timer - Start timeout timer for a single request | ||
162 | * @req: request that is about to start running. | ||
163 | * | ||
164 | * Notes: | ||
165 | * Each request has its own timer, and as it is added to the queue, we | ||
166 | * set up the timer. When the request completes, we cancel the timer. | ||
167 | */ | ||
168 | void blk_add_timer(struct request *req) | ||
169 | { | 174 | { |
170 | struct request_queue *q = req->q; | 175 | struct request_queue *q = req->q; |
171 | unsigned long expiry; | 176 | unsigned long expiry; |
@@ -183,7 +188,8 @@ void blk_add_timer(struct request *req) | |||
183 | req->timeout = q->rq_timeout; | 188 | req->timeout = q->rq_timeout; |
184 | 189 | ||
185 | req->deadline = jiffies + req->timeout; | 190 | req->deadline = jiffies + req->timeout; |
186 | list_add_tail(&req->timeout_list, &q->timeout_list); | 191 | if (timeout_list) |
192 | list_add_tail(&req->timeout_list, timeout_list); | ||
187 | 193 | ||
188 | /* | 194 | /* |
189 | * If the timer isn't already pending or this timeout is earlier | 195 | * If the timer isn't already pending or this timeout is earlier |
@@ -195,5 +201,19 @@ void blk_add_timer(struct request *req) | |||
195 | if (!timer_pending(&q->timeout) || | 201 | if (!timer_pending(&q->timeout) || |
196 | time_before(expiry, q->timeout.expires)) | 202 | time_before(expiry, q->timeout.expires)) |
197 | mod_timer(&q->timeout, expiry); | 203 | mod_timer(&q->timeout, expiry); |
204 | |||
205 | } | ||
206 | |||
207 | /** | ||
208 | * blk_add_timer - Start timeout timer for a single request | ||
209 | * @req: request that is about to start running. | ||
210 | * | ||
211 | * Notes: | ||
212 | * Each request has its own timer, and as it is added to the queue, we | ||
213 | * set up the timer. When the request completes, we cancel the timer. | ||
214 | */ | ||
215 | void blk_add_timer(struct request *req) | ||
216 | { | ||
217 | __blk_add_timer(req, &req->q->timeout_list); | ||
198 | } | 218 | } |
199 | 219 | ||
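Splitting out __blk_add_timer() with an optional list argument leaves blk_add_timer() behaviourally unchanged for the legacy path, while a caller that tracks in-flight requests by other means, as blk_mq_add_timer() declared in blk-mq.h above is assumed to, can arm the queue timer without linking the request onto q->timeout_list. A hedged sketch of such a caller; the real blk-mq implementation is outside this excerpt:

/* Hedged sketch: arm the queue timeout without adding the request to the
 * legacy timeout_list (legal because of the NULL check added above). */
static void example_mq_add_timer(struct request *rq)
{
	__blk_add_timer(rq, NULL);
}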
diff --git a/block/blk.h b/block/blk.h index e837b8f619b7..c90e1d8f7a2b 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -10,6 +10,7 @@ | |||
10 | #define BLK_BATCH_REQ 32 | 10 | #define BLK_BATCH_REQ 32 |
11 | 11 | ||
12 | extern struct kmem_cache *blk_requestq_cachep; | 12 | extern struct kmem_cache *blk_requestq_cachep; |
13 | extern struct kmem_cache *request_cachep; | ||
13 | extern struct kobj_type blk_queue_ktype; | 14 | extern struct kobj_type blk_queue_ktype; |
14 | extern struct ida blk_queue_ida; | 15 | extern struct ida blk_queue_ida; |
15 | 16 | ||
@@ -34,14 +35,30 @@ bool __blk_end_bidi_request(struct request *rq, int error, | |||
34 | unsigned int nr_bytes, unsigned int bidi_bytes); | 35 | unsigned int nr_bytes, unsigned int bidi_bytes); |
35 | 36 | ||
36 | void blk_rq_timed_out_timer(unsigned long data); | 37 | void blk_rq_timed_out_timer(unsigned long data); |
38 | void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, | ||
39 | unsigned int *next_set); | ||
40 | void __blk_add_timer(struct request *req, struct list_head *timeout_list); | ||
37 | void blk_delete_timer(struct request *); | 41 | void blk_delete_timer(struct request *); |
38 | void blk_add_timer(struct request *); | 42 | void blk_add_timer(struct request *); |
39 | 43 | ||
44 | |||
45 | bool bio_attempt_front_merge(struct request_queue *q, struct request *req, | ||
46 | struct bio *bio); | ||
47 | bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | ||
48 | struct bio *bio); | ||
49 | bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, | ||
50 | unsigned int *request_count); | ||
51 | |||
52 | void blk_account_io_start(struct request *req, bool new_io); | ||
53 | void blk_account_io_completion(struct request *req, unsigned int bytes); | ||
54 | void blk_account_io_done(struct request *req); | ||
55 | |||
40 | /* | 56 | /* |
41 | * Internal atomic flags for request handling | 57 | * Internal atomic flags for request handling |
42 | */ | 58 | */ |
43 | enum rq_atomic_flags { | 59 | enum rq_atomic_flags { |
44 | REQ_ATOM_COMPLETE = 0, | 60 | REQ_ATOM_COMPLETE = 0, |
61 | REQ_ATOM_STARTED, | ||
45 | }; | 62 | }; |
46 | 63 | ||
47 | /* | 64 | /* |