path: root/block
author     Linus Torvalds <torvalds@linux-foundation.org>   2013-11-13 22:08:14 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-11-13 22:08:14 -0500
commit     0910c0bdf7c291a41bc21e40a97389c9d4c1960d (patch)
tree       177c4cb22ece78b18f64f548ae82b9a15edbb99c /block
parent     2821fe6b00a1e902fd399bb4b7e40bc3041f4d44 (diff)
parent     e37459b8e2c7db6735e39e019e448b76e5e77647 (diff)
Merge branch 'for-3.13/core' of git://git.kernel.dk/linux-block

Pull block IO core updates from Jens Axboe:
 "This is the pull request for the core changes in the block layer for
  3.13. It contains:

   - The new blk-mq request interface.

     This is a new and more scalable queueing model that marries the
     best parts of the request-based interface we currently have (which
     is fully featured, but scales poorly) and the bio-based "interface"
     which the new drivers for high-IOPS devices end up using because
     it's much faster than the request-based one.

     The bio interface has no block layer support, since it taps into
     the stack much earlier. This means that drivers end up having to
     implement a lot of functionality on their own, like tagging,
     timeout handling, requeue, etc. The blk-mq interface provides all
     of these. Some drivers even provide a switch to select bio or rq
     and have code to handle both, since things like merging only work
     in the rq model and hence are faster for some workloads. This is
     a huge mess. Conversion of these drivers nets us a substantial
     code reduction. Initial results on converting SCSI to this model
     even show an 8x improvement on single-queue devices. So while the
     model was intended to work on the newer multiqueue devices, it
     brings substantial improvements for "classic" hardware as well.

     This code has gone through extensive testing and development; it's
     now ready to go. A pull request to convert virtio-blk to this
     model will be coming as well, with more drivers scheduled for
     conversion in 3.14.

   - Two blktrace fixes from Jan and Chen Gang.

   - A plug merge fix from Alireza Haghdoost.

   - Conversion of __get_cpu_var() from Christoph Lameter.

   - Fix for sector_div() with a 64-bit divisor from Geert Uytterhoeven.

   - A fix for a race between request completion and the timeout
     handling from Jeff Moyer. This is what caused the merge conflict
     with blk-mq/core, in case you are looking at that.

   - A dm stacking fix from Mike Snitzer.

   - A code consolidation fix and duplicated code removal from Kent
     Overstreet.

   - A handful of block bug fixes from Mikulas Patocka, fixing a loop
     crash and memory corruption on blk cg.

   - Elevator switch bug fix from Tomoki Sekiyama.

  A heads-up that I had to rebase this branch. Initially the immutable
  bio_vecs had been queued up for inclusion, but a week later it became
  clear that it wasn't fully cooked yet. So the decision was made to
  pull this out and postpone it until 3.14. It was a straightforward
  rebase, just pruning out the immutable series and the later fixes of
  problems with it.
  The rest of the patches applied directly and no further changes were
  made"

* 'for-3.13/core' of git://git.kernel.dk/linux-block: (31 commits)
  block: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
  block: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
  block: Do not call sector_div() with a 64-bit divisor
  kernel: trace: blktrace: remove redundent memcpy() in compat_blk_trace_setup()
  block: Consolidate duplicated bio_trim() implementations
  block: Use rw_copy_check_uvector()
  block: Enable sysfs nomerge control for I/O requests in the plug list
  block: properly stack underlying max_segment_size to DM device
  elevator: acquire q->sysfs_lock in elevator_change()
  elevator: Fix a race in elevator switching and md device initialization
  block: Replace __get_cpu_var uses
  bdi: test bdi_init failure
  block: fix a probe argument to blk_register_region
  loop: fix crash if blk_alloc_queue fails
  blk-core: Fix memory corruption if blkcg_init_queue fails
  block: fix race between request completion and timeout handling
  blktrace: Send BLK_TN_PROCESS events to all running traces
  blk-mq: don't disallow request merges for req->special being set
  blk-mq: mq plug list breakage
  blk-mq: fix for flush deadlock
  ...
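For orientation before the diff: the sketch below shows roughly what the driver-facing side of this first blk-mq revision looks like, pieced together from the interface the patches add (struct blk_mq_reg, blk_mq_ops, the queue_rq hook, blk_mq_end_io). blk_mq_init_queue(), blk_mq_map_queue() and the BLK_MQ_RQ_QUEUE_OK return code are assumed from the rest of the series rather than visible in this excerpt, and a real driver would fill in more of blk_mq_ops plus actual device handling, so treat it as an illustration of the model rather than a reference implementation.

	/*
	 * Illustrative sketch only, in the style of the 3.13-era blk-mq
	 * interface.  blk_mq_init_queue(), blk_mq_map_queue() and
	 * BLK_MQ_RQ_QUEUE_OK are assumed from the rest of the series;
	 * the core supplies tagging, timeouts and requeue handling.
	 */
	#include <linux/blkdev.h>
	#include <linux/blk-mq.h>

	static int example_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
	{
		/* Hand the request to the hardware queue backing @hctx here. */
		blk_mq_end_io(rq, 0);		/* complete immediately for the sketch */
		return BLK_MQ_RQ_QUEUE_OK;
	}

	static struct blk_mq_ops example_mq_ops = {
		.queue_rq	= example_queue_rq,
		.map_queue	= blk_mq_map_queue,	/* default CPU -> hw queue map */
	};

	static struct blk_mq_reg example_mq_reg = {
		.ops		= &example_mq_ops,
		.nr_hw_queues	= 1,
		.queue_depth	= 64,		/* tags managed by blk-mq-tag.c */
		.numa_node	= NUMA_NO_NODE,
	};

	static struct request_queue *example_init(void *driver_data)
	{
		/* Registration point for a converted driver (hypothetical name). */
		return blk_mq_init_queue(&example_mq_reg, driver_data);
	}

Requests allocated against such a queue go through blk_mq_alloc_request()/blk_mq_free_request(), which is exactly the split blk_get_request()/blk_put_request() gain in the blk-core.c hunks below.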
Diffstat (limited to 'block')
-rw-r--r--  block/Makefile            5
-rw-r--r--  block/blk-core.c        175
-rw-r--r--  block/blk-exec.c         14
-rw-r--r--  block/blk-flush.c       154
-rw-r--r--  block/blk-iopoll.c        6
-rw-r--r--  block/blk-lib.c          10
-rw-r--r--  block/blk-merge.c        17
-rw-r--r--  block/blk-mq-cpu.c       93
-rw-r--r--  block/blk-mq-cpumap.c   108
-rw-r--r--  block/blk-mq-sysfs.c    384
-rw-r--r--  block/blk-mq-tag.c      204
-rw-r--r--  block/blk-mq-tag.h       27
-rw-r--r--  block/blk-mq.c         1500
-rw-r--r--  block/blk-mq.h           52
-rw-r--r--  block/blk-settings.c      1
-rw-r--r--  block/blk-softirq.c       8
-rw-r--r--  block/blk-sysfs.c        13
-rw-r--r--  block/blk-timeout.c      77
-rw-r--r--  block/blk.h              17
-rw-r--r--  block/elevator.c         22
-rw-r--r--  block/ioctl.c             2
-rw-r--r--  block/scsi_ioctl.c       39
22 files changed, 2761 insertions, 167 deletions
diff --git a/block/Makefile b/block/Makefile
index 671a83d063a5..20645e88fb57 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,8 +5,9 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \
-			partition-generic.o partitions/
+			blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
+			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
+			genhd.o scsi_ioctl.o partition-generic.o partitions/
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 0a00e4ecf87c..8bdd0121212a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -16,6 +16,7 @@
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/bio.h> 17#include <linux/bio.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/blk-mq.h>
19#include <linux/highmem.h> 20#include <linux/highmem.h>
20#include <linux/mm.h> 21#include <linux/mm.h>
21#include <linux/kernel_stat.h> 22#include <linux/kernel_stat.h>
@@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida);
48/* 49/*
49 * For the allocated request tables 50 * For the allocated request tables
50 */ 51 */
51static struct kmem_cache *request_cachep; 52struct kmem_cache *request_cachep = NULL;
52 53
53/* 54/*
54 * For queue allocation 55 * For queue allocation
@@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep;
60 */ 61 */
61static struct workqueue_struct *kblockd_workqueue; 62static struct workqueue_struct *kblockd_workqueue;
62 63
63static void drive_stat_acct(struct request *rq, int new_io)
64{
65 struct hd_struct *part;
66 int rw = rq_data_dir(rq);
67 int cpu;
68
69 if (!blk_do_io_stat(rq))
70 return;
71
72 cpu = part_stat_lock();
73
74 if (!new_io) {
75 part = rq->part;
76 part_stat_inc(cpu, part, merges[rw]);
77 } else {
78 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
79 if (!hd_struct_try_get(part)) {
80 /*
81 * The partition is already being removed,
82 * the request will be accounted on the disk only
83 *
84 * We take a reference on disk->part0 although that
85 * partition will never be deleted, so we can treat
86 * it as any other partition.
87 */
88 part = &rq->rq_disk->part0;
89 hd_struct_get(part);
90 }
91 part_round_stats(cpu, part);
92 part_inc_in_flight(part, rw);
93 rq->part = part;
94 }
95
96 part_stat_unlock();
97}
98
99void blk_queue_congestion_threshold(struct request_queue *q) 64void blk_queue_congestion_threshold(struct request_queue *q)
100{ 65{
101 int nr; 66 int nr;
@@ -145,7 +110,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
145 rq->cmd = rq->__cmd; 110 rq->cmd = rq->__cmd;
146 rq->cmd_len = BLK_MAX_CDB; 111 rq->cmd_len = BLK_MAX_CDB;
147 rq->tag = -1; 112 rq->tag = -1;
148 rq->ref_count = 1;
149 rq->start_time = jiffies; 113 rq->start_time = jiffies;
150 set_start_time_ns(rq); 114 set_start_time_ns(rq);
151 rq->part = NULL; 115 rq->part = NULL;
@@ -174,9 +138,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
174{ 138{
175 int bit; 139 int bit;
176 140
177 printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, 141 printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg,
178 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, 142 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
179 rq->cmd_flags); 143 (unsigned long long) rq->cmd_flags);
180 144
181 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", 145 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
182 (unsigned long long)blk_rq_pos(rq), 146 (unsigned long long)blk_rq_pos(rq),
@@ -595,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
595 if (!q) 559 if (!q)
596 return NULL; 560 return NULL;
597 561
562 if (percpu_counter_init(&q->mq_usage_counter, 0))
563 goto fail_q;
564
598 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); 565 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
599 if (q->id < 0) 566 if (q->id < 0)
600 goto fail_q; 567 goto fail_c;
601 568
602 q->backing_dev_info.ra_pages = 569 q->backing_dev_info.ra_pages =
603 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 570 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -644,13 +611,19 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
644 q->bypass_depth = 1; 611 q->bypass_depth = 1;
645 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); 612 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
646 613
614 init_waitqueue_head(&q->mq_freeze_wq);
615
647 if (blkcg_init_queue(q)) 616 if (blkcg_init_queue(q))
648 goto fail_id; 617 goto fail_bdi;
649 618
650 return q; 619 return q;
651 620
621fail_bdi:
622 bdi_destroy(&q->backing_dev_info);
652fail_id: 623fail_id:
653 ida_simple_remove(&blk_queue_ida, q->id); 624 ida_simple_remove(&blk_queue_ida, q->id);
625fail_c:
626 percpu_counter_destroy(&q->mq_usage_counter);
654fail_q: 627fail_q:
655 kmem_cache_free(blk_requestq_cachep, q); 628 kmem_cache_free(blk_requestq_cachep, q);
656 return NULL; 629 return NULL;
@@ -739,9 +712,17 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
739 712
740 q->sg_reserved_size = INT_MAX; 713 q->sg_reserved_size = INT_MAX;
741 714
715 /* Protect q->elevator from elevator_change */
716 mutex_lock(&q->sysfs_lock);
717
742 /* init elevator */ 718 /* init elevator */
743 if (elevator_init(q, NULL)) 719 if (elevator_init(q, NULL)) {
720 mutex_unlock(&q->sysfs_lock);
744 return NULL; 721 return NULL;
722 }
723
724 mutex_unlock(&q->sysfs_lock);
725
745 return q; 726 return q;
746} 727}
747EXPORT_SYMBOL(blk_init_allocated_queue); 728EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1109,7 +1090,8 @@ retry:
1109 goto retry; 1090 goto retry;
1110} 1091}
1111 1092
1112struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1093static struct request *blk_old_get_request(struct request_queue *q, int rw,
1094 gfp_t gfp_mask)
1113{ 1095{
1114 struct request *rq; 1096 struct request *rq;
1115 1097
@@ -1126,6 +1108,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1126 1108
1127 return rq; 1109 return rq;
1128} 1110}
1111
1112struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1113{
1114 if (q->mq_ops)
1115 return blk_mq_alloc_request(q, rw, gfp_mask, false);
1116 else
1117 return blk_old_get_request(q, rw, gfp_mask);
1118}
1129EXPORT_SYMBOL(blk_get_request); 1119EXPORT_SYMBOL(blk_get_request);
1130 1120
1131/** 1121/**
@@ -1211,7 +1201,7 @@ EXPORT_SYMBOL(blk_requeue_request);
1211static void add_acct_request(struct request_queue *q, struct request *rq, 1201static void add_acct_request(struct request_queue *q, struct request *rq,
1212 int where) 1202 int where)
1213{ 1203{
1214 drive_stat_acct(rq, 1); 1204 blk_account_io_start(rq, true);
1215 __elv_add_request(q, rq, where); 1205 __elv_add_request(q, rq, where);
1216} 1206}
1217 1207
@@ -1272,8 +1262,6 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1272{ 1262{
1273 if (unlikely(!q)) 1263 if (unlikely(!q))
1274 return; 1264 return;
1275 if (unlikely(--req->ref_count))
1276 return;
1277 1265
1278 blk_pm_put_request(req); 1266 blk_pm_put_request(req);
1279 1267
@@ -1302,12 +1290,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request);
1302 1290
1303void blk_put_request(struct request *req) 1291void blk_put_request(struct request *req)
1304{ 1292{
1305 unsigned long flags;
1306 struct request_queue *q = req->q; 1293 struct request_queue *q = req->q;
1307 1294
1308 spin_lock_irqsave(q->queue_lock, flags); 1295 if (q->mq_ops)
1309 __blk_put_request(q, req); 1296 blk_mq_free_request(req);
1310 spin_unlock_irqrestore(q->queue_lock, flags); 1297 else {
1298 unsigned long flags;
1299
1300 spin_lock_irqsave(q->queue_lock, flags);
1301 __blk_put_request(q, req);
1302 spin_unlock_irqrestore(q->queue_lock, flags);
1303 }
1311} 1304}
1312EXPORT_SYMBOL(blk_put_request); 1305EXPORT_SYMBOL(blk_put_request);
1313 1306
@@ -1343,8 +1336,8 @@ void blk_add_request_payload(struct request *rq, struct page *page,
1343} 1336}
1344EXPORT_SYMBOL_GPL(blk_add_request_payload); 1337EXPORT_SYMBOL_GPL(blk_add_request_payload);
1345 1338
1346static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 1339bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1347 struct bio *bio) 1340 struct bio *bio)
1348{ 1341{
1349 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1342 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1350 1343
@@ -1361,12 +1354,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1361 req->__data_len += bio->bi_size; 1354 req->__data_len += bio->bi_size;
1362 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1355 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1363 1356
1364 drive_stat_acct(req, 0); 1357 blk_account_io_start(req, false);
1365 return true; 1358 return true;
1366} 1359}
1367 1360
1368static bool bio_attempt_front_merge(struct request_queue *q, 1361bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1369 struct request *req, struct bio *bio) 1362 struct bio *bio)
1370{ 1363{
1371 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1364 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1372 1365
@@ -1391,12 +1384,12 @@ static bool bio_attempt_front_merge(struct request_queue *q,
1391 req->__data_len += bio->bi_size; 1384 req->__data_len += bio->bi_size;
1392 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1385 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1393 1386
1394 drive_stat_acct(req, 0); 1387 blk_account_io_start(req, false);
1395 return true; 1388 return true;
1396} 1389}
1397 1390
1398/** 1391/**
1399 * attempt_plug_merge - try to merge with %current's plugged list 1392 * blk_attempt_plug_merge - try to merge with %current's plugged list
1400 * @q: request_queue new bio is being queued at 1393 * @q: request_queue new bio is being queued at
1401 * @bio: new bio being queued 1394 * @bio: new bio being queued
1402 * @request_count: out parameter for number of traversed plugged requests 1395 * @request_count: out parameter for number of traversed plugged requests
@@ -1412,19 +1405,28 @@ static bool bio_attempt_front_merge(struct request_queue *q,
1412 * reliable access to the elevator outside queue lock. Only check basic 1405 * reliable access to the elevator outside queue lock. Only check basic
1413 * merging parameters without querying the elevator. 1406 * merging parameters without querying the elevator.
1414 */ 1407 */
1415static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, 1408bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1416 unsigned int *request_count) 1409 unsigned int *request_count)
1417{ 1410{
1418 struct blk_plug *plug; 1411 struct blk_plug *plug;
1419 struct request *rq; 1412 struct request *rq;
1420 bool ret = false; 1413 bool ret = false;
1414 struct list_head *plug_list;
1415
1416 if (blk_queue_nomerges(q))
1417 goto out;
1421 1418
1422 plug = current->plug; 1419 plug = current->plug;
1423 if (!plug) 1420 if (!plug)
1424 goto out; 1421 goto out;
1425 *request_count = 0; 1422 *request_count = 0;
1426 1423
1427 list_for_each_entry_reverse(rq, &plug->list, queuelist) { 1424 if (q->mq_ops)
1425 plug_list = &plug->mq_list;
1426 else
1427 plug_list = &plug->list;
1428
1429 list_for_each_entry_reverse(rq, plug_list, queuelist) {
1428 int el_ret; 1430 int el_ret;
1429 1431
1430 if (rq->q == q) 1432 if (rq->q == q)
@@ -1492,7 +1494,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
1492 * Check if we can merge with the plugged list before grabbing 1494 * Check if we can merge with the plugged list before grabbing
1493 * any locks. 1495 * any locks.
1494 */ 1496 */
1495 if (attempt_plug_merge(q, bio, &request_count)) 1497 if (blk_attempt_plug_merge(q, bio, &request_count))
1496 return; 1498 return;
1497 1499
1498 spin_lock_irq(q->queue_lock); 1500 spin_lock_irq(q->queue_lock);
@@ -1560,7 +1562,7 @@ get_rq:
1560 } 1562 }
1561 } 1563 }
1562 list_add_tail(&req->queuelist, &plug->list); 1564 list_add_tail(&req->queuelist, &plug->list);
1563 drive_stat_acct(req, 1); 1565 blk_account_io_start(req, true);
1564 } else { 1566 } else {
1565 spin_lock_irq(q->queue_lock); 1567 spin_lock_irq(q->queue_lock);
1566 add_acct_request(q, req, where); 1568 add_acct_request(q, req, where);
@@ -2014,7 +2016,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
2014} 2016}
2015EXPORT_SYMBOL_GPL(blk_rq_err_bytes); 2017EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
2016 2018
2017static void blk_account_io_completion(struct request *req, unsigned int bytes) 2019void blk_account_io_completion(struct request *req, unsigned int bytes)
2018{ 2020{
2019 if (blk_do_io_stat(req)) { 2021 if (blk_do_io_stat(req)) {
2020 const int rw = rq_data_dir(req); 2022 const int rw = rq_data_dir(req);
@@ -2028,7 +2030,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
2028 } 2030 }
2029} 2031}
2030 2032
2031static void blk_account_io_done(struct request *req) 2033void blk_account_io_done(struct request *req)
2032{ 2034{
2033 /* 2035 /*
2034 * Account IO completion. flush_rq isn't accounted as a 2036 * Account IO completion. flush_rq isn't accounted as a
@@ -2076,6 +2078,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q,
2076} 2078}
2077#endif 2079#endif
2078 2080
2081void blk_account_io_start(struct request *rq, bool new_io)
2082{
2083 struct hd_struct *part;
2084 int rw = rq_data_dir(rq);
2085 int cpu;
2086
2087 if (!blk_do_io_stat(rq))
2088 return;
2089
2090 cpu = part_stat_lock();
2091
2092 if (!new_io) {
2093 part = rq->part;
2094 part_stat_inc(cpu, part, merges[rw]);
2095 } else {
2096 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
2097 if (!hd_struct_try_get(part)) {
2098 /*
2099 * The partition is already being removed,
2100 * the request will be accounted on the disk only
2101 *
2102 * We take a reference on disk->part0 although that
2103 * partition will never be deleted, so we can treat
2104 * it as any other partition.
2105 */
2106 part = &rq->rq_disk->part0;
2107 hd_struct_get(part);
2108 }
2109 part_round_stats(cpu, part);
2110 part_inc_in_flight(part, rw);
2111 rq->part = part;
2112 }
2113
2114 part_stat_unlock();
2115}
2116
2079/** 2117/**
2080 * blk_peek_request - peek at the top of a request queue 2118 * blk_peek_request - peek at the top of a request queue
2081 * @q: request queue to peek at 2119 * @q: request queue to peek at
@@ -2227,6 +2265,7 @@ void blk_start_request(struct request *req)
2227 if (unlikely(blk_bidi_rq(req))) 2265 if (unlikely(blk_bidi_rq(req)))
2228 req->next_rq->resid_len = blk_rq_bytes(req->next_rq); 2266 req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
2229 2267
2268 BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
2230 blk_add_timer(req); 2269 blk_add_timer(req);
2231} 2270}
2232EXPORT_SYMBOL(blk_start_request); 2271EXPORT_SYMBOL(blk_start_request);
@@ -2451,7 +2490,6 @@ static void blk_finish_request(struct request *req, int error)
2451 if (req->cmd_flags & REQ_DONTPREP) 2490 if (req->cmd_flags & REQ_DONTPREP)
2452 blk_unprep_request(req); 2491 blk_unprep_request(req);
2453 2492
2454
2455 blk_account_io_done(req); 2493 blk_account_io_done(req);
2456 2494
2457 if (req->end_io) 2495 if (req->end_io)
@@ -2873,6 +2911,7 @@ void blk_start_plug(struct blk_plug *plug)
2873 2911
2874 plug->magic = PLUG_MAGIC; 2912 plug->magic = PLUG_MAGIC;
2875 INIT_LIST_HEAD(&plug->list); 2913 INIT_LIST_HEAD(&plug->list);
2914 INIT_LIST_HEAD(&plug->mq_list);
2876 INIT_LIST_HEAD(&plug->cb_list); 2915 INIT_LIST_HEAD(&plug->cb_list);
2877 2916
2878 /* 2917 /*
@@ -2970,6 +3009,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2970 BUG_ON(plug->magic != PLUG_MAGIC); 3009 BUG_ON(plug->magic != PLUG_MAGIC);
2971 3010
2972 flush_plug_callbacks(plug, from_schedule); 3011 flush_plug_callbacks(plug, from_schedule);
3012
3013 if (!list_empty(&plug->mq_list))
3014 blk_mq_flush_plug_list(plug, from_schedule);
3015
2973 if (list_empty(&plug->list)) 3016 if (list_empty(&plug->list))
2974 return; 3017 return;
2975 3018
diff --git a/block/blk-exec.c b/block/blk-exec.c
index ae4f27d7944e..c3edf9dff566 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -5,6 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/bio.h> 6#include <linux/bio.h>
7#include <linux/blkdev.h> 7#include <linux/blkdev.h>
8#include <linux/blk-mq.h>
8#include <linux/sched/sysctl.h> 9#include <linux/sched/sysctl.h>
9 10
10#include "blk.h" 11#include "blk.h"
@@ -24,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error)
24 struct completion *waiting = rq->end_io_data; 25 struct completion *waiting = rq->end_io_data;
25 26
26 rq->end_io_data = NULL; 27 rq->end_io_data = NULL;
27 __blk_put_request(rq->q, rq);
28 28
29 /* 29 /*
30 * complete last, if this is a stack request the process (and thus 30 * complete last, if this is a stack request the process (and thus
@@ -59,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
59 59
60 rq->rq_disk = bd_disk; 60 rq->rq_disk = bd_disk;
61 rq->end_io = done; 61 rq->end_io = done;
62
63 if (q->mq_ops) {
64 blk_mq_insert_request(q, rq, true);
65 return;
66 }
67
62 /* 68 /*
63 * need to check this before __blk_run_queue(), because rq can 69 * need to check this before __blk_run_queue(), because rq can
64 * be freed before that returns. 70 * be freed before that returns.
@@ -103,12 +109,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
103 int err = 0; 109 int err = 0;
104 unsigned long hang_check; 110 unsigned long hang_check;
105 111
106 /*
107 * we need an extra reference to the request, so we can look at
108 * it after io completion
109 */
110 rq->ref_count++;
111
112 if (!rq->sense) { 112 if (!rq->sense) {
113 memset(sense, 0, sizeof(sense)); 113 memset(sense, 0, sizeof(sense));
114 rq->sense = sense; 114 rq->sense = sense;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index cc2b827a853c..331e627301ea 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -69,8 +69,10 @@
69#include <linux/bio.h> 69#include <linux/bio.h>
70#include <linux/blkdev.h> 70#include <linux/blkdev.h>
71#include <linux/gfp.h> 71#include <linux/gfp.h>
72#include <linux/blk-mq.h>
72 73
73#include "blk.h" 74#include "blk.h"
75#include "blk-mq.h"
74 76
75/* FLUSH/FUA sequences */ 77/* FLUSH/FUA sequences */
76enum { 78enum {
@@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq)
124 /* make @rq a normal request */ 126 /* make @rq a normal request */
125 rq->cmd_flags &= ~REQ_FLUSH_SEQ; 127 rq->cmd_flags &= ~REQ_FLUSH_SEQ;
126 rq->end_io = rq->flush.saved_end_io; 128 rq->end_io = rq->flush.saved_end_io;
129
130 blk_clear_rq_complete(rq);
131}
132
133static void mq_flush_data_run(struct work_struct *work)
134{
135 struct request *rq;
136
137 rq = container_of(work, struct request, mq_flush_data);
138
139 memset(&rq->csd, 0, sizeof(rq->csd));
140 blk_mq_run_request(rq, true, false);
141}
142
143static void blk_mq_flush_data_insert(struct request *rq)
144{
145 INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
146 kblockd_schedule_work(rq->q, &rq->mq_flush_data);
127} 147}
128 148
129/** 149/**
@@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq)
136 * completion and trigger the next step. 156 * completion and trigger the next step.
137 * 157 *
138 * CONTEXT: 158 * CONTEXT:
139 * spin_lock_irq(q->queue_lock) 159 * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
140 * 160 *
141 * RETURNS: 161 * RETURNS:
142 * %true if requests were added to the dispatch queue, %false otherwise. 162 * %true if requests were added to the dispatch queue, %false otherwise.
@@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
146{ 166{
147 struct request_queue *q = rq->q; 167 struct request_queue *q = rq->q;
148 struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; 168 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
149 bool queued = false; 169 bool queued = false, kicked;
150 170
151 BUG_ON(rq->flush.seq & seq); 171 BUG_ON(rq->flush.seq & seq);
152 rq->flush.seq |= seq; 172 rq->flush.seq |= seq;
@@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
167 187
168 case REQ_FSEQ_DATA: 188 case REQ_FSEQ_DATA:
169 list_move_tail(&rq->flush.list, &q->flush_data_in_flight); 189 list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
170 list_add(&rq->queuelist, &q->queue_head); 190 if (q->mq_ops)
171 queued = true; 191 blk_mq_flush_data_insert(rq);
192 else {
193 list_add(&rq->queuelist, &q->queue_head);
194 queued = true;
195 }
172 break; 196 break;
173 197
174 case REQ_FSEQ_DONE: 198 case REQ_FSEQ_DONE:
@@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
181 BUG_ON(!list_empty(&rq->queuelist)); 205 BUG_ON(!list_empty(&rq->queuelist));
182 list_del_init(&rq->flush.list); 206 list_del_init(&rq->flush.list);
183 blk_flush_restore_request(rq); 207 blk_flush_restore_request(rq);
184 __blk_end_request_all(rq, error); 208 if (q->mq_ops)
209 blk_mq_end_io(rq, error);
210 else
211 __blk_end_request_all(rq, error);
185 break; 212 break;
186 213
187 default: 214 default:
188 BUG(); 215 BUG();
189 } 216 }
190 217
191 return blk_kick_flush(q) | queued; 218 kicked = blk_kick_flush(q);
219 /* blk_mq_run_flush will run queue */
220 if (q->mq_ops)
221 return queued;
222 return kicked | queued;
192} 223}
193 224
194static void flush_end_io(struct request *flush_rq, int error) 225static void flush_end_io(struct request *flush_rq, int error)
195{ 226{
196 struct request_queue *q = flush_rq->q; 227 struct request_queue *q = flush_rq->q;
197 struct list_head *running = &q->flush_queue[q->flush_running_idx]; 228 struct list_head *running;
198 bool queued = false; 229 bool queued = false;
199 struct request *rq, *n; 230 struct request *rq, *n;
231 unsigned long flags = 0;
200 232
233 if (q->mq_ops) {
234 blk_mq_free_request(flush_rq);
235 spin_lock_irqsave(&q->mq_flush_lock, flags);
236 }
237 running = &q->flush_queue[q->flush_running_idx];
201 BUG_ON(q->flush_pending_idx == q->flush_running_idx); 238 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
202 239
203 /* account completion of the flush request */ 240 /* account completion of the flush request */
204 q->flush_running_idx ^= 1; 241 q->flush_running_idx ^= 1;
205 elv_completed_request(q, flush_rq); 242
243 if (!q->mq_ops)
244 elv_completed_request(q, flush_rq);
206 245
207 /* and push the waiting requests to the next stage */ 246 /* and push the waiting requests to the next stage */
208 list_for_each_entry_safe(rq, n, running, flush.list) { 247 list_for_each_entry_safe(rq, n, running, flush.list) {
@@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error)
223 * directly into request_fn may confuse the driver. Always use 262 * directly into request_fn may confuse the driver. Always use
224 * kblockd. 263 * kblockd.
225 */ 264 */
226 if (queued || q->flush_queue_delayed) 265 if (queued || q->flush_queue_delayed) {
227 blk_run_queue_async(q); 266 if (!q->mq_ops)
267 blk_run_queue_async(q);
268 else
269 /*
270 * This can be optimized to only run queues with requests
271 * queued if necessary.
272 */
273 blk_mq_run_queues(q, true);
274 }
228 q->flush_queue_delayed = 0; 275 q->flush_queue_delayed = 0;
276 if (q->mq_ops)
277 spin_unlock_irqrestore(&q->mq_flush_lock, flags);
278}
279
280static void mq_flush_work(struct work_struct *work)
281{
282 struct request_queue *q;
283 struct request *rq;
284
285 q = container_of(work, struct request_queue, mq_flush_work);
286
287 /* We don't need set REQ_FLUSH_SEQ, it's for consistency */
288 rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
289 __GFP_WAIT|GFP_ATOMIC, true);
290 rq->cmd_type = REQ_TYPE_FS;
291 rq->end_io = flush_end_io;
292
293 blk_mq_run_request(rq, true, false);
294}
295
296/*
297 * We can't directly use q->flush_rq, because it doesn't have tag and is not in
298 * hctx->rqs[]. so we must allocate a new request, since we can't sleep here,
299 * so offload the work to workqueue.
300 *
301 * Note: we assume a flush request finished in any hardware queue will flush
302 * the whole disk cache.
303 */
304static void mq_run_flush(struct request_queue *q)
305{
306 kblockd_schedule_work(q, &q->mq_flush_work);
229} 307}
230 308
231/** 309/**
@@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error)
236 * Please read the comment at the top of this file for more info. 314 * Please read the comment at the top of this file for more info.
237 * 315 *
238 * CONTEXT: 316 * CONTEXT:
239 * spin_lock_irq(q->queue_lock) 317 * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
240 * 318 *
241 * RETURNS: 319 * RETURNS:
242 * %true if flush was issued, %false otherwise. 320 * %true if flush was issued, %false otherwise.
@@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q)
261 * Issue flush and toggle pending_idx. This makes pending_idx 339 * Issue flush and toggle pending_idx. This makes pending_idx
262 * different from running_idx, which means flush is in flight. 340 * different from running_idx, which means flush is in flight.
263 */ 341 */
342 q->flush_pending_idx ^= 1;
343 if (q->mq_ops) {
344 mq_run_flush(q);
345 return true;
346 }
347
264 blk_rq_init(q, &q->flush_rq); 348 blk_rq_init(q, &q->flush_rq);
265 q->flush_rq.cmd_type = REQ_TYPE_FS; 349 q->flush_rq.cmd_type = REQ_TYPE_FS;
266 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; 350 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
267 q->flush_rq.rq_disk = first_rq->rq_disk; 351 q->flush_rq.rq_disk = first_rq->rq_disk;
268 q->flush_rq.end_io = flush_end_io; 352 q->flush_rq.end_io = flush_end_io;
269 353
270 q->flush_pending_idx ^= 1;
271 list_add_tail(&q->flush_rq.queuelist, &q->queue_head); 354 list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
272 return true; 355 return true;
273} 356}
@@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error)
284 blk_run_queue_async(q); 367 blk_run_queue_async(q);
285} 368}
286 369
370static void mq_flush_data_end_io(struct request *rq, int error)
371{
372 struct request_queue *q = rq->q;
373 struct blk_mq_hw_ctx *hctx;
374 struct blk_mq_ctx *ctx;
375 unsigned long flags;
376
377 ctx = rq->mq_ctx;
378 hctx = q->mq_ops->map_queue(q, ctx->cpu);
379
380 /*
381 * After populating an empty queue, kick it to avoid stall. Read
382 * the comment in flush_end_io().
383 */
384 spin_lock_irqsave(&q->mq_flush_lock, flags);
385 if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
386 blk_mq_run_hw_queue(hctx, true);
387 spin_unlock_irqrestore(&q->mq_flush_lock, flags);
388}
389
287/** 390/**
288 * blk_insert_flush - insert a new FLUSH/FUA request 391 * blk_insert_flush - insert a new FLUSH/FUA request
289 * @rq: request to insert 392 * @rq: request to insert
290 * 393 *
291 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. 394 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
395 * or __blk_mq_run_hw_queue() to dispatch request.
292 * @rq is being submitted. Analyze what needs to be done and put it on the 396 * @rq is being submitted. Analyze what needs to be done and put it on the
293 * right queue. 397 * right queue.
294 * 398 *
295 * CONTEXT: 399 * CONTEXT:
296 * spin_lock_irq(q->queue_lock) 400 * spin_lock_irq(q->queue_lock) in !mq case
297 */ 401 */
298void blk_insert_flush(struct request *rq) 402void blk_insert_flush(struct request *rq)
299{ 403{
@@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq)
316 * complete the request. 420 * complete the request.
317 */ 421 */
318 if (!policy) { 422 if (!policy) {
319 __blk_end_bidi_request(rq, 0, 0, 0); 423 if (q->mq_ops)
424 blk_mq_end_io(rq, 0);
425 else
426 __blk_end_bidi_request(rq, 0, 0, 0);
320 return; 427 return;
321 } 428 }
322 429
@@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq)
329 */ 436 */
330 if ((policy & REQ_FSEQ_DATA) && 437 if ((policy & REQ_FSEQ_DATA) &&
331 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { 438 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
332 list_add_tail(&rq->queuelist, &q->queue_head); 439 if (q->mq_ops) {
440 blk_mq_run_request(rq, false, true);
441 } else
442 list_add_tail(&rq->queuelist, &q->queue_head);
333 return; 443 return;
334 } 444 }
335 445
@@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq)
341 INIT_LIST_HEAD(&rq->flush.list); 451 INIT_LIST_HEAD(&rq->flush.list);
342 rq->cmd_flags |= REQ_FLUSH_SEQ; 452 rq->cmd_flags |= REQ_FLUSH_SEQ;
343 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ 453 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
454 if (q->mq_ops) {
455 rq->end_io = mq_flush_data_end_io;
456
457 spin_lock_irq(&q->mq_flush_lock);
458 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
459 spin_unlock_irq(&q->mq_flush_lock);
460 return;
461 }
344 rq->end_io = flush_data_end_io; 462 rq->end_io = flush_data_end_io;
345 463
346 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); 464 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
@@ -453,3 +571,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
453 return ret; 571 return ret;
454} 572}
455EXPORT_SYMBOL(blkdev_issue_flush); 573EXPORT_SYMBOL(blkdev_issue_flush);
574
575void blk_mq_init_flush(struct request_queue *q)
576{
577 spin_lock_init(&q->mq_flush_lock);
578 INIT_WORK(&q->mq_flush_work, mq_flush_work);
579}
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index 4b8d9b541112..1855bf51edb0 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -35,7 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop)
35 unsigned long flags; 35 unsigned long flags;
36 36
37 local_irq_save(flags); 37 local_irq_save(flags);
38 list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll)); 38 list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
39 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); 39 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
40 local_irq_restore(flags); 40 local_irq_restore(flags);
41} 41}
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(blk_iopoll_complete);
79 79
80static void blk_iopoll_softirq(struct softirq_action *h) 80static void blk_iopoll_softirq(struct softirq_action *h)
81{ 81{
82 struct list_head *list = &__get_cpu_var(blk_cpu_iopoll); 82 struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
83 int rearm = 0, budget = blk_iopoll_budget; 83 int rearm = 0, budget = blk_iopoll_budget;
84 unsigned long start_time = jiffies; 84 unsigned long start_time = jiffies;
85 85
@@ -201,7 +201,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self,
201 201
202 local_irq_disable(); 202 local_irq_disable();
203 list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), 203 list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
204 &__get_cpu_var(blk_cpu_iopoll)); 204 this_cpu_ptr(&blk_cpu_iopoll));
205 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); 205 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
206 local_irq_enable(); 206 local_irq_enable();
207 } 207 }
diff --git a/block/blk-lib.c b/block/blk-lib.c
index d6f50d572565..9b5b561cb928 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -43,8 +43,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
43 DECLARE_COMPLETION_ONSTACK(wait); 43 DECLARE_COMPLETION_ONSTACK(wait);
44 struct request_queue *q = bdev_get_queue(bdev); 44 struct request_queue *q = bdev_get_queue(bdev);
45 int type = REQ_WRITE | REQ_DISCARD; 45 int type = REQ_WRITE | REQ_DISCARD;
46 sector_t max_discard_sectors; 46 unsigned int max_discard_sectors, granularity;
47 sector_t granularity, alignment; 47 int alignment;
48 struct bio_batch bb; 48 struct bio_batch bb;
49 struct bio *bio; 49 struct bio *bio;
50 int ret = 0; 50 int ret = 0;
@@ -58,16 +58,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
58 58
59 /* Zero-sector (unknown) and one-sector granularities are the same. */ 59 /* Zero-sector (unknown) and one-sector granularities are the same. */
60 granularity = max(q->limits.discard_granularity >> 9, 1U); 60 granularity = max(q->limits.discard_granularity >> 9, 1U);
61 alignment = bdev_discard_alignment(bdev) >> 9; 61 alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
62 alignment = sector_div(alignment, granularity);
63 62
64 /* 63 /*
65 * Ensure that max_discard_sectors is of the proper 64 * Ensure that max_discard_sectors is of the proper
66 * granularity, so that requests stay aligned after a split. 65 * granularity, so that requests stay aligned after a split.
67 */ 66 */
68 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); 67 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
69 sector_div(max_discard_sectors, granularity); 68 max_discard_sectors -= max_discard_sectors % granularity;
70 max_discard_sectors *= granularity;
71 if (unlikely(!max_discard_sectors)) { 69 if (unlikely(!max_discard_sectors)) {
72 /* Avoid infinite loop below. Being cautious never hurts. */ 70 /* Avoid infinite loop below. Being cautious never hurts. */
73 return -EOPNOTSUPP; 71 return -EOPNOTSUPP;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5f2448253797..1ffc58977835 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -308,6 +308,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
308 return ll_new_hw_segment(q, req, bio); 308 return ll_new_hw_segment(q, req, bio);
309} 309}
310 310
311/*
312 * blk-mq uses req->special to carry normal driver per-request payload, it
313 * does not indicate a prepared command that we cannot merge with.
314 */
315static bool req_no_special_merge(struct request *req)
316{
317 struct request_queue *q = req->q;
318
319 return !q->mq_ops && req->special;
320}
321
311static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 322static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
312 struct request *next) 323 struct request *next)
313{ 324{
@@ -319,7 +330,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
319 * First check if the either of the requests are re-queued 330 * First check if the either of the requests are re-queued
320 * requests. Can't merge them if they are. 331 * requests. Can't merge them if they are.
321 */ 332 */
322 if (req->special || next->special) 333 if (req_no_special_merge(req) || req_no_special_merge(next))
323 return 0; 334 return 0;
324 335
325 /* 336 /*
@@ -416,7 +427,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
416 427
417 if (rq_data_dir(req) != rq_data_dir(next) 428 if (rq_data_dir(req) != rq_data_dir(next)
418 || req->rq_disk != next->rq_disk 429 || req->rq_disk != next->rq_disk
419 || next->special) 430 || req_no_special_merge(next))
420 return 0; 431 return 0;
421 432
422 if (req->cmd_flags & REQ_WRITE_SAME && 433 if (req->cmd_flags & REQ_WRITE_SAME &&
@@ -515,7 +526,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
515 return false; 526 return false;
516 527
517 /* must be same device and not a special request */ 528 /* must be same device and not a special request */
518 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) 529 if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq))
519 return false; 530 return false;
520 531
521 /* only merge integrity protected bio into ditto rq */ 532 /* only merge integrity protected bio into ditto rq */
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
new file mode 100644
index 000000000000..f8ea39d7ae54
--- /dev/null
+++ b/block/blk-mq-cpu.c
@@ -0,0 +1,93 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <linux/blkdev.h>
5#include <linux/list.h>
6#include <linux/llist.h>
7#include <linux/smp.h>
8#include <linux/cpu.h>
9
10#include <linux/blk-mq.h>
11#include "blk-mq.h"
12
13static LIST_HEAD(blk_mq_cpu_notify_list);
14static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
15
16static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self,
17 unsigned long action, void *hcpu)
18{
19 unsigned int cpu = (unsigned long) hcpu;
20 struct blk_mq_cpu_notifier *notify;
21
22 spin_lock(&blk_mq_cpu_notify_lock);
23
24 list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
25 notify->notify(notify->data, action, cpu);
26
27 spin_unlock(&blk_mq_cpu_notify_lock);
28 return NOTIFY_OK;
29}
30
31static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action,
32 unsigned int cpu)
33{
34 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
35 /*
36 * If the CPU goes away, ensure that we run any pending
37 * completions.
38 */
39 struct llist_node *node;
40 struct request *rq;
41
42 local_irq_disable();
43
44 node = llist_del_all(&per_cpu(ipi_lists, cpu));
45 while (node) {
46 struct llist_node *next = node->next;
47
48 rq = llist_entry(node, struct request, ll_list);
49 __blk_mq_end_io(rq, rq->errors);
50 node = next;
51 }
52
53 local_irq_enable();
54 }
55}
56
57static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = {
58 .notifier_call = blk_mq_main_cpu_notify,
59};
60
61void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
62{
63 BUG_ON(!notifier->notify);
64
65 spin_lock(&blk_mq_cpu_notify_lock);
66 list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
67 spin_unlock(&blk_mq_cpu_notify_lock);
68}
69
70void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
71{
72 spin_lock(&blk_mq_cpu_notify_lock);
73 list_del(&notifier->list);
74 spin_unlock(&blk_mq_cpu_notify_lock);
75}
76
77void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
78 void (*fn)(void *, unsigned long, unsigned int),
79 void *data)
80{
81 notifier->notify = fn;
82 notifier->data = data;
83}
84
85static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = {
86 .notify = blk_mq_cpu_notify,
87};
88
89void __init blk_mq_cpu_init(void)
90{
91 register_hotcpu_notifier(&blk_mq_main_cpu_notifier);
92 blk_mq_register_cpu_notifier(&cpu_notifier);
93}
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
new file mode 100644
index 000000000000..f8721278601c
--- /dev/null
+++ b/block/blk-mq-cpumap.c
@@ -0,0 +1,108 @@
1#include <linux/kernel.h>
2#include <linux/threads.h>
3#include <linux/module.h>
4#include <linux/mm.h>
5#include <linux/smp.h>
6#include <linux/cpu.h>
7
8#include <linux/blk-mq.h>
9#include "blk.h"
10#include "blk-mq.h"
11
12static void show_map(unsigned int *map, unsigned int nr)
13{
14 int i;
15
16 pr_info("blk-mq: CPU -> queue map\n");
17 for_each_online_cpu(i)
18 pr_info(" CPU%2u -> Queue %u\n", i, map[i]);
19}
20
21static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
22 const int cpu)
23{
24 return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
25}
26
27static int get_first_sibling(unsigned int cpu)
28{
29 unsigned int ret;
30
31 ret = cpumask_first(topology_thread_cpumask(cpu));
32 if (ret < nr_cpu_ids)
33 return ret;
34
35 return cpu;
36}
37
38int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
39{
40 unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
41 cpumask_var_t cpus;
42
43 if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
44 return 1;
45
46 cpumask_clear(cpus);
47 nr_cpus = nr_uniq_cpus = 0;
48 for_each_online_cpu(i) {
49 nr_cpus++;
50 first_sibling = get_first_sibling(i);
51 if (!cpumask_test_cpu(first_sibling, cpus))
52 nr_uniq_cpus++;
53 cpumask_set_cpu(i, cpus);
54 }
55
56 queue = 0;
57 for_each_possible_cpu(i) {
58 if (!cpu_online(i)) {
59 map[i] = 0;
60 continue;
61 }
62
63 /*
64 * Easy case - we have equal or more hardware queues. Or
65 * there are no thread siblings to take into account. Do
66 * 1:1 if enough, or sequential mapping if less.
67 */
68 if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
69 map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
70 queue++;
71 continue;
72 }
73
74 /*
75 * Less then nr_cpus queues, and we have some number of
76 * threads per cores. Map sibling threads to the same
77 * queue.
78 */
79 first_sibling = get_first_sibling(i);
80 if (first_sibling == i) {
81 map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
82 queue);
83 queue++;
84 } else
85 map[i] = map[first_sibling];
86 }
87
88 show_map(map, nr_cpus);
89 free_cpumask_var(cpus);
90 return 0;
91}
92
93unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)
94{
95 unsigned int *map;
96
97 /* If cpus are offline, map them to first hctx */
98 map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
99 reg->numa_node);
100 if (!map)
101 return NULL;
102
103 if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))
104 return map;
105
106 kfree(map);
107 return NULL;
108}
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
new file mode 100644
index 000000000000..ba6cf8e9aa0a
--- /dev/null
+++ b/block/blk-mq-sysfs.c
@@ -0,0 +1,384 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/backing-dev.h>
4#include <linux/bio.h>
5#include <linux/blkdev.h>
6#include <linux/mm.h>
7#include <linux/init.h>
8#include <linux/slab.h>
9#include <linux/workqueue.h>
10#include <linux/smp.h>
11
12#include <linux/blk-mq.h>
13#include "blk-mq.h"
14#include "blk-mq-tag.h"
15
16static void blk_mq_sysfs_release(struct kobject *kobj)
17{
18}
19
20struct blk_mq_ctx_sysfs_entry {
21 struct attribute attr;
22 ssize_t (*show)(struct blk_mq_ctx *, char *);
23 ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
24};
25
26struct blk_mq_hw_ctx_sysfs_entry {
27 struct attribute attr;
28 ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
29 ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
30};
31
32static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
33 char *page)
34{
35 struct blk_mq_ctx_sysfs_entry *entry;
36 struct blk_mq_ctx *ctx;
37 struct request_queue *q;
38 ssize_t res;
39
40 entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
41 ctx = container_of(kobj, struct blk_mq_ctx, kobj);
42 q = ctx->queue;
43
44 if (!entry->show)
45 return -EIO;
46
47 res = -ENOENT;
48 mutex_lock(&q->sysfs_lock);
49 if (!blk_queue_dying(q))
50 res = entry->show(ctx, page);
51 mutex_unlock(&q->sysfs_lock);
52 return res;
53}
54
55static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
56 const char *page, size_t length)
57{
58 struct blk_mq_ctx_sysfs_entry *entry;
59 struct blk_mq_ctx *ctx;
60 struct request_queue *q;
61 ssize_t res;
62
63 entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
64 ctx = container_of(kobj, struct blk_mq_ctx, kobj);
65 q = ctx->queue;
66
67 if (!entry->store)
68 return -EIO;
69
70 res = -ENOENT;
71 mutex_lock(&q->sysfs_lock);
72 if (!blk_queue_dying(q))
73 res = entry->store(ctx, page, length);
74 mutex_unlock(&q->sysfs_lock);
75 return res;
76}
77
78static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
79 struct attribute *attr, char *page)
80{
81 struct blk_mq_hw_ctx_sysfs_entry *entry;
82 struct blk_mq_hw_ctx *hctx;
83 struct request_queue *q;
84 ssize_t res;
85
86 entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
87 hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
88 q = hctx->queue;
89
90 if (!entry->show)
91 return -EIO;
92
93 res = -ENOENT;
94 mutex_lock(&q->sysfs_lock);
95 if (!blk_queue_dying(q))
96 res = entry->show(hctx, page);
97 mutex_unlock(&q->sysfs_lock);
98 return res;
99}
100
101static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
102 struct attribute *attr, const char *page,
103 size_t length)
104{
105 struct blk_mq_hw_ctx_sysfs_entry *entry;
106 struct blk_mq_hw_ctx *hctx;
107 struct request_queue *q;
108 ssize_t res;
109
110 entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
111 hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
112 q = hctx->queue;
113
114 if (!entry->store)
115 return -EIO;
116
117 res = -ENOENT;
118 mutex_lock(&q->sysfs_lock);
119 if (!blk_queue_dying(q))
120 res = entry->store(hctx, page, length);
121 mutex_unlock(&q->sysfs_lock);
122 return res;
123}
124
125static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page)
126{
127 return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1],
128 ctx->rq_dispatched[0]);
129}
130
131static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page)
132{
133 return sprintf(page, "%lu\n", ctx->rq_merged);
134}
135
136static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page)
137{
138 return sprintf(page, "%lu %lu\n", ctx->rq_completed[1],
139 ctx->rq_completed[0]);
140}
141
142static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg)
143{
144 char *start_page = page;
145 struct request *rq;
146
147 page += sprintf(page, "%s:\n", msg);
148
149 list_for_each_entry(rq, list, queuelist)
150 page += sprintf(page, "\t%p\n", rq);
151
152 return page - start_page;
153}
154
155static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
156{
157 ssize_t ret;
158
159 spin_lock(&ctx->lock);
160 ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending");
161 spin_unlock(&ctx->lock);
162
163 return ret;
164}
165
166static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
167 char *page)
168{
169 return sprintf(page, "%lu\n", hctx->queued);
170}
171
172static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page)
173{
174 return sprintf(page, "%lu\n", hctx->run);
175}
176
177static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
178 char *page)
179{
180 char *start_page = page;
181 int i;
182
183 page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
184
185 for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) {
186 unsigned long d = 1U << (i - 1);
187
188 page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]);
189 }
190
191 return page - start_page;
192}
193
194static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
195 char *page)
196{
197 ssize_t ret;
198
199 spin_lock(&hctx->lock);
200 ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending");
201 spin_unlock(&hctx->lock);
202
203 return ret;
204}
205
206static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page)
207{
208 ssize_t ret;
209
210 spin_lock(&hctx->lock);
211 ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
212 spin_unlock(&hctx->lock);
213
214 return ret;
215}
216
217static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
218 const char *page, size_t len)
219{
220 struct blk_mq_ctx *ctx;
221 unsigned long ret;
222 unsigned int i;
223
224 if (kstrtoul(page, 10, &ret)) {
225 pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
226 return -EINVAL;
227 }
228
229 spin_lock(&hctx->lock);
230 if (ret)
231 hctx->flags |= BLK_MQ_F_SHOULD_IPI;
232 else
233 hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
234 spin_unlock(&hctx->lock);
235
236 hctx_for_each_ctx(hctx, ctx, i)
237 ctx->ipi_redirect = !!ret;
238
239 return len;
240}
241
242static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
243{
244 return blk_mq_tag_sysfs_show(hctx->tags, page);
245}
246
247static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
248 .attr = {.name = "dispatched", .mode = S_IRUGO },
249 .show = blk_mq_sysfs_dispatched_show,
250};
251static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = {
252 .attr = {.name = "merged", .mode = S_IRUGO },
253 .show = blk_mq_sysfs_merged_show,
254};
255static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = {
256 .attr = {.name = "completed", .mode = S_IRUGO },
257 .show = blk_mq_sysfs_completed_show,
258};
259static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = {
260 .attr = {.name = "rq_list", .mode = S_IRUGO },
261 .show = blk_mq_sysfs_rq_list_show,
262};
263
264static struct attribute *default_ctx_attrs[] = {
265 &blk_mq_sysfs_dispatched.attr,
266 &blk_mq_sysfs_merged.attr,
267 &blk_mq_sysfs_completed.attr,
268 &blk_mq_sysfs_rq_list.attr,
269 NULL,
270};
271
272static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = {
273 .attr = {.name = "queued", .mode = S_IRUGO },
274 .show = blk_mq_hw_sysfs_queued_show,
275};
276static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = {
277 .attr = {.name = "run", .mode = S_IRUGO },
278 .show = blk_mq_hw_sysfs_run_show,
279};
280static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
281 .attr = {.name = "dispatched", .mode = S_IRUGO },
282 .show = blk_mq_hw_sysfs_dispatched_show,
283};
284static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
285 .attr = {.name = "pending", .mode = S_IRUGO },
286 .show = blk_mq_hw_sysfs_rq_list_show,
287};
288static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
289 .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
290 .show = blk_mq_hw_sysfs_ipi_show,
291 .store = blk_mq_hw_sysfs_ipi_store,
292};
293static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
294 .attr = {.name = "tags", .mode = S_IRUGO },
295 .show = blk_mq_hw_sysfs_tags_show,
296};
297
298static struct attribute *default_hw_ctx_attrs[] = {
299 &blk_mq_hw_sysfs_queued.attr,
300 &blk_mq_hw_sysfs_run.attr,
301 &blk_mq_hw_sysfs_dispatched.attr,
302 &blk_mq_hw_sysfs_pending.attr,
303 &blk_mq_hw_sysfs_ipi.attr,
304 &blk_mq_hw_sysfs_tags.attr,
305 NULL,
306};
307
308static const struct sysfs_ops blk_mq_sysfs_ops = {
309 .show = blk_mq_sysfs_show,
310 .store = blk_mq_sysfs_store,
311};
312
313static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
314 .show = blk_mq_hw_sysfs_show,
315 .store = blk_mq_hw_sysfs_store,
316};
317
318static struct kobj_type blk_mq_ktype = {
319 .sysfs_ops = &blk_mq_sysfs_ops,
320 .release = blk_mq_sysfs_release,
321};
322
323static struct kobj_type blk_mq_ctx_ktype = {
324 .sysfs_ops = &blk_mq_sysfs_ops,
325 .default_attrs = default_ctx_attrs,
326 .release = blk_mq_sysfs_release,
327};
328
329static struct kobj_type blk_mq_hw_ktype = {
330 .sysfs_ops = &blk_mq_hw_sysfs_ops,
331 .default_attrs = default_hw_ctx_attrs,
332 .release = blk_mq_sysfs_release,
333};
334
335void blk_mq_unregister_disk(struct gendisk *disk)
336{
337 struct request_queue *q = disk->queue;
338
339 kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
340 kobject_del(&q->mq_kobj);
341
342 kobject_put(&disk_to_dev(disk)->kobj);
343}
344
345int blk_mq_register_disk(struct gendisk *disk)
346{
347 struct device *dev = disk_to_dev(disk);
348 struct request_queue *q = disk->queue;
349 struct blk_mq_hw_ctx *hctx;
350 struct blk_mq_ctx *ctx;
351 int ret, i, j;
352
353 kobject_init(&q->mq_kobj, &blk_mq_ktype);
354
355 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
356 if (ret < 0)
357 return ret;
358
359 kobject_uevent(&q->mq_kobj, KOBJ_ADD);
360
361 queue_for_each_hw_ctx(q, hctx, i) {
362 kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
363 ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i);
364 if (ret)
365 break;
366
367 if (!hctx->nr_ctx)
368 continue;
369
370 hctx_for_each_ctx(hctx, ctx, j) {
371 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
372 ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
373 if (ret)
374 break;
375 }
376 }
377
378 if (ret) {
379 blk_mq_unregister_disk(disk);
380 return ret;
381 }
382
383 return 0;
384}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
new file mode 100644
index 000000000000..d64a02fb1f73
--- /dev/null
+++ b/block/blk-mq-tag.c
@@ -0,0 +1,204 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/percpu_ida.h>
4
5#include <linux/blk-mq.h>
6#include "blk.h"
7#include "blk-mq.h"
8#include "blk-mq-tag.h"
9
10/*
11 * Per tagged queue (tag address space) map
12 */
13struct blk_mq_tags {
14 unsigned int nr_tags;
15 unsigned int nr_reserved_tags;
16 unsigned int nr_batch_move;
17 unsigned int nr_max_cache;
18
19 struct percpu_ida free_tags;
20 struct percpu_ida reserved_tags;
21};
22
23void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
24{
25 int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
26 blk_mq_put_tag(tags, tag);
27}
28
29bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
30{
31 return !tags ||
32 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0;
33}
34
35static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp)
36{
37 int tag;
38
39 tag = percpu_ida_alloc(&tags->free_tags, gfp);
40 if (tag < 0)
41 return BLK_MQ_TAG_FAIL;
42 return tag + tags->nr_reserved_tags;
43}
44
45static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
46 gfp_t gfp)
47{
48 int tag;
49
50 if (unlikely(!tags->nr_reserved_tags)) {
51 WARN_ON_ONCE(1);
52 return BLK_MQ_TAG_FAIL;
53 }
54
55 tag = percpu_ida_alloc(&tags->reserved_tags, gfp);
56 if (tag < 0)
57 return BLK_MQ_TAG_FAIL;
58 return tag;
59}
60
61unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved)
62{
63 if (!reserved)
64 return __blk_mq_get_tag(tags, gfp);
65
66 return __blk_mq_get_reserved_tag(tags, gfp);
67}
68
69static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
70{
71 BUG_ON(tag >= tags->nr_tags);
72
73 percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags);
74}
75
76static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
77 unsigned int tag)
78{
79 BUG_ON(tag >= tags->nr_reserved_tags);
80
81 percpu_ida_free(&tags->reserved_tags, tag);
82}
83
84void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
85{
86 if (tag >= tags->nr_reserved_tags)
87 __blk_mq_put_tag(tags, tag);
88 else
89 __blk_mq_put_reserved_tag(tags, tag);
90}
91
92static int __blk_mq_tag_iter(unsigned id, void *data)
93{
94 unsigned long *tag_map = data;
95 __set_bit(id, tag_map);
96 return 0;
97}
98
99void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
100 void (*fn)(void *, unsigned long *), void *data)
101{
102 unsigned long *tag_map;
103 size_t map_size;
104
105 map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG;
106 tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC);
107 if (!tag_map)
108 return;
109
110 percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map);
111 if (tags->nr_reserved_tags)
112 percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter,
113 tag_map);
114
115 fn(data, tag_map);
116 kfree(tag_map);
117}
118
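
blk_mq_tag_busy_iter() hands its callback a bitmap in which set bits are the free tags, so busy tags are the zero bits; the timeout code in blk-mq.c later in this patch scans it exactly that way. A minimal hedged sketch of a caller-supplied callback; the counting struct and function names are illustrative, not part of this patch:

struct busy_count {
        unsigned int depth;     /* total tags, e.g. hctx->queue_depth */
        unsigned int busy;
};

/* Matches the void (*fn)(void *, unsigned long *) callback above. */
static void count_busy_tags(void *data, unsigned long *free_tags)
{
        struct busy_count *bc = data;
        unsigned int tag = 0;

        do {
                tag = find_next_zero_bit(free_tags, bc->depth, tag);
                if (tag >= bc->depth)
                        break;
                bc->busy++;     /* zero bit == tag not on any free list */
                tag++;
        } while (1);
}

/* Usage: blk_mq_tag_busy_iter(tags, count_busy_tags, &bc); */
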
119struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
120 unsigned int reserved_tags, int node)
121{
122 unsigned int nr_tags, nr_cache;
123 struct blk_mq_tags *tags;
124 int ret;
125
126 if (total_tags > BLK_MQ_TAG_MAX) {
127 pr_err("blk-mq: tag depth too large\n");
128 return NULL;
129 }
130
131 tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
132 if (!tags)
133 return NULL;
134
135 nr_tags = total_tags - reserved_tags;
136 nr_cache = nr_tags / num_possible_cpus();
137
138 if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
139 nr_cache = BLK_MQ_TAG_CACHE_MIN;
140 else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
141 nr_cache = BLK_MQ_TAG_CACHE_MAX;
142
143 tags->nr_tags = total_tags;
144 tags->nr_reserved_tags = reserved_tags;
145 tags->nr_max_cache = nr_cache;
146 tags->nr_batch_move = max(1u, nr_cache / 2);
147
148 ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags -
149 tags->nr_reserved_tags,
150 tags->nr_max_cache,
151 tags->nr_batch_move);
152 if (ret)
153 goto err_free_tags;
154
155 if (reserved_tags) {
156 /*
157 * With max_cache and batch set to 1, the allocator falls back to
158 * no caching. It's fine if reserved tag allocation is slow.
159 */
160 ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags,
161 1, 1);
162 if (ret)
163 goto err_reserved_tags;
164 }
165
166 return tags;
167
168err_reserved_tags:
169 percpu_ida_destroy(&tags->free_tags);
170err_free_tags:
171 kfree(tags);
172 return NULL;
173}
174
175void blk_mq_free_tags(struct blk_mq_tags *tags)
176{
177 percpu_ida_destroy(&tags->free_tags);
178 percpu_ida_destroy(&tags->reserved_tags);
179 kfree(tags);
180}
181
182ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
183{
184 char *orig_page = page;
185 int cpu;
186
187 if (!tags)
188 return 0;
189
190 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u,"
191 " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags,
192 tags->nr_batch_move, tags->nr_max_cache);
193
194 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n",
195 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids),
196 percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
197
198 for_each_possible_cpu(cpu) {
199 page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu,
200 percpu_ida_free_tags(&tags->free_tags, cpu));
201 }
202
203 return page - orig_page;
204}
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
new file mode 100644
index 000000000000..947ba2c6148e
--- /dev/null
+++ b/block/blk-mq-tag.h
@@ -0,0 +1,27 @@
1#ifndef INT_BLK_MQ_TAG_H
2#define INT_BLK_MQ_TAG_H
3
4struct blk_mq_tags;
5
6extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
7extern void blk_mq_free_tags(struct blk_mq_tags *tags);
8
9extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved);
10extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags);
11extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
12extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
13extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
14extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
15
16enum {
17 BLK_MQ_TAG_CACHE_MIN = 1,
18 BLK_MQ_TAG_CACHE_MAX = 64,
19};
20
21enum {
22 BLK_MQ_TAG_FAIL = -1U,
23 BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN,
24 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
25};
26
27#endif
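
The header above is the whole tag-map API: blk_mq_init_tags()/blk_mq_free_tags() bracket the map's lifetime, and each tag indexes a preallocated request slot (blk_mq_init_rq_map() in blk-mq.c below is the in-tree user). A hedged sketch of how a caller might drive it; the function name and error handling are illustrative:

static int example_use_tag(struct blk_mq_tags *tags)
{
        unsigned int tag;

        /* fast path: try to grab a normal (non-reserved) tag without sleeping */
        tag = blk_mq_get_tag(tags, GFP_ATOMIC, false);
        if (tag == BLK_MQ_TAG_FAIL) {
                /* sleep until at least one tag is free, then retry once */
                blk_mq_wait_for_tags(tags);
                tag = blk_mq_get_tag(tags, GFP_ATOMIC, false);
                if (tag == BLK_MQ_TAG_FAIL)
                        return -EBUSY;
        }

        /* ... use the tag as an index into a preallocated command array ... */

        blk_mq_put_tag(tags, tag);
        return 0;
}
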
diff --git a/block/blk-mq.c b/block/blk-mq.c
new file mode 100644
index 000000000000..88d4e864d4c0
--- /dev/null
+++ b/block/blk-mq.c
@@ -0,0 +1,1500 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/backing-dev.h>
4#include <linux/bio.h>
5#include <linux/blkdev.h>
6#include <linux/mm.h>
7#include <linux/init.h>
8#include <linux/slab.h>
9#include <linux/workqueue.h>
10#include <linux/smp.h>
11#include <linux/llist.h>
12#include <linux/list_sort.h>
13#include <linux/cpu.h>
14#include <linux/cache.h>
15#include <linux/sched/sysctl.h>
16#include <linux/delay.h>
17
18#include <trace/events/block.h>
19
20#include <linux/blk-mq.h>
21#include "blk.h"
22#include "blk-mq.h"
23#include "blk-mq-tag.h"
24
25static DEFINE_MUTEX(all_q_mutex);
26static LIST_HEAD(all_q_list);
27
28static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
29
30DEFINE_PER_CPU(struct llist_head, ipi_lists);
31
32static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
33 unsigned int cpu)
34{
35 return per_cpu_ptr(q->queue_ctx, cpu);
36}
37
38/*
39 * This assumes per-cpu software queues. They could be per-node
40 * as well, for instance. For now this is hardcoded as-is. Note that we don't
41 * care about preemption, since we know the ctx's are persistent. This does
42 * mean that we can't rely on ctx always matching the currently running CPU.
43 */
44static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
45{
46 return __blk_mq_get_ctx(q, get_cpu());
47}
48
49static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
50{
51 put_cpu();
52}
53
54/*
55 * Check if any of the ctx's have pending work in this hardware queue
56 */
57static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
58{
59 unsigned int i;
60
61 for (i = 0; i < hctx->nr_ctx_map; i++)
62 if (hctx->ctx_map[i])
63 return true;
64
65 return false;
66}
67
68/*
69 * Mark this ctx as having pending work in this hardware queue
70 */
71static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
72 struct blk_mq_ctx *ctx)
73{
74 if (!test_bit(ctx->index_hw, hctx->ctx_map))
75 set_bit(ctx->index_hw, hctx->ctx_map);
76}
77
78static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp,
79 bool reserved)
80{
81 struct request *rq;
82 unsigned int tag;
83
84 tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
85 if (tag != BLK_MQ_TAG_FAIL) {
86 rq = hctx->rqs[tag];
87 rq->tag = tag;
88
89 return rq;
90 }
91
92 return NULL;
93}
94
95static int blk_mq_queue_enter(struct request_queue *q)
96{
97 int ret;
98
99 __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
100 smp_wmb();
101 /* we have problems freezing the queue while it's still initializing */
102 if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
103 return 0;
104
105 __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
106
107 spin_lock_irq(q->queue_lock);
108 ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
109 !blk_queue_bypass(q), *q->queue_lock);
110 /* inc usage with the lock held so freeze_queue can't run here */
111 if (!ret)
112 __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
113 spin_unlock_irq(q->queue_lock);
114
115 return ret;
116}
117
118static void blk_mq_queue_exit(struct request_queue *q)
119{
120 __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
121}
122
123/*
124 * Guarantee no request is in use, so we can change any data structure of
125 * the queue afterward.
126 */
127static void blk_mq_freeze_queue(struct request_queue *q)
128{
129 bool drain;
130
131 spin_lock_irq(q->queue_lock);
132 drain = !q->bypass_depth++;
133 queue_flag_set(QUEUE_FLAG_BYPASS, q);
134 spin_unlock_irq(q->queue_lock);
135
136 if (!drain)
137 return;
138
139 while (true) {
140 s64 count;
141
142 spin_lock_irq(q->queue_lock);
143 count = percpu_counter_sum(&q->mq_usage_counter);
144 spin_unlock_irq(q->queue_lock);
145
146 if (count == 0)
147 break;
148 blk_mq_run_queues(q, false);
149 msleep(10);
150 }
151}
152
153static void blk_mq_unfreeze_queue(struct request_queue *q)
154{
155 bool wake = false;
156
157 spin_lock_irq(q->queue_lock);
158 if (!--q->bypass_depth) {
159 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
160 wake = true;
161 }
162 WARN_ON_ONCE(q->bypass_depth < 0);
163 spin_unlock_irq(q->queue_lock);
164 if (wake)
165 wake_up_all(&q->mq_freeze_wq);
166}
167
168bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
169{
170 return blk_mq_has_free_tags(hctx->tags);
171}
172EXPORT_SYMBOL(blk_mq_can_queue);
173
174static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq,
175 unsigned int rw_flags)
176{
177 rq->mq_ctx = ctx;
178 rq->cmd_flags = rw_flags;
179 ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
180}
181
182static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
183 gfp_t gfp, bool reserved)
184{
185 return blk_mq_alloc_rq(hctx, gfp, reserved);
186}
187
188static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
189 int rw, gfp_t gfp,
190 bool reserved)
191{
192 struct request *rq;
193
194 do {
195 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
196 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
197
198 rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
199 if (rq) {
200 blk_mq_rq_ctx_init(ctx, rq, rw);
201 break;
202 } else if (!(gfp & __GFP_WAIT))
203 break;
204
205 blk_mq_put_ctx(ctx);
206 __blk_mq_run_hw_queue(hctx);
207 blk_mq_wait_for_tags(hctx->tags);
208 } while (1);
209
210 return rq;
211}
212
213struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
214 gfp_t gfp, bool reserved)
215{
216 struct request *rq;
217
218 if (blk_mq_queue_enter(q))
219 return NULL;
220
221 rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
222 blk_mq_put_ctx(rq->mq_ctx);
223 return rq;
224}
225
226struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
227 gfp_t gfp)
228{
229 struct request *rq;
230
231 if (blk_mq_queue_enter(q))
232 return NULL;
233
234 rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
235 blk_mq_put_ctx(rq->mq_ctx);
236 return rq;
237}
238EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
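
Reserved tags let a driver keep making forward progress on a handful of internal commands even when the normal tag space is exhausted. A hedged sketch of how a driver might use the exported helper; "mydrv" and the notion of an internal command are illustrative, and completion still flows through the driver's ->queue_rq() and blk_mq_end_io() as usual:

/* Assumes the driver asked for reserved_tags >= 1 in its blk_mq_reg. */
static int mydrv_issue_internal_cmd(struct request_queue *q)
{
        struct request *rq;

        rq = blk_mq_alloc_reserved_request(q, READ, GFP_KERNEL);
        if (!rq)
                return -ENOMEM;

        /* rq->special already points at the per-request PDU if cmd_size was set */

        /* queue it and kick the hardware queue right away */
        blk_mq_insert_request(q, rq, true);
        return 0;
}
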
239
240/*
241 * Re-init and set pdu, if we have it
242 */
243static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
244{
245 blk_rq_init(hctx->queue, rq);
246
247 if (hctx->cmd_size)
248 rq->special = blk_mq_rq_to_pdu(rq);
249}
250
251static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
252 struct blk_mq_ctx *ctx, struct request *rq)
253{
254 const int tag = rq->tag;
255 struct request_queue *q = rq->q;
256
257 blk_mq_rq_init(hctx, rq);
258 blk_mq_put_tag(hctx->tags, tag);
259
260 blk_mq_queue_exit(q);
261}
262
263void blk_mq_free_request(struct request *rq)
264{
265 struct blk_mq_ctx *ctx = rq->mq_ctx;
266 struct blk_mq_hw_ctx *hctx;
267 struct request_queue *q = rq->q;
268
269 ctx->rq_completed[rq_is_sync(rq)]++;
270
271 hctx = q->mq_ops->map_queue(q, ctx->cpu);
272 __blk_mq_free_request(hctx, ctx, rq);
273}
274
275static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
276{
277 if (error)
278 clear_bit(BIO_UPTODATE, &bio->bi_flags);
279 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
280 error = -EIO;
281
282 if (unlikely(rq->cmd_flags & REQ_QUIET))
283 set_bit(BIO_QUIET, &bio->bi_flags);
284
285 /* don't actually finish bio if it's part of flush sequence */
286 if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
287 bio_endio(bio, error);
288}
289
290void blk_mq_complete_request(struct request *rq, int error)
291{
292 struct bio *bio = rq->bio;
293 unsigned int bytes = 0;
294
295 trace_block_rq_complete(rq->q, rq);
296
297 while (bio) {
298 struct bio *next = bio->bi_next;
299
300 bio->bi_next = NULL;
301 bytes += bio->bi_size;
302 blk_mq_bio_endio(rq, bio, error);
303 bio = next;
304 }
305
306 blk_account_io_completion(rq, bytes);
307
308 if (rq->end_io)
309 rq->end_io(rq, error);
310 else
311 blk_mq_free_request(rq);
312
313 blk_account_io_done(rq);
314}
315
316void __blk_mq_end_io(struct request *rq, int error)
317{
318 if (!blk_mark_rq_complete(rq))
319 blk_mq_complete_request(rq, error);
320}
321
322#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
323
324/*
325 * Called with interrupts disabled.
326 */
327static void ipi_end_io(void *data)
328{
329 struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id());
330 struct llist_node *entry, *next;
331 struct request *rq;
332
333 entry = llist_del_all(list);
334
335 while (entry) {
336 next = entry->next;
337 rq = llist_entry(entry, struct request, ll_list);
338 __blk_mq_end_io(rq, rq->errors);
339 entry = next;
340 }
341}
342
343static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
344 struct request *rq, const int error)
345{
346 struct call_single_data *data = &rq->csd;
347
348 rq->errors = error;
349 rq->ll_list.next = NULL;
350
351 /*
352 * If the list is non-empty, an existing IPI must already
353 * be "in flight". If that is the case, we need not schedule
354 * a new one.
355 */
356 if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {
357 data->func = ipi_end_io;
358 data->flags = 0;
359 __smp_call_function_single(ctx->cpu, data, 0);
360 }
361
362 return true;
363}
364#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
365static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
366 struct request *rq, const int error)
367{
368 return false;
369}
370#endif
371
372/*
373 * End IO on this request on a multiqueue enabled driver. We'll either do
374 * it directly inline, or punt to a local IPI handler on the matching
375 * remote CPU.
376 */
377void blk_mq_end_io(struct request *rq, int error)
378{
379 struct blk_mq_ctx *ctx = rq->mq_ctx;
380 int cpu;
381
382 if (!ctx->ipi_redirect)
383 return __blk_mq_end_io(rq, error);
384
385 cpu = get_cpu();
386
387 if (cpu == ctx->cpu || !cpu_online(ctx->cpu) ||
388 !ipi_remote_cpu(ctx, cpu, rq, error))
389 __blk_mq_end_io(rq, error);
390
391 put_cpu();
392}
393EXPORT_SYMBOL(blk_mq_end_io);
394
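
blk_mq_end_io() is the driver-facing completion entry point: it completes inline, or bounces the request to the submitting CPU through the per-cpu IPI list when ctx->ipi_redirect is set. A hedged sketch of a driver completion handler; mydrv_cmd and its rq back-pointer are illustrative:

static void mydrv_complete_cmd(struct mydrv_cmd *cmd, int hw_status)
{
        struct request *rq = cmd->rq;   /* illustrative back-pointer set at init time */
        int error = hw_status ? -EIO : 0;

        /* may finish here or via IPI on the CPU that submitted the request */
        blk_mq_end_io(rq, error);
}
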
395static void blk_mq_start_request(struct request *rq)
396{
397 struct request_queue *q = rq->q;
398
399 trace_block_rq_issue(q, rq);
400
401 /*
402 * Just mark start time and set the started bit. Due to memory
403 * ordering, we know we'll see the correct deadline as long as
404 * REQ_ATOM_STARTED is seen.
405 */
406 rq->deadline = jiffies + q->rq_timeout;
407 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
408}
409
410static void blk_mq_requeue_request(struct request *rq)
411{
412 struct request_queue *q = rq->q;
413
414 trace_block_rq_requeue(q, rq);
415 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
416}
417
418struct blk_mq_timeout_data {
419 struct blk_mq_hw_ctx *hctx;
420 unsigned long *next;
421 unsigned int *next_set;
422};
423
424static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
425{
426 struct blk_mq_timeout_data *data = __data;
427 struct blk_mq_hw_ctx *hctx = data->hctx;
428 unsigned int tag;
429
430 /* The request may not be in flight yet (this is where
431 * the REQ_ATOM_STARTED flag comes in). The requests are
432 * statically allocated, so we know it's always safe to access the
433 * memory associated with a bit offset into ->rqs[].
434 */
435 tag = 0;
436 do {
437 struct request *rq;
438
439 tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
440 if (tag >= hctx->queue_depth)
441 break;
442
443 rq = hctx->rqs[tag++];
444
445 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
446 continue;
447
448 blk_rq_check_expired(rq, data->next, data->next_set);
449 } while (1);
450}
451
452static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
453 unsigned long *next,
454 unsigned int *next_set)
455{
456 struct blk_mq_timeout_data data = {
457 .hctx = hctx,
458 .next = next,
459 .next_set = next_set,
460 };
461
462 /*
463 * Ask the tagging code to iterate busy requests, so we can
464 * check them for timeout.
465 */
466 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
467}
468
469static void blk_mq_rq_timer(unsigned long data)
470{
471 struct request_queue *q = (struct request_queue *) data;
472 struct blk_mq_hw_ctx *hctx;
473 unsigned long next = 0;
474 int i, next_set = 0;
475
476 queue_for_each_hw_ctx(q, hctx, i)
477 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
478
479 if (next_set)
480 mod_timer(&q->timeout, round_jiffies_up(next));
481}
482
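
The timer above walks every hardware queue and, through blk_rq_check_expired(), lands in the driver's timeout handler (wired up from reg->ops->timeout in blk_mq_init_queue() below). A hedged sketch of such a handler; the rq_timed_out_fn prototype comes from the existing request layer, and mydrv_cmd_still_in_flight() is a placeholder:

static enum blk_eh_timer_return mydrv_timeout(struct request *rq)
{
        /*
         * Hardware is still working on it: ask for more time. For blk-mq,
         * BLK_EH_RESET_TIMER re-arms via blk_mq_add_timer() (see blk-timeout.c).
         */
        if (mydrv_cmd_still_in_flight(rq))      /* placeholder helper */
                return BLK_EH_RESET_TIMER;

        /*
         * Give up: blk-mq completes the request with rq->errors through
         * blk_mq_complete_request().
         */
        rq->errors = -ETIMEDOUT;
        return BLK_EH_HANDLED;
}
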
483/*
484 * Reverse check our software queue for entries that we could potentially
485 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
486 * too much time checking for merges.
487 */
488static bool blk_mq_attempt_merge(struct request_queue *q,
489 struct blk_mq_ctx *ctx, struct bio *bio)
490{
491 struct request *rq;
492 int checked = 8;
493
494 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
495 int el_ret;
496
497 if (!checked--)
498 break;
499
500 if (!blk_rq_merge_ok(rq, bio))
501 continue;
502
503 el_ret = blk_try_merge(rq, bio);
504 if (el_ret == ELEVATOR_BACK_MERGE) {
505 if (bio_attempt_back_merge(q, rq, bio)) {
506 ctx->rq_merged++;
507 return true;
508 }
509 break;
510 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
511 if (bio_attempt_front_merge(q, rq, bio)) {
512 ctx->rq_merged++;
513 return true;
514 }
515 break;
516 }
517 }
518
519 return false;
520}
521
522void blk_mq_add_timer(struct request *rq)
523{
524 __blk_add_timer(rq, NULL);
525}
526
527/*
528 * Run this hardware queue, pulling any software queues mapped to it in.
529 * Note that this function currently has various problems around ordering
530 * of IO. In particular, we'd like FIFO behaviour on handling existing
531 * items on the hctx->dispatch list. Ignore that for now.
532 */
533static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
534{
535 struct request_queue *q = hctx->queue;
536 struct blk_mq_ctx *ctx;
537 struct request *rq;
538 LIST_HEAD(rq_list);
539 int bit, queued;
540
541 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
542 return;
543
544 hctx->run++;
545
546 /*
547 * Touch any software queue that has pending entries.
548 */
549 for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
550 clear_bit(bit, hctx->ctx_map);
551 ctx = hctx->ctxs[bit];
552 BUG_ON(bit != ctx->index_hw);
553
554 spin_lock(&ctx->lock);
555 list_splice_tail_init(&ctx->rq_list, &rq_list);
556 spin_unlock(&ctx->lock);
557 }
558
559 /*
560 * If we have previous entries on our dispatch list, grab them
561 * and stuff them at the front for more fair dispatch.
562 */
563 if (!list_empty_careful(&hctx->dispatch)) {
564 spin_lock(&hctx->lock);
565 if (!list_empty(&hctx->dispatch))
566 list_splice_init(&hctx->dispatch, &rq_list);
567 spin_unlock(&hctx->lock);
568 }
569
570 /*
571 * Delete and return all entries from our dispatch list
572 */
573 queued = 0;
574
575 /*
576 * Now process all the entries, sending them to the driver.
577 */
578 while (!list_empty(&rq_list)) {
579 int ret;
580
581 rq = list_first_entry(&rq_list, struct request, queuelist);
582 list_del_init(&rq->queuelist);
583 blk_mq_start_request(rq);
584
585 /*
586 * Last request in the series. Flag it as such; this
587 * enables drivers to know when IO should be kicked off,
588 * if they don't do it on a per-request basis.
589 *
590 * Note: the flag isn't the only condition on which drivers
591 * should kick off IO. If the drive is busy, the last
592 * request might not have the bit set.
593 */
594 if (list_empty(&rq_list))
595 rq->cmd_flags |= REQ_END;
596
597 ret = q->mq_ops->queue_rq(hctx, rq);
598 switch (ret) {
599 case BLK_MQ_RQ_QUEUE_OK:
600 queued++;
601 continue;
602 case BLK_MQ_RQ_QUEUE_BUSY:
603 /*
604 * FIXME: we should have a mechanism to stop the queue
605 * like blk_stop_queue, otherwise we will waste cpu
606 * time
607 */
608 list_add(&rq->queuelist, &rq_list);
609 blk_mq_requeue_request(rq);
610 break;
611 default:
612 pr_err("blk-mq: bad return on queue: %d\n", ret);
613 rq->errors = -EIO;
614 case BLK_MQ_RQ_QUEUE_ERROR:
615 blk_mq_end_io(rq, rq->errors);
616 break;
617 }
618
619 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
620 break;
621 }
622
623 if (!queued)
624 hctx->dispatched[0]++;
625 else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
626 hctx->dispatched[ilog2(queued) + 1]++;
627
628 /*
629 * Any items that need requeuing? Stuff them into hctx->dispatch,
630 * that is where we will continue on next queue run.
631 */
632 if (!list_empty(&rq_list)) {
633 spin_lock(&hctx->lock);
634 list_splice(&rq_list, &hctx->dispatch);
635 spin_unlock(&hctx->lock);
636 }
637}
638
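
The dispatch loop above defines the contract for the driver's ->queue_rq() hook: BLK_MQ_RQ_QUEUE_OK means the command was accepted, BLK_MQ_RQ_QUEUE_BUSY puts the request back on hctx->dispatch for a later run, and BLK_MQ_RQ_QUEUE_ERROR ends it via blk_mq_end_io() with rq->errors. A hedged driver-side sketch; mydrv_cmd and the two hardware helpers are placeholders:

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
        struct mydrv_cmd *cmd = rq->special;    /* per-request PDU, see blk_mq_rq_init() */

        if (!mydrv_hw_has_room(hctx))           /* placeholder helper */
                return BLK_MQ_RQ_QUEUE_BUSY;    /* blk-mq requeues and retries later */

        if (mydrv_hw_submit(cmd))               /* placeholder helper */
                return BLK_MQ_RQ_QUEUE_ERROR;   /* blk-mq ends it with rq->errors */

        /* the completion interrupt later calls blk_mq_end_io(rq, error) */
        return BLK_MQ_RQ_QUEUE_OK;
}
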
639void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
640{
641 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
642 return;
643
644 if (!async)
645 __blk_mq_run_hw_queue(hctx);
646 else {
647 struct request_queue *q = hctx->queue;
648
649 kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
650 }
651}
652
653void blk_mq_run_queues(struct request_queue *q, bool async)
654{
655 struct blk_mq_hw_ctx *hctx;
656 int i;
657
658 queue_for_each_hw_ctx(q, hctx, i) {
659 if ((!blk_mq_hctx_has_pending(hctx) &&
660 list_empty_careful(&hctx->dispatch)) ||
661 test_bit(BLK_MQ_S_STOPPED, &hctx->state))
662 continue;
663
664 blk_mq_run_hw_queue(hctx, async);
665 }
666}
667EXPORT_SYMBOL(blk_mq_run_queues);
668
669void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
670{
671 cancel_delayed_work(&hctx->delayed_work);
672 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
673}
674EXPORT_SYMBOL(blk_mq_stop_hw_queue);
675
676void blk_mq_stop_hw_queues(struct request_queue *q)
677{
678 struct blk_mq_hw_ctx *hctx;
679 int i;
680
681 queue_for_each_hw_ctx(q, hctx, i)
682 blk_mq_stop_hw_queue(hctx);
683}
684EXPORT_SYMBOL(blk_mq_stop_hw_queues);
685
686void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
687{
688 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
689 __blk_mq_run_hw_queue(hctx);
690}
691EXPORT_SYMBOL(blk_mq_start_hw_queue);
692
693void blk_mq_start_stopped_hw_queues(struct request_queue *q)
694{
695 struct blk_mq_hw_ctx *hctx;
696 int i;
697
698 queue_for_each_hw_ctx(q, hctx, i) {
699 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
700 continue;
701
702 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
703 blk_mq_run_hw_queue(hctx, true);
704 }
705}
706EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
707
708static void blk_mq_work_fn(struct work_struct *work)
709{
710 struct blk_mq_hw_ctx *hctx;
711
712 hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
713 __blk_mq_run_hw_queue(hctx);
714}
715
716static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
717 struct request *rq)
718{
719 struct blk_mq_ctx *ctx = rq->mq_ctx;
720
721 list_add_tail(&rq->queuelist, &ctx->rq_list);
722 blk_mq_hctx_mark_pending(hctx, ctx);
723
724 /*
725 * We do this early, to ensure we are on the right CPU.
726 */
727 blk_mq_add_timer(rq);
728}
729
730void blk_mq_insert_request(struct request_queue *q, struct request *rq,
731 bool run_queue)
732{
733 struct blk_mq_hw_ctx *hctx;
734 struct blk_mq_ctx *ctx, *current_ctx;
735
736 ctx = rq->mq_ctx;
737 hctx = q->mq_ops->map_queue(q, ctx->cpu);
738
739 if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
740 blk_insert_flush(rq);
741 } else {
742 current_ctx = blk_mq_get_ctx(q);
743
744 if (!cpu_online(ctx->cpu)) {
745 ctx = current_ctx;
746 hctx = q->mq_ops->map_queue(q, ctx->cpu);
747 rq->mq_ctx = ctx;
748 }
749 spin_lock(&ctx->lock);
750 __blk_mq_insert_request(hctx, rq);
751 spin_unlock(&ctx->lock);
752
753 blk_mq_put_ctx(current_ctx);
754 }
755
756 if (run_queue)
757 __blk_mq_run_hw_queue(hctx);
758}
759EXPORT_SYMBOL(blk_mq_insert_request);
760
761/*
762 * This is a special version of blk_mq_insert_request that bypasses the FLUSH
763 * request check. It should only be used internally.
764 */
765void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
766{
767 struct request_queue *q = rq->q;
768 struct blk_mq_hw_ctx *hctx;
769 struct blk_mq_ctx *ctx, *current_ctx;
770
771 current_ctx = blk_mq_get_ctx(q);
772
773 ctx = rq->mq_ctx;
774 if (!cpu_online(ctx->cpu)) {
775 ctx = current_ctx;
776 rq->mq_ctx = ctx;
777 }
778 hctx = q->mq_ops->map_queue(q, ctx->cpu);
779
780 /* ctx->cpu might be offline */
781 spin_lock(&ctx->lock);
782 __blk_mq_insert_request(hctx, rq);
783 spin_unlock(&ctx->lock);
784
785 blk_mq_put_ctx(current_ctx);
786
787 if (run_queue)
788 blk_mq_run_hw_queue(hctx, async);
789}
790
791static void blk_mq_insert_requests(struct request_queue *q,
792 struct blk_mq_ctx *ctx,
793 struct list_head *list,
794 int depth,
795 bool from_schedule)
796
797{
798 struct blk_mq_hw_ctx *hctx;
799 struct blk_mq_ctx *current_ctx;
800
801 trace_block_unplug(q, depth, !from_schedule);
802
803 current_ctx = blk_mq_get_ctx(q);
804
805 if (!cpu_online(ctx->cpu))
806 ctx = current_ctx;
807 hctx = q->mq_ops->map_queue(q, ctx->cpu);
808
809 /*
810 * preemption doesn't flush the plug list, so it's possible ctx->cpu is
811 * offline now
812 */
813 spin_lock(&ctx->lock);
814 while (!list_empty(list)) {
815 struct request *rq;
816
817 rq = list_first_entry(list, struct request, queuelist);
818 list_del_init(&rq->queuelist);
819 rq->mq_ctx = ctx;
820 __blk_mq_insert_request(hctx, rq);
821 }
822 spin_unlock(&ctx->lock);
823
824 blk_mq_put_ctx(current_ctx);
825
826 blk_mq_run_hw_queue(hctx, from_schedule);
827}
828
829static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
830{
831 struct request *rqa = container_of(a, struct request, queuelist);
832 struct request *rqb = container_of(b, struct request, queuelist);
833
834 return !(rqa->mq_ctx < rqb->mq_ctx ||
835 (rqa->mq_ctx == rqb->mq_ctx &&
836 blk_rq_pos(rqa) < blk_rq_pos(rqb)));
837}
838
839void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
840{
841 struct blk_mq_ctx *this_ctx;
842 struct request_queue *this_q;
843 struct request *rq;
844 LIST_HEAD(list);
845 LIST_HEAD(ctx_list);
846 unsigned int depth;
847
848 list_splice_init(&plug->mq_list, &list);
849
850 list_sort(NULL, &list, plug_ctx_cmp);
851
852 this_q = NULL;
853 this_ctx = NULL;
854 depth = 0;
855
856 while (!list_empty(&list)) {
857 rq = list_entry_rq(list.next);
858 list_del_init(&rq->queuelist);
859 BUG_ON(!rq->q);
860 if (rq->mq_ctx != this_ctx) {
861 if (this_ctx) {
862 blk_mq_insert_requests(this_q, this_ctx,
863 &ctx_list, depth,
864 from_schedule);
865 }
866
867 this_ctx = rq->mq_ctx;
868 this_q = rq->q;
869 depth = 0;
870 }
871
872 depth++;
873 list_add_tail(&rq->queuelist, &ctx_list);
874 }
875
876 /*
877 * If 'this_ctx' is set, we know we have entries to complete
878 * on 'ctx_list'. Do those.
879 */
880 if (this_ctx) {
881 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
882 from_schedule);
883 }
884}
885
886static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
887{
888 init_request_from_bio(rq, bio);
889 blk_account_io_start(rq, 1);
890}
891
892static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
893{
894 struct blk_mq_hw_ctx *hctx;
895 struct blk_mq_ctx *ctx;
896 const int is_sync = rw_is_sync(bio->bi_rw);
897 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
898 int rw = bio_data_dir(bio);
899 struct request *rq;
900 unsigned int use_plug, request_count = 0;
901
902 /*
903 * If we have multiple hardware queues, just go directly to
904 * one of those for sync IO.
905 */
906 use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);
907
908 blk_queue_bounce(q, &bio);
909
910 if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
911 return;
912
913 if (blk_mq_queue_enter(q)) {
914 bio_endio(bio, -EIO);
915 return;
916 }
917
918 ctx = blk_mq_get_ctx(q);
919 hctx = q->mq_ops->map_queue(q, ctx->cpu);
920
921 trace_block_getrq(q, bio, rw);
922 rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
923 if (likely(rq))
924 blk_mq_rq_ctx_init(ctx, rq, rw);
925 else {
926 blk_mq_put_ctx(ctx);
927 trace_block_sleeprq(q, bio, rw);
928 rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
929 false);
930 ctx = rq->mq_ctx;
931 hctx = q->mq_ops->map_queue(q, ctx->cpu);
932 }
933
934 hctx->queued++;
935
936 if (unlikely(is_flush_fua)) {
937 blk_mq_bio_to_request(rq, bio);
938 blk_mq_put_ctx(ctx);
939 blk_insert_flush(rq);
940 goto run_queue;
941 }
942
943 /*
944 * If a task plug exists, use it: since plugging is completely lockless,
945 * we can temporarily store requests there until the task is either
946 * done or scheduled away.
947 */
948 if (use_plug) {
949 struct blk_plug *plug = current->plug;
950
951 if (plug) {
952 blk_mq_bio_to_request(rq, bio);
953 if (list_empty(&plug->mq_list))
954 trace_block_plug(q);
955 else if (request_count >= BLK_MAX_REQUEST_COUNT) {
956 blk_flush_plug_list(plug, false);
957 trace_block_plug(q);
958 }
959 list_add_tail(&rq->queuelist, &plug->mq_list);
960 blk_mq_put_ctx(ctx);
961 return;
962 }
963 }
964
965 spin_lock(&ctx->lock);
966
967 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
968 blk_mq_attempt_merge(q, ctx, bio))
969 __blk_mq_free_request(hctx, ctx, rq);
970 else {
971 blk_mq_bio_to_request(rq, bio);
972 __blk_mq_insert_request(hctx, rq);
973 }
974
975 spin_unlock(&ctx->lock);
976 blk_mq_put_ctx(ctx);
977
978 /*
979 * For a SYNC request, send it to the hardware immediately. For an
980 * ASYNC request, just ensure that we run it later on. The latter
981 * allows for merging opportunities and more efficient dispatching.
982 */
983run_queue:
984 blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
985}
986
987/*
988 * Default mapping to a software queue, since we use one per CPU.
989 */
990struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
991{
992 return q->queue_hw_ctx[q->mq_map[cpu]];
993}
994EXPORT_SYMBOL(blk_mq_map_queue);
995
996struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
997 unsigned int hctx_index)
998{
999 return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
1000 GFP_KERNEL | __GFP_ZERO, reg->numa_node);
1001}
1002EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
1003
1004void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
1005 unsigned int hctx_index)
1006{
1007 kfree(hctx);
1008}
1009EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
1010
1011static void blk_mq_hctx_notify(void *data, unsigned long action,
1012 unsigned int cpu)
1013{
1014 struct blk_mq_hw_ctx *hctx = data;
1015 struct blk_mq_ctx *ctx;
1016 LIST_HEAD(tmp);
1017
1018 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1019 return;
1020
1021 /*
1022 * Move ctx entries to new CPU, if this one is going away.
1023 */
1024 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
1025
1026 spin_lock(&ctx->lock);
1027 if (!list_empty(&ctx->rq_list)) {
1028 list_splice_init(&ctx->rq_list, &tmp);
1029 clear_bit(ctx->index_hw, hctx->ctx_map);
1030 }
1031 spin_unlock(&ctx->lock);
1032
1033 if (list_empty(&tmp))
1034 return;
1035
1036 ctx = blk_mq_get_ctx(hctx->queue);
1037 spin_lock(&ctx->lock);
1038
1039 while (!list_empty(&tmp)) {
1040 struct request *rq;
1041
1042 rq = list_first_entry(&tmp, struct request, queuelist);
1043 rq->mq_ctx = ctx;
1044 list_move_tail(&rq->queuelist, &ctx->rq_list);
1045 }
1046
1047 blk_mq_hctx_mark_pending(hctx, ctx);
1048
1049 spin_unlock(&ctx->lock);
1050 blk_mq_put_ctx(ctx);
1051}
1052
1053static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
1054 void (*init)(void *, struct blk_mq_hw_ctx *,
1055 struct request *, unsigned int),
1056 void *data)
1057{
1058 unsigned int i;
1059
1060 for (i = 0; i < hctx->queue_depth; i++) {
1061 struct request *rq = hctx->rqs[i];
1062
1063 init(data, hctx, rq, i);
1064 }
1065}
1066
1067void blk_mq_init_commands(struct request_queue *q,
1068 void (*init)(void *, struct blk_mq_hw_ctx *,
1069 struct request *, unsigned int),
1070 void *data)
1071{
1072 struct blk_mq_hw_ctx *hctx;
1073 unsigned int i;
1074
1075 queue_for_each_hw_ctx(q, hctx, i)
1076 blk_mq_init_hw_commands(hctx, init, data);
1077}
1078EXPORT_SYMBOL(blk_mq_init_commands);
1079
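
blk_mq_init_commands() lets a driver walk every preallocated request once, typically to set up the per-request PDU that blk_mq_rq_init() pointed rq->special at. A hedged sketch of the init callback; the PDU layout and field names are illustrative:

static void mydrv_init_cmd(void *data, struct blk_mq_hw_ctx *hctx,
                           struct request *rq, unsigned int index)
{
        struct mydrv *drv = data;                       /* pointer passed to blk_mq_init_commands() */
        struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);   /* same memory rq->special points at */

        cmd->rq = rq;           /* illustrative back-pointer used in the earlier sketches */
        cmd->index = index;
        cmd->drv = drv;
        /* e.g. preallocate per-command DMA buffers here */
}

/* Usage, after blk_mq_init_queue(): blk_mq_init_commands(q, mydrv_init_cmd, drv); */
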
1080static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
1081{
1082 struct page *page;
1083
1084 while (!list_empty(&hctx->page_list)) {
1085 page = list_first_entry(&hctx->page_list, struct page, list);
1086 list_del_init(&page->list);
1087 __free_pages(page, page->private);
1088 }
1089
1090 kfree(hctx->rqs);
1091
1092 if (hctx->tags)
1093 blk_mq_free_tags(hctx->tags);
1094}
1095
1096static size_t order_to_size(unsigned int order)
1097{
1098 size_t ret = PAGE_SIZE;
1099
1100 while (order--)
1101 ret *= 2;
1102
1103 return ret;
1104}
1105
1106static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
1107 unsigned int reserved_tags, int node)
1108{
1109 unsigned int i, j, entries_per_page, max_order = 4;
1110 size_t rq_size, left;
1111
1112 INIT_LIST_HEAD(&hctx->page_list);
1113
1114 hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
1115 GFP_KERNEL, node);
1116 if (!hctx->rqs)
1117 return -ENOMEM;
1118
1119 /*
1120 * rq_size is the size of the request plus driver payload, rounded
1121 * to the cacheline size
1122 */
1123 rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
1124 cache_line_size());
1125 left = rq_size * hctx->queue_depth;
1126
1127 for (i = 0; i < hctx->queue_depth;) {
1128 int this_order = max_order;
1129 struct page *page;
1130 int to_do;
1131 void *p;
1132
1133 while (left < order_to_size(this_order - 1) && this_order)
1134 this_order--;
1135
1136 do {
1137 page = alloc_pages_node(node, GFP_KERNEL, this_order);
1138 if (page)
1139 break;
1140 if (!this_order--)
1141 break;
1142 if (order_to_size(this_order) < rq_size)
1143 break;
1144 } while (1);
1145
1146 if (!page)
1147 break;
1148
1149 page->private = this_order;
1150 list_add_tail(&page->list, &hctx->page_list);
1151
1152 p = page_address(page);
1153 entries_per_page = order_to_size(this_order) / rq_size;
1154 to_do = min(entries_per_page, hctx->queue_depth - i);
1155 left -= to_do * rq_size;
1156 for (j = 0; j < to_do; j++) {
1157 hctx->rqs[i] = p;
1158 blk_mq_rq_init(hctx, hctx->rqs[i]);
1159 p += rq_size;
1160 i++;
1161 }
1162 }
1163
1164 if (i < (reserved_tags + BLK_MQ_TAG_MIN))
1165 goto err_rq_map;
1166 else if (i != hctx->queue_depth) {
1167 hctx->queue_depth = i;
1168 pr_warn("%s: queue depth set to %u because of low memory\n",
1169 __func__, i);
1170 }
1171
1172 hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
1173 if (!hctx->tags) {
1174err_rq_map:
1175 blk_mq_free_rq_map(hctx);
1176 return -ENOMEM;
1177 }
1178
1179 return 0;
1180}
1181
1182static int blk_mq_init_hw_queues(struct request_queue *q,
1183 struct blk_mq_reg *reg, void *driver_data)
1184{
1185 struct blk_mq_hw_ctx *hctx;
1186 unsigned int i, j;
1187
1188 /*
1189 * Initialize hardware queues
1190 */
1191 queue_for_each_hw_ctx(q, hctx, i) {
1192 unsigned int num_maps;
1193 int node;
1194
1195 node = hctx->numa_node;
1196 if (node == NUMA_NO_NODE)
1197 node = hctx->numa_node = reg->numa_node;
1198
1199 INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
1200 spin_lock_init(&hctx->lock);
1201 INIT_LIST_HEAD(&hctx->dispatch);
1202 hctx->queue = q;
1203 hctx->queue_num = i;
1204 hctx->flags = reg->flags;
1205 hctx->queue_depth = reg->queue_depth;
1206 hctx->cmd_size = reg->cmd_size;
1207
1208 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1209 blk_mq_hctx_notify, hctx);
1210 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1211
1212 if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
1213 break;
1214
1215 /*
1216 * Allocate space for all possible CPUs to avoid allocation at
1217 * runtime
1218 */
1219 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1220 GFP_KERNEL, node);
1221 if (!hctx->ctxs)
1222 break;
1223
1224 num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
1225 hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
1226 GFP_KERNEL, node);
1227 if (!hctx->ctx_map)
1228 break;
1229
1230 hctx->nr_ctx_map = num_maps;
1231 hctx->nr_ctx = 0;
1232
1233 if (reg->ops->init_hctx &&
1234 reg->ops->init_hctx(hctx, driver_data, i))
1235 break;
1236 }
1237
1238 if (i == q->nr_hw_queues)
1239 return 0;
1240
1241 /*
1242 * Init failed
1243 */
1244 queue_for_each_hw_ctx(q, hctx, j) {
1245 if (i == j)
1246 break;
1247
1248 if (reg->ops->exit_hctx)
1249 reg->ops->exit_hctx(hctx, j);
1250
1251 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1252 blk_mq_free_rq_map(hctx);
1253 kfree(hctx->ctxs);
1254 }
1255
1256 return 1;
1257}
1258
1259static void blk_mq_init_cpu_queues(struct request_queue *q,
1260 unsigned int nr_hw_queues)
1261{
1262 unsigned int i;
1263
1264 for_each_possible_cpu(i) {
1265 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1266 struct blk_mq_hw_ctx *hctx;
1267
1268 memset(__ctx, 0, sizeof(*__ctx));
1269 __ctx->cpu = i;
1270 spin_lock_init(&__ctx->lock);
1271 INIT_LIST_HEAD(&__ctx->rq_list);
1272 __ctx->queue = q;
1273
1274 /* If the cpu isn't online, the cpu is mapped to first hctx */
1275 hctx = q->mq_ops->map_queue(q, i);
1276 hctx->nr_ctx++;
1277
1278 if (!cpu_online(i))
1279 continue;
1280
1281 /*
1282 * Set local node, IFF we have more than one hw queue. If
1283 * not, we remain on the home node of the device
1284 */
1285 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1286 hctx->numa_node = cpu_to_node(i);
1287 }
1288}
1289
1290static void blk_mq_map_swqueue(struct request_queue *q)
1291{
1292 unsigned int i;
1293 struct blk_mq_hw_ctx *hctx;
1294 struct blk_mq_ctx *ctx;
1295
1296 queue_for_each_hw_ctx(q, hctx, i) {
1297 hctx->nr_ctx = 0;
1298 }
1299
1300 /*
1301 * Map software to hardware queues
1302 */
1303 queue_for_each_ctx(q, ctx, i) {
1304 /* If the cpu isn't online, the cpu is mapped to first hctx */
1305 hctx = q->mq_ops->map_queue(q, i);
1306 ctx->index_hw = hctx->nr_ctx;
1307 hctx->ctxs[hctx->nr_ctx++] = ctx;
1308 }
1309}
1310
1311struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
1312 void *driver_data)
1313{
1314 struct blk_mq_hw_ctx **hctxs;
1315 struct blk_mq_ctx *ctx;
1316 struct request_queue *q;
1317 int i;
1318
1319 if (!reg->nr_hw_queues ||
1320 !reg->ops->queue_rq || !reg->ops->map_queue ||
1321 !reg->ops->alloc_hctx || !reg->ops->free_hctx)
1322 return ERR_PTR(-EINVAL);
1323
1324 if (!reg->queue_depth)
1325 reg->queue_depth = BLK_MQ_MAX_DEPTH;
1326 else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
1327 pr_err("blk-mq: queue depth too large (%u)\n", reg->queue_depth);
1328 reg->queue_depth = BLK_MQ_MAX_DEPTH;
1329 }
1330
1331 /*
1332 * Set aside a tag for flush requests. It will only be used while
1333 * another flush request is in progress but outside the driver.
1334 *
1335 * TODO: only allocate if flushes are supported
1336 */
1337 reg->queue_depth++;
1338 reg->reserved_tags++;
1339
1340 if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
1341 return ERR_PTR(-EINVAL);
1342
1343 ctx = alloc_percpu(struct blk_mq_ctx);
1344 if (!ctx)
1345 return ERR_PTR(-ENOMEM);
1346
1347 hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1348 reg->numa_node);
1349
1350 if (!hctxs)
1351 goto err_percpu;
1352
1353 for (i = 0; i < reg->nr_hw_queues; i++) {
1354 hctxs[i] = reg->ops->alloc_hctx(reg, i);
1355 if (!hctxs[i])
1356 goto err_hctxs;
1357
1358 hctxs[i]->numa_node = NUMA_NO_NODE;
1359 hctxs[i]->queue_num = i;
1360 }
1361
1362 q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
1363 if (!q)
1364 goto err_hctxs;
1365
1366 q->mq_map = blk_mq_make_queue_map(reg);
1367 if (!q->mq_map)
1368 goto err_map;
1369
1370 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1371 blk_queue_rq_timeout(q, 30000);
1372
1373 q->nr_queues = nr_cpu_ids;
1374 q->nr_hw_queues = reg->nr_hw_queues;
1375
1376 q->queue_ctx = ctx;
1377 q->queue_hw_ctx = hctxs;
1378
1379 q->mq_ops = reg->ops;
1380
1381 blk_queue_make_request(q, blk_mq_make_request);
1382 blk_queue_rq_timed_out(q, reg->ops->timeout);
1383 if (reg->timeout)
1384 blk_queue_rq_timeout(q, reg->timeout);
1385
1386 blk_mq_init_flush(q);
1387 blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
1388
1389 if (blk_mq_init_hw_queues(q, reg, driver_data))
1390 goto err_hw;
1391
1392 blk_mq_map_swqueue(q);
1393
1394 mutex_lock(&all_q_mutex);
1395 list_add_tail(&q->all_q_node, &all_q_list);
1396 mutex_unlock(&all_q_mutex);
1397
1398 return q;
1399err_hw:
1400 kfree(q->mq_map);
1401err_map:
1402 blk_cleanup_queue(q);
1403err_hctxs:
1404 for (i = 0; i < reg->nr_hw_queues; i++) {
1405 if (!hctxs[i])
1406 break;
1407 reg->ops->free_hctx(hctxs[i], i);
1408 }
1409 kfree(hctxs);
1410err_percpu:
1411 free_percpu(ctx);
1412 return ERR_PTR(-ENOMEM);
1413}
1414EXPORT_SYMBOL(blk_mq_init_queue);
1415
1416void blk_mq_free_queue(struct request_queue *q)
1417{
1418 struct blk_mq_hw_ctx *hctx;
1419 int i;
1420
1421 queue_for_each_hw_ctx(q, hctx, i) {
1422 cancel_delayed_work_sync(&hctx->delayed_work);
1423 kfree(hctx->ctx_map);
1424 kfree(hctx->ctxs);
1425 blk_mq_free_rq_map(hctx);
1426 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1427 if (q->mq_ops->exit_hctx)
1428 q->mq_ops->exit_hctx(hctx, i);
1429 q->mq_ops->free_hctx(hctx, i);
1430 }
1431
1432 free_percpu(q->queue_ctx);
1433 kfree(q->queue_hw_ctx);
1434 kfree(q->mq_map);
1435
1436 q->queue_ctx = NULL;
1437 q->queue_hw_ctx = NULL;
1438 q->mq_map = NULL;
1439
1440 mutex_lock(&all_q_mutex);
1441 list_del_init(&q->all_q_node);
1442 mutex_unlock(&all_q_mutex);
1443}
1444EXPORT_SYMBOL(blk_mq_free_queue);
1445
1446/* Basically redo blk_mq_init_queue with queue frozen */
1447static void __cpuinit blk_mq_queue_reinit(struct request_queue *q)
1448{
1449 blk_mq_freeze_queue(q);
1450
1451 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1452
1453 /*
1454 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
1455 * we should change hctx->numa_node according to the new topology (this
1456 * involves freeing and re-allocating memory; is it worth doing?)
1457 */
1458
1459 blk_mq_map_swqueue(q);
1460
1461 blk_mq_unfreeze_queue(q);
1462}
1463
1464static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb,
1465 unsigned long action, void *hcpu)
1466{
1467 struct request_queue *q;
1468
1469 /*
1470 * Before the new mapping is established, a hot-added CPU might already
1471 * start handling requests. This doesn't break anything as we map offline
1472 * CPUs to the first hardware queue. We will re-init the queue below to
1473 * get optimal settings.
1474 */
1475 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1476 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
1477 return NOTIFY_OK;
1478
1479 mutex_lock(&all_q_mutex);
1480 list_for_each_entry(q, &all_q_list, all_q_node)
1481 blk_mq_queue_reinit(q);
1482 mutex_unlock(&all_q_mutex);
1483 return NOTIFY_OK;
1484}
1485
1486static int __init blk_mq_init(void)
1487{
1488 unsigned int i;
1489
1490 for_each_possible_cpu(i)
1491 init_llist_head(&per_cpu(ipi_lists, i));
1492
1493 blk_mq_cpu_init();
1494
1495 /* Must be called after percpu_counter_hotcpu_callback() */
1496 hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
1497
1498 return 0;
1499}
1500subsys_initcall(blk_mq_init);
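
Putting the pieces together, a single-queue driver registers by filling in a blk_mq_reg and a blk_mq_ops and calling blk_mq_init_queue(). The hedged sketch below leans on the defaults exported above (blk_mq_map_queue, blk_mq_alloc_single_hw_queue, blk_mq_free_single_hw_queue) and on the driver hooks sketched earlier; the exact struct definitions live in include/linux/blk-mq.h, which this series does not show here, so treat the field names as assumptions:

static struct blk_mq_ops mydrv_mq_ops = {
        .queue_rq       = mydrv_queue_rq,               /* see sketch above */
        .map_queue      = blk_mq_map_queue,             /* default CPU -> hctx mapping */
        .alloc_hctx     = blk_mq_alloc_single_hw_queue,
        .free_hctx      = blk_mq_free_single_hw_queue,
        .timeout        = mydrv_timeout,                /* see sketch above */
};

static struct blk_mq_reg mydrv_mq_reg = {
        .ops            = &mydrv_mq_ops,
        .nr_hw_queues   = 1,
        .queue_depth    = 64,
        .reserved_tags  = 1,                            /* for internal commands */
        .cmd_size       = sizeof(struct mydrv_cmd),
        .numa_node      = NUMA_NO_NODE,
        .flags          = BLK_MQ_F_SHOULD_MERGE,
};

static int mydrv_setup_queue(struct mydrv *drv)
{
        struct request_queue *q;

        q = blk_mq_init_queue(&mydrv_mq_reg, drv);      /* drv is handed to init_hctx as driver data */
        if (IS_ERR(q))
                return PTR_ERR(q);

        blk_mq_init_commands(q, mydrv_init_cmd, drv);   /* see sketch above */
        drv->queue = q;
        return 0;
}
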
diff --git a/block/blk-mq.h b/block/blk-mq.h
new file mode 100644
index 000000000000..52bf1f96a2c2
--- /dev/null
+++ b/block/blk-mq.h
@@ -0,0 +1,52 @@
1#ifndef INT_BLK_MQ_H
2#define INT_BLK_MQ_H
3
4struct blk_mq_ctx {
5 struct {
6 spinlock_t lock;
7 struct list_head rq_list;
8 } ____cacheline_aligned_in_smp;
9
10 unsigned int cpu;
11 unsigned int index_hw;
12 unsigned int ipi_redirect;
13
14 /* incremented at dispatch time */
15 unsigned long rq_dispatched[2];
16 unsigned long rq_merged;
17
18 /* incremented at completion time */
19 unsigned long ____cacheline_aligned_in_smp rq_completed[2];
20
21 struct request_queue *queue;
22 struct kobject kobj;
23};
24
25void __blk_mq_end_io(struct request *rq, int error);
26void blk_mq_complete_request(struct request *rq, int error);
27void blk_mq_run_request(struct request *rq, bool run_queue, bool async);
28void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
29void blk_mq_init_flush(struct request_queue *q);
30
31/*
32 * CPU hotplug helpers
33 */
34struct blk_mq_cpu_notifier;
35void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
36 void (*fn)(void *, unsigned long, unsigned int),
37 void *data);
38void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
39void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
40void blk_mq_cpu_init(void);
41DECLARE_PER_CPU(struct llist_head, ipi_lists);
42
43/*
44 * CPU -> queue mappings
45 */
46struct blk_mq_reg;
47extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
48extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
49
50void blk_mq_add_timer(struct request *rq);
51
52#endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 026c1517505f..05e826793e4e 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -144,6 +144,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
144 lim->discard_zeroes_data = 1; 144 lim->discard_zeroes_data = 1;
145 lim->max_segments = USHRT_MAX; 145 lim->max_segments = USHRT_MAX;
146 lim->max_hw_sectors = UINT_MAX; 146 lim->max_hw_sectors = UINT_MAX;
147 lim->max_segment_size = UINT_MAX;
147 lim->max_sectors = UINT_MAX; 148 lim->max_sectors = UINT_MAX;
148 lim->max_write_same_sectors = UINT_MAX; 149 lim->max_write_same_sectors = UINT_MAX;
149} 150}
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ec9e60636f43..ce4b8bfd3d27 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -23,7 +23,7 @@ static void blk_done_softirq(struct softirq_action *h)
23 struct list_head *cpu_list, local_list; 23 struct list_head *cpu_list, local_list;
24 24
25 local_irq_disable(); 25 local_irq_disable();
26 cpu_list = &__get_cpu_var(blk_cpu_done); 26 cpu_list = this_cpu_ptr(&blk_cpu_done);
27 list_replace_init(cpu_list, &local_list); 27 list_replace_init(cpu_list, &local_list);
28 local_irq_enable(); 28 local_irq_enable();
29 29
@@ -44,7 +44,7 @@ static void trigger_softirq(void *data)
44 struct list_head *list; 44 struct list_head *list;
45 45
46 local_irq_save(flags); 46 local_irq_save(flags);
47 list = &__get_cpu_var(blk_cpu_done); 47 list = this_cpu_ptr(&blk_cpu_done);
48 list_add_tail(&rq->csd.list, list); 48 list_add_tail(&rq->csd.list, list);
49 49
50 if (list->next == &rq->csd.list) 50 if (list->next == &rq->csd.list)
@@ -90,7 +90,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
90 90
91 local_irq_disable(); 91 local_irq_disable();
92 list_splice_init(&per_cpu(blk_cpu_done, cpu), 92 list_splice_init(&per_cpu(blk_cpu_done, cpu),
93 &__get_cpu_var(blk_cpu_done)); 93 this_cpu_ptr(&blk_cpu_done));
94 raise_softirq_irqoff(BLOCK_SOFTIRQ); 94 raise_softirq_irqoff(BLOCK_SOFTIRQ);
95 local_irq_enable(); 95 local_irq_enable();
96 } 96 }
@@ -135,7 +135,7 @@ void __blk_complete_request(struct request *req)
135 if (ccpu == cpu || shared) { 135 if (ccpu == cpu || shared) {
136 struct list_head *list; 136 struct list_head *list;
137do_local: 137do_local:
138 list = &__get_cpu_var(blk_cpu_done); 138 list = this_cpu_ptr(&blk_cpu_done);
139 list_add_tail(&req->csd.list, list); 139 list_add_tail(&req->csd.list, list);
140 140
141 /* 141 /*
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3aa5b195f4dd..4f8c4d90ec73 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -7,6 +7,7 @@
7#include <linux/bio.h> 7#include <linux/bio.h>
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/blktrace_api.h> 9#include <linux/blktrace_api.h>
10#include <linux/blk-mq.h>
10 11
11#include "blk.h" 12#include "blk.h"
12#include "blk-cgroup.h" 13#include "blk-cgroup.h"
@@ -542,6 +543,11 @@ static void blk_release_queue(struct kobject *kobj)
542 if (q->queue_tags) 543 if (q->queue_tags)
543 __blk_queue_free_tags(q); 544 __blk_queue_free_tags(q);
544 545
546 percpu_counter_destroy(&q->mq_usage_counter);
547
548 if (q->mq_ops)
549 blk_mq_free_queue(q);
550
545 blk_trace_shutdown(q); 551 blk_trace_shutdown(q);
546 552
547 bdi_destroy(&q->backing_dev_info); 553 bdi_destroy(&q->backing_dev_info);
@@ -575,6 +581,7 @@ int blk_register_queue(struct gendisk *disk)
575 * bypass from queue allocation. 581 * bypass from queue allocation.
576 */ 582 */
577 blk_queue_bypass_end(q); 583 blk_queue_bypass_end(q);
584 queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
578 585
579 ret = blk_trace_init_sysfs(dev); 586 ret = blk_trace_init_sysfs(dev);
580 if (ret) 587 if (ret)
@@ -588,6 +595,9 @@ int blk_register_queue(struct gendisk *disk)
588 595
589 kobject_uevent(&q->kobj, KOBJ_ADD); 596 kobject_uevent(&q->kobj, KOBJ_ADD);
590 597
598 if (q->mq_ops)
599 blk_mq_register_disk(disk);
600
591 if (!q->request_fn) 601 if (!q->request_fn)
592 return 0; 602 return 0;
593 603
@@ -610,6 +620,9 @@ void blk_unregister_queue(struct gendisk *disk)
610 if (WARN_ON(!q)) 620 if (WARN_ON(!q))
611 return; 621 return;
612 622
623 if (q->mq_ops)
624 blk_mq_unregister_disk(disk);
625
613 if (q->request_fn) 626 if (q->request_fn)
614 elv_unregister_queue(q); 627 elv_unregister_queue(q);
615 628
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 65f103563969..bba81c9348e1 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -7,6 +7,7 @@
7#include <linux/fault-inject.h> 7#include <linux/fault-inject.h>
8 8
9#include "blk.h" 9#include "blk.h"
10#include "blk-mq.h"
10 11
11#ifdef CONFIG_FAIL_IO_TIMEOUT 12#ifdef CONFIG_FAIL_IO_TIMEOUT
12 13
@@ -31,7 +32,7 @@ static int __init fail_io_timeout_debugfs(void)
31 struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", 32 struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",
32 NULL, &fail_io_timeout); 33 NULL, &fail_io_timeout);
33 34
34 return IS_ERR(dir) ? PTR_ERR(dir) : 0; 35 return PTR_ERR_OR_ZERO(dir);
35} 36}
36 37
37late_initcall(fail_io_timeout_debugfs); 38late_initcall(fail_io_timeout_debugfs);
@@ -88,11 +89,19 @@ static void blk_rq_timed_out(struct request *req)
88 ret = q->rq_timed_out_fn(req); 89 ret = q->rq_timed_out_fn(req);
89 switch (ret) { 90 switch (ret) {
90 case BLK_EH_HANDLED: 91 case BLK_EH_HANDLED:
91 __blk_complete_request(req); 92 /* Can we use req->errors here? */
93 if (q->mq_ops)
94 blk_mq_complete_request(req, req->errors);
95 else
96 __blk_complete_request(req);
92 break; 97 break;
93 case BLK_EH_RESET_TIMER: 98 case BLK_EH_RESET_TIMER:
99 if (q->mq_ops)
100 blk_mq_add_timer(req);
101 else
102 blk_add_timer(req);
103
94 blk_clear_rq_complete(req); 104 blk_clear_rq_complete(req);
95 blk_add_timer(req);
96 break; 105 break;
97 case BLK_EH_NOT_HANDLED: 106 case BLK_EH_NOT_HANDLED:
98 /* 107 /*
@@ -108,6 +117,23 @@ static void blk_rq_timed_out(struct request *req)
108 } 117 }
109} 118}
110 119
120void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
121 unsigned int *next_set)
122{
123 if (time_after_eq(jiffies, rq->deadline)) {
124 list_del_init(&rq->timeout_list);
125
126 /*
127 * Check if we raced with end io completion
128 */
129 if (!blk_mark_rq_complete(rq))
130 blk_rq_timed_out(rq);
131 } else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
132 *next_timeout = rq->deadline;
133 *next_set = 1;
134 }
135}
136
111void blk_rq_timed_out_timer(unsigned long data) 137void blk_rq_timed_out_timer(unsigned long data)
112{ 138{
113 struct request_queue *q = (struct request_queue *) data; 139 struct request_queue *q = (struct request_queue *) data;
@@ -117,21 +143,8 @@ void blk_rq_timed_out_timer(unsigned long data)
117 143
118 spin_lock_irqsave(q->queue_lock, flags); 144 spin_lock_irqsave(q->queue_lock, flags);
119 145
120 list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) { 146 list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
121 if (time_after_eq(jiffies, rq->deadline)) { 147 blk_rq_check_expired(rq, &next, &next_set);
122 list_del_init(&rq->timeout_list);
123
124 /*
125 * Check if we raced with end io completion
126 */
127 if (blk_mark_rq_complete(rq))
128 continue;
129 blk_rq_timed_out(rq);
130 } else if (!next_set || time_after(next, rq->deadline)) {
131 next = rq->deadline;
132 next_set = 1;
133 }
134 }
135 148
136 if (next_set) 149 if (next_set)
137 mod_timer(&q->timeout, round_jiffies_up(next)); 150 mod_timer(&q->timeout, round_jiffies_up(next));
@@ -157,15 +170,7 @@ void blk_abort_request(struct request *req)
157} 170}
158EXPORT_SYMBOL_GPL(blk_abort_request); 171EXPORT_SYMBOL_GPL(blk_abort_request);
159 172
160/** 173void __blk_add_timer(struct request *req, struct list_head *timeout_list)
161 * blk_add_timer - Start timeout timer for a single request
162 * @req: request that is about to start running.
163 *
164 * Notes:
165 * Each request has its own timer, and as it is added to the queue, we
166 * set up the timer. When the request completes, we cancel the timer.
167 */
168void blk_add_timer(struct request *req)
169{ 174{
170 struct request_queue *q = req->q; 175 struct request_queue *q = req->q;
171 unsigned long expiry; 176 unsigned long expiry;
@@ -174,7 +179,6 @@ void blk_add_timer(struct request *req)
174 return; 179 return;
175 180
176 BUG_ON(!list_empty(&req->timeout_list)); 181 BUG_ON(!list_empty(&req->timeout_list));
177 BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
178 182
179 /* 183 /*
180 * Some LLDs, like scsi, peek at the timeout to prevent a 184 * Some LLDs, like scsi, peek at the timeout to prevent a
@@ -184,7 +188,8 @@ void blk_add_timer(struct request *req)
184 req->timeout = q->rq_timeout; 188 req->timeout = q->rq_timeout;
185 189
186 req->deadline = jiffies + req->timeout; 190 req->deadline = jiffies + req->timeout;
187 list_add_tail(&req->timeout_list, &q->timeout_list); 191 if (timeout_list)
192 list_add_tail(&req->timeout_list, timeout_list);
188 193
189 /* 194 /*
190 * If the timer isn't already pending or this timeout is earlier 195 * If the timer isn't already pending or this timeout is earlier
@@ -196,5 +201,19 @@ void blk_add_timer(struct request *req)
196 if (!timer_pending(&q->timeout) || 201 if (!timer_pending(&q->timeout) ||
197 time_before(expiry, q->timeout.expires)) 202 time_before(expiry, q->timeout.expires))
198 mod_timer(&q->timeout, expiry); 203 mod_timer(&q->timeout, expiry);
204
205}
206
207/**
208 * blk_add_timer - Start timeout timer for a single request
209 * @req: request that is about to start running.
210 *
211 * Notes:
212 * Each request has its own timer, and as it is added to the queue, we
213 * set up the timer. When the request completes, we cancel the timer.
214 */
215void blk_add_timer(struct request *req)
216{
217 __blk_add_timer(req, &req->q->timeout_list);
199} 218}
200 219
diff --git a/block/blk.h b/block/blk.h
index e837b8f619b7..c90e1d8f7a2b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -10,6 +10,7 @@
 #define BLK_BATCH_REQ 32
 
 extern struct kmem_cache *blk_requestq_cachep;
+extern struct kmem_cache *request_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
@@ -34,14 +35,30 @@ bool __blk_end_bidi_request(struct request *rq, int error,
 			     unsigned int nr_bytes, unsigned int bidi_bytes);
 
 void blk_rq_timed_out_timer(unsigned long data);
+void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
+			  unsigned int *next_set);
+void __blk_add_timer(struct request *req, struct list_head *timeout_list);
 void blk_delete_timer(struct request *);
 void blk_add_timer(struct request *);
 
+
+bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
+			     struct bio *bio);
+bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+			    struct bio *bio);
+bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
+			    unsigned int *request_count);
+
+void blk_account_io_start(struct request *req, bool new_io);
+void blk_account_io_completion(struct request *req, unsigned int bytes);
+void blk_account_io_done(struct request *req);
+
 /*
  * Internal atomic flags for request handling
  */
 enum rq_atomic_flags {
 	REQ_ATOM_COMPLETE = 0,
+	REQ_ATOM_STARTED,
 };
 
 /*
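blk.h now exposes the bio-merge and I/O-accounting helpers for reuse outside blk-core.c and adds a REQ_ATOM_STARTED flag next to REQ_ATOM_COMPLETE. These enum values are bit numbers operated on with the atomic bitops against req->atomic_flags; a minimal sketch of that pattern (the helper name is illustrative, not an in-tree function):

/* Illustrative only: rq_atomic_flags entries are bit indices, so marking
 * and testing them uses test_and_set_bit()/test_bit() on rq->atomic_flags. */
static inline bool my_mark_rq_started(struct request *rq)
{
	/* true only for the first caller to set the bit */
	return !test_and_set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
}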
diff --git a/block/elevator.c b/block/elevator.c
index 2bcbd8cc14d4..b7ff2861b6bd 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -186,6 +186,12 @@ int elevator_init(struct request_queue *q, char *name)
 	struct elevator_type *e = NULL;
 	int err;
 
+	/*
+	 * q->sysfs_lock must be held to provide mutual exclusion between
+	 * elevator_switch() and here.
+	 */
+	lockdep_assert_held(&q->sysfs_lock);
+
 	if (unlikely(q->elevator))
 		return 0;
 
@@ -959,7 +965,7 @@ fail_init:
 /*
  * Switch this queue to the given IO scheduler.
  */
-int elevator_change(struct request_queue *q, const char *name)
+static int __elevator_change(struct request_queue *q, const char *name)
 {
 	char elevator_name[ELV_NAME_MAX];
 	struct elevator_type *e;
@@ -981,6 +987,18 @@ int elevator_change(struct request_queue *q, const char *name)
 
 	return elevator_switch(q, e);
 }
+
+int elevator_change(struct request_queue *q, const char *name)
+{
+	int ret;
+
+	/* Protect q->elevator from elevator_init() */
+	mutex_lock(&q->sysfs_lock);
+	ret = __elevator_change(q, name);
+	mutex_unlock(&q->sysfs_lock);
+
+	return ret;
+}
 EXPORT_SYMBOL(elevator_change);
 
 ssize_t elv_iosched_store(struct request_queue *q, const char *name,
@@ -991,7 +1009,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 	if (!q->elevator)
 		return count;
 
-	ret = elevator_change(q, name);
+	ret = __elevator_change(q, name);
 	if (!ret)
 		return count;
 
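The net effect of the elevator.c changes: elevator_change() now takes q->sysfs_lock itself, elv_iosched_store() (reached via the queue sysfs store path, which already holds that mutex) calls the unlocked __elevator_change(), and elevator_init() documents its requirement with lockdep_assert_held(). A minimal sketch of that assertion convention, with a hypothetical function name:

/* Sketch of the convention introduced above: a function that must run
 * under q->sysfs_lock asserts it; the check only does anything when
 * lockdep is enabled, otherwise it compiles to nothing. */
static int my_init_under_lock(struct request_queue *q)
{
	lockdep_assert_held(&q->sysfs_lock);
	/* ... work that would otherwise race with elevator_switch() ... */
	return 0;
}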
diff --git a/block/ioctl.c b/block/ioctl.c
index a31d91d9bc5a..7d5c3b20af45 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -64,7 +64,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 		part = add_partition(disk, partno, start, length,
 				     ADDPART_FLAG_NONE, NULL);
 		mutex_unlock(&bdev->bd_mutex);
-		return IS_ERR(part) ? PTR_ERR(part) : 0;
+		return PTR_ERR_OR_ZERO(part);
 	case BLKPG_DEL_PARTITION:
 		part = disk_get_part(disk, partno);
 		if (!part)
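PTR_ERR_OR_ZERO() collapses the common IS_ERR()/PTR_ERR() idiom replaced above; it behaves roughly like the following sketch of the include/linux/err.h helper (not copied from this patch):

static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);	/* propagate the encoded errno */
	else
		return 0;		/* a valid pointer means success */
}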
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index a5ffcc988f0b..625e3e471d65 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -286,7 +286,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	       struct sg_io_hdr *hdr, fmode_t mode)
 {
 	unsigned long start_time;
-	int writing = 0, ret = 0;
+	ssize_t ret = 0;
+	int writing = 0;
 	struct request *rq;
 	char sense[SCSI_SENSE_BUFFERSIZE];
 	struct bio *bio;
@@ -321,37 +322,16 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	}
 
 	if (hdr->iovec_count) {
-		const int size = sizeof(struct sg_iovec) * hdr->iovec_count;
 		size_t iov_data_len;
-		struct sg_iovec *sg_iov;
 		struct iovec *iov;
-		int i;
 
-		sg_iov = kmalloc(size, GFP_KERNEL);
-		if (!sg_iov) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		if (copy_from_user(sg_iov, hdr->dxferp, size)) {
-			kfree(sg_iov);
-			ret = -EFAULT;
-			goto out;
-		}
+		ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count,
+					    0, NULL, &iov);
+		if (ret < 0)
+			goto out;
 
-		/*
-		 * Sum up the vecs, making sure they don't overflow
-		 */
-		iov = (struct iovec *) sg_iov;
-		iov_data_len = 0;
-		for (i = 0; i < hdr->iovec_count; i++) {
-			if (iov_data_len + iov[i].iov_len < iov_data_len) {
-				kfree(sg_iov);
-				ret = -EINVAL;
-				goto out;
-			}
-			iov_data_len += iov[i].iov_len;
-		}
+		iov_data_len = ret;
+		ret = 0;
 
 		/* SG_IO howto says that the shorter of the two wins */
 		if (hdr->dxfer_len < iov_data_len) {
@@ -361,9 +341,10 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 			iov_data_len = hdr->dxfer_len;
 		}
 
-		ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count,
+		ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov,
+					  hdr->iovec_count,
 					  iov_data_len, GFP_KERNEL);
-		kfree(sg_iov);
+		kfree(iov);
 	} else if (hdr->dxfer_len)
 		ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
 				      GFP_KERNEL);
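The sg_io() conversion leans on rw_copy_check_uvector() to do what the deleted lines did by hand: copy the user iovec array into a kernel allocation, reject a summed length that overflows, and return both the total byte count (hence "iov_data_len = ret") and a kmalloc()'d iovec array the caller later kfree()s. A minimal sketch of that contract as used here, not the fs/read_write.c implementation, with a hypothetical helper name:

/* Sketch only: mirrors the caller-visible behaviour relied on above.
 * Returns the summed segment length or a negative errno; on success
 * *ret_pointer holds a kmalloc()'d copy the caller must kfree(). */
static ssize_t sketch_copy_check_uvector(const struct iovec __user *uvector,
					 int nr_segs, struct iovec **ret_pointer)
{
	struct iovec *iov;
	size_t total = 0;
	int i;

	iov = kmalloc(nr_segs * sizeof(*iov), GFP_KERNEL);
	if (!iov)
		return -ENOMEM;

	if (copy_from_user(iov, uvector, nr_segs * sizeof(*iov))) {
		kfree(iov);
		return -EFAULT;
	}

	for (i = 0; i < nr_segs; i++) {
		if (total + iov[i].iov_len < total) {	/* overflow check */
			kfree(iov);
			return -EINVAL;
		}
		total += iov[i].iov_len;
	}

	*ret_pointer = iov;
	return total;
}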