author     Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 22:08:14 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 22:08:14 -0500
commit     0910c0bdf7c291a41bc21e40a97389c9d4c1960d
tree       177c4cb22ece78b18f64f548ae82b9a15edbb99c
parent     2821fe6b00a1e902fd399bb4b7e40bc3041f4d44
parent     e37459b8e2c7db6735e39e019e448b76e5e77647
Merge branch 'for-3.13/core' of git://git.kernel.dk/linux-block
Pull block IO core updates from Jens Axboe:
 "This is the pull request for the core changes in the block layer for
  3.13.  It contains:

   - The new blk-mq request interface.

     This is a new and more scalable queueing model that marries the
     best parts of the request based interface we currently have (which
     is fully featured, but scales poorly) and the bio based "interface"
     which the new drivers for high IOPS devices end up using because
     it's much faster than the request based one.

     The bio interface has no block layer support, since it taps into
     the stack much earlier.  This means that drivers end up having to
     implement a lot of functionality on their own, like tagging,
     timeout handling, requeue, etc.  The blk-mq interface provides all
     of these.  Some drivers even provide a switch to select bio or rq
     and have code to handle both, since things like merging only work
     in the rq model and hence are faster for some workloads.  This is
     a huge mess.  Converting these drivers nets us a substantial code
     reduction.

     Initial results from converting SCSI to this model even show an 8x
     improvement on single queue devices.  So while the model was
     intended for the newer multiqueue devices, it brings substantial
     improvements for "classic" hardware as well.

     This code has gone through extensive testing and development; it's
     now ready to go.  A pull request to convert virtio-blk to this
     model will be coming as well, with more drivers scheduled for
     conversion in 3.14.

   - Two blktrace fixes from Jan and Chen Gang.

   - A plug merge fix from Alireza Haghdoost.

   - Conversion of __get_cpu_var() from Christoph Lameter.

   - Fix for sector_div() with a 64-bit divisor from Geert Uytterhoeven.

   - A fix for a race between request completion and the timeout
     handling from Jeff Moyer.  This is what caused the merge conflict
     with blk-mq/core, in case you are looking at that.

   - A dm stacking fix from Mike Snitzer.

   - A code consolidation fix and duplicated code removal from Kent
     Overstreet.

   - A handful of block bug fixes from Mikulas Patocka, fixing a loop
     crash and memory corruption on blk cg.

   - Elevator switch bug fix from Tomoki Sekiyama.

  A heads-up that I had to rebase this branch.  Initially the immutable
  bio_vecs had been queued up for inclusion, but a week later it became
  clear that the series wasn't fully cooked yet.  So the decision was
  made to pull this out and postpone it until 3.14.  It was a
  straightforward rebase, just pruning out the immutable series and the
  later fixes of problems with it.

  The rest of the patches applied directly and no further changes were
  made"

* 'for-3.13/core' of git://git.kernel.dk/linux-block: (31 commits)
  block: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
  block: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
  block: Do not call sector_div() with a 64-bit divisor
  kernel: trace: blktrace: remove redundent memcpy() in compat_blk_trace_setup()
  block: Consolidate duplicated bio_trim() implementations
  block: Use rw_copy_check_uvector()
  block: Enable sysfs nomerge control for I/O requests in the plug list
  block: properly stack underlying max_segment_size to DM device
  elevator: acquire q->sysfs_lock in elevator_change()
  elevator: Fix a race in elevator switching and md device initialization
  block: Replace __get_cpu_var uses
  bdi: test bdi_init failure
  block: fix a probe argument to blk_register_region
  loop: fix crash if blk_alloc_queue fails
  blk-core: Fix memory corruption if blkcg_init_queue fails
  block: fix race between request completion and timeout handling
  blktrace: Send BLK_TN_PROCESS events to all running traces
  blk-mq: don't disallow request merges for req->special being set
  blk-mq: mq plug list breakage
  blk-mq: fix for flush deadlock
  ...
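For orientation before the diffstat and full diff, here is a minimal sketch of how a driver hooks into the new blk-mq interface described above, loosely modeled on the null_blk driver added in this series (drivers/block/null_blk.c). The struct blk_mq_reg / struct blk_mq_ops layout and the helpers used (blk_mq_map_queue(), blk_mq_alloc_single_hw_queue(), blk_mq_init_queue()) are taken from the 3.13-era include/linux/blk-mq.h introduced below, but treat the sketch as illustrative rather than authoritative; the driver-side names (my_queue_rq, my_mq_reg, my_create_queue) are hypothetical.

/*
 * Illustrative blk-mq driver glue (not part of this patch).  Loosely
 * modeled on drivers/block/null_blk.c as added in this series; the
 * blk_mq_reg/blk_mq_ops fields mirror the 3.13 include/linux/blk-mq.h.
 */
#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* blk-mq hands us an already-tagged request; a real driver would issue it. */
static int my_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
        blk_mq_end_io(rq, 0);           /* complete immediately for the sketch */
        return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops my_mq_ops = {
        .queue_rq       = my_queue_rq,
        .map_queue      = blk_mq_map_queue,             /* default CPU -> hctx map */
        .alloc_hctx     = blk_mq_alloc_single_hw_queue,
        .free_hctx      = blk_mq_free_single_hw_queue,
};

static struct blk_mq_reg my_mq_reg = {
        .ops            = &my_mq_ops,
        .nr_hw_queues   = 1,
        .queue_depth    = 64,           /* tag space managed by blk-mq-tag.c */
        .numa_node      = NUMA_NO_NODE,
        .flags          = BLK_MQ_F_SHOULD_MERGE,
};

/* Probe-time setup: blk-mq allocates the tags, sw/hw contexts and the queue. */
static struct request_queue *my_create_queue(void *driver_data)
{
        return blk_mq_init_queue(&my_mq_reg, driver_data);
}

With a registration like this in place, the generic paths below do the right thing automatically: blk_get_request()/blk_put_request() route to blk_mq_alloc_request()/blk_mq_free_request() when q->mq_ops is set, and the flush machinery in blk-flush.c takes the mq branches.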
-rw-r--r--  block/Makefile | 5
-rw-r--r--  block/blk-core.c | 175
-rw-r--r--  block/blk-exec.c | 14
-rw-r--r--  block/blk-flush.c | 154
-rw-r--r--  block/blk-iopoll.c | 6
-rw-r--r--  block/blk-lib.c | 10
-rw-r--r--  block/blk-merge.c | 17
-rw-r--r--  block/blk-mq-cpu.c | 93
-rw-r--r--  block/blk-mq-cpumap.c | 108
-rw-r--r--  block/blk-mq-sysfs.c | 384
-rw-r--r--  block/blk-mq-tag.c | 204
-rw-r--r--  block/blk-mq-tag.h | 27
-rw-r--r--  block/blk-mq.c | 1500
-rw-r--r--  block/blk-mq.h | 52
-rw-r--r--  block/blk-settings.c | 1
-rw-r--r--  block/blk-softirq.c | 8
-rw-r--r--  block/blk-sysfs.c | 13
-rw-r--r--  block/blk-timeout.c | 77
-rw-r--r--  block/blk.h | 17
-rw-r--r--  block/elevator.c | 22
-rw-r--r--  block/ioctl.c | 2
-rw-r--r--  block/scsi_ioctl.c | 39
-rw-r--r--  drivers/block/Kconfig | 3
-rw-r--r--  drivers/block/Makefile | 1
-rw-r--r--  drivers/block/brd.c | 2
-rw-r--r--  drivers/block/floppy.c | 4
-rw-r--r--  drivers/block/loop.c | 6
-rw-r--r--  drivers/block/null_blk.c | 635
-rw-r--r--  drivers/block/xen-blkfront.c | 53
-rw-r--r--  drivers/md/md.c | 40
-rw-r--r--  drivers/md/md.h | 1
-rw-r--r--  drivers/md/raid1.c | 10
-rw-r--r--  drivers/md/raid10.c | 18
-rw-r--r--  drivers/scsi/sd.c | 2
-rw-r--r--  fs/bio.c | 46
-rw-r--r--  fs/char_dev.c | 3
-rw-r--r--  fs/fscache/object.c | 2
-rw-r--r--  include/linux/backing-dev.h | 4
-rw-r--r--  include/linux/bio.h | 3
-rw-r--r--  include/linux/blk-mq.h | 183
-rw-r--r--  include/linux/blk_types.h | 68
-rw-r--r--  include/linux/blkdev.h | 60
-rw-r--r--  include/linux/blktrace_api.h | 4
-rw-r--r--  include/linux/percpu_ida.h | 23
-rw-r--r--  kernel/smp.c | 7
-rw-r--r--  kernel/trace/blktrace.c | 36
-rw-r--r--  lib/percpu_counter.c | 15
-rw-r--r--  lib/percpu_ida.c | 89
-rw-r--r--  mm/swap.c | 3
49 files changed, 3885 insertions(+), 364 deletions(-)
diff --git a/block/Makefile b/block/Makefile
index 671a83d063a5..20645e88fb57 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,8 +5,9 @@
5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ 8 blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
9 partition-generic.o partitions/ 9 blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
10 genhd.o scsi_ioctl.o partition-generic.o partitions/
10 11
11obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 12obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
12obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o 13obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 0a00e4ecf87c..8bdd0121212a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -16,6 +16,7 @@
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/bio.h> 17#include <linux/bio.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/blk-mq.h>
19#include <linux/highmem.h> 20#include <linux/highmem.h>
20#include <linux/mm.h> 21#include <linux/mm.h>
21#include <linux/kernel_stat.h> 22#include <linux/kernel_stat.h>
@@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida);
48/* 49/*
49 * For the allocated request tables 50 * For the allocated request tables
50 */ 51 */
51static struct kmem_cache *request_cachep; 52struct kmem_cache *request_cachep = NULL;
52 53
53/* 54/*
54 * For queue allocation 55 * For queue allocation
@@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep;
60 */ 61 */
61static struct workqueue_struct *kblockd_workqueue; 62static struct workqueue_struct *kblockd_workqueue;
62 63
63static void drive_stat_acct(struct request *rq, int new_io)
64{
65 struct hd_struct *part;
66 int rw = rq_data_dir(rq);
67 int cpu;
68
69 if (!blk_do_io_stat(rq))
70 return;
71
72 cpu = part_stat_lock();
73
74 if (!new_io) {
75 part = rq->part;
76 part_stat_inc(cpu, part, merges[rw]);
77 } else {
78 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
79 if (!hd_struct_try_get(part)) {
80 /*
81 * The partition is already being removed,
82 * the request will be accounted on the disk only
83 *
84 * We take a reference on disk->part0 although that
85 * partition will never be deleted, so we can treat
86 * it as any other partition.
87 */
88 part = &rq->rq_disk->part0;
89 hd_struct_get(part);
90 }
91 part_round_stats(cpu, part);
92 part_inc_in_flight(part, rw);
93 rq->part = part;
94 }
95
96 part_stat_unlock();
97}
98
99void blk_queue_congestion_threshold(struct request_queue *q) 64void blk_queue_congestion_threshold(struct request_queue *q)
100{ 65{
101 int nr; 66 int nr;
@@ -145,7 +110,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
145 rq->cmd = rq->__cmd; 110 rq->cmd = rq->__cmd;
146 rq->cmd_len = BLK_MAX_CDB; 111 rq->cmd_len = BLK_MAX_CDB;
147 rq->tag = -1; 112 rq->tag = -1;
148 rq->ref_count = 1;
149 rq->start_time = jiffies; 113 rq->start_time = jiffies;
150 set_start_time_ns(rq); 114 set_start_time_ns(rq);
151 rq->part = NULL; 115 rq->part = NULL;
@@ -174,9 +138,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
174{ 138{
175 int bit; 139 int bit;
176 140
177 printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, 141 printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg,
178 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, 142 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
179 rq->cmd_flags); 143 (unsigned long long) rq->cmd_flags);
180 144
181 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", 145 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
182 (unsigned long long)blk_rq_pos(rq), 146 (unsigned long long)blk_rq_pos(rq),
@@ -595,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
595 if (!q) 559 if (!q)
596 return NULL; 560 return NULL;
597 561
562 if (percpu_counter_init(&q->mq_usage_counter, 0))
563 goto fail_q;
564
598 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); 565 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
599 if (q->id < 0) 566 if (q->id < 0)
600 goto fail_q; 567 goto fail_c;
601 568
602 q->backing_dev_info.ra_pages = 569 q->backing_dev_info.ra_pages =
603 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 570 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -644,13 +611,19 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
644 q->bypass_depth = 1; 611 q->bypass_depth = 1;
645 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); 612 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
646 613
614 init_waitqueue_head(&q->mq_freeze_wq);
615
647 if (blkcg_init_queue(q)) 616 if (blkcg_init_queue(q))
648 goto fail_id; 617 goto fail_bdi;
649 618
650 return q; 619 return q;
651 620
621fail_bdi:
622 bdi_destroy(&q->backing_dev_info);
652fail_id: 623fail_id:
653 ida_simple_remove(&blk_queue_ida, q->id); 624 ida_simple_remove(&blk_queue_ida, q->id);
625fail_c:
626 percpu_counter_destroy(&q->mq_usage_counter);
654fail_q: 627fail_q:
655 kmem_cache_free(blk_requestq_cachep, q); 628 kmem_cache_free(blk_requestq_cachep, q);
656 return NULL; 629 return NULL;
@@ -739,9 +712,17 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
739 712
740 q->sg_reserved_size = INT_MAX; 713 q->sg_reserved_size = INT_MAX;
741 714
715 /* Protect q->elevator from elevator_change */
716 mutex_lock(&q->sysfs_lock);
717
742 /* init elevator */ 718 /* init elevator */
743 if (elevator_init(q, NULL)) 719 if (elevator_init(q, NULL)) {
720 mutex_unlock(&q->sysfs_lock);
744 return NULL; 721 return NULL;
722 }
723
724 mutex_unlock(&q->sysfs_lock);
725
745 return q; 726 return q;
746} 727}
747EXPORT_SYMBOL(blk_init_allocated_queue); 728EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1109,7 +1090,8 @@ retry:
1109 goto retry; 1090 goto retry;
1110} 1091}
1111 1092
1112struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1093static struct request *blk_old_get_request(struct request_queue *q, int rw,
1094 gfp_t gfp_mask)
1113{ 1095{
1114 struct request *rq; 1096 struct request *rq;
1115 1097
@@ -1126,6 +1108,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1126 1108
1127 return rq; 1109 return rq;
1128} 1110}
1111
1112struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1113{
1114 if (q->mq_ops)
1115 return blk_mq_alloc_request(q, rw, gfp_mask, false);
1116 else
1117 return blk_old_get_request(q, rw, gfp_mask);
1118}
1129EXPORT_SYMBOL(blk_get_request); 1119EXPORT_SYMBOL(blk_get_request);
1130 1120
1131/** 1121/**
@@ -1211,7 +1201,7 @@ EXPORT_SYMBOL(blk_requeue_request);
1211static void add_acct_request(struct request_queue *q, struct request *rq, 1201static void add_acct_request(struct request_queue *q, struct request *rq,
1212 int where) 1202 int where)
1213{ 1203{
1214 drive_stat_acct(rq, 1); 1204 blk_account_io_start(rq, true);
1215 __elv_add_request(q, rq, where); 1205 __elv_add_request(q, rq, where);
1216} 1206}
1217 1207
@@ -1272,8 +1262,6 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1272{ 1262{
1273 if (unlikely(!q)) 1263 if (unlikely(!q))
1274 return; 1264 return;
1275 if (unlikely(--req->ref_count))
1276 return;
1277 1265
1278 blk_pm_put_request(req); 1266 blk_pm_put_request(req);
1279 1267
@@ -1302,12 +1290,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request);
1302 1290
1303void blk_put_request(struct request *req) 1291void blk_put_request(struct request *req)
1304{ 1292{
1305 unsigned long flags;
1306 struct request_queue *q = req->q; 1293 struct request_queue *q = req->q;
1307 1294
1308 spin_lock_irqsave(q->queue_lock, flags); 1295 if (q->mq_ops)
1309 __blk_put_request(q, req); 1296 blk_mq_free_request(req);
1310 spin_unlock_irqrestore(q->queue_lock, flags); 1297 else {
1298 unsigned long flags;
1299
1300 spin_lock_irqsave(q->queue_lock, flags);
1301 __blk_put_request(q, req);
1302 spin_unlock_irqrestore(q->queue_lock, flags);
1303 }
1311} 1304}
1312EXPORT_SYMBOL(blk_put_request); 1305EXPORT_SYMBOL(blk_put_request);
1313 1306
@@ -1343,8 +1336,8 @@ void blk_add_request_payload(struct request *rq, struct page *page,
1343} 1336}
1344EXPORT_SYMBOL_GPL(blk_add_request_payload); 1337EXPORT_SYMBOL_GPL(blk_add_request_payload);
1345 1338
1346static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 1339bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1347 struct bio *bio) 1340 struct bio *bio)
1348{ 1341{
1349 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1342 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1350 1343
@@ -1361,12 +1354,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1361 req->__data_len += bio->bi_size; 1354 req->__data_len += bio->bi_size;
1362 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1355 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1363 1356
1364 drive_stat_acct(req, 0); 1357 blk_account_io_start(req, false);
1365 return true; 1358 return true;
1366} 1359}
1367 1360
1368static bool bio_attempt_front_merge(struct request_queue *q, 1361bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1369 struct request *req, struct bio *bio) 1362 struct bio *bio)
1370{ 1363{
1371 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1364 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1372 1365
@@ -1391,12 +1384,12 @@ static bool bio_attempt_front_merge(struct request_queue *q,
1391 req->__data_len += bio->bi_size; 1384 req->__data_len += bio->bi_size;
1392 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1385 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1393 1386
1394 drive_stat_acct(req, 0); 1387 blk_account_io_start(req, false);
1395 return true; 1388 return true;
1396} 1389}
1397 1390
1398/** 1391/**
1399 * attempt_plug_merge - try to merge with %current's plugged list 1392 * blk_attempt_plug_merge - try to merge with %current's plugged list
1400 * @q: request_queue new bio is being queued at 1393 * @q: request_queue new bio is being queued at
1401 * @bio: new bio being queued 1394 * @bio: new bio being queued
1402 * @request_count: out parameter for number of traversed plugged requests 1395 * @request_count: out parameter for number of traversed plugged requests
@@ -1412,19 +1405,28 @@ static bool bio_attempt_front_merge(struct request_queue *q,
1412 * reliable access to the elevator outside queue lock. Only check basic 1405 * reliable access to the elevator outside queue lock. Only check basic
1413 * merging parameters without querying the elevator. 1406 * merging parameters without querying the elevator.
1414 */ 1407 */
1415static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, 1408bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1416 unsigned int *request_count) 1409 unsigned int *request_count)
1417{ 1410{
1418 struct blk_plug *plug; 1411 struct blk_plug *plug;
1419 struct request *rq; 1412 struct request *rq;
1420 bool ret = false; 1413 bool ret = false;
1414 struct list_head *plug_list;
1415
1416 if (blk_queue_nomerges(q))
1417 goto out;
1421 1418
1422 plug = current->plug; 1419 plug = current->plug;
1423 if (!plug) 1420 if (!plug)
1424 goto out; 1421 goto out;
1425 *request_count = 0; 1422 *request_count = 0;
1426 1423
1427 list_for_each_entry_reverse(rq, &plug->list, queuelist) { 1424 if (q->mq_ops)
1425 plug_list = &plug->mq_list;
1426 else
1427 plug_list = &plug->list;
1428
1429 list_for_each_entry_reverse(rq, plug_list, queuelist) {
1428 int el_ret; 1430 int el_ret;
1429 1431
1430 if (rq->q == q) 1432 if (rq->q == q)
@@ -1492,7 +1494,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
1492 * Check if we can merge with the plugged list before grabbing 1494 * Check if we can merge with the plugged list before grabbing
1493 * any locks. 1495 * any locks.
1494 */ 1496 */
1495 if (attempt_plug_merge(q, bio, &request_count)) 1497 if (blk_attempt_plug_merge(q, bio, &request_count))
1496 return; 1498 return;
1497 1499
1498 spin_lock_irq(q->queue_lock); 1500 spin_lock_irq(q->queue_lock);
@@ -1560,7 +1562,7 @@ get_rq:
1560 } 1562 }
1561 } 1563 }
1562 list_add_tail(&req->queuelist, &plug->list); 1564 list_add_tail(&req->queuelist, &plug->list);
1563 drive_stat_acct(req, 1); 1565 blk_account_io_start(req, true);
1564 } else { 1566 } else {
1565 spin_lock_irq(q->queue_lock); 1567 spin_lock_irq(q->queue_lock);
1566 add_acct_request(q, req, where); 1568 add_acct_request(q, req, where);
@@ -2014,7 +2016,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
2014} 2016}
2015EXPORT_SYMBOL_GPL(blk_rq_err_bytes); 2017EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
2016 2018
2017static void blk_account_io_completion(struct request *req, unsigned int bytes) 2019void blk_account_io_completion(struct request *req, unsigned int bytes)
2018{ 2020{
2019 if (blk_do_io_stat(req)) { 2021 if (blk_do_io_stat(req)) {
2020 const int rw = rq_data_dir(req); 2022 const int rw = rq_data_dir(req);
@@ -2028,7 +2030,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
2028 } 2030 }
2029} 2031}
2030 2032
2031static void blk_account_io_done(struct request *req) 2033void blk_account_io_done(struct request *req)
2032{ 2034{
2033 /* 2035 /*
2034 * Account IO completion. flush_rq isn't accounted as a 2036 * Account IO completion. flush_rq isn't accounted as a
@@ -2076,6 +2078,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q,
2076} 2078}
2077#endif 2079#endif
2078 2080
2081void blk_account_io_start(struct request *rq, bool new_io)
2082{
2083 struct hd_struct *part;
2084 int rw = rq_data_dir(rq);
2085 int cpu;
2086
2087 if (!blk_do_io_stat(rq))
2088 return;
2089
2090 cpu = part_stat_lock();
2091
2092 if (!new_io) {
2093 part = rq->part;
2094 part_stat_inc(cpu, part, merges[rw]);
2095 } else {
2096 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
2097 if (!hd_struct_try_get(part)) {
2098 /*
2099 * The partition is already being removed,
2100 * the request will be accounted on the disk only
2101 *
2102 * We take a reference on disk->part0 although that
2103 * partition will never be deleted, so we can treat
2104 * it as any other partition.
2105 */
2106 part = &rq->rq_disk->part0;
2107 hd_struct_get(part);
2108 }
2109 part_round_stats(cpu, part);
2110 part_inc_in_flight(part, rw);
2111 rq->part = part;
2112 }
2113
2114 part_stat_unlock();
2115}
2116
2079/** 2117/**
2080 * blk_peek_request - peek at the top of a request queue 2118 * blk_peek_request - peek at the top of a request queue
2081 * @q: request queue to peek at 2119 * @q: request queue to peek at
@@ -2227,6 +2265,7 @@ void blk_start_request(struct request *req)
2227 if (unlikely(blk_bidi_rq(req))) 2265 if (unlikely(blk_bidi_rq(req)))
2228 req->next_rq->resid_len = blk_rq_bytes(req->next_rq); 2266 req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
2229 2267
2268 BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
2230 blk_add_timer(req); 2269 blk_add_timer(req);
2231} 2270}
2232EXPORT_SYMBOL(blk_start_request); 2271EXPORT_SYMBOL(blk_start_request);
@@ -2451,7 +2490,6 @@ static void blk_finish_request(struct request *req, int error)
2451 if (req->cmd_flags & REQ_DONTPREP) 2490 if (req->cmd_flags & REQ_DONTPREP)
2452 blk_unprep_request(req); 2491 blk_unprep_request(req);
2453 2492
2454
2455 blk_account_io_done(req); 2493 blk_account_io_done(req);
2456 2494
2457 if (req->end_io) 2495 if (req->end_io)
@@ -2873,6 +2911,7 @@ void blk_start_plug(struct blk_plug *plug)
2873 2911
2874 plug->magic = PLUG_MAGIC; 2912 plug->magic = PLUG_MAGIC;
2875 INIT_LIST_HEAD(&plug->list); 2913 INIT_LIST_HEAD(&plug->list);
2914 INIT_LIST_HEAD(&plug->mq_list);
2876 INIT_LIST_HEAD(&plug->cb_list); 2915 INIT_LIST_HEAD(&plug->cb_list);
2877 2916
2878 /* 2917 /*
@@ -2970,6 +3009,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2970 BUG_ON(plug->magic != PLUG_MAGIC); 3009 BUG_ON(plug->magic != PLUG_MAGIC);
2971 3010
2972 flush_plug_callbacks(plug, from_schedule); 3011 flush_plug_callbacks(plug, from_schedule);
3012
3013 if (!list_empty(&plug->mq_list))
3014 blk_mq_flush_plug_list(plug, from_schedule);
3015
2973 if (list_empty(&plug->list)) 3016 if (list_empty(&plug->list))
2974 return; 3017 return;
2975 3018
diff --git a/block/blk-exec.c b/block/blk-exec.c
index ae4f27d7944e..c3edf9dff566 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -5,6 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/bio.h> 6#include <linux/bio.h>
7#include <linux/blkdev.h> 7#include <linux/blkdev.h>
8#include <linux/blk-mq.h>
8#include <linux/sched/sysctl.h> 9#include <linux/sched/sysctl.h>
9 10
10#include "blk.h" 11#include "blk.h"
@@ -24,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error)
24 struct completion *waiting = rq->end_io_data; 25 struct completion *waiting = rq->end_io_data;
25 26
26 rq->end_io_data = NULL; 27 rq->end_io_data = NULL;
27 __blk_put_request(rq->q, rq);
28 28
29 /* 29 /*
30 * complete last, if this is a stack request the process (and thus 30 * complete last, if this is a stack request the process (and thus
@@ -59,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
59 59
60 rq->rq_disk = bd_disk; 60 rq->rq_disk = bd_disk;
61 rq->end_io = done; 61 rq->end_io = done;
62
63 if (q->mq_ops) {
64 blk_mq_insert_request(q, rq, true);
65 return;
66 }
67
62 /* 68 /*
63 * need to check this before __blk_run_queue(), because rq can 69 * need to check this before __blk_run_queue(), because rq can
64 * be freed before that returns. 70 * be freed before that returns.
@@ -103,12 +109,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
103 int err = 0; 109 int err = 0;
104 unsigned long hang_check; 110 unsigned long hang_check;
105 111
106 /*
107 * we need an extra reference to the request, so we can look at
108 * it after io completion
109 */
110 rq->ref_count++;
111
112 if (!rq->sense) { 112 if (!rq->sense) {
113 memset(sense, 0, sizeof(sense)); 113 memset(sense, 0, sizeof(sense));
114 rq->sense = sense; 114 rq->sense = sense;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index cc2b827a853c..331e627301ea 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -69,8 +69,10 @@
69#include <linux/bio.h> 69#include <linux/bio.h>
70#include <linux/blkdev.h> 70#include <linux/blkdev.h>
71#include <linux/gfp.h> 71#include <linux/gfp.h>
72#include <linux/blk-mq.h>
72 73
73#include "blk.h" 74#include "blk.h"
75#include "blk-mq.h"
74 76
75/* FLUSH/FUA sequences */ 77/* FLUSH/FUA sequences */
76enum { 78enum {
@@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq)
124 /* make @rq a normal request */ 126 /* make @rq a normal request */
125 rq->cmd_flags &= ~REQ_FLUSH_SEQ; 127 rq->cmd_flags &= ~REQ_FLUSH_SEQ;
126 rq->end_io = rq->flush.saved_end_io; 128 rq->end_io = rq->flush.saved_end_io;
129
130 blk_clear_rq_complete(rq);
131}
132
133static void mq_flush_data_run(struct work_struct *work)
134{
135 struct request *rq;
136
137 rq = container_of(work, struct request, mq_flush_data);
138
139 memset(&rq->csd, 0, sizeof(rq->csd));
140 blk_mq_run_request(rq, true, false);
141}
142
143static void blk_mq_flush_data_insert(struct request *rq)
144{
145 INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
146 kblockd_schedule_work(rq->q, &rq->mq_flush_data);
127} 147}
128 148
129/** 149/**
@@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq)
136 * completion and trigger the next step. 156 * completion and trigger the next step.
137 * 157 *
138 * CONTEXT: 158 * CONTEXT:
139 * spin_lock_irq(q->queue_lock) 159 * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
140 * 160 *
141 * RETURNS: 161 * RETURNS:
142 * %true if requests were added to the dispatch queue, %false otherwise. 162 * %true if requests were added to the dispatch queue, %false otherwise.
@@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
146{ 166{
147 struct request_queue *q = rq->q; 167 struct request_queue *q = rq->q;
148 struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; 168 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
149 bool queued = false; 169 bool queued = false, kicked;
150 170
151 BUG_ON(rq->flush.seq & seq); 171 BUG_ON(rq->flush.seq & seq);
152 rq->flush.seq |= seq; 172 rq->flush.seq |= seq;
@@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
167 187
168 case REQ_FSEQ_DATA: 188 case REQ_FSEQ_DATA:
169 list_move_tail(&rq->flush.list, &q->flush_data_in_flight); 189 list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
170 list_add(&rq->queuelist, &q->queue_head); 190 if (q->mq_ops)
171 queued = true; 191 blk_mq_flush_data_insert(rq);
192 else {
193 list_add(&rq->queuelist, &q->queue_head);
194 queued = true;
195 }
172 break; 196 break;
173 197
174 case REQ_FSEQ_DONE: 198 case REQ_FSEQ_DONE:
@@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
181 BUG_ON(!list_empty(&rq->queuelist)); 205 BUG_ON(!list_empty(&rq->queuelist));
182 list_del_init(&rq->flush.list); 206 list_del_init(&rq->flush.list);
183 blk_flush_restore_request(rq); 207 blk_flush_restore_request(rq);
184 __blk_end_request_all(rq, error); 208 if (q->mq_ops)
209 blk_mq_end_io(rq, error);
210 else
211 __blk_end_request_all(rq, error);
185 break; 212 break;
186 213
187 default: 214 default:
188 BUG(); 215 BUG();
189 } 216 }
190 217
191 return blk_kick_flush(q) | queued; 218 kicked = blk_kick_flush(q);
219 /* blk_mq_run_flush will run queue */
220 if (q->mq_ops)
221 return queued;
222 return kicked | queued;
192} 223}
193 224
194static void flush_end_io(struct request *flush_rq, int error) 225static void flush_end_io(struct request *flush_rq, int error)
195{ 226{
196 struct request_queue *q = flush_rq->q; 227 struct request_queue *q = flush_rq->q;
197 struct list_head *running = &q->flush_queue[q->flush_running_idx]; 228 struct list_head *running;
198 bool queued = false; 229 bool queued = false;
199 struct request *rq, *n; 230 struct request *rq, *n;
231 unsigned long flags = 0;
200 232
233 if (q->mq_ops) {
234 blk_mq_free_request(flush_rq);
235 spin_lock_irqsave(&q->mq_flush_lock, flags);
236 }
237 running = &q->flush_queue[q->flush_running_idx];
201 BUG_ON(q->flush_pending_idx == q->flush_running_idx); 238 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
202 239
203 /* account completion of the flush request */ 240 /* account completion of the flush request */
204 q->flush_running_idx ^= 1; 241 q->flush_running_idx ^= 1;
205 elv_completed_request(q, flush_rq); 242
243 if (!q->mq_ops)
244 elv_completed_request(q, flush_rq);
206 245
207 /* and push the waiting requests to the next stage */ 246 /* and push the waiting requests to the next stage */
208 list_for_each_entry_safe(rq, n, running, flush.list) { 247 list_for_each_entry_safe(rq, n, running, flush.list) {
@@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error)
223 * directly into request_fn may confuse the driver. Always use 262 * directly into request_fn may confuse the driver. Always use
224 * kblockd. 263 * kblockd.
225 */ 264 */
226 if (queued || q->flush_queue_delayed) 265 if (queued || q->flush_queue_delayed) {
227 blk_run_queue_async(q); 266 if (!q->mq_ops)
267 blk_run_queue_async(q);
268 else
269 /*
270 * This can be optimized to only run queues with requests
271 * queued if necessary.
272 */
273 blk_mq_run_queues(q, true);
274 }
228 q->flush_queue_delayed = 0; 275 q->flush_queue_delayed = 0;
276 if (q->mq_ops)
277 spin_unlock_irqrestore(&q->mq_flush_lock, flags);
278}
279
280static void mq_flush_work(struct work_struct *work)
281{
282 struct request_queue *q;
283 struct request *rq;
284
285 q = container_of(work, struct request_queue, mq_flush_work);
286
287 /* We don't need set REQ_FLUSH_SEQ, it's for consistency */
288 rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
289 __GFP_WAIT|GFP_ATOMIC, true);
290 rq->cmd_type = REQ_TYPE_FS;
291 rq->end_io = flush_end_io;
292
293 blk_mq_run_request(rq, true, false);
294}
295
296/*
297 * We can't directly use q->flush_rq, because it doesn't have tag and is not in
298 * hctx->rqs[]. so we must allocate a new request, since we can't sleep here,
299 * so offload the work to workqueue.
300 *
301 * Note: we assume a flush request finished in any hardware queue will flush
302 * the whole disk cache.
303 */
304static void mq_run_flush(struct request_queue *q)
305{
306 kblockd_schedule_work(q, &q->mq_flush_work);
229} 307}
230 308
231/** 309/**
@@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error)
236 * Please read the comment at the top of this file for more info. 314 * Please read the comment at the top of this file for more info.
237 * 315 *
238 * CONTEXT: 316 * CONTEXT:
239 * spin_lock_irq(q->queue_lock) 317 * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
240 * 318 *
241 * RETURNS: 319 * RETURNS:
242 * %true if flush was issued, %false otherwise. 320 * %true if flush was issued, %false otherwise.
@@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q)
261 * Issue flush and toggle pending_idx. This makes pending_idx 339 * Issue flush and toggle pending_idx. This makes pending_idx
262 * different from running_idx, which means flush is in flight. 340 * different from running_idx, which means flush is in flight.
263 */ 341 */
342 q->flush_pending_idx ^= 1;
343 if (q->mq_ops) {
344 mq_run_flush(q);
345 return true;
346 }
347
264 blk_rq_init(q, &q->flush_rq); 348 blk_rq_init(q, &q->flush_rq);
265 q->flush_rq.cmd_type = REQ_TYPE_FS; 349 q->flush_rq.cmd_type = REQ_TYPE_FS;
266 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; 350 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
267 q->flush_rq.rq_disk = first_rq->rq_disk; 351 q->flush_rq.rq_disk = first_rq->rq_disk;
268 q->flush_rq.end_io = flush_end_io; 352 q->flush_rq.end_io = flush_end_io;
269 353
270 q->flush_pending_idx ^= 1;
271 list_add_tail(&q->flush_rq.queuelist, &q->queue_head); 354 list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
272 return true; 355 return true;
273} 356}
@@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error)
284 blk_run_queue_async(q); 367 blk_run_queue_async(q);
285} 368}
286 369
370static void mq_flush_data_end_io(struct request *rq, int error)
371{
372 struct request_queue *q = rq->q;
373 struct blk_mq_hw_ctx *hctx;
374 struct blk_mq_ctx *ctx;
375 unsigned long flags;
376
377 ctx = rq->mq_ctx;
378 hctx = q->mq_ops->map_queue(q, ctx->cpu);
379
380 /*
381 * After populating an empty queue, kick it to avoid stall. Read
382 * the comment in flush_end_io().
383 */
384 spin_lock_irqsave(&q->mq_flush_lock, flags);
385 if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
386 blk_mq_run_hw_queue(hctx, true);
387 spin_unlock_irqrestore(&q->mq_flush_lock, flags);
388}
389
287/** 390/**
288 * blk_insert_flush - insert a new FLUSH/FUA request 391 * blk_insert_flush - insert a new FLUSH/FUA request
289 * @rq: request to insert 392 * @rq: request to insert
290 * 393 *
291 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. 394 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
395 * or __blk_mq_run_hw_queue() to dispatch request.
292 * @rq is being submitted. Analyze what needs to be done and put it on the 396 * @rq is being submitted. Analyze what needs to be done and put it on the
293 * right queue. 397 * right queue.
294 * 398 *
295 * CONTEXT: 399 * CONTEXT:
296 * spin_lock_irq(q->queue_lock) 400 * spin_lock_irq(q->queue_lock) in !mq case
297 */ 401 */
298void blk_insert_flush(struct request *rq) 402void blk_insert_flush(struct request *rq)
299{ 403{
@@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq)
316 * complete the request. 420 * complete the request.
317 */ 421 */
318 if (!policy) { 422 if (!policy) {
319 __blk_end_bidi_request(rq, 0, 0, 0); 423 if (q->mq_ops)
424 blk_mq_end_io(rq, 0);
425 else
426 __blk_end_bidi_request(rq, 0, 0, 0);
320 return; 427 return;
321 } 428 }
322 429
@@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq)
329 */ 436 */
330 if ((policy & REQ_FSEQ_DATA) && 437 if ((policy & REQ_FSEQ_DATA) &&
331 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { 438 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
332 list_add_tail(&rq->queuelist, &q->queue_head); 439 if (q->mq_ops) {
440 blk_mq_run_request(rq, false, true);
441 } else
442 list_add_tail(&rq->queuelist, &q->queue_head);
333 return; 443 return;
334 } 444 }
335 445
@@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq)
341 INIT_LIST_HEAD(&rq->flush.list); 451 INIT_LIST_HEAD(&rq->flush.list);
342 rq->cmd_flags |= REQ_FLUSH_SEQ; 452 rq->cmd_flags |= REQ_FLUSH_SEQ;
343 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ 453 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
454 if (q->mq_ops) {
455 rq->end_io = mq_flush_data_end_io;
456
457 spin_lock_irq(&q->mq_flush_lock);
458 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
459 spin_unlock_irq(&q->mq_flush_lock);
460 return;
461 }
344 rq->end_io = flush_data_end_io; 462 rq->end_io = flush_data_end_io;
345 463
346 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); 464 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
@@ -453,3 +571,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
453 return ret; 571 return ret;
454} 572}
455EXPORT_SYMBOL(blkdev_issue_flush); 573EXPORT_SYMBOL(blkdev_issue_flush);
574
575void blk_mq_init_flush(struct request_queue *q)
576{
577 spin_lock_init(&q->mq_flush_lock);
578 INIT_WORK(&q->mq_flush_work, mq_flush_work);
579}
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index 4b8d9b541112..1855bf51edb0 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -35,7 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop)
35 unsigned long flags; 35 unsigned long flags;
36 36
37 local_irq_save(flags); 37 local_irq_save(flags);
38 list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll)); 38 list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
39 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); 39 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
40 local_irq_restore(flags); 40 local_irq_restore(flags);
41} 41}
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(blk_iopoll_complete);
79 79
80static void blk_iopoll_softirq(struct softirq_action *h) 80static void blk_iopoll_softirq(struct softirq_action *h)
81{ 81{
82 struct list_head *list = &__get_cpu_var(blk_cpu_iopoll); 82 struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
83 int rearm = 0, budget = blk_iopoll_budget; 83 int rearm = 0, budget = blk_iopoll_budget;
84 unsigned long start_time = jiffies; 84 unsigned long start_time = jiffies;
85 85
@@ -201,7 +201,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self,
201 201
202 local_irq_disable(); 202 local_irq_disable();
203 list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), 203 list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
204 &__get_cpu_var(blk_cpu_iopoll)); 204 this_cpu_ptr(&blk_cpu_iopoll));
205 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); 205 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
206 local_irq_enable(); 206 local_irq_enable();
207 } 207 }
diff --git a/block/blk-lib.c b/block/blk-lib.c
index d6f50d572565..9b5b561cb928 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -43,8 +43,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
43 DECLARE_COMPLETION_ONSTACK(wait); 43 DECLARE_COMPLETION_ONSTACK(wait);
44 struct request_queue *q = bdev_get_queue(bdev); 44 struct request_queue *q = bdev_get_queue(bdev);
45 int type = REQ_WRITE | REQ_DISCARD; 45 int type = REQ_WRITE | REQ_DISCARD;
46 sector_t max_discard_sectors; 46 unsigned int max_discard_sectors, granularity;
47 sector_t granularity, alignment; 47 int alignment;
48 struct bio_batch bb; 48 struct bio_batch bb;
49 struct bio *bio; 49 struct bio *bio;
50 int ret = 0; 50 int ret = 0;
@@ -58,16 +58,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
58 58
59 /* Zero-sector (unknown) and one-sector granularities are the same. */ 59 /* Zero-sector (unknown) and one-sector granularities are the same. */
60 granularity = max(q->limits.discard_granularity >> 9, 1U); 60 granularity = max(q->limits.discard_granularity >> 9, 1U);
61 alignment = bdev_discard_alignment(bdev) >> 9; 61 alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
62 alignment = sector_div(alignment, granularity);
63 62
64 /* 63 /*
65 * Ensure that max_discard_sectors is of the proper 64 * Ensure that max_discard_sectors is of the proper
66 * granularity, so that requests stay aligned after a split. 65 * granularity, so that requests stay aligned after a split.
67 */ 66 */
68 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); 67 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
69 sector_div(max_discard_sectors, granularity); 68 max_discard_sectors -= max_discard_sectors % granularity;
70 max_discard_sectors *= granularity;
71 if (unlikely(!max_discard_sectors)) { 69 if (unlikely(!max_discard_sectors)) {
72 /* Avoid infinite loop below. Being cautious never hurts. */ 70 /* Avoid infinite loop below. Being cautious never hurts. */
73 return -EOPNOTSUPP; 71 return -EOPNOTSUPP;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5f2448253797..1ffc58977835 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -308,6 +308,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
308 return ll_new_hw_segment(q, req, bio); 308 return ll_new_hw_segment(q, req, bio);
309} 309}
310 310
311/*
312 * blk-mq uses req->special to carry normal driver per-request payload, it
313 * does not indicate a prepared command that we cannot merge with.
314 */
315static bool req_no_special_merge(struct request *req)
316{
317 struct request_queue *q = req->q;
318
319 return !q->mq_ops && req->special;
320}
321
311static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 322static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
312 struct request *next) 323 struct request *next)
313{ 324{
@@ -319,7 +330,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
319 * First check if the either of the requests are re-queued 330 * First check if the either of the requests are re-queued
320 * requests. Can't merge them if they are. 331 * requests. Can't merge them if they are.
321 */ 332 */
322 if (req->special || next->special) 333 if (req_no_special_merge(req) || req_no_special_merge(next))
323 return 0; 334 return 0;
324 335
325 /* 336 /*
@@ -416,7 +427,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
416 427
417 if (rq_data_dir(req) != rq_data_dir(next) 428 if (rq_data_dir(req) != rq_data_dir(next)
418 || req->rq_disk != next->rq_disk 429 || req->rq_disk != next->rq_disk
419 || next->special) 430 || req_no_special_merge(next))
420 return 0; 431 return 0;
421 432
422 if (req->cmd_flags & REQ_WRITE_SAME && 433 if (req->cmd_flags & REQ_WRITE_SAME &&
@@ -515,7 +526,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
515 return false; 526 return false;
516 527
517 /* must be same device and not a special request */ 528 /* must be same device and not a special request */
518 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) 529 if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq))
519 return false; 530 return false;
520 531
521 /* only merge integrity protected bio into ditto rq */ 532 /* only merge integrity protected bio into ditto rq */
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
new file mode 100644
index 000000000000..f8ea39d7ae54
--- /dev/null
+++ b/block/blk-mq-cpu.c
@@ -0,0 +1,93 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <linux/blkdev.h>
5#include <linux/list.h>
6#include <linux/llist.h>
7#include <linux/smp.h>
8#include <linux/cpu.h>
9
10#include <linux/blk-mq.h>
11#include "blk-mq.h"
12
13static LIST_HEAD(blk_mq_cpu_notify_list);
14static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
15
16static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self,
17 unsigned long action, void *hcpu)
18{
19 unsigned int cpu = (unsigned long) hcpu;
20 struct blk_mq_cpu_notifier *notify;
21
22 spin_lock(&blk_mq_cpu_notify_lock);
23
24 list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
25 notify->notify(notify->data, action, cpu);
26
27 spin_unlock(&blk_mq_cpu_notify_lock);
28 return NOTIFY_OK;
29}
30
31static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action,
32 unsigned int cpu)
33{
34 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
35 /*
36 * If the CPU goes away, ensure that we run any pending
37 * completions.
38 */
39 struct llist_node *node;
40 struct request *rq;
41
42 local_irq_disable();
43
44 node = llist_del_all(&per_cpu(ipi_lists, cpu));
45 while (node) {
46 struct llist_node *next = node->next;
47
48 rq = llist_entry(node, struct request, ll_list);
49 __blk_mq_end_io(rq, rq->errors);
50 node = next;
51 }
52
53 local_irq_enable();
54 }
55}
56
57static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = {
58 .notifier_call = blk_mq_main_cpu_notify,
59};
60
61void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
62{
63 BUG_ON(!notifier->notify);
64
65 spin_lock(&blk_mq_cpu_notify_lock);
66 list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
67 spin_unlock(&blk_mq_cpu_notify_lock);
68}
69
70void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
71{
72 spin_lock(&blk_mq_cpu_notify_lock);
73 list_del(&notifier->list);
74 spin_unlock(&blk_mq_cpu_notify_lock);
75}
76
77void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
78 void (*fn)(void *, unsigned long, unsigned int),
79 void *data)
80{
81 notifier->notify = fn;
82 notifier->data = data;
83}
84
85static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = {
86 .notify = blk_mq_cpu_notify,
87};
88
89void __init blk_mq_cpu_init(void)
90{
91 register_hotcpu_notifier(&blk_mq_main_cpu_notifier);
92 blk_mq_register_cpu_notifier(&cpu_notifier);
93}
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
new file mode 100644
index 000000000000..f8721278601c
--- /dev/null
+++ b/block/blk-mq-cpumap.c
@@ -0,0 +1,108 @@
1#include <linux/kernel.h>
2#include <linux/threads.h>
3#include <linux/module.h>
4#include <linux/mm.h>
5#include <linux/smp.h>
6#include <linux/cpu.h>
7
8#include <linux/blk-mq.h>
9#include "blk.h"
10#include "blk-mq.h"
11
12static void show_map(unsigned int *map, unsigned int nr)
13{
14 int i;
15
16 pr_info("blk-mq: CPU -> queue map\n");
17 for_each_online_cpu(i)
18 pr_info(" CPU%2u -> Queue %u\n", i, map[i]);
19}
20
21static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
22 const int cpu)
23{
24 return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
25}
26
27static int get_first_sibling(unsigned int cpu)
28{
29 unsigned int ret;
30
31 ret = cpumask_first(topology_thread_cpumask(cpu));
32 if (ret < nr_cpu_ids)
33 return ret;
34
35 return cpu;
36}
37
38int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
39{
40 unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
41 cpumask_var_t cpus;
42
43 if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
44 return 1;
45
46 cpumask_clear(cpus);
47 nr_cpus = nr_uniq_cpus = 0;
48 for_each_online_cpu(i) {
49 nr_cpus++;
50 first_sibling = get_first_sibling(i);
51 if (!cpumask_test_cpu(first_sibling, cpus))
52 nr_uniq_cpus++;
53 cpumask_set_cpu(i, cpus);
54 }
55
56 queue = 0;
57 for_each_possible_cpu(i) {
58 if (!cpu_online(i)) {
59 map[i] = 0;
60 continue;
61 }
62
63 /*
64 * Easy case - we have equal or more hardware queues. Or
65 * there are no thread siblings to take into account. Do
66 * 1:1 if enough, or sequential mapping if less.
67 */
68 if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
69 map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
70 queue++;
71 continue;
72 }
73
74 /*
75 * Less then nr_cpus queues, and we have some number of
76 * threads per cores. Map sibling threads to the same
77 * queue.
78 */
79 first_sibling = get_first_sibling(i);
80 if (first_sibling == i) {
81 map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
82 queue);
83 queue++;
84 } else
85 map[i] = map[first_sibling];
86 }
87
88 show_map(map, nr_cpus);
89 free_cpumask_var(cpus);
90 return 0;
91}
92
93unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)
94{
95 unsigned int *map;
96
97 /* If cpus are offline, map them to first hctx */
98 map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
99 reg->numa_node);
100 if (!map)
101 return NULL;
102
103 if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))
104 return map;
105
106 kfree(map);
107 return NULL;
108}
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
new file mode 100644
index 000000000000..ba6cf8e9aa0a
--- /dev/null
+++ b/block/blk-mq-sysfs.c
@@ -0,0 +1,384 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/backing-dev.h>
4#include <linux/bio.h>
5#include <linux/blkdev.h>
6#include <linux/mm.h>
7#include <linux/init.h>
8#include <linux/slab.h>
9#include <linux/workqueue.h>
10#include <linux/smp.h>
11
12#include <linux/blk-mq.h>
13#include "blk-mq.h"
14#include "blk-mq-tag.h"
15
16static void blk_mq_sysfs_release(struct kobject *kobj)
17{
18}
19
20struct blk_mq_ctx_sysfs_entry {
21 struct attribute attr;
22 ssize_t (*show)(struct blk_mq_ctx *, char *);
23 ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
24};
25
26struct blk_mq_hw_ctx_sysfs_entry {
27 struct attribute attr;
28 ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
29 ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
30};
31
32static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
33 char *page)
34{
35 struct blk_mq_ctx_sysfs_entry *entry;
36 struct blk_mq_ctx *ctx;
37 struct request_queue *q;
38 ssize_t res;
39
40 entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
41 ctx = container_of(kobj, struct blk_mq_ctx, kobj);
42 q = ctx->queue;
43
44 if (!entry->show)
45 return -EIO;
46
47 res = -ENOENT;
48 mutex_lock(&q->sysfs_lock);
49 if (!blk_queue_dying(q))
50 res = entry->show(ctx, page);
51 mutex_unlock(&q->sysfs_lock);
52 return res;
53}
54
55static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
56 const char *page, size_t length)
57{
58 struct blk_mq_ctx_sysfs_entry *entry;
59 struct blk_mq_ctx *ctx;
60 struct request_queue *q;
61 ssize_t res;
62
63 entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
64 ctx = container_of(kobj, struct blk_mq_ctx, kobj);
65 q = ctx->queue;
66
67 if (!entry->store)
68 return -EIO;
69
70 res = -ENOENT;
71 mutex_lock(&q->sysfs_lock);
72 if (!blk_queue_dying(q))
73 res = entry->store(ctx, page, length);
74 mutex_unlock(&q->sysfs_lock);
75 return res;
76}
77
78static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
79 struct attribute *attr, char *page)
80{
81 struct blk_mq_hw_ctx_sysfs_entry *entry;
82 struct blk_mq_hw_ctx *hctx;
83 struct request_queue *q;
84 ssize_t res;
85
86 entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
87 hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
88 q = hctx->queue;
89
90 if (!entry->show)
91 return -EIO;
92
93 res = -ENOENT;
94 mutex_lock(&q->sysfs_lock);
95 if (!blk_queue_dying(q))
96 res = entry->show(hctx, page);
97 mutex_unlock(&q->sysfs_lock);
98 return res;
99}
100
101static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
102 struct attribute *attr, const char *page,
103 size_t length)
104{
105 struct blk_mq_hw_ctx_sysfs_entry *entry;
106 struct blk_mq_hw_ctx *hctx;
107 struct request_queue *q;
108 ssize_t res;
109
110 entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
111 hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
112 q = hctx->queue;
113
114 if (!entry->store)
115 return -EIO;
116
117 res = -ENOENT;
118 mutex_lock(&q->sysfs_lock);
119 if (!blk_queue_dying(q))
120 res = entry->store(hctx, page, length);
121 mutex_unlock(&q->sysfs_lock);
122 return res;
123}
124
125static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page)
126{
127 return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1],
128 ctx->rq_dispatched[0]);
129}
130
131static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page)
132{
133 return sprintf(page, "%lu\n", ctx->rq_merged);
134}
135
136static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page)
137{
138 return sprintf(page, "%lu %lu\n", ctx->rq_completed[1],
139 ctx->rq_completed[0]);
140}
141
142static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg)
143{
144 char *start_page = page;
145 struct request *rq;
146
147 page += sprintf(page, "%s:\n", msg);
148
149 list_for_each_entry(rq, list, queuelist)
150 page += sprintf(page, "\t%p\n", rq);
151
152 return page - start_page;
153}
154
155static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
156{
157 ssize_t ret;
158
159 spin_lock(&ctx->lock);
160 ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending");
161 spin_unlock(&ctx->lock);
162
163 return ret;
164}
165
166static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
167 char *page)
168{
169 return sprintf(page, "%lu\n", hctx->queued);
170}
171
172static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page)
173{
174 return sprintf(page, "%lu\n", hctx->run);
175}
176
177static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
178 char *page)
179{
180 char *start_page = page;
181 int i;
182
183 page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
184
185 for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) {
186 unsigned long d = 1U << (i - 1);
187
188 page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]);
189 }
190
191 return page - start_page;
192}
193
194static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
195 char *page)
196{
197 ssize_t ret;
198
199 spin_lock(&hctx->lock);
200 ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending");
201 spin_unlock(&hctx->lock);
202
203 return ret;
204}
205
206static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page)
207{
208 ssize_t ret;
209
210 spin_lock(&hctx->lock);
211 ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
212 spin_unlock(&hctx->lock);
213
214 return ret;
215}
216
217static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
218 const char *page, size_t len)
219{
220 struct blk_mq_ctx *ctx;
221 unsigned long ret;
222 unsigned int i;
223
224 if (kstrtoul(page, 10, &ret)) {
225 pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
226 return -EINVAL;
227 }
228
229 spin_lock(&hctx->lock);
230 if (ret)
231 hctx->flags |= BLK_MQ_F_SHOULD_IPI;
232 else
233 hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
234 spin_unlock(&hctx->lock);
235
236 hctx_for_each_ctx(hctx, ctx, i)
237 ctx->ipi_redirect = !!ret;
238
239 return len;
240}
241
242static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
243{
244 return blk_mq_tag_sysfs_show(hctx->tags, page);
245}
246
247static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
248 .attr = {.name = "dispatched", .mode = S_IRUGO },
249 .show = blk_mq_sysfs_dispatched_show,
250};
251static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = {
252 .attr = {.name = "merged", .mode = S_IRUGO },
253 .show = blk_mq_sysfs_merged_show,
254};
255static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = {
256 .attr = {.name = "completed", .mode = S_IRUGO },
257 .show = blk_mq_sysfs_completed_show,
258};
259static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = {
260 .attr = {.name = "rq_list", .mode = S_IRUGO },
261 .show = blk_mq_sysfs_rq_list_show,
262};
263
264static struct attribute *default_ctx_attrs[] = {
265 &blk_mq_sysfs_dispatched.attr,
266 &blk_mq_sysfs_merged.attr,
267 &blk_mq_sysfs_completed.attr,
268 &blk_mq_sysfs_rq_list.attr,
269 NULL,
270};
271
272static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = {
273 .attr = {.name = "queued", .mode = S_IRUGO },
274 .show = blk_mq_hw_sysfs_queued_show,
275};
276static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = {
277 .attr = {.name = "run", .mode = S_IRUGO },
278 .show = blk_mq_hw_sysfs_run_show,
279};
280static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
281 .attr = {.name = "dispatched", .mode = S_IRUGO },
282 .show = blk_mq_hw_sysfs_dispatched_show,
283};
284static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
285 .attr = {.name = "pending", .mode = S_IRUGO },
286 .show = blk_mq_hw_sysfs_rq_list_show,
287};
288static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
289 .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
290 .show = blk_mq_hw_sysfs_ipi_show,
291 .store = blk_mq_hw_sysfs_ipi_store,
292};
293static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
294 .attr = {.name = "tags", .mode = S_IRUGO },
295 .show = blk_mq_hw_sysfs_tags_show,
296};
297
298static struct attribute *default_hw_ctx_attrs[] = {
299 &blk_mq_hw_sysfs_queued.attr,
300 &blk_mq_hw_sysfs_run.attr,
301 &blk_mq_hw_sysfs_dispatched.attr,
302 &blk_mq_hw_sysfs_pending.attr,
303 &blk_mq_hw_sysfs_ipi.attr,
304 &blk_mq_hw_sysfs_tags.attr,
305 NULL,
306};
307
308static const struct sysfs_ops blk_mq_sysfs_ops = {
309 .show = blk_mq_sysfs_show,
310 .store = blk_mq_sysfs_store,
311};
312
313static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
314 .show = blk_mq_hw_sysfs_show,
315 .store = blk_mq_hw_sysfs_store,
316};
317
318static struct kobj_type blk_mq_ktype = {
319 .sysfs_ops = &blk_mq_sysfs_ops,
320 .release = blk_mq_sysfs_release,
321};
322
323static struct kobj_type blk_mq_ctx_ktype = {
324 .sysfs_ops = &blk_mq_sysfs_ops,
325 .default_attrs = default_ctx_attrs,
326 .release = blk_mq_sysfs_release,
327};
328
329static struct kobj_type blk_mq_hw_ktype = {
330 .sysfs_ops = &blk_mq_hw_sysfs_ops,
331 .default_attrs = default_hw_ctx_attrs,
332 .release = blk_mq_sysfs_release,
333};
334
335void blk_mq_unregister_disk(struct gendisk *disk)
336{
337 struct request_queue *q = disk->queue;
338
339 kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
340 kobject_del(&q->mq_kobj);
341
342 kobject_put(&disk_to_dev(disk)->kobj);
343}
344
345int blk_mq_register_disk(struct gendisk *disk)
346{
347 struct device *dev = disk_to_dev(disk);
348 struct request_queue *q = disk->queue;
349 struct blk_mq_hw_ctx *hctx;
350 struct blk_mq_ctx *ctx;
351 int ret, i, j;
352
353 kobject_init(&q->mq_kobj, &blk_mq_ktype);
354
355 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
356 if (ret < 0)
357 return ret;
358
359 kobject_uevent(&q->mq_kobj, KOBJ_ADD);
360
361 queue_for_each_hw_ctx(q, hctx, i) {
362 kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
363 ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i);
364 if (ret)
365 break;
366
367 if (!hctx->nr_ctx)
368 continue;
369
370 hctx_for_each_ctx(hctx, ctx, j) {
371 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
372 ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
373 if (ret)
374 break;
375 }
376 }
377
378 if (ret) {
379 blk_mq_unregister_disk(disk);
380 return ret;
381 }
382
383 return 0;
384}
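For orientation, blk_mq_register_disk() above builds a small sysfs hierarchy: an "mq" kobject under the disk's device directory, one numbered child per hardware context and one "cpu<N>" child per software context mapped to it, populated from default_hw_ctx_attrs and default_ctx_attrs. A rough sketch of the resulting layout, with the disk name purely illustrative:

    /sys/block/<disk>/mq/
        0/                  queued, run, dispatched, pending, ipi_redirect, tags
            cpu0/           dispatched, merged, completed, rq_list
            cpu1/           ...
        1/                  one directory per further hardware queue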
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
new file mode 100644
index 000000000000..d64a02fb1f73
--- /dev/null
+++ b/block/blk-mq-tag.c
@@ -0,0 +1,204 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/percpu_ida.h>
4
5#include <linux/blk-mq.h>
6#include "blk.h"
7#include "blk-mq.h"
8#include "blk-mq-tag.h"
9
10/*
11 * Per tagged queue (tag address space) map
12 */
13struct blk_mq_tags {
14 unsigned int nr_tags;
15 unsigned int nr_reserved_tags;
16 unsigned int nr_batch_move;
17 unsigned int nr_max_cache;
18
19 struct percpu_ida free_tags;
20 struct percpu_ida reserved_tags;
21};
22
23void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
24{
25 int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
26 blk_mq_put_tag(tags, tag);
27}
28
29bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
30{
31 return !tags ||
32 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0;
33}
34
35static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp)
36{
37 int tag;
38
39 tag = percpu_ida_alloc(&tags->free_tags, gfp);
40 if (tag < 0)
41 return BLK_MQ_TAG_FAIL;
42 return tag + tags->nr_reserved_tags;
43}
44
45static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
46 gfp_t gfp)
47{
48 int tag;
49
50 if (unlikely(!tags->nr_reserved_tags)) {
51 WARN_ON_ONCE(1);
52 return BLK_MQ_TAG_FAIL;
53 }
54
55 tag = percpu_ida_alloc(&tags->reserved_tags, gfp);
56 if (tag < 0)
57 return BLK_MQ_TAG_FAIL;
58 return tag;
59}
60
61unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved)
62{
63 if (!reserved)
64 return __blk_mq_get_tag(tags, gfp);
65
66 return __blk_mq_get_reserved_tag(tags, gfp);
67}
68
69static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
70{
71 BUG_ON(tag >= tags->nr_tags);
72
73 percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags);
74}
75
76static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
77 unsigned int tag)
78{
79 BUG_ON(tag >= tags->nr_reserved_tags);
80
81 percpu_ida_free(&tags->reserved_tags, tag);
82}
83
84void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
85{
86 if (tag >= tags->nr_reserved_tags)
87 __blk_mq_put_tag(tags, tag);
88 else
89 __blk_mq_put_reserved_tag(tags, tag);
90}
91
92static int __blk_mq_tag_iter(unsigned id, void *data)
93{
94 unsigned long *tag_map = data;
95 __set_bit(id, tag_map);
96 return 0;
97}
98
99void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
100 void (*fn)(void *, unsigned long *), void *data)
101{
102 unsigned long *tag_map;
103 size_t map_size;
104
105 map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG;
106 tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC);
107 if (!tag_map)
108 return;
109
110 percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map);
111 if (tags->nr_reserved_tags)
112 percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter,
113 tag_map);
114
115 fn(data, tag_map);
116 kfree(tag_map);
117}
118
119struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
120 unsigned int reserved_tags, int node)
121{
122 unsigned int nr_tags, nr_cache;
123 struct blk_mq_tags *tags;
124 int ret;
125
126 if (total_tags > BLK_MQ_TAG_MAX) {
127 pr_err("blk-mq: tag depth too large\n");
128 return NULL;
129 }
130
131 tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
132 if (!tags)
133 return NULL;
134
135 nr_tags = total_tags - reserved_tags;
136 nr_cache = nr_tags / num_possible_cpus();
137
138 if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
139 nr_cache = BLK_MQ_TAG_CACHE_MIN;
140 else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
141 nr_cache = BLK_MQ_TAG_CACHE_MAX;
142
143 tags->nr_tags = total_tags;
144 tags->nr_reserved_tags = reserved_tags;
145 tags->nr_max_cache = nr_cache;
146 tags->nr_batch_move = max(1u, nr_cache / 2);
147
148 ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags -
149 tags->nr_reserved_tags,
150 tags->nr_max_cache,
151 tags->nr_batch_move);
152 if (ret)
153 goto err_free_tags;
154
155 if (reserved_tags) {
156 /*
 157		 * With max_cache and batch set to 1, the allocator falls back to
 158		 * not caching tags. It's fine if reserved tag allocation is slow.
159 */
160 ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags,
161 1, 1);
162 if (ret)
163 goto err_reserved_tags;
164 }
165
166 return tags;
167
168err_reserved_tags:
169 percpu_ida_destroy(&tags->free_tags);
170err_free_tags:
171 kfree(tags);
172 return NULL;
173}
174
175void blk_mq_free_tags(struct blk_mq_tags *tags)
176{
177 percpu_ida_destroy(&tags->free_tags);
178 percpu_ida_destroy(&tags->reserved_tags);
179 kfree(tags);
180}
181
182ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
183{
184 char *orig_page = page;
185 int cpu;
186
187 if (!tags)
188 return 0;
189
190 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u,"
191 " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags,
192 tags->nr_batch_move, tags->nr_max_cache);
193
194 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n",
195 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids),
196 percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
197
198 for_each_possible_cpu(cpu) {
199 page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu,
200 percpu_ida_free_tags(&tags->free_tags, cpu));
201 }
202
203 return page - orig_page;
204}
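With the format strings above, the per-hardware-queue "tags" attribute produces a short report along these lines (all numbers invented for illustration):

    nr_tags=65, reserved_tags=1, batch_move=16, max_cache=32
    nr_free=60, nr_reserved=1
      cpu00: nr_free=12
      cpu01: nr_free=0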
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
new file mode 100644
index 000000000000..947ba2c6148e
--- /dev/null
+++ b/block/blk-mq-tag.h
@@ -0,0 +1,27 @@
1#ifndef INT_BLK_MQ_TAG_H
2#define INT_BLK_MQ_TAG_H
3
4struct blk_mq_tags;
5
6extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
7extern void blk_mq_free_tags(struct blk_mq_tags *tags);
8
9extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved);
10extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags);
11extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
12extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
13extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
14extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
15
16enum {
17 BLK_MQ_TAG_CACHE_MIN = 1,
18 BLK_MQ_TAG_CACHE_MAX = 64,
19};
20
21enum {
22 BLK_MQ_TAG_FAIL = -1U,
23 BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN,
24 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
25};
26
27#endif
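Since the header only declares the interface, here is a minimal usage sketch mirroring what blk_mq_alloc_rq() and __blk_mq_free_request() in blk-mq.c below do; the rqs array and the example_* wrappers are illustrative only and assume this header plus <linux/blkdev.h> are already included:

    static struct request *example_get_rq(struct blk_mq_tags *tags,
                                          struct request **rqs, gfp_t gfp)
    {
            struct request *rq;
            unsigned int tag;

            /* false selects the normal pool; pass true for a reserved tag */
            tag = blk_mq_get_tag(tags, gfp, false);
            if (tag == BLK_MQ_TAG_FAIL)
                    return NULL;

            /* the tag doubles as an index into the preallocated request array */
            rq = rqs[tag];
            rq->tag = tag;          /* remembered for blk_mq_put_tag() below */
            return rq;
    }

    static void example_put_rq(struct blk_mq_tags *tags, struct request *rq)
    {
            /* hand the tag back to the percpu_ida pool at completion time */
            blk_mq_put_tag(tags, rq->tag);
    }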
diff --git a/block/blk-mq.c b/block/blk-mq.c
new file mode 100644
index 000000000000..88d4e864d4c0
--- /dev/null
+++ b/block/blk-mq.c
@@ -0,0 +1,1500 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/backing-dev.h>
4#include <linux/bio.h>
5#include <linux/blkdev.h>
6#include <linux/mm.h>
7#include <linux/init.h>
8#include <linux/slab.h>
9#include <linux/workqueue.h>
10#include <linux/smp.h>
11#include <linux/llist.h>
12#include <linux/list_sort.h>
13#include <linux/cpu.h>
14#include <linux/cache.h>
15#include <linux/sched/sysctl.h>
16#include <linux/delay.h>
17
18#include <trace/events/block.h>
19
20#include <linux/blk-mq.h>
21#include "blk.h"
22#include "blk-mq.h"
23#include "blk-mq-tag.h"
24
25static DEFINE_MUTEX(all_q_mutex);
26static LIST_HEAD(all_q_list);
27
28static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
29
30DEFINE_PER_CPU(struct llist_head, ipi_lists);
31
32static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
33 unsigned int cpu)
34{
35 return per_cpu_ptr(q->queue_ctx, cpu);
36}
37
38/*
39 * This assumes per-cpu software queues. They could be per-node
40 * as well, for instance. For now this is hardcoded as-is. Note that we don't
41 * care about preemption, since we know the ctx's are persistent. This does
42 * mean that we can't rely on ctx always matching the currently running CPU.
43 */
44static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
45{
46 return __blk_mq_get_ctx(q, get_cpu());
47}
48
49static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
50{
51 put_cpu();
52}
53
54/*
55 * Check if any of the ctx's have pending work in this hardware queue
56 */
57static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
58{
59 unsigned int i;
60
61 for (i = 0; i < hctx->nr_ctx_map; i++)
62 if (hctx->ctx_map[i])
63 return true;
64
65 return false;
66}
67
68/*
69 * Mark this ctx as having pending work in this hardware queue
70 */
71static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
72 struct blk_mq_ctx *ctx)
73{
74 if (!test_bit(ctx->index_hw, hctx->ctx_map))
75 set_bit(ctx->index_hw, hctx->ctx_map);
76}
77
78static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp,
79 bool reserved)
80{
81 struct request *rq;
82 unsigned int tag;
83
84 tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
85 if (tag != BLK_MQ_TAG_FAIL) {
86 rq = hctx->rqs[tag];
87 rq->tag = tag;
88
89 return rq;
90 }
91
92 return NULL;
93}
94
95static int blk_mq_queue_enter(struct request_queue *q)
96{
97 int ret;
98
99 __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
100 smp_wmb();
101	/* we have problems freezing the queue if it's initializing */
102 if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
103 return 0;
104
105 __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
106
107 spin_lock_irq(q->queue_lock);
108 ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
109 !blk_queue_bypass(q), *q->queue_lock);
110 /* inc usage with lock hold to avoid freeze_queue runs here */
111 if (!ret)
112 __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
113 spin_unlock_irq(q->queue_lock);
114
115 return ret;
116}
117
118static void blk_mq_queue_exit(struct request_queue *q)
119{
120 __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
121}
122
123/*
124 * Guarantee no request is in use, so we can change any data structure of
125 * the queue afterward.
126 */
127static void blk_mq_freeze_queue(struct request_queue *q)
128{
129 bool drain;
130
131 spin_lock_irq(q->queue_lock);
132 drain = !q->bypass_depth++;
133 queue_flag_set(QUEUE_FLAG_BYPASS, q);
134 spin_unlock_irq(q->queue_lock);
135
136 if (!drain)
137 return;
138
139 while (true) {
140 s64 count;
141
142 spin_lock_irq(q->queue_lock);
143 count = percpu_counter_sum(&q->mq_usage_counter);
144 spin_unlock_irq(q->queue_lock);
145
146 if (count == 0)
147 break;
148 blk_mq_run_queues(q, false);
149 msleep(10);
150 }
151}
152
153static void blk_mq_unfreeze_queue(struct request_queue *q)
154{
155 bool wake = false;
156
157 spin_lock_irq(q->queue_lock);
158 if (!--q->bypass_depth) {
159 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
160 wake = true;
161 }
162 WARN_ON_ONCE(q->bypass_depth < 0);
163 spin_unlock_irq(q->queue_lock);
164 if (wake)
165 wake_up_all(&q->mq_freeze_wq);
166}
167
168bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
169{
170 return blk_mq_has_free_tags(hctx->tags);
171}
172EXPORT_SYMBOL(blk_mq_can_queue);
173
174static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq,
175 unsigned int rw_flags)
176{
177 rq->mq_ctx = ctx;
178 rq->cmd_flags = rw_flags;
179 ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
180}
181
182static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
183 gfp_t gfp, bool reserved)
184{
185 return blk_mq_alloc_rq(hctx, gfp, reserved);
186}
187
188static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
189 int rw, gfp_t gfp,
190 bool reserved)
191{
192 struct request *rq;
193
194 do {
195 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
196 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
197
198 rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
199 if (rq) {
200 blk_mq_rq_ctx_init(ctx, rq, rw);
201 break;
202 } else if (!(gfp & __GFP_WAIT))
203 break;
204
205 blk_mq_put_ctx(ctx);
206 __blk_mq_run_hw_queue(hctx);
207 blk_mq_wait_for_tags(hctx->tags);
208 } while (1);
209
210 return rq;
211}
212
213struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
214 gfp_t gfp, bool reserved)
215{
216 struct request *rq;
217
218 if (blk_mq_queue_enter(q))
219 return NULL;
220
221 rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
222 blk_mq_put_ctx(rq->mq_ctx);
223 return rq;
224}
225
226struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
227 gfp_t gfp)
228{
229 struct request *rq;
230
231 if (blk_mq_queue_enter(q))
232 return NULL;
233
234 rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
235 blk_mq_put_ctx(rq->mq_ctx);
236 return rq;
237}
238EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
239
240/*
241 * Re-init and set pdu, if we have it
242 */
243static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
244{
245 blk_rq_init(hctx->queue, rq);
246
247 if (hctx->cmd_size)
248 rq->special = blk_mq_rq_to_pdu(rq);
249}
250
251static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
252 struct blk_mq_ctx *ctx, struct request *rq)
253{
254 const int tag = rq->tag;
255 struct request_queue *q = rq->q;
256
257 blk_mq_rq_init(hctx, rq);
258 blk_mq_put_tag(hctx->tags, tag);
259
260 blk_mq_queue_exit(q);
261}
262
263void blk_mq_free_request(struct request *rq)
264{
265 struct blk_mq_ctx *ctx = rq->mq_ctx;
266 struct blk_mq_hw_ctx *hctx;
267 struct request_queue *q = rq->q;
268
269 ctx->rq_completed[rq_is_sync(rq)]++;
270
271 hctx = q->mq_ops->map_queue(q, ctx->cpu);
272 __blk_mq_free_request(hctx, ctx, rq);
273}
274
275static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
276{
277 if (error)
278 clear_bit(BIO_UPTODATE, &bio->bi_flags);
279 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
280 error = -EIO;
281
282 if (unlikely(rq->cmd_flags & REQ_QUIET))
283 set_bit(BIO_QUIET, &bio->bi_flags);
284
285 /* don't actually finish bio if it's part of flush sequence */
286 if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
287 bio_endio(bio, error);
288}
289
290void blk_mq_complete_request(struct request *rq, int error)
291{
292 struct bio *bio = rq->bio;
293 unsigned int bytes = 0;
294
295 trace_block_rq_complete(rq->q, rq);
296
297 while (bio) {
298 struct bio *next = bio->bi_next;
299
300 bio->bi_next = NULL;
301 bytes += bio->bi_size;
302 blk_mq_bio_endio(rq, bio, error);
303 bio = next;
304 }
305
306 blk_account_io_completion(rq, bytes);
307
308 if (rq->end_io)
309 rq->end_io(rq, error);
310 else
311 blk_mq_free_request(rq);
312
313 blk_account_io_done(rq);
314}
315
316void __blk_mq_end_io(struct request *rq, int error)
317{
318 if (!blk_mark_rq_complete(rq))
319 blk_mq_complete_request(rq, error);
320}
321
322#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
323
324/*
325 * Called with interrupts disabled.
326 */
327static void ipi_end_io(void *data)
328{
329 struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id());
330 struct llist_node *entry, *next;
331 struct request *rq;
332
333 entry = llist_del_all(list);
334
335 while (entry) {
336 next = entry->next;
337 rq = llist_entry(entry, struct request, ll_list);
338 __blk_mq_end_io(rq, rq->errors);
339 entry = next;
340 }
341}
342
343static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
344 struct request *rq, const int error)
345{
346 struct call_single_data *data = &rq->csd;
347
348 rq->errors = error;
349 rq->ll_list.next = NULL;
350
351 /*
352 * If the list is non-empty, an existing IPI must already
353 * be "in flight". If that is the case, we need not schedule
354 * a new one.
355 */
356 if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {
357 data->func = ipi_end_io;
358 data->flags = 0;
359 __smp_call_function_single(ctx->cpu, data, 0);
360 }
361
362 return true;
363}
364#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
365static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
366 struct request *rq, const int error)
367{
368 return false;
369}
370#endif
371
372/*
373 * End IO on this request on a multiqueue enabled driver. We'll either do
374 * it directly inline, or punt to a local IPI handler on the matching
375 * remote CPU.
376 */
377void blk_mq_end_io(struct request *rq, int error)
378{
379 struct blk_mq_ctx *ctx = rq->mq_ctx;
380 int cpu;
381
382 if (!ctx->ipi_redirect)
383 return __blk_mq_end_io(rq, error);
384
385 cpu = get_cpu();
386
387 if (cpu == ctx->cpu || !cpu_online(ctx->cpu) ||
388 !ipi_remote_cpu(ctx, cpu, rq, error))
389 __blk_mq_end_io(rq, error);
390
391 put_cpu();
392}
393EXPORT_SYMBOL(blk_mq_end_io);
394
395static void blk_mq_start_request(struct request *rq)
396{
397 struct request_queue *q = rq->q;
398
399 trace_block_rq_issue(q, rq);
400
401 /*
402 * Just mark start time and set the started bit. Due to memory
403 * ordering, we know we'll see the correct deadline as long as
 404	 * REQ_ATOM_STARTED is seen.
405 */
406 rq->deadline = jiffies + q->rq_timeout;
407 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
408}
409
410static void blk_mq_requeue_request(struct request *rq)
411{
412 struct request_queue *q = rq->q;
413
414 trace_block_rq_requeue(q, rq);
415 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
416}
417
418struct blk_mq_timeout_data {
419 struct blk_mq_hw_ctx *hctx;
420 unsigned long *next;
421 unsigned int *next_set;
422};
423
424static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
425{
426 struct blk_mq_timeout_data *data = __data;
427 struct blk_mq_hw_ctx *hctx = data->hctx;
428 unsigned int tag;
429
430 /* It may not be in flight yet (this is where
 431	 * the REQ_ATOM_STARTED flag comes in). The requests are
432 * statically allocated, so we know it's always safe to access the
433 * memory associated with a bit offset into ->rqs[].
434 */
435 tag = 0;
436 do {
437 struct request *rq;
438
439 tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
440 if (tag >= hctx->queue_depth)
441 break;
442
443 rq = hctx->rqs[tag++];
444
445 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
446 continue;
447
448 blk_rq_check_expired(rq, data->next, data->next_set);
449 } while (1);
450}
451
452static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
453 unsigned long *next,
454 unsigned int *next_set)
455{
456 struct blk_mq_timeout_data data = {
457 .hctx = hctx,
458 .next = next,
459 .next_set = next_set,
460 };
461
462 /*
463 * Ask the tagging code to iterate busy requests, so we can
464 * check them for timeout.
465 */
466 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
467}
468
469static void blk_mq_rq_timer(unsigned long data)
470{
471 struct request_queue *q = (struct request_queue *) data;
472 struct blk_mq_hw_ctx *hctx;
473 unsigned long next = 0;
474 int i, next_set = 0;
475
476 queue_for_each_hw_ctx(q, hctx, i)
477 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
478
479 if (next_set)
480 mod_timer(&q->timeout, round_jiffies_up(next));
481}
482
483/*
484 * Reverse check our software queue for entries that we could potentially
485 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
486 * too much time checking for merges.
487 */
488static bool blk_mq_attempt_merge(struct request_queue *q,
489 struct blk_mq_ctx *ctx, struct bio *bio)
490{
491 struct request *rq;
492 int checked = 8;
493
494 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
495 int el_ret;
496
497 if (!checked--)
498 break;
499
500 if (!blk_rq_merge_ok(rq, bio))
501 continue;
502
503 el_ret = blk_try_merge(rq, bio);
504 if (el_ret == ELEVATOR_BACK_MERGE) {
505 if (bio_attempt_back_merge(q, rq, bio)) {
506 ctx->rq_merged++;
507 return true;
508 }
509 break;
510 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
511 if (bio_attempt_front_merge(q, rq, bio)) {
512 ctx->rq_merged++;
513 return true;
514 }
515 break;
516 }
517 }
518
519 return false;
520}
521
522void blk_mq_add_timer(struct request *rq)
523{
524 __blk_add_timer(rq, NULL);
525}
526
527/*
528 * Run this hardware queue, pulling any software queues mapped to it in.
529 * Note that this function currently has various problems around ordering
530 * of IO. In particular, we'd like FIFO behaviour on handling existing
531 * items on the hctx->dispatch list. Ignore that for now.
532 */
533static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
534{
535 struct request_queue *q = hctx->queue;
536 struct blk_mq_ctx *ctx;
537 struct request *rq;
538 LIST_HEAD(rq_list);
539 int bit, queued;
540
 541	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
542 return;
543
544 hctx->run++;
545
546 /*
547 * Touch any software queue that has pending entries.
548 */
549 for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
550 clear_bit(bit, hctx->ctx_map);
551 ctx = hctx->ctxs[bit];
552 BUG_ON(bit != ctx->index_hw);
553
554 spin_lock(&ctx->lock);
555 list_splice_tail_init(&ctx->rq_list, &rq_list);
556 spin_unlock(&ctx->lock);
557 }
558
559 /*
560 * If we have previous entries on our dispatch list, grab them
561 * and stuff them at the front for more fair dispatch.
562 */
563 if (!list_empty_careful(&hctx->dispatch)) {
564 spin_lock(&hctx->lock);
565 if (!list_empty(&hctx->dispatch))
566 list_splice_init(&hctx->dispatch, &rq_list);
567 spin_unlock(&hctx->lock);
568 }
569
570 /*
571 * Delete and return all entries from our dispatch list
572 */
573 queued = 0;
574
575 /*
576 * Now process all the entries, sending them to the driver.
577 */
578 while (!list_empty(&rq_list)) {
579 int ret;
580
581 rq = list_first_entry(&rq_list, struct request, queuelist);
582 list_del_init(&rq->queuelist);
583 blk_mq_start_request(rq);
584
585 /*
586 * Last request in the series. Flag it as such, this
587 * enables drivers to know when IO should be kicked off,
588 * if they don't do it on a per-request basis.
589 *
 590		 * Note: the flag isn't the only condition on which drivers
 591		 * should kick off IO. If the drive is busy, the last
592 * request might not have the bit set.
593 */
594 if (list_empty(&rq_list))
595 rq->cmd_flags |= REQ_END;
596
597 ret = q->mq_ops->queue_rq(hctx, rq);
598 switch (ret) {
599 case BLK_MQ_RQ_QUEUE_OK:
600 queued++;
601 continue;
602 case BLK_MQ_RQ_QUEUE_BUSY:
603 /*
604 * FIXME: we should have a mechanism to stop the queue
605 * like blk_stop_queue, otherwise we will waste cpu
606 * time
607 */
608 list_add(&rq->queuelist, &rq_list);
609 blk_mq_requeue_request(rq);
610 break;
611 default:
612 pr_err("blk-mq: bad return on queue: %d\n", ret);
613 rq->errors = -EIO;
614 case BLK_MQ_RQ_QUEUE_ERROR:
615 blk_mq_end_io(rq, rq->errors);
616 break;
617 }
618
619 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
620 break;
621 }
622
623 if (!queued)
624 hctx->dispatched[0]++;
625 else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
626 hctx->dispatched[ilog2(queued) + 1]++;
627
628 /*
629 * Any items that need requeuing? Stuff them into hctx->dispatch,
630 * that is where we will continue on next queue run.
631 */
632 if (!list_empty(&rq_list)) {
633 spin_lock(&hctx->lock);
634 list_splice(&rq_list, &hctx->dispatch);
635 spin_unlock(&hctx->lock);
636 }
637}
638
639void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
640{
 641	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
642 return;
643
644 if (!async)
645 __blk_mq_run_hw_queue(hctx);
646 else {
647 struct request_queue *q = hctx->queue;
648
649 kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
650 }
651}
652
653void blk_mq_run_queues(struct request_queue *q, bool async)
654{
655 struct blk_mq_hw_ctx *hctx;
656 int i;
657
658 queue_for_each_hw_ctx(q, hctx, i) {
659 if ((!blk_mq_hctx_has_pending(hctx) &&
660 list_empty_careful(&hctx->dispatch)) ||
 661		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
662 continue;
663
664 blk_mq_run_hw_queue(hctx, async);
665 }
666}
667EXPORT_SYMBOL(blk_mq_run_queues);
668
669void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
670{
671 cancel_delayed_work(&hctx->delayed_work);
672 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
673}
674EXPORT_SYMBOL(blk_mq_stop_hw_queue);
675
676void blk_mq_stop_hw_queues(struct request_queue *q)
677{
678 struct blk_mq_hw_ctx *hctx;
679 int i;
680
681 queue_for_each_hw_ctx(q, hctx, i)
682 blk_mq_stop_hw_queue(hctx);
683}
684EXPORT_SYMBOL(blk_mq_stop_hw_queues);
685
686void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
687{
688 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
689 __blk_mq_run_hw_queue(hctx);
690}
691EXPORT_SYMBOL(blk_mq_start_hw_queue);
692
693void blk_mq_start_stopped_hw_queues(struct request_queue *q)
694{
695 struct blk_mq_hw_ctx *hctx;
696 int i;
697
698 queue_for_each_hw_ctx(q, hctx, i) {
699 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
700 continue;
701
702 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
703 blk_mq_run_hw_queue(hctx, true);
704 }
705}
706EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
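/*
 * Illustration only, not part of the patch: the stop/start helpers above pair
 * naturally with BLK_MQ_RQ_QUEUE_BUSY. A driver whose hardware has no room can
 * stop the queue from its ->queue_rq() handler and restart any stopped queues
 * from its completion path once space frees up. The my_device_* helpers are
 * hypothetical stand-ins for driver code.
 */
static bool my_device_has_room(struct blk_mq_hw_ctx *hctx);
static void my_device_submit(struct blk_mq_hw_ctx *hctx, struct request *rq);

static int my_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
        if (!my_device_has_room(hctx)) {
                blk_mq_stop_hw_queue(hctx);
                /* the core parks rq on hctx->dispatch for the next queue run */
                return BLK_MQ_RQ_QUEUE_BUSY;
        }

        my_device_submit(hctx, rq);
        return BLK_MQ_RQ_QUEUE_OK;
}

static void my_completion_handler(struct request_queue *q)
{
        /* resources freed up: re-run every hardware queue we stopped earlier */
        blk_mq_start_stopped_hw_queues(q);
}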
707
708static void blk_mq_work_fn(struct work_struct *work)
709{
710 struct blk_mq_hw_ctx *hctx;
711
712 hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
713 __blk_mq_run_hw_queue(hctx);
714}
715
716static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
717 struct request *rq)
718{
719 struct blk_mq_ctx *ctx = rq->mq_ctx;
720
721 list_add_tail(&rq->queuelist, &ctx->rq_list);
722 blk_mq_hctx_mark_pending(hctx, ctx);
723
724 /*
725 * We do this early, to ensure we are on the right CPU.
726 */
727 blk_mq_add_timer(rq);
728}
729
730void blk_mq_insert_request(struct request_queue *q, struct request *rq,
731 bool run_queue)
732{
733 struct blk_mq_hw_ctx *hctx;
734 struct blk_mq_ctx *ctx, *current_ctx;
735
736 ctx = rq->mq_ctx;
737 hctx = q->mq_ops->map_queue(q, ctx->cpu);
738
739 if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
740 blk_insert_flush(rq);
741 } else {
742 current_ctx = blk_mq_get_ctx(q);
743
744 if (!cpu_online(ctx->cpu)) {
745 ctx = current_ctx;
746 hctx = q->mq_ops->map_queue(q, ctx->cpu);
747 rq->mq_ctx = ctx;
748 }
749 spin_lock(&ctx->lock);
750 __blk_mq_insert_request(hctx, rq);
751 spin_unlock(&ctx->lock);
752
753 blk_mq_put_ctx(current_ctx);
754 }
755
756 if (run_queue)
757 __blk_mq_run_hw_queue(hctx);
758}
759EXPORT_SYMBOL(blk_mq_insert_request);
760
761/*
762 * This is a special version of blk_mq_insert_request to bypass FLUSH request
763 * check. Should only be used internally.
764 */
765void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
766{
767 struct request_queue *q = rq->q;
768 struct blk_mq_hw_ctx *hctx;
769 struct blk_mq_ctx *ctx, *current_ctx;
770
771 current_ctx = blk_mq_get_ctx(q);
772
773 ctx = rq->mq_ctx;
774 if (!cpu_online(ctx->cpu)) {
775 ctx = current_ctx;
776 rq->mq_ctx = ctx;
777 }
778 hctx = q->mq_ops->map_queue(q, ctx->cpu);
779
780 /* ctx->cpu might be offline */
781 spin_lock(&ctx->lock);
782 __blk_mq_insert_request(hctx, rq);
783 spin_unlock(&ctx->lock);
784
785 blk_mq_put_ctx(current_ctx);
786
787 if (run_queue)
788 blk_mq_run_hw_queue(hctx, async);
789}
790
791static void blk_mq_insert_requests(struct request_queue *q,
792 struct blk_mq_ctx *ctx,
793 struct list_head *list,
794 int depth,
795 bool from_schedule)
796
797{
798 struct blk_mq_hw_ctx *hctx;
799 struct blk_mq_ctx *current_ctx;
800
801 trace_block_unplug(q, depth, !from_schedule);
802
803 current_ctx = blk_mq_get_ctx(q);
804
805 if (!cpu_online(ctx->cpu))
806 ctx = current_ctx;
807 hctx = q->mq_ops->map_queue(q, ctx->cpu);
808
809 /*
810 * preemption doesn't flush plug list, so it's possible ctx->cpu is
811 * offline now
812 */
813 spin_lock(&ctx->lock);
814 while (!list_empty(list)) {
815 struct request *rq;
816
817 rq = list_first_entry(list, struct request, queuelist);
818 list_del_init(&rq->queuelist);
819 rq->mq_ctx = ctx;
820 __blk_mq_insert_request(hctx, rq);
821 }
822 spin_unlock(&ctx->lock);
823
824 blk_mq_put_ctx(current_ctx);
825
826 blk_mq_run_hw_queue(hctx, from_schedule);
827}
828
829static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
830{
831 struct request *rqa = container_of(a, struct request, queuelist);
832 struct request *rqb = container_of(b, struct request, queuelist);
833
834 return !(rqa->mq_ctx < rqb->mq_ctx ||
835 (rqa->mq_ctx == rqb->mq_ctx &&
836 blk_rq_pos(rqa) < blk_rq_pos(rqb)));
837}
838
839void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
840{
841 struct blk_mq_ctx *this_ctx;
842 struct request_queue *this_q;
843 struct request *rq;
844 LIST_HEAD(list);
845 LIST_HEAD(ctx_list);
846 unsigned int depth;
847
848 list_splice_init(&plug->mq_list, &list);
849
850 list_sort(NULL, &list, plug_ctx_cmp);
851
852 this_q = NULL;
853 this_ctx = NULL;
854 depth = 0;
855
856 while (!list_empty(&list)) {
857 rq = list_entry_rq(list.next);
858 list_del_init(&rq->queuelist);
859 BUG_ON(!rq->q);
860 if (rq->mq_ctx != this_ctx) {
861 if (this_ctx) {
862 blk_mq_insert_requests(this_q, this_ctx,
863 &ctx_list, depth,
864 from_schedule);
865 }
866
867 this_ctx = rq->mq_ctx;
868 this_q = rq->q;
869 depth = 0;
870 }
871
872 depth++;
873 list_add_tail(&rq->queuelist, &ctx_list);
874 }
875
876 /*
877 * If 'this_ctx' is set, we know we have entries to complete
878 * on 'ctx_list'. Do those.
879 */
880 if (this_ctx) {
881 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
882 from_schedule);
883 }
884}
885
886static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
887{
888 init_request_from_bio(rq, bio);
889 blk_account_io_start(rq, 1);
890}
891
892static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
893{
894 struct blk_mq_hw_ctx *hctx;
895 struct blk_mq_ctx *ctx;
896 const int is_sync = rw_is_sync(bio->bi_rw);
897 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
898 int rw = bio_data_dir(bio);
899 struct request *rq;
900 unsigned int use_plug, request_count = 0;
901
902 /*
903 * If we have multiple hardware queues, just go directly to
904 * one of those for sync IO.
905 */
906 use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);
907
908 blk_queue_bounce(q, &bio);
909
910 if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
911 return;
912
913 if (blk_mq_queue_enter(q)) {
914 bio_endio(bio, -EIO);
915 return;
916 }
917
918 ctx = blk_mq_get_ctx(q);
919 hctx = q->mq_ops->map_queue(q, ctx->cpu);
920
921 trace_block_getrq(q, bio, rw);
922 rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
923 if (likely(rq))
924 blk_mq_rq_ctx_init(ctx, rq, rw);
925 else {
926 blk_mq_put_ctx(ctx);
927 trace_block_sleeprq(q, bio, rw);
928 rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
929 false);
930 ctx = rq->mq_ctx;
931 hctx = q->mq_ops->map_queue(q, ctx->cpu);
932 }
933
934 hctx->queued++;
935
936 if (unlikely(is_flush_fua)) {
937 blk_mq_bio_to_request(rq, bio);
938 blk_mq_put_ctx(ctx);
939 blk_insert_flush(rq);
940 goto run_queue;
941 }
942
943 /*
 944	 * If a task plug currently exists, use it: since this is completely
 945	 * lockless, we can utilize it to temporarily store requests until the
 946	 * task is either done or scheduled away.
947 */
948 if (use_plug) {
949 struct blk_plug *plug = current->plug;
950
951 if (plug) {
952 blk_mq_bio_to_request(rq, bio);
953 if (list_empty(&plug->mq_list))
954 trace_block_plug(q);
955 else if (request_count >= BLK_MAX_REQUEST_COUNT) {
956 blk_flush_plug_list(plug, false);
957 trace_block_plug(q);
958 }
959 list_add_tail(&rq->queuelist, &plug->mq_list);
960 blk_mq_put_ctx(ctx);
961 return;
962 }
963 }
964
965 spin_lock(&ctx->lock);
966
967 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
968 blk_mq_attempt_merge(q, ctx, bio))
969 __blk_mq_free_request(hctx, ctx, rq);
970 else {
971 blk_mq_bio_to_request(rq, bio);
972 __blk_mq_insert_request(hctx, rq);
973 }
974
975 spin_unlock(&ctx->lock);
976 blk_mq_put_ctx(ctx);
977
978 /*
979 * For a SYNC request, send it to the hardware immediately. For an
980 * ASYNC request, just ensure that we run it later on. The latter
981 * allows for merging opportunities and more efficient dispatching.
982 */
983run_queue:
984 blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
985}
986
987/*
988 * Default mapping to a software queue, since we use one per CPU.
989 */
990struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
991{
992 return q->queue_hw_ctx[q->mq_map[cpu]];
993}
994EXPORT_SYMBOL(blk_mq_map_queue);
995
996struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
997 unsigned int hctx_index)
998{
999 return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
1000 GFP_KERNEL | __GFP_ZERO, reg->numa_node);
1001}
1002EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
1003
1004void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
1005 unsigned int hctx_index)
1006{
1007 kfree(hctx);
1008}
1009EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
1010
1011static void blk_mq_hctx_notify(void *data, unsigned long action,
1012 unsigned int cpu)
1013{
1014 struct blk_mq_hw_ctx *hctx = data;
1015 struct blk_mq_ctx *ctx;
1016 LIST_HEAD(tmp);
1017
1018 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1019 return;
1020
1021 /*
1022 * Move ctx entries to new CPU, if this one is going away.
1023 */
1024 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
1025
1026 spin_lock(&ctx->lock);
1027 if (!list_empty(&ctx->rq_list)) {
1028 list_splice_init(&ctx->rq_list, &tmp);
1029 clear_bit(ctx->index_hw, hctx->ctx_map);
1030 }
1031 spin_unlock(&ctx->lock);
1032
1033 if (list_empty(&tmp))
1034 return;
1035
1036 ctx = blk_mq_get_ctx(hctx->queue);
1037 spin_lock(&ctx->lock);
1038
1039 while (!list_empty(&tmp)) {
1040 struct request *rq;
1041
1042 rq = list_first_entry(&tmp, struct request, queuelist);
1043 rq->mq_ctx = ctx;
1044 list_move_tail(&rq->queuelist, &ctx->rq_list);
1045 }
1046
1047 blk_mq_hctx_mark_pending(hctx, ctx);
1048
1049 spin_unlock(&ctx->lock);
1050 blk_mq_put_ctx(ctx);
1051}
1052
1053static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
1054 void (*init)(void *, struct blk_mq_hw_ctx *,
1055 struct request *, unsigned int),
1056 void *data)
1057{
1058 unsigned int i;
1059
1060 for (i = 0; i < hctx->queue_depth; i++) {
1061 struct request *rq = hctx->rqs[i];
1062
1063 init(data, hctx, rq, i);
1064 }
1065}
1066
1067void blk_mq_init_commands(struct request_queue *q,
1068 void (*init)(void *, struct blk_mq_hw_ctx *,
1069 struct request *, unsigned int),
1070 void *data)
1071{
1072 struct blk_mq_hw_ctx *hctx;
1073 unsigned int i;
1074
1075 queue_for_each_hw_ctx(q, hctx, i)
1076 blk_mq_init_hw_commands(hctx, init, data);
1077}
1078EXPORT_SYMBOL(blk_mq_init_commands);
1079
1080static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
1081{
1082 struct page *page;
1083
1084 while (!list_empty(&hctx->page_list)) {
1085 page = list_first_entry(&hctx->page_list, struct page, list);
1086 list_del_init(&page->list);
1087 __free_pages(page, page->private);
1088 }
1089
1090 kfree(hctx->rqs);
1091
1092 if (hctx->tags)
1093 blk_mq_free_tags(hctx->tags);
1094}
1095
1096static size_t order_to_size(unsigned int order)
1097{
1098 size_t ret = PAGE_SIZE;
1099
1100 while (order--)
1101 ret *= 2;
1102
1103 return ret;
1104}
1105
1106static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
1107 unsigned int reserved_tags, int node)
1108{
1109 unsigned int i, j, entries_per_page, max_order = 4;
1110 size_t rq_size, left;
1111
1112 INIT_LIST_HEAD(&hctx->page_list);
1113
1114 hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
1115 GFP_KERNEL, node);
1116 if (!hctx->rqs)
1117 return -ENOMEM;
1118
1119 /*
1120 * rq_size is the size of the request plus driver payload, rounded
1121 * to the cacheline size
1122 */
1123 rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
1124 cache_line_size());
1125 left = rq_size * hctx->queue_depth;
1126
1127 for (i = 0; i < hctx->queue_depth;) {
1128 int this_order = max_order;
1129 struct page *page;
1130 int to_do;
1131 void *p;
1132
1133 while (left < order_to_size(this_order - 1) && this_order)
1134 this_order--;
1135
1136 do {
1137 page = alloc_pages_node(node, GFP_KERNEL, this_order);
1138 if (page)
1139 break;
1140 if (!this_order--)
1141 break;
1142 if (order_to_size(this_order) < rq_size)
1143 break;
1144 } while (1);
1145
1146 if (!page)
1147 break;
1148
1149 page->private = this_order;
1150 list_add_tail(&page->list, &hctx->page_list);
1151
1152 p = page_address(page);
1153 entries_per_page = order_to_size(this_order) / rq_size;
1154 to_do = min(entries_per_page, hctx->queue_depth - i);
1155 left -= to_do * rq_size;
1156 for (j = 0; j < to_do; j++) {
1157 hctx->rqs[i] = p;
1158 blk_mq_rq_init(hctx, hctx->rqs[i]);
1159 p += rq_size;
1160 i++;
1161 }
1162 }
1163
1164 if (i < (reserved_tags + BLK_MQ_TAG_MIN))
1165 goto err_rq_map;
1166 else if (i != hctx->queue_depth) {
1167 hctx->queue_depth = i;
1168 pr_warn("%s: queue depth set to %u because of low memory\n",
1169 __func__, i);
1170 }
1171
1172 hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
1173 if (!hctx->tags) {
1174err_rq_map:
1175 blk_mq_free_rq_map(hctx);
1176 return -ENOMEM;
1177 }
1178
1179 return 0;
1180}
1181
1182static int blk_mq_init_hw_queues(struct request_queue *q,
1183 struct blk_mq_reg *reg, void *driver_data)
1184{
1185 struct blk_mq_hw_ctx *hctx;
1186 unsigned int i, j;
1187
1188 /*
1189 * Initialize hardware queues
1190 */
1191 queue_for_each_hw_ctx(q, hctx, i) {
1192 unsigned int num_maps;
1193 int node;
1194
1195 node = hctx->numa_node;
1196 if (node == NUMA_NO_NODE)
1197 node = hctx->numa_node = reg->numa_node;
1198
1199 INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
1200 spin_lock_init(&hctx->lock);
1201 INIT_LIST_HEAD(&hctx->dispatch);
1202 hctx->queue = q;
1203 hctx->queue_num = i;
1204 hctx->flags = reg->flags;
1205 hctx->queue_depth = reg->queue_depth;
1206 hctx->cmd_size = reg->cmd_size;
1207
1208 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1209 blk_mq_hctx_notify, hctx);
1210 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1211
1212 if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
1213 break;
1214
1215 /*
1216		 * Allocate space for all possible cpus to avoid allocation at
1217 * runtime
1218 */
1219 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1220 GFP_KERNEL, node);
1221 if (!hctx->ctxs)
1222 break;
1223
1224 num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
1225 hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
1226 GFP_KERNEL, node);
1227 if (!hctx->ctx_map)
1228 break;
1229
1230 hctx->nr_ctx_map = num_maps;
1231 hctx->nr_ctx = 0;
1232
1233 if (reg->ops->init_hctx &&
1234 reg->ops->init_hctx(hctx, driver_data, i))
1235 break;
1236 }
1237
1238 if (i == q->nr_hw_queues)
1239 return 0;
1240
1241 /*
1242 * Init failed
1243 */
1244 queue_for_each_hw_ctx(q, hctx, j) {
1245 if (i == j)
1246 break;
1247
1248 if (reg->ops->exit_hctx)
1249 reg->ops->exit_hctx(hctx, j);
1250
1251 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1252 blk_mq_free_rq_map(hctx);
1253 kfree(hctx->ctxs);
1254 }
1255
1256 return 1;
1257}
1258
1259static void blk_mq_init_cpu_queues(struct request_queue *q,
1260 unsigned int nr_hw_queues)
1261{
1262 unsigned int i;
1263
1264 for_each_possible_cpu(i) {
1265 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1266 struct blk_mq_hw_ctx *hctx;
1267
1268 memset(__ctx, 0, sizeof(*__ctx));
1269 __ctx->cpu = i;
1270 spin_lock_init(&__ctx->lock);
1271 INIT_LIST_HEAD(&__ctx->rq_list);
1272 __ctx->queue = q;
1273
1274 /* If the cpu isn't online, the cpu is mapped to first hctx */
1275 hctx = q->mq_ops->map_queue(q, i);
1276 hctx->nr_ctx++;
1277
1278 if (!cpu_online(i))
1279 continue;
1280
1281 /*
1282 * Set local node, IFF we have more than one hw queue. If
1283 * not, we remain on the home node of the device
1284 */
1285 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1286 hctx->numa_node = cpu_to_node(i);
1287 }
1288}
1289
1290static void blk_mq_map_swqueue(struct request_queue *q)
1291{
1292 unsigned int i;
1293 struct blk_mq_hw_ctx *hctx;
1294 struct blk_mq_ctx *ctx;
1295
1296 queue_for_each_hw_ctx(q, hctx, i) {
1297 hctx->nr_ctx = 0;
1298 }
1299
1300 /*
1301 * Map software to hardware queues
1302 */
1303 queue_for_each_ctx(q, ctx, i) {
1304 /* If the cpu isn't online, the cpu is mapped to first hctx */
1305 hctx = q->mq_ops->map_queue(q, i);
1306 ctx->index_hw = hctx->nr_ctx;
1307 hctx->ctxs[hctx->nr_ctx++] = ctx;
1308 }
1309}
1310
1311struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
1312 void *driver_data)
1313{
1314 struct blk_mq_hw_ctx **hctxs;
1315 struct blk_mq_ctx *ctx;
1316 struct request_queue *q;
1317 int i;
1318
1319 if (!reg->nr_hw_queues ||
1320 !reg->ops->queue_rq || !reg->ops->map_queue ||
1321 !reg->ops->alloc_hctx || !reg->ops->free_hctx)
1322 return ERR_PTR(-EINVAL);
1323
1324 if (!reg->queue_depth)
1325 reg->queue_depth = BLK_MQ_MAX_DEPTH;
1326 else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
1327 pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
1328 reg->queue_depth = BLK_MQ_MAX_DEPTH;
1329 }
1330
1331 /*
1332 * Set aside a tag for flush requests. It will only be used while
1333 * another flush request is in progress but outside the driver.
1334 *
1335 * TODO: only allocate if flushes are supported
1336 */
1337 reg->queue_depth++;
1338 reg->reserved_tags++;
1339
1340 if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
1341 return ERR_PTR(-EINVAL);
1342
1343 ctx = alloc_percpu(struct blk_mq_ctx);
1344 if (!ctx)
1345 return ERR_PTR(-ENOMEM);
1346
1347 hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1348 reg->numa_node);
1349
1350 if (!hctxs)
1351 goto err_percpu;
1352
1353 for (i = 0; i < reg->nr_hw_queues; i++) {
1354 hctxs[i] = reg->ops->alloc_hctx(reg, i);
1355 if (!hctxs[i])
1356 goto err_hctxs;
1357
1358 hctxs[i]->numa_node = NUMA_NO_NODE;
1359 hctxs[i]->queue_num = i;
1360 }
1361
1362 q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
1363 if (!q)
1364 goto err_hctxs;
1365
1366 q->mq_map = blk_mq_make_queue_map(reg);
1367 if (!q->mq_map)
1368 goto err_map;
1369
1370 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1371 blk_queue_rq_timeout(q, 30000);
1372
1373 q->nr_queues = nr_cpu_ids;
1374 q->nr_hw_queues = reg->nr_hw_queues;
1375
1376 q->queue_ctx = ctx;
1377 q->queue_hw_ctx = hctxs;
1378
1379 q->mq_ops = reg->ops;
1380
1381 blk_queue_make_request(q, blk_mq_make_request);
1382 blk_queue_rq_timed_out(q, reg->ops->timeout);
1383 if (reg->timeout)
1384 blk_queue_rq_timeout(q, reg->timeout);
1385
1386 blk_mq_init_flush(q);
1387 blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
1388
1389 if (blk_mq_init_hw_queues(q, reg, driver_data))
1390 goto err_hw;
1391
1392 blk_mq_map_swqueue(q);
1393
1394 mutex_lock(&all_q_mutex);
1395 list_add_tail(&q->all_q_node, &all_q_list);
1396 mutex_unlock(&all_q_mutex);
1397
1398 return q;
1399err_hw:
1400 kfree(q->mq_map);
1401err_map:
1402 blk_cleanup_queue(q);
1403err_hctxs:
1404 for (i = 0; i < reg->nr_hw_queues; i++) {
1405 if (!hctxs[i])
1406 break;
1407 reg->ops->free_hctx(hctxs[i], i);
1408 }
1409 kfree(hctxs);
1410err_percpu:
1411 free_percpu(ctx);
1412 return ERR_PTR(-ENOMEM);
1413}
1414EXPORT_SYMBOL(blk_mq_init_queue);
1415
1416void blk_mq_free_queue(struct request_queue *q)
1417{
1418 struct blk_mq_hw_ctx *hctx;
1419 int i;
1420
1421 queue_for_each_hw_ctx(q, hctx, i) {
1422 cancel_delayed_work_sync(&hctx->delayed_work);
1423 kfree(hctx->ctx_map);
1424 kfree(hctx->ctxs);
1425 blk_mq_free_rq_map(hctx);
1426 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1427 if (q->mq_ops->exit_hctx)
1428 q->mq_ops->exit_hctx(hctx, i);
1429 q->mq_ops->free_hctx(hctx, i);
1430 }
1431
1432 free_percpu(q->queue_ctx);
1433 kfree(q->queue_hw_ctx);
1434 kfree(q->mq_map);
1435
1436 q->queue_ctx = NULL;
1437 q->queue_hw_ctx = NULL;
1438 q->mq_map = NULL;
1439
1440 mutex_lock(&all_q_mutex);
1441 list_del_init(&q->all_q_node);
1442 mutex_unlock(&all_q_mutex);
1443}
1444EXPORT_SYMBOL(blk_mq_free_queue);
1445
1446/* Basically redo blk_mq_init_queue with queue frozen */
1447static void __cpuinit blk_mq_queue_reinit(struct request_queue *q)
1448{
1449 blk_mq_freeze_queue(q);
1450
1451 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1452
1453 /*
1454 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
1455 * we should change hctx numa_node according to new topology (this
1456	 * involves freeing and re-allocating memory; worth doing?)
1457 */
1458
1459 blk_mq_map_swqueue(q);
1460
1461 blk_mq_unfreeze_queue(q);
1462}
1463
1464static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb,
1465 unsigned long action, void *hcpu)
1466{
1467 struct request_queue *q;
1468
1469 /*
1470	 * Before the new mapping is established, a hotplugged cpu might already
1471	 * start handling requests. This doesn't break anything, as we map offline
1472	 * CPUs to the first hardware queue. We will re-init the queue below to get
1473 * optimal settings.
1474 */
1475 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1476 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
1477 return NOTIFY_OK;
1478
1479 mutex_lock(&all_q_mutex);
1480 list_for_each_entry(q, &all_q_list, all_q_node)
1481 blk_mq_queue_reinit(q);
1482 mutex_unlock(&all_q_mutex);
1483 return NOTIFY_OK;
1484}
1485
1486static int __init blk_mq_init(void)
1487{
1488 unsigned int i;
1489
1490 for_each_possible_cpu(i)
1491 init_llist_head(&per_cpu(ipi_lists, i));
1492
1493 blk_mq_cpu_init();
1494
1495 /* Must be called after percpu_counter_hotcpu_callback() */
1496 hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
1497
1498 return 0;
1499}
1500subsys_initcall(blk_mq_init);
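To tie the pieces together, a rough sketch of the driver-facing side follows. The blk_mq_reg and blk_mq_ops definitions live in include/linux/blk-mq.h, which is not part of this hunk, so the fields and callbacks below are only the ones blk_mq_init_queue() is seen to dereference above, and every my_* name is invented for illustration:

    #include <linux/blk-mq.h>
    #include <linux/blkdev.h>

    struct my_cmd {                         /* per-request driver payload (pdu) */
            int status;
    };

    static int my_drv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
    {
            struct my_cmd *cmd = blk_mq_rq_to_pdu(rq); /* valid since cmd_size != 0 */

            /* hand the request to the hardware here ... */
            cmd->status = 0;
            return BLK_MQ_RQ_QUEUE_OK;      /* or _BUSY / _ERROR, as handled above */
    }

    static struct blk_mq_ops my_mq_ops = {
            .queue_rq   = my_drv_queue_rq,
            .map_queue  = blk_mq_map_queue,            /* default per-CPU mapping */
            .alloc_hctx = blk_mq_alloc_single_hw_queue,
            .free_hctx  = blk_mq_free_single_hw_queue,
    };

    static struct blk_mq_reg my_mq_reg = {
            .ops          = &my_mq_ops,
            .nr_hw_queues = 1,
            .queue_depth  = 64,     /* the core adds its own reserved flush tag */
            .cmd_size     = sizeof(struct my_cmd),
            .numa_node    = NUMA_NO_NODE,
            .flags        = BLK_MQ_F_SHOULD_MERGE,
    };

    static int my_probe(void *driver_data)
    {
            struct request_queue *q = blk_mq_init_queue(&my_mq_reg, driver_data);

            if (IS_ERR(q))
                    return PTR_ERR(q);

            /* attach q to a gendisk; blk_register_queue() then calls
             * blk_mq_register_disk(), see the blk-sysfs.c hunk below */
            return 0;
    }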
diff --git a/block/blk-mq.h b/block/blk-mq.h
new file mode 100644
index 000000000000..52bf1f96a2c2
--- /dev/null
+++ b/block/blk-mq.h
@@ -0,0 +1,52 @@
1#ifndef INT_BLK_MQ_H
2#define INT_BLK_MQ_H
3
4struct blk_mq_ctx {
5 struct {
6 spinlock_t lock;
7 struct list_head rq_list;
8 } ____cacheline_aligned_in_smp;
9
10 unsigned int cpu;
11 unsigned int index_hw;
12 unsigned int ipi_redirect;
13
14 /* incremented at dispatch time */
15 unsigned long rq_dispatched[2];
16 unsigned long rq_merged;
17
18 /* incremented at completion time */
19 unsigned long ____cacheline_aligned_in_smp rq_completed[2];
20
21 struct request_queue *queue;
22 struct kobject kobj;
23};
24
25void __blk_mq_end_io(struct request *rq, int error);
26void blk_mq_complete_request(struct request *rq, int error);
27void blk_mq_run_request(struct request *rq, bool run_queue, bool async);
28void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
29void blk_mq_init_flush(struct request_queue *q);
30
31/*
32 * CPU hotplug helpers
33 */
34struct blk_mq_cpu_notifier;
35void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
36 void (*fn)(void *, unsigned long, unsigned int),
37 void *data);
38void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
39void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
40void blk_mq_cpu_init(void);
41DECLARE_PER_CPU(struct llist_head, ipi_lists);
42
43/*
44 * CPU -> queue mappings
45 */
46struct blk_mq_reg;
47extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
48extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
49
50void blk_mq_add_timer(struct request *rq);
51
52#endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 026c1517505f..05e826793e4e 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -144,6 +144,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
144 lim->discard_zeroes_data = 1; 144 lim->discard_zeroes_data = 1;
145 lim->max_segments = USHRT_MAX; 145 lim->max_segments = USHRT_MAX;
146 lim->max_hw_sectors = UINT_MAX; 146 lim->max_hw_sectors = UINT_MAX;
147 lim->max_segment_size = UINT_MAX;
147 lim->max_sectors = UINT_MAX; 148 lim->max_sectors = UINT_MAX;
148 lim->max_write_same_sectors = UINT_MAX; 149 lim->max_write_same_sectors = UINT_MAX;
149} 150}
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ec9e60636f43..ce4b8bfd3d27 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -23,7 +23,7 @@ static void blk_done_softirq(struct softirq_action *h)
23 struct list_head *cpu_list, local_list; 23 struct list_head *cpu_list, local_list;
24 24
25 local_irq_disable(); 25 local_irq_disable();
26 cpu_list = &__get_cpu_var(blk_cpu_done); 26 cpu_list = this_cpu_ptr(&blk_cpu_done);
27 list_replace_init(cpu_list, &local_list); 27 list_replace_init(cpu_list, &local_list);
28 local_irq_enable(); 28 local_irq_enable();
29 29
@@ -44,7 +44,7 @@ static void trigger_softirq(void *data)
44 struct list_head *list; 44 struct list_head *list;
45 45
46 local_irq_save(flags); 46 local_irq_save(flags);
47 list = &__get_cpu_var(blk_cpu_done); 47 list = this_cpu_ptr(&blk_cpu_done);
48 list_add_tail(&rq->csd.list, list); 48 list_add_tail(&rq->csd.list, list);
49 49
50 if (list->next == &rq->csd.list) 50 if (list->next == &rq->csd.list)
@@ -90,7 +90,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
90 90
91 local_irq_disable(); 91 local_irq_disable();
92 list_splice_init(&per_cpu(blk_cpu_done, cpu), 92 list_splice_init(&per_cpu(blk_cpu_done, cpu),
93 &__get_cpu_var(blk_cpu_done)); 93 this_cpu_ptr(&blk_cpu_done));
94 raise_softirq_irqoff(BLOCK_SOFTIRQ); 94 raise_softirq_irqoff(BLOCK_SOFTIRQ);
95 local_irq_enable(); 95 local_irq_enable();
96 } 96 }
@@ -135,7 +135,7 @@ void __blk_complete_request(struct request *req)
135 if (ccpu == cpu || shared) { 135 if (ccpu == cpu || shared) {
136 struct list_head *list; 136 struct list_head *list;
137do_local: 137do_local:
138 list = &__get_cpu_var(blk_cpu_done); 138 list = this_cpu_ptr(&blk_cpu_done);
139 list_add_tail(&req->csd.list, list); 139 list_add_tail(&req->csd.list, list);
140 140
141 /* 141 /*
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3aa5b195f4dd..4f8c4d90ec73 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -7,6 +7,7 @@
7#include <linux/bio.h> 7#include <linux/bio.h>
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/blktrace_api.h> 9#include <linux/blktrace_api.h>
10#include <linux/blk-mq.h>
10 11
11#include "blk.h" 12#include "blk.h"
12#include "blk-cgroup.h" 13#include "blk-cgroup.h"
@@ -542,6 +543,11 @@ static void blk_release_queue(struct kobject *kobj)
542 if (q->queue_tags) 543 if (q->queue_tags)
543 __blk_queue_free_tags(q); 544 __blk_queue_free_tags(q);
544 545
546 percpu_counter_destroy(&q->mq_usage_counter);
547
548 if (q->mq_ops)
549 blk_mq_free_queue(q);
550
545 blk_trace_shutdown(q); 551 blk_trace_shutdown(q);
546 552
547 bdi_destroy(&q->backing_dev_info); 553 bdi_destroy(&q->backing_dev_info);
@@ -575,6 +581,7 @@ int blk_register_queue(struct gendisk *disk)
575 * bypass from queue allocation. 581 * bypass from queue allocation.
576 */ 582 */
577 blk_queue_bypass_end(q); 583 blk_queue_bypass_end(q);
584 queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
578 585
579 ret = blk_trace_init_sysfs(dev); 586 ret = blk_trace_init_sysfs(dev);
580 if (ret) 587 if (ret)
@@ -588,6 +595,9 @@ int blk_register_queue(struct gendisk *disk)
588 595
589 kobject_uevent(&q->kobj, KOBJ_ADD); 596 kobject_uevent(&q->kobj, KOBJ_ADD);
590 597
598 if (q->mq_ops)
599 blk_mq_register_disk(disk);
600
591 if (!q->request_fn) 601 if (!q->request_fn)
592 return 0; 602 return 0;
593 603
@@ -610,6 +620,9 @@ void blk_unregister_queue(struct gendisk *disk)
610 if (WARN_ON(!q)) 620 if (WARN_ON(!q))
611 return; 621 return;
612 622
623 if (q->mq_ops)
624 blk_mq_unregister_disk(disk);
625
613 if (q->request_fn) 626 if (q->request_fn)
614 elv_unregister_queue(q); 627 elv_unregister_queue(q);
615 628
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 65f103563969..bba81c9348e1 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -7,6 +7,7 @@
7#include <linux/fault-inject.h> 7#include <linux/fault-inject.h>
8 8
9#include "blk.h" 9#include "blk.h"
10#include "blk-mq.h"
10 11
11#ifdef CONFIG_FAIL_IO_TIMEOUT 12#ifdef CONFIG_FAIL_IO_TIMEOUT
12 13
@@ -31,7 +32,7 @@ static int __init fail_io_timeout_debugfs(void)
31 struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", 32 struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",
32 NULL, &fail_io_timeout); 33 NULL, &fail_io_timeout);
33 34
34 return IS_ERR(dir) ? PTR_ERR(dir) : 0; 35 return PTR_ERR_OR_ZERO(dir);
35} 36}
36 37
37late_initcall(fail_io_timeout_debugfs); 38late_initcall(fail_io_timeout_debugfs);
@@ -88,11 +89,19 @@ static void blk_rq_timed_out(struct request *req)
88 ret = q->rq_timed_out_fn(req); 89 ret = q->rq_timed_out_fn(req);
89 switch (ret) { 90 switch (ret) {
90 case BLK_EH_HANDLED: 91 case BLK_EH_HANDLED:
91 __blk_complete_request(req); 92 /* Can we use req->errors here? */
93 if (q->mq_ops)
94 blk_mq_complete_request(req, req->errors);
95 else
96 __blk_complete_request(req);
92 break; 97 break;
93 case BLK_EH_RESET_TIMER: 98 case BLK_EH_RESET_TIMER:
99 if (q->mq_ops)
100 blk_mq_add_timer(req);
101 else
102 blk_add_timer(req);
103
94 blk_clear_rq_complete(req); 104 blk_clear_rq_complete(req);
95 blk_add_timer(req);
96 break; 105 break;
97 case BLK_EH_NOT_HANDLED: 106 case BLK_EH_NOT_HANDLED:
98 /* 107 /*
@@ -108,6 +117,23 @@ static void blk_rq_timed_out(struct request *req)
108 } 117 }
109} 118}
110 119
120void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
121 unsigned int *next_set)
122{
123 if (time_after_eq(jiffies, rq->deadline)) {
124 list_del_init(&rq->timeout_list);
125
126 /*
127 * Check if we raced with end io completion
128 */
129 if (!blk_mark_rq_complete(rq))
130 blk_rq_timed_out(rq);
131 } else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
132 *next_timeout = rq->deadline;
133 *next_set = 1;
134 }
135}
136
111void blk_rq_timed_out_timer(unsigned long data) 137void blk_rq_timed_out_timer(unsigned long data)
112{ 138{
113 struct request_queue *q = (struct request_queue *) data; 139 struct request_queue *q = (struct request_queue *) data;
@@ -117,21 +143,8 @@ void blk_rq_timed_out_timer(unsigned long data)
117 143
118 spin_lock_irqsave(q->queue_lock, flags); 144 spin_lock_irqsave(q->queue_lock, flags);
119 145
120 list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) { 146 list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
121 if (time_after_eq(jiffies, rq->deadline)) { 147 blk_rq_check_expired(rq, &next, &next_set);
122 list_del_init(&rq->timeout_list);
123
124 /*
125 * Check if we raced with end io completion
126 */
127 if (blk_mark_rq_complete(rq))
128 continue;
129 blk_rq_timed_out(rq);
130 } else if (!next_set || time_after(next, rq->deadline)) {
131 next = rq->deadline;
132 next_set = 1;
133 }
134 }
135 148
136 if (next_set) 149 if (next_set)
137 mod_timer(&q->timeout, round_jiffies_up(next)); 150 mod_timer(&q->timeout, round_jiffies_up(next));
@@ -157,15 +170,7 @@ void blk_abort_request(struct request *req)
157} 170}
158EXPORT_SYMBOL_GPL(blk_abort_request); 171EXPORT_SYMBOL_GPL(blk_abort_request);
159 172
160/** 173void __blk_add_timer(struct request *req, struct list_head *timeout_list)
161 * blk_add_timer - Start timeout timer for a single request
162 * @req: request that is about to start running.
163 *
164 * Notes:
165 * Each request has its own timer, and as it is added to the queue, we
166 * set up the timer. When the request completes, we cancel the timer.
167 */
168void blk_add_timer(struct request *req)
169{ 174{
170 struct request_queue *q = req->q; 175 struct request_queue *q = req->q;
171 unsigned long expiry; 176 unsigned long expiry;
@@ -174,7 +179,6 @@ void blk_add_timer(struct request *req)
174 return; 179 return;
175 180
176 BUG_ON(!list_empty(&req->timeout_list)); 181 BUG_ON(!list_empty(&req->timeout_list));
177 BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
178 182
179 /* 183 /*
180 * Some LLDs, like scsi, peek at the timeout to prevent a 184 * Some LLDs, like scsi, peek at the timeout to prevent a
@@ -184,7 +188,8 @@ void blk_add_timer(struct request *req)
184 req->timeout = q->rq_timeout; 188 req->timeout = q->rq_timeout;
185 189
186 req->deadline = jiffies + req->timeout; 190 req->deadline = jiffies + req->timeout;
187 list_add_tail(&req->timeout_list, &q->timeout_list); 191 if (timeout_list)
192 list_add_tail(&req->timeout_list, timeout_list);
188 193
189 /* 194 /*
190 * If the timer isn't already pending or this timeout is earlier 195 * If the timer isn't already pending or this timeout is earlier
@@ -196,5 +201,19 @@ void blk_add_timer(struct request *req)
196 if (!timer_pending(&q->timeout) || 201 if (!timer_pending(&q->timeout) ||
197 time_before(expiry, q->timeout.expires)) 202 time_before(expiry, q->timeout.expires))
198 mod_timer(&q->timeout, expiry); 203 mod_timer(&q->timeout, expiry);
204
205}
206
207/**
208 * blk_add_timer - Start timeout timer for a single request
209 * @req: request that is about to start running.
210 *
211 * Notes:
212 * Each request has its own timer, and as it is added to the queue, we
213 * set up the timer. When the request completes, we cancel the timer.
214 */
215void blk_add_timer(struct request *req)
216{
217 __blk_add_timer(req, &req->q->timeout_list);
199} 218}
200 219
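
The hunk above splits the per-request deadline test out into blk_rq_check_expired() so the legacy timer loop (and, later, blk-mq's own timeout scan) can share it, and __blk_add_timer() now takes the timeout list as a parameter for the same reason. The deadline test leans on the kernel's wrap-safe time_after_eq(); the small standalone sketch below (plain userspace C, names chosen for illustration) shows why the signed-subtraction form is used instead of a naive comparison.

/* Minimal userspace sketch of the wrap-safe deadline test used above.
 * time_after_eq(a, b) is built on signed subtraction so it keeps working
 * when the jiffies counter wraps around.
 */
#include <stdio.h>

typedef unsigned long jiffies_t;

static int time_after_eq(jiffies_t a, jiffies_t b)
{
	return (long)(a - b) >= 0;	/* signed difference survives wrap */
}

int main(void)
{
	jiffies_t deadline = (jiffies_t)-5;	/* just before the counter wraps */
	jiffies_t now = 3;			/* shortly after the wrap */

	/* A naive "now >= deadline" claims the request has not expired. */
	printf("naive:     %d\n", now >= deadline);
	/* The wrap-safe test correctly reports that the deadline passed. */
	printf("wrap-safe: %d\n", time_after_eq(now, deadline));
	return 0;
}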
diff --git a/block/blk.h b/block/blk.h
index e837b8f619b7..c90e1d8f7a2b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -10,6 +10,7 @@
10#define BLK_BATCH_REQ 32 10#define BLK_BATCH_REQ 32
11 11
12extern struct kmem_cache *blk_requestq_cachep; 12extern struct kmem_cache *blk_requestq_cachep;
13extern struct kmem_cache *request_cachep;
13extern struct kobj_type blk_queue_ktype; 14extern struct kobj_type blk_queue_ktype;
14extern struct ida blk_queue_ida; 15extern struct ida blk_queue_ida;
15 16
@@ -34,14 +35,30 @@ bool __blk_end_bidi_request(struct request *rq, int error,
34 unsigned int nr_bytes, unsigned int bidi_bytes); 35 unsigned int nr_bytes, unsigned int bidi_bytes);
35 36
36void blk_rq_timed_out_timer(unsigned long data); 37void blk_rq_timed_out_timer(unsigned long data);
38void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
39 unsigned int *next_set);
40void __blk_add_timer(struct request *req, struct list_head *timeout_list);
37void blk_delete_timer(struct request *); 41void blk_delete_timer(struct request *);
38void blk_add_timer(struct request *); 42void blk_add_timer(struct request *);
39 43
44
45bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
46 struct bio *bio);
47bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
48 struct bio *bio);
49bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
50 unsigned int *request_count);
51
52void blk_account_io_start(struct request *req, bool new_io);
53void blk_account_io_completion(struct request *req, unsigned int bytes);
54void blk_account_io_done(struct request *req);
55
40/* 56/*
41 * Internal atomic flags for request handling 57 * Internal atomic flags for request handling
42 */ 58 */
43enum rq_atomic_flags { 59enum rq_atomic_flags {
44 REQ_ATOM_COMPLETE = 0, 60 REQ_ATOM_COMPLETE = 0,
61 REQ_ATOM_STARTED,
45}; 62};
46 63
47/* 64/*
diff --git a/block/elevator.c b/block/elevator.c
index 2bcbd8cc14d4..b7ff2861b6bd 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -186,6 +186,12 @@ int elevator_init(struct request_queue *q, char *name)
186 struct elevator_type *e = NULL; 186 struct elevator_type *e = NULL;
187 int err; 187 int err;
188 188
189 /*
190 * q->sysfs_lock must be held to provide mutual exclusion between
191 * elevator_switch() and here.
192 */
193 lockdep_assert_held(&q->sysfs_lock);
194
189 if (unlikely(q->elevator)) 195 if (unlikely(q->elevator))
190 return 0; 196 return 0;
191 197
@@ -959,7 +965,7 @@ fail_init:
959/* 965/*
960 * Switch this queue to the given IO scheduler. 966 * Switch this queue to the given IO scheduler.
961 */ 967 */
962int elevator_change(struct request_queue *q, const char *name) 968static int __elevator_change(struct request_queue *q, const char *name)
963{ 969{
964 char elevator_name[ELV_NAME_MAX]; 970 char elevator_name[ELV_NAME_MAX];
965 struct elevator_type *e; 971 struct elevator_type *e;
@@ -981,6 +987,18 @@ int elevator_change(struct request_queue *q, const char *name)
981 987
982 return elevator_switch(q, e); 988 return elevator_switch(q, e);
983} 989}
990
991int elevator_change(struct request_queue *q, const char *name)
992{
993 int ret;
994
995 /* Protect q->elevator from elevator_init() */
996 mutex_lock(&q->sysfs_lock);
997 ret = __elevator_change(q, name);
998 mutex_unlock(&q->sysfs_lock);
999
1000 return ret;
1001}
984EXPORT_SYMBOL(elevator_change); 1002EXPORT_SYMBOL(elevator_change);
985 1003
986ssize_t elv_iosched_store(struct request_queue *q, const char *name, 1004ssize_t elv_iosched_store(struct request_queue *q, const char *name,
@@ -991,7 +1009,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
991 if (!q->elevator) 1009 if (!q->elevator)
992 return count; 1010 return count;
993 1011
994 ret = elevator_change(q, name); 1012 ret = __elevator_change(q, name);
995 if (!ret) 1013 if (!ret)
996 return count; 1014 return count;
997 1015
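
The elevator switch now funnels through a locked wrapper: the exported elevator_change() takes q->sysfs_lock around the new internal __elevator_change(), while elv_iosched_store(), which already runs under that lock, calls the internal helper directly. A minimal userspace sketch of this locked-wrapper pattern follows, with a pthread mutex standing in for q->sysfs_lock; all names are illustrative, not kernel code.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t sysfs_lock = PTHREAD_MUTEX_INITIALIZER;
static char current_sched[16] = "deadline";

/* Caller must hold sysfs_lock (mirrors __elevator_change()). */
static int __change_scheduler(const char *name)
{
	if (strlen(name) >= sizeof(current_sched))
		return -1;
	strcpy(current_sched, name);
	return 0;
}

/* External entry point (mirrors elevator_change()): takes the lock itself. */
static int change_scheduler(const char *name)
{
	int ret;

	pthread_mutex_lock(&sysfs_lock);
	ret = __change_scheduler(name);
	pthread_mutex_unlock(&sysfs_lock);
	return ret;
}

/* The sysfs store path already holds the lock, so it uses the __ variant. */
static int store_scheduler(const char *name)
{
	int ret;

	pthread_mutex_lock(&sysfs_lock);	/* stand-in for sysfs taking q->sysfs_lock */
	ret = __change_scheduler(name);
	pthread_mutex_unlock(&sysfs_lock);
	return ret;
}

int main(void)
{
	change_scheduler("cfq");
	store_scheduler("noop");
	printf("scheduler: %s\n", current_sched);
	return 0;
}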
diff --git a/block/ioctl.c b/block/ioctl.c
index a31d91d9bc5a..7d5c3b20af45 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -64,7 +64,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
64 part = add_partition(disk, partno, start, length, 64 part = add_partition(disk, partno, start, length,
65 ADDPART_FLAG_NONE, NULL); 65 ADDPART_FLAG_NONE, NULL);
66 mutex_unlock(&bdev->bd_mutex); 66 mutex_unlock(&bdev->bd_mutex);
67 return IS_ERR(part) ? PTR_ERR(part) : 0; 67 return PTR_ERR_OR_ZERO(part);
68 case BLKPG_DEL_PARTITION: 68 case BLKPG_DEL_PARTITION:
69 part = disk_get_part(disk, partno); 69 part = disk_get_part(disk, partno);
70 if (!part) 70 if (!part)
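
PTR_ERR_OR_ZERO() is just the open-coded "IS_ERR(p) ? PTR_ERR(p) : 0" folded into one helper, so the conversion above is purely cosmetic. For illustration only, a userspace re-creation of the ERR_PTR convention (the MAX_ERRNO value mirrors the kernel's):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)    { return (void *)error; }
static inline long  PTR_ERR(const void *p) { return (long)p; }
static inline int   IS_ERR(const void *p)
{
	/* error codes live in the top MAX_ERRNO addresses */
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

/* What the patch replaces "IS_ERR(p) ? PTR_ERR(p) : 0" with. */
static inline long PTR_ERR_OR_ZERO(const void *p)
{
	return IS_ERR(p) ? PTR_ERR(p) : 0;
}

int main(void)
{
	int value = 0;
	void *ok  = &value;
	void *bad = ERR_PTR(-ENOMEM);

	printf("ok:  %ld\n", PTR_ERR_OR_ZERO(ok));	/* 0   */
	printf("bad: %ld\n", PTR_ERR_OR_ZERO(bad));	/* -12 */
	return 0;
}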
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index a5ffcc988f0b..625e3e471d65 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -286,7 +286,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
286 struct sg_io_hdr *hdr, fmode_t mode) 286 struct sg_io_hdr *hdr, fmode_t mode)
287{ 287{
288 unsigned long start_time; 288 unsigned long start_time;
289 int writing = 0, ret = 0; 289 ssize_t ret = 0;
290 int writing = 0;
290 struct request *rq; 291 struct request *rq;
291 char sense[SCSI_SENSE_BUFFERSIZE]; 292 char sense[SCSI_SENSE_BUFFERSIZE];
292 struct bio *bio; 293 struct bio *bio;
@@ -321,37 +322,16 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
321 } 322 }
322 323
323 if (hdr->iovec_count) { 324 if (hdr->iovec_count) {
324 const int size = sizeof(struct sg_iovec) * hdr->iovec_count;
325 size_t iov_data_len; 325 size_t iov_data_len;
326 struct sg_iovec *sg_iov;
327 struct iovec *iov; 326 struct iovec *iov;
328 int i;
329 327
330 sg_iov = kmalloc(size, GFP_KERNEL); 328 ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count,
331 if (!sg_iov) { 329 0, NULL, &iov);
332 ret = -ENOMEM; 330 if (ret < 0)
333 goto out; 331 goto out;
334 }
335
336 if (copy_from_user(sg_iov, hdr->dxferp, size)) {
337 kfree(sg_iov);
338 ret = -EFAULT;
339 goto out;
340 }
341 332
342 /* 333 iov_data_len = ret;
343 * Sum up the vecs, making sure they don't overflow 334 ret = 0;
344 */
345 iov = (struct iovec *) sg_iov;
346 iov_data_len = 0;
347 for (i = 0; i < hdr->iovec_count; i++) {
348 if (iov_data_len + iov[i].iov_len < iov_data_len) {
349 kfree(sg_iov);
350 ret = -EINVAL;
351 goto out;
352 }
353 iov_data_len += iov[i].iov_len;
354 }
355 335
356 /* SG_IO howto says that the shorter of the two wins */ 336 /* SG_IO howto says that the shorter of the two wins */
357 if (hdr->dxfer_len < iov_data_len) { 337 if (hdr->dxfer_len < iov_data_len) {
@@ -361,9 +341,10 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
361 iov_data_len = hdr->dxfer_len; 341 iov_data_len = hdr->dxfer_len;
362 } 342 }
363 343
364 ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count, 344 ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov,
345 hdr->iovec_count,
365 iov_data_len, GFP_KERNEL); 346 iov_data_len, GFP_KERNEL);
366 kfree(sg_iov); 347 kfree(iov);
367 } else if (hdr->dxfer_len) 348 } else if (hdr->dxfer_len)
368 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, 349 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
369 GFP_KERNEL); 350 GFP_KERNEL);
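
The removed loop summed the user iovec lengths by hand and rejected wraparound; rw_copy_check_uvector() now performs that validation (along with the access checks) in one place and returns the total length. A standalone userspace sketch of the overflow-safe summation the old code performed, for illustration:

#include <stdio.h>
#include <stddef.h>
#include <sys/uio.h>

/* Returns the total length, or -1 if the sum would wrap around. */
static ssize_t sum_iov_lengths(const struct iovec *iov, int count)
{
	size_t total = 0;
	int i;

	for (i = 0; i < count; i++) {
		if (total + iov[i].iov_len < total)	/* unsigned wraparound */
			return -1;
		total += iov[i].iov_len;
	}
	return total;
}

int main(void)
{
	char a[16], b[32];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};

	printf("total: %zd\n", sum_iov_lengths(iov, 2));	/* 48 */
	return 0;
}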
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index e67fa16e1938..5902bd006a9c 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -15,6 +15,9 @@ menuconfig BLK_DEV
15 15
16if BLK_DEV 16if BLK_DEV
17 17
18config BLK_DEV_NULL_BLK
19 tristate "Null test block driver"
20
18config BLK_DEV_FD 21config BLK_DEV_FD
19 tristate "Normal floppy disk support" 22 tristate "Normal floppy disk support"
20 depends on ARCH_MAY_HAVE_PC_FDC 23 depends on ARCH_MAY_HAVE_PC_FDC
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index ca07399a8d99..03b3b4a2bd8a 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
41obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ 41obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
42 42
43obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ 43obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
44obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
44 45
45nvme-y := nvme-core.o nvme-scsi.o 46nvme-y := nvme-core.o nvme-scsi.o
46swim_mod-y := swim.o swim_asm.o 47swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 9bf4371755f2..d91f1a56e861 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -545,7 +545,7 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
545 545
546 mutex_lock(&brd_devices_mutex); 546 mutex_lock(&brd_devices_mutex);
547 brd = brd_init_one(MINOR(dev) >> part_shift); 547 brd = brd_init_one(MINOR(dev) >> part_shift);
548 kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM); 548 kobj = brd ? get_disk(brd->brd_disk) : NULL;
549 mutex_unlock(&brd_devices_mutex); 549 mutex_unlock(&brd_devices_mutex);
550 550
551 *part = 0; 551 *part = 0;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 04ceb7e2fadd..000abe2f105c 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2886,9 +2886,9 @@ static void do_fd_request(struct request_queue *q)
2886 return; 2886 return;
2887 2887
2888 if (WARN(atomic_read(&usage_count) == 0, 2888 if (WARN(atomic_read(&usage_count) == 0,
2889 "warning: usage count=0, current_req=%p sect=%ld type=%x flags=%x\n", 2889 "warning: usage count=0, current_req=%p sect=%ld type=%x flags=%llx\n",
2890 current_req, (long)blk_rq_pos(current_req), current_req->cmd_type, 2890 current_req, (long)blk_rq_pos(current_req), current_req->cmd_type,
2891 current_req->cmd_flags)) 2891 (unsigned long long) current_req->cmd_flags))
2892 return; 2892 return;
2893 2893
2894 if (test_and_set_bit(0, &fdc_busy)) { 2894 if (test_and_set_bit(0, &fdc_busy)) {
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 40e715531aa6..dbdb88a4976c 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1633,7 +1633,7 @@ static int loop_add(struct loop_device **l, int i)
1633 err = -ENOMEM; 1633 err = -ENOMEM;
1634 lo->lo_queue = blk_alloc_queue(GFP_KERNEL); 1634 lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
1635 if (!lo->lo_queue) 1635 if (!lo->lo_queue)
1636 goto out_free_dev; 1636 goto out_free_idr;
1637 1637
1638 disk = lo->lo_disk = alloc_disk(1 << part_shift); 1638 disk = lo->lo_disk = alloc_disk(1 << part_shift);
1639 if (!disk) 1639 if (!disk)
@@ -1678,6 +1678,8 @@ static int loop_add(struct loop_device **l, int i)
1678 1678
1679out_free_queue: 1679out_free_queue:
1680 blk_cleanup_queue(lo->lo_queue); 1680 blk_cleanup_queue(lo->lo_queue);
1681out_free_idr:
1682 idr_remove(&loop_index_idr, i);
1681out_free_dev: 1683out_free_dev:
1682 kfree(lo); 1684 kfree(lo);
1683out: 1685out:
@@ -1741,7 +1743,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
1741 if (err < 0) 1743 if (err < 0)
1742 err = loop_add(&lo, MINOR(dev) >> part_shift); 1744 err = loop_add(&lo, MINOR(dev) >> part_shift);
1743 if (err < 0) 1745 if (err < 0)
1744 kobj = ERR_PTR(err); 1746 kobj = NULL;
1745 else 1747 else
1746 kobj = get_disk(lo->lo_disk); 1748 kobj = get_disk(lo->lo_disk);
1747 mutex_unlock(&loop_index_mutex); 1749 mutex_unlock(&loop_index_mutex);
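
loop_add() gains an out_free_idr label so a failure after the idr slot has been taken (here, blk_alloc_queue() failing) also releases that slot. A generic userspace sketch of the goto-unwind ladder this completes, with malloc() and a stub standing in for the kernel allocations; all names are made up:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static void *alloc_disk_stub(void)
{
	return NULL;			/* force the error path for the demo */
}

static int add_device(void)
{
	void *slot, *queue, *disk;
	int err = -ENOMEM;

	slot = malloc(16);		/* stands in for the idr slot */
	if (!slot)
		goto out;
	queue = malloc(16);		/* stands in for blk_alloc_queue() */
	if (!queue)
		goto out_free_slot;
	disk = alloc_disk_stub();	/* stands in for alloc_disk() */
	if (!disk)
		goto out_free_queue;
	return 0;

out_free_queue:
	free(queue);
out_free_slot:
	free(slot);			/* the step the loop patch adds */
out:
	return err;
}

int main(void)
{
	printf("add_device: %d\n", add_device());	/* -12 */
	return 0;
}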
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
new file mode 100644
index 000000000000..b5d842370cc9
--- /dev/null
+++ b/drivers/block/null_blk.c
@@ -0,0 +1,635 @@
1#include <linux/module.h>
2#include <linux/moduleparam.h>
3#include <linux/sched.h>
4#include <linux/fs.h>
5#include <linux/blkdev.h>
6#include <linux/init.h>
7#include <linux/slab.h>
8#include <linux/blk-mq.h>
9#include <linux/hrtimer.h>
10
11struct nullb_cmd {
12 struct list_head list;
13 struct llist_node ll_list;
14 struct call_single_data csd;
15 struct request *rq;
16 struct bio *bio;
17 unsigned int tag;
18 struct nullb_queue *nq;
19};
20
21struct nullb_queue {
22 unsigned long *tag_map;
23 wait_queue_head_t wait;
24 unsigned int queue_depth;
25
26 struct nullb_cmd *cmds;
27};
28
29struct nullb {
30 struct list_head list;
31 unsigned int index;
32 struct request_queue *q;
33 struct gendisk *disk;
34 struct hrtimer timer;
35 unsigned int queue_depth;
36 spinlock_t lock;
37
38 struct nullb_queue *queues;
39 unsigned int nr_queues;
40};
41
42static LIST_HEAD(nullb_list);
43static struct mutex lock;
44static int null_major;
45static int nullb_indexes;
46
47struct completion_queue {
48 struct llist_head list;
49 struct hrtimer timer;
50};
51
52/*
53 * These are per-cpu for now, they will need to be configured by the
54 * complete_queues parameter and appropriately mapped.
55 */
56static DEFINE_PER_CPU(struct completion_queue, completion_queues);
57
58enum {
59 NULL_IRQ_NONE = 0,
60 NULL_IRQ_SOFTIRQ = 1,
61 NULL_IRQ_TIMER = 2,
62
63 NULL_Q_BIO = 0,
64 NULL_Q_RQ = 1,
65 NULL_Q_MQ = 2,
66};
67
68static int submit_queues = 1;
69module_param(submit_queues, int, S_IRUGO);
70MODULE_PARM_DESC(submit_queues, "Number of submission queues");
71
72static int home_node = NUMA_NO_NODE;
73module_param(home_node, int, S_IRUGO);
74MODULE_PARM_DESC(home_node, "Home node for the device");
75
76static int queue_mode = NULL_Q_MQ;
77module_param(queue_mode, int, S_IRUGO);
78MODULE_PARM_DESC(queue_mode, "Use blk-mq interface (0=bio,1=rq,2=multiqueue)");
79
80static int gb = 250;
81module_param(gb, int, S_IRUGO);
82MODULE_PARM_DESC(gb, "Size in GB");
83
84static int bs = 512;
85module_param(bs, int, S_IRUGO);
86MODULE_PARM_DESC(bs, "Block size (in bytes)");
87
88static int nr_devices = 2;
89module_param(nr_devices, int, S_IRUGO);
90MODULE_PARM_DESC(nr_devices, "Number of devices to register");
91
92static int irqmode = NULL_IRQ_SOFTIRQ;
93module_param(irqmode, int, S_IRUGO);
94MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
95
96static int completion_nsec = 10000;
97module_param(completion_nsec, int, S_IRUGO);
98MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
99
100static int hw_queue_depth = 64;
101module_param(hw_queue_depth, int, S_IRUGO);
102MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
103
104static bool use_per_node_hctx = true;
105module_param(use_per_node_hctx, bool, S_IRUGO);
106MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: true");
107
108static void put_tag(struct nullb_queue *nq, unsigned int tag)
109{
110 clear_bit_unlock(tag, nq->tag_map);
111
112 if (waitqueue_active(&nq->wait))
113 wake_up(&nq->wait);
114}
115
116static unsigned int get_tag(struct nullb_queue *nq)
117{
118 unsigned int tag;
119
120 do {
121 tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
122 if (tag >= nq->queue_depth)
123 return -1U;
124 } while (test_and_set_bit_lock(tag, nq->tag_map));
125
126 return tag;
127}
128
129static void free_cmd(struct nullb_cmd *cmd)
130{
131 put_tag(cmd->nq, cmd->tag);
132}
133
134static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
135{
136 struct nullb_cmd *cmd;
137 unsigned int tag;
138
139 tag = get_tag(nq);
140 if (tag != -1U) {
141 cmd = &nq->cmds[tag];
142 cmd->tag = tag;
143 cmd->nq = nq;
144 return cmd;
145 }
146
147 return NULL;
148}
149
150static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
151{
152 struct nullb_cmd *cmd;
153 DEFINE_WAIT(wait);
154
155 cmd = __alloc_cmd(nq);
156 if (cmd || !can_wait)
157 return cmd;
158
159 do {
160 prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
161 cmd = __alloc_cmd(nq);
162 if (cmd)
163 break;
164
165 io_schedule();
166 } while (1);
167
168 finish_wait(&nq->wait, &wait);
169 return cmd;
170}
171
172static void end_cmd(struct nullb_cmd *cmd)
173{
174 if (cmd->rq) {
175 if (queue_mode == NULL_Q_MQ)
176 blk_mq_end_io(cmd->rq, 0);
177 else {
178 INIT_LIST_HEAD(&cmd->rq->queuelist);
179 blk_end_request_all(cmd->rq, 0);
180 }
181 } else if (cmd->bio)
182 bio_endio(cmd->bio, 0);
183
184 if (queue_mode != NULL_Q_MQ)
185 free_cmd(cmd);
186}
187
188static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
189{
190 struct completion_queue *cq;
191 struct llist_node *entry;
192 struct nullb_cmd *cmd;
193
194 cq = &per_cpu(completion_queues, smp_processor_id());
195
196 while ((entry = llist_del_all(&cq->list)) != NULL) {
197 do {
198 cmd = container_of(entry, struct nullb_cmd, ll_list);
199 end_cmd(cmd);
200 entry = entry->next;
201 } while (entry);
202 }
203
204 return HRTIMER_NORESTART;
205}
206
207static void null_cmd_end_timer(struct nullb_cmd *cmd)
208{
209 struct completion_queue *cq = &per_cpu(completion_queues, get_cpu());
210
211 cmd->ll_list.next = NULL;
212 if (llist_add(&cmd->ll_list, &cq->list)) {
213 ktime_t kt = ktime_set(0, completion_nsec);
214
215 hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL);
216 }
217
218 put_cpu();
219}
220
221static void null_softirq_done_fn(struct request *rq)
222{
223 blk_end_request_all(rq, 0);
224}
225
226#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
227
228static void null_ipi_cmd_end_io(void *data)
229{
230 struct completion_queue *cq;
231 struct llist_node *entry, *next;
232 struct nullb_cmd *cmd;
233
234 cq = &per_cpu(completion_queues, smp_processor_id());
235
236 entry = llist_del_all(&cq->list);
237
238 while (entry) {
239 next = entry->next;
240 cmd = llist_entry(entry, struct nullb_cmd, ll_list);
241 end_cmd(cmd);
242 entry = next;
243 }
244}
245
246static void null_cmd_end_ipi(struct nullb_cmd *cmd)
247{
248 struct call_single_data *data = &cmd->csd;
249 int cpu = get_cpu();
250 struct completion_queue *cq = &per_cpu(completion_queues, cpu);
251
252 cmd->ll_list.next = NULL;
253
254 if (llist_add(&cmd->ll_list, &cq->list)) {
255 data->func = null_ipi_cmd_end_io;
256 data->flags = 0;
257 __smp_call_function_single(cpu, data, 0);
258 }
259
260 put_cpu();
261}
262
263#endif /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
264
265static inline void null_handle_cmd(struct nullb_cmd *cmd)
266{
267 /* Complete IO by inline, softirq or timer */
268 switch (irqmode) {
269 case NULL_IRQ_NONE:
270 end_cmd(cmd);
271 break;
272 case NULL_IRQ_SOFTIRQ:
273#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
274 null_cmd_end_ipi(cmd);
275#else
276 end_cmd(cmd);
277#endif
278 break;
279 case NULL_IRQ_TIMER:
280 null_cmd_end_timer(cmd);
281 break;
282 }
283}
284
285static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
286{
287 int index = 0;
288
289 if (nullb->nr_queues != 1)
290 index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);
291
292 return &nullb->queues[index];
293}
294
295static void null_queue_bio(struct request_queue *q, struct bio *bio)
296{
297 struct nullb *nullb = q->queuedata;
298 struct nullb_queue *nq = nullb_to_queue(nullb);
299 struct nullb_cmd *cmd;
300
301 cmd = alloc_cmd(nq, 1);
302 cmd->bio = bio;
303
304 null_handle_cmd(cmd);
305}
306
307static int null_rq_prep_fn(struct request_queue *q, struct request *req)
308{
309 struct nullb *nullb = q->queuedata;
310 struct nullb_queue *nq = nullb_to_queue(nullb);
311 struct nullb_cmd *cmd;
312
313 cmd = alloc_cmd(nq, 0);
314 if (cmd) {
315 cmd->rq = req;
316 req->special = cmd;
317 return BLKPREP_OK;
318 }
319
320 return BLKPREP_DEFER;
321}
322
323static void null_request_fn(struct request_queue *q)
324{
325 struct request *rq;
326
327 while ((rq = blk_fetch_request(q)) != NULL) {
328 struct nullb_cmd *cmd = rq->special;
329
330 spin_unlock_irq(q->queue_lock);
331 null_handle_cmd(cmd);
332 spin_lock_irq(q->queue_lock);
333 }
334}
335
336static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
337{
338 struct nullb_cmd *cmd = rq->special;
339
340 cmd->rq = rq;
341 cmd->nq = hctx->driver_data;
342
343 null_handle_cmd(cmd);
344 return BLK_MQ_RQ_QUEUE_OK;
345}
346
347static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index)
348{
349 return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL,
350 hctx_index);
351}
352
353static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index)
354{
355 kfree(hctx);
356}
357
358static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
359 unsigned int index)
360{
361 struct nullb *nullb = data;
362 struct nullb_queue *nq = &nullb->queues[index];
363
364 init_waitqueue_head(&nq->wait);
365 nq->queue_depth = nullb->queue_depth;
366 nullb->nr_queues++;
367 hctx->driver_data = nq;
368
369 return 0;
370}
371
372static struct blk_mq_ops null_mq_ops = {
373 .queue_rq = null_queue_rq,
374 .map_queue = blk_mq_map_queue,
375 .init_hctx = null_init_hctx,
376};
377
378static struct blk_mq_reg null_mq_reg = {
379 .ops = &null_mq_ops,
380 .queue_depth = 64,
381 .cmd_size = sizeof(struct nullb_cmd),
382 .flags = BLK_MQ_F_SHOULD_MERGE,
383};
384
385static void null_del_dev(struct nullb *nullb)
386{
387 list_del_init(&nullb->list);
388
389 del_gendisk(nullb->disk);
390 if (queue_mode == NULL_Q_MQ)
391 blk_mq_free_queue(nullb->q);
392 else
393 blk_cleanup_queue(nullb->q);
394 put_disk(nullb->disk);
395 kfree(nullb);
396}
397
398static int null_open(struct block_device *bdev, fmode_t mode)
399{
400 return 0;
401}
402
403static void null_release(struct gendisk *disk, fmode_t mode)
404{
405}
406
407static const struct block_device_operations null_fops = {
408 .owner = THIS_MODULE,
409 .open = null_open,
410 .release = null_release,
411};
412
413static int setup_commands(struct nullb_queue *nq)
414{
415 struct nullb_cmd *cmd;
416 int i, tag_size;
417
418 nq->cmds = kzalloc(nq->queue_depth * sizeof(*cmd), GFP_KERNEL);
419 if (!nq->cmds)
420 return 1;
421
422 tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
423 nq->tag_map = kzalloc(tag_size * sizeof(unsigned long), GFP_KERNEL);
424 if (!nq->tag_map) {
425 kfree(nq->cmds);
426 return 1;
427 }
428
429 for (i = 0; i < nq->queue_depth; i++) {
430 cmd = &nq->cmds[i];
431 INIT_LIST_HEAD(&cmd->list);
432 cmd->ll_list.next = NULL;
433 cmd->tag = -1U;
434 }
435
436 return 0;
437}
438
439static void cleanup_queue(struct nullb_queue *nq)
440{
441 kfree(nq->tag_map);
442 kfree(nq->cmds);
443}
444
445static void cleanup_queues(struct nullb *nullb)
446{
447 int i;
448
449 for (i = 0; i < nullb->nr_queues; i++)
450 cleanup_queue(&nullb->queues[i]);
451
452 kfree(nullb->queues);
453}
454
455static int setup_queues(struct nullb *nullb)
456{
457 struct nullb_queue *nq;
458 int i;
459
460 nullb->queues = kzalloc(submit_queues * sizeof(*nq), GFP_KERNEL);
461 if (!nullb->queues)
462 return 1;
463
464 nullb->nr_queues = 0;
465 nullb->queue_depth = hw_queue_depth;
466
467 if (queue_mode == NULL_Q_MQ)
468 return 0;
469
470 for (i = 0; i < submit_queues; i++) {
471 nq = &nullb->queues[i];
472 init_waitqueue_head(&nq->wait);
473 nq->queue_depth = hw_queue_depth;
474 if (setup_commands(nq))
475 break;
476 nullb->nr_queues++;
477 }
478
479 if (i == submit_queues)
480 return 0;
481
482 cleanup_queues(nullb);
483 return 1;
484}
485
486static int null_add_dev(void)
487{
488 struct gendisk *disk;
489 struct nullb *nullb;
490 sector_t size;
491
492 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
493 if (!nullb)
494 return -ENOMEM;
495
496 spin_lock_init(&nullb->lock);
497
498 if (setup_queues(nullb))
499 goto err;
500
501 if (queue_mode == NULL_Q_MQ) {
502 null_mq_reg.numa_node = home_node;
503 null_mq_reg.queue_depth = hw_queue_depth;
504
505 if (use_per_node_hctx) {
506 null_mq_reg.ops->alloc_hctx = null_alloc_hctx;
507 null_mq_reg.ops->free_hctx = null_free_hctx;
508
509 null_mq_reg.nr_hw_queues = nr_online_nodes;
510 } else {
511 null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue;
512 null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue;
513
514 null_mq_reg.nr_hw_queues = submit_queues;
515 }
516
517 nullb->q = blk_mq_init_queue(&null_mq_reg, nullb);
518 } else if (queue_mode == NULL_Q_BIO) {
519 nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
520 blk_queue_make_request(nullb->q, null_queue_bio);
521 } else {
522 nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
523 blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
524 if (nullb->q)
525 blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
526 }
527
528 if (!nullb->q)
529 goto queue_fail;
530
531 nullb->q->queuedata = nullb;
532 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
533
534 disk = nullb->disk = alloc_disk_node(1, home_node);
535 if (!disk) {
536queue_fail:
537 if (queue_mode == NULL_Q_MQ)
538 blk_mq_free_queue(nullb->q);
539 else
540 blk_cleanup_queue(nullb->q);
541 cleanup_queues(nullb);
542err:
543 kfree(nullb);
544 return -ENOMEM;
545 }
546
547 mutex_lock(&lock);
548 list_add_tail(&nullb->list, &nullb_list);
549 nullb->index = nullb_indexes++;
550 mutex_unlock(&lock);
551
552 blk_queue_logical_block_size(nullb->q, bs);
553 blk_queue_physical_block_size(nullb->q, bs);
554
555 size = gb * 1024 * 1024 * 1024ULL;
556 sector_div(size, bs);
557 set_capacity(disk, size);
558
559 disk->flags |= GENHD_FL_EXT_DEVT;
560 disk->major = null_major;
561 disk->first_minor = nullb->index;
562 disk->fops = &null_fops;
563 disk->private_data = nullb;
564 disk->queue = nullb->q;
565 sprintf(disk->disk_name, "nullb%d", nullb->index);
566 add_disk(disk);
567 return 0;
568}
569
570static int __init null_init(void)
571{
572 unsigned int i;
573
574#if !defined(CONFIG_SMP) || !defined(CONFIG_USE_GENERIC_SMP_HELPERS)
575 if (irqmode == NULL_IRQ_SOFTIRQ) {
576 pr_warn("null_blk: softirq completions not available.\n");
577 pr_warn("null_blk: using direct completions.\n");
578 irqmode = NULL_IRQ_NONE;
579 }
580#endif
581
582 if (submit_queues > nr_cpu_ids)
583 submit_queues = nr_cpu_ids;
584 else if (!submit_queues)
585 submit_queues = 1;
586
587 mutex_init(&lock);
588
589 /* Initialize a separate list for each CPU for issuing softirqs */
590 for_each_possible_cpu(i) {
591 struct completion_queue *cq = &per_cpu(completion_queues, i);
592
593 init_llist_head(&cq->list);
594
595 if (irqmode != NULL_IRQ_TIMER)
596 continue;
597
598 hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
599 cq->timer.function = null_cmd_timer_expired;
600 }
601
602 null_major = register_blkdev(0, "nullb");
603 if (null_major < 0)
604 return null_major;
605
606 for (i = 0; i < nr_devices; i++) {
607 if (null_add_dev()) {
608 unregister_blkdev(null_major, "nullb");
609 return -EINVAL;
610 }
611 }
612
613 pr_info("null: module loaded\n");
614 return 0;
615}
616
617static void __exit null_exit(void)
618{
619 struct nullb *nullb;
620
621 unregister_blkdev(null_major, "nullb");
622
623 mutex_lock(&lock);
624 while (!list_empty(&nullb_list)) {
625 nullb = list_entry(nullb_list.next, struct nullb, list);
626 null_del_dev(nullb);
627 }
628 mutex_unlock(&lock);
629}
630
631module_init(null_init);
632module_exit(null_exit);
633
634MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>");
635MODULE_LICENSE("GPL");
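
null_blk hands out command slots from a per-queue bitmap: get_tag() finds and claims a free bit, put_tag() clears it and wakes any waiter. Below is a single-threaded userspace sketch of the same bitmap scheme; the driver's version layers atomic test_and_set_bit_lock() and a waitqueue on top of this idea, and the names here are illustrative.

#include <stdio.h>
#include <limits.h>

#define QUEUE_DEPTH	64
#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)

static unsigned long tag_map[(QUEUE_DEPTH + BITS_PER_LONG - 1) / BITS_PER_LONG];

static int get_tag(void)
{
	unsigned int tag;

	for (tag = 0; tag < QUEUE_DEPTH; tag++) {
		unsigned long mask = 1UL << (tag % BITS_PER_LONG);
		unsigned long *word = &tag_map[tag / BITS_PER_LONG];

		if (!(*word & mask)) {	/* first zero bit: a free tag */
			*word |= mask;	/* claim it */
			return tag;
		}
	}
	return -1;			/* queue full, caller must wait */
}

static void put_tag(unsigned int tag)
{
	tag_map[tag / BITS_PER_LONG] &= ~(1UL << (tag % BITS_PER_LONG));
}

int main(void)
{
	int a = get_tag(), b = get_tag();

	printf("tags: %d %d\n", a, b);		/* 0 1 */
	put_tag(a);
	printf("reused: %d\n", get_tag());	/* 0 again */
	return 0;
}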
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index a4660bbee8a6..8d53ed293606 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1336,57 +1336,6 @@ static int blkfront_probe(struct xenbus_device *dev,
1336 return 0; 1336 return 0;
1337} 1337}
1338 1338
1339/*
1340 * This is a clone of md_trim_bio, used to split a bio into smaller ones
1341 */
1342static void trim_bio(struct bio *bio, int offset, int size)
1343{
1344 /* 'bio' is a cloned bio which we need to trim to match
1345 * the given offset and size.
1346 * This requires adjusting bi_sector, bi_size, and bi_io_vec
1347 */
1348 int i;
1349 struct bio_vec *bvec;
1350 int sofar = 0;
1351
1352 size <<= 9;
1353 if (offset == 0 && size == bio->bi_size)
1354 return;
1355
1356 bio->bi_sector += offset;
1357 bio->bi_size = size;
1358 offset <<= 9;
1359 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1360
1361 while (bio->bi_idx < bio->bi_vcnt &&
1362 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
1363 /* remove this whole bio_vec */
1364 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
1365 bio->bi_idx++;
1366 }
1367 if (bio->bi_idx < bio->bi_vcnt) {
1368 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
1369 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
1370 }
1371 /* avoid any complications with bi_idx being non-zero*/
1372 if (bio->bi_idx) {
1373 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
1374 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
1375 bio->bi_vcnt -= bio->bi_idx;
1376 bio->bi_idx = 0;
1377 }
1378 /* Make sure vcnt and last bv are not too big */
1379 bio_for_each_segment(bvec, bio, i) {
1380 if (sofar + bvec->bv_len > size)
1381 bvec->bv_len = size - sofar;
1382 if (bvec->bv_len == 0) {
1383 bio->bi_vcnt = i;
1384 break;
1385 }
1386 sofar += bvec->bv_len;
1387 }
1388}
1389
1390static void split_bio_end(struct bio *bio, int error) 1339static void split_bio_end(struct bio *bio, int error)
1391{ 1340{
1392 struct split_bio *split_bio = bio->bi_private; 1341 struct split_bio *split_bio = bio->bi_private;
@@ -1522,7 +1471,7 @@ static int blkif_recover(struct blkfront_info *info)
1522 (unsigned int)(bio->bi_size >> 9) - offset); 1471 (unsigned int)(bio->bi_size >> 9) - offset);
1523 cloned_bio = bio_clone(bio, GFP_NOIO); 1472 cloned_bio = bio_clone(bio, GFP_NOIO);
1524 BUG_ON(cloned_bio == NULL); 1473 BUG_ON(cloned_bio == NULL);
1525 trim_bio(cloned_bio, offset, size); 1474 bio_trim(cloned_bio, offset, size);
1526 cloned_bio->bi_private = split_bio; 1475 cloned_bio->bi_private = split_bio;
1527 cloned_bio->bi_end_io = split_bio_end; 1476 cloned_bio->bi_end_io = split_bio_end;
1528 submit_bio(cloned_bio->bi_rw, cloned_bio); 1477 submit_bio(cloned_bio->bi_rw, cloned_bio);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2445fece9263..8766eabb0014 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -183,46 +183,6 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
183} 183}
184EXPORT_SYMBOL_GPL(bio_clone_mddev); 184EXPORT_SYMBOL_GPL(bio_clone_mddev);
185 185
186void md_trim_bio(struct bio *bio, int offset, int size)
187{
188 /* 'bio' is a cloned bio which we need to trim to match
189 * the given offset and size.
190 * This requires adjusting bi_sector, bi_size, and bi_io_vec
191 */
192 int i;
193 struct bio_vec *bvec;
194 int sofar = 0;
195
196 size <<= 9;
197 if (offset == 0 && size == bio->bi_size)
198 return;
199
200 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
201
202 bio_advance(bio, offset << 9);
203
204 bio->bi_size = size;
205
206 /* avoid any complications with bi_idx being non-zero*/
207 if (bio->bi_idx) {
208 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
209 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
210 bio->bi_vcnt -= bio->bi_idx;
211 bio->bi_idx = 0;
212 }
213 /* Make sure vcnt and last bv are not too big */
214 bio_for_each_segment(bvec, bio, i) {
215 if (sofar + bvec->bv_len > size)
216 bvec->bv_len = size - sofar;
217 if (bvec->bv_len == 0) {
218 bio->bi_vcnt = i;
219 break;
220 }
221 sofar += bvec->bv_len;
222 }
223}
224EXPORT_SYMBOL_GPL(md_trim_bio);
225
226/* 186/*
227 * We have a system wide 'event count' that is incremented 187 * We have a system wide 'event count' that is incremented
228 * on any 'interesting' event, and readers of /proc/mdstat 188 * on any 'interesting' event, and readers of /proc/mdstat
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b0051f2fbc0c..2f5cc8a7ef3e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -617,7 +617,6 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
617 struct mddev *mddev); 617 struct mddev *mddev);
618extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 618extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
619 struct mddev *mddev); 619 struct mddev *mddev);
620extern void md_trim_bio(struct bio *bio, int offset, int size);
621 620
622extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); 621extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
623static inline int mddev_check_plugged(struct mddev *mddev) 622static inline int mddev_check_plugged(struct mddev *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index aacf6bf352d8..af6681b19776 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1097,8 +1097,8 @@ read_again:
1097 r1_bio->read_disk = rdisk; 1097 r1_bio->read_disk = rdisk;
1098 1098
1099 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1099 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1100 md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector, 1100 bio_trim(read_bio, r1_bio->sector - bio->bi_sector,
1101 max_sectors); 1101 max_sectors);
1102 1102
1103 r1_bio->bios[rdisk] = read_bio; 1103 r1_bio->bios[rdisk] = read_bio;
1104 1104
@@ -1266,7 +1266,7 @@ read_again:
1266 continue; 1266 continue;
1267 1267
1268 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1268 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1269 md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors); 1269 bio_trim(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
1270 1270
1271 if (first_clone) { 1271 if (first_clone) {
1272 /* do behind I/O ? 1272 /* do behind I/O ?
@@ -2126,7 +2126,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
2126 wbio->bi_sector = r1_bio->sector; 2126 wbio->bi_sector = r1_bio->sector;
2127 wbio->bi_size = r1_bio->sectors << 9; 2127 wbio->bi_size = r1_bio->sectors << 9;
2128 2128
2129 md_trim_bio(wbio, sector - r1_bio->sector, sectors); 2129 bio_trim(wbio, sector - r1_bio->sector, sectors);
2130 wbio->bi_sector += rdev->data_offset; 2130 wbio->bi_sector += rdev->data_offset;
2131 wbio->bi_bdev = rdev->bdev; 2131 wbio->bi_bdev = rdev->bdev;
2132 if (submit_bio_wait(WRITE, wbio) == 0) 2132 if (submit_bio_wait(WRITE, wbio) == 0)
@@ -2241,7 +2241,7 @@ read_more:
2241 } 2241 }
2242 r1_bio->read_disk = disk; 2242 r1_bio->read_disk = disk;
2243 bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); 2243 bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
2244 md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors); 2244 bio_trim(bio, r1_bio->sector - bio->bi_sector, max_sectors);
2245 r1_bio->bios[r1_bio->read_disk] = bio; 2245 r1_bio->bios[r1_bio->read_disk] = bio;
2246 rdev = conf->mirrors[disk].rdev; 2246 rdev = conf->mirrors[disk].rdev;
2247 printk_ratelimited(KERN_ERR 2247 printk_ratelimited(KERN_ERR
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 73dc8a377522..7c3508abb5e1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1302,8 +1302,8 @@ read_again:
1302 slot = r10_bio->read_slot; 1302 slot = r10_bio->read_slot;
1303 1303
1304 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1304 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1305 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, 1305 bio_trim(read_bio, r10_bio->sector - bio->bi_sector,
1306 max_sectors); 1306 max_sectors);
1307 1307
1308 r10_bio->devs[slot].bio = read_bio; 1308 r10_bio->devs[slot].bio = read_bio;
1309 r10_bio->devs[slot].rdev = rdev; 1309 r10_bio->devs[slot].rdev = rdev;
@@ -1510,8 +1510,8 @@ retry_write:
1510 if (r10_bio->devs[i].bio) { 1510 if (r10_bio->devs[i].bio) {
1511 struct md_rdev *rdev = conf->mirrors[d].rdev; 1511 struct md_rdev *rdev = conf->mirrors[d].rdev;
1512 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1512 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1513 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, 1513 bio_trim(mbio, r10_bio->sector - bio->bi_sector,
1514 max_sectors); 1514 max_sectors);
1515 r10_bio->devs[i].bio = mbio; 1515 r10_bio->devs[i].bio = mbio;
1516 1516
1517 mbio->bi_sector = (r10_bio->devs[i].addr+ 1517 mbio->bi_sector = (r10_bio->devs[i].addr+
@@ -1553,8 +1553,8 @@ retry_write:
1553 rdev = conf->mirrors[d].rdev; 1553 rdev = conf->mirrors[d].rdev;
1554 } 1554 }
1555 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1555 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1556 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, 1556 bio_trim(mbio, r10_bio->sector - bio->bi_sector,
1557 max_sectors); 1557 max_sectors);
1558 r10_bio->devs[i].repl_bio = mbio; 1558 r10_bio->devs[i].repl_bio = mbio;
1559 1559
1560 mbio->bi_sector = (r10_bio->devs[i].addr + 1560 mbio->bi_sector = (r10_bio->devs[i].addr +
@@ -2614,7 +2614,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
2614 sectors = sect_to_write; 2614 sectors = sect_to_write;
2615 /* Write at 'sector' for 'sectors' */ 2615 /* Write at 'sector' for 'sectors' */
2616 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 2616 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2617 md_trim_bio(wbio, sector - bio->bi_sector, sectors); 2617 bio_trim(wbio, sector - bio->bi_sector, sectors);
2618 wbio->bi_sector = (r10_bio->devs[i].addr+ 2618 wbio->bi_sector = (r10_bio->devs[i].addr+
2619 choose_data_offset(r10_bio, rdev) + 2619 choose_data_offset(r10_bio, rdev) +
2620 (sector - r10_bio->sector)); 2620 (sector - r10_bio->sector));
@@ -2687,9 +2687,7 @@ read_more:
2687 (unsigned long long)r10_bio->sector); 2687 (unsigned long long)r10_bio->sector);
2688 bio = bio_clone_mddev(r10_bio->master_bio, 2688 bio = bio_clone_mddev(r10_bio->master_bio,
2689 GFP_NOIO, mddev); 2689 GFP_NOIO, mddev);
2690 md_trim_bio(bio, 2690 bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors);
2691 r10_bio->sector - bio->bi_sector,
2692 max_sectors);
2693 r10_bio->devs[slot].bio = bio; 2691 r10_bio->devs[slot].bio = bio;
2694 r10_bio->devs[slot].rdev = rdev; 2692 r10_bio->devs[slot].rdev = rdev;
2695 bio->bi_sector = r10_bio->devs[slot].addr 2693 bio->bi_sector = r10_bio->devs[slot].addr
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 5693f6d7eddb..7fe4faaa149b 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1002,7 +1002,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
1002 SCpnt->cmnd[0] = READ_6; 1002 SCpnt->cmnd[0] = READ_6;
1003 SCpnt->sc_data_direction = DMA_FROM_DEVICE; 1003 SCpnt->sc_data_direction = DMA_FROM_DEVICE;
1004 } else { 1004 } else {
1005 scmd_printk(KERN_ERR, SCpnt, "Unknown command %x\n", rq->cmd_flags); 1005 scmd_printk(KERN_ERR, SCpnt, "Unknown command %llx\n", (unsigned long long) rq->cmd_flags);
1006 goto out; 1006 goto out;
1007 } 1007 }
1008 1008
diff --git a/fs/bio.c b/fs/bio.c
index ea5035da4d9a..2bdb4e25ee77 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1805,6 +1805,52 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1805EXPORT_SYMBOL(bio_split); 1805EXPORT_SYMBOL(bio_split);
1806 1806
1807/** 1807/**
1808 * bio_trim - trim a bio
1809 * @bio: bio to trim
1810 * @offset: number of sectors to trim from the front of @bio
1811 * @size: size we want to trim @bio to, in sectors
1812 */
1813void bio_trim(struct bio *bio, int offset, int size)
1814{
1815 /* 'bio' is a cloned bio which we need to trim to match
1816 * the given offset and size.
1817 * This requires adjusting bi_sector, bi_size, and bi_io_vec
1818 */
1819 int i;
1820 struct bio_vec *bvec;
1821 int sofar = 0;
1822
1823 size <<= 9;
1824 if (offset == 0 && size == bio->bi_size)
1825 return;
1826
1827 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1828
1829 bio_advance(bio, offset << 9);
1830
1831 bio->bi_size = size;
1832
1833 /* avoid any complications with bi_idx being non-zero*/
1834 if (bio->bi_idx) {
1835 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
1836 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
1837 bio->bi_vcnt -= bio->bi_idx;
1838 bio->bi_idx = 0;
1839 }
1840 /* Make sure vcnt and last bv are not too big */
1841 bio_for_each_segment(bvec, bio, i) {
1842 if (sofar + bvec->bv_len > size)
1843 bvec->bv_len = size - sofar;
1844 if (bvec->bv_len == 0) {
1845 bio->bi_vcnt = i;
1846 break;
1847 }
1848 sofar += bvec->bv_len;
1849 }
1850}
1851EXPORT_SYMBOL_GPL(bio_trim);
1852
1853/**
1808 * bio_sector_offset - Find hardware sector offset in bio 1854 * bio_sector_offset - Find hardware sector offset in bio
1809 * @bio: bio to inspect 1855 * @bio: bio to inspect
1810 * @index: bio_vec index 1856 * @index: bio_vec index
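
bio_trim()'s offset and size arguments are in 512-byte sectors and are shifted by 9 internally (bio_advance() skips offset << 9 bytes, then bi_size becomes size << 9). A tiny arithmetic check of what a trim does to a clone's byte range, with illustrative values and not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned int bi_size = 64 << 9;	/* cloned bio currently covers 64 sectors */
	int offset = 8, size = 16;	/* keep sectors [8, 24) of the clone */

	unsigned int advance_bytes = offset << 9;	/* bio_advance() step */
	unsigned int new_size = size << 9;		/* resulting bi_size */

	printf("skip %u bytes, keep %u of %u bytes\n",
	       advance_bytes, new_size, bi_size);
	return 0;
}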
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 94b5f60076da..f77f7702fabe 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -576,7 +576,8 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
576void __init chrdev_init(void) 576void __init chrdev_init(void)
577{ 577{
578 cdev_map = kobj_map_init(base_probe, &chrdevs_lock); 578 cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
579 bdi_init(&directly_mappable_cdev_bdi); 579 if (bdi_init(&directly_mappable_cdev_bdi))
580 panic("Failed to init directly mappable cdev bdi");
580} 581}
581 582
582 583
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index dcb821617774..53d35c504240 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -799,7 +799,7 @@ void fscache_enqueue_object(struct fscache_object *object)
799 */ 799 */
800bool fscache_object_sleep_till_congested(signed long *timeoutp) 800bool fscache_object_sleep_till_congested(signed long *timeoutp)
801{ 801{
802 wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait); 802 wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait);
803 DEFINE_WAIT(wait); 803 DEFINE_WAIT(wait);
804 804
805 if (fscache_object_congested()) 805 if (fscache_object_congested())
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5f66d519a726..24819001f5c8 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -109,7 +109,7 @@ struct backing_dev_info {
109#endif 109#endif
110}; 110};
111 111
112int bdi_init(struct backing_dev_info *bdi); 112int __must_check bdi_init(struct backing_dev_info *bdi);
113void bdi_destroy(struct backing_dev_info *bdi); 113void bdi_destroy(struct backing_dev_info *bdi);
114 114
115__printf(3, 4) 115__printf(3, 4)
@@ -117,7 +117,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
117 const char *fmt, ...); 117 const char *fmt, ...);
118int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); 118int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
119void bdi_unregister(struct backing_dev_info *bdi); 119void bdi_unregister(struct backing_dev_info *bdi);
120int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); 120int __must_check bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
121void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 121void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
122 enum wb_reason reason); 122 enum wb_reason reason);
123void bdi_start_background_writeback(struct backing_dev_info *bdi); 123void bdi_start_background_writeback(struct backing_dev_info *bdi);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ec48bac5b039..060ff695085c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -218,6 +218,7 @@ struct bio_pair {
218}; 218};
219extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); 219extern struct bio_pair *bio_split(struct bio *bi, int first_sectors);
220extern void bio_pair_release(struct bio_pair *dbio); 220extern void bio_pair_release(struct bio_pair *dbio);
221extern void bio_trim(struct bio *bio, int offset, int size);
221 222
222extern struct bio_set *bioset_create(unsigned int, unsigned int); 223extern struct bio_set *bioset_create(unsigned int, unsigned int);
223extern void bioset_free(struct bio_set *); 224extern void bioset_free(struct bio_set *);
@@ -419,6 +420,8 @@ static inline void bio_list_init(struct bio_list *bl)
419 bl->head = bl->tail = NULL; 420 bl->head = bl->tail = NULL;
420} 421}
421 422
423#define BIO_EMPTY_LIST { NULL, NULL }
424
422#define bio_list_for_each(bio, bl) \ 425#define bio_list_for_each(bio, bl) \
423 for (bio = (bl)->head; bio; bio = bio->bi_next) 426 for (bio = (bl)->head; bio; bio = bio->bi_next)
424 427
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
new file mode 100644
index 000000000000..ab0e9b2025b3
--- /dev/null
+++ b/include/linux/blk-mq.h
@@ -0,0 +1,183 @@
1#ifndef BLK_MQ_H
2#define BLK_MQ_H
3
4#include <linux/blkdev.h>
5
6struct blk_mq_tags;
7
8struct blk_mq_cpu_notifier {
9 struct list_head list;
10 void *data;
11 void (*notify)(void *data, unsigned long action, unsigned int cpu);
12};
13
14struct blk_mq_hw_ctx {
15 struct {
16 spinlock_t lock;
17 struct list_head dispatch;
18 } ____cacheline_aligned_in_smp;
19
20 unsigned long state; /* BLK_MQ_S_* flags */
21 struct delayed_work delayed_work;
22
23 unsigned long flags; /* BLK_MQ_F_* flags */
24
25 struct request_queue *queue;
26 unsigned int queue_num;
27
28 void *driver_data;
29
30 unsigned int nr_ctx;
31 struct blk_mq_ctx **ctxs;
32 unsigned int nr_ctx_map;
33 unsigned long *ctx_map;
34
35 struct request **rqs;
36 struct list_head page_list;
37 struct blk_mq_tags *tags;
38
39 unsigned long queued;
40 unsigned long run;
41#define BLK_MQ_MAX_DISPATCH_ORDER 10
42 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
43
44 unsigned int queue_depth;
45 unsigned int numa_node;
46 unsigned int cmd_size; /* per-request extra data */
47
48 struct blk_mq_cpu_notifier cpu_notifier;
49 struct kobject kobj;
50};
51
52struct blk_mq_reg {
53 struct blk_mq_ops *ops;
54 unsigned int nr_hw_queues;
55 unsigned int queue_depth;
56 unsigned int reserved_tags;
57 unsigned int cmd_size; /* per-request extra data */
58 int numa_node;
59 unsigned int timeout;
60 unsigned int flags; /* BLK_MQ_F_* */
61};
62
63typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
64typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
65typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
66typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
67typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
68typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
69
70struct blk_mq_ops {
71 /*
72 * Queue request
73 */
74 queue_rq_fn *queue_rq;
75
76 /*
77 * Map to specific hardware queue
78 */
79 map_queue_fn *map_queue;
80
81 /*
82 * Called on request timeout
83 */
84 rq_timed_out_fn *timeout;
85
86 /*
87 * Override for hctx allocations (should probably go)
88 */
89 alloc_hctx_fn *alloc_hctx;
90 free_hctx_fn *free_hctx;
91
92 /*
93 * Called when the block layer side of a hardware queue has been
94 * set up, allowing the driver to allocate/init matching structures.
95 * Ditto for exit/teardown.
96 */
97 init_hctx_fn *init_hctx;
98 exit_hctx_fn *exit_hctx;
99};
100
101enum {
102 BLK_MQ_RQ_QUEUE_OK = 0, /* queued fine */
103 BLK_MQ_RQ_QUEUE_BUSY = 1, /* requeue IO for later */
104 BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */
105
106 BLK_MQ_F_SHOULD_MERGE = 1 << 0,
107 BLK_MQ_F_SHOULD_SORT = 1 << 1,
108 BLK_MQ_F_SHOULD_IPI = 1 << 2,
109
110 BLK_MQ_S_STOPPED = 1 << 0,
111
112 BLK_MQ_MAX_DEPTH = 2048,
113};
114
115struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *);
116void blk_mq_free_queue(struct request_queue *);
117int blk_mq_register_disk(struct gendisk *);
118void blk_mq_unregister_disk(struct gendisk *);
119void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
120
121void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
122
123void blk_mq_insert_request(struct request_queue *, struct request *, bool);
124void blk_mq_run_queues(struct request_queue *q, bool async);
125void blk_mq_free_request(struct request *rq);
126bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
127struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved);
128struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
129struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
130
131struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
132struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int);
133void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
134
135void blk_mq_end_io(struct request *rq, int error);
136
137void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
138void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
139void blk_mq_stop_hw_queues(struct request_queue *q);
140void blk_mq_start_stopped_hw_queues(struct request_queue *q);
141
142/*
143 * Driver command data is immediately after the request. So subtract request
144 * size to get back to the original request.
145 */
146static inline struct request *blk_mq_rq_from_pdu(void *pdu)
147{
148 return pdu - sizeof(struct request);
149}
150static inline void *blk_mq_rq_to_pdu(struct request *rq)
151{
152 return (void *) rq + sizeof(*rq);
153}
154
155static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
156 unsigned int tag)
157{
158 return hctx->rqs[tag];
159}
160
161#define queue_for_each_hw_ctx(q, hctx, i) \
162 for ((i) = 0, hctx = (q)->queue_hw_ctx[0]; \
163 (i) < (q)->nr_hw_queues; (i)++, hctx = (q)->queue_hw_ctx[i])
164
165#define queue_for_each_ctx(q, ctx, i) \
166 for ((i) = 0, ctx = per_cpu_ptr((q)->queue_ctx, 0); \
167 (i) < (q)->nr_queues; (i)++, ctx = per_cpu_ptr(q->queue_ctx, (i)))
168
169#define hctx_for_each_ctx(hctx, ctx, i) \
170 for ((i) = 0, ctx = (hctx)->ctxs[0]; \
171 (i) < (hctx)->nr_ctx; (i)++, ctx = (hctx)->ctxs[(i)])
172
173#define blk_ctx_sum(q, sum) \
174({ \
175 struct blk_mq_ctx *__x; \
176 unsigned int __ret = 0, __i; \
177 \
178 queue_for_each_ctx((q), __x, __i) \
179 __ret += sum; \
180 __ret; \
181})
182
183#endif
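
For reference, the sketch below condenses how a driver adopts the interface declared in this header, mirroring what null_blk does earlier in the series. The mydrv_* names, the single hardware queue, and the omitted error handling are all illustrative assumptions, not a definitive implementation.

#include <linux/blk-mq.h>
#include <linux/module.h>

/* Per-request driver data; blk-mq allocates it after each struct request
 * because of .cmd_size below (see blk_mq_rq_to_pdu() in the header).
 */
struct mydrv_cmd {
	struct request *rq;
};

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

	cmd->rq = rq;
	/* ...hand the request to hardware; on completion: */
	blk_mq_end_io(rq, 0);
	return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq	= mydrv_queue_rq,
	.map_queue	= blk_mq_map_queue,
};

static struct blk_mq_reg mydrv_mq_reg = {
	.ops		= &mydrv_mq_ops,
	.nr_hw_queues	= 1,
	.queue_depth	= 64,
	.cmd_size	= sizeof(struct mydrv_cmd),
	.numa_node	= NUMA_NO_NODE,
	.flags		= BLK_MQ_F_SHOULD_MERGE,
};

/* Probe-time wiring, mirroring null_add_dev() earlier in this series:
 *
 *	q = blk_mq_init_queue(&mydrv_mq_reg, mydrv_data);
 *	if (!q)
 *		goto fail;
 *	disk->queue = q;
 */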
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index fa1abeb45b76..238ef0ed62f8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -178,19 +178,20 @@ enum rq_flag_bits {
178 __REQ_MIXED_MERGE, /* merge of different types, fail separately */ 178 __REQ_MIXED_MERGE, /* merge of different types, fail separately */
179 __REQ_KERNEL, /* direct IO to kernel pages */ 179 __REQ_KERNEL, /* direct IO to kernel pages */
180 __REQ_PM, /* runtime pm request */ 180 __REQ_PM, /* runtime pm request */
181 __REQ_END, /* last of chain of requests */
181 __REQ_NR_BITS, /* stops here */ 182 __REQ_NR_BITS, /* stops here */
182}; 183};
183 184
184#define REQ_WRITE (1 << __REQ_WRITE) 185#define REQ_WRITE (1ULL << __REQ_WRITE)
185#define REQ_FAILFAST_DEV (1 << __REQ_FAILFAST_DEV) 186#define REQ_FAILFAST_DEV (1ULL << __REQ_FAILFAST_DEV)
186#define REQ_FAILFAST_TRANSPORT (1 << __REQ_FAILFAST_TRANSPORT) 187#define REQ_FAILFAST_TRANSPORT (1ULL << __REQ_FAILFAST_TRANSPORT)
187#define REQ_FAILFAST_DRIVER (1 << __REQ_FAILFAST_DRIVER) 188#define REQ_FAILFAST_DRIVER (1ULL << __REQ_FAILFAST_DRIVER)
188#define REQ_SYNC (1 << __REQ_SYNC) 189#define REQ_SYNC (1ULL << __REQ_SYNC)
189#define REQ_META (1 << __REQ_META) 190#define REQ_META (1ULL << __REQ_META)
190#define REQ_PRIO (1 << __REQ_PRIO) 191#define REQ_PRIO (1ULL << __REQ_PRIO)
191#define REQ_DISCARD (1 << __REQ_DISCARD) 192#define REQ_DISCARD (1ULL << __REQ_DISCARD)
192#define REQ_WRITE_SAME (1 << __REQ_WRITE_SAME) 193#define REQ_WRITE_SAME (1ULL << __REQ_WRITE_SAME)
193#define REQ_NOIDLE (1 << __REQ_NOIDLE) 194#define REQ_NOIDLE (1ULL << __REQ_NOIDLE)
194 195
195#define REQ_FAILFAST_MASK \ 196#define REQ_FAILFAST_MASK \
196 (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) 197 (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -206,28 +207,29 @@ enum rq_flag_bits {
206#define REQ_NOMERGE_FLAGS \ 207#define REQ_NOMERGE_FLAGS \
207 (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) 208 (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
208 209
209#define REQ_RAHEAD (1 << __REQ_RAHEAD) 210#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
210#define REQ_THROTTLED (1 << __REQ_THROTTLED) 211#define REQ_THROTTLED (1ULL << __REQ_THROTTLED)
211 212
212#define REQ_SORTED (1 << __REQ_SORTED) 213#define REQ_SORTED (1ULL << __REQ_SORTED)
213#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) 214#define REQ_SOFTBARRIER (1ULL << __REQ_SOFTBARRIER)
214#define REQ_FUA (1 << __REQ_FUA) 215#define REQ_FUA (1ULL << __REQ_FUA)
215#define REQ_NOMERGE (1 << __REQ_NOMERGE) 216#define REQ_NOMERGE (1ULL << __REQ_NOMERGE)
216#define REQ_STARTED (1 << __REQ_STARTED) 217#define REQ_STARTED (1ULL << __REQ_STARTED)
217#define REQ_DONTPREP (1 << __REQ_DONTPREP) 218#define REQ_DONTPREP (1ULL << __REQ_DONTPREP)
218#define REQ_QUEUED (1 << __REQ_QUEUED) 219#define REQ_QUEUED (1ULL << __REQ_QUEUED)
219#define REQ_ELVPRIV (1 << __REQ_ELVPRIV) 220#define REQ_ELVPRIV (1ULL << __REQ_ELVPRIV)
220#define REQ_FAILED (1 << __REQ_FAILED) 221#define REQ_FAILED (1ULL << __REQ_FAILED)
221#define REQ_QUIET (1 << __REQ_QUIET) 222#define REQ_QUIET (1ULL << __REQ_QUIET)
222#define REQ_PREEMPT (1 << __REQ_PREEMPT) 223#define REQ_PREEMPT (1ULL << __REQ_PREEMPT)
223#define REQ_ALLOCED (1 << __REQ_ALLOCED) 224#define REQ_ALLOCED (1ULL << __REQ_ALLOCED)
224#define REQ_COPY_USER (1 << __REQ_COPY_USER) 225#define REQ_COPY_USER (1ULL << __REQ_COPY_USER)
225#define REQ_FLUSH (1 << __REQ_FLUSH) 226#define REQ_FLUSH (1ULL << __REQ_FLUSH)
226#define REQ_FLUSH_SEQ (1 << __REQ_FLUSH_SEQ) 227#define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ)
227#define REQ_IO_STAT (1 << __REQ_IO_STAT) 228#define REQ_IO_STAT (1ULL << __REQ_IO_STAT)
228#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) 229#define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE)
229#define REQ_SECURE (1 << __REQ_SECURE) 230#define REQ_SECURE (1ULL << __REQ_SECURE)
230#define REQ_KERNEL (1 << __REQ_KERNEL) 231#define REQ_KERNEL (1ULL << __REQ_KERNEL)
231#define REQ_PM (1 << __REQ_PM) 232#define REQ_PM (1ULL << __REQ_PM)
233#define REQ_END (1ULL << __REQ_END)
232 234
233#endif /* __LINUX_BLK_TYPES_H */ 235#endif /* __LINUX_BLK_TYPES_H */
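
The switch from "1 <<" to "1ULL <<" matters because cmd_flags becomes a u64 in the blkdev.h hunk that follows: any flag at bit 32 or above (such as the new __REQ_END) must be built from a 64-bit constant, since shifting a 32-bit int by 32 is undefined. A standalone demonstration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t high = 1ULL << 32;	/* well-defined: bit 32 set */
	/* "1 << 32" would shift a 32-bit int by its full width, which is
	 * undefined behaviour in C; a flag defined that way could silently
	 * evaluate to 0 or garbage once the enum grows past bit 31.
	 */
	uint64_t flags = high | (1ULL << 0);

	printf("bit 32 flag: %#llx\n", (unsigned long long)high);
	printf("flags:       %#llx\n", (unsigned long long)flags);
	return 0;
}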
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0e6f765aa1f5..f26ec20f6354 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -8,6 +8,7 @@
8#include <linux/major.h> 8#include <linux/major.h>
9#include <linux/genhd.h> 9#include <linux/genhd.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/llist.h>
11#include <linux/timer.h> 12#include <linux/timer.h>
12#include <linux/workqueue.h> 13#include <linux/workqueue.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
@@ -94,12 +95,19 @@ enum rq_cmd_type_bits {
94 * as well! 95 * as well!
95 */ 96 */
96struct request { 97struct request {
97 struct list_head queuelist; 98 union {
98 struct call_single_data csd; 99 struct list_head queuelist;
100 struct llist_node ll_list;
101 };
102 union {
103 struct call_single_data csd;
104 struct work_struct mq_flush_data;
105 };
99 106
100 struct request_queue *q; 107 struct request_queue *q;
108 struct blk_mq_ctx *mq_ctx;
101 109
102 unsigned int cmd_flags; 110 u64 cmd_flags;
103 enum rq_cmd_type_bits cmd_type; 111 enum rq_cmd_type_bits cmd_type;
104 unsigned long atomic_flags; 112 unsigned long atomic_flags;
105 113
@@ -160,8 +168,6 @@ struct request {
160 168
161 unsigned short ioprio; 169 unsigned short ioprio;
162 170
163 int ref_count;
164
165 void *special; /* opaque pointer available for LLD use */ 171 void *special; /* opaque pointer available for LLD use */
166 char *buffer; /* kaddr of the current segment if available */ 172 char *buffer; /* kaddr of the current segment if available */
167 173
@@ -215,6 +221,8 @@ struct request_pm_state
215 221
216#include <linux/elevator.h> 222#include <linux/elevator.h>
217 223
224struct blk_queue_ctx;
225
218typedef void (request_fn_proc) (struct request_queue *q); 226typedef void (request_fn_proc) (struct request_queue *q);
219typedef void (make_request_fn) (struct request_queue *q, struct bio *bio); 227typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
220typedef int (prep_rq_fn) (struct request_queue *, struct request *); 228typedef int (prep_rq_fn) (struct request_queue *, struct request *);
@@ -313,6 +321,18 @@ struct request_queue {
313 dma_drain_needed_fn *dma_drain_needed; 321 dma_drain_needed_fn *dma_drain_needed;
314 lld_busy_fn *lld_busy_fn; 322 lld_busy_fn *lld_busy_fn;
315 323
324 struct blk_mq_ops *mq_ops;
325
326 unsigned int *mq_map;
327
328 /* sw queues */
329 struct blk_mq_ctx *queue_ctx;
330 unsigned int nr_queues;
331
332 /* hw dispatch queues */
333 struct blk_mq_hw_ctx **queue_hw_ctx;
334 unsigned int nr_hw_queues;
335
316 /* 336 /*
317 * Dispatch queue sorting 337 * Dispatch queue sorting
318 */ 338 */
@@ -361,6 +381,11 @@ struct request_queue {
361 */ 381 */
362 struct kobject kobj; 382 struct kobject kobj;
363 383
384 /*
385 * mq queue kobject
386 */
387 struct kobject mq_kobj;
388
364#ifdef CONFIG_PM_RUNTIME 389#ifdef CONFIG_PM_RUNTIME
365 struct device *dev; 390 struct device *dev;
366 int rpm_status; 391 int rpm_status;
@@ -425,7 +450,13 @@ struct request_queue {
425 unsigned long flush_pending_since; 450 unsigned long flush_pending_since;
426 struct list_head flush_queue[2]; 451 struct list_head flush_queue[2];
427 struct list_head flush_data_in_flight; 452 struct list_head flush_data_in_flight;
428 struct request flush_rq; 453 union {
454 struct request flush_rq;
455 struct {
456 spinlock_t mq_flush_lock;
457 struct work_struct mq_flush_work;
458 };
459 };
429 460
430 struct mutex sysfs_lock; 461 struct mutex sysfs_lock;
431 462
@@ -437,14 +468,14 @@ struct request_queue {
437 struct bsg_class_device bsg_dev; 468 struct bsg_class_device bsg_dev;
438#endif 469#endif
439 470
440#ifdef CONFIG_BLK_CGROUP
441 struct list_head all_q_node;
442#endif
443#ifdef CONFIG_BLK_DEV_THROTTLING 471#ifdef CONFIG_BLK_DEV_THROTTLING
444 /* Throttle data */ 472 /* Throttle data */
445 struct throtl_data *td; 473 struct throtl_data *td;
446#endif 474#endif
447 struct rcu_head rcu_head; 475 struct rcu_head rcu_head;
476 wait_queue_head_t mq_freeze_wq;
477 struct percpu_counter mq_usage_counter;
478 struct list_head all_q_node;
448}; 479};
449 480
450#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ 481#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
@@ -467,6 +498,7 @@ struct request_queue {
467#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ 498#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */
468#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ 499#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */
469#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ 500#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */
501#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */
470 502
471#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 503#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
472 (1 << QUEUE_FLAG_STACKABLE) | \ 504 (1 << QUEUE_FLAG_STACKABLE) | \
@@ -539,6 +571,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
539#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) 571#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
540#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) 572#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
541#define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags) 573#define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)
574#define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
542#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) 575#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
543#define blk_queue_noxmerges(q) \ 576#define blk_queue_noxmerges(q) \
544 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) 577 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
@@ -570,7 +603,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
570 603
571#define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) 604#define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist)
572 605
573#define rq_data_dir(rq) ((rq)->cmd_flags & 1) 606#define rq_data_dir(rq) (((rq)->cmd_flags & 1) != 0)
574 607
575static inline unsigned int blk_queue_cluster(struct request_queue *q) 608static inline unsigned int blk_queue_cluster(struct request_queue *q)
576{ 609{
@@ -1013,6 +1046,7 @@ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
1013struct blk_plug { 1046struct blk_plug {
1014 unsigned long magic; /* detect uninitialized use-cases */ 1047 unsigned long magic; /* detect uninitialized use-cases */
1015 struct list_head list; /* requests */ 1048 struct list_head list; /* requests */
1049 struct list_head mq_list; /* blk-mq requests */
1016 struct list_head cb_list; /* md requires an unplug callback */ 1050 struct list_head cb_list; /* md requires an unplug callback */
1017}; 1051};
1018#define BLK_MAX_REQUEST_COUNT 16 1052#define BLK_MAX_REQUEST_COUNT 16
@@ -1050,7 +1084,10 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
1050{ 1084{
1051 struct blk_plug *plug = tsk->plug; 1085 struct blk_plug *plug = tsk->plug;
1052 1086
1053 return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list)); 1087 return plug &&
1088 (!list_empty(&plug->list) ||
1089 !list_empty(&plug->mq_list) ||
1090 !list_empty(&plug->cb_list));
1054} 1091}
1055 1092
1056/* 1093/*
@@ -1325,6 +1362,7 @@ static inline void put_dev_sector(Sector p)
1325 1362
1326struct work_struct; 1363struct work_struct;
1327int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1364int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
1365int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
1328 1366
1329#ifdef CONFIG_BLK_CGROUP 1367#ifdef CONFIG_BLK_CGROUP
1330/* 1368/*
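The blkdev.h hunks do several things at once: cmd_flags becomes u64, request and request_queue grow blk-mq context pointers plus software and hardware queue arrays, blk_plug gains an mq_list that blk_needs_flush_plug() now checks, and rq_data_dir() picks up a "!= 0". That last tweak is worth a small illustration: with a 64-bit cmd_flags the old macro's result takes the 64-bit type, while the new form always yields a plain int 0 or 1. Illustrative user-space sketch only, not kernel code:

#include <stdint.h>
#include <stdio.h>

#define DIR_OLD(flags)  ((flags) & 1)          /* result has the (now 64-bit) type of flags */
#define DIR_NEW(flags)  (((flags) & 1) != 0)   /* result is always an int 0 or 1 */

int main(void)
{
	uint64_t cmd_flags = 1;	/* stand-in for rq->cmd_flags with the write bit set */

	printf("old result: %zu bytes wide, new result: %zu bytes wide\n",
	       sizeof(DIR_OLD(cmd_flags)), sizeof(DIR_NEW(cmd_flags)));
	return 0;
}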
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 7c2e030e72f1..afc1343df3c7 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -5,6 +5,7 @@
5#include <linux/relay.h> 5#include <linux/relay.h>
6#include <linux/compat.h> 6#include <linux/compat.h>
7#include <uapi/linux/blktrace_api.h> 7#include <uapi/linux/blktrace_api.h>
8#include <linux/list.h>
8 9
9#if defined(CONFIG_BLK_DEV_IO_TRACE) 10#if defined(CONFIG_BLK_DEV_IO_TRACE)
10 11
@@ -23,6 +24,7 @@ struct blk_trace {
23 struct dentry *dir; 24 struct dentry *dir;
24 struct dentry *dropped_file; 25 struct dentry *dropped_file;
25 struct dentry *msg_file; 26 struct dentry *msg_file;
27 struct list_head running_list;
26 atomic_t dropped; 28 atomic_t dropped;
27}; 29};
28 30
@@ -87,7 +89,7 @@ static inline int blk_trace_init_sysfs(struct device *dev)
87#ifdef CONFIG_COMPAT 89#ifdef CONFIG_COMPAT
88 90
89struct compat_blk_user_trace_setup { 91struct compat_blk_user_trace_setup {
90 char name[32]; 92 char name[BLKTRACE_BDEV_SIZE];
91 u16 act_mask; 93 u16 act_mask;
92 u32 buf_size; 94 u32 buf_size;
93 u32 buf_nr; 95 u32 buf_nr;
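Two small things happen in blktrace_api.h: struct blk_trace gains a running_list node (used by the blktrace.c change further down to fan out BLK_TN_PROCESS notes), and the compat setup structure sizes its name buffer with BLKTRACE_BDEV_SIZE instead of a bare 32. A hedged sketch of the second idea, with all names invented:

#include <stdio.h>
#include <string.h>

#define BDEV_NAME_SIZE 32	/* stand-in for BLKTRACE_BDEV_SIZE */

struct setup_args {
	char name[BDEV_NAME_SIZE];
};

static void set_name(struct setup_args *args, const char *src)
{
	/* sizeof(args->name) follows the shared define, so setup and copy
	 * paths cannot drift apart */
	snprintf(args->name, sizeof(args->name), "%s", src);
}

int main(void)
{
	struct setup_args args;

	set_name(&args, "sda");
	printf("%s\n", args.name);
	return 0;
}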
diff --git a/include/linux/percpu_ida.h b/include/linux/percpu_ida.h
index 0b23edbee309..1900bd0fa639 100644
--- a/include/linux/percpu_ida.h
+++ b/include/linux/percpu_ida.h
@@ -16,6 +16,8 @@ struct percpu_ida {
16 * percpu_ida_init() 16 * percpu_ida_init()
17 */ 17 */
18 unsigned nr_tags; 18 unsigned nr_tags;
19 unsigned percpu_max_size;
20 unsigned percpu_batch_size;
19 21
20 struct percpu_ida_cpu __percpu *tag_cpu; 22 struct percpu_ida_cpu __percpu *tag_cpu;
21 23
@@ -51,10 +53,29 @@ struct percpu_ida {
51 } ____cacheline_aligned_in_smp; 53 } ____cacheline_aligned_in_smp;
52}; 54};
53 55
56/*
57 * Number of tags we move between the percpu freelist and the global freelist at
58 * a time
59 */
60#define IDA_DEFAULT_PCPU_BATCH_MOVE 32U
61/* Max size of the percpu freelist */
62#define IDA_DEFAULT_PCPU_SIZE ((IDA_DEFAULT_PCPU_BATCH_MOVE * 3) / 2)
63
54int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp); 64int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp);
55void percpu_ida_free(struct percpu_ida *pool, unsigned tag); 65void percpu_ida_free(struct percpu_ida *pool, unsigned tag);
56 66
57void percpu_ida_destroy(struct percpu_ida *pool); 67void percpu_ida_destroy(struct percpu_ida *pool);
58int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags); 68int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
69 unsigned long max_size, unsigned long batch_size);
70static inline int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
71{
72 return __percpu_ida_init(pool, nr_tags, IDA_DEFAULT_PCPU_SIZE,
73 IDA_DEFAULT_PCPU_BATCH_MOVE);
74}
75
76typedef int (*percpu_ida_cb)(unsigned, void *);
77int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
78 void *data);
59 79
80unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu);
60#endif /* __PERCPU_IDA_H__ */ 81#endif /* __PERCPU_IDA_H__ */
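The header now exposes the tuning knobs: percpu_ida_init() keeps its old two-argument form as an inline wrapper, while __percpu_ida_init() lets a caller choose the per-cpu cache size and batch size. A usage sketch; the pool names and the explicit sizes are invented:

#include <linux/percpu_ida.h>

static struct percpu_ida default_pool, tuned_pool;

static int example_init(void)
{
	int ret;

	/* old behaviour: IDA_DEFAULT_PCPU_SIZE and IDA_DEFAULT_PCPU_BATCH_MOVE */
	ret = percpu_ida_init(&default_pool, 128);
	if (ret)
		return ret;

	/* explicit per-cpu cache of 8 tags, refilled 4 at a time (arbitrary values) */
	ret = __percpu_ida_init(&tuned_pool, 1024, 8, 4);
	if (ret)
		percpu_ida_destroy(&default_pool);
	return ret;
}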
diff --git a/kernel/smp.c b/kernel/smp.c
index f5768b0c816a..46116100f0ee 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -18,6 +18,7 @@
18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19enum { 19enum {
20 CSD_FLAG_LOCK = 0x01, 20 CSD_FLAG_LOCK = 0x01,
21 CSD_FLAG_WAIT = 0x02,
21}; 22};
22 23
23struct call_function_data { 24struct call_function_data {
@@ -124,7 +125,7 @@ static void csd_lock(struct call_single_data *csd)
124 125
125static void csd_unlock(struct call_single_data *csd) 126static void csd_unlock(struct call_single_data *csd)
126{ 127{
127 WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); 128 WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK));
128 129
129 /* 130 /*
130 * ensure we're all done before releasing data: 131 * ensure we're all done before releasing data:
@@ -146,6 +147,9 @@ void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
146 unsigned long flags; 147 unsigned long flags;
147 int ipi; 148 int ipi;
148 149
150 if (wait)
151 csd->flags |= CSD_FLAG_WAIT;
152
149 raw_spin_lock_irqsave(&dst->lock, flags); 153 raw_spin_lock_irqsave(&dst->lock, flags);
150 ipi = list_empty(&dst->list); 154 ipi = list_empty(&dst->list);
151 list_add_tail(&csd->list, &dst->list); 155 list_add_tail(&csd->list, &dst->list);
@@ -340,6 +344,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd,
340 } 344 }
341 put_cpu(); 345 put_cpu();
342} 346}
347EXPORT_SYMBOL_GPL(__smp_call_function_single);
343 348
344/** 349/**
345 * smp_call_function_many(): Run a function on a set of other CPUs. 350 * smp_call_function_many(): Run a function on a set of other CPUs.
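Besides the CSD_FLAG_WAIT bookkeeping, the notable piece for the rest of this pull is the EXPORT_SYMBOL_GPL: code built as a module can now drive __smp_call_function_single() with its own call_single_data (the block changes in this same pull are the likely motivation). A hedged sketch with the handler and caller invented:

#include <linux/smp.h>

static void remote_work(void *info)
{
	/* runs on the target CPU out of the IPI path */
}

static struct call_single_data my_csd = {
	.func = remote_work,
	.info = NULL,
};

static void kick(int cpu)
{
	/* wait == 0: queue the csd for the target CPU and return without blocking */
	__smp_call_function_single(cpu, &my_csd, 0);
}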
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b8b8560bfb95..f785aef65799 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -26,6 +26,7 @@
26#include <linux/export.h> 26#include <linux/export.h>
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/uaccess.h> 28#include <linux/uaccess.h>
29#include <linux/list.h>
29 30
30#include <trace/events/block.h> 31#include <trace/events/block.h>
31 32
@@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1;
38static struct trace_array *blk_tr; 39static struct trace_array *blk_tr;
39static bool blk_tracer_enabled __read_mostly; 40static bool blk_tracer_enabled __read_mostly;
40 41
42static LIST_HEAD(running_trace_list);
43static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
44
41/* Select an alternative, minimalistic output than the original one */ 45/* Select an alternative, minimalistic output than the original one */
42#define TRACE_BLK_OPT_CLASSIC 0x1 46#define TRACE_BLK_OPT_CLASSIC 0x1
43 47
@@ -107,10 +111,18 @@ record_it:
107 * Send out a notify for this process, if we haven't done so since a trace 111 * Send out a notify for this process, if we haven't done so since a trace
108 * started 112 * started
109 */ 113 */
110static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) 114static void trace_note_tsk(struct task_struct *tsk)
111{ 115{
116 unsigned long flags;
117 struct blk_trace *bt;
118
112 tsk->btrace_seq = blktrace_seq; 119 tsk->btrace_seq = blktrace_seq;
113 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); 120 spin_lock_irqsave(&running_trace_lock, flags);
121 list_for_each_entry(bt, &running_trace_list, running_list) {
122 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
123 sizeof(tsk->comm));
124 }
125 spin_unlock_irqrestore(&running_trace_lock, flags);
114} 126}
115 127
116static void trace_note_time(struct blk_trace *bt) 128static void trace_note_time(struct blk_trace *bt)
@@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
229 goto record_it; 241 goto record_it;
230 } 242 }
231 243
244 if (unlikely(tsk->btrace_seq != blktrace_seq))
245 trace_note_tsk(tsk);
246
232 /* 247 /*
233 * A word about the locking here - we disable interrupts to reserve 248 * A word about the locking here - we disable interrupts to reserve
234 * some space in the relay per-cpu buffer, to prevent an irq 249 * some space in the relay per-cpu buffer, to prevent an irq
235 * from coming in and stepping on our toes. 250 * from coming in and stepping on our toes.
236 */ 251 */
237 local_irq_save(flags); 252 local_irq_save(flags);
238
239 if (unlikely(tsk->btrace_seq != blktrace_seq))
240 trace_note_tsk(bt, tsk);
241
242 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); 253 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
243 if (t) { 254 if (t) {
244 sequence = per_cpu_ptr(bt->sequence, cpu); 255 sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
477 bt->dir = dir; 488 bt->dir = dir;
478 bt->dev = dev; 489 bt->dev = dev;
479 atomic_set(&bt->dropped, 0); 490 atomic_set(&bt->dropped, 0);
491 INIT_LIST_HEAD(&bt->running_list);
480 492
481 ret = -EIO; 493 ret = -EIO;
482 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, 494 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
@@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
567 .end_lba = cbuts.end_lba, 579 .end_lba = cbuts.end_lba,
568 .pid = cbuts.pid, 580 .pid = cbuts.pid,
569 }; 581 };
570 memcpy(&buts.name, &cbuts.name, 32);
571 582
572 ret = do_blk_trace_setup(q, name, dev, bdev, &buts); 583 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
573 if (ret) 584 if (ret)
574 return ret; 585 return ret;
575 586
576 if (copy_to_user(arg, &buts.name, 32)) { 587 if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
577 blk_trace_remove(q); 588 blk_trace_remove(q);
578 return -EFAULT; 589 return -EFAULT;
579 } 590 }
@@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
601 blktrace_seq++; 612 blktrace_seq++;
602 smp_mb(); 613 smp_mb();
603 bt->trace_state = Blktrace_running; 614 bt->trace_state = Blktrace_running;
615 spin_lock_irq(&running_trace_lock);
616 list_add(&bt->running_list, &running_trace_list);
617 spin_unlock_irq(&running_trace_lock);
604 618
605 trace_note_time(bt); 619 trace_note_time(bt);
606 ret = 0; 620 ret = 0;
@@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
608 } else { 622 } else {
609 if (bt->trace_state == Blktrace_running) { 623 if (bt->trace_state == Blktrace_running) {
610 bt->trace_state = Blktrace_stopped; 624 bt->trace_state = Blktrace_stopped;
625 spin_lock_irq(&running_trace_lock);
626 list_del_init(&bt->running_list);
627 spin_unlock_irq(&running_trace_lock);
611 relay_flush(bt->rchan); 628 relay_flush(bt->rchan);
612 ret = 0; 629 ret = 0;
613 } 630 }
@@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q)
1472 if (atomic_dec_and_test(&blk_probes_ref)) 1489 if (atomic_dec_and_test(&blk_probes_ref))
1473 blk_unregister_tracepoints(); 1490 blk_unregister_tracepoints();
1474 1491
1492 spin_lock_irq(&running_trace_lock);
1493 list_del(&bt->running_list);
1494 spin_unlock_irq(&running_trace_lock);
1475 blk_trace_free(bt); 1495 blk_trace_free(bt);
1476 return 0; 1496 return 0;
1477} 1497}
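The net effect in blktrace.c: trace_note_tsk() no longer takes a single bt but walks a global running_trace_list under running_trace_lock, so a process-name note reaches every running trace; traces add themselves on start and drop off on stop or removal. The registry pattern it uses, reduced to a generic sketch with invented names:

#include <linux/list.h>
#include <linux/spinlock.h>

struct tracer {
	struct list_head node;
	/* per-tracer state ... */
};

static LIST_HEAD(active_tracers);
static DEFINE_SPINLOCK(active_lock);

static void tracer_start(struct tracer *t)
{
	spin_lock_irq(&active_lock);
	list_add(&t->node, &active_tracers);
	spin_unlock_irq(&active_lock);
}

static void broadcast_note(void (*note)(struct tracer *))
{
	unsigned long flags;
	struct tracer *t;

	/* fan one event out to every registered tracer */
	spin_lock_irqsave(&active_lock, flags);
	list_for_each_entry(t, &active_tracers, node)
		note(t);
	spin_unlock_irqrestore(&active_lock, flags);
}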
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 93c5d5ecff4e..7473ee3b4ee7 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -60,14 +60,15 @@ static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
60void percpu_counter_set(struct percpu_counter *fbc, s64 amount) 60void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
61{ 61{
62 int cpu; 62 int cpu;
63 unsigned long flags;
63 64
64 raw_spin_lock(&fbc->lock); 65 raw_spin_lock_irqsave(&fbc->lock, flags);
65 for_each_possible_cpu(cpu) { 66 for_each_possible_cpu(cpu) {
66 s32 *pcount = per_cpu_ptr(fbc->counters, cpu); 67 s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
67 *pcount = 0; 68 *pcount = 0;
68 } 69 }
69 fbc->count = amount; 70 fbc->count = amount;
70 raw_spin_unlock(&fbc->lock); 71 raw_spin_unlock_irqrestore(&fbc->lock, flags);
71} 72}
72EXPORT_SYMBOL(percpu_counter_set); 73EXPORT_SYMBOL(percpu_counter_set);
73 74
@@ -78,9 +79,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
78 preempt_disable(); 79 preempt_disable();
79 count = __this_cpu_read(*fbc->counters) + amount; 80 count = __this_cpu_read(*fbc->counters) + amount;
80 if (count >= batch || count <= -batch) { 81 if (count >= batch || count <= -batch) {
81 raw_spin_lock(&fbc->lock); 82 unsigned long flags;
83 raw_spin_lock_irqsave(&fbc->lock, flags);
82 fbc->count += count; 84 fbc->count += count;
83 raw_spin_unlock(&fbc->lock); 85 raw_spin_unlock_irqrestore(&fbc->lock, flags);
84 __this_cpu_write(*fbc->counters, 0); 86 __this_cpu_write(*fbc->counters, 0);
85 } else { 87 } else {
86 __this_cpu_write(*fbc->counters, count); 88 __this_cpu_write(*fbc->counters, count);
@@ -97,14 +99,15 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
97{ 99{
98 s64 ret; 100 s64 ret;
99 int cpu; 101 int cpu;
102 unsigned long flags;
100 103
101 raw_spin_lock(&fbc->lock); 104 raw_spin_lock_irqsave(&fbc->lock, flags);
102 ret = fbc->count; 105 ret = fbc->count;
103 for_each_online_cpu(cpu) { 106 for_each_online_cpu(cpu) {
104 s32 *pcount = per_cpu_ptr(fbc->counters, cpu); 107 s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
105 ret += *pcount; 108 ret += *pcount;
106 } 109 }
107 raw_spin_unlock(&fbc->lock); 110 raw_spin_unlock_irqrestore(&fbc->lock, flags);
108 return ret; 111 return ret;
109} 112}
110EXPORT_SYMBOL(__percpu_counter_sum); 113EXPORT_SYMBOL(__percpu_counter_sum);
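The percpu_counter changes swap the plain raw_spin_lock for the irqsave variants, so a counter can be updated from interrupt context without risking a self-deadlock on the CPU that is folding its per-cpu delta into the global count (the mq_usage_counter added to request_queue above is presumably the kind of user that needs this). A hedged sketch of the usage this enables; the counter and both call sites are invented:

#include <linux/init.h>
#include <linux/percpu_counter.h>

static struct percpu_counter inflight;

static int __init example_init(void)
{
	return percpu_counter_init(&inflight, 0);
}

/* process context: may cross the batch and fold into the global count,
 * taking fbc->lock internally */
static void submit_one(void)
{
	percpu_counter_inc(&inflight);
}

/* completion (interrupt) context: takes the same internal lock, which is
 * only safe now that it is acquired with raw_spin_lock_irqsave() */
static void complete_one(void)
{
	percpu_counter_dec(&inflight);
}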
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
index bab1ba2a4c71..b0698ea972c6 100644
--- a/lib/percpu_ida.c
+++ b/lib/percpu_ida.c
@@ -30,15 +30,6 @@
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/percpu_ida.h> 31#include <linux/percpu_ida.h>
32 32
33/*
34 * Number of tags we move between the percpu freelist and the global freelist at
35 * a time
36 */
37#define IDA_PCPU_BATCH_MOVE 32U
38
39/* Max size of percpu freelist, */
40#define IDA_PCPU_SIZE ((IDA_PCPU_BATCH_MOVE * 3) / 2)
41
42struct percpu_ida_cpu { 33struct percpu_ida_cpu {
43 /* 34 /*
44 * Even though this is percpu, we need a lock for tag stealing by remote 35 * Even though this is percpu, we need a lock for tag stealing by remote
@@ -78,7 +69,7 @@ static inline void steal_tags(struct percpu_ida *pool,
78 struct percpu_ida_cpu *remote; 69 struct percpu_ida_cpu *remote;
79 70
80 for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags); 71 for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
81 cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2; 72 cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2;
82 cpus_have_tags--) { 73 cpus_have_tags--) {
83 cpu = cpumask_next(cpu, &pool->cpus_have_tags); 74 cpu = cpumask_next(cpu, &pool->cpus_have_tags);
84 75
@@ -123,7 +114,7 @@ static inline void alloc_global_tags(struct percpu_ida *pool,
123{ 114{
124 move_tags(tags->freelist, &tags->nr_free, 115 move_tags(tags->freelist, &tags->nr_free,
125 pool->freelist, &pool->nr_free, 116 pool->freelist, &pool->nr_free,
126 min(pool->nr_free, IDA_PCPU_BATCH_MOVE)); 117 min(pool->nr_free, pool->percpu_batch_size));
127} 118}
128 119
129static inline unsigned alloc_local_tag(struct percpu_ida *pool, 120static inline unsigned alloc_local_tag(struct percpu_ida *pool,
@@ -245,17 +236,17 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
245 wake_up(&pool->wait); 236 wake_up(&pool->wait);
246 } 237 }
247 238
248 if (nr_free == IDA_PCPU_SIZE) { 239 if (nr_free == pool->percpu_max_size) {
249 spin_lock(&pool->lock); 240 spin_lock(&pool->lock);
250 241
251 /* 242 /*
252 * Global lock held and irqs disabled, don't need percpu 243 * Global lock held and irqs disabled, don't need percpu
253 * lock 244 * lock
254 */ 245 */
255 if (tags->nr_free == IDA_PCPU_SIZE) { 246 if (tags->nr_free == pool->percpu_max_size) {
256 move_tags(pool->freelist, &pool->nr_free, 247 move_tags(pool->freelist, &pool->nr_free,
257 tags->freelist, &tags->nr_free, 248 tags->freelist, &tags->nr_free,
258 IDA_PCPU_BATCH_MOVE); 249 pool->percpu_batch_size);
259 250
260 wake_up(&pool->wait); 251 wake_up(&pool->wait);
261 } 252 }
@@ -292,7 +283,8 @@ EXPORT_SYMBOL_GPL(percpu_ida_destroy);
292 * Allocation is percpu, but sharding is limited by nr_tags - for best 283 * Allocation is percpu, but sharding is limited by nr_tags - for best
293 * performance, the workload should not span more cpus than nr_tags / 128. 284 * performance, the workload should not span more cpus than nr_tags / 128.
294 */ 285 */
295int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags) 286int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
287 unsigned long max_size, unsigned long batch_size)
296{ 288{
297 unsigned i, cpu, order; 289 unsigned i, cpu, order;
298 290
@@ -301,6 +293,8 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
301 init_waitqueue_head(&pool->wait); 293 init_waitqueue_head(&pool->wait);
302 spin_lock_init(&pool->lock); 294 spin_lock_init(&pool->lock);
303 pool->nr_tags = nr_tags; 295 pool->nr_tags = nr_tags;
296 pool->percpu_max_size = max_size;
297 pool->percpu_batch_size = batch_size;
304 298
305 /* Guard against overflow */ 299 /* Guard against overflow */
306 if (nr_tags > (unsigned) INT_MAX + 1) { 300 if (nr_tags > (unsigned) INT_MAX + 1) {
@@ -319,7 +313,7 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
319 pool->nr_free = nr_tags; 313 pool->nr_free = nr_tags;
320 314
321 pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) + 315 pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
322 IDA_PCPU_SIZE * sizeof(unsigned), 316 pool->percpu_max_size * sizeof(unsigned),
323 sizeof(unsigned)); 317 sizeof(unsigned));
324 if (!pool->tag_cpu) 318 if (!pool->tag_cpu)
325 goto err; 319 goto err;
@@ -332,4 +326,65 @@ err:
332 percpu_ida_destroy(pool); 326 percpu_ida_destroy(pool);
333 return -ENOMEM; 327 return -ENOMEM;
334} 328}
335EXPORT_SYMBOL_GPL(percpu_ida_init); 329EXPORT_SYMBOL_GPL(__percpu_ida_init);
330
331/**
332 * percpu_ida_for_each_free - iterate free ids of a pool
333 * @pool: pool to iterate
334 * @fn: callback invoked for each free id
335 * @data: parameter for @fn
336 *
337 * Note: this does not guarantee a strict walk of all free ids. Some free
338 * ids might be missed, some might be visited twice, and some might no
339 * longer be free by the time @fn sees them.
340 */
341int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
342 void *data)
343{
344 unsigned long flags;
345 struct percpu_ida_cpu *remote;
346 unsigned cpu, i, err = 0;
347
348 local_irq_save(flags);
349 for_each_possible_cpu(cpu) {
350 remote = per_cpu_ptr(pool->tag_cpu, cpu);
351 spin_lock(&remote->lock);
352 for (i = 0; i < remote->nr_free; i++) {
353 err = fn(remote->freelist[i], data);
354 if (err)
355 break;
356 }
357 spin_unlock(&remote->lock);
358 if (err)
359 goto out;
360 }
361
362 spin_lock(&pool->lock);
363 for (i = 0; i < pool->nr_free; i++) {
364 err = fn(pool->freelist[i], data);
365 if (err)
366 break;
367 }
368 spin_unlock(&pool->lock);
369out:
370 local_irq_restore(flags);
371 return err;
372}
373EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
374
375/**
376 * percpu_ida_free_tags - return the number of free tags on a cpu or in the global pool
377 * @pool: the pool to query
378 * @cpu: the cpu to inspect, or the global pool if @cpu == nr_cpu_ids
379 *
380 * Note: this only returns a snapshot of the number of free tags.
381 */
382unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu)
383{
384 struct percpu_ida_cpu *remote;
385 if (cpu == nr_cpu_ids)
386 return pool->nr_free;
387 remote = per_cpu_ptr(pool->tag_cpu, cpu);
388 return remote->nr_free;
389}
390EXPORT_SYMBOL_GPL(percpu_ida_free_tags);
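The two new helpers at the bottom are easiest to show by use: percpu_ida_for_each_free() walks the per-cpu and global freelists and stops early when the callback returns non-zero, and percpu_ida_free_tags() gives a per-cpu snapshot (or, with cpu == nr_cpu_ids, the global pool's). A sketch with invented names:

#include <linux/cpumask.h>
#include <linux/percpu_ida.h>
#include <linux/printk.h>

static int count_cb(unsigned tag, void *data)
{
	unsigned *count = data;

	(*count)++;
	return 0;		/* returning non-zero would stop the walk early */
}

static void report_free(struct percpu_ida *pool)
{
	unsigned walked = 0;

	/* best-effort walk: tags may be missed or seen twice, per the comment above */
	percpu_ida_for_each_free(pool, count_cb, &walked);

	pr_info("walked %u free tags, %u on the global freelist\n",
		walked, percpu_ida_free_tags(pool, nr_cpu_ids));
}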
diff --git a/mm/swap.c b/mm/swap.c
index 759c3caf44bd..7a9f80d451f5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -934,7 +934,8 @@ void __init swap_setup(void)
934#ifdef CONFIG_SWAP 934#ifdef CONFIG_SWAP
935 int i; 935 int i;
936 936
937 bdi_init(swapper_spaces[0].backing_dev_info); 937 if (bdi_init(swapper_spaces[0].backing_dev_info))
938 panic("Failed to init swap bdi");
938 for (i = 0; i < MAX_SWAPFILES; i++) { 939 for (i = 0; i < MAX_SWAPFILES; i++) {
939 spin_lock_init(&swapper_spaces[i].tree_lock); 940 spin_lock_init(&swapper_spaces[i].tree_lock);
940 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); 941 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
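The last hunk closes the loop on the "bdi: test bdi_init failure" item: bdi_init() can fail, and ignoring that in swap_setup() would leave swap running on a backing_dev_info that never finished initializing, so early boot now fails loudly instead. The general shape of that convention, with the subsystem and helper invented:

#include <linux/init.h>
#include <linux/kernel.h>

/* invented helper standing in for an init call that can return -ENOMEM */
static int __init my_backing_init(void)
{
	return 0;
}

static int __init my_subsys_setup(void)
{
	/* boot-time setup with no caller to report an error to: fail loudly */
	if (my_backing_init())
		panic("Failed to init my_subsys backing store");
	return 0;
}
subsys_initcall(my_subsys_setup);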