author		Tejun Heo <tj@kernel.org>	2014-09-24 13:31:50 -0400
committer	Tejun Heo <tj@kernel.org>	2014-09-24 13:37:21 -0400
commit		17497acbdce9506fd6a75115dee4ab80c3cc5ee5 (patch)
tree		fc56c3784250dd79761f34c30e1a2218974eaf76
parent		1cae13e75b7a7848c03138636d4eb8d8a5054dd5 (diff)
blk-mq, percpu_ref: start q->mq_usage_counter in atomic mode
blk-mq uses percpu_ref for its usage counter, which tracks the number
of in-flight commands and is used to synchronously drain the queue on
freeze. percpu_ref shutdown takes measurable wallclock time as it
involves a sched RCU grace period, which means that draining a blk-mq
queue also takes measurable wallclock time. One would think this
shouldn't matter, as queue shutdown should be a rare event that takes
place asynchronously w.r.t. userland.
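For context, the freeze/drain path looks roughly like this (a
simplified sketch of the v3.17-era code, not the verbatim source):

/*
 * Simplified sketch of blk-mq queue freezing.  percpu_ref_kill()
 * collapses the per-cpu counters into a single atomic counter, which
 * internally requires a sched RCU grace period, so the wait below
 * cannot complete before that grace period has elapsed.
 */
static void freeze_queue_sketch(struct request_queue *q)
{
	percpu_ref_kill(&q->mq_usage_counter);
	blk_mq_run_queues(q, false);
	wait_event(q->mq_freeze_wq,
		   percpu_ref_is_zero(&q->mq_usage_counter));
}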
Unfortunately, SCSI probing involves synchronously setting up and then
tearing down a lot of request_queues back-to-back for non-existent
LUNs. This means that SCSI probing may take more than ten seconds when
scsi-mq is used.
[ 0.949892] scsi host0: Virtio SCSI HBA
[ 1.007864] scsi 0:0:0:0: Direct-Access QEMU QEMU HARDDISK 1.1. PQ: 0 ANSI: 5
[ 1.021299] scsi 0:0:1:0: Direct-Access QEMU QEMU HARDDISK 1.1. PQ: 0 ANSI: 5
[ 1.520356] tsc: Refined TSC clocksource calibration: 2491.910 MHz
<stall>
[ 16.186549] sd 0:0:0:0: Attached scsi generic sg0 type 0
[ 16.190478] sd 0:0:1:0: Attached scsi generic sg1 type 0
[ 16.194099] osd: LOADED open-osd 0.2.1
[ 16.203202] sd 0:0:0:0: [sda] 31457280 512-byte logical blocks: (16.1 GB/15.0 GiB)
[ 16.208478] sd 0:0:0:0: [sda] Write Protect is off
[ 16.211439] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
[ 16.218771] sd 0:0:1:0: [sdb] 31457280 512-byte logical blocks: (16.1 GB/15.0 GiB)
[ 16.223264] sd 0:0:1:0: [sdb] Write Protect is off
[ 16.225682] sd 0:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
This is also the reason why request_queues start in bypass mode, which
is ended in blk_register_queue(): shutting down a fully functional
queue also involves an RCU grace period, and the queues for
non-existent SCSI devices never reach registration.
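For reference, the legacy (non-mq) side arranges this roughly as
follows (a condensed sketch, not the verbatim kernel source):
blk_alloc_queue_node() starts every queue bypassed, and the matching
blk_queue_bypass_end() only happens in blk_register_queue(), so queues
that are never registered stay cheap to tear down.

/*
 * Condensed sketch of how a legacy queue starts out bypassed; the
 * helper name is made up for illustration.
 */
static void start_bypassed_sketch(struct request_queue *q)
{
	q->bypass_depth = 1;	/* allocated in bypass mode */
	queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q);
}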
blk-mq basically needs to do the same thing: start the queue in a
degraded mode which is faster to shut down, and make it fully
functional only after the queue reaches registration. percpu_ref
recently grew facilities to force atomic operation until explicitly
switched to percpu mode, which can be used for this purpose. This
patch makes blk-mq initialize q->mq_usage_counter in atomic mode and
switch it to percpu mode only once blk_register_queue() is reached.
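Concretely, the new percpu_ref facilities are used like this (a
minimal sketch; the wrapper function is hypothetical, but
PERCPU_REF_INIT_ATOMIC and percpu_ref_switch_to_percpu() are the
interfaces added by the prerequisite series):

/*
 * Minimal usage sketch of the new percpu_ref interfaces; the wrapper
 * itself is hypothetical.  A ref started with PERCPU_REF_INIT_ATOMIC
 * counts on a single atomic counter and can be killed without an RCU
 * grace period, until explicitly switched to percpu mode.
 */
static int usage_counter_sketch(struct percpu_ref *ref,
				percpu_ref_func_t *release)
{
	int ret;

	/* start atomic: cheap to shut down if we never go live */
	ret = percpu_ref_init(ref, release, PERCPU_REF_INIT_ATOMIC,
			      GFP_KERNEL);
	if (ret)
		return ret;

	/* once the object is known to stay around, go fast-path */
	percpu_ref_switch_to_percpu(ref);
	return 0;
}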
Note that this issue was previously worked around by commit
0a30288da1ae ("blk-mq, percpu_ref: implement a kludge for SCSI blk-mq
stall during probe") for v3.17. That temporary fix was reverted by
9eca80461a45 ("Revert "blk-mq, percpu_ref: implement a kludge for SCSI
blk-mq stall during probe"") in preparation for adding persistent
atomic mode to percpu_ref. This patch and the prerequisite percpu_ref
changes will be merged during the v3.18 devel cycle.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Christoph Hellwig <hch@infradead.org>
Link: http://lkml.kernel.org/g/20140919113815.GA10791@lst.de
Fixes: add703fda981 ("blk-mq: use percpu_ref for mq usage count")
Reviewed-by: Kent Overstreet <kmo@daterainc.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
 block/blk-mq-sysfs.c   |  6 ++++++
 block/blk-mq.c         |  6 +++++-
 block/blk-sysfs.c      | 11 +++++++++--
 include/linux/blk-mq.h |  1 +
 4 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index ed5217867555..371d8800b48a 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -402,6 +402,12 @@ static void blk_mq_sysfs_init(struct request_queue *q)
 	}
 }
 
+/* see blk_register_queue() */
+void blk_mq_finish_init(struct request_queue *q)
+{
+	percpu_ref_switch_to_percpu(&q->mq_usage_counter);
+}
+
 int blk_mq_register_disk(struct gendisk *disk)
 {
 	struct device *dev = disk_to_dev(disk);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d85fe01c44ef..38f4a165640d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1795,8 +1795,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	if (!q)
 		goto err_hctxs;
 
+	/*
+	 * Init percpu_ref in atomic mode so that it's faster to shut down.
+	 * See blk_register_queue() for details.
+	 */
 	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
-			    0, GFP_KERNEL))
+			    PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
 		goto err_map;
 
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 17f5c84ce7bf..521ae9089c50 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -551,12 +551,19 @@ int blk_register_queue(struct gendisk *disk)
 		return -ENXIO;
 
 	/*
-	 * Initialization must be complete by now. Finish the initial
-	 * bypass from queue allocation.
+	 * SCSI probing may synchronously create and destroy a lot of
+	 * request_queues for non-existent devices. Shutting down a fully
+	 * functional queue takes measurable wallclock time as RCU grace
+	 * periods are involved. To avoid excessive latency in these
+	 * cases, a request_queue starts out in a degraded mode which is
+	 * faster to shut down and is made fully functional here as
+	 * request_queues for non-existent devices never get registered.
 	 */
 	if (!blk_queue_init_done(q)) {
 		queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
 		blk_queue_bypass_end(q);
+		if (q->mq_ops)
+			blk_mq_finish_init(q);
 	}
 
 	ret = blk_trace_init_sysfs(dev);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a1e31f274fcd..c13a0c09faea 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -140,6 +140,7 @@ enum {
 };
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
+void blk_mq_finish_init(struct request_queue *q);
 int blk_mq_register_disk(struct gendisk *);
 void blk_mq_unregister_disk(struct gendisk *);
 
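Taken together, the effect on probing a non-existent LUN looks like
this (a hypothetical caller, for illustration only):

/*
 * Hypothetical probe path: a queue that is torn down before it ever
 * reaches blk_register_queue() now dies with its ref still in atomic
 * mode, so no sched RCU grace period is needed.
 */
static void probe_sketch(struct blk_mq_tag_set *set, bool lun_exists)
{
	struct request_queue *q = blk_mq_init_queue(set);

	if (IS_ERR(q))
		return;

	if (!lun_exists) {
		blk_cleanup_queue(q);	/* ref still atomic: fast teardown */
		return;
	}

	/*
	 * A real driver would attach a gendisk here; registration ends
	 * up in blk_register_queue(), which calls blk_mq_finish_init()
	 * and flips the ref to percpu mode.
	 */
}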