From dd840087086f3b93ac20f7472b4fca59aff7b79f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Aug 2014 23:16:32 +0800 Subject: blk-mq: fix WARNING "percpu_ref_kill() called more than once!" Before doing queue release, the queue has been freezed already by blk_cleanup_queue(), so needn't to freeze queue for deleting tag set. This patch fixes the WARNING of "percpu_ref_kill() called more than once!" which is triggered during unloading block driver. Cc: Tejun Heo Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5189cb1e478a..ac8a0413664e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1713,14 +1713,10 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; - blk_mq_freeze_queue(q); - mutex_lock(&set->tag_list_lock); list_del_init(&q->tag_set_list); blk_mq_update_tag_set_depth(set); mutex_unlock(&set->tag_list_lock); - - blk_mq_unfreeze_queue(q); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, -- cgit v1.2.2 From 274a5843ff2f08a89464589d90c64eb65f2c0847 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 12:44:08 -0600 Subject: blk-mq: don't allow merges if turned off for the queue blk-mq uses BLK_MQ_F_SHOULD_MERGE, as set by the driver at init time, to determine whether it should merge IO or not. However, this could also be disabled by the admin, if merging is switched off through sysfs. So check the general queue state as well before attempting to merge IO. Reported-by: Rob Elliott Tested-by: Rob Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ac8a0413664e..c9e89a8792e3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1068,13 +1068,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, 1); } +static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) +{ + return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && + !blk_queue_nomerges(hctx->queue); +} + static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq, struct bio *bio) { - struct request_queue *q = hctx->queue; - - if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { + if (!hctx_allow_merges(hctx)) { blk_mq_bio_to_request(rq, bio); spin_lock(&ctx->lock); insert_rq: @@ -1082,6 +1086,8 @@ insert_rq: spin_unlock(&ctx->lock); return false; } else { + struct request_queue *q = hctx->queue; + spin_lock(&ctx->lock); if (!blk_mq_attempt_merge(q, ctx, bio)) { blk_mq_bio_to_request(rq, bio); -- cgit v1.2.2 From 16f408dc6b1c87f5e3a767626df09c1399c6bf70 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 13 Aug 2014 14:49:31 +0300 Subject: block: Fix BUG_ON when pi errors occur When getting a pi error we get to bio_integrity_end_io with bi_remaining already decremented to 0 where we will eventually need to call bio_endio with restored original bio completion handler. Calling bio_endio invokes a BUG_ON(). We should call bio_endio_nodec instead, like what is done in bio_integrity_verify_fn. Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index bc423f7b02da..f14b4abbebd8 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -520,7 +520,7 @@ void bio_integrity_endio(struct bio *bio, int error) */ if (error) { bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); + bio_endio_nodec(bio, error); return; } -- cgit v1.2.2 From a68aafa5b297d99c2d0c38689089a752126e9e79 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 13:19:15 -0600 Subject: blk-mq: correct a few wrong/bad comments Just grammar or spelling errors, nothing major. Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c9e89a8792e3..a0565bb20fd5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1580,7 +1580,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, hctx->tags = set->tags[i]; /* - * Allocate space for all possible cpus to avoid allocation in + * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), @@ -1668,8 +1668,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { /* - * If not software queues are mapped to this hardware queue, - * disable it and free the request entries + * If no software queues are mapped to this hardware queue, + * disable it and free the request entries. */ if (!hctx->nr_ctx) { struct blk_mq_tag_set *set = q->tag_set; -- cgit v1.2.2 From cddd5d17642cc6881352732693c2ae6930e9ce65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 16 Aug 2014 08:02:24 -0400 Subject: blk-mq: blk_mq_freeze_queue() should allow nesting While converting to percpu_ref for freezing, add703fda981 ("blk-mq: use percpu_ref for mq usage count") incorrectly made blk_mq_freeze_queue() misbehave when freezing is nested due to percpu_ref_kill() being invoked on an already killed ref. Fix it by making blk_mq_freeze_queue() kill and kick the queue only for the outermost freeze attempt. All the nested ones can simply wait for the ref to reach zero. While at it, remove unnecessary @wake initialization from blk_mq_unfreeze_queue(). Signed-off-by: Tejun Heo Reported-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a0565bb20fd5..7950f8d7c1bb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -112,18 +112,22 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) */ void blk_mq_freeze_queue(struct request_queue *q) { + bool freeze; + spin_lock_irq(q->queue_lock); - q->mq_freeze_depth++; + freeze = !q->mq_freeze_depth++; spin_unlock_irq(q->queue_lock); - percpu_ref_kill(&q->mq_usage_counter); - blk_mq_run_queues(q, false); + if (freeze) { + percpu_ref_kill(&q->mq_usage_counter); + blk_mq_run_queues(q, false); + } wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); } static void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake = false; + bool wake; spin_lock_irq(q->queue_lock); wake = !--q->mq_freeze_depth; -- cgit v1.2.2 From 2cada584b20007470931080c49310d85908449b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Aug 2014 20:38:27 -0500 Subject: block: cleanup error handling in sg_io Make sure we always clean up through the out label and just have a single place to put the request. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 51bf5155ee75..ba8706f6cded 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -279,7 +279,6 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, r = blk_rq_unmap_user(bio); if (!ret) ret = r; - blk_put_request(rq); return ret; } @@ -322,10 +321,9 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, return -ENOMEM; blk_rq_set_block_pc(rq); - if (blk_fill_sghdr_rq(q, rq, hdr, mode)) { - blk_put_request(rq); - return -EFAULT; - } + ret = -EFAULT; + if (blk_fill_sghdr_rq(q, rq, hdr, mode)) + goto out; if (hdr->iovec_count) { size_t iov_data_len; @@ -376,7 +374,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, hdr->duration = jiffies_to_msecs(jiffies - start_time); - return blk_complete_sghdr_rq(rq, hdr, bio); + ret = blk_complete_sghdr_rq(rq, hdr, bio); out: blk_put_request(rq); return ret; -- cgit v1.2.2 From a57821cac6bb6e46abea118e34d0e86444ec1410 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Aug 2014 20:39:53 -0500 Subject: block: support > 16 byte CDBs for SG_IO Signed-off-by: Christoph Hellwig Reviewed-by: Boaz Harrosh Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index ba8706f6cded..1d78e6cf9d61 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -296,8 +296,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->interface_id != 'S') return -EINVAL; - if (hdr->cmd_len > BLK_MAX_CDB) - return -EINVAL; if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) return -EIO; @@ -316,14 +314,21 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->flags & SG_FLAG_Q_AT_HEAD) at_head = 1; + ret = -ENOMEM; rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); if (!rq) - return -ENOMEM; + goto out; blk_rq_set_block_pc(rq); + if (hdr->cmd_len > BLK_MAX_CDB) { + rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); + if (!rq->cmd) + goto out_put_request; + } + ret = -EFAULT; if (blk_fill_sghdr_rq(q, rq, hdr, mode)) - goto out; + goto out_free_cdb; if (hdr->iovec_count) { size_t iov_data_len; @@ -333,7 +338,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, 0, NULL, &iov); if (ret < 0) { kfree(iov); - goto out; + goto out_free_cdb; } iov_data_len = ret; @@ -356,7 +361,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, GFP_KERNEL); if (ret) - goto out; + goto out_free_cdb; bio = rq->bio; memset(sense, 0, sizeof(sense)); @@ -375,8 +380,13 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, hdr->duration = jiffies_to_msecs(jiffies - start_time); ret = blk_complete_sghdr_rq(rq, hdr, bio); -out: + +out_free_cdb: + if (rq->cmd != rq->__cmd) + kfree(rq->cmd); +out_put_request: blk_put_request(rq); +out: return ret; } -- cgit v1.2.2 From 6f4a16266fb3e58cd3e200eab51d2220ef92d604 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Fri, 22 Aug 2014 15:53:39 -0400 Subject: scsi-mq: fix requests that use a separate CDB buffer This patch fixes code such as the following with scsi-mq enabled: rq = blk_get_request(...); blk_rq_set_block_pc(rq); rq->cmd = my_cmd_buffer; /* separate CDB buffer */ blk_execute_rq_nowait(...); Code like this appears in e.g. sg_start_req() in drivers/scsi/sg.c (for large CDBs only). Without this patch, scsi_mq_prep_fn() will set rq->cmd back to rq->__cmd, causing the wrong CDB to be sent to the device. Signed-off-by: Tony Battersby Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/blk-mq.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index c359d72e9d76..bf930f481d43 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1252,7 +1252,6 @@ void blk_rq_set_block_pc(struct request *rq) rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; memset(rq->__cmd, 0, sizeof(rq->__cmd)); - rq->cmd = rq->__cmd; } EXPORT_SYMBOL(blk_rq_set_block_pc); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7950f8d7c1bb..4aac82615a46 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -176,6 +176,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, /* tag was already set */ rq->errors = 0; + rq->cmd = rq->__cmd; + rq->extra_len = 0; rq->sense_len = 0; rq->resid_len = 0; -- cgit v1.2.2 From 2ba136daa3ae1e881c9f586f283fcaa164767dce Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Fri, 22 Aug 2014 15:53:35 -0400 Subject: fix regression in SCSI_IOCTL_SEND_COMMAND blk_rq_set_block_pc() memsets rq->cmd to 0, so it should come immediately after blk_get_request() to avoid overwriting the user-supplied CDB. Also check for failure to allocate rq. Fixes: f27b087b81b7 ("block: add blk_rq_set_block_pc()") Cc: # 3.16.x Signed-off-by: Tony Battersby Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 1d78e6cf9d61..5dd477bfb4bc 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -456,6 +456,11 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); + if (!rq) { + err = -ENOMEM; + goto error; + } + blk_rq_set_block_pc(rq); cmdlen = COMMAND_SIZE(opcode); @@ -509,7 +514,6 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, memset(sense, 0, sizeof(sense)); rq->sense = sense; rq->sense_len = 0; - blk_rq_set_block_pc(rq); blk_execute_rq(q, disk, rq, 0); @@ -529,7 +533,8 @@ out: error: kfree(buffer); - blk_put_request(rq); + if (rq) + blk_put_request(rq); return err; } EXPORT_SYMBOL_GPL(sg_scsi_ioctl); -- cgit v1.2.2 From d19d744685f47f1bb3d39b3ec51eb50afe5ff47d Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 26 Aug 2014 16:14:02 +0200 Subject: block: fix error handling in sg_io Before commit 2cada584b200 ("block: cleanup error handling in sg_io"), we had ret = 0 before entering the last big if block of sg_io. Since 2cada584b200, ret = -EFAULT, which breaks hdparm: /dev/sda: setting Advanced Power Management level to 0xc8 (200) HDIO_DRIVE_CMD failed: Bad address APM_level = 128 Signed-off-by: Sabrina Dubroca Fixes: 2cada584b200 ("block: cleanup error handling in sg_io") Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 5dd477bfb4bc..9b8eaeca6a79 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -330,6 +330,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (blk_fill_sghdr_rq(q, rq, hdr, mode)) goto out_free_cdb; + ret = 0; if (hdr->iovec_count) { size_t iov_data_len; struct iovec *iov = NULL; -- cgit v1.2.2 From e15693ef18e13e3e6bffe891fe140f18b8ff6d07 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Tue, 26 Aug 2014 20:56:36 +0900 Subject: cfq-iosched: Fix wrong children_weight calculation cfq_group_service_tree_add() is applying new_weight at the beginning of the function via cfq_update_group_weight(). This actually allows weight to change between adding it to and subtracting it from children_weight, and triggers WARN_ON_ONCE() in cfq_group_service_tree_del(), or even causes oops by divide error during vfr calculation in cfq_group_service_tree_add(). The detailed scenario is as follows: 1. Create blkio cgroups X and Y as a child of X. Set X's weight to 500 and perform some I/O to apply new_weight. This X's I/O completes before starting Y's I/O. 2. Y starts I/O and cfq_group_service_tree_add() is called with Y. 3. cfq_group_service_tree_add() walks up the tree during children_weight calculation and adds parent X's weight (500) to children_weight of root. children_weight becomes 500. 4. Set X's weight to 1000. 5. X starts I/O and cfq_group_service_tree_add() is called with X. 6. cfq_group_service_tree_add() applies its new_weight (1000). 7. I/O of Y completes and cfq_group_service_tree_del() is called with Y. 8. I/O of X completes and cfq_group_service_tree_del() is called with X. 9. cfq_group_service_tree_del() subtracts X's weight (1000) from children_weight of root. children_weight becomes -500. This triggers WARN_ON_ONCE(). 10. Set X's weight to 500. 11. X starts I/O and cfq_group_service_tree_add() is called with X. 12. cfq_group_service_tree_add() applies its new_weight (500) and adds it to children_weight of root. children_weight becomes 0. Calcularion of vfr triggers oops by divide error. weight should be updated right before adding it to children_weight. Reported-by: Ruki Sekiya Signed-off-by: Toshiaki Makita Acked-by: Tejun Heo Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cadc37841744..d7494637c5db 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1275,12 +1275,16 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) static void cfq_update_group_weight(struct cfq_group *cfqg) { - BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - if (cfqg->new_weight) { cfqg->weight = cfqg->new_weight; cfqg->new_weight = 0; } +} + +static void +cfq_update_group_leaf_weight(struct cfq_group *cfqg) +{ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); if (cfqg->new_leaf_weight) { cfqg->leaf_weight = cfqg->new_leaf_weight; @@ -1299,7 +1303,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) /* add to the service tree */ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - cfq_update_group_weight(cfqg); + cfq_update_group_leaf_weight(cfqg); __cfq_group_service_tree_add(st, cfqg); /* @@ -1323,6 +1327,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) */ while ((parent = cfqg_parent(pos))) { if (propagate) { + cfq_update_group_weight(pos); propagate = !parent->nr_active++; parent->children_weight += pos->weight; } -- cgit v1.2.2 From 7b5af5cffce569298b1d03af1ddf1dc43570ab42 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 28 Aug 2014 17:14:58 +0900 Subject: cfq-iosched: Add comments on update timing of weight Explain that weight has to be updated on activation. This complements previous fix e15693ef18e1 ("cfq-iosched: Fix wrong children_weight calculation"). Signed-off-by: Toshiaki Makita Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d7494637c5db..3f31cf9508e6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1272,6 +1272,9 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) rb_insert_color(&cfqg->rb_node, &st->rb); } +/* + * This has to be called only on activation of cfqg + */ static void cfq_update_group_weight(struct cfq_group *cfqg) { @@ -1303,6 +1306,11 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) /* add to the service tree */ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + /* + * Update leaf_weight. We cannot update weight at this point + * because cfqg might already have been activated and is + * contributing its current weight to the parent's child_weight. + */ cfq_update_group_leaf_weight(cfqg); __cfq_group_service_tree_add(st, cfqg); -- cgit v1.2.2