author    Bart Van Assche <bart.vanassche@wdc.com>    2017-11-09 13:49:58 -0500
committer Jens Axboe <axboe@kernel.dk>                2017-11-10 21:53:25 -0500
commit    3a0a529971ec4e2d933e9c7798db101dfb6b1aec
tree      e79cac20198e657afc109f6f80111b0fb03f9dbc
parent    c9254f2ddb19387ea9714a57ea48463c20333b92
block, scsi: Make SCSI quiesce and resume work reliably
The contexts from which a SCSI device can be quiesced or resumed are:

* Writing into /sys/class/scsi_device/*/device/state.
* SCSI parallel (SPI) domain validation.
* The SCSI device power management methods. See also scsi_bus_pm_ops.

It is essential during suspend and resume that neither the filesystem state nor
the filesystem metadata in RAM changes. This is why SCSI devices are quiesced
while the hibernation image is being written or restored. The SCSI core
quiesces devices through scsi_device_quiesce() and scsi_device_resume(). In the
SDEV_QUIESCE state execution of non-preempt requests is deferred. This is
realized by returning BLKPREP_DEFER from inside scsi_prep_state_check() for
quiesced SCSI devices. Avoid that a full queue prevents power management
requests from being submitted by deferring allocation of non-preempt requests
for devices in the quiesced state.

This patch has been tested by running the following commands and by verifying
that after each resume the fio job was still running:

for ((i=0; i<10; i++)); do
  (
    cd /sys/block/md0/md &&
    while true; do
      [ "$(<sync_action)" = "idle" ] && echo check > sync_action
      sleep 1
    done
  ) &
  pids=($!)
  for d in /sys/class/block/sd*[a-z]; do
    bdev=${d#/sys/class/block/}
    hcil=$(readlink "$d/device")
    hcil=${hcil#../../../}
    echo 4 > "$d/queue/nr_requests"
    echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth"
    fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 \
      --rw=randread --ioengine=libaio --numjobs=4 --iodepth=16 \
      --iodepth_batch=1 --thread --loops=$((2**31)) &
    pids+=($!)
  done
  sleep 1
  echo "$(date) Hibernating ..." >>hibernate-test-log.txt
  systemctl hibernate
  sleep 10
  kill "${pids[@]}"
  echo idle > /sys/block/md0/md/sync_action
  wait
  echo "$(date) Done." >>hibernate-test-log.txt
done

Reported-by: Oleksandr Natalenko <oleksandr@natalenko.name>
References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348).
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Martin Steigerwald <martin@lichtvoll.de>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
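For illustration only, not part of this patch: a minimal sketch of how a
power-management style caller could pair scsi_device_quiesce() and
scsi_device_resume() with a request allocated under the new flags-based
blk_queue_enter(). The helper name example_send_pm_command() and the choice of
REQ_OP_SCSI_IN are assumptions made for the example; blk_mq_alloc_request(),
BLK_MQ_REQ_PREEMPT, scsi_device_quiesce() and scsi_device_resume() are the
interfaces this series touches.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/err.h>
#include <scsi/scsi_device.h>

/* Hypothetical PM-path caller, sketched against this series. */
static int example_send_pm_command(struct scsi_device *sdev)
{
        struct request *rq;
        int ret;

        /* Sets QUEUE_FLAG_PREEMPT_ONLY and freezes/unfreezes the queue. */
        ret = scsi_device_quiesce(sdev);
        if (ret)
                return ret;

        /*
         * While the device is quiesced, blk_queue_enter() only admits
         * BLK_MQ_REQ_PREEMPT allocations, so a full queue cannot block
         * this power management request.
         */
        rq = blk_mq_alloc_request(sdev->request_queue, REQ_OP_SCSI_IN,
                                  BLK_MQ_REQ_PREEMPT);
        if (IS_ERR(rq)) {
                scsi_device_resume(sdev);
                return PTR_ERR(rq);
        }

        /* ... set up and issue the command here ... */

        blk_mq_free_request(rq);

        /* Clears PREEMPT_ONLY and wakes up blk_queue_enter() waiters. */
        scsi_device_resume(sdev);
        return 0;
}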
-rw-r--r--   block/blk-core.c             42
-rw-r--r--   block/blk-mq.c                4
-rw-r--r--   drivers/scsi/scsi_lib.c      42
-rw-r--r--   fs/block_dev.c                4
-rw-r--r--   include/linux/blkdev.h        2
-rw-r--r--   include/scsi/scsi_device.h    1
6 files changed, 70 insertions, 25 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index edc276899116..29b08428ae45 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -374,6 +374,7 @@ void blk_clear_preempt_only(struct request_queue *q)
 
         spin_lock_irqsave(q->queue_lock, flags);
         queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
+        wake_up_all(&q->mq_freeze_wq);
         spin_unlock_irqrestore(q->queue_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
@@ -795,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 
-int blk_queue_enter(struct request_queue *q, bool nowait)
+/**
+ * blk_queue_enter() - try to increase q->q_usage_counter
+ * @q: request queue pointer
+ * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
+ */
+int blk_queue_enter(struct request_queue *q, unsigned int flags)
 {
+        const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
+
         while (true) {
+                bool success = false;
                 int ret;
 
-                if (percpu_ref_tryget_live(&q->q_usage_counter))
+                rcu_read_lock_sched();
+                if (percpu_ref_tryget_live(&q->q_usage_counter)) {
+                        /*
+                         * The code that sets the PREEMPT_ONLY flag is
+                         * responsible for ensuring that that flag is globally
+                         * visible before the queue is unfrozen.
+                         */
+                        if (preempt || !blk_queue_preempt_only(q)) {
+                                success = true;
+                        } else {
+                                percpu_ref_put(&q->q_usage_counter);
+                        }
+                }
+                rcu_read_unlock_sched();
+
+                if (success)
                         return 0;
 
-                if (nowait)
+                if (flags & BLK_MQ_REQ_NOWAIT)
                         return -EBUSY;
 
                 /*
@@ -816,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
                 smp_rmb();
 
                 ret = wait_event_interruptible(q->mq_freeze_wq,
-                                !atomic_read(&q->mq_freeze_depth) ||
+                                (atomic_read(&q->mq_freeze_depth) == 0 &&
+                                 (preempt || !blk_queue_preempt_only(q))) ||
                                 blk_queue_dying(q));
                 if (blk_queue_dying(q))
                         return -ENODEV;
@@ -1445,8 +1470,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
         /* create ioc upfront */
         create_io_context(gfp_mask, q->node);
 
-        ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) ||
-                              (op & REQ_NOWAIT));
+        ret = blk_queue_enter(q, flags);
         if (ret)
                 return ERR_PTR(ret);
         spin_lock_irq(q->queue_lock);
@@ -2267,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio)
         current->bio_list = bio_list_on_stack;
         do {
                 struct request_queue *q = bio->bi_disk->queue;
+                unsigned int flags = bio->bi_opf & REQ_NOWAIT ?
+                        BLK_MQ_REQ_NOWAIT : 0;
 
-                if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
+                if (likely(blk_queue_enter(q, flags) == 0)) {
                         struct bio_list lower, same;
 
                         /* Create a fresh bio_list for all subordinate requests */
@@ -2327,7 +2353,7 @@ blk_qc_t direct_make_request(struct bio *bio)
         if (!generic_make_request_checks(bio))
                 return BLK_QC_T_NONE;
 
-        if (unlikely(blk_queue_enter(q, nowait))) {
+        if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
                 if (nowait && !blk_queue_dying(q))
                         bio->bi_status = BLK_STS_AGAIN;
                 else
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e21876778cec..211bc8a3e2cc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -389,7 +389,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
         struct request *rq;
         int ret;
 
-        ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
+        ret = blk_queue_enter(q, flags);
         if (ret)
                 return ERR_PTR(ret);
 
@@ -428,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
         if (hctx_idx >= q->nr_hw_queues)
                 return ERR_PTR(-EIO);
 
-        ret = blk_queue_enter(q, true);
+        ret = blk_queue_enter(q, flags);
         if (ret)
                 return ERR_PTR(ret);
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index eb129dfc2ebe..f907e2f8c1dd 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2947,21 +2947,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
 int
 scsi_device_quiesce(struct scsi_device *sdev)
 {
+        struct request_queue *q = sdev->request_queue;
         int err;
 
+        /*
+         * It is allowed to call scsi_device_quiesce() multiple times from
+         * the same context but concurrent scsi_device_quiesce() calls are
+         * not allowed.
+         */
+        WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current);
+
+        blk_set_preempt_only(q);
+
+        blk_mq_freeze_queue(q);
+        /*
+         * Ensure that the effect of blk_set_preempt_only() will be visible
+         * for percpu_ref_tryget() callers that occur after the queue
+         * unfreeze even if the queue was already frozen before this function
+         * was called. See also https://lwn.net/Articles/573497/.
+         */
+        synchronize_rcu();
+        blk_mq_unfreeze_queue(q);
+
         mutex_lock(&sdev->state_mutex);
         err = scsi_device_set_state(sdev, SDEV_QUIESCE);
+        if (err == 0)
+                sdev->quiesced_by = current;
+        else
+                blk_clear_preempt_only(q);
         mutex_unlock(&sdev->state_mutex);
 
-        if (err)
-                return err;
-
-        scsi_run_queue(sdev->request_queue);
-        while (atomic_read(&sdev->device_busy)) {
-                msleep_interruptible(200);
-                scsi_run_queue(sdev->request_queue);
-        }
-        return 0;
+        return err;
 }
 EXPORT_SYMBOL(scsi_device_quiesce);
 
@@ -2981,9 +2997,11 @@ void scsi_device_resume(struct scsi_device *sdev)
          * device deleted during suspend)
          */
         mutex_lock(&sdev->state_mutex);
-        if (sdev->sdev_state == SDEV_QUIESCE &&
-            scsi_device_set_state(sdev, SDEV_RUNNING) == 0)
-                scsi_run_queue(sdev->request_queue);
+        WARN_ON_ONCE(!sdev->quiesced_by);
+        sdev->quiesced_by = NULL;
+        blk_clear_preempt_only(sdev->request_queue);
+        if (sdev->sdev_state == SDEV_QUIESCE)
+                scsi_device_set_state(sdev, SDEV_RUNNING);
         mutex_unlock(&sdev->state_mutex);
 }
 EXPORT_SYMBOL(scsi_device_resume);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4afa4d5ff969..04973f484422 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -662,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
         if (!ops->rw_page || bdev_get_integrity(bdev))
                 return result;
 
-        result = blk_queue_enter(bdev->bd_queue, false);
+        result = blk_queue_enter(bdev->bd_queue, 0);
         if (result)
                 return result;
         result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
@@ -698,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 
         if (!ops->rw_page || bdev_get_integrity(bdev))
                 return -EOPNOTSUPP;
-        result = blk_queue_enter(bdev->bd_queue, false);
+        result = blk_queue_enter(bdev->bd_queue, 0);
         if (result)
                 return result;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2147e2381a22..402c9d536ae1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
                          struct scsi_ioctl_command __user *);
 
-extern int blk_queue_enter(struct request_queue *q, bool nowait);
+extern int blk_queue_enter(struct request_queue *q, unsigned int flags);
 extern void blk_queue_exit(struct request_queue *q);
 extern void blk_start_queue(struct request_queue *q);
 extern void blk_start_queue_async(struct request_queue *q);
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 82e93ee94708..6f0f1e242e23 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -219,6 +219,7 @@ struct scsi_device {
         unsigned char access_state;
         struct mutex state_mutex;
         enum scsi_device_state sdev_state;
+        struct task_struct *quiesced_by;
         unsigned long sdev_data[0];
 } __attribute__((aligned(sizeof(unsigned long))));
 