aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.com>2017-03-07 15:38:05 -0500
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2017-04-08 03:30:36 -0400
commitd5986e0078f25ee9862f3f13157f1421f18d6c64 (patch)
tree870873ccf448a1daf6cecbbdd4aaac2f3a9b2565
parente3a55294fc2048136de8a3f9c5154f5e4d3438d8 (diff)
blk: improve order of bio handling in generic_make_request()
commit 79bd99596b7305ab08109a8bf44a6a4511dbf1cd upstream. To avoid recursion on the kernel stack when stacked block devices are in use, generic_make_request() will, when called recursively, queue new requests for later handling. They will be handled when the make_request_fn for the current bio completes. If any bios are submitted by a make_request_fn, these will ultimately be handled sequentially. If the handling of one of those generates further requests, they will be added to the end of the queue. This strict first-in-first-out behaviour can lead to deadlocks in various ways, normally because a request might need to wait for a previous request to the same device to complete. This can happen when they share a mempool, and can happen due to interdependencies particular to the device. Both md and dm have examples where this happens. These deadlocks can be eradicated by more selective ordering of bios. Specifically by handling them in depth-first order. That is: when the handling of one bio generates one or more further bios, they are handled immediately after the parent, before any siblings of the parent. That way, when generic_make_request() calls make_request_fn for some particular device, we can be certain that all previously submitted requests for that device have been completely handled and are not waiting for anything in the queue of requests maintained in generic_make_request(). An easy way to achieve this would be to use a last-in-first-out stack instead of a queue. However this will change the order of consecutive bios submitted by a make_request_fn, which could have unexpected consequences. Instead we take a slightly more complex approach. A fresh queue is created for each call to a make_request_fn. After it completes, any bios for a different device are placed on the front of the main queue, followed by any bios for the same device, followed by all bios that were already on the queue before the make_request_fn was called. 
This provides the depth-first approach without reordering bios on the same level. This, by itself, is not enough to remove all deadlocks. It just makes it possible for drivers to take the extra step required themselves. To avoid deadlocks, drivers must never risk waiting for a request after submitting one to generic_make_request. This includes never allocating from a mempool twice in the one call to a make_request_fn. A common pattern in drivers is to call bio_split() in a loop, handling the first part and then looping around to possibly split the next part. Instead, a driver that finds it needs to split a bio should queue (with generic_make_request) the second part, handle the first part, and then return. The new code in generic_make_request will ensure the requests to underlying bios are processed first, then the second bio that was split off. If it splits again, the same process happens. In each case one bio will be completely handled before the next one is attempted. With this in place, it should be possible to disable the punt_bios_to_recover() recovery thread for many block devices, and eventually it may be possible to remove it completely. Ref: http://www.spinics.net/lists/raid/msg54680.html Tested-by: Jinpu Wang <jinpu.wang@profitbricks.com> Inspired-by: Lars Ellenberg <lars.ellenberg@linbit.com> Signed-off-by: NeilBrown <neilb@suse.com> Signed-off-by: Jens Axboe <axboe@fb.com> Cc: Jack Wang <jinpu.wang@profitbricks.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--block/blk-core.c25
1 files changed, 21 insertions, 4 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 14d7c0740dc0..27cd46ad2b9e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2036,17 +2036,34 @@ blk_qc_t generic_make_request(struct bio *bio)
2036 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 2036 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
2037 2037
2038 if (likely(blk_queue_enter(q, false) == 0)) { 2038 if (likely(blk_queue_enter(q, false) == 0)) {
2039 struct bio_list hold;
2040 struct bio_list lower, same;
2041
2042 /* Create a fresh bio_list for all subordinate requests */
2043 hold = bio_list_on_stack;
2044 bio_list_init(&bio_list_on_stack);
2039 ret = q->make_request_fn(q, bio); 2045 ret = q->make_request_fn(q, bio);
2040 2046
2041 blk_queue_exit(q); 2047 blk_queue_exit(q);
2042 2048
2043 bio = bio_list_pop(current->bio_list); 2049 /* sort new bios into those for a lower level
2050 * and those for the same level
2051 */
2052 bio_list_init(&lower);
2053 bio_list_init(&same);
2054 while ((bio = bio_list_pop(&bio_list_on_stack)) != NULL)
2055 if (q == bdev_get_queue(bio->bi_bdev))
2056 bio_list_add(&same, bio);
2057 else
2058 bio_list_add(&lower, bio);
2059 /* now assemble so we handle the lowest level first */
2060 bio_list_merge(&bio_list_on_stack, &lower);
2061 bio_list_merge(&bio_list_on_stack, &same);
2062 bio_list_merge(&bio_list_on_stack, &hold);
2044 } else { 2063 } else {
2045 struct bio *bio_next = bio_list_pop(current->bio_list);
2046
2047 bio_io_error(bio); 2064 bio_io_error(bio);
2048 bio = bio_next;
2049 } 2065 }
2066 bio = bio_list_pop(current->bio_list);
2050 } while (bio); 2067 } while (bio);
2051 current->bio_list = NULL; /* deactivate */ 2068 current->bio_list = NULL; /* deactivate */
2052 2069