author    Kent Overstreet <koverstreet@google.com>    2012-09-10 17:33:46 -0400
committer Kent Overstreet <koverstreet@google.com>    2013-03-23 17:15:26 -0400
commit    df2cb6daa4cbc34406bc4b1ac9b9335df1083a72 (patch)
tree      effcb15a6e524696a82383c74bf01a812da22b97 /fs
parent    57fb233f078beb5d0437a4ae575fbd4d9eb9c738 (diff)
block: Avoid deadlocks with bio allocation by stacking drivers
Previously, if we ever tried to allocate more than once from the same bio set while running under generic_make_request() (i.e. a stacking block driver), we risked deadlock.

This is because of the code in generic_make_request() that converts recursion to iteration; any bios we submit won't actually be submitted (so they can complete and eventually be freed) until after we return - this means if we allocate a second bio, we're blocking the first one from ever being freed. Thus if enough threads call into a stacking block driver at the same time with bios that need multiple splits, and the bio_set's reserve gets used up, we deadlock.

This can be worked around in the driver code - we could check if we're running under generic_make_request(), then mask out __GFP_WAIT when we go to allocate a bio, and if the allocation fails punt to a workqueue and retry the allocation. But this is tricky and not a generic solution.

This patch solves it for all users by inverting the previously described technique. We allocate a rescuer workqueue for each bio_set, and then in the allocation code, if there are bios on current->bio_list that we would be blocking, we punt them to the rescuer workqueue to be submitted.

This guarantees forward progress for bio allocations under generic_make_request(), provided each bio is submitted before allocating the next, and provided the bios are freed after they complete.

Note that this doesn't do anything for allocation from other mempools. Instead of allocating per-bio data structures from a mempool, code should use the bio_set's front_pad.

Tested by forcing the rescue codepath to be taken (by disabling the first GFP_NOWAIT attempt), then running it with bcache (which does a lot of arbitrary bio splitting) and verifying that the rescuer was being invoked.

Signed-off-by: Kent Overstreet <koverstreet@google.com>
CC: Jens Axboe <axboe@kernel.dk>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Muthukumar Ratty <muthur@gmail.com>
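To make the failure mode concrete, here is a minimal, hypothetical sketch of the kind of stacking driver this protects. Nothing below is part of the patch: example_bio_set, example_split_needed(), example_trim_to_chunk() and example_remap() are made-up names standing in for whatever a real driver (dm, md, bcache, ...) would actually use.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical: a bio_set owned by the stacking driver. */
static struct bio_set *example_bio_set;

/*
 * Hypothetical make_request_fn that splits a large bio into chunks, each
 * cloned from example_bio_set.  While this runs under
 * generic_make_request(), every bio submitted below is merely queued on
 * current->bio_list; none of them can complete (and return its entry to
 * the mempool) until this function returns.
 */
static void example_make_request(struct request_queue *q, struct bio *bio)
{
	while (example_split_needed(bio)) {		/* hypothetical helper */
		struct bio *split;

		/*
		 * Second and later iterations allocate from the same bio_set
		 * while the earlier clones are still parked on
		 * current->bio_list.  Before this patch, exhausting the
		 * mempool reserve here could sleep forever; with the rescuer,
		 * bio_alloc_bioset() first tries without __GFP_WAIT, punts
		 * the parked bios to the bioset workqueue so they can
		 * complete and refill the pool, and only then blocks.
		 */
		split = bio_clone_bioset(bio, GFP_NOIO, example_bio_set);
		example_trim_to_chunk(split, bio);	/* hypothetical helper */
		generic_make_request(split);
	}

	example_remap(bio);				/* hypothetical helper */
	generic_make_request(bio);
}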
Diffstat (limited to 'fs')
-rw-r--r--  fs/bio.c | 116
1 file changed, 114 insertions(+), 2 deletions(-)
diff --git a/fs/bio.c b/fs/bio.c
index bb5768f59b32..73b544709945 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -297,6 +297,54 @@ void bio_reset(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_reset);
 
+static void bio_alloc_rescue(struct work_struct *work)
+{
+	struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
+	struct bio *bio;
+
+	while (1) {
+		spin_lock(&bs->rescue_lock);
+		bio = bio_list_pop(&bs->rescue_list);
+		spin_unlock(&bs->rescue_lock);
+
+		if (!bio)
+			break;
+
+		generic_make_request(bio);
+	}
+}
+
+static void punt_bios_to_rescuer(struct bio_set *bs)
+{
+	struct bio_list punt, nopunt;
+	struct bio *bio;
+
+	/*
+	 * In order to guarantee forward progress we must punt only bios that
+	 * were allocated from this bio_set; otherwise, if there was a bio on
+	 * there for a stacking driver higher up in the stack, processing it
+	 * could require allocating bios from this bio_set, and doing that from
+	 * our own rescuer would be bad.
+	 *
+	 * Since bio lists are singly linked, pop them all instead of trying to
+	 * remove from the middle of the list:
+	 */
+
+	bio_list_init(&punt);
+	bio_list_init(&nopunt);
+
+	while ((bio = bio_list_pop(current->bio_list)))
+		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+
+	*current->bio_list = nopunt;
+
+	spin_lock(&bs->rescue_lock);
+	bio_list_merge(&bs->rescue_list, &punt);
+	spin_unlock(&bs->rescue_lock);
+
+	queue_work(bs->rescue_workqueue, &bs->rescue_work);
+}
+
 /**
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask: the GFP_ mask given to the slab allocator
@@ -314,11 +362,27 @@ EXPORT_SYMBOL(bio_reset);
  * previously allocated bio for IO before attempting to allocate a new one.
  * Failure to do so can cause deadlocks under memory pressure.
  *
+ * Note that when running under generic_make_request() (i.e. any block
+ * driver), bios are not submitted until after you return - see the code in
+ * generic_make_request() that converts recursion into iteration, to prevent
+ * stack overflows.
+ *
+ * This would normally mean allocating multiple bios under
+ * generic_make_request() would be susceptible to deadlocks, but we have
+ * deadlock avoidance code that resubmits any blocked bios from a rescuer
+ * thread.
+ *
+ * However, we do not guarantee forward progress for allocations from other
+ * mempools. Doing multiple allocations from the same mempool under
+ * generic_make_request() should be avoided - instead, use bio_set's front_pad
+ * for per bio allocations.
+ *
  * RETURNS:
  * Pointer to new bio on success, NULL on failure.
  */
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	gfp_t saved_gfp = gfp_mask;
 	unsigned front_pad;
 	unsigned inline_vecs;
 	unsigned long idx = BIO_POOL_NONE;
@@ -336,7 +400,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		front_pad = 0;
 		inline_vecs = nr_iovecs;
 	} else {
+		/*
+		 * generic_make_request() converts recursion to iteration; this
+		 * means if we're running beneath it, any bios we allocate and
+		 * submit will not be submitted (and thus freed) until after we
+		 * return.
+		 *
+		 * This exposes us to a potential deadlock if we allocate
+		 * multiple bios from the same bio_set() while running
+		 * underneath generic_make_request(). If we were to allocate
+		 * multiple bios (say a stacking block driver that was splitting
+		 * bios), we would deadlock if we exhausted the mempool's
+		 * reserve.
+		 *
+		 * We solve this, and guarantee forward progress, with a rescuer
+		 * workqueue per bio_set. If we go to allocate and there are
+		 * bios on current->bio_list, we first try the allocation
+		 * without __GFP_WAIT; if that fails, we punt those bios we
+		 * would be blocking to the rescuer workqueue before we retry
+		 * with the original gfp_flags.
+		 */
+
+		if (current->bio_list && !bio_list_empty(current->bio_list))
+			gfp_mask &= ~__GFP_WAIT;
+
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
+		if (!p && gfp_mask != saved_gfp) {
+			punt_bios_to_rescuer(bs);
+			gfp_mask = saved_gfp;
+			p = mempool_alloc(bs->bio_pool, gfp_mask);
+		}
+
 		front_pad = bs->front_pad;
 		inline_vecs = BIO_INLINE_VECS;
 	}
@@ -349,6 +443,12 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 	if (nr_iovecs > inline_vecs) {
 		bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+		if (!bvl && gfp_mask != saved_gfp) {
+			punt_bios_to_rescuer(bs);
+			gfp_mask = saved_gfp;
+			bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+		}
+
 		if (unlikely(!bvl))
 			goto err_free;
 	} else if (nr_iovecs) {
@@ -1579,6 +1679,9 @@ static void biovec_free_pools(struct bio_set *bs)
 
 void bioset_free(struct bio_set *bs)
 {
+	if (bs->rescue_workqueue)
+		destroy_workqueue(bs->rescue_workqueue);
+
 	if (bs->bio_pool)
 		mempool_destroy(bs->bio_pool);
 
@@ -1614,6 +1717,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 
 	bs->front_pad = front_pad;
 
+	spin_lock_init(&bs->rescue_lock);
+	bio_list_init(&bs->rescue_list);
+	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
+
 	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
 	if (!bs->bio_slab) {
 		kfree(bs);
@@ -1624,9 +1731,14 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (!biovec_create_pools(bs, pool_size))
-		return bs;
+	if (biovec_create_pools(bs, pool_size))
+		goto bad;
+
+	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
+	if (!bs->rescue_workqueue)
+		goto bad;
 
+	return bs;
 bad:
 	bioset_free(bs);
 	return NULL;
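
Note: the new code above relies on four rescuer fields in struct bio_set that the same patch adds outside fs/ (in include/linux/bio.h), so they do not appear in this diff, which is limited to 'fs'. A sketch of what those fields look like, inferred from the usage in fs/bio.c above rather than quoted from the header change:

struct bio_set {
	...

	/*
	 * Deadlock avoidance for stacking block drivers: punted bios are
	 * queued here and resubmitted from a per-bioset rescuer workqueue
	 * (see punt_bios_to_rescuer() and bio_alloc_rescue() above).
	 */
	spinlock_t		rescue_lock;
	struct bio_list		rescue_list;
	struct work_struct	rescue_work;
	struct workqueue_struct	*rescue_workqueue;
};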