author     Dave Chinner <dchinner@redhat.com>   2016-02-15 01:23:12 -0500
committer  Dave Chinner <david@fromorbit.com>   2016-02-15 01:23:12 -0500
commit     e10de3723c53378e7cf441529f563c316fdc0dd3 (patch)
tree       a21990a6de7476c768ec5924ed5cb4d40d3efe5f
parent     bfce7d2e2d5ee05e9d465888905c66a70a9c243c (diff)
xfs: don't chain ioends during writepage submission
Currently we can build a long ioend chain during ->writepages that gets
attached to the writepage context. IO submission only then occurs when
we finish all the writepage processing. This means we can have many
ioends allocated and pending, and this violates the mempool guarantees
that we need to give about forwards progress. i.e. we really should
only have one ioend being built at a time, otherwise we may drain the
mempool trying to allocate a new ioend and that blocks submission,
completion and freeing of ioends that are already in progress.

To prevent this situation from happening, we need to submit ioends for
IO as soon as they are ready for dispatch rather than queuing them for
later submission. This means the ioends have bios built immediately
and they get queued on any plug that is currently active. Hence if we
schedule away from writeback, the ioends that have been built will make
forwards progress due to the plug flushing on context switch. This will
also prevent context switches from creating unnecessary IO submission
latency.

We can't completely avoid having nested IO allocation - when we have a
block size smaller than a page size, we still need to hold the ioend
submission until after we have marked the current page dirty. Hence we
may need multiple ioends to be held while the current page is
completely mapped and made ready for IO dispatch. We cannot avoid this
problem - the current code already has this ioend chaining within a
page so we can mostly ignore that it occurs.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
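As a rough mental model of the policy change, the contrast between the old
chained submission and the new immediate submission can be written as a
self-contained userspace sketch. Every identifier below is an illustrative
stand-in, not a real XFS or kernel name, and the per-ioend granularity is
simplified: in the actual patch contiguous buffers still append to the
cached ioend, and the block layer plug provides merging across the
immediate submissions.

    /*
     * Simplified model of the submission policy change; none of these
     * names are real XFS/kernel identifiers.
     */
    #include <stdio.h>
    #include <stdlib.h>

    struct ioend {
            int             id;
            struct ioend    *next;  /* old scheme: singly linked chain */
    };

    static void submit_ioend(struct ioend *io)
    {
            /* stands in for building bios and handing them to the block layer */
            printf("submit ioend %d\n", io->id);
            free(io);
    }

    /* Old scheme: every new ioend is chained and nothing is submitted until
     * the whole writeback pass is done, so many mempool objects stay pinned. */
    static void writepages_chained(int nr)
    {
            struct ioend *head = NULL, **tail = &head;

            for (int i = 0; i < nr; i++) {
                    struct ioend *io = calloc(1, sizeof(*io));

                    io->id = i;
                    *tail = io;
                    tail = &io->next;
            }
            while (head) {                  /* submission only happens here */
                    struct ioend *next = head->next;

                    submit_ioend(head);
                    head = next;
            }
    }

    /* New scheme: at most one ioend is cached; as soon as it can no longer
     * be appended to, it is submitted, so it can complete and be freed while
     * the rest of the writeback pass is still running. */
    static void writepages_immediate(int nr)
    {
            struct ioend *cur = NULL;

            for (int i = 0; i < nr; i++) {
                    if (cur)
                            submit_ioend(cur);      /* forwards progress now */
                    cur = calloc(1, sizeof(*cur));
                    cur->id = i;
            }
            if (cur)
                    submit_ioend(cur);              /* final cached ioend */
    }

    int main(void)
    {
            writepages_chained(3);
            writepages_immediate(3);
            return 0;
    }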
-rw-r--r--   fs/xfs/xfs_aops.c   241
-rw-r--r--   fs/xfs/xfs_aops.h     2
2 files changed, 119 insertions, 124 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 6f5c95f94add..46dc9211bae7 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -43,7 +43,6 @@ struct xfs_writepage_ctx {
 	struct xfs_bmbt_irec	imap;
 	bool			imap_valid;
 	unsigned int		io_type;
-	struct xfs_ioend	*iohead;
 	struct xfs_ioend	*ioend;
 	sector_t		last_block;
 };
@@ -277,7 +276,7 @@ xfs_alloc_ioend(
 	 */
 	atomic_set(&ioend->io_remaining, 1);
 	ioend->io_error = 0;
-	ioend->io_list = NULL;
+	INIT_LIST_HEAD(&ioend->io_list);
 	ioend->io_type = type;
 	ioend->io_inode = inode;
 	ioend->io_buffer_head = NULL;
@@ -420,8 +419,7 @@ xfs_start_buffer_writeback(
 STATIC void
 xfs_start_page_writeback(
 	struct page		*page,
-	int			clear_dirty,
-	int			buffers)
+	int			clear_dirty)
 {
 	ASSERT(PageLocked(page));
 	ASSERT(!PageWriteback(page));
@@ -440,10 +438,6 @@ xfs_start_page_writeback(
 		set_page_writeback_keepwrite(page);
 
 	unlock_page(page);
-
-	/* If no buffers on the page are to be written, finish it here */
-	if (!buffers)
-		end_page_writeback(page);
 }
 
 static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
@@ -452,110 +446,90 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 }
 
 /*
- * Submit all of the bios for all of the ioends we have saved up, covering the
- * initial writepage page and also any probed pages.
- *
- * Because we may have multiple ioends spanning a page, we need to start
- * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
- * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
- *
- * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
- *
- * The fix is two passes across the ioend list - one to start writeback on the
- * buffer_heads, and then submit them for I/O on the second pass.
+ * Submit all of the bios for an ioend. We are only passed a single ioend at a
+ * time; the caller is responsible for chaining prior to submission.
  *
  * If @fail is non-zero, it means that we have a situation where some part of
  * the submission process has failed after we have marked paged for writeback
  * and unlocked them. In this situation, we need to fail the ioend chain rather
  * than submit it to IO. This typically only happens on a filesystem shutdown.
  */
-STATIC void
+STATIC int
 xfs_submit_ioend(
 	struct writeback_control *wbc,
 	xfs_ioend_t		*ioend,
-	int			fail)
+	int			status)
 {
-	xfs_ioend_t		*head = ioend;
-	xfs_ioend_t		*next;
 	struct buffer_head	*bh;
 	struct bio		*bio;
 	sector_t		lastblock = 0;
 
-	/* Pass 1 - start writeback */
-	do {
-		next = ioend->io_list;
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
-			xfs_start_buffer_writeback(bh);
-	} while ((ioend = next) != NULL);
+	/* Reserve log space if we might write beyond the on-disk inode size. */
+	if (!status &&
+	    ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+		status = xfs_setfilesize_trans_alloc(ioend);
+	/*
+	 * If we are failing the IO now, just mark the ioend with an
+	 * error and finish it. This will run IO completion immediately
+	 * as there is only one reference to the ioend at this point in
+	 * time.
+	 */
+	if (status) {
+		ioend->io_error = status;
+		xfs_finish_ioend(ioend);
+		return status;
+	}
 
-	/* Pass 2 - submit I/O */
-	ioend = head;
-	do {
-		next = ioend->io_list;
-		bio = NULL;
+	bio = NULL;
+	for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 
-		/*
-		 * If we are failing the IO now, just mark the ioend with an
-		 * error and finish it. This will run IO completion immediately
-		 * as there is only one reference to the ioend at this point in
-		 * time.
-		 */
-		if (fail) {
-			ioend->io_error = fail;
-			xfs_finish_ioend(ioend);
-			continue;
+		if (!bio) {
+retry:
+			bio = xfs_alloc_ioend_bio(bh);
+		} else if (bh->b_blocknr != lastblock + 1) {
+			xfs_submit_ioend_bio(wbc, ioend, bio);
+			goto retry;
 		}
 
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-
-			if (!bio) {
- retry:
-				bio = xfs_alloc_ioend_bio(bh);
-			} else if (bh->b_blocknr != lastblock + 1) {
-				xfs_submit_ioend_bio(wbc, ioend, bio);
-				goto retry;
-			}
-
-			if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
-				xfs_submit_ioend_bio(wbc, ioend, bio);
-				goto retry;
-			}
-
-			lastblock = bh->b_blocknr;
-		}
-		if (bio)
-			xfs_submit_ioend_bio(wbc, ioend, bio);
-		xfs_finish_ioend(ioend);
-	} while ((ioend = next) != NULL);
+		if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
+			xfs_submit_ioend_bio(wbc, ioend, bio);
+			goto retry;
+		}
+
+		lastblock = bh->b_blocknr;
+	}
+	if (bio)
+		xfs_submit_ioend_bio(wbc, ioend, bio);
+	xfs_finish_ioend(ioend);
+	return 0;
 }
 
 /*
  * Test to see if we've been building up a completion structure for
  * earlier buffers -- if so, we try to append to this ioend if we
  * can, otherwise we finish off any current ioend and start another.
- * Return true if we've finished the given ioend.
+ * Return the ioend we finished off so that the caller can submit it
+ * once it has finished processing the dirty page.
  */
 STATIC void
 xfs_add_to_ioend(
 	struct inode		*inode,
 	struct buffer_head	*bh,
 	xfs_off_t		offset,
-	struct xfs_writepage_ctx *wpc)
+	struct xfs_writepage_ctx *wpc,
+	struct list_head	*iolist)
 {
 	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
 	    bh->b_blocknr != wpc->last_block + 1) {
 		struct xfs_ioend	*new;
 
+		if (wpc->ioend)
+			list_add(&wpc->ioend->io_list, iolist);
+
 		new = xfs_alloc_ioend(inode, wpc->io_type);
 		new->io_offset = offset;
 		new->io_buffer_head = bh;
 		new->io_buffer_tail = bh;
-		if (wpc->ioend)
-			wpc->ioend->io_list = new;
 		wpc->ioend = new;
 	} else {
 		wpc->ioend->io_buffer_tail->b_private = bh;
@@ -565,6 +539,7 @@ xfs_add_to_ioend(
 	bh->b_private = NULL;
 	wpc->ioend->io_size += bh->b_size;
 	wpc->last_block = bh->b_blocknr;
+	xfs_start_buffer_writeback(bh);
 }
 
 STATIC void
@@ -726,44 +701,41 @@ out_invalidate:
 	return;
 }
 
-static int
-xfs_writepage_submit(
-	struct xfs_writepage_ctx *wpc,
-	struct writeback_control *wbc,
-	int			status)
-{
-	struct blk_plug		plug;
-
-	/* Reserve log space if we might write beyond the on-disk inode size. */
-	if (!status && wpc->ioend && wpc->ioend->io_type != XFS_IO_UNWRITTEN &&
-	    xfs_ioend_is_append(wpc->ioend))
-		status = xfs_setfilesize_trans_alloc(wpc->ioend);
-
-	if (wpc->iohead) {
-		blk_start_plug(&plug);
-		xfs_submit_ioend(wbc, wpc->iohead, status);
-		blk_finish_plug(&plug);
-	}
-	return status;
-}
-
+/*
+ * We implement an immediate ioend submission policy here to avoid needing to
+ * chain multiple ioends and hence nest mempool allocations which can violate
+ * forward progress guarantees we need to provide. The current ioend we are
+ * adding buffers to is cached on the writepage context, and if the new buffer
+ * does not append to the cached ioend it will create a new ioend and cache that
+ * instead.
+ *
+ * If a new ioend is created and cached, the old ioend is returned and queued
+ * locally for submission once the entire page is processed or an error has been
+ * detected. While ioends are submitted immediately after they are completed,
+ * batching optimisations are provided by higher level block plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
 static int
 xfs_writepage_map(
 	struct xfs_writepage_ctx *wpc,
+	struct writeback_control *wbc,
 	struct inode		*inode,
 	struct page		*page,
 	loff_t			offset,
 	__uint64_t		end_offset)
 {
+	LIST_HEAD(submit_list);
+	struct xfs_ioend	*ioend, *next;
 	struct buffer_head	*bh, *head;
 	ssize_t			len = 1 << inode->i_blkbits;
 	int			error = 0;
-	int			uptodate = 1;
 	int			count = 0;
+	int			uptodate = 1;
 
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
-
 	do {
 		if (offset >= end_offset)
 			break;
@@ -816,7 +788,7 @@ xfs_writepage_map(
 			error = xfs_map_blocks(inode, offset, &wpc->imap,
 					     wpc->io_type);
 			if (error)
-				goto out_error;
+				goto out;
 			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 							 offset);
 		}
@@ -824,46 +796,65 @@ xfs_writepage_map(
 			lock_buffer(bh);
 			if (wpc->io_type != XFS_IO_OVERWRITE)
 				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
-			xfs_add_to_ioend(inode, bh, offset, wpc);
+			xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
 			count++;
 		}
 
-		if (!wpc->iohead)
-			wpc->iohead = wpc->ioend;
-
 	} while (offset += len, ((bh = bh->b_this_page) != head));
 
 	if (uptodate && bh == head)
 		SetPageUptodate(page);
 
-	xfs_start_page_writeback(page, 1, count);
-	ASSERT(wpc->iohead || !count);
-	return 0;
+	ASSERT(wpc->ioend || list_empty(&submit_list));
 
-out_error:
+out:
 	/*
-	 * On error, we have to fail the iohead here because we locked buffers
-	 * in the ioend chain. If we don't do this, we'll deadlock invalidating
-	 * the page as that tries to lock the buffers on the page. Also, because
-	 * we may have set pages under writeback, we have to make sure we run
-	 * IO completion to mark the error state of the IO appropriately, so we
-	 * can't cancel the ioend directly here. That means we have to mark this
-	 * page as under writeback if we included any buffers from it in the
-	 * ioend chain so that completion treats it correctly.
+	 * On error, we have to fail the ioend here because we have locked
+	 * buffers in the ioend. If we don't do this, we'll deadlock
+	 * invalidating the page as that tries to lock the buffers on the page.
+	 * Also, because we may have set pages under writeback, we have to make
+	 * sure we run IO completion to mark the error state of the IO
+	 * appropriately, so we can't cancel the ioend directly here. That means
+	 * we have to mark this page as under writeback if we included any
+	 * buffers from it in the ioend chain so that completion treats it
+	 * correctly.
 	 *
-	 * If we didn't include the page in the ioend, then we can simply
-	 * discard and unlock it as there are no other users of the page or it's
-	 * buffers right now. The caller will still need to trigger submission
-	 * of outstanding ioends on the writepage context so they are treated
-	 * correctly on error.
+	 * If we didn't include the page in the ioend, the on error we can
+	 * simply discard and unlock it as there are no other users of the page
+	 * or it's buffers right now. The caller will still need to trigger
+	 * submission of outstanding ioends on the writepage context so they are
+	 * treated correctly on error.
 	 */
-	if (count)
-		xfs_start_page_writeback(page, 0, count);
-	else {
+	if (count) {
+		xfs_start_page_writeback(page, !error);
+
+		/*
+		 * Preserve the original error if there was one, otherwise catch
+		 * submission errors here and propagate into subsequent ioend
+		 * submissions.
+		 */
+		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+			int error2;
+
+			list_del_init(&ioend->io_list);
+			error2 = xfs_submit_ioend(wbc, ioend, error);
+			if (error2 && !error)
+				error = error2;
+		}
+	} else if (error) {
 		xfs_aops_discard_page(page);
 		ClearPageUptodate(page);
 		unlock_page(page);
+	} else {
+		/*
+		 * We can end up here with no error and nothing to write if we
+		 * race with a partial page truncate on a sub-page block sized
+		 * filesystem. In that case we need to mark the page clean.
+		 */
+		xfs_start_page_writeback(page, 1);
+		end_page_writeback(page);
 	}
+
 	mapping_set_error(page->mapping, error);
 	return error;
 }
@@ -979,7 +970,7 @@ xfs_do_writepage(
 		end_offset = offset;
 	}
 
-	return xfs_writepage_map(wpc, inode, page, offset, end_offset);
+	return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
 
 redirty:
 	redirty_page_for_writepage(wbc, page);
@@ -998,7 +989,9 @@ xfs_vm_writepage(
 	int			ret;
 
 	ret = xfs_do_writepage(page, wbc, &wpc);
-	return xfs_writepage_submit(&wpc, wbc, ret);
+	if (wpc.ioend)
+		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+	return ret;
 }
 
 STATIC int
@@ -1013,7 +1006,9 @@ xfs_vm_writepages(
 
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
 	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
-	return xfs_writepage_submit(&wpc, wbc, ret);
+	if (wpc.ioend)
+		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+	return ret;
 }
 
 /*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 3c3f1a37a1c7..4e01bd5b6426 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -41,7 +41,7 @@ enum {
  * It can manage several multi-page bio's at once.
  */
 typedef struct xfs_ioend {
-	struct xfs_ioend	*io_list;	/* next ioend in chain */
+	struct list_head	io_list;	/* next ioend in chain */
 	unsigned int		io_type;	/* delalloc / unwritten */
 	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
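
The xfs_aops.h hunk above converts io_list from a hand-rolled singly linked
pointer into a standard struct list_head, which is what lets the writepage
code in this patch queue finished ioends on a local submit_list and drain it
with list_for_each_entry_safe(). Below is a condensed sketch of that idiom
using the generic <linux/list.h> API; the demo_* names are placeholders
rather than XFS functions, and the fragment only builds inside a kernel tree.

    #include <linux/list.h>

    struct demo_ioend {
            int                     io_error;
            struct list_head        io_list;  /* links this ioend on a local queue */
    };

    /* Stand-in for xfs_submit_ioend(): fail or dispatch a single ioend. */
    static int demo_submit(struct demo_ioend *io, int status)
    {
            if (status)
                    io->io_error = status;
            return status;
    }

    /* Same shape as the submission loop added to xfs_writepage_map(): queue
     * completed ioends, then submit them all once the page is processed,
     * preserving the first error that was seen. */
    static int demo_queue_and_drain(struct demo_ioend *a, struct demo_ioend *b)
    {
            LIST_HEAD(submit_list);
            struct demo_ioend *io, *next;
            int error = 0;

            INIT_LIST_HEAD(&a->io_list);
            INIT_LIST_HEAD(&b->io_list);
            list_add(&a->io_list, &submit_list);
            list_add(&b->io_list, &submit_list);

            list_for_each_entry_safe(io, next, &submit_list, io_list) {
                    int error2;

                    list_del_init(&io->io_list);
                    error2 = demo_submit(io, error);
                    if (error2 && !error)
                            error = error2;
            }
            return error;
    }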