author     Wu Fengguang <fengguang.wu@intel.com>    2011-05-04 21:54:37 -0400
committer  Wu Fengguang <fengguang.wu@intel.com>    2011-07-10 01:09:01 -0400
commit     d46db3d58233be4be980eb1e42eebe7808bcabab (patch)
tree       6d813b33938d915f0c0633e8615d1ffdcc554c96 /fs/fs-writeback.c
parent     36715cef0770b7e2547892b7c3197fc024274630 (diff)
writeback: make writeback_control.nr_to_write straight
Pass struct wb_writeback_work all the way down to writeback_sb_inodes(),
and initialize the struct writeback_control there.
struct writeback_control is basically designed to control writeback of a
single file, but we keep abusing it for writing multiple files in
writeback_sb_inodes() and its callers.
It immediately cleans things up, e.g. suddenly wbc.nr_to_write vs
work->nr_pages starts to make sense, and instead of saving and restoring
pages_skipped in writeback_sb_inodes(), it can always start with a clean
zero value.
It also makes a neat IO pattern change: large dirty files are now
written in the full 4MB writeback chunk size, rather than whatever
quota remained in wbc->nr_to_write.
Acked-by: Jan Kara <jack@suse.cz>
Proposed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
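For reference, the 4MB figure comes from the chunk size the patch moves to the top of the file: MAX_WRITEBACK_PAGES is 1024 pages, which with the common 4KB page size is 4MB per writeback_sb_inodes() pass. The snippet below is only a standalone sketch of that WB_SYNC_NONE arithmetic, not part of the patch; the hypothetical chunk_pages() helper mirrors what the new writeback_chunk_size() in the diff does for the non-sync case.

#include <stdio.h>

#define MAX_WRITEBACK_PAGES     1024L   /* pages per chunk, as in the patch */
#define PAGE_SIZE_KB            4L      /* assumed: common 4KB page size */

/* Sketch of the WB_SYNC_NONE path of writeback_chunk_size(): cap each
 * per-inode pass at MAX_WRITEBACK_PAGES, or less if little work remains. */
static long chunk_pages(long nr_pages_left)
{
        return nr_pages_left < MAX_WRITEBACK_PAGES ?
                        nr_pages_left : MAX_WRITEBACK_PAGES;
}

int main(void)
{
        long nr_pages_left = 10000;     /* hypothetical pending writeback work */
        long chunk = chunk_pages(nr_pages_left);

        /* 1024 pages * 4KB/page = 4096KB = 4MB per writeback_sb_inodes() pass */
        printf("chunk: %ld pages = %ldKB\n", chunk, chunk * PAGE_SIZE_KB);
        return 0;
}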
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--  fs/fs-writeback.c | 196
1 file changed, 111 insertions(+), 85 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6caa98247a5b..2c947da39f6e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -30,11 +30,21 @@
 #include "internal.h"
 
 /*
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation. We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode. Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES     1024L
+
+/*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
 struct wb_writeback_work {
         long nr_pages;
         struct super_block *sb;
+        unsigned long *older_than_this;
         enum writeback_sync_modes sync_mode;
         unsigned int tagged_writepages:1;
         unsigned int for_kupdate:1;
@@ -472,7 +482,6 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
                          * No need to add it back to the LRU.
                          */
                         list_del_init(&inode->i_wb_list);
-                        wbc->inodes_written++;
                 }
         }
         inode_sync_complete(inode);
@@ -506,6 +515,31 @@ static bool pin_sb_for_writeback(struct super_block *sb)
         return false;
 }
 
+static long writeback_chunk_size(struct wb_writeback_work *work)
+{
+        long pages;
+
+        /*
+         * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+         * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+         * here avoids calling into writeback_inodes_wb() more than once.
+         *
+         * The intended call sequence for WB_SYNC_ALL writeback is:
+         *
+         *      wb_writeback()
+         *          writeback_sb_inodes()       <== called only once
+         *              write_cache_pages()     <== called once for each inode
+         *                   (quickly) tag currently dirty pages
+         *                   (maybe slowly) sync all tagged pages
+         */
+        if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
+                pages = LONG_MAX;
+        else
+                pages = min(MAX_WRITEBACK_PAGES, work->nr_pages);
+
+        return pages;
+}
+
 /*
  * Write a portion of b_io inodes which belong to @sb.
  *
@@ -513,18 +547,30 @@ static bool pin_sb_for_writeback(struct super_block *sb)
  * inodes. Otherwise write only ones which go sequentially
  * in reverse order.
  *
- * Return 1, if the caller writeback routine should be
- * interrupted. Otherwise return 0.
+ * Return the number of pages and/or inodes written.
  */
-static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
-                struct writeback_control *wbc, bool only_this_sb)
+static long writeback_sb_inodes(struct super_block *sb,
+                                struct bdi_writeback *wb,
+                                struct wb_writeback_work *work)
 {
+        struct writeback_control wbc = {
+                .sync_mode = work->sync_mode,
+                .tagged_writepages = work->tagged_writepages,
+                .for_kupdate = work->for_kupdate,
+                .for_background = work->for_background,
+                .range_cyclic = work->range_cyclic,
+                .range_start = 0,
+                .range_end = LLONG_MAX,
+        };
+        unsigned long start_time = jiffies;
+        long write_chunk;
+        long wrote = 0;  /* count both pages and inodes */
+
         while (!list_empty(&wb->b_io)) {
-                long pages_skipped;
                 struct inode *inode = wb_inode(wb->b_io.prev);
 
                 if (inode->i_sb != sb) {
-                        if (only_this_sb) {
+                        if (work->sb) {
                                 /*
                                  * We only want to write back data for this
                                  * superblock, move all inodes not belonging
@@ -539,7 +585,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
                          * Bounce back to the caller to unpin this and
                          * pin the next superblock.
                          */
-                        return 0;
+                        break;
                 }
 
                 /*
@@ -553,12 +599,18 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
                         requeue_io(inode, wb);
                         continue;
                 }
-
                 __iget(inode);
+                write_chunk = writeback_chunk_size(work);
+                wbc.nr_to_write = write_chunk;
+                wbc.pages_skipped = 0;
+
+                writeback_single_inode(inode, wb, &wbc);
 
-                pages_skipped = wbc->pages_skipped;
-                writeback_single_inode(inode, wb, wbc);
-                if (wbc->pages_skipped != pages_skipped) {
+                work->nr_pages -= write_chunk - wbc.nr_to_write;
+                wrote += write_chunk - wbc.nr_to_write;
+                if (!(inode->i_state & I_DIRTY))
+                        wrote++;
+                if (wbc.pages_skipped) {
                         /*
                          * writeback is not making progress due to locked
                          * buffers. Skip this inode for now.
@@ -570,17 +622,25 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
                 iput(inode);
                 cond_resched();
                 spin_lock(&wb->list_lock);
-                if (wbc->nr_to_write <= 0)
-                        return 1;
+                /*
+                 * bail out to wb_writeback() often enough to check
+                 * background threshold and other termination conditions.
+                 */
+                if (wrote) {
+                        if (time_is_before_jiffies(start_time + HZ / 10UL))
+                                break;
+                        if (work->nr_pages <= 0)
+                                break;
+                }
         }
-        /* b_io is empty */
-        return 1;
+        return wrote;
 }
 
-static void __writeback_inodes_wb(struct bdi_writeback *wb,
-                                  struct writeback_control *wbc)
+static long __writeback_inodes_wb(struct bdi_writeback *wb,
+                                  struct wb_writeback_work *work)
 {
-        int ret = 0;
+        unsigned long start_time = jiffies;
+        long wrote = 0;
 
         while (!list_empty(&wb->b_io)) {
                 struct inode *inode = wb_inode(wb->b_io.prev);
@@ -590,33 +650,37 @@ static void __writeback_inodes_wb(struct bdi_writeback *wb,
                         requeue_io(inode, wb);
                         continue;
                 }
-                ret = writeback_sb_inodes(sb, wb, wbc, false);
+                wrote += writeback_sb_inodes(sb, wb, work);
                 drop_super(sb);
 
-                if (ret)
-                        break;
+                /* refer to the same tests at the end of writeback_sb_inodes */
+                if (wrote) {
+                        if (time_is_before_jiffies(start_time + HZ / 10UL))
+                                break;
+                        if (work->nr_pages <= 0)
+                                break;
+                }
         }
         /* Leave any unwritten inodes on b_io */
+        return wrote;
 }
 
-void writeback_inodes_wb(struct bdi_writeback *wb,
-                struct writeback_control *wbc)
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
 {
+        struct wb_writeback_work work = {
+                .nr_pages = nr_pages,
+                .sync_mode = WB_SYNC_NONE,
+                .range_cyclic = 1,
+        };
+
         spin_lock(&wb->list_lock);
         if (list_empty(&wb->b_io))
-                queue_io(wb, wbc->older_than_this);
-        __writeback_inodes_wb(wb, wbc);
+                queue_io(wb, NULL);
+        __writeback_inodes_wb(wb, &work);
         spin_unlock(&wb->list_lock);
-}
 
-/*
- * The maximum number of pages to writeout in a single bdi flush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES     1024
+        return nr_pages - work.nr_pages;
+}
 
 static inline bool over_bground_thresh(void)
 {
@@ -646,42 +710,13 @@ static inline bool over_bground_thresh(void)
 static long wb_writeback(struct bdi_writeback *wb,
                          struct wb_writeback_work *work)
 {
-        struct writeback_control wbc = {
-                .sync_mode = work->sync_mode,
-                .tagged_writepages = work->tagged_writepages,
-                .older_than_this = NULL,
-                .for_kupdate = work->for_kupdate,
-                .for_background = work->for_background,
-                .range_cyclic = work->range_cyclic,
-        };
+        long nr_pages = work->nr_pages;
         unsigned long oldest_jif;
-        long wrote = 0;
-        long write_chunk = MAX_WRITEBACK_PAGES;
         struct inode *inode;
-
-        if (!wbc.range_cyclic) {
-                wbc.range_start = 0;
-                wbc.range_end = LLONG_MAX;
-        }
-
-        /*
-         * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
-         * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
-         * here avoids calling into writeback_inodes_wb() more than once.
-         *
-         * The intended call sequence for WB_SYNC_ALL writeback is:
-         *
-         *      wb_writeback()
-         *          writeback_sb_inodes()       <== called only once
-         *              write_cache_pages()     <== called once for each inode
-         *                   (quickly) tag currently dirty pages
-         *                   (maybe slowly) sync all tagged pages
-         */
-        if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages)
-                write_chunk = LONG_MAX;
+        long progress;
 
         oldest_jif = jiffies;
-        wbc.older_than_this = &oldest_jif;
+        work->older_than_this = &oldest_jif;
 
         spin_lock(&wb->list_lock);
         for (;;) {
@@ -711,24 +746,17 @@ static long wb_writeback(struct bdi_writeback *wb,
                 if (work->for_kupdate) {
                         oldest_jif = jiffies -
                                 msecs_to_jiffies(dirty_expire_interval * 10);
-                        wbc.older_than_this = &oldest_jif;
+                        work->older_than_this = &oldest_jif;
                 }
 
-                wbc.nr_to_write = write_chunk;
-                wbc.pages_skipped = 0;
-                wbc.inodes_written = 0;
-
-                trace_wbc_writeback_start(&wbc, wb->bdi);
+                trace_writeback_start(wb->bdi, work);
                 if (list_empty(&wb->b_io))
-                        queue_io(wb, wbc.older_than_this);
+                        queue_io(wb, work->older_than_this);
                 if (work->sb)
-                        writeback_sb_inodes(work->sb, wb, &wbc, true);
+                        progress = writeback_sb_inodes(work->sb, wb, work);
                 else
-                        __writeback_inodes_wb(wb, &wbc);
-                trace_wbc_writeback_written(&wbc, wb->bdi);
-
-                work->nr_pages -= write_chunk - wbc.nr_to_write;
-                wrote += write_chunk - wbc.nr_to_write;
+                        progress = __writeback_inodes_wb(wb, work);
+                trace_writeback_written(wb->bdi, work);
 
                 /*
                  * Did we write something? Try for more
@@ -738,9 +766,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                  * mean the overall work is done. So we keep looping as long
                  * as made some progress on cleaning pages or inodes.
                  */
-                if (wbc.nr_to_write < write_chunk)
-                        continue;
-                if (wbc.inodes_written)
+                if (progress)
                         continue;
                 /*
                  * No more inodes for IO, bail
@@ -753,8 +779,8 @@ static long wb_writeback(struct bdi_writeback *wb,
                  * we'll just busyloop.
                  */
                 if (!list_empty(&wb->b_more_io)) {
+                        trace_writeback_wait(wb->bdi, work);
                         inode = wb_inode(wb->b_more_io.prev);
-                        trace_wbc_writeback_wait(&wbc, wb->bdi);
                         spin_lock(&inode->i_lock);
                         inode_wait_for_writeback(inode, wb);
                         spin_unlock(&inode->i_lock);
@@ -762,7 +788,7 @@ static long wb_writeback(struct bdi_writeback *wb,
         }
         spin_unlock(&wb->list_lock);
 
-        return wrote;
+        return nr_pages - work->nr_pages;
 }
 
 /*