diff options
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r-- | fs/fs-writeback.c | 161 |
1 files changed, 104 insertions, 57 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b7c7586caea1..2f76c4a081a2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -26,15 +26,9 @@ | |||
26 | #include <linux/blkdev.h> | 26 | #include <linux/blkdev.h> |
27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/buffer_head.h> | 28 | #include <linux/buffer_head.h> |
29 | #include <linux/tracepoint.h> | ||
29 | #include "internal.h" | 30 | #include "internal.h" |
30 | 31 | ||
31 | #define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) | ||
32 | |||
33 | /* | ||
34 | * We don't actually have pdflush, but this one is exported though /proc... | ||
35 | */ | ||
36 | int nr_pdflush_threads; | ||
37 | |||
38 | /* | 32 | /* |
39 | * Passed into wb_writeback(), essentially a subset of writeback_control | 33 | * Passed into wb_writeback(), essentially a subset of writeback_control |
40 | */ | 34 | */ |
@@ -50,6 +44,21 @@ struct wb_writeback_work { | |||
50 | struct completion *done; /* set if the caller waits */ | 44 | struct completion *done; /* set if the caller waits */ |
51 | }; | 45 | }; |
52 | 46 | ||
47 | /* | ||
48 | * Include the creation of the trace points after defining the | ||
49 | * wb_writeback_work structure so that the definition remains local to this | ||
50 | * file. | ||
51 | */ | ||
52 | #define CREATE_TRACE_POINTS | ||
53 | #include <trace/events/writeback.h> | ||
54 | |||
55 | #define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) | ||
56 | |||
57 | /* | ||
58 | * We don't actually have pdflush, but this one is exported though /proc... | ||
59 | */ | ||
60 | int nr_pdflush_threads; | ||
61 | |||
53 | /** | 62 | /** |
54 | * writeback_in_progress - determine whether there is writeback in progress | 63 | * writeback_in_progress - determine whether there is writeback in progress |
55 | * @bdi: the device's backing_dev_info structure. | 64 | * @bdi: the device's backing_dev_info structure. |
@@ -65,22 +74,21 @@ int writeback_in_progress(struct backing_dev_info *bdi) | |||
65 | static void bdi_queue_work(struct backing_dev_info *bdi, | 74 | static void bdi_queue_work(struct backing_dev_info *bdi, |
66 | struct wb_writeback_work *work) | 75 | struct wb_writeback_work *work) |
67 | { | 76 | { |
68 | spin_lock(&bdi->wb_lock); | 77 | trace_writeback_queue(bdi, work); |
69 | list_add_tail(&work->list, &bdi->work_list); | ||
70 | spin_unlock(&bdi->wb_lock); | ||
71 | 78 | ||
72 | /* | 79 | spin_lock_bh(&bdi->wb_lock); |
73 | * If the default thread isn't there, make sure we add it. When | 80 | list_add_tail(&work->list, &bdi->work_list); |
74 | * it gets created and wakes up, we'll run this work. | 81 | if (bdi->wb.task) { |
75 | */ | 82 | wake_up_process(bdi->wb.task); |
76 | if (unlikely(list_empty_careful(&bdi->wb_list))) | 83 | } else { |
84 | /* | ||
85 | * The bdi thread isn't there, wake up the forker thread which | ||
86 | * will create and run it. | ||
87 | */ | ||
88 | trace_writeback_nothread(bdi, work); | ||
77 | wake_up_process(default_backing_dev_info.wb.task); | 89 | wake_up_process(default_backing_dev_info.wb.task); |
78 | else { | ||
79 | struct bdi_writeback *wb = &bdi->wb; | ||
80 | |||
81 | if (wb->task) | ||
82 | wake_up_process(wb->task); | ||
83 | } | 90 | } |
91 | spin_unlock_bh(&bdi->wb_lock); | ||
84 | } | 92 | } |
85 | 93 | ||
86 | static void | 94 | static void |
@@ -95,8 +103,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, | |||
95 | */ | 103 | */ |
96 | work = kzalloc(sizeof(*work), GFP_ATOMIC); | 104 | work = kzalloc(sizeof(*work), GFP_ATOMIC); |
97 | if (!work) { | 105 | if (!work) { |
98 | if (bdi->wb.task) | 106 | if (bdi->wb.task) { |
107 | trace_writeback_nowork(bdi); | ||
99 | wake_up_process(bdi->wb.task); | 108 | wake_up_process(bdi->wb.task); |
109 | } | ||
100 | return; | 110 | return; |
101 | } | 111 | } |
102 | 112 | ||
@@ -643,10 +653,14 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
643 | wbc.more_io = 0; | 653 | wbc.more_io = 0; |
644 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | 654 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; |
645 | wbc.pages_skipped = 0; | 655 | wbc.pages_skipped = 0; |
656 | |||
657 | trace_wbc_writeback_start(&wbc, wb->bdi); | ||
646 | if (work->sb) | 658 | if (work->sb) |
647 | __writeback_inodes_sb(work->sb, wb, &wbc); | 659 | __writeback_inodes_sb(work->sb, wb, &wbc); |
648 | else | 660 | else |
649 | writeback_inodes_wb(wb, &wbc); | 661 | writeback_inodes_wb(wb, &wbc); |
662 | trace_wbc_writeback_written(&wbc, wb->bdi); | ||
663 | |||
650 | work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | 664 | work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; |
651 | wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; | 665 | wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; |
652 | 666 | ||
@@ -674,6 +688,7 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
674 | if (!list_empty(&wb->b_more_io)) { | 688 | if (!list_empty(&wb->b_more_io)) { |
675 | inode = list_entry(wb->b_more_io.prev, | 689 | inode = list_entry(wb->b_more_io.prev, |
676 | struct inode, i_list); | 690 | struct inode, i_list); |
691 | trace_wbc_writeback_wait(&wbc, wb->bdi); | ||
677 | inode_wait_for_writeback(inode); | 692 | inode_wait_for_writeback(inode); |
678 | } | 693 | } |
679 | spin_unlock(&inode_lock); | 694 | spin_unlock(&inode_lock); |
@@ -686,17 +701,17 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
686 | * Return the next wb_writeback_work struct that hasn't been processed yet. | 701 | * Return the next wb_writeback_work struct that hasn't been processed yet. |
687 | */ | 702 | */ |
688 | static struct wb_writeback_work * | 703 | static struct wb_writeback_work * |
689 | get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb) | 704 | get_next_work_item(struct backing_dev_info *bdi) |
690 | { | 705 | { |
691 | struct wb_writeback_work *work = NULL; | 706 | struct wb_writeback_work *work = NULL; |
692 | 707 | ||
693 | spin_lock(&bdi->wb_lock); | 708 | spin_lock_bh(&bdi->wb_lock); |
694 | if (!list_empty(&bdi->work_list)) { | 709 | if (!list_empty(&bdi->work_list)) { |
695 | work = list_entry(bdi->work_list.next, | 710 | work = list_entry(bdi->work_list.next, |
696 | struct wb_writeback_work, list); | 711 | struct wb_writeback_work, list); |
697 | list_del_init(&work->list); | 712 | list_del_init(&work->list); |
698 | } | 713 | } |
699 | spin_unlock(&bdi->wb_lock); | 714 | spin_unlock_bh(&bdi->wb_lock); |
700 | return work; | 715 | return work; |
701 | } | 716 | } |
702 | 717 | ||
@@ -744,7 +759,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) | |||
744 | struct wb_writeback_work *work; | 759 | struct wb_writeback_work *work; |
745 | long wrote = 0; | 760 | long wrote = 0; |
746 | 761 | ||
747 | while ((work = get_next_work_item(bdi, wb)) != NULL) { | 762 | while ((work = get_next_work_item(bdi)) != NULL) { |
748 | /* | 763 | /* |
749 | * Override sync mode, in case we must wait for completion | 764 | * Override sync mode, in case we must wait for completion |
750 | * because this thread is exiting now. | 765 | * because this thread is exiting now. |
@@ -752,6 +767,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) | |||
752 | if (force_wait) | 767 | if (force_wait) |
753 | work->sync_mode = WB_SYNC_ALL; | 768 | work->sync_mode = WB_SYNC_ALL; |
754 | 769 | ||
770 | trace_writeback_exec(bdi, work); | ||
771 | |||
755 | wrote += wb_writeback(wb, work); | 772 | wrote += wb_writeback(wb, work); |
756 | 773 | ||
757 | /* | 774 | /* |
@@ -776,47 +793,66 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) | |||
776 | * Handle writeback of dirty data for the device backed by this bdi. Also | 793 | * Handle writeback of dirty data for the device backed by this bdi. Also |
777 | * wakes up periodically and does kupdated style flushing. | 794 | * wakes up periodically and does kupdated style flushing. |
778 | */ | 795 | */ |
779 | int bdi_writeback_task(struct bdi_writeback *wb) | 796 | int bdi_writeback_thread(void *data) |
780 | { | 797 | { |
781 | unsigned long last_active = jiffies; | 798 | struct bdi_writeback *wb = data; |
782 | unsigned long wait_jiffies = -1UL; | 799 | struct backing_dev_info *bdi = wb->bdi; |
783 | long pages_written; | 800 | long pages_written; |
784 | 801 | ||
802 | current->flags |= PF_FLUSHER | PF_SWAPWRITE; | ||
803 | set_freezable(); | ||
804 | wb->last_active = jiffies; | ||
805 | |||
806 | /* | ||
807 | * Our parent may run at a different priority, just set us to normal | ||
808 | */ | ||
809 | set_user_nice(current, 0); | ||
810 | |||
811 | trace_writeback_thread_start(bdi); | ||
812 | |||
785 | while (!kthread_should_stop()) { | 813 | while (!kthread_should_stop()) { |
814 | /* | ||
815 | * Remove own delayed wake-up timer, since we are already awake | ||
816 | * and we'll take care of the preriodic write-back. | ||
817 | */ | ||
818 | del_timer(&wb->wakeup_timer); | ||
819 | |||
786 | pages_written = wb_do_writeback(wb, 0); | 820 | pages_written = wb_do_writeback(wb, 0); |
787 | 821 | ||
822 | trace_writeback_pages_written(pages_written); | ||
823 | |||
788 | if (pages_written) | 824 | if (pages_written) |
789 | last_active = jiffies; | 825 | wb->last_active = jiffies; |
790 | else if (wait_jiffies != -1UL) { | ||
791 | unsigned long max_idle; | ||
792 | 826 | ||
793 | /* | 827 | set_current_state(TASK_INTERRUPTIBLE); |
794 | * Longest period of inactivity that we tolerate. If we | 828 | if (!list_empty(&bdi->work_list)) { |
795 | * see dirty data again later, the task will get | 829 | __set_current_state(TASK_RUNNING); |
796 | * recreated automatically. | 830 | continue; |
797 | */ | ||
798 | max_idle = max(5UL * 60 * HZ, wait_jiffies); | ||
799 | if (time_after(jiffies, max_idle + last_active)) | ||
800 | break; | ||
801 | } | 831 | } |
802 | 832 | ||
803 | if (dirty_writeback_interval) { | 833 | if (wb_has_dirty_io(wb) && dirty_writeback_interval) |
804 | wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); | 834 | schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); |
805 | schedule_timeout_interruptible(wait_jiffies); | 835 | else { |
806 | } else { | 836 | /* |
807 | set_current_state(TASK_INTERRUPTIBLE); | 837 | * We have nothing to do, so can go sleep without any |
808 | if (list_empty_careful(&wb->bdi->work_list) && | 838 | * timeout and save power. When a work is queued or |
809 | !kthread_should_stop()) | 839 | * something is made dirty - we will be woken up. |
810 | schedule(); | 840 | */ |
811 | __set_current_state(TASK_RUNNING); | 841 | schedule(); |
812 | } | 842 | } |
813 | 843 | ||
814 | try_to_freeze(); | 844 | try_to_freeze(); |
815 | } | 845 | } |
816 | 846 | ||
847 | /* Flush any work that raced with us exiting */ | ||
848 | if (!list_empty(&bdi->work_list)) | ||
849 | wb_do_writeback(wb, 1); | ||
850 | |||
851 | trace_writeback_thread_stop(bdi); | ||
817 | return 0; | 852 | return 0; |
818 | } | 853 | } |
819 | 854 | ||
855 | |||
820 | /* | 856 | /* |
821 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back | 857 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back |
822 | * the whole world. | 858 | * the whole world. |
@@ -891,6 +927,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) | |||
891 | void __mark_inode_dirty(struct inode *inode, int flags) | 927 | void __mark_inode_dirty(struct inode *inode, int flags) |
892 | { | 928 | { |
893 | struct super_block *sb = inode->i_sb; | 929 | struct super_block *sb = inode->i_sb; |
930 | struct backing_dev_info *bdi = NULL; | ||
931 | bool wakeup_bdi = false; | ||
894 | 932 | ||
895 | /* | 933 | /* |
896 | * Don't do this for I_DIRTY_PAGES - that doesn't actually | 934 | * Don't do this for I_DIRTY_PAGES - that doesn't actually |
@@ -944,22 +982,31 @@ void __mark_inode_dirty(struct inode *inode, int flags) | |||
944 | * reposition it (that would break b_dirty time-ordering). | 982 | * reposition it (that would break b_dirty time-ordering). |
945 | */ | 983 | */ |
946 | if (!was_dirty) { | 984 | if (!was_dirty) { |
947 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | 985 | bdi = inode_to_bdi(inode); |
948 | struct backing_dev_info *bdi = wb->bdi; | 986 | |
949 | 987 | if (bdi_cap_writeback_dirty(bdi)) { | |
950 | if (bdi_cap_writeback_dirty(bdi) && | 988 | WARN(!test_bit(BDI_registered, &bdi->state), |
951 | !test_bit(BDI_registered, &bdi->state)) { | 989 | "bdi-%s not registered\n", bdi->name); |
952 | WARN_ON(1); | 990 | |
953 | printk(KERN_ERR "bdi-%s not registered\n", | 991 | /* |
954 | bdi->name); | 992 | * If this is the first dirty inode for this |
993 | * bdi, we have to wake-up the corresponding | ||
994 | * bdi thread to make sure background | ||
995 | * write-back happens later. | ||
996 | */ | ||
997 | if (!wb_has_dirty_io(&bdi->wb)) | ||
998 | wakeup_bdi = true; | ||
955 | } | 999 | } |
956 | 1000 | ||
957 | inode->dirtied_when = jiffies; | 1001 | inode->dirtied_when = jiffies; |
958 | list_move(&inode->i_list, &wb->b_dirty); | 1002 | list_move(&inode->i_list, &bdi->wb.b_dirty); |
959 | } | 1003 | } |
960 | } | 1004 | } |
961 | out: | 1005 | out: |
962 | spin_unlock(&inode_lock); | 1006 | spin_unlock(&inode_lock); |
1007 | |||
1008 | if (wakeup_bdi) | ||
1009 | bdi_wakeup_thread_delayed(bdi); | ||
963 | } | 1010 | } |
964 | EXPORT_SYMBOL(__mark_inode_dirty); | 1011 | EXPORT_SYMBOL(__mark_inode_dirty); |
965 | 1012 | ||