aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Christoph Hellwig <hch@lst.de> 2010-07-06 02:59:53 -0400
committer: Jens Axboe <jaxboe@fusionio.com> 2010-07-06 02:59:53 -0400
commit: 83ba7b071f30f7c01f72518ad72d5cd203c27502 (patch)
tree: 4737320dcce72cfff4d87d835e4f78428eca7ef5
parent: edadfb10ba35da7253541e4155aa92eff758ebe6 (diff)
writeback: simplify the write back thread queue
First, remove items from work_list as soon as we start working on them. This means we don't have to track any pending or visited state and can get rid of all the RCU magic for freeing the work items - we can simply free them once the operation has finished. Second, use a real completion for tracking synchronous requests - if the caller sets the completion pointer we complete it, otherwise we use it as a boolean indicator that we can free the work item directly. Third, unify struct wb_writeback_args and struct bdi_work into a single data structure, wb_writeback_work. Previously we set all parameters in a struct wb_writeback_args, copied it into struct bdi_work, and then copied it again onto the stack to use it there. Instead, just allocate one structure, dynamically or on the stack, and use it all the way down the call chain.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
-rw-r--r--  fs/fs-writeback.c            253
-rw-r--r--  include/linux/backing-dev.h    2
-rw-r--r--  mm/backing-dev.c              14
3 files changed, 72 insertions, 197 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8cc06d5432b5..d5be1693ac93 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -38,43 +38,18 @@ int nr_pdflush_threads;
38/* 38/*
39 * Passed into wb_writeback(), essentially a subset of writeback_control 39 * Passed into wb_writeback(), essentially a subset of writeback_control
40 */ 40 */
41struct wb_writeback_args { 41struct wb_writeback_work {
42 long nr_pages; 42 long nr_pages;
43 struct super_block *sb; 43 struct super_block *sb;
44 enum writeback_sync_modes sync_mode; 44 enum writeback_sync_modes sync_mode;
45 unsigned int for_kupdate:1; 45 unsigned int for_kupdate:1;
46 unsigned int range_cyclic:1; 46 unsigned int range_cyclic:1;
47 unsigned int for_background:1; 47 unsigned int for_background:1;
48};
49 48
50/*
51 * Work items for the bdi_writeback threads
52 */
53struct bdi_work {
54 struct list_head list; /* pending work list */ 49 struct list_head list; /* pending work list */
55 struct rcu_head rcu_head; /* for RCU free/clear of work */ 50 struct completion *done; /* set if the caller waits */
56
57 unsigned long seen; /* threads that have seen this work */
58 atomic_t pending; /* number of threads still to do work */
59
60 struct wb_writeback_args args; /* writeback arguments */
61
62 unsigned long state; /* flag bits, see WS_* */
63};
64
65enum {
66 WS_INPROGRESS = 0,
67 WS_ONSTACK,
68}; 51};
69 52
70static inline void bdi_work_init(struct bdi_work *work,
71 struct wb_writeback_args *args)
72{
73 INIT_RCU_HEAD(&work->rcu_head);
74 work->args = *args;
75 __set_bit(WS_INPROGRESS, &work->state);
76}
77
78/** 53/**
79 * writeback_in_progress - determine whether there is writeback in progress 54 * writeback_in_progress - determine whether there is writeback in progress
80 * @bdi: the device's backing_dev_info structure. 55 * @bdi: the device's backing_dev_info structure.
@@ -87,49 +62,11 @@ int writeback_in_progress(struct backing_dev_info *bdi)
87 return !list_empty(&bdi->work_list); 62 return !list_empty(&bdi->work_list);
88} 63}
89 64
90static void bdi_work_free(struct rcu_head *head) 65static void bdi_queue_work(struct backing_dev_info *bdi,
91{ 66 struct wb_writeback_work *work)
92 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
93
94 clear_bit(WS_INPROGRESS, &work->state);
95 smp_mb__after_clear_bit();
96 wake_up_bit(&work->state, WS_INPROGRESS);
97
98 if (!test_bit(WS_ONSTACK, &work->state))
99 kfree(work);
100}
101
102static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
103{
104 /*
105 * The caller has retrieved the work arguments from this work,
106 * drop our reference. If this is the last ref, delete and free it
107 */
108 if (atomic_dec_and_test(&work->pending)) {
109 struct backing_dev_info *bdi = wb->bdi;
110
111 spin_lock(&bdi->wb_lock);
112 list_del_rcu(&work->list);
113 spin_unlock(&bdi->wb_lock);
114
115 call_rcu(&work->rcu_head, bdi_work_free);
116 }
117}
118
119static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
120{ 67{
121 work->seen = bdi->wb_mask;
122 BUG_ON(!work->seen);
123 atomic_set(&work->pending, bdi->wb_cnt);
124 BUG_ON(!bdi->wb_cnt);
125
126 /*
127 * list_add_tail_rcu() contains the necessary barriers to
128 * make sure the above stores are seen before the item is
129 * noticed on the list
130 */
131 spin_lock(&bdi->wb_lock); 68 spin_lock(&bdi->wb_lock);
132 list_add_tail_rcu(&work->list, &bdi->work_list); 69 list_add_tail(&work->list, &bdi->work_list);
133 spin_unlock(&bdi->wb_lock); 70 spin_unlock(&bdi->wb_lock);
134 71
135 /* 72 /*
@@ -146,55 +83,29 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
146 } 83 }
147} 84}
148 85
149/* 86static void
150 * Used for on-stack allocated work items. The caller needs to wait until 87__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
151 * the wb threads have acked the work before it's safe to continue. 88 bool range_cyclic, bool for_background)
152 */
153static void bdi_wait_on_work_done(struct bdi_work *work)
154{
155 wait_on_bit(&work->state, WS_INPROGRESS, bdi_sched_wait,
156 TASK_UNINTERRUPTIBLE);
157}
158
159static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
160 struct wb_writeback_args *args)
161{ 89{
162 struct bdi_work *work; 90 struct wb_writeback_work *work;
163 91
164 /* 92 /*
165 * This is WB_SYNC_NONE writeback, so if allocation fails just 93 * This is WB_SYNC_NONE writeback, so if allocation fails just
166 * wakeup the thread for old dirty data writeback 94 * wakeup the thread for old dirty data writeback
167 */ 95 */
168 work = kmalloc(sizeof(*work), GFP_ATOMIC); 96 work = kzalloc(sizeof(*work), GFP_ATOMIC);
169 if (work) { 97 if (!work) {
170 bdi_work_init(work, args); 98 if (bdi->wb.task)
171 bdi_queue_work(bdi, work); 99 wake_up_process(bdi->wb.task);
172 } else { 100 return;
173 struct bdi_writeback *wb = &bdi->wb;
174
175 if (wb->task)
176 wake_up_process(wb->task);
177 } 101 }
178}
179 102
180/** 103 work->sync_mode = WB_SYNC_NONE;
181 * bdi_queue_work_onstack - start and wait for writeback 104 work->nr_pages = nr_pages;
182 * @args: parameters to control the work queue writeback 105 work->range_cyclic = range_cyclic;
183 * 106 work->for_background = for_background;
184 * Description:
185 * This function initiates writeback and waits for the operation to
186 * complete. Callers must hold the sb s_umount semaphore for
187 * reading, to avoid having the super disappear before we are done.
188 */
189static void bdi_queue_work_onstack(struct wb_writeback_args *args)
190{
191 struct bdi_work work;
192 107
193 bdi_work_init(&work, args); 108 bdi_queue_work(bdi, work);
194 __set_bit(WS_ONSTACK, &work.state);
195
196 bdi_queue_work(args->sb->s_bdi, &work);
197 bdi_wait_on_work_done(&work);
198} 109}
199 110
200/** 111/**
@@ -210,13 +121,7 @@ static void bdi_queue_work_onstack(struct wb_writeback_args *args)
210 */ 121 */
211void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 122void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
212{ 123{
213 struct wb_writeback_args args = { 124 __bdi_start_writeback(bdi, nr_pages, true, false);
214 .sync_mode = WB_SYNC_NONE,
215 .nr_pages = nr_pages,
216 .range_cyclic = 1,
217 };
218
219 bdi_alloc_queue_work(bdi, &args);
220} 125}
221 126
222/** 127/**
@@ -230,13 +135,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
230 */ 135 */
231void bdi_start_background_writeback(struct backing_dev_info *bdi) 136void bdi_start_background_writeback(struct backing_dev_info *bdi)
232{ 137{
233 struct wb_writeback_args args = { 138 __bdi_start_writeback(bdi, LONG_MAX, true, true);
234 .sync_mode = WB_SYNC_NONE,
235 .nr_pages = LONG_MAX,
236 .for_background = 1,
237 .range_cyclic = 1,
238 };
239 bdi_alloc_queue_work(bdi, &args);
240} 139}
241 140
242/* 141/*
@@ -703,14 +602,14 @@ static inline bool over_bground_thresh(void)
703 * all dirty pages if they are all attached to "old" mappings. 602 * all dirty pages if they are all attached to "old" mappings.
704 */ 603 */
705static long wb_writeback(struct bdi_writeback *wb, 604static long wb_writeback(struct bdi_writeback *wb,
706 struct wb_writeback_args *args) 605 struct wb_writeback_work *work)
707{ 606{
708 struct writeback_control wbc = { 607 struct writeback_control wbc = {
709 .sync_mode = args->sync_mode, 608 .sync_mode = work->sync_mode,
710 .older_than_this = NULL, 609 .older_than_this = NULL,
711 .for_kupdate = args->for_kupdate, 610 .for_kupdate = work->for_kupdate,
712 .for_background = args->for_background, 611 .for_background = work->for_background,
713 .range_cyclic = args->range_cyclic, 612 .range_cyclic = work->range_cyclic,
714 }; 613 };
715 unsigned long oldest_jif; 614 unsigned long oldest_jif;
716 long wrote = 0; 615 long wrote = 0;
@@ -730,24 +629,24 @@ static long wb_writeback(struct bdi_writeback *wb,
730 /* 629 /*
731 * Stop writeback when nr_pages has been consumed 630 * Stop writeback when nr_pages has been consumed
732 */ 631 */
733 if (args->nr_pages <= 0) 632 if (work->nr_pages <= 0)
734 break; 633 break;
735 634
736 /* 635 /*
737 * For background writeout, stop when we are below the 636 * For background writeout, stop when we are below the
738 * background dirty threshold 637 * background dirty threshold
739 */ 638 */
740 if (args->for_background && !over_bground_thresh()) 639 if (work->for_background && !over_bground_thresh())
741 break; 640 break;
742 641
743 wbc.more_io = 0; 642 wbc.more_io = 0;
744 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 643 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
745 wbc.pages_skipped = 0; 644 wbc.pages_skipped = 0;
746 if (args->sb) 645 if (work->sb)
747 __writeback_inodes_sb(args->sb, wb, &wbc); 646 __writeback_inodes_sb(work->sb, wb, &wbc);
748 else 647 else
749 writeback_inodes_wb(wb, &wbc); 648 writeback_inodes_wb(wb, &wbc);
750 args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 649 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
751 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 650 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
752 651
753 /* 652 /*
@@ -783,31 +682,21 @@ static long wb_writeback(struct bdi_writeback *wb,
783} 682}
784 683
785/* 684/*
786 * Return the next bdi_work struct that hasn't been processed by this 685 * Return the next wb_writeback_work struct that hasn't been processed yet.
787 * wb thread yet. ->seen is initially set for each thread that exists
788 * for this device, when a thread first notices a piece of work it
789 * clears its bit. Depending on writeback type, the thread will notify
790 * completion on either receiving the work (WB_SYNC_NONE) or after
791 * it is done (WB_SYNC_ALL).
792 */ 686 */
793static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, 687static struct wb_writeback_work *
794 struct bdi_writeback *wb) 688get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb)
795{ 689{
796 struct bdi_work *work, *ret = NULL; 690 struct wb_writeback_work *work = NULL;
797
798 rcu_read_lock();
799
800 list_for_each_entry_rcu(work, &bdi->work_list, list) {
801 if (!test_bit(wb->nr, &work->seen))
802 continue;
803 clear_bit(wb->nr, &work->seen);
804 691
805 ret = work; 692 spin_lock(&bdi->wb_lock);
806 break; 693 if (!list_empty(&bdi->work_list)) {
694 work = list_entry(bdi->work_list.next,
695 struct wb_writeback_work, list);
696 list_del_init(&work->list);
807 } 697 }
808 698 spin_unlock(&bdi->wb_lock);
809 rcu_read_unlock(); 699 return work;
810 return ret;
811} 700}
812 701
813static long wb_check_old_data_flush(struct bdi_writeback *wb) 702static long wb_check_old_data_flush(struct bdi_writeback *wb)
@@ -832,14 +721,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
832 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 721 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
833 722
834 if (nr_pages) { 723 if (nr_pages) {
835 struct wb_writeback_args args = { 724 struct wb_writeback_work work = {
836 .nr_pages = nr_pages, 725 .nr_pages = nr_pages,
837 .sync_mode = WB_SYNC_NONE, 726 .sync_mode = WB_SYNC_NONE,
838 .for_kupdate = 1, 727 .for_kupdate = 1,
839 .range_cyclic = 1, 728 .range_cyclic = 1,
840 }; 729 };
841 730
842 return wb_writeback(wb, &args); 731 return wb_writeback(wb, &work);
843 } 732 }
844 733
845 return 0; 734 return 0;
@@ -851,33 +740,27 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
851long wb_do_writeback(struct bdi_writeback *wb, int force_wait) 740long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
852{ 741{
853 struct backing_dev_info *bdi = wb->bdi; 742 struct backing_dev_info *bdi = wb->bdi;
854 struct bdi_work *work; 743 struct wb_writeback_work *work;
855 long wrote = 0; 744 long wrote = 0;
856 745
857 while ((work = get_next_work_item(bdi, wb)) != NULL) { 746 while ((work = get_next_work_item(bdi, wb)) != NULL) {
858 struct wb_writeback_args args = work->args;
859
860 /* 747 /*
861 * Override sync mode, in case we must wait for completion 748 * Override sync mode, in case we must wait for completion
749 * because this thread is exiting now.
862 */ 750 */
863 if (force_wait) 751 if (force_wait)
864 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; 752 work->sync_mode = WB_SYNC_ALL;
865 753
866 /* 754 wrote += wb_writeback(wb, work);
867 * If this isn't a data integrity operation, just notify
868 * that we have seen this work and we are now starting it.
869 */
870 if (!test_bit(WS_ONSTACK, &work->state))
871 wb_clear_pending(wb, work);
872
873 wrote += wb_writeback(wb, &args);
874 755
875 /* 756 /*
876 * This is a data integrity writeback, so only do the 757 * Notify the caller of completion if this is a synchronous
877 * notification when we have completed the work. 758 * work item, otherwise just free it.
878 */ 759 */
879 if (test_bit(WS_ONSTACK, &work->state)) 760 if (work->done)
880 wb_clear_pending(wb, work); 761 complete(work->done);
762 else
763 kfree(work);
881 } 764 }
882 765
883 /* 766 /*
@@ -940,14 +823,9 @@ int bdi_writeback_task(struct bdi_writeback *wb)
940void wakeup_flusher_threads(long nr_pages) 823void wakeup_flusher_threads(long nr_pages)
941{ 824{
942 struct backing_dev_info *bdi; 825 struct backing_dev_info *bdi;
943 struct wb_writeback_args args = {
944 .sync_mode = WB_SYNC_NONE,
945 };
946 826
947 if (nr_pages) { 827 if (!nr_pages) {
948 args.nr_pages = nr_pages; 828 nr_pages = global_page_state(NR_FILE_DIRTY) +
949 } else {
950 args.nr_pages = global_page_state(NR_FILE_DIRTY) +
951 global_page_state(NR_UNSTABLE_NFS); 829 global_page_state(NR_UNSTABLE_NFS);
952 } 830 }
953 831
@@ -955,7 +833,7 @@ void wakeup_flusher_threads(long nr_pages)
955 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 833 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
956 if (!bdi_has_dirty_io(bdi)) 834 if (!bdi_has_dirty_io(bdi))
957 continue; 835 continue;
958 bdi_alloc_queue_work(bdi, &args); 836 __bdi_start_writeback(bdi, nr_pages, false, false);
959 } 837 }
960 rcu_read_unlock(); 838 rcu_read_unlock();
961} 839}
@@ -1164,17 +1042,20 @@ void writeback_inodes_sb(struct super_block *sb)
1164{ 1042{
1165 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1043 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1166 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1044 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1167 struct wb_writeback_args args = { 1045 DECLARE_COMPLETION_ONSTACK(done);
1046 struct wb_writeback_work work = {
1168 .sb = sb, 1047 .sb = sb,
1169 .sync_mode = WB_SYNC_NONE, 1048 .sync_mode = WB_SYNC_NONE,
1049 .done = &done,
1170 }; 1050 };
1171 1051
1172 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1052 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1173 1053
1174 args.nr_pages = nr_dirty + nr_unstable + 1054 work.nr_pages = nr_dirty + nr_unstable +
1175 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1055 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1176 1056
1177 bdi_queue_work_onstack(&args); 1057 bdi_queue_work(sb->s_bdi, &work);
1058 wait_for_completion(&done);
1178} 1059}
1179EXPORT_SYMBOL(writeback_inodes_sb); 1060EXPORT_SYMBOL(writeback_inodes_sb);
1180 1061
@@ -1206,16 +1087,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1206 */ 1087 */
1207void sync_inodes_sb(struct super_block *sb) 1088void sync_inodes_sb(struct super_block *sb)
1208{ 1089{
1209 struct wb_writeback_args args = { 1090 DECLARE_COMPLETION_ONSTACK(done);
1091 struct wb_writeback_work work = {
1210 .sb = sb, 1092 .sb = sb,
1211 .sync_mode = WB_SYNC_ALL, 1093 .sync_mode = WB_SYNC_ALL,
1212 .nr_pages = LONG_MAX, 1094 .nr_pages = LONG_MAX,
1213 .range_cyclic = 0, 1095 .range_cyclic = 0,
1096 .done = &done,
1214 }; 1097 };
1215 1098
1216 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1099 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1217 1100
1218 bdi_queue_work_onstack(&args); 1101 bdi_queue_work(sb->s_bdi, &work);
1102 wait_for_completion(&done);
1103
1219 wait_sb_inodes(sb); 1104 wait_sb_inodes(sb);
1220} 1105}
1221EXPORT_SYMBOL(sync_inodes_sb); 1106EXPORT_SYMBOL(sync_inodes_sb);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 9ae2889096b6..e9aec0d099df 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -82,8 +82,6 @@ struct backing_dev_info {
82 struct bdi_writeback wb; /* default writeback info for this bdi */ 82 struct bdi_writeback wb; /* default writeback info for this bdi */
83 spinlock_t wb_lock; /* protects update side of wb_list */ 83 spinlock_t wb_lock; /* protects update side of wb_list */
84 struct list_head wb_list; /* the flusher threads hanging off this bdi */ 84 struct list_head wb_list; /* the flusher threads hanging off this bdi */
85 unsigned long wb_mask; /* bitmask of registered tasks */
86 unsigned int wb_cnt; /* number of registered tasks */
87 85
88 struct list_head work_list; 86 struct list_head work_list;
89 87
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6e0b09a1ec2c..123bcef13e51 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -104,15 +104,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
104 "b_more_io: %8lu\n" 104 "b_more_io: %8lu\n"
105 "bdi_list: %8u\n" 105 "bdi_list: %8u\n"
106 "state: %8lx\n" 106 "state: %8lx\n"
107 "wb_mask: %8lx\n" 107 "wb_list: %8u\n",
108 "wb_list: %8u\n"
109 "wb_cnt: %8u\n",
110 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 108 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
111 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 109 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
112 K(bdi_thresh), K(dirty_thresh), 110 K(bdi_thresh), K(dirty_thresh),
113 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, 111 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
114 !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, 112 !list_empty(&bdi->bdi_list), bdi->state,
115 !list_empty(&bdi->wb_list), bdi->wb_cnt); 113 !list_empty(&bdi->wb_list));
116#undef K 114#undef K
117 115
118 return 0; 116 return 0;
@@ -674,12 +672,6 @@ int bdi_init(struct backing_dev_info *bdi)
674 672
675 bdi_wb_init(&bdi->wb, bdi); 673 bdi_wb_init(&bdi->wb, bdi);
676 674
677 /*
678 * Just one thread support for now, hard code mask and count
679 */
680 bdi->wb_mask = 1;
681 bdi->wb_cnt = 1;
682
683 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 675 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
684 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 676 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
685 if (err) 677 if (err)