Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--  fs/fs-writeback.c  254
1 file changed, 107 insertions(+), 147 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1d1088f48bc2..0609607d3955 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -63,24 +63,16 @@ struct bdi_work {
 };
 
 enum {
-	WS_USED_B = 0,
-	WS_ONSTACK_B,
+	WS_INPROGRESS = 0,
+	WS_ONSTACK,
 };
 
-#define WS_USED (1 << WS_USED_B)
-#define WS_ONSTACK (1 << WS_ONSTACK_B)
-
-static inline bool bdi_work_on_stack(struct bdi_work *work)
-{
-	return test_bit(WS_ONSTACK_B, &work->state);
-}
-
 static inline void bdi_work_init(struct bdi_work *work,
 				 struct wb_writeback_args *args)
 {
 	INIT_RCU_HEAD(&work->rcu_head);
 	work->args = *args;
-	work->state = WS_USED;
+	__set_bit(WS_INPROGRESS, &work->state);
 }
 
 /**
@@ -95,43 +87,16 @@ int writeback_in_progress(struct backing_dev_info *bdi)
 	return !list_empty(&bdi->work_list);
 }
 
-static void bdi_work_clear(struct bdi_work *work)
-{
-	clear_bit(WS_USED_B, &work->state);
-	smp_mb__after_clear_bit();
-	/*
-	 * work can have disappeared at this point. bit waitq functions
-	 * should be able to tolerate this, provided bdi_sched_wait does
-	 * not dereference it's pointer argument.
-	 */
-	wake_up_bit(&work->state, WS_USED_B);
-}
-
 static void bdi_work_free(struct rcu_head *head)
 {
 	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
 
-	if (!bdi_work_on_stack(work))
-		kfree(work);
-	else
-		bdi_work_clear(work);
-}
-
-static void wb_work_complete(struct bdi_work *work)
-{
-	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
-	int onstack = bdi_work_on_stack(work);
-
-	/*
-	 * For allocated work, we can clear the done/seen bit right here.
-	 * For on-stack work, we need to postpone both the clear and free
-	 * to after the RCU grace period, since the stack could be invalidated
-	 * as soon as bdi_work_clear() has done the wakeup.
-	 */
-	if (!onstack)
-		bdi_work_clear(work);
-	if (sync_mode == WB_SYNC_NONE || onstack)
-		call_rcu(&work->rcu_head, bdi_work_free);
+	clear_bit(WS_INPROGRESS, &work->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&work->state, WS_INPROGRESS);
+
+	if (!test_bit(WS_ONSTACK, &work->state))
+		kfree(work);
 }
 
 static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
@@ -147,7 +112,7 @@ static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
 		list_del_rcu(&work->list);
 		spin_unlock(&bdi->wb_lock);
 
-		wb_work_complete(work);
+		call_rcu(&work->rcu_head, bdi_work_free);
 	}
 }
 
@@ -185,9 +150,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
  * Used for on-stack allocated work items. The caller needs to wait until
  * the wb threads have acked the work before it's safe to continue.
  */
-static void bdi_wait_on_work_clear(struct bdi_work *work)
+static void bdi_wait_on_work_done(struct bdi_work *work)
 {
-	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
+	wait_on_bit(&work->state, WS_INPROGRESS, bdi_sched_wait,
 		    TASK_UNINTERRUPTIBLE);
 }
 
@@ -213,37 +178,28 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 }
 
 /**
- * bdi_sync_writeback - start and wait for writeback
- * @bdi: the backing device to write from
+ * bdi_queue_work_onstack - start and wait for writeback
  * @sb: write inodes from this super_block
  *
  * Description:
- *   This does WB_SYNC_ALL data integrity writeback and waits for the
- *   IO to complete. Callers must hold the sb s_umount semaphore for
+ *   This function initiates writeback and waits for the operation to
+ *   complete. Callers must hold the sb s_umount semaphore for
  *   reading, to avoid having the super disappear before we are done.
  */
-static void bdi_sync_writeback(struct backing_dev_info *bdi,
-			       struct super_block *sb)
+static void bdi_queue_work_onstack(struct wb_writeback_args *args)
 {
-	struct wb_writeback_args args = {
-		.sb		= sb,
-		.sync_mode	= WB_SYNC_ALL,
-		.nr_pages	= LONG_MAX,
-		.range_cyclic	= 0,
-	};
 	struct bdi_work work;
 
-	bdi_work_init(&work, &args);
-	work.state |= WS_ONSTACK;
+	bdi_work_init(&work, args);
+	__set_bit(WS_ONSTACK, &work.state);
 
-	bdi_queue_work(bdi, &work);
-	bdi_wait_on_work_clear(&work);
+	bdi_queue_work(args->sb->s_bdi, &work);
+	bdi_wait_on_work_done(&work);
 }
 
 /**
  * bdi_start_writeback - start writeback
  * @bdi: the backing device to write from
- * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
  *
  * Description:
@@ -252,25 +208,34 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
  *   completion. Caller need not hold sb s_umount semaphore.
  *
  */
-void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages)
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
 	struct wb_writeback_args args = {
-		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.nr_pages	= nr_pages,
 		.range_cyclic	= 1,
 	};
 
-	/*
-	 * We treat @nr_pages=0 as the special case to do background writeback,
-	 * ie. to sync pages until the background dirty threshold is reached.
-	 */
-	if (!nr_pages) {
-		args.nr_pages = LONG_MAX;
-		args.for_background = 1;
-	}
-
+	bdi_alloc_queue_work(bdi, &args);
+}
+
+/**
+ * bdi_start_background_writeback - start background writeback
+ * @bdi: the backing device to write from
+ *
+ * Description:
+ *   This does WB_SYNC_NONE background writeback. The IO is only
+ *   started when this function returns, we make no guarentees on
+ *   completion. Caller need not hold sb s_umount semaphore.
+ */
+void bdi_start_background_writeback(struct backing_dev_info *bdi)
+{
+	struct wb_writeback_args args = {
+		.sync_mode	= WB_SYNC_NONE,
+		.nr_pages	= LONG_MAX,
+		.for_background = 1,
+		.range_cyclic	= 1,
+	};
 	bdi_alloc_queue_work(bdi, &args);
 }
 
@@ -561,48 +526,30 @@ select_queue:
 	return ret;
 }
 
-static void unpin_sb_for_writeback(struct super_block *sb)
-{
-	up_read(&sb->s_umount);
-	put_super(sb);
-}
-
-enum sb_pin_state {
-	SB_PINNED,
-	SB_NOT_PINNED,
-	SB_PIN_FAILED
-};
-
 /*
- * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * For background writeback the caller does not have the sb pinned
  * before calling writeback. So make sure that we do pin it, so it doesn't
  * go away while we are writing inodes from it.
  */
-static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
-					      struct super_block *sb)
+static bool pin_sb_for_writeback(struct super_block *sb)
 {
-	/*
-	 * Caller must already hold the ref for this
-	 */
-	if (wbc->sync_mode == WB_SYNC_ALL) {
-		WARN_ON(!rwsem_is_locked(&sb->s_umount));
-		return SB_NOT_PINNED;
-	}
 	spin_lock(&sb_lock);
+	if (list_empty(&sb->s_instances)) {
+		spin_unlock(&sb_lock);
+		return false;
+	}
+
 	sb->s_count++;
+	spin_unlock(&sb_lock);
+
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root) {
-			spin_unlock(&sb_lock);
-			return SB_PINNED;
-		}
-		/*
-		 * umounted, drop rwsem again and fall through to failure
-		 */
+		if (sb->s_root)
+			return true;
 		up_read(&sb->s_umount);
 	}
-	sb->s_count--;
-	spin_unlock(&sb_lock);
-	return SB_PIN_FAILED;
+
+	put_super(sb);
+	return false;
 }
 
 /*
@@ -681,24 +628,31 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		struct inode *inode = list_entry(wb->b_io.prev,
 						 struct inode, i_list);
 		struct super_block *sb = inode->i_sb;
-		enum sb_pin_state state;
 
-		if (wbc->sb && sb != wbc->sb) {
-			/* super block given and doesn't
-			   match, skip this inode */
-			redirty_tail(inode);
-			continue;
-		}
-		state = pin_sb_for_writeback(wbc, sb);
+		if (wbc->sb) {
+			/*
+			 * We are requested to write out inodes for a specific
+			 * superblock. This means we already have s_umount
+			 * taken by the caller which also waits for us to
+			 * complete the writeout.
+			 */
+			if (sb != wbc->sb) {
+				redirty_tail(inode);
+				continue;
+			}
 
-		if (state == SB_PIN_FAILED) {
-			requeue_io(inode);
-			continue;
+			WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+			ret = writeback_sb_inodes(sb, wb, wbc);
+		} else {
+			if (!pin_sb_for_writeback(sb)) {
+				requeue_io(inode);
+				continue;
+			}
+			ret = writeback_sb_inodes(sb, wb, wbc);
+			drop_super(sb);
 		}
-		ret = writeback_sb_inodes(sb, wb, wbc);
 
-		if (state == SB_PINNED)
-			unpin_sb_for_writeback(sb);
 		if (ret)
 			break;
 	}
@@ -911,7 +865,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
 		 */
-		if (args.sync_mode == WB_SYNC_NONE)
+		if (!test_bit(WS_ONSTACK, &work->state))
 			wb_clear_pending(wb, work);
 
 		wrote += wb_writeback(wb, &args);
@@ -920,7 +874,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		 * This is a data integrity writeback, so only do the
 		 * notification when we have completed the work.
 		 */
-		if (args.sync_mode == WB_SYNC_ALL)
+		if (test_bit(WS_ONSTACK, &work->state))
 			wb_clear_pending(wb, work);
 	}
 
@@ -978,42 +932,32 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 }
 
 /*
- * Schedule writeback for all backing devices. This does WB_SYNC_NONE
- * writeback, for integrity writeback see bdi_sync_writeback().
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
  */
-static void bdi_writeback_all(struct super_block *sb, long nr_pages)
+void wakeup_flusher_threads(long nr_pages)
 {
+	struct backing_dev_info *bdi;
 	struct wb_writeback_args args = {
-		.sb		= sb,
-		.nr_pages	= nr_pages,
 		.sync_mode	= WB_SYNC_NONE,
 	};
-	struct backing_dev_info *bdi;
 
-	rcu_read_lock();
+	if (nr_pages) {
+		args.nr_pages = nr_pages;
+	} else {
+		args.nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	}
 
+	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-
 		bdi_alloc_queue_work(bdi, &args);
 	}
-
 	rcu_read_unlock();
 }
 
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world.
- */
-void wakeup_flusher_threads(long nr_pages)
-{
-	if (nr_pages == 0)
-		nr_pages = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-	bdi_writeback_all(NULL, nr_pages);
-}
-
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1218,12 +1162,17 @@ void writeback_inodes_sb(struct super_block *sb)
 {
 	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
 	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
+	struct wb_writeback_args args = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_NONE,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	nr_to_write = nr_dirty + nr_unstable +
+	args.nr_pages = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+	bdi_queue_work_onstack(&args);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
@@ -1237,7 +1186,9 @@ EXPORT_SYMBOL(writeback_inodes_sb);
 int writeback_inodes_sb_if_idle(struct super_block *sb)
 {
 	if (!writeback_in_progress(sb->s_bdi)) {
+		down_read(&sb->s_umount);
 		writeback_inodes_sb(sb);
+		up_read(&sb->s_umount);
 		return 1;
 	} else
 		return 0;
@@ -1253,7 +1204,16 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-	bdi_sync_writeback(sb->s_bdi, sb);
+	struct wb_writeback_args args = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_pages	= LONG_MAX,
+		.range_cyclic	= 0,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	bdi_queue_work_onstack(&args);
 	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
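
The patch above replaces the WS_USED done-bit machinery with a single WS_INPROGRESS bit: the submitter of an on-stack struct bdi_work queues it via bdi_queue_work_onstack() and sleeps in bdi_wait_on_work_done() until the flusher side clears the bit from the RCU callback and calls wake_up_bit(); only then may the submitter's stack frame be reused. The sketch below is a minimal userspace analogy of that handshake, not kernel code: a pthread mutex/condvar pair stands in for wait_on_bit()/wake_up_bit(), a one-slot pointer stands in for bdi->work_list, and every name in it is hypothetical. Build with: cc -pthread example.c

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct work {
	bool in_progress;		/* analogue of WS_INPROGRESS */
	pthread_mutex_t lock;
	pthread_cond_t done;
};

static struct work *pending;		/* one-slot stand-in for bdi->work_list */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t list_nonempty = PTHREAD_COND_INITIALIZER;

/* flusher-thread analogue: take the queued item, do the work, then
 * clear the flag and wake the submitter (cf. bdi_work_free()) */
static void *worker(void *arg)
{
	struct work *w;

	(void)arg;
	pthread_mutex_lock(&list_lock);
	while (!pending)
		pthread_cond_wait(&list_nonempty, &list_lock);
	w = pending;
	pending = NULL;
	pthread_mutex_unlock(&list_lock);

	printf("worker: doing the writeback-like work\n");

	pthread_mutex_lock(&w->lock);
	w->in_progress = false;
	pthread_cond_signal(&w->done);
	pthread_mutex_unlock(&w->lock);
	/* do not touch *w after this point: the submitter's stack owns it */
	return NULL;
}

int main(void)
{
	pthread_t tid;
	struct work w;			/* on-stack, like the struct bdi_work
					   in bdi_queue_work_onstack() */

	w.in_progress = true;
	pthread_mutex_init(&w.lock, NULL);
	pthread_cond_init(&w.done, NULL);
	pthread_create(&tid, NULL, worker, NULL);

	/* queue the on-stack item (cf. bdi_queue_work()) */
	pthread_mutex_lock(&list_lock);
	pending = &w;
	pthread_cond_signal(&list_nonempty);
	pthread_mutex_unlock(&list_lock);

	/* wait for completion (cf. bdi_wait_on_work_done()) */
	pthread_mutex_lock(&w.lock);
	while (w.in_progress)
		pthread_cond_wait(&w.done, &w.lock);
	pthread_mutex_unlock(&w.lock);

	pthread_join(tid, NULL);
	printf("submitter: work done, stack frame is safe to reuse\n");
	return 0;
}

The point of the pattern, in both the kernel code and the sketch, is that the worker must not touch the work item after signalling completion, because the submitter's stack may be gone immediately afterwards; the patch enforces that ordering by doing the clear and wakeup only from bdi_work_free(), after the RCU grace period.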