Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--	fs/fs-writeback.c	304
1 file changed, 116 insertions, 188 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ea8592b90696..0609607d3955 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,7 +45,6 @@ struct wb_writeback_args {
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
-	unsigned int sb_pinned:1;
 };
 
 /*
@@ -64,24 +63,16 @@ struct bdi_work {
 };
 
 enum {
-	WS_USED_B = 0,
-	WS_ONSTACK_B,
+	WS_INPROGRESS = 0,
+	WS_ONSTACK,
 };
 
-#define WS_USED (1 << WS_USED_B)
-#define WS_ONSTACK (1 << WS_ONSTACK_B)
-
-static inline bool bdi_work_on_stack(struct bdi_work *work)
-{
-	return test_bit(WS_ONSTACK_B, &work->state);
-}
-
 static inline void bdi_work_init(struct bdi_work *work,
 				 struct wb_writeback_args *args)
 {
 	INIT_RCU_HEAD(&work->rcu_head);
 	work->args = *args;
-	work->state = WS_USED;
+	__set_bit(WS_INPROGRESS, &work->state);
 }
 
 /**
@@ -96,43 +87,16 @@ int writeback_in_progress(struct backing_dev_info *bdi)
 	return !list_empty(&bdi->work_list);
 }
 
-static void bdi_work_clear(struct bdi_work *work)
-{
-	clear_bit(WS_USED_B, &work->state);
-	smp_mb__after_clear_bit();
-	/*
-	 * work can have disappeared at this point. bit waitq functions
-	 * should be able to tolerate this, provided bdi_sched_wait does
-	 * not dereference it's pointer argument.
-	 */
-	wake_up_bit(&work->state, WS_USED_B);
-}
-
 static void bdi_work_free(struct rcu_head *head)
 {
 	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
 
-	if (!bdi_work_on_stack(work))
-		kfree(work);
-	else
-		bdi_work_clear(work);
-}
-
-static void wb_work_complete(struct bdi_work *work)
-{
-	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
-	int onstack = bdi_work_on_stack(work);
+	clear_bit(WS_INPROGRESS, &work->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&work->state, WS_INPROGRESS);
 
-	/*
-	 * For allocated work, we can clear the done/seen bit right here.
-	 * For on-stack work, we need to postpone both the clear and free
-	 * to after the RCU grace period, since the stack could be invalidated
-	 * as soon as bdi_work_clear() has done the wakeup.
-	 */
-	if (!onstack)
-		bdi_work_clear(work);
-	if (sync_mode == WB_SYNC_NONE || onstack)
-		call_rcu(&work->rcu_head, bdi_work_free);
+	if (!test_bit(WS_ONSTACK, &work->state))
+		kfree(work);
 }
 
 static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
@@ -148,7 +112,7 @@ static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
 		list_del_rcu(&work->list);
 		spin_unlock(&bdi->wb_lock);
 
-		wb_work_complete(work);
+		call_rcu(&work->rcu_head, bdi_work_free);
 	}
 }
 
@@ -186,15 +150,14 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
  * Used for on-stack allocated work items. The caller needs to wait until
  * the wb threads have acked the work before it's safe to continue.
  */
-static void bdi_wait_on_work_clear(struct bdi_work *work)
+static void bdi_wait_on_work_done(struct bdi_work *work)
 {
-	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
+	wait_on_bit(&work->state, WS_INPROGRESS, bdi_sched_wait,
 		    TASK_UNINTERRUPTIBLE);
 }
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct wb_writeback_args *args,
-				 int wait)
+				 struct wb_writeback_args *args)
 {
 	struct bdi_work *work;
 
@@ -206,8 +169,6 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	if (work) {
 		bdi_work_init(work, args);
 		bdi_queue_work(bdi, work);
-		if (wait)
-			bdi_wait_on_work_clear(work);
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
 
@@ -217,72 +178,65 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 }
 
 /**
- * bdi_sync_writeback - start and wait for writeback
- * @bdi: the backing device to write from
+ * bdi_queue_work_onstack - start and wait for writeback
  * @sb: write inodes from this super_block
  *
  * Description:
- * This does WB_SYNC_ALL data integrity writeback and waits for the
- * IO to complete. Callers must hold the sb s_umount semaphore for
+ * This function initiates writeback and waits for the operation to
+ * complete. Callers must hold the sb s_umount semaphore for
  * reading, to avoid having the super disappear before we are done.
  */
-static void bdi_sync_writeback(struct backing_dev_info *bdi,
-			       struct super_block *sb)
+static void bdi_queue_work_onstack(struct wb_writeback_args *args)
 {
-	struct wb_writeback_args args = {
-		.sb = sb,
-		.sync_mode = WB_SYNC_ALL,
-		.nr_pages = LONG_MAX,
-		.range_cyclic = 0,
-		/*
-		 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
-		 * lets make it explicitly clear.
-		 */
-		.sb_pinned = 1,
-	};
 	struct bdi_work work;
 
-	bdi_work_init(&work, &args);
-	work.state |= WS_ONSTACK;
+	bdi_work_init(&work, args);
+	__set_bit(WS_ONSTACK, &work.state);
 
-	bdi_queue_work(bdi, &work);
-	bdi_wait_on_work_clear(&work);
+	bdi_queue_work(args->sb->s_bdi, &work);
+	bdi_wait_on_work_done(&work);
 }
 
 /**
  * bdi_start_writeback - start writeback
  * @bdi: the backing device to write from
- * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
- * @sb_locked: caller already holds sb umount sem.
  *
  * Description:
  * This does WB_SYNC_NONE opportunistic writeback. The IO is only
 * started when this function returns, we make no guarentees on
- * completion. Caller specifies whether sb umount sem is held already or not.
+ * completion. Caller need not hold sb s_umount semaphore.
  *
  */
-void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages, int sb_locked)
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
 	struct wb_writeback_args args = {
-		.sb = sb,
 		.sync_mode = WB_SYNC_NONE,
 		.nr_pages = nr_pages,
 		.range_cyclic = 1,
-		.sb_pinned = sb_locked,
 	};
 
-	/*
-	 * We treat @nr_pages=0 as the special case to do background writeback,
-	 * ie. to sync pages until the background dirty threshold is reached.
-	 */
-	if (!nr_pages) {
-		args.nr_pages = LONG_MAX;
-		args.for_background = 1;
-	}
+	bdi_alloc_queue_work(bdi, &args);
+}
 
-	bdi_alloc_queue_work(bdi, &args, sb_locked);
+/**
+ * bdi_start_background_writeback - start background writeback
+ * @bdi: the backing device to write from
+ *
+ * Description:
+ * This does WB_SYNC_NONE background writeback. The IO is only
+ * started when this function returns, we make no guarentees on
+ * completion. Caller need not hold sb s_umount semaphore.
+ */
+void bdi_start_background_writeback(struct backing_dev_info *bdi)
+{
+	struct wb_writeback_args args = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_pages = LONG_MAX,
+		.for_background = 1,
+		.range_cyclic = 1,
+	};
+	bdi_alloc_queue_work(bdi, &args);
 }
 
 /*
@@ -572,48 +526,30 @@ select_queue:
 	return ret;
 }
 
-static void unpin_sb_for_writeback(struct super_block *sb)
-{
-	up_read(&sb->s_umount);
-	put_super(sb);
-}
-
-enum sb_pin_state {
-	SB_PINNED,
-	SB_NOT_PINNED,
-	SB_PIN_FAILED
-};
-
 /*
- * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * For background writeback the caller does not have the sb pinned
  * before calling writeback. So make sure that we do pin it, so it doesn't
  * go away while we are writing inodes from it.
  */
-static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
-					      struct super_block *sb)
+static bool pin_sb_for_writeback(struct super_block *sb)
 {
-	/*
-	 * Caller must already hold the ref for this
-	 */
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
-		WARN_ON(!rwsem_is_locked(&sb->s_umount));
-		return SB_NOT_PINNED;
-	}
 	spin_lock(&sb_lock);
+	if (list_empty(&sb->s_instances)) {
+		spin_unlock(&sb_lock);
+		return false;
+	}
+
 	sb->s_count++;
+	spin_unlock(&sb_lock);
+
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root) {
-			spin_unlock(&sb_lock);
-			return SB_PINNED;
-		}
-		/*
-		 * umounted, drop rwsem again and fall through to failure
-		 */
+		if (sb->s_root)
+			return true;
 		up_read(&sb->s_umount);
 	}
-	sb->s_count--;
-	spin_unlock(&sb_lock);
-	return SB_PIN_FAILED;
+
+	put_super(sb);
+	return false;
 }
 
 /*
@@ -692,24 +628,31 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		struct inode *inode = list_entry(wb->b_io.prev,
 						 struct inode, i_list);
 		struct super_block *sb = inode->i_sb;
-		enum sb_pin_state state;
 
-		if (wbc->sb && sb != wbc->sb) {
-			/* super block given and doesn't
-			   match, skip this inode */
-			redirty_tail(inode);
-			continue;
-		}
-		state = pin_sb_for_writeback(wbc, sb);
+		if (wbc->sb) {
+			/*
+			 * We are requested to write out inodes for a specific
+			 * superblock. This means we already have s_umount
+			 * taken by the caller which also waits for us to
+			 * complete the writeout.
+			 */
+			if (sb != wbc->sb) {
+				redirty_tail(inode);
+				continue;
+			}
 
-		if (state == SB_PIN_FAILED) {
-			requeue_io(inode);
-			continue;
+			WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+			ret = writeback_sb_inodes(sb, wb, wbc);
+		} else {
+			if (!pin_sb_for_writeback(sb)) {
+				requeue_io(inode);
+				continue;
+			}
+			ret = writeback_sb_inodes(sb, wb, wbc);
+			drop_super(sb);
 		}
-		ret = writeback_sb_inodes(sb, wb, wbc);
 
-		if (state == SB_PINNED)
-			unpin_sb_for_writeback(sb);
 		if (ret)
 			break;
 	}
@@ -769,7 +712,6 @@ static long wb_writeback(struct bdi_writeback *wb,
 		.for_kupdate = args->for_kupdate,
 		.for_background = args->for_background,
 		.range_cyclic = args->range_cyclic,
-		.sb_pinned = args->sb_pinned,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -912,7 +854,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 	while ((work = get_next_work_item(bdi, wb)) != NULL) {
 		struct wb_writeback_args args = work->args;
-		int post_clear;
 
 		/*
 		 * Override sync mode, in case we must wait for completion
@@ -920,13 +861,11 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		if (force_wait)
 			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
-		post_clear = WB_SYNC_ALL || args.sb_pinned;
-
 		/*
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
 		 */
-		if (!post_clear)
+		if (!test_bit(WS_ONSTACK, &work->state))
 			wb_clear_pending(wb, work);
 
 		wrote += wb_writeback(wb, &args);
@@ -935,7 +874,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		 * This is a data integrity writeback, so only do the
 		 * notification when we have completed the work.
 		 */
-		if (post_clear)
+		if (test_bit(WS_ONSTACK, &work->state))
 			wb_clear_pending(wb, work);
 	}
 
@@ -993,42 +932,32 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 }
 
 /*
- * Schedule writeback for all backing devices. This does WB_SYNC_NONE
- * writeback, for integrity writeback see bdi_sync_writeback().
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
  */
-static void bdi_writeback_all(struct super_block *sb, long nr_pages)
+void wakeup_flusher_threads(long nr_pages)
 {
+	struct backing_dev_info *bdi;
 	struct wb_writeback_args args = {
-		.sb = sb,
-		.nr_pages = nr_pages,
 		.sync_mode = WB_SYNC_NONE,
 	};
-	struct backing_dev_info *bdi;
 
-	rcu_read_lock();
+	if (nr_pages) {
+		args.nr_pages = nr_pages;
+	} else {
+		args.nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	}
 
+	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-
-		bdi_alloc_queue_work(bdi, &args, 0);
+		bdi_alloc_queue_work(bdi, &args);
 	}
-
 	rcu_read_unlock();
 }
 
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world.
- */
-void wakeup_flusher_threads(long nr_pages)
-{
-	if (nr_pages == 0)
-		nr_pages = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-	bdi_writeback_all(NULL, nr_pages);
-}
-
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1220,18 +1149,6 @@ static void wait_sb_inodes(struct super_block *sb)
 		iput(old_inode);
 }
 
-static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
-{
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
-
-	nr_to_write = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
-}
-
 /**
  * writeback_inodes_sb - writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1243,21 +1160,21 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-	__writeback_inodes_sb(sb, 0);
-}
-EXPORT_SYMBOL(writeback_inodes_sb);
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	struct wb_writeback_args args = {
+		.sb = sb,
+		.sync_mode = WB_SYNC_NONE,
+	};
 
-/**
- * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
- * @sb: the superblock
- *
- * Like writeback_inodes_sb(), except the caller already holds the
- * sb umount sem.
- */
-void writeback_inodes_sb_locked(struct super_block *sb)
-{
-	__writeback_inodes_sb(sb, 1);
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	args.nr_pages = nr_dirty + nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	bdi_queue_work_onstack(&args);
 }
+EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
  * writeback_inodes_sb_if_idle - start writeback if none underway
@@ -1269,7 +1186,9 @@ void writeback_inodes_sb_locked(struct super_block *sb)
 int writeback_inodes_sb_if_idle(struct super_block *sb)
 {
 	if (!writeback_in_progress(sb->s_bdi)) {
+		down_read(&sb->s_umount);
 		writeback_inodes_sb(sb);
+		up_read(&sb->s_umount);
 		return 1;
 	} else
 		return 0;
@@ -1285,7 +1204,16 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-	bdi_sync_writeback(sb->s_bdi, sb);
+	struct wb_writeback_args args = {
+		.sb = sb,
+		.sync_mode = WB_SYNC_ALL,
+		.nr_pages = LONG_MAX,
+		.range_cyclic = 0,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	bdi_queue_work_onstack(&args);
 	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
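
For reference, a minimal sketch of how call sites outside this file adapt to the interfaces changed above: the old bdi_start_writeback(bdi, sb, nr_pages, sb_locked) and writeback_inodes_sb_locked() entry points are gone, background writeback gets its own helper, and writeback_inodes_sb() now expects s_umount held for reading. The caller function and its variables below are hypothetical and the snippet only builds inside a kernel tree of this vintage; it is not part of the commit.

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>

/* Hypothetical caller illustrating the API change in this commit. */
static void example_kick_writeback(struct backing_dev_info *bdi,
				   struct super_block *sb, long nr_pages)
{
	/*
	 * Before this commit, bdi_start_writeback(bdi, sb, nr_pages, 0) with
	 * nr_pages == 0 meant "background writeback". The two cases are now
	 * separate entry points.
	 */
	if (nr_pages)
		bdi_start_writeback(bdi, nr_pages);
	else
		bdi_start_background_writeback(bdi);

	/*
	 * Per-sb writeback now requires the caller to hold s_umount for
	 * reading; writeback_inodes_sb_locked() no longer exists.
	 */
	down_read(&sb->s_umount);
	writeback_inodes_sb(sb);
	up_read(&sb->s_umount);
}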