diff options
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r-- | fs/fs-writeback.c | 254 |
1 files changed, 107 insertions, 147 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1d1088f48bc2..0609607d3955 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -63,24 +63,16 @@ struct bdi_work { | |||
63 | }; | 63 | }; |
64 | 64 | ||
65 | enum { | 65 | enum { |
66 | WS_USED_B = 0, | 66 | WS_INPROGRESS = 0, |
67 | WS_ONSTACK_B, | 67 | WS_ONSTACK, |
68 | }; | 68 | }; |
69 | 69 | ||
70 | #define WS_USED (1 << WS_USED_B) | ||
71 | #define WS_ONSTACK (1 << WS_ONSTACK_B) | ||
72 | |||
73 | static inline bool bdi_work_on_stack(struct bdi_work *work) | ||
74 | { | ||
75 | return test_bit(WS_ONSTACK_B, &work->state); | ||
76 | } | ||
77 | |||
78 | static inline void bdi_work_init(struct bdi_work *work, | 70 | static inline void bdi_work_init(struct bdi_work *work, |
79 | struct wb_writeback_args *args) | 71 | struct wb_writeback_args *args) |
80 | { | 72 | { |
81 | INIT_RCU_HEAD(&work->rcu_head); | 73 | INIT_RCU_HEAD(&work->rcu_head); |
82 | work->args = *args; | 74 | work->args = *args; |
83 | work->state = WS_USED; | 75 | __set_bit(WS_INPROGRESS, &work->state); |
84 | } | 76 | } |
85 | 77 | ||
86 | /** | 78 | /** |
@@ -95,43 +87,16 @@ int writeback_in_progress(struct backing_dev_info *bdi) | |||
95 | return !list_empty(&bdi->work_list); | 87 | return !list_empty(&bdi->work_list); |
96 | } | 88 | } |
97 | 89 | ||
98 | static void bdi_work_clear(struct bdi_work *work) | ||
99 | { | ||
100 | clear_bit(WS_USED_B, &work->state); | ||
101 | smp_mb__after_clear_bit(); | ||
102 | /* | ||
103 | * work can have disappeared at this point. bit waitq functions | ||
104 | * should be able to tolerate this, provided bdi_sched_wait does | ||
105 | * not dereference its pointer argument. | ||
106 | */ | ||
107 | wake_up_bit(&work->state, WS_USED_B); | ||
108 | } | ||
109 | |||
110 | static void bdi_work_free(struct rcu_head *head) | 90 | static void bdi_work_free(struct rcu_head *head) |
111 | { | 91 | { |
112 | struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); | 92 | struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); |
113 | 93 | ||
114 | if (!bdi_work_on_stack(work)) | 94 | clear_bit(WS_INPROGRESS, &work->state); |
115 | kfree(work); | 95 | smp_mb__after_clear_bit(); |
116 | else | 96 | wake_up_bit(&work->state, WS_INPROGRESS); |
117 | bdi_work_clear(work); | ||
118 | } | ||
119 | |||
120 | static void wb_work_complete(struct bdi_work *work) | ||
121 | { | ||
122 | const enum writeback_sync_modes sync_mode = work->args.sync_mode; | ||
123 | int onstack = bdi_work_on_stack(work); | ||
124 | 97 | ||
125 | /* | 98 | if (!test_bit(WS_ONSTACK, &work->state)) |
126 | * For allocated work, we can clear the done/seen bit right here. | 99 | kfree(work); |
127 | * For on-stack work, we need to postpone both the clear and free | ||
128 | * to after the RCU grace period, since the stack could be invalidated | ||
129 | * as soon as bdi_work_clear() has done the wakeup. | ||
130 | */ | ||
131 | if (!onstack) | ||
132 | bdi_work_clear(work); | ||
133 | if (sync_mode == WB_SYNC_NONE || onstack) | ||
134 | call_rcu(&work->rcu_head, bdi_work_free); | ||
135 | } | 100 | } |
136 | 101 | ||
137 | static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) | 102 | static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) |
@@ -147,7 +112,7 @@ static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) | |||
147 | list_del_rcu(&work->list); | 112 | list_del_rcu(&work->list); |
148 | spin_unlock(&bdi->wb_lock); | 113 | spin_unlock(&bdi->wb_lock); |
149 | 114 | ||
150 | wb_work_complete(work); | 115 | call_rcu(&work->rcu_head, bdi_work_free); |
151 | } | 116 | } |
152 | } | 117 | } |
153 | 118 | ||
@@ -185,9 +150,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) | |||
185 | * Used for on-stack allocated work items. The caller needs to wait until | 150 | * Used for on-stack allocated work items. The caller needs to wait until |
186 | * the wb threads have acked the work before it's safe to continue. | 151 | * the wb threads have acked the work before it's safe to continue. |
187 | */ | 152 | */ |
188 | static void bdi_wait_on_work_clear(struct bdi_work *work) | 153 | static void bdi_wait_on_work_done(struct bdi_work *work) |
189 | { | 154 | { |
190 | wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, | 155 | wait_on_bit(&work->state, WS_INPROGRESS, bdi_sched_wait, |
191 | TASK_UNINTERRUPTIBLE); | 156 | TASK_UNINTERRUPTIBLE); |
192 | } | 157 | } |
193 | 158 | ||
@@ -213,37 +178,28 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, | |||
213 | } | 178 | } |
214 | 179 | ||
215 | /** | 180 | /** |
216 | * bdi_sync_writeback - start and wait for writeback | 181 | * bdi_queue_work_onstack - start and wait for writeback |
217 | * @bdi: the backing device to write from | ||
218 | * @sb: write inodes from this super_block | 182 | * @sb: write inodes from this super_block |
219 | * | 183 | * |
220 | * Description: | 184 | * Description: |
221 | * This does WB_SYNC_ALL data integrity writeback and waits for the | 185 | * This function initiates writeback and waits for the operation to |
222 | * IO to complete. Callers must hold the sb s_umount semaphore for | 186 | * complete. Callers must hold the sb s_umount semaphore for |
223 | * reading, to avoid having the super disappear before we are done. | 187 | * reading, to avoid having the super disappear before we are done. |
224 | */ | 188 | */ |
225 | static void bdi_sync_writeback(struct backing_dev_info *bdi, | 189 | static void bdi_queue_work_onstack(struct wb_writeback_args *args) |
226 | struct super_block *sb) | ||
227 | { | 190 | { |
228 | struct wb_writeback_args args = { | ||
229 | .sb = sb, | ||
230 | .sync_mode = WB_SYNC_ALL, | ||
231 | .nr_pages = LONG_MAX, | ||
232 | .range_cyclic = 0, | ||
233 | }; | ||
234 | struct bdi_work work; | 191 | struct bdi_work work; |
235 | 192 | ||
236 | bdi_work_init(&work, &args); | 193 | bdi_work_init(&work, args); |
237 | work.state |= WS_ONSTACK; | 194 | __set_bit(WS_ONSTACK, &work.state); |
238 | 195 | ||
239 | bdi_queue_work(bdi, &work); | 196 | bdi_queue_work(args->sb->s_bdi, &work); |
240 | bdi_wait_on_work_clear(&work); | 197 | bdi_wait_on_work_done(&work); |
241 | } | 198 | } |
242 | 199 | ||
243 | /** | 200 | /** |
244 | * bdi_start_writeback - start writeback | 201 | * bdi_start_writeback - start writeback |
245 | * @bdi: the backing device to write from | 202 | * @bdi: the backing device to write from |
246 | * @sb: write inodes from this super_block | ||
247 | * @nr_pages: the number of pages to write | 203 | * @nr_pages: the number of pages to write |
248 | * | 204 | * |
249 | * Description: | 205 | * Description: |
@@ -252,25 +208,34 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, | |||
252 | * completion. Caller need not hold sb s_umount semaphore. | 208 | * completion. Caller need not hold sb s_umount semaphore. |
253 | * | 209 | * |
254 | */ | 210 | */ |
255 | void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, | 211 | void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) |
256 | long nr_pages) | ||
257 | { | 212 | { |
258 | struct wb_writeback_args args = { | 213 | struct wb_writeback_args args = { |
259 | .sb = sb, | ||
260 | .sync_mode = WB_SYNC_NONE, | 214 | .sync_mode = WB_SYNC_NONE, |
261 | .nr_pages = nr_pages, | 215 | .nr_pages = nr_pages, |
262 | .range_cyclic = 1, | 216 | .range_cyclic = 1, |
263 | }; | 217 | }; |
264 | 218 | ||
265 | /* | 219 | bdi_alloc_queue_work(bdi, &args); |
266 | * We treat @nr_pages=0 as the special case to do background writeback, | 220 | } |
267 | * ie. to sync pages until the background dirty threshold is reached. | ||
268 | */ | ||
269 | if (!nr_pages) { | ||
270 | args.nr_pages = LONG_MAX; | ||
271 | args.for_background = 1; | ||
272 | } | ||
273 | 221 | ||
222 | /** | ||
223 | * bdi_start_background_writeback - start background writeback | ||
224 | * @bdi: the backing device to write from | ||
225 | * | ||
226 | * Description: | ||
227 | * This does WB_SYNC_NONE background writeback. The IO is only | ||
228 | * started when this function returns, we make no guarantees on | ||
229 | * completion. Caller need not hold sb s_umount semaphore. | ||
230 | */ | ||
231 | void bdi_start_background_writeback(struct backing_dev_info *bdi) | ||
232 | { | ||
233 | struct wb_writeback_args args = { | ||
234 | .sync_mode = WB_SYNC_NONE, | ||
235 | .nr_pages = LONG_MAX, | ||
236 | .for_background = 1, | ||
237 | .range_cyclic = 1, | ||
238 | }; | ||
274 | bdi_alloc_queue_work(bdi, &args); | 239 | bdi_alloc_queue_work(bdi, &args); |
275 | } | 240 | } |
276 | 241 | ||
@@ -561,48 +526,30 @@ select_queue: | |||
561 | return ret; | 526 | return ret; |
562 | } | 527 | } |
563 | 528 | ||
564 | static void unpin_sb_for_writeback(struct super_block *sb) | ||
565 | { | ||
566 | up_read(&sb->s_umount); | ||
567 | put_super(sb); | ||
568 | } | ||
569 | |||
570 | enum sb_pin_state { | ||
571 | SB_PINNED, | ||
572 | SB_NOT_PINNED, | ||
573 | SB_PIN_FAILED | ||
574 | }; | ||
575 | |||
576 | /* | 529 | /* |
577 | * For WB_SYNC_NONE writeback, the caller does not have the sb pinned | 530 | * For background writeback the caller does not have the sb pinned |
578 | * before calling writeback. So make sure that we do pin it, so it doesn't | 531 | * before calling writeback. So make sure that we do pin it, so it doesn't |
579 | * go away while we are writing inodes from it. | 532 | * go away while we are writing inodes from it. |
580 | */ | 533 | */ |
581 | static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, | 534 | static bool pin_sb_for_writeback(struct super_block *sb) |
582 | struct super_block *sb) | ||
583 | { | 535 | { |
584 | /* | ||
585 | * Caller must already hold the ref for this | ||
586 | */ | ||
587 | if (wbc->sync_mode == WB_SYNC_ALL) { | ||
588 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | ||
589 | return SB_NOT_PINNED; | ||
590 | } | ||
591 | spin_lock(&sb_lock); | 536 | spin_lock(&sb_lock); |
537 | if (list_empty(&sb->s_instances)) { | ||
538 | spin_unlock(&sb_lock); | ||
539 | return false; | ||
540 | } | ||
541 | |||
592 | sb->s_count++; | 542 | sb->s_count++; |
543 | spin_unlock(&sb_lock); | ||
544 | |||
593 | if (down_read_trylock(&sb->s_umount)) { | 545 | if (down_read_trylock(&sb->s_umount)) { |
594 | if (sb->s_root) { | 546 | if (sb->s_root) |
595 | spin_unlock(&sb_lock); | 547 | return true; |
596 | return SB_PINNED; | ||
597 | } | ||
598 | /* | ||
599 | * umounted, drop rwsem again and fall through to failure | ||
600 | */ | ||
601 | up_read(&sb->s_umount); | 548 | up_read(&sb->s_umount); |
602 | } | 549 | } |
603 | sb->s_count--; | 550 | |
604 | spin_unlock(&sb_lock); | 551 | put_super(sb); |
605 | return SB_PIN_FAILED; | 552 | return false; |
606 | } | 553 | } |
607 | 554 | ||
608 | /* | 555 | /* |
@@ -681,24 +628,31 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, | |||
681 | struct inode *inode = list_entry(wb->b_io.prev, | 628 | struct inode *inode = list_entry(wb->b_io.prev, |
682 | struct inode, i_list); | 629 | struct inode, i_list); |
683 | struct super_block *sb = inode->i_sb; | 630 | struct super_block *sb = inode->i_sb; |
684 | enum sb_pin_state state; | ||
685 | 631 | ||
686 | if (wbc->sb && sb != wbc->sb) { | 632 | if (wbc->sb) { |
687 | /* super block given and doesn't | 633 | /* |
688 | match, skip this inode */ | 634 | * We are requested to write out inodes for a specific |
689 | redirty_tail(inode); | 635 | * superblock. This means we already have s_umount |
690 | continue; | 636 | * taken by the caller which also waits for us to |
691 | } | 637 | * complete the writeout. |
692 | state = pin_sb_for_writeback(wbc, sb); | 638 | */ |
639 | if (sb != wbc->sb) { | ||
640 | redirty_tail(inode); | ||
641 | continue; | ||
642 | } | ||
693 | 643 | ||
694 | if (state == SB_PIN_FAILED) { | 644 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
695 | requeue_io(inode); | 645 | |
696 | continue; | 646 | ret = writeback_sb_inodes(sb, wb, wbc); |
647 | } else { | ||
648 | if (!pin_sb_for_writeback(sb)) { | ||
649 | requeue_io(inode); | ||
650 | continue; | ||
651 | } | ||
652 | ret = writeback_sb_inodes(sb, wb, wbc); | ||
653 | drop_super(sb); | ||
697 | } | 654 | } |
698 | ret = writeback_sb_inodes(sb, wb, wbc); | ||
699 | 655 | ||
700 | if (state == SB_PINNED) | ||
701 | unpin_sb_for_writeback(sb); | ||
702 | if (ret) | 656 | if (ret) |
703 | break; | 657 | break; |
704 | } | 658 | } |
@@ -911,7 +865,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) | |||
911 | * If this isn't a data integrity operation, just notify | 865 | * If this isn't a data integrity operation, just notify |
912 | * that we have seen this work and we are now starting it. | 866 | * that we have seen this work and we are now starting it. |
913 | */ | 867 | */ |
914 | if (args.sync_mode == WB_SYNC_NONE) | 868 | if (!test_bit(WS_ONSTACK, &work->state)) |
915 | wb_clear_pending(wb, work); | 869 | wb_clear_pending(wb, work); |
916 | 870 | ||
917 | wrote += wb_writeback(wb, &args); | 871 | wrote += wb_writeback(wb, &args); |
@@ -920,7 +874,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) | |||
920 | * This is a data integrity writeback, so only do the | 874 | * This is a data integrity writeback, so only do the |
921 | * notification when we have completed the work. | 875 | * notification when we have completed the work. |
922 | */ | 876 | */ |
923 | if (args.sync_mode == WB_SYNC_ALL) | 877 | if (test_bit(WS_ONSTACK, &work->state)) |
924 | wb_clear_pending(wb, work); | 878 | wb_clear_pending(wb, work); |
925 | } | 879 | } |
926 | 880 | ||
@@ -978,42 +932,32 @@ int bdi_writeback_task(struct bdi_writeback *wb) | |||
978 | } | 932 | } |
979 | 933 | ||
980 | /* | 934 | /* |
981 | * Schedule writeback for all backing devices. This does WB_SYNC_NONE | 935 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back |
982 | * writeback, for integrity writeback see bdi_sync_writeback(). | 936 | * the whole world. |
983 | */ | 937 | */ |
984 | static void bdi_writeback_all(struct super_block *sb, long nr_pages) | 938 | void wakeup_flusher_threads(long nr_pages) |
985 | { | 939 | { |
940 | struct backing_dev_info *bdi; | ||
986 | struct wb_writeback_args args = { | 941 | struct wb_writeback_args args = { |
987 | .sb = sb, | ||
988 | .nr_pages = nr_pages, | ||
989 | .sync_mode = WB_SYNC_NONE, | 942 | .sync_mode = WB_SYNC_NONE, |
990 | }; | 943 | }; |
991 | struct backing_dev_info *bdi; | ||
992 | 944 | ||
993 | rcu_read_lock(); | 945 | if (nr_pages) { |
946 | args.nr_pages = nr_pages; | ||
947 | } else { | ||
948 | args.nr_pages = global_page_state(NR_FILE_DIRTY) + | ||
949 | global_page_state(NR_UNSTABLE_NFS); | ||
950 | } | ||
994 | 951 | ||
952 | rcu_read_lock(); | ||
995 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { | 953 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { |
996 | if (!bdi_has_dirty_io(bdi)) | 954 | if (!bdi_has_dirty_io(bdi)) |
997 | continue; | 955 | continue; |
998 | |||
999 | bdi_alloc_queue_work(bdi, &args); | 956 | bdi_alloc_queue_work(bdi, &args); |
1000 | } | 957 | } |
1001 | |||
1002 | rcu_read_unlock(); | 958 | rcu_read_unlock(); |
1003 | } | 959 | } |
1004 | 960 | ||
1005 | /* | ||
1006 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back | ||
1007 | * the whole world. | ||
1008 | */ | ||
1009 | void wakeup_flusher_threads(long nr_pages) | ||
1010 | { | ||
1011 | if (nr_pages == 0) | ||
1012 | nr_pages = global_page_state(NR_FILE_DIRTY) + | ||
1013 | global_page_state(NR_UNSTABLE_NFS); | ||
1014 | bdi_writeback_all(NULL, nr_pages); | ||
1015 | } | ||
1016 | |||
1017 | static noinline void block_dump___mark_inode_dirty(struct inode *inode) | 961 | static noinline void block_dump___mark_inode_dirty(struct inode *inode) |
1018 | { | 962 | { |
1019 | if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { | 963 | if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { |
@@ -1218,12 +1162,17 @@ void writeback_inodes_sb(struct super_block *sb) | |||
1218 | { | 1162 | { |
1219 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); | 1163 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); |
1220 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); | 1164 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); |
1221 | long nr_to_write; | 1165 | struct wb_writeback_args args = { |
1166 | .sb = sb, | ||
1167 | .sync_mode = WB_SYNC_NONE, | ||
1168 | }; | ||
1169 | |||
1170 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | ||
1222 | 1171 | ||
1223 | nr_to_write = nr_dirty + nr_unstable + | 1172 | args.nr_pages = nr_dirty + nr_unstable + |
1224 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 1173 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
1225 | 1174 | ||
1226 | bdi_start_writeback(sb->s_bdi, sb, nr_to_write); | 1175 | bdi_queue_work_onstack(&args); |
1227 | } | 1176 | } |
1228 | EXPORT_SYMBOL(writeback_inodes_sb); | 1177 | EXPORT_SYMBOL(writeback_inodes_sb); |
1229 | 1178 | ||
@@ -1237,7 +1186,9 @@ EXPORT_SYMBOL(writeback_inodes_sb); | |||
1237 | int writeback_inodes_sb_if_idle(struct super_block *sb) | 1186 | int writeback_inodes_sb_if_idle(struct super_block *sb) |
1238 | { | 1187 | { |
1239 | if (!writeback_in_progress(sb->s_bdi)) { | 1188 | if (!writeback_in_progress(sb->s_bdi)) { |
1189 | down_read(&sb->s_umount); | ||
1240 | writeback_inodes_sb(sb); | 1190 | writeback_inodes_sb(sb); |
1191 | up_read(&sb->s_umount); | ||
1241 | return 1; | 1192 | return 1; |
1242 | } else | 1193 | } else |
1243 | return 0; | 1194 | return 0; |
@@ -1253,7 +1204,16 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); | |||
1253 | */ | 1204 | */ |
1254 | void sync_inodes_sb(struct super_block *sb) | 1205 | void sync_inodes_sb(struct super_block *sb) |
1255 | { | 1206 | { |
1256 | bdi_sync_writeback(sb->s_bdi, sb); | 1207 | struct wb_writeback_args args = { |
1208 | .sb = sb, | ||
1209 | .sync_mode = WB_SYNC_ALL, | ||
1210 | .nr_pages = LONG_MAX, | ||
1211 | .range_cyclic = 0, | ||
1212 | }; | ||
1213 | |||
1214 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | ||
1215 | |||
1216 | bdi_queue_work_onstack(&args); | ||
1257 | wait_sb_inodes(sb); | 1217 | wait_sb_inodes(sb); |
1258 | } | 1218 | } |
1259 | EXPORT_SYMBOL(sync_inodes_sb); | 1219 | EXPORT_SYMBOL(sync_inodes_sb); |