Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--  fs/fs-writeback.c | 1197
1 file changed, 843 insertions(+), 354 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..9d5360c4c2af 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,171 +19,257 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/kthread.h>
23#include <linux/freezer.h>
22#include <linux/writeback.h> 24#include <linux/writeback.h>
23#include <linux/blkdev.h> 25#include <linux/blkdev.h>
24#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
25#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
26#include "internal.h" 28#include "internal.h"
27 29
30#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
28 31
29/** 32/*
30 * writeback_acquire - attempt to get exclusive writeback access to a device 33 * We don't actually have pdflush, but this one is exported through /proc...
31 * @bdi: the device's backing_dev_info structure
32 *
33 * It is a waste of resources to have more than one pdflush thread blocked on
34 * a single request queue. Exclusion at the request_queue level is obtained
35 * via a flag in the request_queue's backing_dev_info.state.
36 *
37 * Non-request_queue-backed address_spaces will share default_backing_dev_info,
38 * unless they implement their own. Which is somewhat inefficient, as this
39 * may prevent concurrent writeback against multiple devices.
40 */ 34 */
41static int writeback_acquire(struct backing_dev_info *bdi) 35int nr_pdflush_threads;
36
37/*
38 * Passed into wb_writeback(), essentially a subset of writeback_control
39 */
40struct wb_writeback_args {
41 long nr_pages;
42 struct super_block *sb;
43 enum writeback_sync_modes sync_mode;
44 int for_kupdate:1;
45 int range_cyclic:1;
46 int for_background:1;
47};
48
49/*
50 * Work items for the bdi_writeback threads
51 */
52struct bdi_work {
53 struct list_head list; /* pending work list */
54 struct rcu_head rcu_head; /* for RCU free/clear of work */
55
56 unsigned long seen; /* threads that have seen this work */
57 atomic_t pending; /* number of threads still to do work */
58
59 struct wb_writeback_args args; /* writeback arguments */
60
61 unsigned long state; /* flag bits, see WS_* */
62};
63
64enum {
65 WS_USED_B = 0,
66 WS_ONSTACK_B,
67};
68
69#define WS_USED (1 << WS_USED_B)
70#define WS_ONSTACK (1 << WS_ONSTACK_B)
71
72static inline bool bdi_work_on_stack(struct bdi_work *work)
42{ 73{
43 return !test_and_set_bit(BDI_pdflush, &bdi->state); 74 return test_bit(WS_ONSTACK_B, &work->state);
75}
76
77static inline void bdi_work_init(struct bdi_work *work,
78 struct wb_writeback_args *args)
79{
80 INIT_RCU_HEAD(&work->rcu_head);
81 work->args = *args;
82 work->state = WS_USED;
44} 83}
45 84
46/** 85/**
47 * writeback_in_progress - determine whether there is writeback in progress 86 * writeback_in_progress - determine whether there is writeback in progress
48 * @bdi: the device's backing_dev_info structure. 87 * @bdi: the device's backing_dev_info structure.
49 * 88 *
50 * Determine whether there is writeback in progress against a backing device. 89 * Determine whether there is writeback waiting to be handled against a
90 * backing device.
51 */ 91 */
52int writeback_in_progress(struct backing_dev_info *bdi) 92int writeback_in_progress(struct backing_dev_info *bdi)
53{ 93{
54 return test_bit(BDI_pdflush, &bdi->state); 94 return !list_empty(&bdi->work_list);
55} 95}
56 96
57/** 97static void bdi_work_clear(struct bdi_work *work)
58 * writeback_release - relinquish exclusive writeback access against a device.
59 * @bdi: the device's backing_dev_info structure
60 */
61static void writeback_release(struct backing_dev_info *bdi)
62{ 98{
63 BUG_ON(!writeback_in_progress(bdi)); 99 clear_bit(WS_USED_B, &work->state);
64 clear_bit(BDI_pdflush, &bdi->state); 100 smp_mb__after_clear_bit();
101 /*
102 * work can have disappeared at this point. bit waitq functions
103 * should be able to tolerate this, provided bdi_sched_wait does
 104 * not dereference its pointer argument.
105 */
106 wake_up_bit(&work->state, WS_USED_B);
65} 107}
66 108
67static noinline void block_dump___mark_inode_dirty(struct inode *inode) 109static void bdi_work_free(struct rcu_head *head)
68{ 110{
69 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 111 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
70 struct dentry *dentry;
71 const char *name = "?";
72 112
73 dentry = d_find_alias(inode); 113 if (!bdi_work_on_stack(work))
74 if (dentry) { 114 kfree(work);
75 spin_lock(&dentry->d_lock); 115 else
76 name = (const char *) dentry->d_name.name; 116 bdi_work_clear(work);
77 }
78 printk(KERN_DEBUG
79 "%s(%d): dirtied inode %lu (%s) on %s\n",
80 current->comm, task_pid_nr(current), inode->i_ino,
81 name, inode->i_sb->s_id);
82 if (dentry) {
83 spin_unlock(&dentry->d_lock);
84 dput(dentry);
85 }
86 }
87} 117}
88 118
89/** 119static void wb_work_complete(struct bdi_work *work)
90 * __mark_inode_dirty - internal function
91 * @inode: inode to mark
92 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
93 * Mark an inode as dirty. Callers should use mark_inode_dirty or
94 * mark_inode_dirty_sync.
95 *
96 * Put the inode on the super block's dirty list.
97 *
98 * CAREFUL! We mark it dirty unconditionally, but move it onto the
99 * dirty list only if it is hashed or if it refers to a blockdev.
100 * If it was not hashed, it will never be added to the dirty list
101 * even if it is later hashed, as it will have been marked dirty already.
102 *
103 * In short, make sure you hash any inodes _before_ you start marking
104 * them dirty.
105 *
106 * This function *must* be atomic for the I_DIRTY_PAGES case -
107 * set_page_dirty() is called under spinlock in several places.
108 *
109 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
110 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
111 * the kernel-internal blockdev inode represents the dirtying time of the
112 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
113 * page->mapping->host, so the page-dirtying time is recorded in the internal
114 * blockdev inode.
115 */
116void __mark_inode_dirty(struct inode *inode, int flags)
117{ 120{
118 struct super_block *sb = inode->i_sb; 121 const enum writeback_sync_modes sync_mode = work->args.sync_mode;
122 int onstack = bdi_work_on_stack(work);
119 123
120 /* 124 /*
121 * Don't do this for I_DIRTY_PAGES - that doesn't actually 125 * For allocated work, we can clear the done/seen bit right here.
122 * dirty the inode itself 126 * For on-stack work, we need to postpone both the clear and free
127 * to after the RCU grace period, since the stack could be invalidated
128 * as soon as bdi_work_clear() has done the wakeup.
123 */ 129 */
124 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 130 if (!onstack)
125 if (sb->s_op->dirty_inode) 131 bdi_work_clear(work);
126 sb->s_op->dirty_inode(inode); 132 if (sync_mode == WB_SYNC_NONE || onstack)
127 } 133 call_rcu(&work->rcu_head, bdi_work_free);
134}
128 135
136static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
137{
129 /* 138 /*
130 * make sure that changes are seen by all cpus before we test i_state 139 * The caller has retrieved the work arguments from this work,
131 * -- mikulas 140 * drop our reference. If this is the last ref, delete and free it
132 */ 141 */
133 smp_mb(); 142 if (atomic_dec_and_test(&work->pending)) {
143 struct backing_dev_info *bdi = wb->bdi;
134 144
135 /* avoid the locking if we can */ 145 spin_lock(&bdi->wb_lock);
136 if ((inode->i_state & flags) == flags) 146 list_del_rcu(&work->list);
137 return; 147 spin_unlock(&bdi->wb_lock);
138 148
139 if (unlikely(block_dump)) 149 wb_work_complete(work);
140 block_dump___mark_inode_dirty(inode); 150 }
151}
141 152
142 spin_lock(&inode_lock); 153static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
143 if ((inode->i_state & flags) != flags) { 154{
144 const int was_dirty = inode->i_state & I_DIRTY; 155 work->seen = bdi->wb_mask;
156 BUG_ON(!work->seen);
157 atomic_set(&work->pending, bdi->wb_cnt);
158 BUG_ON(!bdi->wb_cnt);
145 159
146 inode->i_state |= flags; 160 /*
161 * list_add_tail_rcu() contains the necessary barriers to
162 * make sure the above stores are seen before the item is
163 * noticed on the list
164 */
165 spin_lock(&bdi->wb_lock);
166 list_add_tail_rcu(&work->list, &bdi->work_list);
167 spin_unlock(&bdi->wb_lock);
147 168
148 /* 169 /*
149 * If the inode is being synced, just update its dirty state. 170 * If the default thread isn't there, make sure we add it. When
150 * The unlocker will place the inode on the appropriate 171 * it gets created and wakes up, we'll run this work.
151 * superblock list, based upon its state. 172 */
152 */ 173 if (unlikely(list_empty_careful(&bdi->wb_list)))
153 if (inode->i_state & I_SYNC) 174 wake_up_process(default_backing_dev_info.wb.task);
154 goto out; 175 else {
176 struct bdi_writeback *wb = &bdi->wb;
155 177
156 /* 178 if (wb->task)
157 * Only add valid (hashed) inodes to the superblock's 179 wake_up_process(wb->task);
158 * dirty list. Add blockdev inodes as well. 180 }
159 */ 181}
160 if (!S_ISBLK(inode->i_mode)) {
161 if (hlist_unhashed(&inode->i_hash))
162 goto out;
163 }
164 if (inode->i_state & (I_FREEING|I_CLEAR))
165 goto out;
166 182
167 /* 183/*
168 * If the inode was already on s_dirty/s_io/s_more_io, don't 184 * Used for on-stack allocated work items. The caller needs to wait until
169 * reposition it (that would break s_dirty time-ordering). 185 * the wb threads have acked the work before it's safe to continue.
170 */ 186 */
171 if (!was_dirty) { 187static void bdi_wait_on_work_clear(struct bdi_work *work)
172 inode->dirtied_when = jiffies; 188{
173 list_move(&inode->i_list, &sb->s_dirty); 189 wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
174 } 190 TASK_UNINTERRUPTIBLE);
191}
192
193static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
194 struct wb_writeback_args *args)
195{
196 struct bdi_work *work;
197
198 /*
199 * This is WB_SYNC_NONE writeback, so if allocation fails just
200 * wakeup the thread for old dirty data writeback
201 */
202 work = kmalloc(sizeof(*work), GFP_ATOMIC);
203 if (work) {
204 bdi_work_init(work, args);
205 bdi_queue_work(bdi, work);
206 } else {
207 struct bdi_writeback *wb = &bdi->wb;
208
209 if (wb->task)
210 wake_up_process(wb->task);
175 } 211 }
176out:
177 spin_unlock(&inode_lock);
178} 212}
179 213
180EXPORT_SYMBOL(__mark_inode_dirty); 214/**
215 * bdi_sync_writeback - start and wait for writeback
216 * @bdi: the backing device to write from
217 * @sb: write inodes from this super_block
218 *
219 * Description:
220 * This does WB_SYNC_ALL data integrity writeback and waits for the
221 * IO to complete. Callers must hold the sb s_umount semaphore for
222 * reading, to avoid having the super disappear before we are done.
223 */
224static void bdi_sync_writeback(struct backing_dev_info *bdi,
225 struct super_block *sb)
226{
227 struct wb_writeback_args args = {
228 .sb = sb,
229 .sync_mode = WB_SYNC_ALL,
230 .nr_pages = LONG_MAX,
231 .range_cyclic = 0,
232 };
233 struct bdi_work work;
181 234
182static int write_inode(struct inode *inode, int sync) 235 bdi_work_init(&work, &args);
236 work.state |= WS_ONSTACK;
237
238 bdi_queue_work(bdi, &work);
239 bdi_wait_on_work_clear(&work);
240}
241
242/**
243 * bdi_start_writeback - start writeback
244 * @bdi: the backing device to write from
245 * @nr_pages: the number of pages to write
246 *
247 * Description:
248 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
 249 * started when this function returns, we make no guarantees on
250 * completion. Caller need not hold sb s_umount semaphore.
251 *
252 */
253void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
254 long nr_pages)
183{ 255{
184 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 256 struct wb_writeback_args args = {
185 return inode->i_sb->s_op->write_inode(inode, sync); 257 .sb = sb,
186 return 0; 258 .sync_mode = WB_SYNC_NONE,
259 .nr_pages = nr_pages,
260 .range_cyclic = 1,
261 };
262
263 /*
264 * We treat @nr_pages=0 as the special case to do background writeback,
265 * ie. to sync pages until the background dirty threshold is reached.
266 */
267 if (!nr_pages) {
268 args.nr_pages = LONG_MAX;
269 args.for_background = 1;
270 }
271
272 bdi_alloc_queue_work(bdi, &args);
187} 273}
188 274
189/* 275/*
@@ -191,31 +277,32 @@ static int write_inode(struct inode *inode, int sync)
191 * furthest end of its superblock's dirty-inode list. 277 * furthest end of its superblock's dirty-inode list.
192 * 278 *
193 * Before stamping the inode's ->dirtied_when, we check to see whether it is 279 * Before stamping the inode's ->dirtied_when, we check to see whether it is
194 * already the most-recently-dirtied inode on the s_dirty list. If that is 280 * already the most-recently-dirtied inode on the b_dirty list. If that is
195 * the case then the inode must have been redirtied while it was being written 281 * the case then the inode must have been redirtied while it was being written
196 * out and we don't reset its dirtied_when. 282 * out and we don't reset its dirtied_when.
197 */ 283 */
198static void redirty_tail(struct inode *inode) 284static void redirty_tail(struct inode *inode)
199{ 285{
200 struct super_block *sb = inode->i_sb; 286 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
201 287
202 if (!list_empty(&sb->s_dirty)) { 288 if (!list_empty(&wb->b_dirty)) {
203 struct inode *tail_inode; 289 struct inode *tail;
204 290
205 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); 291 tail = list_entry(wb->b_dirty.next, struct inode, i_list);
206 if (time_before(inode->dirtied_when, 292 if (time_before(inode->dirtied_when, tail->dirtied_when))
207 tail_inode->dirtied_when))
208 inode->dirtied_when = jiffies; 293 inode->dirtied_when = jiffies;
209 } 294 }
210 list_move(&inode->i_list, &sb->s_dirty); 295 list_move(&inode->i_list, &wb->b_dirty);
211} 296}
212 297
213/* 298/*
214 * requeue inode for re-scanning after sb->s_io list is exhausted. 299 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 */ 300 */
216static void requeue_io(struct inode *inode) 301static void requeue_io(struct inode *inode)
217{ 302{
218 list_move(&inode->i_list, &inode->i_sb->s_more_io); 303 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
304
305 list_move(&inode->i_list, &wb->b_more_io);
219} 306}
220 307
221static void inode_sync_complete(struct inode *inode) 308static void inode_sync_complete(struct inode *inode)
@@ -235,7 +322,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
235 * For inodes being constantly redirtied, dirtied_when can get stuck. 322 * For inodes being constantly redirtied, dirtied_when can get stuck.
236 * It _appears_ to be in the future, but is actually in distant past. 323 * It _appears_ to be in the future, but is actually in distant past.
237 * This test is necessary to prevent such wrapped-around relative times 324 * This test is necessary to prevent such wrapped-around relative times
238 * from permanently stopping the whole pdflush writeback. 325 * from permanently stopping the whole bdi writeback.
239 */ 326 */
240 ret = ret && time_before_eq(inode->dirtied_when, jiffies); 327 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
241#endif 328#endif
@@ -249,33 +336,56 @@ static void move_expired_inodes(struct list_head *delaying_queue,
249 struct list_head *dispatch_queue, 336 struct list_head *dispatch_queue,
250 unsigned long *older_than_this) 337 unsigned long *older_than_this)
251{ 338{
339 LIST_HEAD(tmp);
340 struct list_head *pos, *node;
341 struct super_block *sb = NULL;
342 struct inode *inode;
343 int do_sb_sort = 0;
344
252 while (!list_empty(delaying_queue)) { 345 while (!list_empty(delaying_queue)) {
253 struct inode *inode = list_entry(delaying_queue->prev, 346 inode = list_entry(delaying_queue->prev, struct inode, i_list);
254 struct inode, i_list);
255 if (older_than_this && 347 if (older_than_this &&
256 inode_dirtied_after(inode, *older_than_this)) 348 inode_dirtied_after(inode, *older_than_this))
257 break; 349 break;
258 list_move(&inode->i_list, dispatch_queue); 350 if (sb && sb != inode->i_sb)
351 do_sb_sort = 1;
352 sb = inode->i_sb;
353 list_move(&inode->i_list, &tmp);
354 }
355
356 /* just one sb in list, splice to dispatch_queue and we're done */
357 if (!do_sb_sort) {
358 list_splice(&tmp, dispatch_queue);
359 return;
360 }
361
362 /* Move inodes from one superblock together */
363 while (!list_empty(&tmp)) {
364 inode = list_entry(tmp.prev, struct inode, i_list);
365 sb = inode->i_sb;
366 list_for_each_prev_safe(pos, node, &tmp) {
367 inode = list_entry(pos, struct inode, i_list);
368 if (inode->i_sb == sb)
369 list_move(&inode->i_list, dispatch_queue);
370 }
259 } 371 }
260} 372}
261 373
262/* 374/*
263 * Queue all expired dirty inodes for io, eldest first. 375 * Queue all expired dirty inodes for io, eldest first.
264 */ 376 */
265static void queue_io(struct super_block *sb, 377static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
266 unsigned long *older_than_this)
267{ 378{
268 list_splice_init(&sb->s_more_io, sb->s_io.prev); 379 list_splice_init(&wb->b_more_io, wb->b_io.prev);
269 move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); 380 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
270} 381}
271 382
272int sb_has_dirty_inodes(struct super_block *sb) 383static int write_inode(struct inode *inode, int sync)
273{ 384{
274 return !list_empty(&sb->s_dirty) || 385 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
275 !list_empty(&sb->s_io) || 386 return inode->i_sb->s_op->write_inode(inode, sync);
276 !list_empty(&sb->s_more_io); 387 return 0;
277} 388}
278EXPORT_SYMBOL(sb_has_dirty_inodes);
279 389
280/* 390/*
281 * Wait for writeback on an inode to complete. 391 * Wait for writeback on an inode to complete.
@@ -322,11 +432,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
322 if (inode->i_state & I_SYNC) { 432 if (inode->i_state & I_SYNC) {
323 /* 433 /*
324 * If this inode is locked for writeback and we are not doing 434 * If this inode is locked for writeback and we are not doing
325 * writeback-for-data-integrity, move it to s_more_io so that 435 * writeback-for-data-integrity, move it to b_more_io so that
326 * writeback can proceed with the other inodes on s_io. 436 * writeback can proceed with the other inodes on s_io.
327 * 437 *
328 * We'll have another go at writing back this inode when we 438 * We'll have another go at writing back this inode when we
329 * completed a full scan of s_io. 439 * completed a full scan of b_io.
330 */ 440 */
331 if (!wait) { 441 if (!wait) {
332 requeue_io(inode); 442 requeue_io(inode);
@@ -366,16 +476,26 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
366 spin_lock(&inode_lock); 476 spin_lock(&inode_lock);
367 inode->i_state &= ~I_SYNC; 477 inode->i_state &= ~I_SYNC;
368 if (!(inode->i_state & (I_FREEING | I_CLEAR))) { 478 if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
369 if (!(inode->i_state & I_DIRTY) && 479 if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
370 mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 480 /*
481 * More pages get dirtied by a fast dirtier.
482 */
483 goto select_queue;
484 } else if (inode->i_state & I_DIRTY) {
485 /*
486 * At least XFS will redirty the inode during the
487 * writeback (delalloc) and on io completion (isize).
488 */
489 redirty_tail(inode);
490 } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
371 /* 491 /*
372 * We didn't write back all the pages. nfs_writepages() 492 * We didn't write back all the pages. nfs_writepages()
373 * sometimes bales out without doing anything. Redirty 493 * sometimes bales out without doing anything. Redirty
374 * the inode; Move it from s_io onto s_more_io/s_dirty. 494 * the inode; Move it from b_io onto b_more_io/b_dirty.
375 */ 495 */
376 /* 496 /*
377 * akpm: if the caller was the kupdate function we put 497 * akpm: if the caller was the kupdate function we put
378 * this inode at the head of s_dirty so it gets first 498 * this inode at the head of b_dirty so it gets first
379 * consideration. Otherwise, move it to the tail, for 499 * consideration. Otherwise, move it to the tail, for
380 * the reasons described there. I'm not really sure 500 * the reasons described there. I'm not really sure
381 * how much sense this makes. Presumably I had a good 501 * how much sense this makes. Presumably I had a good
@@ -385,10 +505,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
385 if (wbc->for_kupdate) { 505 if (wbc->for_kupdate) {
386 /* 506 /*
387 * For the kupdate function we move the inode 507 * For the kupdate function we move the inode
388 * to s_more_io so it will get more writeout as 508 * to b_more_io so it will get more writeout as
389 * soon as the queue becomes uncongested. 509 * soon as the queue becomes uncongested.
390 */ 510 */
391 inode->i_state |= I_DIRTY_PAGES; 511 inode->i_state |= I_DIRTY_PAGES;
512select_queue:
392 if (wbc->nr_to_write <= 0) { 513 if (wbc->nr_to_write <= 0) {
393 /* 514 /*
394 * slice used up: queue for next turn 515 * slice used up: queue for next turn
@@ -411,12 +532,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
411 inode->i_state |= I_DIRTY_PAGES; 532 inode->i_state |= I_DIRTY_PAGES;
412 redirty_tail(inode); 533 redirty_tail(inode);
413 } 534 }
414 } else if (inode->i_state & I_DIRTY) {
415 /*
416 * Someone redirtied the inode while were writing back
417 * the pages.
418 */
419 redirty_tail(inode);
420 } else if (atomic_read(&inode->i_count)) { 535 } else if (atomic_read(&inode->i_count)) {
421 /* 536 /*
422 * The inode is clean, inuse 537 * The inode is clean, inuse
@@ -433,51 +548,96 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
433 return ret; 548 return ret;
434} 549}
435 550
551static void unpin_sb_for_writeback(struct super_block **psb)
552{
553 struct super_block *sb = *psb;
554
555 if (sb) {
556 up_read(&sb->s_umount);
557 put_super(sb);
558 *psb = NULL;
559 }
560}
561
436/* 562/*
437 * Write out a superblock's list of dirty inodes. A wait will be performed 563 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
438 * upon no inodes, all inodes or the final one, depending upon sync_mode. 564 * before calling writeback. So make sure that we do pin it, so it doesn't
439 * 565 * go away while we are writing inodes from it.
440 * If older_than_this is non-NULL, then only write out inodes which
441 * had their first dirtying at a time earlier than *older_than_this.
442 *
443 * If we're a pdflush thread, then implement pdflush collision avoidance
444 * against the entire list.
445 *
446 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
447 * This function assumes that the blockdev superblock's inodes are backed by
448 * a variety of queues, so all inodes are searched. For other superblocks,
449 * assume that all inodes are backed by the same queue.
450 * 566 *
451 * FIXME: this linear search could get expensive with many fileystems. But 567 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
452 * how to fix? We need to go from an address_space to all inodes which share 568 * 1 if we failed.
453 * a queue with that address_space. (Easy: have a global "dirty superblocks"
454 * list).
455 *
456 * The inodes to be written are parked on sb->s_io. They are moved back onto
457 * sb->s_dirty as they are selected for writing. This way, none can be missed
458 * on the writer throttling path, and we get decent balancing between many
459 * throttled threads: we don't want them all piling up on inode_sync_wait.
460 */ 569 */
461void generic_sync_sb_inodes(struct super_block *sb, 570static int pin_sb_for_writeback(struct writeback_control *wbc,
571 struct inode *inode, struct super_block **psb)
572{
573 struct super_block *sb = inode->i_sb;
574
575 /*
576 * If this sb is already pinned, nothing more to do. If not and
577 * *psb is non-NULL, unpin the old one first
578 */
579 if (sb == *psb)
580 return 0;
581 else if (*psb)
582 unpin_sb_for_writeback(psb);
583
584 /*
585 * Caller must already hold the ref for this
586 */
587 if (wbc->sync_mode == WB_SYNC_ALL) {
588 WARN_ON(!rwsem_is_locked(&sb->s_umount));
589 return 0;
590 }
591
592 spin_lock(&sb_lock);
593 sb->s_count++;
594 if (down_read_trylock(&sb->s_umount)) {
595 if (sb->s_root) {
596 spin_unlock(&sb_lock);
597 goto pinned;
598 }
599 /*
600 * umounted, drop rwsem again and fall through to failure
601 */
602 up_read(&sb->s_umount);
603 }
604
605 sb->s_count--;
606 spin_unlock(&sb_lock);
607 return 1;
608pinned:
609 *psb = sb;
610 return 0;
611}
612
613static void writeback_inodes_wb(struct bdi_writeback *wb,
462 struct writeback_control *wbc) 614 struct writeback_control *wbc)
463{ 615{
616 struct super_block *sb = wbc->sb, *pin_sb = NULL;
617 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
464 const unsigned long start = jiffies; /* livelock avoidance */ 618 const unsigned long start = jiffies; /* livelock avoidance */
465 int sync = wbc->sync_mode == WB_SYNC_ALL;
466 619
467 spin_lock(&inode_lock); 620 spin_lock(&inode_lock);
468 if (!wbc->for_kupdate || list_empty(&sb->s_io))
469 queue_io(sb, wbc->older_than_this);
470 621
471 while (!list_empty(&sb->s_io)) { 622 if (!wbc->for_kupdate || list_empty(&wb->b_io))
472 struct inode *inode = list_entry(sb->s_io.prev, 623 queue_io(wb, wbc->older_than_this);
624
625 while (!list_empty(&wb->b_io)) {
626 struct inode *inode = list_entry(wb->b_io.prev,
473 struct inode, i_list); 627 struct inode, i_list);
474 struct address_space *mapping = inode->i_mapping;
475 struct backing_dev_info *bdi = mapping->backing_dev_info;
476 long pages_skipped; 628 long pages_skipped;
477 629
478 if (!bdi_cap_writeback_dirty(bdi)) { 630 /*
631 * super block given and doesn't match, skip this inode
632 */
633 if (sb && sb != inode->i_sb) {
479 redirty_tail(inode); 634 redirty_tail(inode);
480 if (sb_is_blkdev_sb(sb)) { 635 continue;
636 }
637
638 if (!bdi_cap_writeback_dirty(wb->bdi)) {
639 redirty_tail(inode);
640 if (is_blkdev_sb) {
481 /* 641 /*
482 * Dirty memory-backed blockdev: the ramdisk 642 * Dirty memory-backed blockdev: the ramdisk
483 * driver does this. Skip just this inode 643 * driver does this. Skip just this inode
@@ -497,21 +657,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
497 continue; 657 continue;
498 } 658 }
499 659
500 if (wbc->nonblocking && bdi_write_congested(bdi)) { 660 if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
501 wbc->encountered_congestion = 1; 661 wbc->encountered_congestion = 1;
502 if (!sb_is_blkdev_sb(sb)) 662 if (!is_blkdev_sb)
503 break; /* Skip a congested fs */ 663 break; /* Skip a congested fs */
504 requeue_io(inode); 664 requeue_io(inode);
505 continue; /* Skip a congested blockdev */ 665 continue; /* Skip a congested blockdev */
506 } 666 }
507 667
508 if (wbc->bdi && bdi != wbc->bdi) {
509 if (!sb_is_blkdev_sb(sb))
510 break; /* fs has the wrong queue */
511 requeue_io(inode);
512 continue; /* blockdev has wrong queue */
513 }
514
515 /* 668 /*
516 * Was this inode dirtied after sync_sb_inodes was called? 669 * Was this inode dirtied after sync_sb_inodes was called?
517 * This keeps sync from extra jobs and livelock. 670 * This keeps sync from extra jobs and livelock.
@@ -519,16 +672,15 @@ void generic_sync_sb_inodes(struct super_block *sb,
519 if (inode_dirtied_after(inode, start)) 672 if (inode_dirtied_after(inode, start))
520 break; 673 break;
521 674
522 /* Is another pdflush already flushing this queue? */ 675 if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
523 if (current_is_pdflush() && !writeback_acquire(bdi)) 676 requeue_io(inode);
524 break; 677 continue;
678 }
525 679
526 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 680 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
527 __iget(inode); 681 __iget(inode);
528 pages_skipped = wbc->pages_skipped; 682 pages_skipped = wbc->pages_skipped;
529 writeback_single_inode(inode, wbc); 683 writeback_single_inode(inode, wbc);
530 if (current_is_pdflush())
531 writeback_release(bdi);
532 if (wbc->pages_skipped != pages_skipped) { 684 if (wbc->pages_skipped != pages_skipped) {
533 /* 685 /*
534 * writeback is not making progress due to locked 686 * writeback is not making progress due to locked
@@ -544,144 +696,535 @@ void generic_sync_sb_inodes(struct super_block *sb,
544 wbc->more_io = 1; 696 wbc->more_io = 1;
545 break; 697 break;
546 } 698 }
547 if (!list_empty(&sb->s_more_io)) 699 if (!list_empty(&wb->b_more_io))
548 wbc->more_io = 1; 700 wbc->more_io = 1;
549 } 701 }
550 702
551 if (sync) { 703 unpin_sb_for_writeback(&pin_sb);
552 struct inode *inode, *old_inode = NULL;
553 704
705 spin_unlock(&inode_lock);
706 /* Leave any unwritten inodes on b_io */
707}
708
709void writeback_inodes_wbc(struct writeback_control *wbc)
710{
711 struct backing_dev_info *bdi = wbc->bdi;
712
713 writeback_inodes_wb(&bdi->wb, wbc);
714}
715
716/*
717 * The maximum number of pages to writeout in a single bdi flush/kupdate
718 * operation. We do this so we don't hold I_SYNC against an inode for
719 * enormous amounts of time, which would block a userspace task which has
720 * been forced to throttle against that inode. Also, the code reevaluates
721 * the dirty each time it has written this many pages.
722 */
723#define MAX_WRITEBACK_PAGES 1024
724
725static inline bool over_bground_thresh(void)
726{
727 unsigned long background_thresh, dirty_thresh;
728
729 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
730
731 return (global_page_state(NR_FILE_DIRTY) +
732 global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
733}
734
735/*
736 * Explicit flushing or periodic writeback of "old" data.
737 *
738 * Define "old": the first time one of an inode's pages is dirtied, we mark the
739 * dirtying-time in the inode's address_space. So this periodic writeback code
740 * just walks the superblock inode list, writing back any inodes which are
741 * older than a specific point in time.
742 *
743 * Try to run once per dirty_writeback_interval. But if a writeback event
744 * takes longer than a dirty_writeback_interval interval, then leave a
745 * one-second gap.
746 *
747 * older_than_this takes precedence over nr_to_write. So we'll only write back
748 * all dirty pages if they are all attached to "old" mappings.
749 */
750static long wb_writeback(struct bdi_writeback *wb,
751 struct wb_writeback_args *args)
752{
753 struct writeback_control wbc = {
754 .bdi = wb->bdi,
755 .sb = args->sb,
756 .sync_mode = args->sync_mode,
757 .older_than_this = NULL,
758 .for_kupdate = args->for_kupdate,
759 .range_cyclic = args->range_cyclic,
760 };
761 unsigned long oldest_jif;
762 long wrote = 0;
763 struct inode *inode;
764
765 if (wbc.for_kupdate) {
766 wbc.older_than_this = &oldest_jif;
767 oldest_jif = jiffies -
768 msecs_to_jiffies(dirty_expire_interval * 10);
769 }
770 if (!wbc.range_cyclic) {
771 wbc.range_start = 0;
772 wbc.range_end = LLONG_MAX;
773 }
774
775 for (;;) {
554 /* 776 /*
555 * Data integrity sync. Must wait for all pages under writeback, 777 * Stop writeback when nr_pages has been consumed
556 * because there may have been pages dirtied before our sync
557 * call, but which had writeout started before we write it out.
558 * In which case, the inode may not be on the dirty list, but
559 * we still have to wait for that writeout.
560 */ 778 */
561 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 779 if (args->nr_pages <= 0)
562 struct address_space *mapping; 780 break;
563
564 if (inode->i_state &
565 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
566 continue;
567 mapping = inode->i_mapping;
568 if (mapping->nrpages == 0)
569 continue;
570 __iget(inode);
571 spin_unlock(&inode_lock);
572 /*
573 * We hold a reference to 'inode' so it couldn't have
574 * been removed from s_inodes list while we dropped the
575 * inode_lock. We cannot iput the inode now as we can
576 * be holding the last reference and we cannot iput it
577 * under inode_lock. So we keep the reference and iput
578 * it later.
579 */
580 iput(old_inode);
581 old_inode = inode;
582 781
583 filemap_fdatawait(mapping); 782 /*
783 * For background writeout, stop when we are below the
784 * background dirty threshold
785 */
786 if (args->for_background && !over_bground_thresh())
787 break;
584 788
585 cond_resched(); 789 wbc.more_io = 0;
790 wbc.encountered_congestion = 0;
791 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
792 wbc.pages_skipped = 0;
793 writeback_inodes_wb(wb, &wbc);
794 args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
795 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
586 796
587 spin_lock(&inode_lock); 797 /*
798 * If we consumed everything, see if we have more
799 */
800 if (wbc.nr_to_write <= 0)
801 continue;
802 /*
803 * Didn't write everything and we don't have more IO, bail
804 */
805 if (!wbc.more_io)
806 break;
807 /*
808 * Did we write something? Try for more
809 */
810 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
811 continue;
812 /*
813 * Nothing written. Wait for some inode to
814 * become available for writeback. Otherwise
815 * we'll just busyloop.
816 */
817 spin_lock(&inode_lock);
818 if (!list_empty(&wb->b_more_io)) {
819 inode = list_entry(wb->b_more_io.prev,
820 struct inode, i_list);
821 inode_wait_for_writeback(inode);
588 } 822 }
589 spin_unlock(&inode_lock); 823 spin_unlock(&inode_lock);
590 iput(old_inode); 824 }
591 } else
592 spin_unlock(&inode_lock);
593 825
594 return; /* Leave any unwritten inodes on s_io */ 826 return wrote;
595} 827}
596EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
597 828
598static void sync_sb_inodes(struct super_block *sb, 829/*
599 struct writeback_control *wbc) 830 * Return the next bdi_work struct that hasn't been processed by this
831 * wb thread yet. ->seen is initially set for each thread that exists
832 * for this device, when a thread first notices a piece of work it
833 * clears its bit. Depending on writeback type, the thread will notify
834 * completion on either receiving the work (WB_SYNC_NONE) or after
835 * it is done (WB_SYNC_ALL).
836 */
837static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
838 struct bdi_writeback *wb)
839{
840 struct bdi_work *work, *ret = NULL;
841
842 rcu_read_lock();
843
844 list_for_each_entry_rcu(work, &bdi->work_list, list) {
845 if (!test_bit(wb->nr, &work->seen))
846 continue;
847 clear_bit(wb->nr, &work->seen);
848
849 ret = work;
850 break;
851 }
852
853 rcu_read_unlock();
854 return ret;
855}
856
857static long wb_check_old_data_flush(struct bdi_writeback *wb)
858{
859 unsigned long expired;
860 long nr_pages;
861
862 expired = wb->last_old_flush +
863 msecs_to_jiffies(dirty_writeback_interval * 10);
864 if (time_before(jiffies, expired))
865 return 0;
866
867 wb->last_old_flush = jiffies;
868 nr_pages = global_page_state(NR_FILE_DIRTY) +
869 global_page_state(NR_UNSTABLE_NFS) +
870 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
871
872 if (nr_pages) {
873 struct wb_writeback_args args = {
874 .nr_pages = nr_pages,
875 .sync_mode = WB_SYNC_NONE,
876 .for_kupdate = 1,
877 .range_cyclic = 1,
878 };
879
880 return wb_writeback(wb, &args);
881 }
882
883 return 0;
884}
885
886/*
887 * Retrieve work items and do the writeback they describe
888 */
889long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
890{
891 struct backing_dev_info *bdi = wb->bdi;
892 struct bdi_work *work;
893 long wrote = 0;
894
895 while ((work = get_next_work_item(bdi, wb)) != NULL) {
896 struct wb_writeback_args args = work->args;
897
898 /*
899 * Override sync mode, in case we must wait for completion
900 */
901 if (force_wait)
902 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
903
904 /*
905 * If this isn't a data integrity operation, just notify
906 * that we have seen this work and we are now starting it.
907 */
908 if (args.sync_mode == WB_SYNC_NONE)
909 wb_clear_pending(wb, work);
910
911 wrote += wb_writeback(wb, &args);
912
913 /*
914 * This is a data integrity writeback, so only do the
915 * notification when we have completed the work.
916 */
917 if (args.sync_mode == WB_SYNC_ALL)
918 wb_clear_pending(wb, work);
919 }
920
921 /*
922 * Check for periodic writeback, kupdated() style
923 */
924 wrote += wb_check_old_data_flush(wb);
925
926 return wrote;
927}
928
929/*
930 * Handle writeback of dirty data for the device backed by this bdi. Also
931 * wakes up periodically and does kupdated style flushing.
932 */
933int bdi_writeback_task(struct bdi_writeback *wb)
934{
935 unsigned long last_active = jiffies;
936 unsigned long wait_jiffies = -1UL;
937 long pages_written;
938
939 while (!kthread_should_stop()) {
940 pages_written = wb_do_writeback(wb, 0);
941
942 if (pages_written)
943 last_active = jiffies;
944 else if (wait_jiffies != -1UL) {
945 unsigned long max_idle;
946
947 /*
948 * Longest period of inactivity that we tolerate. If we
949 * see dirty data again later, the task will get
950 * recreated automatically.
951 */
952 max_idle = max(5UL * 60 * HZ, wait_jiffies);
953 if (time_after(jiffies, max_idle + last_active))
954 break;
955 }
956
957 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
958 schedule_timeout_interruptible(wait_jiffies);
959 try_to_freeze();
960 }
961
962 return 0;
963}
964
965/*
966 * Schedule writeback for all backing devices. This does WB_SYNC_NONE
967 * writeback, for integrity writeback see bdi_sync_writeback().
968 */
969static void bdi_writeback_all(struct super_block *sb, long nr_pages)
600{ 970{
601 generic_sync_sb_inodes(sb, wbc); 971 struct wb_writeback_args args = {
972 .sb = sb,
973 .nr_pages = nr_pages,
974 .sync_mode = WB_SYNC_NONE,
975 };
976 struct backing_dev_info *bdi;
977
978 rcu_read_lock();
979
980 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
981 if (!bdi_has_dirty_io(bdi))
982 continue;
983
984 bdi_alloc_queue_work(bdi, &args);
985 }
986
987 rcu_read_unlock();
602} 988}
603 989
604/* 990/*
605 * Start writeback of dirty pagecache data against all unlocked inodes. 991 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
992 * the whole world.
993 */
994void wakeup_flusher_threads(long nr_pages)
995{
996 if (nr_pages == 0)
997 nr_pages = global_page_state(NR_FILE_DIRTY) +
998 global_page_state(NR_UNSTABLE_NFS);
999 bdi_writeback_all(NULL, nr_pages);
1000}
1001
1002static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1003{
1004 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
1005 struct dentry *dentry;
1006 const char *name = "?";
1007
1008 dentry = d_find_alias(inode);
1009 if (dentry) {
1010 spin_lock(&dentry->d_lock);
1011 name = (const char *) dentry->d_name.name;
1012 }
1013 printk(KERN_DEBUG
1014 "%s(%d): dirtied inode %lu (%s) on %s\n",
1015 current->comm, task_pid_nr(current), inode->i_ino,
1016 name, inode->i_sb->s_id);
1017 if (dentry) {
1018 spin_unlock(&dentry->d_lock);
1019 dput(dentry);
1020 }
1021 }
1022}
1023
1024/**
1025 * __mark_inode_dirty - internal function
1026 * @inode: inode to mark
1027 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
1028 * Mark an inode as dirty. Callers should use mark_inode_dirty or
1029 * mark_inode_dirty_sync.
1030 *
1031 * Put the inode on the super block's dirty list.
1032 *
1033 * CAREFUL! We mark it dirty unconditionally, but move it onto the
1034 * dirty list only if it is hashed or if it refers to a blockdev.
1035 * If it was not hashed, it will never be added to the dirty list
1036 * even if it is later hashed, as it will have been marked dirty already.
606 * 1037 *
607 * Note: 1038 * In short, make sure you hash any inodes _before_ you start marking
608 * We don't need to grab a reference to superblock here. If it has non-empty 1039 * them dirty.
609 * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
610 * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
611 * empty. Since __sync_single_inode() regains inode_lock before it finally moves
612 * inode from superblock lists we are OK.
613 * 1040 *
614 * If `older_than_this' is non-zero then only flush inodes which have a 1041 * This function *must* be atomic for the I_DIRTY_PAGES case -
615 * flushtime older than *older_than_this. 1042 * set_page_dirty() is called under spinlock in several places.
616 * 1043 *
617 * If `bdi' is non-zero then we will scan the first inode against each 1044 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
618 * superblock until we find the matching ones. One group will be the dirty 1045 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
619 * inodes against a filesystem. Then when we hit the dummy blockdev superblock, 1046 * the kernel-internal blockdev inode represents the dirtying time of the
620 * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not 1047 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
621 * super-efficient but we're about to do a ton of I/O... 1048 * page->mapping->host, so the page-dirtying time is recorded in the internal
1049 * blockdev inode.
622 */ 1050 */
623void 1051void __mark_inode_dirty(struct inode *inode, int flags)
624writeback_inodes(struct writeback_control *wbc)
625{ 1052{
626 struct super_block *sb; 1053 struct super_block *sb = inode->i_sb;
627 1054
628 might_sleep(); 1055 /*
629 spin_lock(&sb_lock); 1056 * Don't do this for I_DIRTY_PAGES - that doesn't actually
630restart: 1057 * dirty the inode itself
631 list_for_each_entry_reverse(sb, &super_blocks, s_list) { 1058 */
632 if (sb_has_dirty_inodes(sb)) { 1059 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
633 /* we're making our own get_super here */ 1060 if (sb->s_op->dirty_inode)
634 sb->s_count++; 1061 sb->s_op->dirty_inode(inode);
635 spin_unlock(&sb_lock); 1062 }
636 /* 1063
637 * If we can't get the readlock, there's no sense in 1064 /*
638 * waiting around, most of the time the FS is going to 1065 * make sure that changes are seen by all cpus before we test i_state
639 * be unmounted by the time it is released. 1066 * -- mikulas
640 */ 1067 */
641 if (down_read_trylock(&sb->s_umount)) { 1068 smp_mb();
642 if (sb->s_root) 1069
643 sync_sb_inodes(sb, wbc); 1070 /* avoid the locking if we can */
644 up_read(&sb->s_umount); 1071 if ((inode->i_state & flags) == flags)
1072 return;
1073
1074 if (unlikely(block_dump))
1075 block_dump___mark_inode_dirty(inode);
1076
1077 spin_lock(&inode_lock);
1078 if ((inode->i_state & flags) != flags) {
1079 const int was_dirty = inode->i_state & I_DIRTY;
1080
1081 inode->i_state |= flags;
1082
1083 /*
1084 * If the inode is being synced, just update its dirty state.
1085 * The unlocker will place the inode on the appropriate
1086 * superblock list, based upon its state.
1087 */
1088 if (inode->i_state & I_SYNC)
1089 goto out;
1090
1091 /*
1092 * Only add valid (hashed) inodes to the superblock's
1093 * dirty list. Add blockdev inodes as well.
1094 */
1095 if (!S_ISBLK(inode->i_mode)) {
1096 if (hlist_unhashed(&inode->i_hash))
1097 goto out;
1098 }
1099 if (inode->i_state & (I_FREEING|I_CLEAR))
1100 goto out;
1101
1102 /*
1103 * If the inode was already on b_dirty/b_io/b_more_io, don't
1104 * reposition it (that would break b_dirty time-ordering).
1105 */
1106 if (!was_dirty) {
1107 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1108 struct backing_dev_info *bdi = wb->bdi;
1109
1110 if (bdi_cap_writeback_dirty(bdi) &&
1111 !test_bit(BDI_registered, &bdi->state)) {
1112 WARN_ON(1);
1113 printk(KERN_ERR "bdi-%s not registered\n",
1114 bdi->name);
645 } 1115 }
646 spin_lock(&sb_lock); 1116
647 if (__put_super_and_need_restart(sb)) 1117 inode->dirtied_when = jiffies;
648 goto restart; 1118 list_move(&inode->i_list, &wb->b_dirty);
649 } 1119 }
650 if (wbc->nr_to_write <= 0)
651 break;
652 } 1120 }
653 spin_unlock(&sb_lock); 1121out:
1122 spin_unlock(&inode_lock);
654} 1123}
1124EXPORT_SYMBOL(__mark_inode_dirty);
655 1125
656/* 1126/*
657 * writeback and wait upon the filesystem's dirty inodes. The caller will 1127 * Write out a superblock's list of dirty inodes. A wait will be performed
658 * do this in two passes - one to write, and one to wait. 1128 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1129 *
1130 * If older_than_this is non-NULL, then only write out inodes which
1131 * had their first dirtying at a time earlier than *older_than_this.
659 * 1132 *
660 * A finite limit is set on the number of pages which will be written. 1133 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
661 * To prevent infinite livelock of sys_sync(). 1134 * This function assumes that the blockdev superblock's inodes are backed by
1135 * a variety of queues, so all inodes are searched. For other superblocks,
1136 * assume that all inodes are backed by the same queue.
662 * 1137 *
663 * We add in the number of potentially dirty inodes, because each inode write 1138 * The inodes to be written are parked on bdi->b_io. They are moved back onto
664 * can dirty pagecache in the underlying blockdev. 1139 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1140 * on the writer throttling path, and we get decent balancing between many
1141 * throttled threads: we don't want them all piling up on inode_sync_wait.
665 */ 1142 */
666void sync_inodes_sb(struct super_block *sb, int wait) 1143static void wait_sb_inodes(struct super_block *sb)
667{ 1144{
668 struct writeback_control wbc = { 1145 struct inode *inode, *old_inode = NULL;
669 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, 1146
670 .range_start = 0, 1147 /*
671 .range_end = LLONG_MAX, 1148 * We need to be protected against the filesystem going from
672 }; 1149 * r/o to r/w or vice versa.
1150 */
1151 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1152
1153 spin_lock(&inode_lock);
673 1154
674 if (!wait) { 1155 /*
675 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1156 * Data integrity sync. Must wait for all pages under writeback,
676 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1157 * because there may have been pages dirtied before our sync
1158 * call, but which had writeout started before we write it out.
1159 * In which case, the inode may not be on the dirty list, but
1160 * we still have to wait for that writeout.
1161 */
1162 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1163 struct address_space *mapping;
677 1164
678 wbc.nr_to_write = nr_dirty + nr_unstable + 1165 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
1166 continue;
1167 mapping = inode->i_mapping;
1168 if (mapping->nrpages == 0)
1169 continue;
1170 __iget(inode);
1171 spin_unlock(&inode_lock);
1172 /*
1173 * We hold a reference to 'inode' so it couldn't have
1174 * been removed from s_inodes list while we dropped the
1175 * inode_lock. We cannot iput the inode now as we can
1176 * be holding the last reference and we cannot iput it
1177 * under inode_lock. So we keep the reference and iput
1178 * it later.
1179 */
1180 iput(old_inode);
1181 old_inode = inode;
1182
1183 filemap_fdatawait(mapping);
1184
1185 cond_resched();
1186
1187 spin_lock(&inode_lock);
1188 }
1189 spin_unlock(&inode_lock);
1190 iput(old_inode);
1191}
1192
1193/**
1194 * writeback_inodes_sb - writeback dirty inodes from given super_block
1195 * @sb: the superblock
1196 *
1197 * Start writeback on some inodes on this super_block. No guarantees are made
1198 * on how many (if any) will be written, and this function does not wait
1199 * for IO completion of submitted IO. The number of pages submitted is
1200 * returned.
1201 */
1202void writeback_inodes_sb(struct super_block *sb)
1203{
1204 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1205 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1206 long nr_to_write;
1207
1208 nr_to_write = nr_dirty + nr_unstable +
679 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1209 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
680 } else
681 wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
682 1210
683 sync_sb_inodes(sb, &wbc); 1211 bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
684} 1212}
1213EXPORT_SYMBOL(writeback_inodes_sb);
1214
1215/**
1216 * sync_inodes_sb - sync sb inode pages
1217 * @sb: the superblock
1218 *
1219 * This function writes and waits on any dirty inode belonging to this
1220 * super_block. The number of pages synced is returned.
1221 */
1222void sync_inodes_sb(struct super_block *sb)
1223{
1224 bdi_sync_writeback(sb->s_bdi, sb);
1225 wait_sb_inodes(sb);
1226}
1227EXPORT_SYMBOL(sync_inodes_sb);
685 1228
686/** 1229/**
687 * write_inode_now - write an inode to disk 1230 * write_inode_now - write an inode to disk
@@ -737,57 +1280,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
737 return ret; 1280 return ret;
738} 1281}
739EXPORT_SYMBOL(sync_inode); 1282EXPORT_SYMBOL(sync_inode);
740
741/**
742 * generic_osync_inode - flush all dirty data for a given inode to disk
743 * @inode: inode to write
744 * @mapping: the address_space that should be flushed
745 * @what: what to write and wait upon
746 *
747 * This can be called by file_write functions for files which have the
748 * O_SYNC flag set, to flush dirty writes to disk.
749 *
750 * @what is a bitmask, specifying which part of the inode's data should be
751 * written and waited upon.
752 *
753 * OSYNC_DATA: i_mapping's dirty data
754 * OSYNC_METADATA: the buffers at i_mapping->private_list
755 * OSYNC_INODE: the inode itself
756 */
757
758int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
759{
760 int err = 0;
761 int need_write_inode_now = 0;
762 int err2;
763
764 if (what & OSYNC_DATA)
765 err = filemap_fdatawrite(mapping);
766 if (what & (OSYNC_METADATA|OSYNC_DATA)) {
767 err2 = sync_mapping_buffers(mapping);
768 if (!err)
769 err = err2;
770 }
771 if (what & OSYNC_DATA) {
772 err2 = filemap_fdatawait(mapping);
773 if (!err)
774 err = err2;
775 }
776
777 spin_lock(&inode_lock);
778 if ((inode->i_state & I_DIRTY) &&
779 ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
780 need_write_inode_now = 1;
781 spin_unlock(&inode_lock);
782
783 if (need_write_inode_now) {
784 err2 = write_inode_now(inode, 1);
785 if (!err)
786 err = err2;
787 }
788 else
789 inode_sync_wait(inode);
790
791 return err;
792}
793EXPORT_SYMBOL(generic_osync_inode);
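
For readers studying this commit, here is a minimal, hypothetical sketch of how the superblock-level interfaces added above are meant to be driven. Only writeback_inodes_sb() and sync_inodes_sb() come from the patch; the example_sync_filesystem() wrapper, its name, and the surrounding scaffolding are illustrative assumptions, not part of the commit.

/*
 * Hypothetical caller (not part of the patch above): a generic sync path
 * built on the per-bdi flusher interfaces introduced by this commit.
 */
#include <linux/fs.h>
#include <linux/writeback.h>

static int example_sync_filesystem(struct super_block *sb, int wait)
{
	/* Callers are expected to hold sb->s_umount for reading. */
	if (!wait) {
		/*
		 * Opportunistic WB_SYNC_NONE writeback: queues work for the
		 * bdi flusher thread and returns without waiting for IO.
		 */
		writeback_inodes_sb(sb);
		return 0;
	}

	/*
	 * Data-integrity WB_SYNC_ALL writeback: queues an on-stack
	 * bdi_work, waits for the flusher to acknowledge it, then waits
	 * on remaining per-inode writeback via wait_sb_inodes().
	 */
	sync_inodes_sb(sb);

	/* Let the filesystem flush its own metadata, if it wants to. */
	if (sb->s_op->sync_fs)
		return sb->s_op->sync_fs(sb, wait);

	return 0;
}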