author     Artem Bityutskiy <Artem.Bityutskiy@nokia.com>   2009-09-21 05:09:22 -0400
committer  Artem Bityutskiy <Artem.Bityutskiy@nokia.com>   2009-09-21 05:09:22 -0400
commit     7cce2f4cb7f5f641f78c8e3eea4e7b1b96cb71c0 (patch)
tree       b064d077928cf224660ab1e1841cdab2c9fd8b08 /fs/fs-writeback.c
parent     e055f7e873d900925c222cf2d1ec955af4a9ca90 (diff)
parent     ebc79c4f8da0f92efa968e0328f32334a2ce80cf (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into linux-next
Conflicts:
	fs/ubifs/super.c

Merge the upstream tree in order to resolve a conflict with the per-bdi
writeback changes from the linux-2.6-block tree.
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--  fs/fs-writeback.c | 1104
1 file changed, 763 insertions(+), 341 deletions(-)
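
For orientation before reading the patch: the heart of the per-bdi writeback rework below is a small work-queue protocol. A caller describes a request in a wb_writeback_args, wraps it in a bdi_work, adds it to the bdi's work_list under wb_lock and wakes the per-bdi flusher thread. WB_SYNC_NONE requests are heap-allocated and freed via RCU once a thread has picked them up; WB_SYNC_ALL requests live on the caller's stack and the caller waits for the WS_USED bit to clear. The user-space sketch below is only an analogue of that protocol, assuming pthread mutexes and condition variables in place of the kernel's RCU, wait_on_bit() and kthread machinery; the struct and function names merely mirror the ones in the patch and everything else is my own scaffolding.

/* Minimal user-space analogue of the bdi work queue (illustration only). */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

struct wb_args {                        /* cf. struct wb_writeback_args */
	long nr_pages;
	enum sync_mode sync_mode;
};

struct wb_work {                        /* cf. struct bdi_work */
	struct wb_work *next;
	struct wb_args args;
	int done;                       /* stands in for the WS_USED bit */
};

static struct wb_work *work_list;       /* cf. bdi->work_list */
static pthread_mutex_t wb_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wb_cond = PTHREAD_COND_INITIALIZER;

static void queue_work(struct wb_work *work)        /* cf. bdi_queue_work() */
{
	pthread_mutex_lock(&wb_lock);
	work->next = work_list;
	work_list = work;
	pthread_cond_broadcast(&wb_cond);           /* cf. wake_up_process() */
	pthread_mutex_unlock(&wb_lock);
}

static void *flusher_thread(void *unused)           /* cf. bdi_writeback_task() */
{
	for (;;) {
		struct wb_work *work;

		pthread_mutex_lock(&wb_lock);
		while ((work = work_list) == NULL)  /* cf. get_next_work_item() */
			pthread_cond_wait(&wb_cond, &wb_lock);
		work_list = work->next;
		pthread_mutex_unlock(&wb_lock);

		printf("flusher: writing back %ld pages (%s)\n",
		       work->args.nr_pages,
		       work->args.sync_mode == WB_SYNC_ALL ? "sync" : "async");

		pthread_mutex_lock(&wb_lock);
		work->done = 1;                     /* cf. bdi_work_clear() */
		pthread_cond_broadcast(&wb_cond);
		pthread_mutex_unlock(&wb_lock);

		if (work->args.sync_mode == WB_SYNC_NONE)
			free(work);                 /* cf. call_rcu(..., bdi_work_free) */
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;
	struct wb_work *async;
	struct wb_work sync_work = { .args = { 1024, WB_SYNC_ALL } };

	pthread_create(&tid, NULL, flusher_thread, NULL);

	/* cf. bdi_start_writeback(): fire-and-forget, heap-allocated work */
	async = calloc(1, sizeof(*async));
	async->args.nr_pages = 256;
	async->args.sync_mode = WB_SYNC_NONE;
	queue_work(async);

	/* cf. bdi_sync_writeback(): on-stack work, wait for completion */
	queue_work(&sync_work);
	pthread_mutex_lock(&wb_lock);               /* cf. bdi_wait_on_work_clear() */
	while (!sync_work.done)
		pthread_cond_wait(&wb_cond, &wb_lock);
	pthread_mutex_unlock(&wb_lock);

	return 0;
}
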
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..8e1e5e19d21e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,171 +19,245 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/kthread.h>
23#include <linux/freezer.h>
22#include <linux/writeback.h> 24#include <linux/writeback.h>
23#include <linux/blkdev.h> 25#include <linux/blkdev.h>
24#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
25#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
26#include "internal.h" 28#include "internal.h"
27 29
30#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
28 31
29/** 32/*
30 * writeback_acquire - attempt to get exclusive writeback access to a device 33 * We don't actually have pdflush, but this one is exported though /proc...
31 * @bdi: the device's backing_dev_info structure 34 */
32 * 35int nr_pdflush_threads;
33 * It is a waste of resources to have more than one pdflush thread blocked on 36
34 * a single request queue. Exclusion at the request_queue level is obtained 37/*
35 * via a flag in the request_queue's backing_dev_info.state. 38 * Passed into wb_writeback(), essentially a subset of writeback_control
36 * 39 */
37 * Non-request_queue-backed address_spaces will share default_backing_dev_info, 40struct wb_writeback_args {
38 * unless they implement their own. Which is somewhat inefficient, as this 41 long nr_pages;
39 * may prevent concurrent writeback against multiple devices. 42 struct super_block *sb;
43 enum writeback_sync_modes sync_mode;
44 int for_kupdate;
45 int range_cyclic;
46};
47
48/*
49 * Work items for the bdi_writeback threads
40 */ 50 */
41static int writeback_acquire(struct backing_dev_info *bdi) 51struct bdi_work {
52 struct list_head list; /* pending work list */
53 struct rcu_head rcu_head; /* for RCU free/clear of work */
54
55 unsigned long seen; /* threads that have seen this work */
56 atomic_t pending; /* number of threads still to do work */
57
58 struct wb_writeback_args args; /* writeback arguments */
59
60 unsigned long state; /* flag bits, see WS_* */
61};
62
63enum {
64 WS_USED_B = 0,
65 WS_ONSTACK_B,
66};
67
68#define WS_USED (1 << WS_USED_B)
69#define WS_ONSTACK (1 << WS_ONSTACK_B)
70
71static inline bool bdi_work_on_stack(struct bdi_work *work)
72{
73 return test_bit(WS_ONSTACK_B, &work->state);
74}
75
76static inline void bdi_work_init(struct bdi_work *work,
77 struct wb_writeback_args *args)
42{ 78{
43 return !test_and_set_bit(BDI_pdflush, &bdi->state); 79 INIT_RCU_HEAD(&work->rcu_head);
80 work->args = *args;
81 work->state = WS_USED;
44} 82}
45 83
46/** 84/**
47 * writeback_in_progress - determine whether there is writeback in progress 85 * writeback_in_progress - determine whether there is writeback in progress
48 * @bdi: the device's backing_dev_info structure. 86 * @bdi: the device's backing_dev_info structure.
49 * 87 *
50 * Determine whether there is writeback in progress against a backing device. 88 * Determine whether there is writeback waiting to be handled against a
89 * backing device.
51 */ 90 */
52int writeback_in_progress(struct backing_dev_info *bdi) 91int writeback_in_progress(struct backing_dev_info *bdi)
53{ 92{
54 return test_bit(BDI_pdflush, &bdi->state); 93 return !list_empty(&bdi->work_list);
55} 94}
56 95
57/** 96static void bdi_work_clear(struct bdi_work *work)
58 * writeback_release - relinquish exclusive writeback access against a device.
59 * @bdi: the device's backing_dev_info structure
60 */
61static void writeback_release(struct backing_dev_info *bdi)
62{ 97{
63 BUG_ON(!writeback_in_progress(bdi)); 98 clear_bit(WS_USED_B, &work->state);
64 clear_bit(BDI_pdflush, &bdi->state); 99 smp_mb__after_clear_bit();
100 /*
101 * work can have disappeared at this point. bit waitq functions
102 * should be able to tolerate this, provided bdi_sched_wait does
103 * not dereference it's pointer argument.
104 */
105 wake_up_bit(&work->state, WS_USED_B);
65} 106}
66 107
67static noinline void block_dump___mark_inode_dirty(struct inode *inode) 108static void bdi_work_free(struct rcu_head *head)
68{ 109{
69 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 110 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
70 struct dentry *dentry;
71 const char *name = "?";
72 111
73 dentry = d_find_alias(inode); 112 if (!bdi_work_on_stack(work))
74 if (dentry) { 113 kfree(work);
75 spin_lock(&dentry->d_lock); 114 else
76 name = (const char *) dentry->d_name.name; 115 bdi_work_clear(work);
77 }
78 printk(KERN_DEBUG
79 "%s(%d): dirtied inode %lu (%s) on %s\n",
80 current->comm, task_pid_nr(current), inode->i_ino,
81 name, inode->i_sb->s_id);
82 if (dentry) {
83 spin_unlock(&dentry->d_lock);
84 dput(dentry);
85 }
86 }
87} 116}
88 117
89/** 118static void wb_work_complete(struct bdi_work *work)
90 * __mark_inode_dirty - internal function
91 * @inode: inode to mark
92 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
93 * Mark an inode as dirty. Callers should use mark_inode_dirty or
94 * mark_inode_dirty_sync.
95 *
96 * Put the inode on the super block's dirty list.
97 *
98 * CAREFUL! We mark it dirty unconditionally, but move it onto the
99 * dirty list only if it is hashed or if it refers to a blockdev.
100 * If it was not hashed, it will never be added to the dirty list
101 * even if it is later hashed, as it will have been marked dirty already.
102 *
103 * In short, make sure you hash any inodes _before_ you start marking
104 * them dirty.
105 *
106 * This function *must* be atomic for the I_DIRTY_PAGES case -
107 * set_page_dirty() is called under spinlock in several places.
108 *
109 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
110 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
111 * the kernel-internal blockdev inode represents the dirtying time of the
112 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
113 * page->mapping->host, so the page-dirtying time is recorded in the internal
114 * blockdev inode.
115 */
116void __mark_inode_dirty(struct inode *inode, int flags)
117{ 119{
118 struct super_block *sb = inode->i_sb; 120 const enum writeback_sync_modes sync_mode = work->args.sync_mode;
121 int onstack = bdi_work_on_stack(work);
119 122
120 /* 123 /*
121 * Don't do this for I_DIRTY_PAGES - that doesn't actually 124 * For allocated work, we can clear the done/seen bit right here.
122 * dirty the inode itself 125 * For on-stack work, we need to postpone both the clear and free
126 * to after the RCU grace period, since the stack could be invalidated
127 * as soon as bdi_work_clear() has done the wakeup.
123 */ 128 */
124 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 129 if (!onstack)
125 if (sb->s_op->dirty_inode) 130 bdi_work_clear(work);
126 sb->s_op->dirty_inode(inode); 131 if (sync_mode == WB_SYNC_NONE || onstack)
127 } 132 call_rcu(&work->rcu_head, bdi_work_free);
133}
128 134
135static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
136{
129 /* 137 /*
130 * make sure that changes are seen by all cpus before we test i_state 138 * The caller has retrieved the work arguments from this work,
131 * -- mikulas 139 * drop our reference. If this is the last ref, delete and free it
132 */ 140 */
133 smp_mb(); 141 if (atomic_dec_and_test(&work->pending)) {
142 struct backing_dev_info *bdi = wb->bdi;
134 143
135 /* avoid the locking if we can */ 144 spin_lock(&bdi->wb_lock);
136 if ((inode->i_state & flags) == flags) 145 list_del_rcu(&work->list);
137 return; 146 spin_unlock(&bdi->wb_lock);
138 147
139 if (unlikely(block_dump)) 148 wb_work_complete(work);
140 block_dump___mark_inode_dirty(inode); 149 }
150}
141 151
142 spin_lock(&inode_lock); 152static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
143 if ((inode->i_state & flags) != flags) { 153{
144 const int was_dirty = inode->i_state & I_DIRTY; 154 work->seen = bdi->wb_mask;
155 BUG_ON(!work->seen);
156 atomic_set(&work->pending, bdi->wb_cnt);
157 BUG_ON(!bdi->wb_cnt);
145 158
146 inode->i_state |= flags; 159 /*
160 * list_add_tail_rcu() contains the necessary barriers to
161 * make sure the above stores are seen before the item is
162 * noticed on the list
163 */
164 spin_lock(&bdi->wb_lock);
165 list_add_tail_rcu(&work->list, &bdi->work_list);
166 spin_unlock(&bdi->wb_lock);
147 167
148 /* 168 /*
149 * If the inode is being synced, just update its dirty state. 169 * If the default thread isn't there, make sure we add it. When
150 * The unlocker will place the inode on the appropriate 170 * it gets created and wakes up, we'll run this work.
151 * superblock list, based upon its state. 171 */
152 */ 172 if (unlikely(list_empty_careful(&bdi->wb_list)))
153 if (inode->i_state & I_SYNC) 173 wake_up_process(default_backing_dev_info.wb.task);
154 goto out; 174 else {
175 struct bdi_writeback *wb = &bdi->wb;
155 176
156 /* 177 if (wb->task)
157 * Only add valid (hashed) inodes to the superblock's 178 wake_up_process(wb->task);
158 * dirty list. Add blockdev inodes as well. 179 }
159 */ 180}
160 if (!S_ISBLK(inode->i_mode)) {
161 if (hlist_unhashed(&inode->i_hash))
162 goto out;
163 }
164 if (inode->i_state & (I_FREEING|I_CLEAR))
165 goto out;
166 181
167 /* 182/*
168 * If the inode was already on s_dirty/s_io/s_more_io, don't 183 * Used for on-stack allocated work items. The caller needs to wait until
169 * reposition it (that would break s_dirty time-ordering). 184 * the wb threads have acked the work before it's safe to continue.
170 */ 185 */
171 if (!was_dirty) { 186static void bdi_wait_on_work_clear(struct bdi_work *work)
172 inode->dirtied_when = jiffies; 187{
173 list_move(&inode->i_list, &sb->s_dirty); 188 wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
174 } 189 TASK_UNINTERRUPTIBLE);
190}
191
192static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
193 struct wb_writeback_args *args)
194{
195 struct bdi_work *work;
196
197 /*
198 * This is WB_SYNC_NONE writeback, so if allocation fails just
199 * wakeup the thread for old dirty data writeback
200 */
201 work = kmalloc(sizeof(*work), GFP_ATOMIC);
202 if (work) {
203 bdi_work_init(work, args);
204 bdi_queue_work(bdi, work);
205 } else {
206 struct bdi_writeback *wb = &bdi->wb;
207
208 if (wb->task)
209 wake_up_process(wb->task);
175 } 210 }
176out:
177 spin_unlock(&inode_lock);
178} 211}
179 212
180EXPORT_SYMBOL(__mark_inode_dirty); 213/**
214 * bdi_sync_writeback - start and wait for writeback
215 * @bdi: the backing device to write from
216 * @sb: write inodes from this super_block
217 *
218 * Description:
219 * This does WB_SYNC_ALL data integrity writeback and waits for the
220 * IO to complete. Callers must hold the sb s_umount semaphore for
221 * reading, to avoid having the super disappear before we are done.
222 */
223static void bdi_sync_writeback(struct backing_dev_info *bdi,
224 struct super_block *sb)
225{
226 struct wb_writeback_args args = {
227 .sb = sb,
228 .sync_mode = WB_SYNC_ALL,
229 .nr_pages = LONG_MAX,
230 .range_cyclic = 0,
231 };
232 struct bdi_work work;
181 233
182static int write_inode(struct inode *inode, int sync) 234 bdi_work_init(&work, &args);
235 work.state |= WS_ONSTACK;
236
237 bdi_queue_work(bdi, &work);
238 bdi_wait_on_work_clear(&work);
239}
240
241/**
242 * bdi_start_writeback - start writeback
243 * @bdi: the backing device to write from
244 * @nr_pages: the number of pages to write
245 *
246 * Description:
247 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
248 * started when this function returns, we make no guarentees on
249 * completion. Caller need not hold sb s_umount semaphore.
250 *
251 */
252void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
183{ 253{
184 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 254 struct wb_writeback_args args = {
185 return inode->i_sb->s_op->write_inode(inode, sync); 255 .sync_mode = WB_SYNC_NONE,
186 return 0; 256 .nr_pages = nr_pages,
257 .range_cyclic = 1,
258 };
259
260 bdi_alloc_queue_work(bdi, &args);
187} 261}
188 262
189/* 263/*
@@ -191,31 +265,32 @@ static int write_inode(struct inode *inode, int sync)
191 * furthest end of its superblock's dirty-inode list. 265 * furthest end of its superblock's dirty-inode list.
192 * 266 *
193 * Before stamping the inode's ->dirtied_when, we check to see whether it is 267 * Before stamping the inode's ->dirtied_when, we check to see whether it is
194 * already the most-recently-dirtied inode on the s_dirty list. If that is 268 * already the most-recently-dirtied inode on the b_dirty list. If that is
195 * the case then the inode must have been redirtied while it was being written 269 * the case then the inode must have been redirtied while it was being written
196 * out and we don't reset its dirtied_when. 270 * out and we don't reset its dirtied_when.
197 */ 271 */
198static void redirty_tail(struct inode *inode) 272static void redirty_tail(struct inode *inode)
199{ 273{
200 struct super_block *sb = inode->i_sb; 274 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
201 275
202 if (!list_empty(&sb->s_dirty)) { 276 if (!list_empty(&wb->b_dirty)) {
203 struct inode *tail_inode; 277 struct inode *tail;
204 278
205 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); 279 tail = list_entry(wb->b_dirty.next, struct inode, i_list);
206 if (time_before(inode->dirtied_when, 280 if (time_before(inode->dirtied_when, tail->dirtied_when))
207 tail_inode->dirtied_when))
208 inode->dirtied_when = jiffies; 281 inode->dirtied_when = jiffies;
209 } 282 }
210 list_move(&inode->i_list, &sb->s_dirty); 283 list_move(&inode->i_list, &wb->b_dirty);
211} 284}
212 285
213/* 286/*
214 * requeue inode for re-scanning after sb->s_io list is exhausted. 287 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 */ 288 */
216static void requeue_io(struct inode *inode) 289static void requeue_io(struct inode *inode)
217{ 290{
218 list_move(&inode->i_list, &inode->i_sb->s_more_io); 291 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
292
293 list_move(&inode->i_list, &wb->b_more_io);
219} 294}
220 295
221static void inode_sync_complete(struct inode *inode) 296static void inode_sync_complete(struct inode *inode)
@@ -262,20 +337,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
262/* 337/*
263 * Queue all expired dirty inodes for io, eldest first. 338 * Queue all expired dirty inodes for io, eldest first.
264 */ 339 */
265static void queue_io(struct super_block *sb, 340static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
266 unsigned long *older_than_this)
267{ 341{
268 list_splice_init(&sb->s_more_io, sb->s_io.prev); 342 list_splice_init(&wb->b_more_io, wb->b_io.prev);
269 move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); 343 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
270} 344}
271 345
272int sb_has_dirty_inodes(struct super_block *sb) 346static int write_inode(struct inode *inode, int sync)
273{ 347{
274 return !list_empty(&sb->s_dirty) || 348 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
275 !list_empty(&sb->s_io) || 349 return inode->i_sb->s_op->write_inode(inode, sync);
276 !list_empty(&sb->s_more_io); 350 return 0;
277} 351}
278EXPORT_SYMBOL(sb_has_dirty_inodes);
279 352
280/* 353/*
281 * Wait for writeback on an inode to complete. 354 * Wait for writeback on an inode to complete.
@@ -322,11 +395,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
322 if (inode->i_state & I_SYNC) { 395 if (inode->i_state & I_SYNC) {
323 /* 396 /*
324 * If this inode is locked for writeback and we are not doing 397 * If this inode is locked for writeback and we are not doing
325 * writeback-for-data-integrity, move it to s_more_io so that 398 * writeback-for-data-integrity, move it to b_more_io so that
326 * writeback can proceed with the other inodes on s_io. 399 * writeback can proceed with the other inodes on s_io.
327 * 400 *
328 * We'll have another go at writing back this inode when we 401 * We'll have another go at writing back this inode when we
329 * completed a full scan of s_io. 402 * completed a full scan of b_io.
330 */ 403 */
331 if (!wait) { 404 if (!wait) {
332 requeue_io(inode); 405 requeue_io(inode);
@@ -371,11 +444,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
371 /* 444 /*
372 * We didn't write back all the pages. nfs_writepages() 445 * We didn't write back all the pages. nfs_writepages()
373 * sometimes bales out without doing anything. Redirty 446 * sometimes bales out without doing anything. Redirty
374 * the inode; Move it from s_io onto s_more_io/s_dirty. 447 * the inode; Move it from b_io onto b_more_io/b_dirty.
375 */ 448 */
376 /* 449 /*
377 * akpm: if the caller was the kupdate function we put 450 * akpm: if the caller was the kupdate function we put
378 * this inode at the head of s_dirty so it gets first 451 * this inode at the head of b_dirty so it gets first
379 * consideration. Otherwise, move it to the tail, for 452 * consideration. Otherwise, move it to the tail, for
380 * the reasons described there. I'm not really sure 453 * the reasons described there. I'm not really sure
381 * how much sense this makes. Presumably I had a good 454 * how much sense this makes. Presumably I had a good
@@ -385,7 +458,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
385 if (wbc->for_kupdate) { 458 if (wbc->for_kupdate) {
386 /* 459 /*
387 * For the kupdate function we move the inode 460 * For the kupdate function we move the inode
388 * to s_more_io so it will get more writeout as 461 * to b_more_io so it will get more writeout as
389 * soon as the queue becomes uncongested. 462 * soon as the queue becomes uncongested.
390 */ 463 */
391 inode->i_state |= I_DIRTY_PAGES; 464 inode->i_state |= I_DIRTY_PAGES;
@@ -434,50 +507,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
434} 507}
435 508
436/* 509/*
437 * Write out a superblock's list of dirty inodes. A wait will be performed 510 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
438 * upon no inodes, all inodes or the final one, depending upon sync_mode. 511 * before calling writeback. So make sure that we do pin it, so it doesn't
439 * 512 * go away while we are writing inodes from it.
440 * If older_than_this is non-NULL, then only write out inodes which
441 * had their first dirtying at a time earlier than *older_than_this.
442 *
443 * If we're a pdflush thread, then implement pdflush collision avoidance
444 * against the entire list.
445 * 513 *
446 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 514 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
447 * This function assumes that the blockdev superblock's inodes are backed by 515 * 1 if we failed.
448 * a variety of queues, so all inodes are searched. For other superblocks,
449 * assume that all inodes are backed by the same queue.
450 *
451 * FIXME: this linear search could get expensive with many fileystems. But
452 * how to fix? We need to go from an address_space to all inodes which share
453 * a queue with that address_space. (Easy: have a global "dirty superblocks"
454 * list).
455 *
456 * The inodes to be written are parked on sb->s_io. They are moved back onto
457 * sb->s_dirty as they are selected for writing. This way, none can be missed
458 * on the writer throttling path, and we get decent balancing between many
459 * throttled threads: we don't want them all piling up on inode_sync_wait.
460 */ 516 */
461void generic_sync_sb_inodes(struct super_block *sb, 517static int pin_sb_for_writeback(struct writeback_control *wbc,
518 struct inode *inode)
519{
520 struct super_block *sb = inode->i_sb;
521
522 /*
523 * Caller must already hold the ref for this
524 */
525 if (wbc->sync_mode == WB_SYNC_ALL) {
526 WARN_ON(!rwsem_is_locked(&sb->s_umount));
527 return 0;
528 }
529
530 spin_lock(&sb_lock);
531 sb->s_count++;
532 if (down_read_trylock(&sb->s_umount)) {
533 if (sb->s_root) {
534 spin_unlock(&sb_lock);
535 return 0;
536 }
537 /*
538 * umounted, drop rwsem again and fall through to failure
539 */
540 up_read(&sb->s_umount);
541 }
542
543 sb->s_count--;
544 spin_unlock(&sb_lock);
545 return 1;
546}
547
548static void unpin_sb_for_writeback(struct writeback_control *wbc,
549 struct inode *inode)
550{
551 struct super_block *sb = inode->i_sb;
552
553 if (wbc->sync_mode == WB_SYNC_ALL)
554 return;
555
556 up_read(&sb->s_umount);
557 put_super(sb);
558}
559
560static void writeback_inodes_wb(struct bdi_writeback *wb,
462 struct writeback_control *wbc) 561 struct writeback_control *wbc)
463{ 562{
563 struct super_block *sb = wbc->sb;
564 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
464 const unsigned long start = jiffies; /* livelock avoidance */ 565 const unsigned long start = jiffies; /* livelock avoidance */
465 int sync = wbc->sync_mode == WB_SYNC_ALL;
466 566
467 spin_lock(&inode_lock); 567 spin_lock(&inode_lock);
468 if (!wbc->for_kupdate || list_empty(&sb->s_io))
469 queue_io(sb, wbc->older_than_this);
470 568
471 while (!list_empty(&sb->s_io)) { 569 if (!wbc->for_kupdate || list_empty(&wb->b_io))
472 struct inode *inode = list_entry(sb->s_io.prev, 570 queue_io(wb, wbc->older_than_this);
571
572 while (!list_empty(&wb->b_io)) {
573 struct inode *inode = list_entry(wb->b_io.prev,
473 struct inode, i_list); 574 struct inode, i_list);
474 struct address_space *mapping = inode->i_mapping;
475 struct backing_dev_info *bdi = mapping->backing_dev_info;
476 long pages_skipped; 575 long pages_skipped;
477 576
478 if (!bdi_cap_writeback_dirty(bdi)) { 577 /*
578 * super block given and doesn't match, skip this inode
579 */
580 if (sb && sb != inode->i_sb) {
581 redirty_tail(inode);
582 continue;
583 }
584
585 if (!bdi_cap_writeback_dirty(wb->bdi)) {
479 redirty_tail(inode); 586 redirty_tail(inode);
480 if (sb_is_blkdev_sb(sb)) { 587 if (is_blkdev_sb) {
481 /* 588 /*
482 * Dirty memory-backed blockdev: the ramdisk 589 * Dirty memory-backed blockdev: the ramdisk
483 * driver does this. Skip just this inode 590 * driver does this. Skip just this inode
@@ -497,21 +604,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
497 continue; 604 continue;
498 } 605 }
499 606
500 if (wbc->nonblocking && bdi_write_congested(bdi)) { 607 if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
501 wbc->encountered_congestion = 1; 608 wbc->encountered_congestion = 1;
502 if (!sb_is_blkdev_sb(sb)) 609 if (!is_blkdev_sb)
503 break; /* Skip a congested fs */ 610 break; /* Skip a congested fs */
504 requeue_io(inode); 611 requeue_io(inode);
505 continue; /* Skip a congested blockdev */ 612 continue; /* Skip a congested blockdev */
506 } 613 }
507 614
508 if (wbc->bdi && bdi != wbc->bdi) {
509 if (!sb_is_blkdev_sb(sb))
510 break; /* fs has the wrong queue */
511 requeue_io(inode);
512 continue; /* blockdev has wrong queue */
513 }
514
515 /* 615 /*
516 * Was this inode dirtied after sync_sb_inodes was called? 616 * Was this inode dirtied after sync_sb_inodes was called?
517 * This keeps sync from extra jobs and livelock. 617 * This keeps sync from extra jobs and livelock.
@@ -519,16 +619,16 @@ void generic_sync_sb_inodes(struct super_block *sb,
519 if (inode_dirtied_after(inode, start)) 619 if (inode_dirtied_after(inode, start))
520 break; 620 break;
521 621
522 /* Is another pdflush already flushing this queue? */ 622 if (pin_sb_for_writeback(wbc, inode)) {
523 if (current_is_pdflush() && !writeback_acquire(bdi)) 623 requeue_io(inode);
524 break; 624 continue;
625 }
525 626
526 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 627 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
527 __iget(inode); 628 __iget(inode);
528 pages_skipped = wbc->pages_skipped; 629 pages_skipped = wbc->pages_skipped;
529 writeback_single_inode(inode, wbc); 630 writeback_single_inode(inode, wbc);
530 if (current_is_pdflush()) 631 unpin_sb_for_writeback(wbc, inode);
531 writeback_release(bdi);
532 if (wbc->pages_skipped != pages_skipped) { 632 if (wbc->pages_skipped != pages_skipped) {
533 /* 633 /*
534 * writeback is not making progress due to locked 634 * writeback is not making progress due to locked
@@ -544,144 +644,520 @@ void generic_sync_sb_inodes(struct super_block *sb,
544 wbc->more_io = 1; 644 wbc->more_io = 1;
545 break; 645 break;
546 } 646 }
547 if (!list_empty(&sb->s_more_io)) 647 if (!list_empty(&wb->b_more_io))
548 wbc->more_io = 1; 648 wbc->more_io = 1;
549 } 649 }
550 650
551 if (sync) { 651 spin_unlock(&inode_lock);
552 struct inode *inode, *old_inode = NULL; 652 /* Leave any unwritten inodes on b_io */
653}
654
655void writeback_inodes_wbc(struct writeback_control *wbc)
656{
657 struct backing_dev_info *bdi = wbc->bdi;
553 658
659 writeback_inodes_wb(&bdi->wb, wbc);
660}
661
662/*
663 * The maximum number of pages to writeout in a single bdi flush/kupdate
664 * operation. We do this so we don't hold I_SYNC against an inode for
665 * enormous amounts of time, which would block a userspace task which has
666 * been forced to throttle against that inode. Also, the code reevaluates
667 * the dirty each time it has written this many pages.
668 */
669#define MAX_WRITEBACK_PAGES 1024
670
671static inline bool over_bground_thresh(void)
672{
673 unsigned long background_thresh, dirty_thresh;
674
675 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
676
677 return (global_page_state(NR_FILE_DIRTY) +
678 global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
679}
680
681/*
682 * Explicit flushing or periodic writeback of "old" data.
683 *
684 * Define "old": the first time one of an inode's pages is dirtied, we mark the
685 * dirtying-time in the inode's address_space. So this periodic writeback code
686 * just walks the superblock inode list, writing back any inodes which are
687 * older than a specific point in time.
688 *
689 * Try to run once per dirty_writeback_interval. But if a writeback event
690 * takes longer than a dirty_writeback_interval interval, then leave a
691 * one-second gap.
692 *
693 * older_than_this takes precedence over nr_to_write. So we'll only write back
694 * all dirty pages if they are all attached to "old" mappings.
695 */
696static long wb_writeback(struct bdi_writeback *wb,
697 struct wb_writeback_args *args)
698{
699 struct writeback_control wbc = {
700 .bdi = wb->bdi,
701 .sb = args->sb,
702 .sync_mode = args->sync_mode,
703 .older_than_this = NULL,
704 .for_kupdate = args->for_kupdate,
705 .range_cyclic = args->range_cyclic,
706 };
707 unsigned long oldest_jif;
708 long wrote = 0;
709
710 if (wbc.for_kupdate) {
711 wbc.older_than_this = &oldest_jif;
712 oldest_jif = jiffies -
713 msecs_to_jiffies(dirty_expire_interval * 10);
714 }
715 if (!wbc.range_cyclic) {
716 wbc.range_start = 0;
717 wbc.range_end = LLONG_MAX;
718 }
719
720 for (;;) {
554 /* 721 /*
555 * Data integrity sync. Must wait for all pages under writeback, 722 * Don't flush anything for non-integrity writeback where
556 * because there may have been pages dirtied before our sync 723 * no nr_pages was given
557 * call, but which had writeout started before we write it out.
558 * In which case, the inode may not be on the dirty list, but
559 * we still have to wait for that writeout.
560 */ 724 */
561 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 725 if (!args->for_kupdate && args->nr_pages <= 0 &&
562 struct address_space *mapping; 726 args->sync_mode == WB_SYNC_NONE)
727 break;
563 728
564 if (inode->i_state & 729 /*
565 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 730 * If no specific pages were given and this is just a
566 continue; 731 * periodic background writeout and we are below the
567 mapping = inode->i_mapping; 732 * background dirty threshold, don't do anything
568 if (mapping->nrpages == 0) 733 */
734 if (args->for_kupdate && args->nr_pages <= 0 &&
735 !over_bground_thresh())
736 break;
737
738 wbc.more_io = 0;
739 wbc.encountered_congestion = 0;
740 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
741 wbc.pages_skipped = 0;
742 writeback_inodes_wb(wb, &wbc);
743 args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
744 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
745
746 /*
747 * If we ran out of stuff to write, bail unless more_io got set
748 */
749 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
750 if (wbc.more_io && !wbc.for_kupdate)
569 continue; 751 continue;
570 __iget(inode); 752 break;
571 spin_unlock(&inode_lock); 753 }
572 /* 754 }
573 * We hold a reference to 'inode' so it couldn't have 755
574 * been removed from s_inodes list while we dropped the 756 return wrote;
575 * inode_lock. We cannot iput the inode now as we can 757}
576 * be holding the last reference and we cannot iput it 758
577 * under inode_lock. So we keep the reference and iput 759/*
578 * it later. 760 * Return the next bdi_work struct that hasn't been processed by this
579 */ 761 * wb thread yet. ->seen is initially set for each thread that exists
580 iput(old_inode); 762 * for this device, when a thread first notices a piece of work it
581 old_inode = inode; 763 * clears its bit. Depending on writeback type, the thread will notify
764 * completion on either receiving the work (WB_SYNC_NONE) or after
765 * it is done (WB_SYNC_ALL).
766 */
767static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
768 struct bdi_writeback *wb)
769{
770 struct bdi_work *work, *ret = NULL;
771
772 rcu_read_lock();
773
774 list_for_each_entry_rcu(work, &bdi->work_list, list) {
775 if (!test_bit(wb->nr, &work->seen))
776 continue;
777 clear_bit(wb->nr, &work->seen);
778
779 ret = work;
780 break;
781 }
782
783 rcu_read_unlock();
784 return ret;
785}
786
787static long wb_check_old_data_flush(struct bdi_writeback *wb)
788{
789 unsigned long expired;
790 long nr_pages;
791
792 expired = wb->last_old_flush +
793 msecs_to_jiffies(dirty_writeback_interval * 10);
794 if (time_before(jiffies, expired))
795 return 0;
796
797 wb->last_old_flush = jiffies;
798 nr_pages = global_page_state(NR_FILE_DIRTY) +
799 global_page_state(NR_UNSTABLE_NFS) +
800 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
801
802 if (nr_pages) {
803 struct wb_writeback_args args = {
804 .nr_pages = nr_pages,
805 .sync_mode = WB_SYNC_NONE,
806 .for_kupdate = 1,
807 .range_cyclic = 1,
808 };
809
810 return wb_writeback(wb, &args);
811 }
812
813 return 0;
814}
815
816/*
817 * Retrieve work items and do the writeback they describe
818 */
819long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
820{
821 struct backing_dev_info *bdi = wb->bdi;
822 struct bdi_work *work;
823 long wrote = 0;
582 824
583 filemap_fdatawait(mapping); 825 while ((work = get_next_work_item(bdi, wb)) != NULL) {
826 struct wb_writeback_args args = work->args;
584 827
585 cond_resched(); 828 /*
829 * Override sync mode, in case we must wait for completion
830 */
831 if (force_wait)
832 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
586 833
587 spin_lock(&inode_lock); 834 /*
835 * If this isn't a data integrity operation, just notify
836 * that we have seen this work and we are now starting it.
837 */
838 if (args.sync_mode == WB_SYNC_NONE)
839 wb_clear_pending(wb, work);
840
841 wrote += wb_writeback(wb, &args);
842
843 /*
844 * This is a data integrity writeback, so only do the
845 * notification when we have completed the work.
846 */
847 if (args.sync_mode == WB_SYNC_ALL)
848 wb_clear_pending(wb, work);
849 }
850
851 /*
852 * Check for periodic writeback, kupdated() style
853 */
854 wrote += wb_check_old_data_flush(wb);
855
856 return wrote;
857}
858
859/*
860 * Handle writeback of dirty data for the device backed by this bdi. Also
861 * wakes up periodically and does kupdated style flushing.
862 */
863int bdi_writeback_task(struct bdi_writeback *wb)
864{
865 unsigned long last_active = jiffies;
866 unsigned long wait_jiffies = -1UL;
867 long pages_written;
868
869 while (!kthread_should_stop()) {
870 pages_written = wb_do_writeback(wb, 0);
871
872 if (pages_written)
873 last_active = jiffies;
874 else if (wait_jiffies != -1UL) {
875 unsigned long max_idle;
876
877 /*
878 * Longest period of inactivity that we tolerate. If we
879 * see dirty data again later, the task will get
880 * recreated automatically.
881 */
882 max_idle = max(5UL * 60 * HZ, wait_jiffies);
883 if (time_after(jiffies, max_idle + last_active))
884 break;
588 } 885 }
589 spin_unlock(&inode_lock);
590 iput(old_inode);
591 } else
592 spin_unlock(&inode_lock);
593 886
594 return; /* Leave any unwritten inodes on s_io */ 887 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
888 schedule_timeout_interruptible(wait_jiffies);
889 try_to_freeze();
890 }
891
892 return 0;
595} 893}
596EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
597 894
598static void sync_sb_inodes(struct super_block *sb, 895/*
599 struct writeback_control *wbc) 896 * Schedule writeback for all backing devices. This does WB_SYNC_NONE
897 * writeback, for integrity writeback see bdi_sync_writeback().
898 */
899static void bdi_writeback_all(struct super_block *sb, long nr_pages)
600{ 900{
601 generic_sync_sb_inodes(sb, wbc); 901 struct wb_writeback_args args = {
902 .sb = sb,
903 .nr_pages = nr_pages,
904 .sync_mode = WB_SYNC_NONE,
905 };
906 struct backing_dev_info *bdi;
907
908 rcu_read_lock();
909
910 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
911 if (!bdi_has_dirty_io(bdi))
912 continue;
913
914 bdi_alloc_queue_work(bdi, &args);
915 }
916
917 rcu_read_unlock();
602} 918}
603 919
604/* 920/*
605 * Start writeback of dirty pagecache data against all unlocked inodes. 921 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
922 * the whole world.
923 */
924void wakeup_flusher_threads(long nr_pages)
925{
926 if (nr_pages == 0)
927 nr_pages = global_page_state(NR_FILE_DIRTY) +
928 global_page_state(NR_UNSTABLE_NFS);
929 bdi_writeback_all(NULL, nr_pages);
930}
931
932static noinline void block_dump___mark_inode_dirty(struct inode *inode)
933{
934 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
935 struct dentry *dentry;
936 const char *name = "?";
937
938 dentry = d_find_alias(inode);
939 if (dentry) {
940 spin_lock(&dentry->d_lock);
941 name = (const char *) dentry->d_name.name;
942 }
943 printk(KERN_DEBUG
944 "%s(%d): dirtied inode %lu (%s) on %s\n",
945 current->comm, task_pid_nr(current), inode->i_ino,
946 name, inode->i_sb->s_id);
947 if (dentry) {
948 spin_unlock(&dentry->d_lock);
949 dput(dentry);
950 }
951 }
952}
953
954/**
955 * __mark_inode_dirty - internal function
956 * @inode: inode to mark
957 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
958 * Mark an inode as dirty. Callers should use mark_inode_dirty or
959 * mark_inode_dirty_sync.
960 *
961 * Put the inode on the super block's dirty list.
606 * 962 *
607 * Note: 963 * CAREFUL! We mark it dirty unconditionally, but move it onto the
608 * We don't need to grab a reference to superblock here. If it has non-empty 964 * dirty list only if it is hashed or if it refers to a blockdev.
609 * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed 965 * If it was not hashed, it will never be added to the dirty list
610 * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all 966 * even if it is later hashed, as it will have been marked dirty already.
611 * empty. Since __sync_single_inode() regains inode_lock before it finally moves
612 * inode from superblock lists we are OK.
613 * 967 *
614 * If `older_than_this' is non-zero then only flush inodes which have a 968 * In short, make sure you hash any inodes _before_ you start marking
615 * flushtime older than *older_than_this. 969 * them dirty.
616 * 970 *
617 * If `bdi' is non-zero then we will scan the first inode against each 971 * This function *must* be atomic for the I_DIRTY_PAGES case -
618 * superblock until we find the matching ones. One group will be the dirty 972 * set_page_dirty() is called under spinlock in several places.
619 * inodes against a filesystem. Then when we hit the dummy blockdev superblock, 973 *
620 * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not 974 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
621 * super-efficient but we're about to do a ton of I/O... 975 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
976 * the kernel-internal blockdev inode represents the dirtying time of the
977 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
978 * page->mapping->host, so the page-dirtying time is recorded in the internal
979 * blockdev inode.
622 */ 980 */
623void 981void __mark_inode_dirty(struct inode *inode, int flags)
624writeback_inodes(struct writeback_control *wbc)
625{ 982{
626 struct super_block *sb; 983 struct super_block *sb = inode->i_sb;
627 984
628 might_sleep(); 985 /*
629 spin_lock(&sb_lock); 986 * Don't do this for I_DIRTY_PAGES - that doesn't actually
630restart: 987 * dirty the inode itself
631 list_for_each_entry_reverse(sb, &super_blocks, s_list) { 988 */
632 if (sb_has_dirty_inodes(sb)) { 989 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
633 /* we're making our own get_super here */ 990 if (sb->s_op->dirty_inode)
634 sb->s_count++; 991 sb->s_op->dirty_inode(inode);
635 spin_unlock(&sb_lock); 992 }
636 /* 993
637 * If we can't get the readlock, there's no sense in 994 /*
638 * waiting around, most of the time the FS is going to 995 * make sure that changes are seen by all cpus before we test i_state
639 * be unmounted by the time it is released. 996 * -- mikulas
640 */ 997 */
641 if (down_read_trylock(&sb->s_umount)) { 998 smp_mb();
642 if (sb->s_root) 999
643 sync_sb_inodes(sb, wbc); 1000 /* avoid the locking if we can */
644 up_read(&sb->s_umount); 1001 if ((inode->i_state & flags) == flags)
1002 return;
1003
1004 if (unlikely(block_dump))
1005 block_dump___mark_inode_dirty(inode);
1006
1007 spin_lock(&inode_lock);
1008 if ((inode->i_state & flags) != flags) {
1009 const int was_dirty = inode->i_state & I_DIRTY;
1010
1011 inode->i_state |= flags;
1012
1013 /*
1014 * If the inode is being synced, just update its dirty state.
1015 * The unlocker will place the inode on the appropriate
1016 * superblock list, based upon its state.
1017 */
1018 if (inode->i_state & I_SYNC)
1019 goto out;
1020
1021 /*
1022 * Only add valid (hashed) inodes to the superblock's
1023 * dirty list. Add blockdev inodes as well.
1024 */
1025 if (!S_ISBLK(inode->i_mode)) {
1026 if (hlist_unhashed(&inode->i_hash))
1027 goto out;
1028 }
1029 if (inode->i_state & (I_FREEING|I_CLEAR))
1030 goto out;
1031
1032 /*
1033 * If the inode was already on b_dirty/b_io/b_more_io, don't
1034 * reposition it (that would break b_dirty time-ordering).
1035 */
1036 if (!was_dirty) {
1037 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1038 struct backing_dev_info *bdi = wb->bdi;
1039
1040 if (bdi_cap_writeback_dirty(bdi) &&
1041 !test_bit(BDI_registered, &bdi->state)) {
1042 WARN_ON(1);
1043 printk(KERN_ERR "bdi-%s not registered\n",
1044 bdi->name);
645 } 1045 }
646 spin_lock(&sb_lock); 1046
647 if (__put_super_and_need_restart(sb)) 1047 inode->dirtied_when = jiffies;
648 goto restart; 1048 list_move(&inode->i_list, &wb->b_dirty);
649 } 1049 }
650 if (wbc->nr_to_write <= 0)
651 break;
652 } 1050 }
653 spin_unlock(&sb_lock); 1051out:
1052 spin_unlock(&inode_lock);
654} 1053}
1054EXPORT_SYMBOL(__mark_inode_dirty);
655 1055
656/* 1056/*
657 * writeback and wait upon the filesystem's dirty inodes. The caller will 1057 * Write out a superblock's list of dirty inodes. A wait will be performed
658 * do this in two passes - one to write, and one to wait. 1058 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1059 *
1060 * If older_than_this is non-NULL, then only write out inodes which
1061 * had their first dirtying at a time earlier than *older_than_this.
1062 *
1063 * If we're a pdlfush thread, then implement pdflush collision avoidance
1064 * against the entire list.
659 * 1065 *
660 * A finite limit is set on the number of pages which will be written. 1066 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
661 * To prevent infinite livelock of sys_sync(). 1067 * This function assumes that the blockdev superblock's inodes are backed by
1068 * a variety of queues, so all inodes are searched. For other superblocks,
1069 * assume that all inodes are backed by the same queue.
662 * 1070 *
663 * We add in the number of potentially dirty inodes, because each inode write 1071 * The inodes to be written are parked on bdi->b_io. They are moved back onto
664 * can dirty pagecache in the underlying blockdev. 1072 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1073 * on the writer throttling path, and we get decent balancing between many
1074 * throttled threads: we don't want them all piling up on inode_sync_wait.
665 */ 1075 */
666void sync_inodes_sb(struct super_block *sb, int wait) 1076static void wait_sb_inodes(struct super_block *sb)
667{ 1077{
668 struct writeback_control wbc = { 1078 struct inode *inode, *old_inode = NULL;
669 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, 1079
670 .range_start = 0, 1080 /*
671 .range_end = LLONG_MAX, 1081 * We need to be protected against the filesystem going from
672 }; 1082 * r/o to r/w or vice versa.
1083 */
1084 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1085
1086 spin_lock(&inode_lock);
1087
1088 /*
1089 * Data integrity sync. Must wait for all pages under writeback,
1090 * because there may have been pages dirtied before our sync
1091 * call, but which had writeout started before we write it out.
1092 * In which case, the inode may not be on the dirty list, but
1093 * we still have to wait for that writeout.
1094 */
1095 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1096 struct address_space *mapping;
1097
1098 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
1099 continue;
1100 mapping = inode->i_mapping;
1101 if (mapping->nrpages == 0)
1102 continue;
1103 __iget(inode);
1104 spin_unlock(&inode_lock);
1105 /*
1106 * We hold a reference to 'inode' so it couldn't have
1107 * been removed from s_inodes list while we dropped the
1108 * inode_lock. We cannot iput the inode now as we can
1109 * be holding the last reference and we cannot iput it
1110 * under inode_lock. So we keep the reference and iput
1111 * it later.
1112 */
1113 iput(old_inode);
1114 old_inode = inode;
1115
1116 filemap_fdatawait(mapping);
1117
1118 cond_resched();
673 1119
674 if (!wait) { 1120 spin_lock(&inode_lock);
675 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1121 }
676 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1122 spin_unlock(&inode_lock);
1123 iput(old_inode);
1124}
677 1125
678 wbc.nr_to_write = nr_dirty + nr_unstable + 1126/**
1127 * writeback_inodes_sb - writeback dirty inodes from given super_block
1128 * @sb: the superblock
1129 *
1130 * Start writeback on some inodes on this super_block. No guarantees are made
1131 * on how many (if any) will be written, and this function does not wait
1132 * for IO completion of submitted IO. The number of pages submitted is
1133 * returned.
1134 */
1135void writeback_inodes_sb(struct super_block *sb)
1136{
1137 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1138 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1139 long nr_to_write;
1140
1141 nr_to_write = nr_dirty + nr_unstable +
679 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1142 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
680 } else
681 wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
682 1143
683 sync_sb_inodes(sb, &wbc); 1144 bdi_writeback_all(sb, nr_to_write);
1145}
1146EXPORT_SYMBOL(writeback_inodes_sb);
1147
1148/**
1149 * sync_inodes_sb - sync sb inode pages
1150 * @sb: the superblock
1151 *
1152 * This function writes and waits on any dirty inode belonging to this
1153 * super_block. The number of pages synced is returned.
1154 */
1155void sync_inodes_sb(struct super_block *sb)
1156{
1157 bdi_sync_writeback(sb->s_bdi, sb);
1158 wait_sb_inodes(sb);
684} 1159}
1160EXPORT_SYMBOL(sync_inodes_sb);
685 1161
686/** 1162/**
687 * write_inode_now - write an inode to disk 1163 * write_inode_now - write an inode to disk
@@ -737,57 +1213,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
737 return ret; 1213 return ret;
738} 1214}
739EXPORT_SYMBOL(sync_inode); 1215EXPORT_SYMBOL(sync_inode);
740
741/**
742 * generic_osync_inode - flush all dirty data for a given inode to disk
743 * @inode: inode to write
744 * @mapping: the address_space that should be flushed
745 * @what: what to write and wait upon
746 *
747 * This can be called by file_write functions for files which have the
748 * O_SYNC flag set, to flush dirty writes to disk.
749 *
750 * @what is a bitmask, specifying which part of the inode's data should be
751 * written and waited upon.
752 *
753 * OSYNC_DATA: i_mapping's dirty data
754 * OSYNC_METADATA: the buffers at i_mapping->private_list
755 * OSYNC_INODE: the inode itself
756 */
757
758int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
759{
760 int err = 0;
761 int need_write_inode_now = 0;
762 int err2;
763
764 if (what & OSYNC_DATA)
765 err = filemap_fdatawrite(mapping);
766 if (what & (OSYNC_METADATA|OSYNC_DATA)) {
767 err2 = sync_mapping_buffers(mapping);
768 if (!err)
769 err = err2;
770 }
771 if (what & OSYNC_DATA) {
772 err2 = filemap_fdatawait(mapping);
773 if (!err)
774 err = err2;
775 }
776
777 spin_lock(&inode_lock);
778 if ((inode->i_state & I_DIRTY) &&
779 ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
780 need_write_inode_now = 1;
781 spin_unlock(&inode_lock);
782
783 if (need_write_inode_now) {
784 err2 = write_inode_now(inode, 1);
785 if (!err)
786 err = err2;
787 }
788 else
789 inode_sync_wait(inode);
790
791 return err;
792}
793EXPORT_SYMBOL(generic_osync_inode);
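
Finally, for context on why this merge conflicts with fs/ubifs/super.c: after the per-bdi changes, sync_inodes_sb() and the flusher threads locate their work through sb->s_bdi, so a filesystem that provides its own backing device has to register a backing_dev_info and point the super block at it. The sketch below is a hypothetical "myfs" example of that wiring, written against the backing-dev API of that era as I understand it; only writeback_inodes_sb(), sync_inodes_sb() and sb->s_bdi come from the patch above, while bdi_init(), bdi_register(), bdi_destroy() and all myfs identifiers are assumptions for illustration, not part of this commit.

/*
 * Hypothetical "myfs" wiring for the per-bdi flusher threads.  Not part of
 * this commit; the bdi_* calls reflect the contemporary backing-dev API and
 * should be checked against the tree you are actually building.
 */
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>

struct myfs_info {
	struct backing_dev_info bdi;
};

static int myfs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct myfs_info *fsi;
	int err;

	fsi = kzalloc(sizeof(*fsi), GFP_KERNEL);
	if (!fsi)
		return -ENOMEM;

	/* Give this filesystem its own writeback context ... */
	fsi->bdi.name = "myfs";
	err = bdi_init(&fsi->bdi);
	if (err)
		goto out_free;
	err = bdi_register(&fsi->bdi, NULL, "myfs-%s", sb->s_id);
	if (err)
		goto out_bdi;

	/*
	 * ... and tell the VFS about it, so sync_inodes_sb() and the
	 * per-bdi flusher threads know which bdi services this super.
	 */
	sb->s_bdi = &fsi->bdi;
	sb->s_fs_info = fsi;
	return 0;

out_bdi:
	bdi_destroy(&fsi->bdi);
out_free:
	kfree(fsi);
	return err;
}

/* Hypothetical helper showing the new exported entry points from the patch. */
static void myfs_flush_example(struct super_block *sb, int wait)
{
	if (wait)
		sync_inodes_sb(sb);        /* WB_SYNC_ALL + wait_sb_inodes() */
	else
		writeback_inodes_sb(sb);   /* opportunistic WB_SYNC_NONE */
}
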