aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJens Axboe <jens.axboe@oracle.com>2009-09-09 03:08:54 -0400
committerJens Axboe <jens.axboe@oracle.com>2009-09-11 03:20:25 -0400
commit03ba3782e8dcc5b0e1efe440d33084f066e38cae (patch)
treee5a6513b411de16a46199530ec98ef9b7f1efc50 /fs
parent66f3b8e2e103a0b93b945764d98e9ba46cb926dd (diff)
writeback: switch to per-bdi threads for flushing data
This gets rid of pdflush for bdi writeout and kupdated style cleaning. pdflush writeout suffers from lack of locality and also requires more threads to handle the same workload, since it has to work in a non-blocking fashion against each queue. This also introduces lumpy behaviour and potential request starvation, since pdflush can be starved for queue access if others are accessing it. A sample ffsb workload that does random writes to files is about 8% faster here on a simple SATA drive during the benchmark phase. File layout also seems a LOT more smooth in vmstat: r b swpd free buff cache si so bi bo in cs us sy id wa 0 1 0 608848 2652 375372 0 0 0 71024 604 24 1 10 48 42 0 1 0 549644 2712 433736 0 0 0 60692 505 27 1 8 48 44 1 0 0 476928 2784 505192 0 0 4 29540 553 24 0 9 53 37 0 1 0 457972 2808 524008 0 0 0 54876 331 16 0 4 38 58 0 1 0 366128 2928 614284 0 0 4 92168 710 58 0 13 53 34 0 1 0 295092 3000 684140 0 0 0 62924 572 23 0 9 53 37 0 1 0 236592 3064 741704 0 0 4 58256 523 17 0 8 48 44 0 1 0 165608 3132 811464 0 0 0 57460 560 21 0 8 54 38 0 1 0 102952 3200 873164 0 0 4 74748 540 29 1 10 48 41 0 1 0 48604 3252 926472 0 0 0 53248 469 29 0 7 47 45 where vanilla tends to fluctuate a lot in the creation phase: r b swpd free buff cache si so bi bo in cs us sy id wa 1 1 0 678716 5792 303380 0 0 0 74064 565 50 1 11 52 36 1 0 0 662488 5864 319396 0 0 4 352 302 329 0 2 47 51 0 1 0 599312 5924 381468 0 0 0 78164 516 55 0 9 51 40 0 1 0 519952 6008 459516 0 0 4 78156 622 56 1 11 52 37 1 1 0 436640 6092 541632 0 0 0 82244 622 54 0 11 48 41 0 1 0 436640 6092 541660 0 0 0 8 152 39 0 0 51 49 0 1 0 332224 6200 644252 0 0 4 102800 728 46 1 13 49 36 1 0 0 274492 6260 701056 0 0 4 12328 459 49 0 7 50 43 0 1 0 211220 6324 763356 0 0 0 106940 515 37 1 10 51 39 1 0 0 160412 6376 813468 0 0 0 8224 415 43 0 6 49 45 1 1 0 85980 6452 886556 0 0 4 113516 575 39 1 11 54 34 0 2 0 85968 6452 886620 0 0 0 1640 158 211 0 0 46 54 A 10 disk test with btrfs performs 26% faster with per-bdi flushing. A SSD based writeback test on XFS performs over 20% better as well, with the throughput being very stable around 1GB/sec, where pdflush only manages 750MB/sec and fluctuates wildly while doing so. Random buffered writes to many files behave a lot better as well, as does random mmap'ed writes. A separate thread is added to sync the super blocks. In the long term, adding sync_supers_bdi() functionality could get rid of this thread again. Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/fs-writeback.c999
-rw-r--r--fs/super.c2
-rw-r--r--fs/sync.c2
4 files changed, 713 insertions, 292 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 28f320fac4d4..90a98865b0cc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -281,7 +281,7 @@ static void free_more_memory(void)
281 struct zone *zone; 281 struct zone *zone;
282 int nid; 282 int nid;
283 283
284 wakeup_pdflush(1024); 284 wakeup_flusher_threads(1024);
285 yield(); 285 yield();
286 286
287 for_each_online_node(nid) { 287 for_each_online_node(nid) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 45ad4bb700e6..7f6dae8aa47f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,6 +19,8 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/kthread.h>
23#include <linux/freezer.h>
22#include <linux/writeback.h> 24#include <linux/writeback.h>
23#include <linux/blkdev.h> 25#include <linux/blkdev.h>
24#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
@@ -27,165 +29,208 @@
27 29
28#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) 30#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
29 31
30/** 32/*
31 * writeback_acquire - attempt to get exclusive writeback access to a device 33 * Work items for the bdi_writeback threads
32 * @bdi: the device's backing_dev_info structure
33 *
34 * It is a waste of resources to have more than one pdflush thread blocked on
35 * a single request queue. Exclusion at the request_queue level is obtained
36 * via a flag in the request_queue's backing_dev_info.state.
37 *
38 * Non-request_queue-backed address_spaces will share default_backing_dev_info,
39 * unless they implement their own. Which is somewhat inefficient, as this
40 * may prevent concurrent writeback against multiple devices.
41 */ 34 */
42static int writeback_acquire(struct backing_dev_info *bdi) 35struct bdi_work {
36 struct list_head list;
37 struct list_head wait_list;
38 struct rcu_head rcu_head;
39
40 unsigned long seen;
41 atomic_t pending;
42
43 struct super_block *sb;
44 unsigned long nr_pages;
45 enum writeback_sync_modes sync_mode;
46
47 unsigned long state;
48};
49
50enum {
51 WS_USED_B = 0,
52 WS_ONSTACK_B,
53};
54
55#define WS_USED (1 << WS_USED_B)
56#define WS_ONSTACK (1 << WS_ONSTACK_B)
57
58static inline bool bdi_work_on_stack(struct bdi_work *work)
59{
60 return test_bit(WS_ONSTACK_B, &work->state);
61}
62
63static inline void bdi_work_init(struct bdi_work *work,
64 struct writeback_control *wbc)
65{
66 INIT_RCU_HEAD(&work->rcu_head);
67 work->sb = wbc->sb;
68 work->nr_pages = wbc->nr_to_write;
69 work->sync_mode = wbc->sync_mode;
70 work->state = WS_USED;
71}
72
73static inline void bdi_work_init_on_stack(struct bdi_work *work,
74 struct writeback_control *wbc)
43{ 75{
44 return !test_and_set_bit(BDI_pdflush, &bdi->state); 76 bdi_work_init(work, wbc);
77 work->state |= WS_ONSTACK;
45} 78}
46 79
47/** 80/**
48 * writeback_in_progress - determine whether there is writeback in progress 81 * writeback_in_progress - determine whether there is writeback in progress
49 * @bdi: the device's backing_dev_info structure. 82 * @bdi: the device's backing_dev_info structure.
50 * 83 *
51 * Determine whether there is writeback in progress against a backing device. 84 * Determine whether there is writeback waiting to be handled against a
85 * backing device.
52 */ 86 */
53int writeback_in_progress(struct backing_dev_info *bdi) 87int writeback_in_progress(struct backing_dev_info *bdi)
54{ 88{
55 return test_bit(BDI_pdflush, &bdi->state); 89 return !list_empty(&bdi->work_list);
56} 90}
57 91
58/** 92static void bdi_work_clear(struct bdi_work *work)
59 * writeback_release - relinquish exclusive writeback access against a device.
60 * @bdi: the device's backing_dev_info structure
61 */
62static void writeback_release(struct backing_dev_info *bdi)
63{ 93{
64 BUG_ON(!writeback_in_progress(bdi)); 94 clear_bit(WS_USED_B, &work->state);
65 clear_bit(BDI_pdflush, &bdi->state); 95 smp_mb__after_clear_bit();
96 wake_up_bit(&work->state, WS_USED_B);
66} 97}
67 98
68static noinline void block_dump___mark_inode_dirty(struct inode *inode) 99static void bdi_work_free(struct rcu_head *head)
69{ 100{
70 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 101 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
71 struct dentry *dentry;
72 const char *name = "?";
73 102
74 dentry = d_find_alias(inode); 103 if (!bdi_work_on_stack(work))
75 if (dentry) { 104 kfree(work);
76 spin_lock(&dentry->d_lock); 105 else
77 name = (const char *) dentry->d_name.name; 106 bdi_work_clear(work);
78 }
79 printk(KERN_DEBUG
80 "%s(%d): dirtied inode %lu (%s) on %s\n",
81 current->comm, task_pid_nr(current), inode->i_ino,
82 name, inode->i_sb->s_id);
83 if (dentry) {
84 spin_unlock(&dentry->d_lock);
85 dput(dentry);
86 }
87 }
88} 107}
89 108
90/** 109static void wb_work_complete(struct bdi_work *work)
91 * __mark_inode_dirty - internal function
92 * @inode: inode to mark
93 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
94 * Mark an inode as dirty. Callers should use mark_inode_dirty or
95 * mark_inode_dirty_sync.
96 *
97 * Put the inode on the super block's dirty list.
98 *
99 * CAREFUL! We mark it dirty unconditionally, but move it onto the
100 * dirty list only if it is hashed or if it refers to a blockdev.
101 * If it was not hashed, it will never be added to the dirty list
102 * even if it is later hashed, as it will have been marked dirty already.
103 *
104 * In short, make sure you hash any inodes _before_ you start marking
105 * them dirty.
106 *
107 * This function *must* be atomic for the I_DIRTY_PAGES case -
108 * set_page_dirty() is called under spinlock in several places.
109 *
110 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
111 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
112 * the kernel-internal blockdev inode represents the dirtying time of the
113 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
114 * page->mapping->host, so the page-dirtying time is recorded in the internal
115 * blockdev inode.
116 */
117void __mark_inode_dirty(struct inode *inode, int flags)
118{ 110{
119 struct super_block *sb = inode->i_sb; 111 const enum writeback_sync_modes sync_mode = work->sync_mode;
120 112
121 /* 113 /*
122 * Don't do this for I_DIRTY_PAGES - that doesn't actually 114 * For allocated work, we can clear the done/seen bit right here.
123 * dirty the inode itself 115 * For on-stack work, we need to postpone both the clear and free
116 * to after the RCU grace period, since the stack could be invalidated
117 * as soon as bdi_work_clear() has done the wakeup.
124 */ 118 */
125 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 119 if (!bdi_work_on_stack(work))
126 if (sb->s_op->dirty_inode) 120 bdi_work_clear(work);
127 sb->s_op->dirty_inode(inode); 121 if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
128 } 122 call_rcu(&work->rcu_head, bdi_work_free);
123}
129 124
125static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
126{
130 /* 127 /*
131 * make sure that changes are seen by all cpus before we test i_state 128 * The caller has retrieved the work arguments from this work,
132 * -- mikulas 129 * drop our reference. If this is the last ref, delete and free it
133 */ 130 */
134 smp_mb(); 131 if (atomic_dec_and_test(&work->pending)) {
132 struct backing_dev_info *bdi = wb->bdi;
135 133
136 /* avoid the locking if we can */ 134 spin_lock(&bdi->wb_lock);
137 if ((inode->i_state & flags) == flags) 135 list_del_rcu(&work->list);
138 return; 136 spin_unlock(&bdi->wb_lock);
139
140 if (unlikely(block_dump))
141 block_dump___mark_inode_dirty(inode);
142 137
143 spin_lock(&inode_lock); 138 wb_work_complete(work);
144 if ((inode->i_state & flags) != flags) { 139 }
145 const int was_dirty = inode->i_state & I_DIRTY; 140}
146 141
147 inode->i_state |= flags; 142static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
143{
144 if (work) {
145 work->seen = bdi->wb_mask;
146 BUG_ON(!work->seen);
147 atomic_set(&work->pending, bdi->wb_cnt);
148 BUG_ON(!bdi->wb_cnt);
148 149
149 /* 150 /*
150 * If the inode is being synced, just update its dirty state. 151 * Make sure stores are seen before it appears on the list
151 * The unlocker will place the inode on the appropriate
152 * superblock list, based upon its state.
153 */ 152 */
154 if (inode->i_state & I_SYNC) 153 smp_mb();
155 goto out;
156 154
157 /* 155 spin_lock(&bdi->wb_lock);
158 * Only add valid (hashed) inodes to the superblock's 156 list_add_tail_rcu(&work->list, &bdi->work_list);
159 * dirty list. Add blockdev inodes as well. 157 spin_unlock(&bdi->wb_lock);
160 */ 158 }
161 if (!S_ISBLK(inode->i_mode)) { 159
162 if (hlist_unhashed(&inode->i_hash)) 160 /*
163 goto out; 161 * If the default thread isn't there, make sure we add it. When
164 } 162 * it gets created and wakes up, we'll run this work.
165 if (inode->i_state & (I_FREEING|I_CLEAR)) 163 */
166 goto out; 164 if (unlikely(list_empty_careful(&bdi->wb_list)))
165 wake_up_process(default_backing_dev_info.wb.task);
166 else {
167 struct bdi_writeback *wb = &bdi->wb;
167 168
168 /* 169 /*
169 * If the inode was already on b_dirty/b_io/b_more_io, don't 170 * If we failed allocating the bdi work item, wake up the wb
170 * reposition it (that would break b_dirty time-ordering). 171 * thread always. As a safety precaution, it'll flush out
172 * everything
171 */ 173 */
172 if (!was_dirty) { 174 if (!wb_has_dirty_io(wb)) {
173 inode->dirtied_when = jiffies; 175 if (work)
174 list_move(&inode->i_list, 176 wb_clear_pending(wb, work);
175 &inode_to_bdi(inode)->b_dirty); 177 } else if (wb->task)
176 } 178 wake_up_process(wb->task);
177 } 179 }
178out:
179 spin_unlock(&inode_lock);
180} 180}
181 181
182EXPORT_SYMBOL(__mark_inode_dirty); 182/*
183 * Used for on-stack allocated work items. The caller needs to wait until
184 * the wb threads have acked the work before it's safe to continue.
185 */
186static void bdi_wait_on_work_clear(struct bdi_work *work)
187{
188 wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
189 TASK_UNINTERRUPTIBLE);
190}
183 191
184static int write_inode(struct inode *inode, int sync) 192static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
185{ 193{
186 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 194 struct bdi_work *work;
187 return inode->i_sb->s_op->write_inode(inode, sync); 195
188 return 0; 196 work = kmalloc(sizeof(*work), GFP_ATOMIC);
197 if (work)
198 bdi_work_init(work, wbc);
199
200 return work;
201}
202
203void bdi_start_writeback(struct writeback_control *wbc)
204{
205 const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
206 struct bdi_work work_stack, *work = NULL;
207
208 if (!must_wait)
209 work = bdi_alloc_work(wbc);
210
211 if (!work) {
212 work = &work_stack;
213 bdi_work_init_on_stack(work, wbc);
214 }
215
216 bdi_queue_work(wbc->bdi, work);
217
218 /*
219 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
220 * complete. If not, we only need to wait for the work to be started,
221 * if we allocated it on-stack. We use the same mechanism, if the
222 * wait bit is set in the bdi_work struct, then threads will not
223 * clear pending until after they are done.
224 *
225 * Note that work == &work_stack if must_wait is true, so we don't
226 * need to do call_rcu() here ever, since the completion path will
227 * have done that for us.
228 */
229 if (must_wait || work == &work_stack) {
230 bdi_wait_on_work_clear(work);
231 if (work != &work_stack)
232 call_rcu(&work->rcu_head, bdi_work_free);
233 }
189} 234}
190 235
191/* 236/*
@@ -199,16 +244,16 @@ static int write_inode(struct inode *inode, int sync)
199 */ 244 */
200static void redirty_tail(struct inode *inode) 245static void redirty_tail(struct inode *inode)
201{ 246{
202 struct backing_dev_info *bdi = inode_to_bdi(inode); 247 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
203 248
204 if (!list_empty(&bdi->b_dirty)) { 249 if (!list_empty(&wb->b_dirty)) {
205 struct inode *tail; 250 struct inode *tail;
206 251
207 tail = list_entry(bdi->b_dirty.next, struct inode, i_list); 252 tail = list_entry(wb->b_dirty.next, struct inode, i_list);
208 if (time_before(inode->dirtied_when, tail->dirtied_when)) 253 if (time_before(inode->dirtied_when, tail->dirtied_when))
209 inode->dirtied_when = jiffies; 254 inode->dirtied_when = jiffies;
210 } 255 }
211 list_move(&inode->i_list, &bdi->b_dirty); 256 list_move(&inode->i_list, &wb->b_dirty);
212} 257}
213 258
214/* 259/*
@@ -216,7 +261,9 @@ static void redirty_tail(struct inode *inode)
216 */ 261 */
217static void requeue_io(struct inode *inode) 262static void requeue_io(struct inode *inode)
218{ 263{
219 list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io); 264 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
265
266 list_move(&inode->i_list, &wb->b_more_io);
220} 267}
221 268
222static void inode_sync_complete(struct inode *inode) 269static void inode_sync_complete(struct inode *inode)
@@ -263,52 +310,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
263/* 310/*
264 * Queue all expired dirty inodes for io, eldest first. 311 * Queue all expired dirty inodes for io, eldest first.
265 */ 312 */
266static void queue_io(struct backing_dev_info *bdi, 313static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
267 unsigned long *older_than_this)
268{ 314{
269 list_splice_init(&bdi->b_more_io, bdi->b_io.prev); 315 list_splice_init(&wb->b_more_io, wb->b_io.prev);
270 move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this); 316 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
271} 317}
272 318
273static int sb_on_inode_list(struct super_block *sb, struct list_head *list) 319static int write_inode(struct inode *inode, int sync)
274{
275 struct inode *inode;
276 int ret = 0;
277
278 spin_lock(&inode_lock);
279 list_for_each_entry(inode, list, i_list) {
280 if (inode->i_sb == sb) {
281 ret = 1;
282 break;
283 }
284 }
285 spin_unlock(&inode_lock);
286 return ret;
287}
288
289int sb_has_dirty_inodes(struct super_block *sb)
290{ 320{
291 struct backing_dev_info *bdi; 321 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
292 int ret = 0; 322 return inode->i_sb->s_op->write_inode(inode, sync);
293 323 return 0;
294 /*
295 * This is REALLY expensive right now, but it'll go away
296 * when the bdi writeback is introduced
297 */
298 mutex_lock(&bdi_lock);
299 list_for_each_entry(bdi, &bdi_list, bdi_list) {
300 if (sb_on_inode_list(sb, &bdi->b_dirty) ||
301 sb_on_inode_list(sb, &bdi->b_io) ||
302 sb_on_inode_list(sb, &bdi->b_more_io)) {
303 ret = 1;
304 break;
305 }
306 }
307 mutex_unlock(&bdi_lock);
308
309 return ret;
310} 324}
311EXPORT_SYMBOL(sb_has_dirty_inodes);
312 325
313/* 326/*
314 * Wait for writeback on an inode to complete. 327 * Wait for writeback on an inode to complete.
@@ -466,20 +479,71 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
466 return ret; 479 return ret;
467} 480}
468 481
469static void generic_sync_bdi_inodes(struct backing_dev_info *bdi, 482/*
470 struct writeback_control *wbc, 483 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
471 struct super_block *sb) 484 * before calling writeback. So make sure that we do pin it, so it doesn't
485 * go away while we are writing inodes from it.
486 *
487 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
488 * 1 if we failed.
489 */
490static int pin_sb_for_writeback(struct writeback_control *wbc,
491 struct inode *inode)
492{
493 struct super_block *sb = inode->i_sb;
494
495 /*
496 * Caller must already hold the ref for this
497 */
498 if (wbc->sync_mode == WB_SYNC_ALL) {
499 WARN_ON(!rwsem_is_locked(&sb->s_umount));
500 return 0;
501 }
502
503 spin_lock(&sb_lock);
504 sb->s_count++;
505 if (down_read_trylock(&sb->s_umount)) {
506 if (sb->s_root) {
507 spin_unlock(&sb_lock);
508 return 0;
509 }
510 /*
511 * umounted, drop rwsem again and fall through to failure
512 */
513 up_read(&sb->s_umount);
514 }
515
516 sb->s_count--;
517 spin_unlock(&sb_lock);
518 return 1;
519}
520
521static void unpin_sb_for_writeback(struct writeback_control *wbc,
522 struct inode *inode)
523{
524 struct super_block *sb = inode->i_sb;
525
526 if (wbc->sync_mode == WB_SYNC_ALL)
527 return;
528
529 up_read(&sb->s_umount);
530 put_super(sb);
531}
532
533static void writeback_inodes_wb(struct bdi_writeback *wb,
534 struct writeback_control *wbc)
472{ 535{
536 struct super_block *sb = wbc->sb;
473 const int is_blkdev_sb = sb_is_blkdev_sb(sb); 537 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
474 const unsigned long start = jiffies; /* livelock avoidance */ 538 const unsigned long start = jiffies; /* livelock avoidance */
475 539
476 spin_lock(&inode_lock); 540 spin_lock(&inode_lock);
477 541
478 if (!wbc->for_kupdate || list_empty(&bdi->b_io)) 542 if (!wbc->for_kupdate || list_empty(&wb->b_io))
479 queue_io(bdi, wbc->older_than_this); 543 queue_io(wb, wbc->older_than_this);
480 544
481 while (!list_empty(&bdi->b_io)) { 545 while (!list_empty(&wb->b_io)) {
482 struct inode *inode = list_entry(bdi->b_io.prev, 546 struct inode *inode = list_entry(wb->b_io.prev,
483 struct inode, i_list); 547 struct inode, i_list);
484 long pages_skipped; 548 long pages_skipped;
485 549
@@ -491,7 +555,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
491 continue; 555 continue;
492 } 556 }
493 557
494 if (!bdi_cap_writeback_dirty(bdi)) { 558 if (!bdi_cap_writeback_dirty(wb->bdi)) {
495 redirty_tail(inode); 559 redirty_tail(inode);
496 if (is_blkdev_sb) { 560 if (is_blkdev_sb) {
497 /* 561 /*
@@ -513,7 +577,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
513 continue; 577 continue;
514 } 578 }
515 579
516 if (wbc->nonblocking && bdi_write_congested(bdi)) { 580 if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
517 wbc->encountered_congestion = 1; 581 wbc->encountered_congestion = 1;
518 if (!is_blkdev_sb) 582 if (!is_blkdev_sb)
519 break; /* Skip a congested fs */ 583 break; /* Skip a congested fs */
@@ -521,13 +585,6 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
521 continue; /* Skip a congested blockdev */ 585 continue; /* Skip a congested blockdev */
522 } 586 }
523 587
524 if (wbc->bdi && bdi != wbc->bdi) {
525 if (!is_blkdev_sb)
526 break; /* fs has the wrong queue */
527 requeue_io(inode);
528 continue; /* blockdev has wrong queue */
529 }
530
531 /* 588 /*
532 * Was this inode dirtied after sync_sb_inodes was called? 589 * Was this inode dirtied after sync_sb_inodes was called?
533 * This keeps sync from extra jobs and livelock. 590 * This keeps sync from extra jobs and livelock.
@@ -535,16 +592,16 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
535 if (inode_dirtied_after(inode, start)) 592 if (inode_dirtied_after(inode, start))
536 break; 593 break;
537 594
538 /* Is another pdflush already flushing this queue? */ 595 if (pin_sb_for_writeback(wbc, inode)) {
539 if (current_is_pdflush() && !writeback_acquire(bdi)) 596 requeue_io(inode);
540 break; 597 continue;
598 }
541 599
542 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 600 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
543 __iget(inode); 601 __iget(inode);
544 pages_skipped = wbc->pages_skipped; 602 pages_skipped = wbc->pages_skipped;
545 writeback_single_inode(inode, wbc); 603 writeback_single_inode(inode, wbc);
546 if (current_is_pdflush()) 604 unpin_sb_for_writeback(wbc, inode);
547 writeback_release(bdi);
548 if (wbc->pages_skipped != pages_skipped) { 605 if (wbc->pages_skipped != pages_skipped) {
549 /* 606 /*
550 * writeback is not making progress due to locked 607 * writeback is not making progress due to locked
@@ -560,7 +617,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
560 wbc->more_io = 1; 617 wbc->more_io = 1;
561 break; 618 break;
562 } 619 }
563 if (!list_empty(&bdi->b_more_io)) 620 if (!list_empty(&wb->b_more_io))
564 wbc->more_io = 1; 621 wbc->more_io = 1;
565 } 622 }
566 623
@@ -568,139 +625,500 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
568 /* Leave any unwritten inodes on b_io */ 625 /* Leave any unwritten inodes on b_io */
569} 626}
570 627
628void writeback_inodes_wbc(struct writeback_control *wbc)
629{
630 struct backing_dev_info *bdi = wbc->bdi;
631
632 writeback_inodes_wb(&bdi->wb, wbc);
633}
634
571/* 635/*
572 * Write out a superblock's list of dirty inodes. A wait will be performed 636 * The maximum number of pages to writeout in a single bdi flush/kupdate
573 * upon no inodes, all inodes or the final one, depending upon sync_mode. 637 * operation. We do this so we don't hold I_SYNC against an inode for
574 * 638 * enormous amounts of time, which would block a userspace task which has
575 * If older_than_this is non-NULL, then only write out inodes which 639 * been forced to throttle against that inode. Also, the code reevaluates
576 * had their first dirtying at a time earlier than *older_than_this. 640 * the dirty each time it has written this many pages.
577 * 641 */
578 * If we're a pdlfush thread, then implement pdflush collision avoidance 642#define MAX_WRITEBACK_PAGES 1024
579 * against the entire list. 643
644static inline bool over_bground_thresh(void)
645{
646 unsigned long background_thresh, dirty_thresh;
647
648 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
649
650 return (global_page_state(NR_FILE_DIRTY) +
651 global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
652}
653
654/*
655 * Explicit flushing or periodic writeback of "old" data.
580 * 656 *
581 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 657 * Define "old": the first time one of an inode's pages is dirtied, we mark the
582 * This function assumes that the blockdev superblock's inodes are backed by 658 * dirtying-time in the inode's address_space. So this periodic writeback code
583 * a variety of queues, so all inodes are searched. For other superblocks, 659 * just walks the superblock inode list, writing back any inodes which are
584 * assume that all inodes are backed by the same queue. 660 * older than a specific point in time.
585 * 661 *
586 * FIXME: this linear search could get expensive with many fileystems. But 662 * Try to run once per dirty_writeback_interval. But if a writeback event
587 * how to fix? We need to go from an address_space to all inodes which share 663 * takes longer than a dirty_writeback_interval interval, then leave a
588 * a queue with that address_space. (Easy: have a global "dirty superblocks" 664 * one-second gap.
589 * list).
590 * 665 *
591 * The inodes to be written are parked on bdi->b_io. They are moved back onto 666 * older_than_this takes precedence over nr_to_write. So we'll only write back
592 * bdi->b_dirty as they are selected for writing. This way, none can be missed 667 * all dirty pages if they are all attached to "old" mappings.
593 * on the writer throttling path, and we get decent balancing between many
594 * throttled threads: we don't want them all piling up on inode_sync_wait.
595 */ 668 */
596static void generic_sync_sb_inodes(struct super_block *sb, 669static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
597 struct writeback_control *wbc) 670 struct super_block *sb,
671 enum writeback_sync_modes sync_mode, int for_kupdate)
598{ 672{
599 struct backing_dev_info *bdi; 673 struct writeback_control wbc = {
600 674 .bdi = wb->bdi,
601 if (!wbc->bdi) { 675 .sb = sb,
602 mutex_lock(&bdi_lock); 676 .sync_mode = sync_mode,
603 list_for_each_entry(bdi, &bdi_list, bdi_list) 677 .older_than_this = NULL,
604 generic_sync_bdi_inodes(bdi, wbc, sb); 678 .for_kupdate = for_kupdate,
605 mutex_unlock(&bdi_lock); 679 .range_cyclic = 1,
606 } else 680 };
607 generic_sync_bdi_inodes(wbc->bdi, wbc, sb); 681 unsigned long oldest_jif;
682 long wrote = 0;
608 683
609 if (wbc->sync_mode == WB_SYNC_ALL) { 684 if (wbc.for_kupdate) {
610 struct inode *inode, *old_inode = NULL; 685 wbc.older_than_this = &oldest_jif;
686 oldest_jif = jiffies -
687 msecs_to_jiffies(dirty_expire_interval * 10);
688 }
611 689
612 spin_lock(&inode_lock); 690 for (;;) {
691 /*
692 * Don't flush anything for non-integrity writeback where
693 * no nr_pages was given
694 */
695 if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
696 break;
613 697
614 /* 698 /*
615 * Data integrity sync. Must wait for all pages under writeback, 699 * If no specific pages were given and this is just a
616 * because there may have been pages dirtied before our sync 700 * periodic background writeout and we are below the
617 * call, but which had writeout started before we write it out. 701 * background dirty threshold, don't do anything
618 * In which case, the inode may not be on the dirty list, but
619 * we still have to wait for that writeout.
620 */ 702 */
621 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 703 if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
622 struct address_space *mapping; 704 break;
623 705
624 if (inode->i_state & 706 wbc.more_io = 0;
625 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 707 wbc.encountered_congestion = 0;
626 continue; 708 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
627 mapping = inode->i_mapping; 709 wbc.pages_skipped = 0;
628 if (mapping->nrpages == 0) 710 writeback_inodes_wb(wb, &wbc);
711 nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
712 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
713
714 /*
715 * If we ran out of stuff to write, bail unless more_io got set
716 */
717 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
718 if (wbc.more_io && !wbc.for_kupdate)
629 continue; 719 continue;
630 __iget(inode); 720 break;
631 spin_unlock(&inode_lock); 721 }
722 }
723
724 return wrote;
725}
726
727/*
728 * Return the next bdi_work struct that hasn't been processed by this
729 * wb thread yet
730 */
731static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
732 struct bdi_writeback *wb)
733{
734 struct bdi_work *work, *ret = NULL;
735
736 rcu_read_lock();
737
738 list_for_each_entry_rcu(work, &bdi->work_list, list) {
739 if (!test_and_clear_bit(wb->nr, &work->seen))
740 continue;
741
742 ret = work;
743 break;
744 }
745
746 rcu_read_unlock();
747 return ret;
748}
749
750static long wb_check_old_data_flush(struct bdi_writeback *wb)
751{
752 unsigned long expired;
753 long nr_pages;
754
755 expired = wb->last_old_flush +
756 msecs_to_jiffies(dirty_writeback_interval * 10);
757 if (time_before(jiffies, expired))
758 return 0;
759
760 wb->last_old_flush = jiffies;
761 nr_pages = global_page_state(NR_FILE_DIRTY) +
762 global_page_state(NR_UNSTABLE_NFS) +
763 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
764
765 if (nr_pages)
766 return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);
767
768 return 0;
769}
770
771/*
772 * Retrieve work items and do the writeback they describe
773 */
774long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
775{
776 struct backing_dev_info *bdi = wb->bdi;
777 struct bdi_work *work;
778 long nr_pages, wrote = 0;
779
780 while ((work = get_next_work_item(bdi, wb)) != NULL) {
781 enum writeback_sync_modes sync_mode;
782
783 nr_pages = work->nr_pages;
784
785 /*
786 * Override sync mode, in case we must wait for completion
787 */
788 if (force_wait)
789 work->sync_mode = sync_mode = WB_SYNC_ALL;
790 else
791 sync_mode = work->sync_mode;
792
793 /*
794 * If this isn't a data integrity operation, just notify
795 * that we have seen this work and we are now starting it.
796 */
797 if (sync_mode == WB_SYNC_NONE)
798 wb_clear_pending(wb, work);
799
800 wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);
801
802 /*
803 * This is a data integrity writeback, so only do the
804 * notification when we have completed the work.
805 */
806 if (sync_mode == WB_SYNC_ALL)
807 wb_clear_pending(wb, work);
808 }
809
810 /*
811 * Check for periodic writeback, kupdated() style
812 */
813 wrote += wb_check_old_data_flush(wb);
814
815 return wrote;
816}
817
818/*
819 * Handle writeback of dirty data for the device backed by this bdi. Also
820 * wakes up periodically and does kupdated style flushing.
821 */
822int bdi_writeback_task(struct bdi_writeback *wb)
823{
824 unsigned long last_active = jiffies;
825 unsigned long wait_jiffies = -1UL;
826 long pages_written;
827
828 while (!kthread_should_stop()) {
829 pages_written = wb_do_writeback(wb, 0);
830
831 if (pages_written)
832 last_active = jiffies;
833 else if (wait_jiffies != -1UL) {
834 unsigned long max_idle;
835
632 /* 836 /*
633 * We hold a reference to 'inode' so it couldn't have 837 * Longest period of inactivity that we tolerate. If we
634 * been removed from s_inodes list while we dropped the 838 * see dirty data again later, the task will get
635 * inode_lock. We cannot iput the inode now as we can 839 * recreated automatically.
636 * be holding the last reference and we cannot iput it
637 * under inode_lock. So we keep the reference and iput
638 * it later.
639 */ 840 */
640 iput(old_inode); 841 max_idle = max(5UL * 60 * HZ, wait_jiffies);
641 old_inode = inode; 842 if (time_after(jiffies, max_idle + last_active))
843 break;
844 }
845
846 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
847 set_current_state(TASK_INTERRUPTIBLE);
848 schedule_timeout(wait_jiffies);
849 try_to_freeze();
850 }
851
852 return 0;
853}
854
855/*
856 * Schedule writeback for all backing devices. Expensive! If this is a data
857 * integrity operation, writeback will be complete when this returns. If
858 * we are simply called for WB_SYNC_NONE, then writeback will merely be
859 * scheduled to run.
860 */
861static void bdi_writeback_all(struct writeback_control *wbc)
862{
863 const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
864 struct backing_dev_info *bdi;
865 struct bdi_work *work;
866 LIST_HEAD(list);
867
868restart:
869 spin_lock(&bdi_lock);
870
871 list_for_each_entry(bdi, &bdi_list, bdi_list) {
872 struct bdi_work *work;
873
874 if (!bdi_has_dirty_io(bdi))
875 continue;
642 876
643 filemap_fdatawait(mapping); 877 /*
878 * If work allocation fails, do the writes inline. We drop
879 * the lock and restart the list writeout. This should be OK,
880 * since this happens rarely and because the writeout should
881 * eventually make more free memory available.
882 */
883 work = bdi_alloc_work(wbc);
884 if (!work) {
885 struct writeback_control __wbc;
644 886
645 cond_resched(); 887 /*
888 * Not a data integrity writeout, just continue
889 */
890 if (!must_wait)
891 continue;
646 892
647 spin_lock(&inode_lock); 893 spin_unlock(&bdi_lock);
894 __wbc = *wbc;
895 __wbc.bdi = bdi;
896 writeback_inodes_wbc(&__wbc);
897 goto restart;
648 } 898 }
649 spin_unlock(&inode_lock); 899 if (must_wait)
650 iput(old_inode); 900 list_add_tail(&work->wait_list, &list);
901
902 bdi_queue_work(bdi, work);
903 }
904
905 spin_unlock(&bdi_lock);
906
907 /*
908 * If this is for WB_SYNC_ALL, wait for pending work to complete
909 * before returning.
910 */
911 while (!list_empty(&list)) {
912 work = list_entry(list.next, struct bdi_work, wait_list);
913 list_del(&work->wait_list);
914 bdi_wait_on_work_clear(work);
915 call_rcu(&work->rcu_head, bdi_work_free);
651 } 916 }
652} 917}
653 918
654/* 919/*
655 * Start writeback of dirty pagecache data against all unlocked inodes. 920 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
921 * the whole world.
922 */
923void wakeup_flusher_threads(long nr_pages)
924{
925 struct writeback_control wbc = {
926 .sync_mode = WB_SYNC_NONE,
927 .older_than_this = NULL,
928 .range_cyclic = 1,
929 };
930
931 if (nr_pages == 0)
932 nr_pages = global_page_state(NR_FILE_DIRTY) +
933 global_page_state(NR_UNSTABLE_NFS);
934 wbc.nr_to_write = nr_pages;
935 bdi_writeback_all(&wbc);
936}
937
938static noinline void block_dump___mark_inode_dirty(struct inode *inode)
939{
940 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
941 struct dentry *dentry;
942 const char *name = "?";
943
944 dentry = d_find_alias(inode);
945 if (dentry) {
946 spin_lock(&dentry->d_lock);
947 name = (const char *) dentry->d_name.name;
948 }
949 printk(KERN_DEBUG
950 "%s(%d): dirtied inode %lu (%s) on %s\n",
951 current->comm, task_pid_nr(current), inode->i_ino,
952 name, inode->i_sb->s_id);
953 if (dentry) {
954 spin_unlock(&dentry->d_lock);
955 dput(dentry);
956 }
957 }
958}
959
960/**
961 * __mark_inode_dirty - internal function
962 * @inode: inode to mark
963 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
964 * Mark an inode as dirty. Callers should use mark_inode_dirty or
965 * mark_inode_dirty_sync.
656 * 966 *
657 * Note: 967 * Put the inode on the super block's dirty list.
658 * We don't need to grab a reference to superblock here. If it has non-empty 968 *
659 * ->b_dirty it's hadn't been killed yet and kill_super() won't proceed 969 * CAREFUL! We mark it dirty unconditionally, but move it onto the
660 * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all 970 * dirty list only if it is hashed or if it refers to a blockdev.
661 * empty. Since __sync_single_inode() regains inode_lock before it finally moves 971 * If it was not hashed, it will never be added to the dirty list
662 * inode from superblock lists we are OK. 972 * even if it is later hashed, as it will have been marked dirty already.
973 *
974 * In short, make sure you hash any inodes _before_ you start marking
975 * them dirty.
663 * 976 *
664 * If `older_than_this' is non-zero then only flush inodes which have a 977 * This function *must* be atomic for the I_DIRTY_PAGES case -
665 * flushtime older than *older_than_this. 978 * set_page_dirty() is called under spinlock in several places.
666 * 979 *
667 * If `bdi' is non-zero then we will scan the first inode against each 980 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
668 * superblock until we find the matching ones. One group will be the dirty 981 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
669 * inodes against a filesystem. Then when we hit the dummy blockdev superblock, 982 * the kernel-internal blockdev inode represents the dirtying time of the
670 * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not 983 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
671 * super-efficient but we're about to do a ton of I/O... 984 * page->mapping->host, so the page-dirtying time is recorded in the internal
985 * blockdev inode.
672 */ 986 */
673void 987void __mark_inode_dirty(struct inode *inode, int flags)
674writeback_inodes(struct writeback_control *wbc)
675{ 988{
676 struct super_block *sb; 989 struct super_block *sb = inode->i_sb;
677 990
678 might_sleep(); 991 /*
679 spin_lock(&sb_lock); 992 * Don't do this for I_DIRTY_PAGES - that doesn't actually
680restart: 993 * dirty the inode itself
681 list_for_each_entry_reverse(sb, &super_blocks, s_list) { 994 */
682 if (sb_has_dirty_inodes(sb)) { 995 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
683 /* we're making our own get_super here */ 996 if (sb->s_op->dirty_inode)
684 sb->s_count++; 997 sb->s_op->dirty_inode(inode);
685 spin_unlock(&sb_lock); 998 }
686 /* 999
687 * If we can't get the readlock, there's no sense in 1000 /*
688 * waiting around, most of the time the FS is going to 1001 * make sure that changes are seen by all cpus before we test i_state
689 * be unmounted by the time it is released. 1002 * -- mikulas
690 */ 1003 */
691 if (down_read_trylock(&sb->s_umount)) { 1004 smp_mb();
692 if (sb->s_root) 1005
693 generic_sync_sb_inodes(sb, wbc); 1006 /* avoid the locking if we can */
694 up_read(&sb->s_umount); 1007 if ((inode->i_state & flags) == flags)
695 } 1008 return;
696 spin_lock(&sb_lock); 1009
697 if (__put_super_and_need_restart(sb)) 1010 if (unlikely(block_dump))
698 goto restart; 1011 block_dump___mark_inode_dirty(inode);
1012
1013 spin_lock(&inode_lock);
1014 if ((inode->i_state & flags) != flags) {
1015 const int was_dirty = inode->i_state & I_DIRTY;
1016
1017 inode->i_state |= flags;
1018
1019 /*
1020 * If the inode is being synced, just update its dirty state.
1021 * The unlocker will place the inode on the appropriate
1022 * superblock list, based upon its state.
1023 */
1024 if (inode->i_state & I_SYNC)
1025 goto out;
1026
1027 /*
1028 * Only add valid (hashed) inodes to the superblock's
1029 * dirty list. Add blockdev inodes as well.
1030 */
1031 if (!S_ISBLK(inode->i_mode)) {
1032 if (hlist_unhashed(&inode->i_hash))
1033 goto out;
1034 }
1035 if (inode->i_state & (I_FREEING|I_CLEAR))
1036 goto out;
1037
1038 /*
1039 * If the inode was already on b_dirty/b_io/b_more_io, don't
1040 * reposition it (that would break b_dirty time-ordering).
1041 */
1042 if (!was_dirty) {
1043 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1044
1045 inode->dirtied_when = jiffies;
1046 list_move(&inode->i_list, &wb->b_dirty);
699 } 1047 }
700 if (wbc->nr_to_write <= 0)
701 break;
702 } 1048 }
703 spin_unlock(&sb_lock); 1049out:
1050 spin_unlock(&inode_lock);
1051}
1052EXPORT_SYMBOL(__mark_inode_dirty);
1053
1054/*
1055 * Write out a superblock's list of dirty inodes. A wait will be performed
1056 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1057 *
1058 * If older_than_this is non-NULL, then only write out inodes which
1059 * had their first dirtying at a time earlier than *older_than_this.
1060 *
1061 * If we're a pdlfush thread, then implement pdflush collision avoidance
1062 * against the entire list.
1063 *
1064 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
1065 * This function assumes that the blockdev superblock's inodes are backed by
1066 * a variety of queues, so all inodes are searched. For other superblocks,
1067 * assume that all inodes are backed by the same queue.
1068 *
1069 * The inodes to be written are parked on bdi->b_io. They are moved back onto
1070 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1071 * on the writer throttling path, and we get decent balancing between many
1072 * throttled threads: we don't want them all piling up on inode_sync_wait.
1073 */
1074static void wait_sb_inodes(struct writeback_control *wbc)
1075{
1076 struct inode *inode, *old_inode = NULL;
1077
1078 /*
1079 * We need to be protected against the filesystem going from
1080 * r/o to r/w or vice versa.
1081 */
1082 WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
1083
1084 spin_lock(&inode_lock);
1085
1086 /*
1087 * Data integrity sync. Must wait for all pages under writeback,
1088 * because there may have been pages dirtied before our sync
1089 * call, but which had writeout started before we write it out.
1090 * In which case, the inode may not be on the dirty list, but
1091 * we still have to wait for that writeout.
1092 */
1093 list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
1094 struct address_space *mapping;
1095
1096 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
1097 continue;
1098 mapping = inode->i_mapping;
1099 if (mapping->nrpages == 0)
1100 continue;
1101 __iget(inode);
1102 spin_unlock(&inode_lock);
1103 /*
1104 * We hold a reference to 'inode' so it couldn't have
1105 * been removed from s_inodes list while we dropped the
1106 * inode_lock. We cannot iput the inode now as we can
1107 * be holding the last reference and we cannot iput it
1108 * under inode_lock. So we keep the reference and iput
1109 * it later.
1110 */
1111 iput(old_inode);
1112 old_inode = inode;
1113
1114 filemap_fdatawait(mapping);
1115
1116 cond_resched();
1117
1118 spin_lock(&inode_lock);
1119 }
1120 spin_unlock(&inode_lock);
1121 iput(old_inode);
704} 1122}
705 1123
706/** 1124/**
@@ -715,6 +1133,7 @@ restart:
715long writeback_inodes_sb(struct super_block *sb) 1133long writeback_inodes_sb(struct super_block *sb)
716{ 1134{
717 struct writeback_control wbc = { 1135 struct writeback_control wbc = {
1136 .sb = sb,
718 .sync_mode = WB_SYNC_NONE, 1137 .sync_mode = WB_SYNC_NONE,
719 .range_start = 0, 1138 .range_start = 0,
720 .range_end = LLONG_MAX, 1139 .range_end = LLONG_MAX,
@@ -727,7 +1146,7 @@ long writeback_inodes_sb(struct super_block *sb)
727 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1146 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
728 1147
729 wbc.nr_to_write = nr_to_write; 1148 wbc.nr_to_write = nr_to_write;
730 generic_sync_sb_inodes(sb, &wbc); 1149 bdi_writeback_all(&wbc);
731 return nr_to_write - wbc.nr_to_write; 1150 return nr_to_write - wbc.nr_to_write;
732} 1151}
733EXPORT_SYMBOL(writeback_inodes_sb); 1152EXPORT_SYMBOL(writeback_inodes_sb);
@@ -742,6 +1161,7 @@ EXPORT_SYMBOL(writeback_inodes_sb);
742long sync_inodes_sb(struct super_block *sb) 1161long sync_inodes_sb(struct super_block *sb)
743{ 1162{
744 struct writeback_control wbc = { 1163 struct writeback_control wbc = {
1164 .sb = sb,
745 .sync_mode = WB_SYNC_ALL, 1165 .sync_mode = WB_SYNC_ALL,
746 .range_start = 0, 1166 .range_start = 0,
747 .range_end = LLONG_MAX, 1167 .range_end = LLONG_MAX,
@@ -749,7 +1169,8 @@ long sync_inodes_sb(struct super_block *sb)
749 long nr_to_write = LONG_MAX; /* doesn't actually matter */ 1169 long nr_to_write = LONG_MAX; /* doesn't actually matter */
750 1170
751 wbc.nr_to_write = nr_to_write; 1171 wbc.nr_to_write = nr_to_write;
752 generic_sync_sb_inodes(sb, &wbc); 1172 bdi_writeback_all(&wbc);
1173 wait_sb_inodes(&wbc);
753 return nr_to_write - wbc.nr_to_write; 1174 return nr_to_write - wbc.nr_to_write;
754} 1175}
755EXPORT_SYMBOL(sync_inodes_sb); 1176EXPORT_SYMBOL(sync_inodes_sb);
diff --git a/fs/super.c b/fs/super.c
index 0d22ce3be4aa..9cda337ddae2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -168,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb)
168 * Drops a temporary reference, frees superblock if there's no 168 * Drops a temporary reference, frees superblock if there's no
169 * references left. 169 * references left.
170 */ 170 */
171static void put_super(struct super_block *sb) 171void put_super(struct super_block *sb)
172{ 172{
173 spin_lock(&sb_lock); 173 spin_lock(&sb_lock);
174 __put_super(sb); 174 __put_super(sb);
diff --git a/fs/sync.c b/fs/sync.c
index 66f210476f40..103cc7fdd3df 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -120,7 +120,7 @@ restart:
120 */ 120 */
121SYSCALL_DEFINE0(sync) 121SYSCALL_DEFINE0(sync)
122{ 122{
123 wakeup_pdflush(0); 123 wakeup_flusher_threads(0);
124 sync_filesystems(0); 124 sync_filesystems(0);
125 sync_filesystems(1); 125 sync_filesystems(1);
126 if (unlikely(laptop_mode)) 126 if (unlikely(laptop_mode))