author     Jens Axboe <jens.axboe@oracle.com>   2009-09-09 03:08:54 -0400
committer  Jens Axboe <jens.axboe@oracle.com>   2009-09-11 03:20:25 -0400
commit     03ba3782e8dcc5b0e1efe440d33084f066e38cae
tree       e5a6513b411de16a46199530ec98ef9b7f1efc50
parent     66f3b8e2e103a0b93b945764d98e9ba46cb926dd
writeback: switch to per-bdi threads for flushing data
This gets rid of pdflush for bdi writeout and kupdated-style cleaning. pdflush writeout suffers from a lack of locality and also requires more threads to handle the same workload, since it has to work in a non-blocking fashion against each queue. This also introduces lumpy behaviour and potential request starvation, since pdflush can be starved for queue access if others are accessing it.

A sample ffsb workload that does random writes to files is about 8% faster here on a simple SATA drive during the benchmark phase. File layout also seems a LOT more smooth in vmstat:

 r  b   swpd   free   buff  cache   si   so   bi     bo   in   cs  us sy id wa
 0  1      0 608848   2652 375372    0    0    0  71024  604   24   1 10 48 42
 0  1      0 549644   2712 433736    0    0    0  60692  505   27   1  8 48 44
 1  0      0 476928   2784 505192    0    0    4  29540  553   24   0  9 53 37
 0  1      0 457972   2808 524008    0    0    0  54876  331   16   0  4 38 58
 0  1      0 366128   2928 614284    0    0    4  92168  710   58   0 13 53 34
 0  1      0 295092   3000 684140    0    0    0  62924  572   23   0  9 53 37
 0  1      0 236592   3064 741704    0    0    4  58256  523   17   0  8 48 44
 0  1      0 165608   3132 811464    0    0    0  57460  560   21   0  8 54 38
 0  1      0 102952   3200 873164    0    0    4  74748  540   29   1 10 48 41
 0  1      0  48604   3252 926472    0    0    0  53248  469   29   0  7 47 45

where vanilla tends to fluctuate a lot in the creation phase:

 r  b   swpd   free   buff  cache   si   so   bi     bo   in   cs  us sy id wa
 1  1      0 678716   5792 303380    0    0    0  74064  565   50   1 11 52 36
 1  0      0 662488   5864 319396    0    0    4    352  302  329   0  2 47 51
 0  1      0 599312   5924 381468    0    0    0  78164  516   55   0  9 51 40
 0  1      0 519952   6008 459516    0    0    4  78156  622   56   1 11 52 37
 1  1      0 436640   6092 541632    0    0    0  82244  622   54   0 11 48 41
 0  1      0 436640   6092 541660    0    0    0      8  152   39   0  0 51 49
 0  1      0 332224   6200 644252    0    0    4 102800  728   46   1 13 49 36
 1  0      0 274492   6260 701056    0    0    4  12328  459   49   0  7 50 43
 0  1      0 211220   6324 763356    0    0    0 106940  515   37   1 10 51 39
 1  0      0 160412   6376 813468    0    0    0   8224  415   43   0  6 49 45
 1  1      0  85980   6452 886556    0    0    4 113516  575   39   1 11 54 34
 0  2      0  85968   6452 886620    0    0    0   1640  158  211   0  0 46 54

A 10-disk test with btrfs performs 26% faster with per-bdi flushing. An SSD-based writeback test on XFS performs over 20% better as well, with the throughput being very stable around 1GB/sec, where pdflush only manages 750MB/sec and fluctuates wildly while doing so. Random buffered writes to many files behave a lot better as well, as do random mmap'ed writes.

A separate thread is added to sync the super blocks. In the long term, adding sync_supers_bdi() functionality could get rid of this thread again.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
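For callers, the visible API change is small: the pdflush kick-off helper goes away and sites poke the per-bdi flusher threads instead. Below is a minimal kernel-context sketch of the new entry point, modelled on the fs/buffer.c and fs/sync.c hunks in the diff further down; the two static helpers are hypothetical names used only for illustration, while wakeup_flusher_threads() and its argument convention come from this patch.

	#include <linux/writeback.h>

	/*
	 * Sketch only: wakeup_flusher_threads() replaces wakeup_pdflush().
	 * A non-zero argument is a page budget; 0 means "write back the
	 * whole world", as sys_sync() does after this patch.
	 */
	static void kick_some_writeback(void)		/* hypothetical helper */
	{
		/* try to clean up to 1024 pages, as free_more_memory() does */
		wakeup_flusher_threads(1024);
	}

	static void kick_full_writeback(void)		/* hypothetical helper */
	{
		/* flush all dirty pages on all backing devices */
		wakeup_flusher_threads(0);
	}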
-rw-r--r--  fs/buffer.c                    2
-rw-r--r--  fs/fs-writeback.c            999
-rw-r--r--  fs/super.c                     2
-rw-r--r--  fs/sync.c                      2
-rw-r--r--  include/linux/backing-dev.h   55
-rw-r--r--  include/linux/fs.h             2
-rw-r--r--  include/linux/writeback.h      8
-rw-r--r--  mm/backing-dev.c             341
-rw-r--r--  mm/page-writeback.c          179
-rw-r--r--  mm/vmscan.c                    2
10 files changed, 1120 insertions, 472 deletions
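The bulk of the change is in fs/fs-writeback.c below, which adds bdi_start_writeback() and the bdi_work queue that each flusher thread consumes. As a rough orientation before reading the diff, here is a hedged sketch of the new submission path, built only from the declarations and writeback_control fields this patch adds (it is not code from the patch itself; the 1024-page budget is an arbitrary example value):

	struct writeback_control wbc = {
		.bdi		  = bdi,		/* backing device to flush */
		.sb		  = NULL,		/* or a super_block to restrict to */
		.sync_mode	  = WB_SYNC_NONE,	/* queue work, don't wait for it */
		.older_than_this  = NULL,
		.range_cyclic	  = 1,
		.nr_to_write	  = 1024,		/* example page budget */
	};

	bdi_start_writeback(&wbc);	/* queues a bdi_work for the bdi's flusher thread */

Per the patch's own comments, WB_SYNC_ALL callers use an on-stack work item and block until the flusher thread has completed it, while WB_SYNC_NONE callers normally return once the kmalloc'ed work item is queued, falling back to an on-stack item (and a wait for it to be picked up) only if that allocation fails.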
diff --git a/fs/buffer.c b/fs/buffer.c
index 28f320fac4d4..90a98865b0cc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -281,7 +281,7 @@ static void free_more_memory(void)
281 struct zone *zone; 281 struct zone *zone;
282 int nid; 282 int nid;
283 283
284 wakeup_pdflush(1024); 284 wakeup_flusher_threads(1024);
285 yield(); 285 yield();
286 286
287 for_each_online_node(nid) { 287 for_each_online_node(nid) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 45ad4bb700e6..7f6dae8aa47f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,6 +19,8 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/kthread.h>
23#include <linux/freezer.h>
22#include <linux/writeback.h> 24#include <linux/writeback.h>
23#include <linux/blkdev.h> 25#include <linux/blkdev.h>
24#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
@@ -27,165 +29,208 @@
27 29
28#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) 30#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
29 31
30/** 32/*
31 * writeback_acquire - attempt to get exclusive writeback access to a device 33 * Work items for the bdi_writeback threads
32 * @bdi: the device's backing_dev_info structure
33 *
34 * It is a waste of resources to have more than one pdflush thread blocked on
35 * a single request queue. Exclusion at the request_queue level is obtained
36 * via a flag in the request_queue's backing_dev_info.state.
37 *
38 * Non-request_queue-backed address_spaces will share default_backing_dev_info,
39 * unless they implement their own. Which is somewhat inefficient, as this
40 * may prevent concurrent writeback against multiple devices.
41 */ 34 */
42static int writeback_acquire(struct backing_dev_info *bdi) 35struct bdi_work {
36 struct list_head list;
37 struct list_head wait_list;
38 struct rcu_head rcu_head;
39
40 unsigned long seen;
41 atomic_t pending;
42
43 struct super_block *sb;
44 unsigned long nr_pages;
45 enum writeback_sync_modes sync_mode;
46
47 unsigned long state;
48};
49
50enum {
51 WS_USED_B = 0,
52 WS_ONSTACK_B,
53};
54
55#define WS_USED (1 << WS_USED_B)
56#define WS_ONSTACK (1 << WS_ONSTACK_B)
57
58static inline bool bdi_work_on_stack(struct bdi_work *work)
59{
60 return test_bit(WS_ONSTACK_B, &work->state);
61}
62
63static inline void bdi_work_init(struct bdi_work *work,
64 struct writeback_control *wbc)
65{
66 INIT_RCU_HEAD(&work->rcu_head);
67 work->sb = wbc->sb;
68 work->nr_pages = wbc->nr_to_write;
69 work->sync_mode = wbc->sync_mode;
70 work->state = WS_USED;
71}
72
73static inline void bdi_work_init_on_stack(struct bdi_work *work,
74 struct writeback_control *wbc)
43{ 75{
44 return !test_and_set_bit(BDI_pdflush, &bdi->state); 76 bdi_work_init(work, wbc);
77 work->state |= WS_ONSTACK;
45} 78}
46 79
47/** 80/**
48 * writeback_in_progress - determine whether there is writeback in progress 81 * writeback_in_progress - determine whether there is writeback in progress
49 * @bdi: the device's backing_dev_info structure. 82 * @bdi: the device's backing_dev_info structure.
50 * 83 *
51 * Determine whether there is writeback in progress against a backing device. 84 * Determine whether there is writeback waiting to be handled against a
85 * backing device.
52 */ 86 */
53int writeback_in_progress(struct backing_dev_info *bdi) 87int writeback_in_progress(struct backing_dev_info *bdi)
54{ 88{
55 return test_bit(BDI_pdflush, &bdi->state); 89 return !list_empty(&bdi->work_list);
56} 90}
57 91
58/** 92static void bdi_work_clear(struct bdi_work *work)
59 * writeback_release - relinquish exclusive writeback access against a device.
60 * @bdi: the device's backing_dev_info structure
61 */
62static void writeback_release(struct backing_dev_info *bdi)
63{ 93{
64 BUG_ON(!writeback_in_progress(bdi)); 94 clear_bit(WS_USED_B, &work->state);
65 clear_bit(BDI_pdflush, &bdi->state); 95 smp_mb__after_clear_bit();
96 wake_up_bit(&work->state, WS_USED_B);
66} 97}
67 98
68static noinline void block_dump___mark_inode_dirty(struct inode *inode) 99static void bdi_work_free(struct rcu_head *head)
69{ 100{
70 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 101 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
71 struct dentry *dentry;
72 const char *name = "?";
73 102
74 dentry = d_find_alias(inode); 103 if (!bdi_work_on_stack(work))
75 if (dentry) { 104 kfree(work);
76 spin_lock(&dentry->d_lock); 105 else
77 name = (const char *) dentry->d_name.name; 106 bdi_work_clear(work);
78 }
79 printk(KERN_DEBUG
80 "%s(%d): dirtied inode %lu (%s) on %s\n",
81 current->comm, task_pid_nr(current), inode->i_ino,
82 name, inode->i_sb->s_id);
83 if (dentry) {
84 spin_unlock(&dentry->d_lock);
85 dput(dentry);
86 }
87 }
88} 107}
89 108
90/** 109static void wb_work_complete(struct bdi_work *work)
91 * __mark_inode_dirty - internal function
92 * @inode: inode to mark
93 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
94 * Mark an inode as dirty. Callers should use mark_inode_dirty or
95 * mark_inode_dirty_sync.
96 *
97 * Put the inode on the super block's dirty list.
98 *
99 * CAREFUL! We mark it dirty unconditionally, but move it onto the
100 * dirty list only if it is hashed or if it refers to a blockdev.
101 * If it was not hashed, it will never be added to the dirty list
102 * even if it is later hashed, as it will have been marked dirty already.
103 *
104 * In short, make sure you hash any inodes _before_ you start marking
105 * them dirty.
106 *
107 * This function *must* be atomic for the I_DIRTY_PAGES case -
108 * set_page_dirty() is called under spinlock in several places.
109 *
110 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
111 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
112 * the kernel-internal blockdev inode represents the dirtying time of the
113 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
114 * page->mapping->host, so the page-dirtying time is recorded in the internal
115 * blockdev inode.
116 */
117void __mark_inode_dirty(struct inode *inode, int flags)
118{ 110{
119 struct super_block *sb = inode->i_sb; 111 const enum writeback_sync_modes sync_mode = work->sync_mode;
120 112
121 /* 113 /*
122 * Don't do this for I_DIRTY_PAGES - that doesn't actually 114 * For allocated work, we can clear the done/seen bit right here.
123 * dirty the inode itself 115 * For on-stack work, we need to postpone both the clear and free
116 * to after the RCU grace period, since the stack could be invalidated
117 * as soon as bdi_work_clear() has done the wakeup.
124 */ 118 */
125 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 119 if (!bdi_work_on_stack(work))
126 if (sb->s_op->dirty_inode) 120 bdi_work_clear(work);
127 sb->s_op->dirty_inode(inode); 121 if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
128 } 122 call_rcu(&work->rcu_head, bdi_work_free);
123}
129 124
125static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
126{
130 /* 127 /*
131 * make sure that changes are seen by all cpus before we test i_state 128 * The caller has retrieved the work arguments from this work,
132 * -- mikulas 129 * drop our reference. If this is the last ref, delete and free it
133 */ 130 */
134 smp_mb(); 131 if (atomic_dec_and_test(&work->pending)) {
132 struct backing_dev_info *bdi = wb->bdi;
135 133
136 /* avoid the locking if we can */ 134 spin_lock(&bdi->wb_lock);
137 if ((inode->i_state & flags) == flags) 135 list_del_rcu(&work->list);
138 return; 136 spin_unlock(&bdi->wb_lock);
139
140 if (unlikely(block_dump))
141 block_dump___mark_inode_dirty(inode);
142 137
143 spin_lock(&inode_lock); 138 wb_work_complete(work);
144 if ((inode->i_state & flags) != flags) { 139 }
145 const int was_dirty = inode->i_state & I_DIRTY; 140}
146 141
147 inode->i_state |= flags; 142static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
143{
144 if (work) {
145 work->seen = bdi->wb_mask;
146 BUG_ON(!work->seen);
147 atomic_set(&work->pending, bdi->wb_cnt);
148 BUG_ON(!bdi->wb_cnt);
148 149
149 /* 150 /*
150 * If the inode is being synced, just update its dirty state. 151 * Make sure stores are seen before it appears on the list
151 * The unlocker will place the inode on the appropriate
152 * superblock list, based upon its state.
153 */ 152 */
154 if (inode->i_state & I_SYNC) 153 smp_mb();
155 goto out;
156 154
157 /* 155 spin_lock(&bdi->wb_lock);
158 * Only add valid (hashed) inodes to the superblock's 156 list_add_tail_rcu(&work->list, &bdi->work_list);
159 * dirty list. Add blockdev inodes as well. 157 spin_unlock(&bdi->wb_lock);
160 */ 158 }
161 if (!S_ISBLK(inode->i_mode)) { 159
162 if (hlist_unhashed(&inode->i_hash)) 160 /*
163 goto out; 161 * If the default thread isn't there, make sure we add it. When
164 } 162 * it gets created and wakes up, we'll run this work.
165 if (inode->i_state & (I_FREEING|I_CLEAR)) 163 */
166 goto out; 164 if (unlikely(list_empty_careful(&bdi->wb_list)))
165 wake_up_process(default_backing_dev_info.wb.task);
166 else {
167 struct bdi_writeback *wb = &bdi->wb;
167 168
168 /* 169 /*
169 * If the inode was already on b_dirty/b_io/b_more_io, don't 170 * If we failed allocating the bdi work item, wake up the wb
170 * reposition it (that would break b_dirty time-ordering). 171 * thread always. As a safety precaution, it'll flush out
172 * everything
171 */ 173 */
172 if (!was_dirty) { 174 if (!wb_has_dirty_io(wb)) {
173 inode->dirtied_when = jiffies; 175 if (work)
174 list_move(&inode->i_list, 176 wb_clear_pending(wb, work);
175 &inode_to_bdi(inode)->b_dirty); 177 } else if (wb->task)
176 } 178 wake_up_process(wb->task);
177 } 179 }
178out:
179 spin_unlock(&inode_lock);
180} 180}
181 181
182EXPORT_SYMBOL(__mark_inode_dirty); 182/*
183 * Used for on-stack allocated work items. The caller needs to wait until
184 * the wb threads have acked the work before it's safe to continue.
185 */
186static void bdi_wait_on_work_clear(struct bdi_work *work)
187{
188 wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
189 TASK_UNINTERRUPTIBLE);
190}
183 191
184static int write_inode(struct inode *inode, int sync) 192static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
185{ 193{
186 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 194 struct bdi_work *work;
187 return inode->i_sb->s_op->write_inode(inode, sync); 195
188 return 0; 196 work = kmalloc(sizeof(*work), GFP_ATOMIC);
197 if (work)
198 bdi_work_init(work, wbc);
199
200 return work;
201}
202
203void bdi_start_writeback(struct writeback_control *wbc)
204{
205 const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
206 struct bdi_work work_stack, *work = NULL;
207
208 if (!must_wait)
209 work = bdi_alloc_work(wbc);
210
211 if (!work) {
212 work = &work_stack;
213 bdi_work_init_on_stack(work, wbc);
214 }
215
216 bdi_queue_work(wbc->bdi, work);
217
218 /*
219 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
220 * complete. If not, we only need to wait for the work to be started,
221 * if we allocated it on-stack. We use the same mechanism, if the
222 * wait bit is set in the bdi_work struct, then threads will not
223 * clear pending until after they are done.
224 *
225 * Note that work == &work_stack if must_wait is true, so we don't
226 * need to do call_rcu() here ever, since the completion path will
227 * have done that for us.
228 */
229 if (must_wait || work == &work_stack) {
230 bdi_wait_on_work_clear(work);
231 if (work != &work_stack)
232 call_rcu(&work->rcu_head, bdi_work_free);
233 }
189} 234}
190 235
191/* 236/*
@@ -199,16 +244,16 @@ static int write_inode(struct inode *inode, int sync)
199 */ 244 */
200static void redirty_tail(struct inode *inode) 245static void redirty_tail(struct inode *inode)
201{ 246{
202 struct backing_dev_info *bdi = inode_to_bdi(inode); 247 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
203 248
204 if (!list_empty(&bdi->b_dirty)) { 249 if (!list_empty(&wb->b_dirty)) {
205 struct inode *tail; 250 struct inode *tail;
206 251
207 tail = list_entry(bdi->b_dirty.next, struct inode, i_list); 252 tail = list_entry(wb->b_dirty.next, struct inode, i_list);
208 if (time_before(inode->dirtied_when, tail->dirtied_when)) 253 if (time_before(inode->dirtied_when, tail->dirtied_when))
209 inode->dirtied_when = jiffies; 254 inode->dirtied_when = jiffies;
210 } 255 }
211 list_move(&inode->i_list, &bdi->b_dirty); 256 list_move(&inode->i_list, &wb->b_dirty);
212} 257}
213 258
214/* 259/*
@@ -216,7 +261,9 @@ static void redirty_tail(struct inode *inode)
216 */ 261 */
217static void requeue_io(struct inode *inode) 262static void requeue_io(struct inode *inode)
218{ 263{
219 list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io); 264 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
265
266 list_move(&inode->i_list, &wb->b_more_io);
220} 267}
221 268
222static void inode_sync_complete(struct inode *inode) 269static void inode_sync_complete(struct inode *inode)
@@ -263,52 +310,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
263/* 310/*
264 * Queue all expired dirty inodes for io, eldest first. 311 * Queue all expired dirty inodes for io, eldest first.
265 */ 312 */
266static void queue_io(struct backing_dev_info *bdi, 313static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
267 unsigned long *older_than_this)
268{ 314{
269 list_splice_init(&bdi->b_more_io, bdi->b_io.prev); 315 list_splice_init(&wb->b_more_io, wb->b_io.prev);
270 move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this); 316 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
271} 317}
272 318
273static int sb_on_inode_list(struct super_block *sb, struct list_head *list) 319static int write_inode(struct inode *inode, int sync)
274{
275 struct inode *inode;
276 int ret = 0;
277
278 spin_lock(&inode_lock);
279 list_for_each_entry(inode, list, i_list) {
280 if (inode->i_sb == sb) {
281 ret = 1;
282 break;
283 }
284 }
285 spin_unlock(&inode_lock);
286 return ret;
287}
288
289int sb_has_dirty_inodes(struct super_block *sb)
290{ 320{
291 struct backing_dev_info *bdi; 321 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
292 int ret = 0; 322 return inode->i_sb->s_op->write_inode(inode, sync);
293 323 return 0;
294 /*
295 * This is REALLY expensive right now, but it'll go away
296 * when the bdi writeback is introduced
297 */
298 mutex_lock(&bdi_lock);
299 list_for_each_entry(bdi, &bdi_list, bdi_list) {
300 if (sb_on_inode_list(sb, &bdi->b_dirty) ||
301 sb_on_inode_list(sb, &bdi->b_io) ||
302 sb_on_inode_list(sb, &bdi->b_more_io)) {
303 ret = 1;
304 break;
305 }
306 }
307 mutex_unlock(&bdi_lock);
308
309 return ret;
310} 324}
311EXPORT_SYMBOL(sb_has_dirty_inodes);
312 325
313/* 326/*
314 * Wait for writeback on an inode to complete. 327 * Wait for writeback on an inode to complete.
@@ -466,20 +479,71 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
466 return ret; 479 return ret;
467} 480}
468 481
469static void generic_sync_bdi_inodes(struct backing_dev_info *bdi, 482/*
470 struct writeback_control *wbc, 483 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
471 struct super_block *sb) 484 * before calling writeback. So make sure that we do pin it, so it doesn't
485 * go away while we are writing inodes from it.
486 *
487 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
488 * 1 if we failed.
489 */
490static int pin_sb_for_writeback(struct writeback_control *wbc,
491 struct inode *inode)
492{
493 struct super_block *sb = inode->i_sb;
494
495 /*
496 * Caller must already hold the ref for this
497 */
498 if (wbc->sync_mode == WB_SYNC_ALL) {
499 WARN_ON(!rwsem_is_locked(&sb->s_umount));
500 return 0;
501 }
502
503 spin_lock(&sb_lock);
504 sb->s_count++;
505 if (down_read_trylock(&sb->s_umount)) {
506 if (sb->s_root) {
507 spin_unlock(&sb_lock);
508 return 0;
509 }
510 /*
511 * umounted, drop rwsem again and fall through to failure
512 */
513 up_read(&sb->s_umount);
514 }
515
516 sb->s_count--;
517 spin_unlock(&sb_lock);
518 return 1;
519}
520
521static void unpin_sb_for_writeback(struct writeback_control *wbc,
522 struct inode *inode)
523{
524 struct super_block *sb = inode->i_sb;
525
526 if (wbc->sync_mode == WB_SYNC_ALL)
527 return;
528
529 up_read(&sb->s_umount);
530 put_super(sb);
531}
532
533static void writeback_inodes_wb(struct bdi_writeback *wb,
534 struct writeback_control *wbc)
472{ 535{
536 struct super_block *sb = wbc->sb;
473 const int is_blkdev_sb = sb_is_blkdev_sb(sb); 537 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
474 const unsigned long start = jiffies; /* livelock avoidance */ 538 const unsigned long start = jiffies; /* livelock avoidance */
475 539
476 spin_lock(&inode_lock); 540 spin_lock(&inode_lock);
477 541
478 if (!wbc->for_kupdate || list_empty(&bdi->b_io)) 542 if (!wbc->for_kupdate || list_empty(&wb->b_io))
479 queue_io(bdi, wbc->older_than_this); 543 queue_io(wb, wbc->older_than_this);
480 544
481 while (!list_empty(&bdi->b_io)) { 545 while (!list_empty(&wb->b_io)) {
482 struct inode *inode = list_entry(bdi->b_io.prev, 546 struct inode *inode = list_entry(wb->b_io.prev,
483 struct inode, i_list); 547 struct inode, i_list);
484 long pages_skipped; 548 long pages_skipped;
485 549
@@ -491,7 +555,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
491 continue; 555 continue;
492 } 556 }
493 557
494 if (!bdi_cap_writeback_dirty(bdi)) { 558 if (!bdi_cap_writeback_dirty(wb->bdi)) {
495 redirty_tail(inode); 559 redirty_tail(inode);
496 if (is_blkdev_sb) { 560 if (is_blkdev_sb) {
497 /* 561 /*
@@ -513,7 +577,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
513 continue; 577 continue;
514 } 578 }
515 579
516 if (wbc->nonblocking && bdi_write_congested(bdi)) { 580 if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
517 wbc->encountered_congestion = 1; 581 wbc->encountered_congestion = 1;
518 if (!is_blkdev_sb) 582 if (!is_blkdev_sb)
519 break; /* Skip a congested fs */ 583 break; /* Skip a congested fs */
@@ -521,13 +585,6 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
521 continue; /* Skip a congested blockdev */ 585 continue; /* Skip a congested blockdev */
522 } 586 }
523 587
524 if (wbc->bdi && bdi != wbc->bdi) {
525 if (!is_blkdev_sb)
526 break; /* fs has the wrong queue */
527 requeue_io(inode);
528 continue; /* blockdev has wrong queue */
529 }
530
531 /* 588 /*
532 * Was this inode dirtied after sync_sb_inodes was called? 589 * Was this inode dirtied after sync_sb_inodes was called?
533 * This keeps sync from extra jobs and livelock. 590 * This keeps sync from extra jobs and livelock.
@@ -535,16 +592,16 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
535 if (inode_dirtied_after(inode, start)) 592 if (inode_dirtied_after(inode, start))
536 break; 593 break;
537 594
538 /* Is another pdflush already flushing this queue? */ 595 if (pin_sb_for_writeback(wbc, inode)) {
539 if (current_is_pdflush() && !writeback_acquire(bdi)) 596 requeue_io(inode);
540 break; 597 continue;
598 }
541 599
542 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 600 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
543 __iget(inode); 601 __iget(inode);
544 pages_skipped = wbc->pages_skipped; 602 pages_skipped = wbc->pages_skipped;
545 writeback_single_inode(inode, wbc); 603 writeback_single_inode(inode, wbc);
546 if (current_is_pdflush()) 604 unpin_sb_for_writeback(wbc, inode);
547 writeback_release(bdi);
548 if (wbc->pages_skipped != pages_skipped) { 605 if (wbc->pages_skipped != pages_skipped) {
549 /* 606 /*
550 * writeback is not making progress due to locked 607 * writeback is not making progress due to locked
@@ -560,7 +617,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
560 wbc->more_io = 1; 617 wbc->more_io = 1;
561 break; 618 break;
562 } 619 }
563 if (!list_empty(&bdi->b_more_io)) 620 if (!list_empty(&wb->b_more_io))
564 wbc->more_io = 1; 621 wbc->more_io = 1;
565 } 622 }
566 623
@@ -568,139 +625,500 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
568 /* Leave any unwritten inodes on b_io */ 625 /* Leave any unwritten inodes on b_io */
569} 626}
570 627
628void writeback_inodes_wbc(struct writeback_control *wbc)
629{
630 struct backing_dev_info *bdi = wbc->bdi;
631
632 writeback_inodes_wb(&bdi->wb, wbc);
633}
634
571/* 635/*
572 * Write out a superblock's list of dirty inodes. A wait will be performed 636 * The maximum number of pages to writeout in a single bdi flush/kupdate
573 * upon no inodes, all inodes or the final one, depending upon sync_mode. 637 * operation. We do this so we don't hold I_SYNC against an inode for
574 * 638 * enormous amounts of time, which would block a userspace task which has
575 * If older_than_this is non-NULL, then only write out inodes which 639 * been forced to throttle against that inode. Also, the code reevaluates
576 * had their first dirtying at a time earlier than *older_than_this. 640 * the dirty each time it has written this many pages.
577 * 641 */
578 * If we're a pdlfush thread, then implement pdflush collision avoidance 642#define MAX_WRITEBACK_PAGES 1024
579 * against the entire list. 643
644static inline bool over_bground_thresh(void)
645{
646 unsigned long background_thresh, dirty_thresh;
647
648 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
649
650 return (global_page_state(NR_FILE_DIRTY) +
651 global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
652}
653
654/*
655 * Explicit flushing or periodic writeback of "old" data.
580 * 656 *
581 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 657 * Define "old": the first time one of an inode's pages is dirtied, we mark the
582 * This function assumes that the blockdev superblock's inodes are backed by 658 * dirtying-time in the inode's address_space. So this periodic writeback code
583 * a variety of queues, so all inodes are searched. For other superblocks, 659 * just walks the superblock inode list, writing back any inodes which are
584 * assume that all inodes are backed by the same queue. 660 * older than a specific point in time.
585 * 661 *
586 * FIXME: this linear search could get expensive with many fileystems. But 662 * Try to run once per dirty_writeback_interval. But if a writeback event
587 * how to fix? We need to go from an address_space to all inodes which share 663 * takes longer than a dirty_writeback_interval interval, then leave a
588 * a queue with that address_space. (Easy: have a global "dirty superblocks" 664 * one-second gap.
589 * list).
590 * 665 *
591 * The inodes to be written are parked on bdi->b_io. They are moved back onto 666 * older_than_this takes precedence over nr_to_write. So we'll only write back
592 * bdi->b_dirty as they are selected for writing. This way, none can be missed 667 * all dirty pages if they are all attached to "old" mappings.
593 * on the writer throttling path, and we get decent balancing between many
594 * throttled threads: we don't want them all piling up on inode_sync_wait.
595 */ 668 */
596static void generic_sync_sb_inodes(struct super_block *sb, 669static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
597 struct writeback_control *wbc) 670 struct super_block *sb,
671 enum writeback_sync_modes sync_mode, int for_kupdate)
598{ 672{
599 struct backing_dev_info *bdi; 673 struct writeback_control wbc = {
600 674 .bdi = wb->bdi,
601 if (!wbc->bdi) { 675 .sb = sb,
602 mutex_lock(&bdi_lock); 676 .sync_mode = sync_mode,
603 list_for_each_entry(bdi, &bdi_list, bdi_list) 677 .older_than_this = NULL,
604 generic_sync_bdi_inodes(bdi, wbc, sb); 678 .for_kupdate = for_kupdate,
605 mutex_unlock(&bdi_lock); 679 .range_cyclic = 1,
606 } else 680 };
607 generic_sync_bdi_inodes(wbc->bdi, wbc, sb); 681 unsigned long oldest_jif;
682 long wrote = 0;
608 683
609 if (wbc->sync_mode == WB_SYNC_ALL) { 684 if (wbc.for_kupdate) {
610 struct inode *inode, *old_inode = NULL; 685 wbc.older_than_this = &oldest_jif;
686 oldest_jif = jiffies -
687 msecs_to_jiffies(dirty_expire_interval * 10);
688 }
611 689
612 spin_lock(&inode_lock); 690 for (;;) {
691 /*
692 * Don't flush anything for non-integrity writeback where
693 * no nr_pages was given
694 */
695 if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
696 break;
613 697
614 /* 698 /*
615 * Data integrity sync. Must wait for all pages under writeback, 699 * If no specific pages were given and this is just a
616 * because there may have been pages dirtied before our sync 700 * periodic background writeout and we are below the
617 * call, but which had writeout started before we write it out. 701 * background dirty threshold, don't do anything
618 * In which case, the inode may not be on the dirty list, but
619 * we still have to wait for that writeout.
620 */ 702 */
621 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 703 if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
622 struct address_space *mapping; 704 break;
623 705
624 if (inode->i_state & 706 wbc.more_io = 0;
625 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 707 wbc.encountered_congestion = 0;
626 continue; 708 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
627 mapping = inode->i_mapping; 709 wbc.pages_skipped = 0;
628 if (mapping->nrpages == 0) 710 writeback_inodes_wb(wb, &wbc);
711 nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
712 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
713
714 /*
715 * If we ran out of stuff to write, bail unless more_io got set
716 */
717 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
718 if (wbc.more_io && !wbc.for_kupdate)
629 continue; 719 continue;
630 __iget(inode); 720 break;
631 spin_unlock(&inode_lock); 721 }
722 }
723
724 return wrote;
725}
726
727/*
728 * Return the next bdi_work struct that hasn't been processed by this
729 * wb thread yet
730 */
731static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
732 struct bdi_writeback *wb)
733{
734 struct bdi_work *work, *ret = NULL;
735
736 rcu_read_lock();
737
738 list_for_each_entry_rcu(work, &bdi->work_list, list) {
739 if (!test_and_clear_bit(wb->nr, &work->seen))
740 continue;
741
742 ret = work;
743 break;
744 }
745
746 rcu_read_unlock();
747 return ret;
748}
749
750static long wb_check_old_data_flush(struct bdi_writeback *wb)
751{
752 unsigned long expired;
753 long nr_pages;
754
755 expired = wb->last_old_flush +
756 msecs_to_jiffies(dirty_writeback_interval * 10);
757 if (time_before(jiffies, expired))
758 return 0;
759
760 wb->last_old_flush = jiffies;
761 nr_pages = global_page_state(NR_FILE_DIRTY) +
762 global_page_state(NR_UNSTABLE_NFS) +
763 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
764
765 if (nr_pages)
766 return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);
767
768 return 0;
769}
770
771/*
772 * Retrieve work items and do the writeback they describe
773 */
774long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
775{
776 struct backing_dev_info *bdi = wb->bdi;
777 struct bdi_work *work;
778 long nr_pages, wrote = 0;
779
780 while ((work = get_next_work_item(bdi, wb)) != NULL) {
781 enum writeback_sync_modes sync_mode;
782
783 nr_pages = work->nr_pages;
784
785 /*
786 * Override sync mode, in case we must wait for completion
787 */
788 if (force_wait)
789 work->sync_mode = sync_mode = WB_SYNC_ALL;
790 else
791 sync_mode = work->sync_mode;
792
793 /*
794 * If this isn't a data integrity operation, just notify
795 * that we have seen this work and we are now starting it.
796 */
797 if (sync_mode == WB_SYNC_NONE)
798 wb_clear_pending(wb, work);
799
800 wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);
801
802 /*
803 * This is a data integrity writeback, so only do the
804 * notification when we have completed the work.
805 */
806 if (sync_mode == WB_SYNC_ALL)
807 wb_clear_pending(wb, work);
808 }
809
810 /*
811 * Check for periodic writeback, kupdated() style
812 */
813 wrote += wb_check_old_data_flush(wb);
814
815 return wrote;
816}
817
818/*
819 * Handle writeback of dirty data for the device backed by this bdi. Also
820 * wakes up periodically and does kupdated style flushing.
821 */
822int bdi_writeback_task(struct bdi_writeback *wb)
823{
824 unsigned long last_active = jiffies;
825 unsigned long wait_jiffies = -1UL;
826 long pages_written;
827
828 while (!kthread_should_stop()) {
829 pages_written = wb_do_writeback(wb, 0);
830
831 if (pages_written)
832 last_active = jiffies;
833 else if (wait_jiffies != -1UL) {
834 unsigned long max_idle;
835
632 /* 836 /*
633 * We hold a reference to 'inode' so it couldn't have 837 * Longest period of inactivity that we tolerate. If we
634 * been removed from s_inodes list while we dropped the 838 * see dirty data again later, the task will get
635 * inode_lock. We cannot iput the inode now as we can 839 * recreated automatically.
636 * be holding the last reference and we cannot iput it
637 * under inode_lock. So we keep the reference and iput
638 * it later.
639 */ 840 */
640 iput(old_inode); 841 max_idle = max(5UL * 60 * HZ, wait_jiffies);
641 old_inode = inode; 842 if (time_after(jiffies, max_idle + last_active))
843 break;
844 }
845
846 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
847 set_current_state(TASK_INTERRUPTIBLE);
848 schedule_timeout(wait_jiffies);
849 try_to_freeze();
850 }
851
852 return 0;
853}
854
855/*
856 * Schedule writeback for all backing devices. Expensive! If this is a data
857 * integrity operation, writeback will be complete when this returns. If
858 * we are simply called for WB_SYNC_NONE, then writeback will merely be
859 * scheduled to run.
860 */
861static void bdi_writeback_all(struct writeback_control *wbc)
862{
863 const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
864 struct backing_dev_info *bdi;
865 struct bdi_work *work;
866 LIST_HEAD(list);
867
868restart:
869 spin_lock(&bdi_lock);
870
871 list_for_each_entry(bdi, &bdi_list, bdi_list) {
872 struct bdi_work *work;
873
874 if (!bdi_has_dirty_io(bdi))
875 continue;
642 876
643 filemap_fdatawait(mapping); 877 /*
878 * If work allocation fails, do the writes inline. We drop
879 * the lock and restart the list writeout. This should be OK,
880 * since this happens rarely and because the writeout should
881 * eventually make more free memory available.
882 */
883 work = bdi_alloc_work(wbc);
884 if (!work) {
885 struct writeback_control __wbc;
644 886
645 cond_resched(); 887 /*
888 * Not a data integrity writeout, just continue
889 */
890 if (!must_wait)
891 continue;
646 892
647 spin_lock(&inode_lock); 893 spin_unlock(&bdi_lock);
894 __wbc = *wbc;
895 __wbc.bdi = bdi;
896 writeback_inodes_wbc(&__wbc);
897 goto restart;
648 } 898 }
649 spin_unlock(&inode_lock); 899 if (must_wait)
650 iput(old_inode); 900 list_add_tail(&work->wait_list, &list);
901
902 bdi_queue_work(bdi, work);
903 }
904
905 spin_unlock(&bdi_lock);
906
907 /*
908 * If this is for WB_SYNC_ALL, wait for pending work to complete
909 * before returning.
910 */
911 while (!list_empty(&list)) {
912 work = list_entry(list.next, struct bdi_work, wait_list);
913 list_del(&work->wait_list);
914 bdi_wait_on_work_clear(work);
915 call_rcu(&work->rcu_head, bdi_work_free);
651 } 916 }
652} 917}
653 918
654/* 919/*
655 * Start writeback of dirty pagecache data against all unlocked inodes. 920 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
921 * the whole world.
922 */
923void wakeup_flusher_threads(long nr_pages)
924{
925 struct writeback_control wbc = {
926 .sync_mode = WB_SYNC_NONE,
927 .older_than_this = NULL,
928 .range_cyclic = 1,
929 };
930
931 if (nr_pages == 0)
932 nr_pages = global_page_state(NR_FILE_DIRTY) +
933 global_page_state(NR_UNSTABLE_NFS);
934 wbc.nr_to_write = nr_pages;
935 bdi_writeback_all(&wbc);
936}
937
938static noinline void block_dump___mark_inode_dirty(struct inode *inode)
939{
940 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
941 struct dentry *dentry;
942 const char *name = "?";
943
944 dentry = d_find_alias(inode);
945 if (dentry) {
946 spin_lock(&dentry->d_lock);
947 name = (const char *) dentry->d_name.name;
948 }
949 printk(KERN_DEBUG
950 "%s(%d): dirtied inode %lu (%s) on %s\n",
951 current->comm, task_pid_nr(current), inode->i_ino,
952 name, inode->i_sb->s_id);
953 if (dentry) {
954 spin_unlock(&dentry->d_lock);
955 dput(dentry);
956 }
957 }
958}
959
960/**
961 * __mark_inode_dirty - internal function
962 * @inode: inode to mark
963 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
964 * Mark an inode as dirty. Callers should use mark_inode_dirty or
965 * mark_inode_dirty_sync.
656 * 966 *
657 * Note: 967 * Put the inode on the super block's dirty list.
658 * We don't need to grab a reference to superblock here. If it has non-empty 968 *
659 * ->b_dirty it's hadn't been killed yet and kill_super() won't proceed 969 * CAREFUL! We mark it dirty unconditionally, but move it onto the
660 * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all 970 * dirty list only if it is hashed or if it refers to a blockdev.
661 * empty. Since __sync_single_inode() regains inode_lock before it finally moves 971 * If it was not hashed, it will never be added to the dirty list
662 * inode from superblock lists we are OK. 972 * even if it is later hashed, as it will have been marked dirty already.
973 *
974 * In short, make sure you hash any inodes _before_ you start marking
975 * them dirty.
663 * 976 *
664 * If `older_than_this' is non-zero then only flush inodes which have a 977 * This function *must* be atomic for the I_DIRTY_PAGES case -
665 * flushtime older than *older_than_this. 978 * set_page_dirty() is called under spinlock in several places.
666 * 979 *
667 * If `bdi' is non-zero then we will scan the first inode against each 980 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
668 * superblock until we find the matching ones. One group will be the dirty 981 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
669 * inodes against a filesystem. Then when we hit the dummy blockdev superblock, 982 * the kernel-internal blockdev inode represents the dirtying time of the
670 * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not 983 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
671 * super-efficient but we're about to do a ton of I/O... 984 * page->mapping->host, so the page-dirtying time is recorded in the internal
985 * blockdev inode.
672 */ 986 */
673void 987void __mark_inode_dirty(struct inode *inode, int flags)
674writeback_inodes(struct writeback_control *wbc)
675{ 988{
676 struct super_block *sb; 989 struct super_block *sb = inode->i_sb;
677 990
678 might_sleep(); 991 /*
679 spin_lock(&sb_lock); 992 * Don't do this for I_DIRTY_PAGES - that doesn't actually
680restart: 993 * dirty the inode itself
681 list_for_each_entry_reverse(sb, &super_blocks, s_list) { 994 */
682 if (sb_has_dirty_inodes(sb)) { 995 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
683 /* we're making our own get_super here */ 996 if (sb->s_op->dirty_inode)
684 sb->s_count++; 997 sb->s_op->dirty_inode(inode);
685 spin_unlock(&sb_lock); 998 }
686 /* 999
687 * If we can't get the readlock, there's no sense in 1000 /*
688 * waiting around, most of the time the FS is going to 1001 * make sure that changes are seen by all cpus before we test i_state
689 * be unmounted by the time it is released. 1002 * -- mikulas
690 */ 1003 */
691 if (down_read_trylock(&sb->s_umount)) { 1004 smp_mb();
692 if (sb->s_root) 1005
693 generic_sync_sb_inodes(sb, wbc); 1006 /* avoid the locking if we can */
694 up_read(&sb->s_umount); 1007 if ((inode->i_state & flags) == flags)
695 } 1008 return;
696 spin_lock(&sb_lock); 1009
697 if (__put_super_and_need_restart(sb)) 1010 if (unlikely(block_dump))
698 goto restart; 1011 block_dump___mark_inode_dirty(inode);
1012
1013 spin_lock(&inode_lock);
1014 if ((inode->i_state & flags) != flags) {
1015 const int was_dirty = inode->i_state & I_DIRTY;
1016
1017 inode->i_state |= flags;
1018
1019 /*
1020 * If the inode is being synced, just update its dirty state.
1021 * The unlocker will place the inode on the appropriate
1022 * superblock list, based upon its state.
1023 */
1024 if (inode->i_state & I_SYNC)
1025 goto out;
1026
1027 /*
1028 * Only add valid (hashed) inodes to the superblock's
1029 * dirty list. Add blockdev inodes as well.
1030 */
1031 if (!S_ISBLK(inode->i_mode)) {
1032 if (hlist_unhashed(&inode->i_hash))
1033 goto out;
1034 }
1035 if (inode->i_state & (I_FREEING|I_CLEAR))
1036 goto out;
1037
1038 /*
1039 * If the inode was already on b_dirty/b_io/b_more_io, don't
1040 * reposition it (that would break b_dirty time-ordering).
1041 */
1042 if (!was_dirty) {
1043 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1044
1045 inode->dirtied_when = jiffies;
1046 list_move(&inode->i_list, &wb->b_dirty);
699 } 1047 }
700 if (wbc->nr_to_write <= 0)
701 break;
702 } 1048 }
703 spin_unlock(&sb_lock); 1049out:
1050 spin_unlock(&inode_lock);
1051}
1052EXPORT_SYMBOL(__mark_inode_dirty);
1053
1054/*
1055 * Write out a superblock's list of dirty inodes. A wait will be performed
1056 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1057 *
1058 * If older_than_this is non-NULL, then only write out inodes which
1059 * had their first dirtying at a time earlier than *older_than_this.
1060 *
1061 * If we're a pdlfush thread, then implement pdflush collision avoidance
1062 * against the entire list.
1063 *
1064 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
1065 * This function assumes that the blockdev superblock's inodes are backed by
1066 * a variety of queues, so all inodes are searched. For other superblocks,
1067 * assume that all inodes are backed by the same queue.
1068 *
1069 * The inodes to be written are parked on bdi->b_io. They are moved back onto
1070 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1071 * on the writer throttling path, and we get decent balancing between many
1072 * throttled threads: we don't want them all piling up on inode_sync_wait.
1073 */
1074static void wait_sb_inodes(struct writeback_control *wbc)
1075{
1076 struct inode *inode, *old_inode = NULL;
1077
1078 /*
1079 * We need to be protected against the filesystem going from
1080 * r/o to r/w or vice versa.
1081 */
1082 WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
1083
1084 spin_lock(&inode_lock);
1085
1086 /*
1087 * Data integrity sync. Must wait for all pages under writeback,
1088 * because there may have been pages dirtied before our sync
1089 * call, but which had writeout started before we write it out.
1090 * In which case, the inode may not be on the dirty list, but
1091 * we still have to wait for that writeout.
1092 */
1093 list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
1094 struct address_space *mapping;
1095
1096 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
1097 continue;
1098 mapping = inode->i_mapping;
1099 if (mapping->nrpages == 0)
1100 continue;
1101 __iget(inode);
1102 spin_unlock(&inode_lock);
1103 /*
1104 * We hold a reference to 'inode' so it couldn't have
1105 * been removed from s_inodes list while we dropped the
1106 * inode_lock. We cannot iput the inode now as we can
1107 * be holding the last reference and we cannot iput it
1108 * under inode_lock. So we keep the reference and iput
1109 * it later.
1110 */
1111 iput(old_inode);
1112 old_inode = inode;
1113
1114 filemap_fdatawait(mapping);
1115
1116 cond_resched();
1117
1118 spin_lock(&inode_lock);
1119 }
1120 spin_unlock(&inode_lock);
1121 iput(old_inode);
704} 1122}
705 1123
706/** 1124/**
@@ -715,6 +1133,7 @@ restart:
715long writeback_inodes_sb(struct super_block *sb) 1133long writeback_inodes_sb(struct super_block *sb)
716{ 1134{
717 struct writeback_control wbc = { 1135 struct writeback_control wbc = {
1136 .sb = sb,
718 .sync_mode = WB_SYNC_NONE, 1137 .sync_mode = WB_SYNC_NONE,
719 .range_start = 0, 1138 .range_start = 0,
720 .range_end = LLONG_MAX, 1139 .range_end = LLONG_MAX,
@@ -727,7 +1146,7 @@ long writeback_inodes_sb(struct super_block *sb)
727 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1146 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
728 1147
729 wbc.nr_to_write = nr_to_write; 1148 wbc.nr_to_write = nr_to_write;
730 generic_sync_sb_inodes(sb, &wbc); 1149 bdi_writeback_all(&wbc);
731 return nr_to_write - wbc.nr_to_write; 1150 return nr_to_write - wbc.nr_to_write;
732} 1151}
733EXPORT_SYMBOL(writeback_inodes_sb); 1152EXPORT_SYMBOL(writeback_inodes_sb);
@@ -742,6 +1161,7 @@ EXPORT_SYMBOL(writeback_inodes_sb);
742long sync_inodes_sb(struct super_block *sb) 1161long sync_inodes_sb(struct super_block *sb)
743{ 1162{
744 struct writeback_control wbc = { 1163 struct writeback_control wbc = {
1164 .sb = sb,
745 .sync_mode = WB_SYNC_ALL, 1165 .sync_mode = WB_SYNC_ALL,
746 .range_start = 0, 1166 .range_start = 0,
747 .range_end = LLONG_MAX, 1167 .range_end = LLONG_MAX,
@@ -749,7 +1169,8 @@ long sync_inodes_sb(struct super_block *sb)
749 long nr_to_write = LONG_MAX; /* doesn't actually matter */ 1169 long nr_to_write = LONG_MAX; /* doesn't actually matter */
750 1170
751 wbc.nr_to_write = nr_to_write; 1171 wbc.nr_to_write = nr_to_write;
752 generic_sync_sb_inodes(sb, &wbc); 1172 bdi_writeback_all(&wbc);
1173 wait_sb_inodes(&wbc);
753 return nr_to_write - wbc.nr_to_write; 1174 return nr_to_write - wbc.nr_to_write;
754} 1175}
755EXPORT_SYMBOL(sync_inodes_sb); 1176EXPORT_SYMBOL(sync_inodes_sb);
diff --git a/fs/super.c b/fs/super.c
index 0d22ce3be4aa..9cda337ddae2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -168,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb)
168 * Drops a temporary reference, frees superblock if there's no 168 * Drops a temporary reference, frees superblock if there's no
169 * references left. 169 * references left.
170 */ 170 */
171static void put_super(struct super_block *sb) 171void put_super(struct super_block *sb)
172{ 172{
173 spin_lock(&sb_lock); 173 spin_lock(&sb_lock);
174 __put_super(sb); 174 __put_super(sb);
diff --git a/fs/sync.c b/fs/sync.c
index 66f210476f40..103cc7fdd3df 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -120,7 +120,7 @@ restart:
120 */ 120 */
121SYSCALL_DEFINE0(sync) 121SYSCALL_DEFINE0(sync)
122{ 122{
123 wakeup_pdflush(0); 123 wakeup_flusher_threads(0);
124 sync_filesystems(0); 124 sync_filesystems(0);
125 sync_filesystems(1); 125 sync_filesystems(1);
126 if (unlikely(laptop_mode)) 126 if (unlikely(laptop_mode))
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 928cd5484f4d..d045f5f615c7 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -13,6 +13,8 @@
13#include <linux/proportions.h> 13#include <linux/proportions.h>
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/sched.h>
17#include <linux/writeback.h>
16#include <asm/atomic.h> 18#include <asm/atomic.h>
17 19
18struct page; 20struct page;
@@ -23,7 +25,8 @@ struct dentry;
23 * Bits in backing_dev_info.state 25 * Bits in backing_dev_info.state
24 */ 26 */
25enum bdi_state { 27enum bdi_state {
26 BDI_pdflush, /* A pdflush thread is working this device */ 28 BDI_pending, /* On its way to being activated */
29 BDI_wb_alloc, /* Default embedded wb allocated */
27 BDI_async_congested, /* The async (write) queue is getting full */ 30 BDI_async_congested, /* The async (write) queue is getting full */
28 BDI_sync_congested, /* The sync queue is getting full */ 31 BDI_sync_congested, /* The sync queue is getting full */
29 BDI_unused, /* Available bits start here */ 32 BDI_unused, /* Available bits start here */
@@ -39,9 +42,22 @@ enum bdi_stat_item {
39 42
40#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) 43#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
41 44
45struct bdi_writeback {
46 struct list_head list; /* hangs off the bdi */
47
48 struct backing_dev_info *bdi; /* our parent bdi */
49 unsigned int nr;
50
51 unsigned long last_old_flush; /* last old data flush */
52
53 struct task_struct *task; /* writeback task */
54 struct list_head b_dirty; /* dirty inodes */
55 struct list_head b_io; /* parked for writeback */
56 struct list_head b_more_io; /* parked for more writeback */
57};
58
42struct backing_dev_info { 59struct backing_dev_info {
43 struct list_head bdi_list; 60 struct list_head bdi_list;
44
45 unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ 61 unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
46 unsigned long state; /* Always use atomic bitops on this */ 62 unsigned long state; /* Always use atomic bitops on this */
47 unsigned int capabilities; /* Device capabilities */ 63 unsigned int capabilities; /* Device capabilities */
@@ -58,11 +74,15 @@ struct backing_dev_info {
58 unsigned int min_ratio; 74 unsigned int min_ratio;
59 unsigned int max_ratio, max_prop_frac; 75 unsigned int max_ratio, max_prop_frac;
60 76
61 struct device *dev; 77 struct bdi_writeback wb; /* default writeback info for this bdi */
78 spinlock_t wb_lock; /* protects update side of wb_list */
79 struct list_head wb_list; /* the flusher threads hanging off this bdi */
80 unsigned long wb_mask; /* bitmask of registered tasks */
81 unsigned int wb_cnt; /* number of registered tasks */
62 82
63 struct list_head b_dirty; /* dirty inodes */ 83 struct list_head work_list;
64 struct list_head b_io; /* parked for writeback */ 84
65 struct list_head b_more_io; /* parked for more writeback */ 85 struct device *dev;
66 86
67#ifdef CONFIG_DEBUG_FS 87#ifdef CONFIG_DEBUG_FS
68 struct dentry *debug_dir; 88 struct dentry *debug_dir;
@@ -77,10 +97,20 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
77 const char *fmt, ...); 97 const char *fmt, ...);
78int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); 98int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
79void bdi_unregister(struct backing_dev_info *bdi); 99void bdi_unregister(struct backing_dev_info *bdi);
100void bdi_start_writeback(struct writeback_control *wbc);
101int bdi_writeback_task(struct bdi_writeback *wb);
102int bdi_has_dirty_io(struct backing_dev_info *bdi);
80 103
81extern struct mutex bdi_lock; 104extern spinlock_t bdi_lock;
82extern struct list_head bdi_list; 105extern struct list_head bdi_list;
83 106
107static inline int wb_has_dirty_io(struct bdi_writeback *wb)
108{
109 return !list_empty(&wb->b_dirty) ||
110 !list_empty(&wb->b_io) ||
111 !list_empty(&wb->b_more_io);
112}
113
84static inline void __add_bdi_stat(struct backing_dev_info *bdi, 114static inline void __add_bdi_stat(struct backing_dev_info *bdi,
85 enum bdi_stat_item item, s64 amount) 115 enum bdi_stat_item item, s64 amount)
86{ 116{
@@ -270,6 +300,11 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
270 return bdi->capabilities & BDI_CAP_SWAP_BACKED; 300 return bdi->capabilities & BDI_CAP_SWAP_BACKED;
271} 301}
272 302
303static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
304{
305 return bdi == &default_backing_dev_info;
306}
307
273static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) 308static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
274{ 309{
275 return bdi_cap_writeback_dirty(mapping->backing_dev_info); 310 return bdi_cap_writeback_dirty(mapping->backing_dev_info);
@@ -285,4 +320,10 @@ static inline bool mapping_cap_swap_backed(struct address_space *mapping)
285 return bdi_cap_swap_backed(mapping->backing_dev_info); 320 return bdi_cap_swap_backed(mapping->backing_dev_info);
286} 321}
287 322
323static inline int bdi_sched_wait(void *word)
324{
325 schedule();
326 return 0;
327}
328
288#endif /* _LINUX_BACKING_DEV_H */ 329#endif /* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 56371be1be65..26da98f61116 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1786,6 +1786,7 @@ extern int get_sb_pseudo(struct file_system_type *, char *,
1786 struct vfsmount *mnt); 1786 struct vfsmount *mnt);
1787extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); 1787extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
1788int __put_super_and_need_restart(struct super_block *sb); 1788int __put_super_and_need_restart(struct super_block *sb);
1789void put_super(struct super_block *sb);
1789 1790
1790/* Alas, no aliases. Too much hassle with bringing module.h everywhere */ 1791/* Alas, no aliases. Too much hassle with bringing module.h everywhere */
1791#define fops_get(fops) \ 1792#define fops_get(fops) \
@@ -2182,7 +2183,6 @@ extern int bdev_read_only(struct block_device *);
2182extern int set_blocksize(struct block_device *, int); 2183extern int set_blocksize(struct block_device *, int);
2183extern int sb_set_blocksize(struct super_block *, int); 2184extern int sb_set_blocksize(struct super_block *, int);
2184extern int sb_min_blocksize(struct super_block *, int); 2185extern int sb_min_blocksize(struct super_block *, int);
2185extern int sb_has_dirty_inodes(struct super_block *);
2186 2186
2187extern int generic_file_mmap(struct file *, struct vm_area_struct *); 2187extern int generic_file_mmap(struct file *, struct vm_area_struct *);
2188extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); 2188extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 07039299603d..cef75527a14c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -40,6 +40,8 @@ enum writeback_sync_modes {
40struct writeback_control { 40struct writeback_control {
41 struct backing_dev_info *bdi; /* If !NULL, only write back this 41 struct backing_dev_info *bdi; /* If !NULL, only write back this
42 queue */ 42 queue */
43 struct super_block *sb; /* if !NULL, only write inodes from
44 this super_block */
43 enum writeback_sync_modes sync_mode; 45 enum writeback_sync_modes sync_mode;
44 unsigned long *older_than_this; /* If !NULL, only write back inodes 46 unsigned long *older_than_this; /* If !NULL, only write back inodes
45 older than this */ 47 older than this */
@@ -76,10 +78,13 @@ struct writeback_control {
76/* 78/*
77 * fs/fs-writeback.c 79 * fs/fs-writeback.c
78 */ 80 */
79void writeback_inodes(struct writeback_control *wbc); 81struct bdi_writeback;
80int inode_wait(void *); 82int inode_wait(void *);
81long writeback_inodes_sb(struct super_block *); 83long writeback_inodes_sb(struct super_block *);
82long sync_inodes_sb(struct super_block *); 84long sync_inodes_sb(struct super_block *);
85void writeback_inodes_wbc(struct writeback_control *wbc);
86long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
87void wakeup_flusher_threads(long nr_pages);
83 88
84/* writeback.h requires fs.h; it, too, is not included from here. */ 89/* writeback.h requires fs.h; it, too, is not included from here. */
85static inline void wait_on_inode(struct inode *inode) 90static inline void wait_on_inode(struct inode *inode)
@@ -99,7 +104,6 @@ static inline void inode_sync_wait(struct inode *inode)
99/* 104/*
100 * mm/page-writeback.c 105 * mm/page-writeback.c
101 */ 106 */
102int wakeup_pdflush(long nr_pages);
103void laptop_io_completion(void); 107void laptop_io_completion(void);
104void laptop_sync_completion(void); 108void laptop_sync_completion(void);
105void throttle_vm_writeout(gfp_t gfp_mask); 109void throttle_vm_writeout(gfp_t gfp_mask);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6f163e0f0509..7f3fa79f25c0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
1 1
2#include <linux/wait.h> 2#include <linux/wait.h>
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/kthread.h>
5#include <linux/freezer.h>
4#include <linux/fs.h> 6#include <linux/fs.h>
5#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/mm.h>
6#include <linux/sched.h> 9#include <linux/sched.h>
7#include <linux/module.h> 10#include <linux/module.h>
8#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -22,8 +25,18 @@ struct backing_dev_info default_backing_dev_info = {
22EXPORT_SYMBOL_GPL(default_backing_dev_info); 25EXPORT_SYMBOL_GPL(default_backing_dev_info);
23 26
24static struct class *bdi_class; 27static struct class *bdi_class;
25DEFINE_MUTEX(bdi_lock); 28DEFINE_SPINLOCK(bdi_lock);
26LIST_HEAD(bdi_list); 29LIST_HEAD(bdi_list);
30LIST_HEAD(bdi_pending_list);
31
32static struct task_struct *sync_supers_tsk;
33static struct timer_list sync_supers_timer;
34
35static int bdi_sync_supers(void *);
36static void sync_supers_timer_fn(unsigned long);
37static void arm_supers_timer(void);
38
39static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
27 40
28#ifdef CONFIG_DEBUG_FS 41#ifdef CONFIG_DEBUG_FS
29#include <linux/debugfs.h> 42#include <linux/debugfs.h>
@@ -187,6 +200,13 @@ static int __init default_bdi_init(void)
187{ 200{
188 int err; 201 int err;
189 202
203 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
204 BUG_ON(IS_ERR(sync_supers_tsk));
205
206 init_timer(&sync_supers_timer);
207 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
208 arm_supers_timer();
209
190 err = bdi_init(&default_backing_dev_info); 210 err = bdi_init(&default_backing_dev_info);
191 if (!err) 211 if (!err)
192 bdi_register(&default_backing_dev_info, NULL, "default"); 212 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -195,6 +215,242 @@ static int __init default_bdi_init(void)
195} 215}
196subsys_initcall(default_bdi_init); 216subsys_initcall(default_bdi_init);
197 217
218static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
219{
220 memset(wb, 0, sizeof(*wb));
221
222 wb->bdi = bdi;
223 wb->last_old_flush = jiffies;
224 INIT_LIST_HEAD(&wb->b_dirty);
225 INIT_LIST_HEAD(&wb->b_io);
226 INIT_LIST_HEAD(&wb->b_more_io);
227}
228
229static void bdi_task_init(struct backing_dev_info *bdi,
230 struct bdi_writeback *wb)
231{
232 struct task_struct *tsk = current;
233
234 spin_lock(&bdi->wb_lock);
235 list_add_tail_rcu(&wb->list, &bdi->wb_list);
236 spin_unlock(&bdi->wb_lock);
237
238 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
239 set_freezable();
240
241 /*
242 * Our parent may run at a different priority, just set us to normal
243 */
244 set_user_nice(tsk, 0);
245}
246
247static int bdi_start_fn(void *ptr)
248{
249 struct bdi_writeback *wb = ptr;
250 struct backing_dev_info *bdi = wb->bdi;
251 int ret;
252
253 /*
254 * Add us to the active bdi_list
255 */
256 spin_lock(&bdi_lock);
257 list_add(&bdi->bdi_list, &bdi_list);
258 spin_unlock(&bdi_lock);
259
260 bdi_task_init(bdi, wb);
261
262 /*
263 * Clear pending bit and wakeup anybody waiting to tear us down
264 */
265 clear_bit(BDI_pending, &bdi->state);
266 smp_mb__after_clear_bit();
267 wake_up_bit(&bdi->state, BDI_pending);
268
269 ret = bdi_writeback_task(wb);
270
271 /*
272 * Remove us from the list
273 */
274 spin_lock(&bdi->wb_lock);
275 list_del_rcu(&wb->list);
276 spin_unlock(&bdi->wb_lock);
277
278 /*
279 * Flush any work that raced with us exiting. No new work
280 * will be added, since this bdi isn't discoverable anymore.
281 */
282 if (!list_empty(&bdi->work_list))
283 wb_do_writeback(wb, 1);
284
285 wb->task = NULL;
286 return ret;
287}
288
289int bdi_has_dirty_io(struct backing_dev_info *bdi)
290{
291 return wb_has_dirty_io(&bdi->wb);
292}
293
294static void bdi_flush_io(struct backing_dev_info *bdi)
295{
296 struct writeback_control wbc = {
297 .bdi = bdi,
298 .sync_mode = WB_SYNC_NONE,
299 .older_than_this = NULL,
300 .range_cyclic = 1,
301 .nr_to_write = 1024,
302 };
303
304 writeback_inodes_wbc(&wbc);
305}
306
307/*
308 * kupdated() used to do this. We cannot do it from the bdi_forker_task()
309 * or we risk deadlocking on ->s_umount. The longer term solution would be
310 * to implement sync_supers_bdi() or similar and simply do it from the
311 * bdi writeback tasks individually.
312 */
313static int bdi_sync_supers(void *unused)
314{
315 set_user_nice(current, 0);
316
317 while (!kthread_should_stop()) {
318 set_current_state(TASK_INTERRUPTIBLE);
319 schedule();
320
321 /*
322 * Do this periodically, like kupdated() did before.
323 */
324 sync_supers();
325 }
326
327 return 0;
328}
329
330static void arm_supers_timer(void)
331{
332 unsigned long next;
333
334 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
335 mod_timer(&sync_supers_timer, round_jiffies_up(next));
336}
337
338static void sync_supers_timer_fn(unsigned long unused)
339{
340 wake_up_process(sync_supers_tsk);
341 arm_supers_timer();
342}
343
344static int bdi_forker_task(void *ptr)
345{
346 struct bdi_writeback *me = ptr;
347
348 bdi_task_init(me->bdi, me);
349
350 for (;;) {
351 struct backing_dev_info *bdi, *tmp;
352 struct bdi_writeback *wb;
353
354 /*
355 * Temporary measure, we want to make sure we don't see
356 * dirty data on the default backing_dev_info
357 */
358 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
359 wb_do_writeback(me, 0);
360
361 spin_lock(&bdi_lock);
362
363 /*
364 * Check if any existing bdi's have dirty data without
365 * a thread registered. If so, set that up.
366 */
367 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
368 if (bdi->wb.task)
369 continue;
370 if (list_empty(&bdi->work_list) &&
371 !bdi_has_dirty_io(bdi))
372 continue;
373
374 bdi_add_default_flusher_task(bdi);
375 }
376
377 set_current_state(TASK_INTERRUPTIBLE);
378
379 if (list_empty(&bdi_pending_list)) {
380 unsigned long wait;
381
382 spin_unlock(&bdi_lock);
383 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
384 schedule_timeout(wait);
385 try_to_freeze();
386 continue;
387 }
388
389 __set_current_state(TASK_RUNNING);
390
391 /*
392 * This is our real job - check for pending entries in
393 * bdi_pending_list, and create the tasks that got added
394 */
395 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
396 bdi_list);
397 list_del_init(&bdi->bdi_list);
398 spin_unlock(&bdi_lock);
399
400 wb = &bdi->wb;
401 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
402 dev_name(bdi->dev));
403 /*
 404	 * If task creation fails, then re-add the bdi to
405 * the pending list and force writeout of the bdi
406 * from this forker thread. That will free some memory
407 * and we can try again.
408 */
409 if (IS_ERR(wb->task)) {
410 wb->task = NULL;
411
412 /*
413 * Add this 'bdi' to the back, so we get
414 * a chance to flush other bdi's to free
415 * memory.
416 */
417 spin_lock(&bdi_lock);
418 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
419 spin_unlock(&bdi_lock);
420
421 bdi_flush_io(bdi);
422 }
423 }
424
425 return 0;
426}
427
428/*
429 * Add the default flusher task that gets created for any bdi
430 * that has dirty data pending writeout
431 */
432static void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
433{
434 if (!bdi_cap_writeback_dirty(bdi))
435 return;
436
437 /*
 438	 * Check with the helper whether to proceed adding a task. This will
 439	 * only abort if two or more simultaneous calls to
 440	 * bdi_add_default_flusher_task() occurred; further additions will
 441	 * block waiting for previous additions to finish.
442 */
443 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
444 list_move_tail(&bdi->bdi_list, &bdi_pending_list);
445
446 /*
447 * We are now on the pending list, wake up bdi_forker_task()
448 * to finish the job and add us back to the active bdi_list
449 */
450 wake_up_process(default_backing_dev_info.wb.task);
451 }
452}
453
198int bdi_register(struct backing_dev_info *bdi, struct device *parent, 454int bdi_register(struct backing_dev_info *bdi, struct device *parent,
199 const char *fmt, ...) 455 const char *fmt, ...)
200{ 456{
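bdi_add_default_flusher_task(), bdi_forker_task() and bdi_start_fn() above cooperate through the BDI_pending bit. Stripped of the list handling, the protocol is the usual bit-wait handshake; the sketch below is a condensed restatement, not the patch's code, and the three function names are illustrative only.

#include <linux/backing-dev.h>
#include <linux/sched.h>
#include <linux/wait.h>

/* Producer side: claim the bdi for thread creation exactly once. */
static void claim_for_forking(struct backing_dev_info *bdi)
{
	if (!test_and_set_bit(BDI_pending, &bdi->state)) {
		/* ...move the bdi to bdi_pending_list under bdi_lock... */
		wake_up_process(default_backing_dev_info.wb.task);
	}
}

/* New flusher thread: publish that setup has finished. */
static void publish_setup_done(struct backing_dev_info *bdi)
{
	clear_bit(BDI_pending, &bdi->state);
	smp_mb__after_clear_bit();	/* order the clear against the waiter's test */
	wake_up_bit(&bdi->state, BDI_pending);
}

/* Teardown side: do not proceed while a setup is still in flight. */
static void wait_for_setup(struct backing_dev_info *bdi)
{
	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
		    TASK_UNINTERRUPTIBLE);
}
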
@@ -213,13 +469,34 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
213 goto exit; 469 goto exit;
214 } 470 }
215 471
216 mutex_lock(&bdi_lock); 472 spin_lock(&bdi_lock);
217 list_add_tail(&bdi->bdi_list, &bdi_list); 473 list_add_tail(&bdi->bdi_list, &bdi_list);
218 mutex_unlock(&bdi_lock); 474 spin_unlock(&bdi_lock);
219 475
220 bdi->dev = dev; 476 bdi->dev = dev;
221 bdi_debug_register(bdi, dev_name(dev));
222 477
478 /*
479 * Just start the forker thread for our default backing_dev_info,
480 * and add other bdi's to the list. They will get a thread created
481 * on-demand when they need it.
482 */
483 if (bdi_cap_flush_forker(bdi)) {
484 struct bdi_writeback *wb = &bdi->wb;
485
486 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
487 dev_name(dev));
488 if (IS_ERR(wb->task)) {
489 wb->task = NULL;
490 ret = -ENOMEM;
491
492 spin_lock(&bdi_lock);
493 list_del(&bdi->bdi_list);
494 spin_unlock(&bdi_lock);
495 goto exit;
496 }
497 }
498
499 bdi_debug_register(bdi, dev_name(dev));
223exit: 500exit:
224 return ret; 501 return ret;
225} 502}
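Only the bdi that advertises the flush-forker capability, i.e. default_backing_dev_info, gets the "bdi-%s" forker thread; every other bdi waits for on-demand creation. bdi_cap_flush_forker() is not visible in this hunk; it presumably lands in include/linux/backing-dev.h as a capability test along these lines, where both the flag value and the exact form are assumptions.

/* Presumed addition to include/linux/backing-dev.h (flag value assumed): */
#define BDI_CAP_FLUSH_FORKER	0x00000010

static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
{
	return bdi->capabilities & BDI_CAP_FLUSH_FORKER;
}
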
@@ -231,17 +508,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
231} 508}
232EXPORT_SYMBOL(bdi_register_dev); 509EXPORT_SYMBOL(bdi_register_dev);
233 510
234static void bdi_remove_from_list(struct backing_dev_info *bdi) 511/*
512 * Remove bdi from the global list and shutdown any threads we have running
513 */
514static void bdi_wb_shutdown(struct backing_dev_info *bdi)
235{ 515{
236 mutex_lock(&bdi_lock); 516 struct bdi_writeback *wb;
517
518 if (!bdi_cap_writeback_dirty(bdi))
519 return;
520
521 /*
522 * If setup is pending, wait for that to complete first
523 */
524 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
525 TASK_UNINTERRUPTIBLE);
526
527 /*
528 * Make sure nobody finds us on the bdi_list anymore
529 */
530 spin_lock(&bdi_lock);
237 list_del(&bdi->bdi_list); 531 list_del(&bdi->bdi_list);
238 mutex_unlock(&bdi_lock); 532 spin_unlock(&bdi_lock);
533
534 /*
 535	 * Finally, kill the kernel threads. We don't need to be RCU
 536	 * safe anymore, since the bdi is no longer visible.
537 */
538 list_for_each_entry(wb, &bdi->wb_list, list)
539 kthread_stop(wb->task);
239} 540}
240 541
241void bdi_unregister(struct backing_dev_info *bdi) 542void bdi_unregister(struct backing_dev_info *bdi)
242{ 543{
243 if (bdi->dev) { 544 if (bdi->dev) {
244 bdi_remove_from_list(bdi); 545 if (!bdi_cap_flush_forker(bdi))
546 bdi_wb_shutdown(bdi);
245 bdi_debug_unregister(bdi); 547 bdi_debug_unregister(bdi);
246 device_unregister(bdi->dev); 548 device_unregister(bdi->dev);
247 bdi->dev = NULL; 549 bdi->dev = NULL;
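The wait_on_bit() call in bdi_wb_shutdown() above sleeps until BDI_pending is cleared; its third argument is the action run each time the waiter finds the bit still set. bdi_sched_wait() is defined in the header part of this patch rather than here; presumably it is nothing more than the sketch below (the exact location is an assumption).

/* Presumed helper: just sleep until wake_up_bit() wakes us. */
static inline int bdi_sched_wait(void *word)
{
	schedule();
	return 0;	/* 0 = keep waiting; non-zero would abort the wait */
}
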
@@ -251,18 +553,25 @@ EXPORT_SYMBOL(bdi_unregister);
251 553
252int bdi_init(struct backing_dev_info *bdi) 554int bdi_init(struct backing_dev_info *bdi)
253{ 555{
254 int i; 556 int i, err;
255 int err;
256 557
257 bdi->dev = NULL; 558 bdi->dev = NULL;
258 559
259 bdi->min_ratio = 0; 560 bdi->min_ratio = 0;
260 bdi->max_ratio = 100; 561 bdi->max_ratio = 100;
261 bdi->max_prop_frac = PROP_FRAC_BASE; 562 bdi->max_prop_frac = PROP_FRAC_BASE;
563 spin_lock_init(&bdi->wb_lock);
262 INIT_LIST_HEAD(&bdi->bdi_list); 564 INIT_LIST_HEAD(&bdi->bdi_list);
263 INIT_LIST_HEAD(&bdi->b_io); 565 INIT_LIST_HEAD(&bdi->wb_list);
264 INIT_LIST_HEAD(&bdi->b_dirty); 566 INIT_LIST_HEAD(&bdi->work_list);
265 INIT_LIST_HEAD(&bdi->b_more_io); 567
568 bdi_wb_init(&bdi->wb, bdi);
569
570 /*
 571	 * Only one thread is supported for now; hard code the mask and count
572 */
573 bdi->wb_mask = 1;
574 bdi->wb_cnt = 1;
266 575
267 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 576 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
268 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 577 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -277,8 +586,6 @@ int bdi_init(struct backing_dev_info *bdi)
277err: 586err:
278 while (i--) 587 while (i--)
279 percpu_counter_destroy(&bdi->bdi_stat[i]); 588 percpu_counter_destroy(&bdi->bdi_stat[i]);
280
281 bdi_remove_from_list(bdi);
282 } 589 }
283 590
284 return err; 591 return err;
@@ -289,9 +596,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
289{ 596{
290 int i; 597 int i;
291 598
292 WARN_ON(!list_empty(&bdi->b_dirty)); 599 WARN_ON(bdi_has_dirty_io(bdi));
293 WARN_ON(!list_empty(&bdi->b_io));
294 WARN_ON(!list_empty(&bdi->b_more_io));
295 600
296 bdi_unregister(bdi); 601 bdi_unregister(bdi);
297 602
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f8341b6019bf..25e7770309b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37 37
38/* 38/*
39 * The maximum number of pages to writeout in a single bdflush/kupdate
40 * operation. We do this so we don't hold I_SYNC against an inode for
41 * enormous amounts of time, which would block a userspace task which has
42 * been forced to throttle against that inode. Also, the code reevaluates
43 * the dirty each time it has written this many pages.
44 */
45#define MAX_WRITEBACK_PAGES 1024
46
47/*
48 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
49 * will look to see if it needs to force writeback or throttling. 40 * will look to see if it needs to force writeback or throttling.
50 */ 41 */
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
117/* End of sysctl-exported parameters */ 108/* End of sysctl-exported parameters */
118 109
119 110
120static void background_writeout(unsigned long _min_pages);
121
122/* 111/*
123 * Scale the writeback cache size proportional to the relative writeout speeds. 112 * Scale the writeback cache size proportional to the relative writeout speeds.
124 * 113 *
@@ -326,7 +315,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
326{ 315{
327 int ret = 0; 316 int ret = 0;
328 317
329 mutex_lock(&bdi_lock); 318 spin_lock(&bdi_lock);
330 if (min_ratio > bdi->max_ratio) { 319 if (min_ratio > bdi->max_ratio) {
331 ret = -EINVAL; 320 ret = -EINVAL;
332 } else { 321 } else {
@@ -338,7 +327,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
338 ret = -EINVAL; 327 ret = -EINVAL;
339 } 328 }
340 } 329 }
341 mutex_unlock(&bdi_lock); 330 spin_unlock(&bdi_lock);
342 331
343 return ret; 332 return ret;
344} 333}
@@ -350,14 +339,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
350 if (max_ratio > 100) 339 if (max_ratio > 100)
351 return -EINVAL; 340 return -EINVAL;
352 341
353 mutex_lock(&bdi_lock); 342 spin_lock(&bdi_lock);
354 if (bdi->min_ratio > max_ratio) { 343 if (bdi->min_ratio > max_ratio) {
355 ret = -EINVAL; 344 ret = -EINVAL;
356 } else { 345 } else {
357 bdi->max_ratio = max_ratio; 346 bdi->max_ratio = max_ratio;
358 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 347 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
359 } 348 }
360 mutex_unlock(&bdi_lock); 349 spin_unlock(&bdi_lock);
361 350
362 return ret; 351 return ret;
363} 352}
@@ -543,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
543 * up. 532 * up.
544 */ 533 */
545 if (bdi_nr_reclaimable > bdi_thresh) { 534 if (bdi_nr_reclaimable > bdi_thresh) {
546 writeback_inodes(&wbc); 535 writeback_inodes_wbc(&wbc);
547 pages_written += write_chunk - wbc.nr_to_write; 536 pages_written += write_chunk - wbc.nr_to_write;
548 get_dirty_limits(&background_thresh, &dirty_thresh, 537 get_dirty_limits(&background_thresh, &dirty_thresh,
549 &bdi_thresh, bdi); 538 &bdi_thresh, bdi);
@@ -572,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
572 if (pages_written >= write_chunk) 561 if (pages_written >= write_chunk)
573 break; /* We've done our duty */ 562 break; /* We've done our duty */
574 563
575 congestion_wait(BLK_RW_ASYNC, HZ/10); 564 schedule_timeout(1);
576 } 565 }
577 566
578 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 567 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -591,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping)
591 * background_thresh, to keep the amount of dirty memory low. 580 * background_thresh, to keep the amount of dirty memory low.
592 */ 581 */
593 if ((laptop_mode && pages_written) || 582 if ((laptop_mode && pages_written) ||
594 (!laptop_mode && (global_page_state(NR_FILE_DIRTY) 583 (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
595 + global_page_state(NR_UNSTABLE_NFS) 584 + global_page_state(NR_UNSTABLE_NFS))
596 > background_thresh))) 585 > background_thresh))) {
597 pdflush_operation(background_writeout, 0); 586 struct writeback_control wbc = {
587 .bdi = bdi,
588 .sync_mode = WB_SYNC_NONE,
589 .nr_to_write = nr_writeback,
590 };
591
592
593 bdi_start_writeback(&wbc);
594 }
598} 595}
599 596
600void set_page_dirty_balance(struct page *page, int page_mkwrite) 597void set_page_dirty_balance(struct page *page, int page_mkwrite)
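balance_dirty_pages() now queues the background pass with bdi_start_writeback(&wbc) instead of pdflush_operation(background_writeout, 0). The prototype is not in this file; it comes from the backing-dev.h/fs-writeback.c side of the patch, presumably as the declaration below.

/* Presumed declaration (include/linux/backing-dev.h in this patch): */
void bdi_start_writeback(struct writeback_control *wbc);
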
@@ -678,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
678 } 675 }
679} 676}
680 677
681/*
682 * writeback at least _min_pages, and keep writing until the amount of dirty
683 * memory is less than the background threshold, or until we're all clean.
684 */
685static void background_writeout(unsigned long _min_pages)
686{
687 long min_pages = _min_pages;
688 struct writeback_control wbc = {
689 .bdi = NULL,
690 .sync_mode = WB_SYNC_NONE,
691 .older_than_this = NULL,
692 .nr_to_write = 0,
693 .nonblocking = 1,
694 .range_cyclic = 1,
695 };
696
697 for ( ; ; ) {
698 unsigned long background_thresh;
699 unsigned long dirty_thresh;
700
701 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
702 if (global_page_state(NR_FILE_DIRTY) +
703 global_page_state(NR_UNSTABLE_NFS) < background_thresh
704 && min_pages <= 0)
705 break;
706 wbc.more_io = 0;
707 wbc.encountered_congestion = 0;
708 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
709 wbc.pages_skipped = 0;
710 writeback_inodes(&wbc);
711 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
712 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
713 /* Wrote less than expected */
714 if (wbc.encountered_congestion || wbc.more_io)
715 congestion_wait(BLK_RW_ASYNC, HZ/10);
716 else
717 break;
718 }
719 }
720}
721
722/*
723 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
724 * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
725 * -1 if all pdflush threads were busy.
726 */
727int wakeup_pdflush(long nr_pages)
728{
729 if (nr_pages == 0)
730 nr_pages = global_page_state(NR_FILE_DIRTY) +
731 global_page_state(NR_UNSTABLE_NFS);
732 return pdflush_operation(background_writeout, nr_pages);
733}
734
735static void wb_timer_fn(unsigned long unused);
736static void laptop_timer_fn(unsigned long unused); 678static void laptop_timer_fn(unsigned long unused);
737 679
738static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
739static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); 680static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
740 681
741/* 682/*
742 * Periodic writeback of "old" data.
743 *
744 * Define "old": the first time one of an inode's pages is dirtied, we mark the
745 * dirtying-time in the inode's address_space. So this periodic writeback code
746 * just walks the superblock inode list, writing back any inodes which are
747 * older than a specific point in time.
748 *
749 * Try to run once per dirty_writeback_interval. But if a writeback event
750 * takes longer than a dirty_writeback_interval interval, then leave a
751 * one-second gap.
752 *
753 * older_than_this takes precedence over nr_to_write. So we'll only write back
754 * all dirty pages if they are all attached to "old" mappings.
755 */
756static void wb_kupdate(unsigned long arg)
757{
758 unsigned long oldest_jif;
759 unsigned long start_jif;
760 unsigned long next_jif;
761 long nr_to_write;
762 struct writeback_control wbc = {
763 .bdi = NULL,
764 .sync_mode = WB_SYNC_NONE,
765 .older_than_this = &oldest_jif,
766 .nr_to_write = 0,
767 .nonblocking = 1,
768 .for_kupdate = 1,
769 .range_cyclic = 1,
770 };
771
772 sync_supers();
773
774 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
775 start_jif = jiffies;
776 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
777 nr_to_write = global_page_state(NR_FILE_DIRTY) +
778 global_page_state(NR_UNSTABLE_NFS) +
779 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
780 while (nr_to_write > 0) {
781 wbc.more_io = 0;
782 wbc.encountered_congestion = 0;
783 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
784 writeback_inodes(&wbc);
785 if (wbc.nr_to_write > 0) {
786 if (wbc.encountered_congestion || wbc.more_io)
787 congestion_wait(BLK_RW_ASYNC, HZ/10);
788 else
789 break; /* All the old data is written */
790 }
791 nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
792 }
793 if (time_before(next_jif, jiffies + HZ))
794 next_jif = jiffies + HZ;
795 if (dirty_writeback_interval)
796 mod_timer(&wb_timer, next_jif);
797}
798
799/*
800 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 683 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
801 */ 684 */
802int dirty_writeback_centisecs_handler(ctl_table *table, int write, 685int dirty_writeback_centisecs_handler(ctl_table *table, int write,
803 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 686 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
804{ 687{
805 proc_dointvec(table, write, file, buffer, length, ppos); 688 proc_dointvec(table, write, file, buffer, length, ppos);
806 if (dirty_writeback_interval)
807 mod_timer(&wb_timer, jiffies +
808 msecs_to_jiffies(dirty_writeback_interval * 10));
809 else
810 del_timer(&wb_timer);
811 return 0; 689 return 0;
812} 690}
813 691
814static void wb_timer_fn(unsigned long unused) 692static void do_laptop_sync(struct work_struct *work)
815{
816 if (pdflush_operation(wb_kupdate, 0) < 0)
817 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
818}
819
820static void laptop_flush(unsigned long unused)
821{ 693{
822 sys_sync(); 694 wakeup_flusher_threads(0);
695 kfree(work);
823} 696}
824 697
825static void laptop_timer_fn(unsigned long unused) 698static void laptop_timer_fn(unsigned long unused)
826{ 699{
827 pdflush_operation(laptop_flush, 0); 700 struct work_struct *work;
701
702 work = kmalloc(sizeof(*work), GFP_ATOMIC);
703 if (work) {
704 INIT_WORK(work, do_laptop_sync);
705 schedule_work(work);
706 }
828} 707}
829 708
830/* 709/*
@@ -907,8 +786,6 @@ void __init page_writeback_init(void)
907{ 786{
908 int shift; 787 int shift;
909 788
910 mod_timer(&wb_timer,
911 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
912 writeback_set_ratelimit(); 789 writeback_set_ratelimit();
913 register_cpu_notifier(&ratelimit_nb); 790 register_cpu_notifier(&ratelimit_nb);
914 791
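With wb_timer, wb_kupdate() and background_writeout() removed, the kupdate-style periodic pass now runs inside each bdi's flusher thread through wb_do_writeback() from the writeback.h hunk above. The real loop lives in the fs/fs-writeback.c portion of the patch; the sketch below is a rough, hypothetical condensation (flusher_thread() is not the patch's function name, and the real work-list handling is omitted).

#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/jiffies.h>

/* Hypothetical condensation of a per-bdi flusher thread's main loop. */
static int flusher_thread(void *ptr)
{
	struct bdi_writeback *wb = ptr;

	while (!kthread_should_stop()) {
		/* the old-data and explicit-work passes pdflush/kupdated used to do */
		wb_do_writeback(wb, 0);

		/* dirty_writeback_interval is in centiseconds, hence the * 10 */
		schedule_timeout_interruptible(
			msecs_to_jiffies(dirty_writeback_interval * 10));
	}
	return 0;
}
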
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 94e86dd6954c..ba8228e0a806 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1720,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1720 */ 1720 */
1721 if (total_scanned > sc->swap_cluster_max + 1721 if (total_scanned > sc->swap_cluster_max +
1722 sc->swap_cluster_max / 2) { 1722 sc->swap_cluster_max / 2) {
1723 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1723 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1724 sc->may_writepage = 1; 1724 sc->may_writepage = 1;
1725 } 1725 }
1726 1726