author	Linus Torvalds <torvalds@linux-foundation.org>	2009-09-11 12:17:05 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-09-11 12:17:05 -0400
commit	a12e4d304ce701844c639541d90df86e165d03f9
tree	6ad7314b63a3303d9aa36f1c7eeb68abf64d3592
parent	89af571ca633ada14d17746519a179553a732d31
parent	500b067c5e6ceea49cf280a02597b1169320e08c
Merge branch 'writeback' of git://git.kernel.dk/linux-2.6-block
* 'writeback' of git://git.kernel.dk/linux-2.6-block:
  writeback: check for registered bdi in flusher add and inode dirty
  writeback: add name to backing_dev_info
  writeback: add some debug inode list counters to bdi stats
  writeback: get rid of pdflush completely
  writeback: switch to per-bdi threads for flushing data
  writeback: move dirty inodes from super_block to backing_dev_info
  writeback: get rid of generic_sync_sb_inodes() export
-rw-r--r--	block/blk-core.c	1
-rw-r--r--	drivers/block/aoe/aoeblk.c	1
-rw-r--r--	drivers/char/mem.c	1
-rw-r--r--	drivers/staging/pohmelfs/inode.c	9
-rw-r--r--	fs/btrfs/disk-io.c	1
-rw-r--r--	fs/buffer.c	2
-rw-r--r--	fs/char_dev.c	1
-rw-r--r--	fs/configfs/inode.c	1
-rw-r--r--	fs/fs-writeback.c	1065
-rw-r--r--	fs/fuse/inode.c	1
-rw-r--r--	fs/hugetlbfs/inode.c	1
-rw-r--r--	fs/nfs/client.c	1
-rw-r--r--	fs/ocfs2/dlm/dlmfs.c	1
-rw-r--r--	fs/ramfs/inode.c	1
-rw-r--r--	fs/super.c	5
-rw-r--r--	fs/sync.c	20
-rw-r--r--	fs/sysfs/inode.c	1
-rw-r--r--	fs/ubifs/budget.c	16
-rw-r--r--	fs/ubifs/super.c	9
-rw-r--r--	include/linux/backing-dev.h	55
-rw-r--r--	include/linux/fs.h	9
-rw-r--r--	include/linux/writeback.h	23
-rw-r--r--	kernel/cgroup.c	1
-rw-r--r--	mm/Makefile	2
-rw-r--r--	mm/backing-dev.c	381
-rw-r--r--	mm/page-writeback.c	182
-rw-r--r--	mm/pdflush.c	269
-rw-r--r--	mm/swap_state.c	1
-rw-r--r--	mm/vmscan.c	2
29 files changed, 1285 insertions(+), 778 deletions(-)
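In API terms, the merge removes generic_sync_sb_inodes() in favour of two exported helpers that take only the superblock; the writeback_control plumbing becomes private to fs/fs-writeback.c. A minimal sketch of a migrated caller, modeled on the fs/ubifs/budget.c hunk below (my_shrink_dirty() is illustrative, not part of the patch):

#include <linux/fs.h>
#include <linux/writeback.h>

static long my_shrink_dirty(struct super_block *sb)
{
	/* WB_SYNC_NONE pass: kick background writeback, don't wait */
	long nr_written = writeback_inodes_sb(sb);

	if (!nr_written) {
		/* WB_SYNC_ALL pass: write out and wait on every dirty inode */
		nr_written = sync_inodes_sb(sb);
	}
	return nr_written;
}

Both helpers return the number of pages submitted or synced, which the ubifs hunk below uses in place of the old nr_to_write bookkeeping.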
diff --git a/block/blk-core.c b/block/blk-core.c
index e3299a77a0d8..e695634882a6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -501,6 +501,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	q->backing_dev_info.state = 0;
 	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
+	q->backing_dev_info.name = "block";
 
 	err = bdi_init(&q->backing_dev_info);
 	if (err) {
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 1e15889c4b98..95d344971eda 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -268,6 +268,7 @@ aoeblk_gdalloc(void *vp)
 	if (!d->blkq)
 		goto err_mempool;
 	blk_queue_make_request(d->blkq, aoeblk_make_request);
+	d->blkq->backing_dev_info.name = "aoe";
 	if (bdi_init(&d->blkq->backing_dev_info))
 		goto err_blkq;
 	spin_lock_irqsave(&d->lock, flags);
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index afa8813e737a..645237bda682 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -822,6 +822,7 @@ static const struct file_operations zero_fops = {
  * - permits private mappings, "copies" are taken of the source of zeros
  */
 static struct backing_dev_info zero_bdi = {
+	.name		= "char/mem",
 	.capabilities	= BDI_CAP_MAP_COPY,
 };
 
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index 7b605795b770..e63c9bea6c54 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -1950,14 +1950,7 @@ static int pohmelfs_get_sb(struct file_system_type *fs_type,
  */
 static void pohmelfs_kill_super(struct super_block *sb)
 {
-	struct writeback_control wbc = {
-		.sync_mode	= WB_SYNC_ALL,
-		.range_start	= 0,
-		.range_end	= LLONG_MAX,
-		.nr_to_write	= LONG_MAX,
-	};
-	generic_sync_sb_inodes(sb, &wbc);
-
+	sync_inodes_sb(sb);
 	kill_anon_super(sb);
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e83be2e4602c..15831d5c7367 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
+	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
 	err = bdi_init(bdi);
 	if (err)
diff --git a/fs/buffer.c b/fs/buffer.c
index 28f320fac4d4..90a98865b0cc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -281,7 +281,7 @@ static void free_more_memory(void)
 	struct zone *zone;
 	int nid;
 
-	wakeup_pdflush(1024);
+	wakeup_flusher_threads(1024);
 	yield();
 
 	for_each_online_node(nid) {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..7c27a8ebef6a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -31,6 +31,7 @@
  * - no readahead or I/O queue unplugging required
  */
 struct backing_dev_info directly_mappable_cdev_bdi = {
+	.name = "char",
 	.capabilities	= (
 #ifdef CONFIG_MMU
 	/* permit private copies of the data to be taken */
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4921e7426d95..a2f746066c5d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = {
 };
 
 static struct backing_dev_info configfs_backing_dev_info = {
+	.name		= "configfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
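Every hunk so far follows the same pattern: give each backing_dev_info a human-readable name before it is initialized, so the new per-bdi flusher threads and the "bdi-%s not registered" sanity check in __mark_inode_dirty() (see the fs/fs-writeback.c diff below) can identify it. A hedged sketch of that pattern for a hypothetical driver; my_bdi, "mydrv" and my_bdi_setup() are illustrative, and whether a particular bdi needs bdi_register() depends on how it is used:

#include <linux/backing-dev.h>

static struct backing_dev_info my_bdi = {
	.name		= "mydrv",			/* shows up in bdi debug output */
	.ra_pages	= 0,				/* no readahead */
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};

static int __init my_bdi_setup(void)
{
	int err = bdi_init(&my_bdi);

	if (err)
		return err;
	/* registration is what makes the bdi visible to the flusher code */
	return bdi_register(&my_bdi, NULL, "mydrv");
}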
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..da86ef58e427 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,171 +19,223 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include "internal.h"
 
+#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
-/**
- * writeback_acquire - attempt to get exclusive writeback access to a device
- * @bdi: the device's backing_dev_info structure
- *
- * It is a waste of resources to have more than one pdflush thread blocked on
- * a single request queue.  Exclusion at the request_queue level is obtained
- * via a flag in the request_queue's backing_dev_info.state.
- *
- * Non-request_queue-backed address_spaces will share default_backing_dev_info,
- * unless they implement their own.  Which is somewhat inefficient, as this
- * may prevent concurrent writeback against multiple devices.
+/*
+ * We don't actually have pdflush, but this one is exported though /proc...
  */
-static int writeback_acquire(struct backing_dev_info *bdi)
+int nr_pdflush_threads;
+
+/*
+ * Work items for the bdi_writeback threads
+ */
+struct bdi_work {
+	struct list_head list;
+	struct list_head wait_list;
+	struct rcu_head rcu_head;
+
+	unsigned long seen;
+	atomic_t pending;
+
+	struct super_block *sb;
+	unsigned long nr_pages;
+	enum writeback_sync_modes sync_mode;
+
+	unsigned long state;
+};
+
+enum {
+	WS_USED_B = 0,
+	WS_ONSTACK_B,
+};
+
+#define WS_USED (1 << WS_USED_B)
+#define WS_ONSTACK (1 << WS_ONSTACK_B)
+
+static inline bool bdi_work_on_stack(struct bdi_work *work)
 {
-	return !test_and_set_bit(BDI_pdflush, &bdi->state);
+	return test_bit(WS_ONSTACK_B, &work->state);
+}
+
+static inline void bdi_work_init(struct bdi_work *work,
+				 struct writeback_control *wbc)
+{
+	INIT_RCU_HEAD(&work->rcu_head);
+	work->sb = wbc->sb;
+	work->nr_pages = wbc->nr_to_write;
+	work->sync_mode = wbc->sync_mode;
+	work->state = WS_USED;
+}
+
+static inline void bdi_work_init_on_stack(struct bdi_work *work,
+					  struct writeback_control *wbc)
+{
+	bdi_work_init(work, wbc);
+	work->state |= WS_ONSTACK;
 }
 
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
  *
- * Determine whether there is writeback in progress against a backing device.
+ * Determine whether there is writeback waiting to be handled against a
+ * backing device.
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_pdflush, &bdi->state);
+	return !list_empty(&bdi->work_list);
 }
 
-/**
- * writeback_release - relinquish exclusive writeback access against a device.
- * @bdi: the device's backing_dev_info structure
- */
-static void writeback_release(struct backing_dev_info *bdi)
+static void bdi_work_clear(struct bdi_work *work)
 {
-	BUG_ON(!writeback_in_progress(bdi));
-	clear_bit(BDI_pdflush, &bdi->state);
+	clear_bit(WS_USED_B, &work->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&work->state, WS_USED_B);
 }
 
-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+static void bdi_work_free(struct rcu_head *head)
 {
-	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
-		struct dentry *dentry;
-		const char *name = "?";
+	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
 
-		dentry = d_find_alias(inode);
-		if (dentry) {
-			spin_lock(&dentry->d_lock);
-			name = (const char *) dentry->d_name.name;
-		}
-		printk(KERN_DEBUG
-		       "%s(%d): dirtied inode %lu (%s) on %s\n",
-		       current->comm, task_pid_nr(current), inode->i_ino,
-		       name, inode->i_sb->s_id);
-		if (dentry) {
-			spin_unlock(&dentry->d_lock);
-			dput(dentry);
-		}
-	}
+	if (!bdi_work_on_stack(work))
+		kfree(work);
+	else
+		bdi_work_clear(work);
 }
 
-/**
- * __mark_inode_dirty - internal function
- * @inode: inode to mark
- * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- * Mark an inode as dirty. Callers should use mark_inode_dirty or
- * mark_inode_dirty_sync.
- *
- * Put the inode on the super block's dirty list.
- *
- * CAREFUL! We mark it dirty unconditionally, but move it onto the
- * dirty list only if it is hashed or if it refers to a blockdev.
- * If it was not hashed, it will never be added to the dirty list
- * even if it is later hashed, as it will have been marked dirty already.
- *
- * In short, make sure you hash any inodes _before_ you start marking
- * them dirty.
- *
- * This function *must* be atomic for the I_DIRTY_PAGES case -
- * set_page_dirty() is called under spinlock in several places.
- *
- * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
- * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
- * the kernel-internal blockdev inode represents the dirtying time of the
- * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
- * page->mapping->host, so the page-dirtying time is recorded in the internal
- * blockdev inode.
- */
-void __mark_inode_dirty(struct inode *inode, int flags)
+static void wb_work_complete(struct bdi_work *work)
 {
-	struct super_block *sb = inode->i_sb;
+	const enum writeback_sync_modes sync_mode = work->sync_mode;
 
 	/*
-	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
-	 * dirty the inode itself
+	 * For allocated work, we can clear the done/seen bit right here.
+	 * For on-stack work, we need to postpone both the clear and free
+	 * to after the RCU grace period, since the stack could be invalidated
+	 * as soon as bdi_work_clear() has done the wakeup.
 	 */
-	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
-		if (sb->s_op->dirty_inode)
-			sb->s_op->dirty_inode(inode);
-	}
+	if (!bdi_work_on_stack(work))
+		bdi_work_clear(work);
+	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
+		call_rcu(&work->rcu_head, bdi_work_free);
+}
 
+static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
+{
 	/*
-	 * make sure that changes are seen by all cpus before we test i_state
-	 * -- mikulas
+	 * The caller has retrieved the work arguments from this work,
+	 * drop our reference. If this is the last ref, delete and free it
 	 */
-	smp_mb();
+	if (atomic_dec_and_test(&work->pending)) {
+		struct backing_dev_info *bdi = wb->bdi;
 
-	/* avoid the locking if we can */
-	if ((inode->i_state & flags) == flags)
-		return;
+		spin_lock(&bdi->wb_lock);
+		list_del_rcu(&work->list);
+		spin_unlock(&bdi->wb_lock);
 
-	if (unlikely(block_dump))
-		block_dump___mark_inode_dirty(inode);
-
-	spin_lock(&inode_lock);
-	if ((inode->i_state & flags) != flags) {
-		const int was_dirty = inode->i_state & I_DIRTY;
+		wb_work_complete(work);
+	}
+}
 
-		inode->i_state |= flags;
+static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
+{
+	if (work) {
+		work->seen = bdi->wb_mask;
+		BUG_ON(!work->seen);
+		atomic_set(&work->pending, bdi->wb_cnt);
+		BUG_ON(!bdi->wb_cnt);
 
 		/*
-		 * If the inode is being synced, just update its dirty state.
-		 * The unlocker will place the inode on the appropriate
-		 * superblock list, based upon its state.
+		 * Make sure stores are seen before it appears on the list
 		 */
-		if (inode->i_state & I_SYNC)
-			goto out;
+		smp_mb();
 
-		/*
-		 * Only add valid (hashed) inodes to the superblock's
-		 * dirty list.  Add blockdev inodes as well.
-		 */
-		if (!S_ISBLK(inode->i_mode)) {
-			if (hlist_unhashed(&inode->i_hash))
-				goto out;
-		}
-		if (inode->i_state & (I_FREEING|I_CLEAR))
-			goto out;
+		spin_lock(&bdi->wb_lock);
+		list_add_tail_rcu(&work->list, &bdi->work_list);
+		spin_unlock(&bdi->wb_lock);
+	}
+
+	/*
+	 * If the default thread isn't there, make sure we add it. When
+	 * it gets created and wakes up, we'll run this work.
+	 */
+	if (unlikely(list_empty_careful(&bdi->wb_list)))
+		wake_up_process(default_backing_dev_info.wb.task);
+	else {
+		struct bdi_writeback *wb = &bdi->wb;
 
 		/*
-		 * If the inode was already on s_dirty/s_io/s_more_io, don't
-		 * reposition it (that would break s_dirty time-ordering).
+		 * If we failed allocating the bdi work item, wake up the wb
+		 * thread always. As a safety precaution, it'll flush out
+		 * everything
 		 */
-		if (!was_dirty) {
-			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &sb->s_dirty);
-		}
+		if (!wb_has_dirty_io(wb)) {
+			if (work)
+				wb_clear_pending(wb, work);
+		} else if (wb->task)
+			wake_up_process(wb->task);
 	}
-out:
-	spin_unlock(&inode_lock);
 }
 
-EXPORT_SYMBOL(__mark_inode_dirty);
+/*
+ * Used for on-stack allocated work items. The caller needs to wait until
+ * the wb threads have acked the work before it's safe to continue.
+ */
+static void bdi_wait_on_work_clear(struct bdi_work *work)
+{
+	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
+		    TASK_UNINTERRUPTIBLE);
+}
 
-static int write_inode(struct inode *inode, int sync)
+static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
 {
-	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
-		return inode->i_sb->s_op->write_inode(inode, sync);
-	return 0;
+	struct bdi_work *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work)
+		bdi_work_init(work, wbc);
+
+	return work;
+}
+
+void bdi_start_writeback(struct writeback_control *wbc)
+{
+	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
+	struct bdi_work work_stack, *work = NULL;
+
+	if (!must_wait)
+		work = bdi_alloc_work(wbc);
+
+	if (!work) {
+		work = &work_stack;
+		bdi_work_init_on_stack(work, wbc);
+	}
+
+	bdi_queue_work(wbc->bdi, work);
+
+	/*
+	 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
+	 * complete. If not, we only need to wait for the work to be started,
+	 * if we allocated it on-stack. We use the same mechanism, if the
+	 * wait bit is set in the bdi_work struct, then threads will not
+	 * clear pending until after they are done.
+	 *
+	 * Note that work == &work_stack if must_wait is true, so we don't
+	 * need to do call_rcu() here ever, since the completion path will
+	 * have done that for us.
+	 */
+	if (must_wait || work == &work_stack) {
+		bdi_wait_on_work_clear(work);
+		if (work != &work_stack)
+			call_rcu(&work->rcu_head, bdi_work_free);
+	}
 }
 
 /*
@@ -191,31 +243,32 @@ static int write_inode(struct inode *inode, int sync)
  * furthest end of its superblock's dirty-inode list.
  *
  * Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the s_dirty list.  If that is
+ * already the most-recently-dirtied inode on the b_dirty list.  If that is
  * the case then the inode must have been redirtied while it was being written
  * out and we don't reset its dirtied_when.
  */
 static void redirty_tail(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
-	if (!list_empty(&sb->s_dirty)) {
-		struct inode *tail_inode;
+	if (!list_empty(&wb->b_dirty)) {
+		struct inode *tail;
 
-		tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
-		if (time_before(inode->dirtied_when,
-				tail_inode->dirtied_when))
+		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &sb->s_dirty);
+	list_move(&inode->i_list, &wb->b_dirty);
 }
 
 /*
- * requeue inode for re-scanning after sb->s_io list is exhausted.
+ * requeue inode for re-scanning after bdi->b_io list is exhausted.
  */
 static void requeue_io(struct inode *inode)
 {
-	list_move(&inode->i_list, &inode->i_sb->s_more_io);
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+
+	list_move(&inode->i_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -262,20 +315,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
  * Queue all expired dirty inodes for io, eldest first.
  */
-static void queue_io(struct super_block *sb,
-				unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-	list_splice_init(&sb->s_more_io, sb->s_io.prev);
-	move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+	list_splice_init(&wb->b_more_io, wb->b_io.prev);
+	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
 
-int sb_has_dirty_inodes(struct super_block *sb)
+static int write_inode(struct inode *inode, int sync)
 {
-	return !list_empty(&sb->s_dirty) ||
-	       !list_empty(&sb->s_io) ||
-	       !list_empty(&sb->s_more_io);
+	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
+		return inode->i_sb->s_op->write_inode(inode, sync);
+	return 0;
 }
-EXPORT_SYMBOL(sb_has_dirty_inodes);
 
 /*
  * Wait for writeback on an inode to complete.
@@ -322,11 +373,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	if (inode->i_state & I_SYNC) {
 		/*
 		 * If this inode is locked for writeback and we are not doing
-		 * writeback-for-data-integrity, move it to s_more_io so that
+		 * writeback-for-data-integrity, move it to b_more_io so that
 		 * writeback can proceed with the other inodes on s_io.
 		 *
 		 * We'll have another go at writing back this inode when we
-		 * completed a full scan of s_io.
+		 * completed a full scan of b_io.
 		 */
 		if (!wait) {
 			requeue_io(inode);
@@ -371,11 +422,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		/*
 		 * We didn't write back all the pages.  nfs_writepages()
 		 * sometimes bales out without doing anything. Redirty
-		 * the inode; Move it from s_io onto s_more_io/s_dirty.
+		 * the inode; Move it from b_io onto b_more_io/b_dirty.
 		 */
 		/*
 		 * akpm: if the caller was the kupdate function we put
-		 * this inode at the head of s_dirty so it gets first
+		 * this inode at the head of b_dirty so it gets first
 		 * consideration.  Otherwise, move it to the tail, for
 		 * the reasons described there.  I'm not really sure
 		 * how much sense this makes.  Presumably I had a good
@@ -385,7 +436,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			if (wbc->for_kupdate) {
 				/*
 				 * For the kupdate function we move the inode
-				 * to s_more_io so it will get more writeout as
+				 * to b_more_io so it will get more writeout as
 				 * soon as the queue becomes uncongested.
 				 */
 				inode->i_state |= I_DIRTY_PAGES;
@@ -434,50 +485,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 }
 
 /*
- * Write out a superblock's list of dirty inodes.  A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdflush thread, then implement pdflush collision avoidance
- * against the entire list.
+ * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * before calling writeback. So make sure that we do pin it, so it doesn't
+ * go away while we are writing inodes from it.
  *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched.  For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * FIXME: this linear search could get expensive with many fileystems.  But
- * how to fix?  We need to go from an address_space to all inodes which share
- * a queue with that address_space.  (Easy: have a global "dirty superblocks"
- * list).
- *
- * The inodes to be written are parked on sb->s_io.  They are moved back onto
- * sb->s_dirty as they are selected for writing.  This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
+ * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
+ * 1 if we failed.
  */
-void generic_sync_sb_inodes(struct super_block *sb,
+static int pin_sb_for_writeback(struct writeback_control *wbc,
+				struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	/*
+	 * Caller must already hold the ref for this
+	 */
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		WARN_ON(!rwsem_is_locked(&sb->s_umount));
+		return 0;
+	}
+
+	spin_lock(&sb_lock);
+	sb->s_count++;
+	if (down_read_trylock(&sb->s_umount)) {
+		if (sb->s_root) {
+			spin_unlock(&sb_lock);
+			return 0;
+		}
+		/*
+		 * umounted, drop rwsem again and fall through to failure
+		 */
+		up_read(&sb->s_umount);
+	}
+
+	sb->s_count--;
+	spin_unlock(&sb_lock);
+	return 1;
+}
+
+static void unpin_sb_for_writeback(struct writeback_control *wbc,
+				   struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		return;
+
+	up_read(&sb->s_umount);
+	put_super(sb);
+}
+
+static void writeback_inodes_wb(struct bdi_writeback *wb,
 				struct writeback_control *wbc)
 {
+	struct super_block *sb = wbc->sb;
+	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 	const unsigned long start = jiffies;	/* livelock avoidance */
-	int sync = wbc->sync_mode == WB_SYNC_ALL;
 
 	spin_lock(&inode_lock);
-	if (!wbc->for_kupdate || list_empty(&sb->s_io))
-		queue_io(sb, wbc->older_than_this);
 
-	while (!list_empty(&sb->s_io)) {
-		struct inode *inode = list_entry(sb->s_io.prev,
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
+
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = list_entry(wb->b_io.prev,
 						struct inode, i_list);
-		struct address_space *mapping = inode->i_mapping;
-		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		long pages_skipped;
 
-		if (!bdi_cap_writeback_dirty(bdi)) {
+		/*
+		 * super block given and doesn't match, skip this inode
+		 */
+		if (sb && sb != inode->i_sb) {
+			redirty_tail(inode);
+			continue;
+		}
+
+		if (!bdi_cap_writeback_dirty(wb->bdi)) {
 			redirty_tail(inode);
-			if (sb_is_blkdev_sb(sb)) {
+			if (is_blkdev_sb) {
 				/*
 				 * Dirty memory-backed blockdev: the ramdisk
 				 * driver does this.  Skip just this inode
@@ -497,21 +582,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
 				continue;
 			}
 
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
 			wbc->encountered_congestion = 1;
-			if (!sb_is_blkdev_sb(sb))
+			if (!is_blkdev_sb)
 				break;		/* Skip a congested fs */
 			requeue_io(inode);
 			continue;	/* Skip a congested blockdev */
 		}
 
-		if (wbc->bdi && bdi != wbc->bdi) {
-			if (!sb_is_blkdev_sb(sb))
-				break;		/* fs has the wrong queue */
-			requeue_io(inode);
-			continue;	/* blockdev has wrong queue */
-		}
-
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
@@ -519,16 +597,16 @@ void generic_sync_sb_inodes(struct super_block *sb,
 		if (inode_dirtied_after(inode, start))
 			break;
 
-		/* Is another pdflush already flushing this queue? */
-		if (current_is_pdflush() && !writeback_acquire(bdi))
-			break;
+		if (pin_sb_for_writeback(wbc, inode)) {
+			requeue_io(inode);
+			continue;
+		}
 
 		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
-		if (current_is_pdflush())
-			writeback_release(bdi);
+		unpin_sb_for_writeback(wbc, inode);
 		if (wbc->pages_skipped != pages_skipped) {
 			/*
 			 * writeback is not making progress due to locked
@@ -544,144 +622,571 @@ void generic_sync_sb_inodes(struct super_block *sb,
 			wbc->more_io = 1;
 			break;
 		}
-		if (!list_empty(&sb->s_more_io))
+		if (!list_empty(&wb->b_more_io))
 			wbc->more_io = 1;
 	}
 
-	if (sync) {
-		struct inode *inode, *old_inode = NULL;
+	spin_unlock(&inode_lock);
+	/* Leave any unwritten inodes on b_io */
+}
+
+void writeback_inodes_wbc(struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = wbc->bdi;
 
+	writeback_inodes_wb(&bdi->wb, wbc);
+}
+
+/*
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation.  We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode.  Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES	1024
+
+static inline bool over_bground_thresh(void)
+{
+	unsigned long background_thresh, dirty_thresh;
+
+	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+
+	return (global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+}
+
+/*
+ * Explicit flushing or periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space. So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Try to run once per dirty_writeback_interval.  But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
+ * one-second gap.
+ *
+ * older_than_this takes precedence over nr_to_write.  So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
+ */
+static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
+			 struct super_block *sb,
+			 enum writeback_sync_modes sync_mode, int for_kupdate)
+{
+	struct writeback_control wbc = {
+		.bdi			= wb->bdi,
+		.sb			= sb,
+		.sync_mode		= sync_mode,
+		.older_than_this	= NULL,
+		.for_kupdate		= for_kupdate,
+		.range_cyclic		= 1,
+	};
+	unsigned long oldest_jif;
+	long wrote = 0;
+
+	if (wbc.for_kupdate) {
+		wbc.older_than_this = &oldest_jif;
+		oldest_jif = jiffies -
+				msecs_to_jiffies(dirty_expire_interval * 10);
+	}
+
+	for (;;) {
 		/*
-		 * Data integrity sync. Must wait for all pages under writeback,
-		 * because there may have been pages dirtied before our sync
-		 * call, but which had writeout started before we write it out.
-		 * In which case, the inode may not be on the dirty list, but
-		 * we still have to wait for that writeout.
+		 * Don't flush anything for non-integrity writeback where
+		 * no nr_pages was given
 		 */
-		list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-			struct address_space *mapping;
+		if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
+			break;
 
-			if (inode->i_state &
-					(I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
-				continue;
-			mapping = inode->i_mapping;
-			if (mapping->nrpages == 0)
+		/*
+		 * If no specific pages were given and this is just a
+		 * periodic background writeout and we are below the
+		 * background dirty threshold, don't do anything
+		 */
+		if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
+			break;
+
+		wbc.more_io = 0;
+		wbc.encountered_congestion = 0;
+		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.pages_skipped = 0;
+		writeback_inodes_wb(wb, &wbc);
+		nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+
+		/*
+		 * If we ran out of stuff to write, bail unless more_io got set
+		 */
+		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
+			if (wbc.more_io && !wbc.for_kupdate)
 				continue;
-			__iget(inode);
-			spin_unlock(&inode_lock);
+			break;
+		}
+	}
+
+	return wrote;
+}
+
+/*
+ * Return the next bdi_work struct that hasn't been processed by this
+ * wb thread yet
+ */
+static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
+					   struct bdi_writeback *wb)
+{
+	struct bdi_work *work, *ret = NULL;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(work, &bdi->work_list, list) {
+		if (!test_and_clear_bit(wb->nr, &work->seen))
+			continue;
+
+		ret = work;
+		break;
+	}
+
+	rcu_read_unlock();
+	return ret;
+}
+
+static long wb_check_old_data_flush(struct bdi_writeback *wb)
+{
+	unsigned long expired;
+	long nr_pages;
+
+	expired = wb->last_old_flush +
+			msecs_to_jiffies(dirty_writeback_interval * 10);
+	if (time_before(jiffies, expired))
+		return 0;
+
+	wb->last_old_flush = jiffies;
+	nr_pages = global_page_state(NR_FILE_DIRTY) +
+			global_page_state(NR_UNSTABLE_NFS) +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	if (nr_pages)
+		return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);
+
+	return 0;
+}
+
+/*
+ * Retrieve work items and do the writeback they describe
+ */
+long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+{
+	struct backing_dev_info *bdi = wb->bdi;
+	struct bdi_work *work;
+	long nr_pages, wrote = 0;
+
+	while ((work = get_next_work_item(bdi, wb)) != NULL) {
+		enum writeback_sync_modes sync_mode;
+
+		nr_pages = work->nr_pages;
+
+		/*
+		 * Override sync mode, in case we must wait for completion
+		 */
+		if (force_wait)
+			work->sync_mode = sync_mode = WB_SYNC_ALL;
+		else
+			sync_mode = work->sync_mode;
+
+		/*
+		 * If this isn't a data integrity operation, just notify
+		 * that we have seen this work and we are now starting it.
+		 */
+		if (sync_mode == WB_SYNC_NONE)
+			wb_clear_pending(wb, work);
+
+		wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);
+
+		/*
+		 * This is a data integrity writeback, so only do the
+		 * notification when we have completed the work.
+		 */
+		if (sync_mode == WB_SYNC_ALL)
+			wb_clear_pending(wb, work);
+	}
+
+	/*
+	 * Check for periodic writeback, kupdated() style
+	 */
+	wrote += wb_check_old_data_flush(wb);
+
+	return wrote;
+}
+
+/*
+ * Handle writeback of dirty data for the device backed by this bdi. Also
+ * wakes up periodically and does kupdated style flushing.
+ */
+int bdi_writeback_task(struct bdi_writeback *wb)
+{
+	unsigned long last_active = jiffies;
+	unsigned long wait_jiffies = -1UL;
+	long pages_written;
+
+	while (!kthread_should_stop()) {
+		pages_written = wb_do_writeback(wb, 0);
+
+		if (pages_written)
+			last_active = jiffies;
+		else if (wait_jiffies != -1UL) {
+			unsigned long max_idle;
 
 			/*
-			 * We hold a reference to 'inode' so it couldn't have
-			 * been removed from s_inodes list while we dropped the
-			 * inode_lock. We cannot iput the inode now as we can
-			 * be holding the last reference and we cannot iput it
-			 * under inode_lock. So we keep the reference and iput
-			 * it later.
+			 * Longest period of inactivity that we tolerate. If we
+			 * see dirty data again later, the task will get
+			 * recreated automatically.
 			 */
-			iput(old_inode);
-			old_inode = inode;
+			max_idle = max(5UL * 60 * HZ, wait_jiffies);
+			if (time_after(jiffies, max_idle + last_active))
+				break;
+		}
 
-			filemap_fdatawait(mapping);
+		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(wait_jiffies);
+		try_to_freeze();
+	}
 
-			cond_resched();
+	return 0;
+}
+
+/*
+ * Schedule writeback for all backing devices. Expensive! If this is a data
+ * integrity operation, writeback will be complete when this returns. If
+ * we are simply called for WB_SYNC_NONE, then writeback will merely be
+ * scheduled to run.
+ */
+static void bdi_writeback_all(struct writeback_control *wbc)
+{
+	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
+	struct backing_dev_info *bdi;
+	struct bdi_work *work;
+	LIST_HEAD(list);
+
+restart:
+	spin_lock(&bdi_lock);
+
+	list_for_each_entry(bdi, &bdi_list, bdi_list) {
+		struct bdi_work *work;
 
-			spin_lock(&inode_lock);
+		if (!bdi_has_dirty_io(bdi))
+			continue;
+
+		/*
+		 * If work allocation fails, do the writes inline. We drop
+		 * the lock and restart the list writeout. This should be OK,
+		 * since this happens rarely and because the writeout should
+		 * eventually make more free memory available.
+		 */
+		work = bdi_alloc_work(wbc);
+		if (!work) {
+			struct writeback_control __wbc;
+
+			/*
+			 * Not a data integrity writeout, just continue
+			 */
+			if (!must_wait)
+				continue;
+
+			spin_unlock(&bdi_lock);
+			__wbc = *wbc;
+			__wbc.bdi = bdi;
+			writeback_inodes_wbc(&__wbc);
+			goto restart;
 		}
-		spin_unlock(&inode_lock);
-		iput(old_inode);
-	} else
-		spin_unlock(&inode_lock);
+		if (must_wait)
+			list_add_tail(&work->wait_list, &list);
+
+		bdi_queue_work(bdi, work);
+	}
+
+	spin_unlock(&bdi_lock);
 
-	return;		/* Leave any unwritten inodes on s_io */
+	/*
+	 * If this is for WB_SYNC_ALL, wait for pending work to complete
+	 * before returning.
+	 */
+	while (!list_empty(&list)) {
+		work = list_entry(list.next, struct bdi_work, wait_list);
+		list_del(&work->wait_list);
+		bdi_wait_on_work_clear(work);
+		call_rcu(&work->rcu_head, bdi_work_free);
+	}
 }
-EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
 
-static void sync_sb_inodes(struct super_block *sb,
-		struct writeback_control *wbc)
+/*
+ * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
+ * the whole world.
+ */
+void wakeup_flusher_threads(long nr_pages)
 {
-	generic_sync_sb_inodes(sb, wbc);
+	struct writeback_control wbc = {
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.range_cyclic	= 1,
+	};
+
+	if (nr_pages == 0)
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	wbc.nr_to_write = nr_pages;
+	bdi_writeback_all(&wbc);
 }
 
-/*
- * Start writeback of dirty pagecache data against all unlocked inodes.
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+		struct dentry *dentry;
+		const char *name = "?";
+
+		dentry = d_find_alias(inode);
+		if (dentry) {
+			spin_lock(&dentry->d_lock);
+			name = (const char *) dentry->d_name.name;
+		}
+		printk(KERN_DEBUG
+		       "%s(%d): dirtied inode %lu (%s) on %s\n",
+		       current->comm, task_pid_nr(current), inode->i_ino,
+		       name, inode->i_sb->s_id);
+		if (dentry) {
+			spin_unlock(&dentry->d_lock);
+			dput(dentry);
+		}
+	}
+}
+
+/**
+ * __mark_inode_dirty - internal function
+ * @inode: inode to mark
+ * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ * Mark an inode as dirty. Callers should use mark_inode_dirty or
+ * mark_inode_dirty_sync.
  *
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
- * empty. Since __sync_single_inode() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
+ * Put the inode on the super block's dirty list.
+ *
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
  *
- * If `older_than_this' is non-zero then only flush inodes which have a
- * flushtime older than *older_than_this.
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
  *
- * If `bdi' is non-zero then we will scan the first inode against each
- * superblock until we find the matching ones. One group will be the dirty
- * inodes against a filesystem. Then when we hit the dummy blockdev superblock,
- * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not
- * super-efficient but we're about to do a ton of I/O...
+ * This function *must* be atomic for the I_DIRTY_PAGES case -
+ * set_page_dirty() is called under spinlock in several places.
+ *
+ * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
+ * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
+ * the kernel-internal blockdev inode represents the dirtying time of the
+ * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
+ * page->mapping->host, so the page-dirtying time is recorded in the internal
+ * blockdev inode.
  */
-void
-writeback_inodes(struct writeback_control *wbc)
+void __mark_inode_dirty(struct inode *inode, int flags)
 {
-	struct super_block *sb;
+	struct super_block *sb = inode->i_sb;
 
-	might_sleep();
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry_reverse(sb, &super_blocks, s_list) {
-		if (sb_has_dirty_inodes(sb)) {
-			/* we're making our own get_super here */
-			sb->s_count++;
-			spin_unlock(&sb_lock);
-			/*
-			 * If we can't get the readlock, there's no sense in
-			 * waiting around, most of the time the FS is going to
-			 * be unmounted by the time it is released.
-			 */
-			if (down_read_trylock(&sb->s_umount)) {
-				if (sb->s_root)
-					sync_sb_inodes(sb, wbc);
-				up_read(&sb->s_umount);
+	/*
+	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
+	 * dirty the inode itself
+	 */
+	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+		if (sb->s_op->dirty_inode)
+			sb->s_op->dirty_inode(inode);
+	}
+
+	/*
+	 * make sure that changes are seen by all cpus before we test i_state
+	 * -- mikulas
+	 */
+	smp_mb();
+
+	/* avoid the locking if we can */
+	if ((inode->i_state & flags) == flags)
+		return;
+
+	if (unlikely(block_dump))
+		block_dump___mark_inode_dirty(inode);
+
+	spin_lock(&inode_lock);
+	if ((inode->i_state & flags) != flags) {
+		const int was_dirty = inode->i_state & I_DIRTY;
+
+		inode->i_state |= flags;
+
+		/*
+		 * If the inode is being synced, just update its dirty state.
+		 * The unlocker will place the inode on the appropriate
+		 * superblock list, based upon its state.
+		 */
+		if (inode->i_state & I_SYNC)
+			goto out;
+
+		/*
+		 * Only add valid (hashed) inodes to the superblock's
+		 * dirty list.  Add blockdev inodes as well.
+		 */
+		if (!S_ISBLK(inode->i_mode)) {
+			if (hlist_unhashed(&inode->i_hash))
+				goto out;
+		}
+		if (inode->i_state & (I_FREEING|I_CLEAR))
+			goto out;
+
+		/*
+		 * If the inode was already on b_dirty/b_io/b_more_io, don't
+		 * reposition it (that would break b_dirty time-ordering).
+		 */
+		if (!was_dirty) {
+			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+			struct backing_dev_info *bdi = wb->bdi;
+
+			if (bdi_cap_writeback_dirty(bdi) &&
+			    !test_bit(BDI_registered, &bdi->state)) {
+				WARN_ON(1);
+				printk(KERN_ERR "bdi-%s not registered\n",
+					bdi->name);
 			}
-			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto restart;
+
+			inode->dirtied_when = jiffies;
+			list_move(&inode->i_list, &wb->b_dirty);
 		}
-		if (wbc->nr_to_write <= 0)
-			break;
 	}
-	spin_unlock(&sb_lock);
+out:
+	spin_unlock(&inode_lock);
 }
+EXPORT_SYMBOL(__mark_inode_dirty);
 
 /*
- * writeback and wait upon the filesystem's dirty inodes.  The caller will
- * do this in two passes - one to write, and one to wait.
+ * Write out a superblock's list of dirty inodes.  A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
  *
- * A finite limit is set on the number of pages which will be written.
- * To prevent infinite livelock of sys_sync().
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * against the entire list.
  *
- * We add in the number of potentially dirty inodes, because each inode write
- * can dirty pagecache in the underlying blockdev.
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched.  For other superblocks,
+ * assume that all inodes are backed by the same queue.
+ *
+ * The inodes to be written are parked on bdi->b_io.  They are moved back onto
+ * bdi->b_dirty as they are selected for writing.  This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
  */
-void sync_inodes_sb(struct super_block *sb, int wait)
+static void wait_sb_inodes(struct writeback_control *wbc)
+{
+	struct inode *inode, *old_inode = NULL;
+
+	/*
+	 * We need to be protected against the filesystem going from
+	 * r/o to r/w or vice versa.
+	 */
+	WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
+
+	spin_lock(&inode_lock);
+
+	/*
+	 * Data integrity sync. Must wait for all pages under writeback,
+	 * because there may have been pages dirtied before our sync
+	 * call, but which had writeout started before we write it out.
+	 * In which case, the inode may not be on the dirty list, but
+	 * we still have to wait for that writeout.
+	 */
+	list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
+		struct address_space *mapping;
+
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		mapping = inode->i_mapping;
+		if (mapping->nrpages == 0)
+			continue;
+		__iget(inode);
+		spin_unlock(&inode_lock);
+		/*
+		 * We hold a reference to 'inode' so it couldn't have
+		 * been removed from s_inodes list while we dropped the
+		 * inode_lock. We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it
+		 * under inode_lock. So we keep the reference and iput
+		 * it later.
+		 */
+		iput(old_inode);
+		old_inode = inode;
+
+		filemap_fdatawait(mapping);
+
+		cond_resched();
+
+		spin_lock(&inode_lock);
+	}
+	spin_unlock(&inode_lock);
+	iput(old_inode);
+}
+
+/**
+ * writeback_inodes_sb	-	writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO. The number of pages submitted is
+ * returned.
+ */
+long writeback_inodes_sb(struct super_block *sb)
 {
 	struct writeback_control wbc = {
-		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_NONE,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
 	};
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	long nr_to_write;
 
-	if (!wait) {
-		unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-		unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-
-		wbc.nr_to_write = nr_dirty + nr_unstable +
+	nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-	} else
-		wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
 
-	sync_sb_inodes(sb, &wbc);
+	wbc.nr_to_write = nr_to_write;
+	bdi_writeback_all(&wbc);
+	return nr_to_write - wbc.nr_to_write;
+}
+EXPORT_SYMBOL(writeback_inodes_sb);
+
+/**
+ * sync_inodes_sb	-	sync sb inode pages
+ * @sb: the superblock
+ *
+ * This function writes and waits on any dirty inode belonging to this
+ * super_block. The number of pages synced is returned.
+ */
+long sync_inodes_sb(struct super_block *sb)
+{
+	struct writeback_control wbc = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_ALL,
+		.range_start	= 0,
+		.range_end	= LLONG_MAX,
+	};
+	long nr_to_write = LONG_MAX; /* doesn't actually matter */
+
+	wbc.nr_to_write = nr_to_write;
+	bdi_writeback_all(&wbc);
+	wait_sb_inodes(&wbc);
+	return nr_to_write - wbc.nr_to_write;
 }
+EXPORT_SYMBOL(sync_inodes_sb);
 
 /**
  * write_inode_now - write an inode to disk
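The fs/fs-writeback.c changes above replace the shared pdflush pool with a bdi_work item queued to each backing device and consumed by that device's flusher thread. A hedged sketch of how a caller might drive that machinery through the bdi_start_writeback() entry point added above; kick_background_writeback() and some_bdi are illustrative, not code from this merge:

static void kick_background_writeback(struct backing_dev_info *some_bdi,
				      long nr_pages)
{
	struct writeback_control wbc = {
		.bdi		= some_bdi,
		.sync_mode	= WB_SYNC_NONE,	/* queue and return, don't wait */
		.nr_to_write	= nr_pages,
		.range_cyclic	= 1,
	};

	/* queues a bdi_work item; the bdi's wb thread picks it up */
	bdi_start_writeback(&wbc);
}

With WB_SYNC_NONE the work item is heap-allocated and completes asynchronously; only WB_SYNC_ALL callers (and the on-stack fallback when allocation fails) block in bdi_wait_on_work_clear(), as the comment in bdi_start_writeback() explains.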
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..4567db6f9430 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -801,6 +801,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 {
 	int err;
 
+	fc->bdi.name = "fuse";
 	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	fc->bdi.unplug_io_fn = default_unplug_io_fn;
 	/* fuse does it's own writeback accounting */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index cb88dac8ccaa..a93b885311d8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
 static const struct inode_operations hugetlbfs_inode_operations;
 
 static struct backing_dev_info hugetlbfs_backing_dev_info = {
+	.name		= "hugetlbfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8d25ccb2d51d..c6be84a161f6 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -879,6 +879,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 		server->rsize = NFS_MAX_FILE_IO_SIZE;
 	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
+	server->backing_dev_info.name = "nfs";
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
 
 	if (server->wsize > max_rpc_payload)
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 1c9efb406a96..02bf17808bdc 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -325,6 +325,7 @@ clear_fields:
 }
 
 static struct backing_dev_info dlmfs_backing_dev_info = {
+	.name		= "ocfs2-dlmfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0ff7566c767c..a7f0110fca4c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops;
 static const struct inode_operations ramfs_dir_inode_operations;
 
 static struct backing_dev_info ramfs_backing_dev_info = {
+	.name		= "ramfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK |
 			  BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
diff --git a/fs/super.c b/fs/super.c
index 2761d3e22ed9..9cda337ddae2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		s = NULL;
 		goto out;
 	}
-	INIT_LIST_HEAD(&s->s_dirty);
-	INIT_LIST_HEAD(&s->s_io);
-	INIT_LIST_HEAD(&s->s_more_io);
 	INIT_LIST_HEAD(&s->s_files);
 	INIT_LIST_HEAD(&s->s_instances);
 	INIT_HLIST_HEAD(&s->s_anon);
@@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb)
  *	Drops a temporary reference, frees superblock if there's no
  *	references left.
  */
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
 {
 	spin_lock(&sb_lock);
 	__put_super(sb);
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba61d86d..103cc7fdd3df 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -19,20 +19,22 @@
19 SYNC_FILE_RANGE_WAIT_AFTER) 19 SYNC_FILE_RANGE_WAIT_AFTER)
20 20
21/* 21/*
22 * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) 22 * Do the filesystem syncing work. For simple filesystems
23 * just dirties buffers with inodes so we have to submit IO for these buffers 23 * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
24 * via __sync_blockdev(). This also speeds up the wait == 1 case since in that 24 * submit IO for these buffers via __sync_blockdev(). This also speeds up the
25 * case write_inode() functions do sync_dirty_buffer() and thus effectively 25 * wait == 1 case since in that case write_inode() functions do
26 * write one block at a time. 26 * sync_dirty_buffer() and thus effectively write one block at a time.
27 */ 27 */
28static int __sync_filesystem(struct super_block *sb, int wait) 28static int __sync_filesystem(struct super_block *sb, int wait)
29{ 29{
30	/* Avoid syncing and cache pruning twice for quota sync */ 30	/* Avoid syncing and cache pruning twice for quota sync */
31 if (!wait) 31 if (!wait) {
32 writeout_quota_sb(sb, -1); 32 writeout_quota_sb(sb, -1);
33 else 33 writeback_inodes_sb(sb);
34 } else {
34 sync_quota_sb(sb, -1); 35 sync_quota_sb(sb, -1);
35 sync_inodes_sb(sb, wait); 36 sync_inodes_sb(sb);
37 }
36 if (sb->s_op->sync_fs) 38 if (sb->s_op->sync_fs)
37 sb->s_op->sync_fs(sb, wait); 39 sb->s_op->sync_fs(sb, wait);
38 return __sync_blockdev(sb->s_bdev, wait); 40 return __sync_blockdev(sb->s_bdev, wait);
@@ -118,7 +120,7 @@ restart:
118 */ 120 */
119SYSCALL_DEFINE0(sync) 121SYSCALL_DEFINE0(sync)
120{ 122{
121 wakeup_pdflush(0); 123 wakeup_flusher_threads(0);
122 sync_filesystems(0); 124 sync_filesystems(0);
123 sync_filesystems(1); 125 sync_filesystems(1);
124 if (unlikely(laptop_mode)) 126 if (unlikely(laptop_mode))
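
A minimal sketch (not part of this patch) of how a filesystem-level sync path can drive the two entry points used in the __sync_filesystem() hunk above. The function example_sync_fs() is hypothetical; writeback_inodes_sb() and sync_inodes_sb() are the helpers this series introduces, with the signatures declared in the include/linux/writeback.h hunk further down:

#include <linux/fs.h>
#include <linux/writeback.h>

/* Hypothetical ->sync_fs-style helper: a non-waiting sync only kicks
 * writeback, a waiting sync writes out and waits on every inode. */
static int example_sync_fs(struct super_block *sb, int wait)
{
	if (!wait)
		writeback_inodes_sb(sb);	/* start writeback, do not wait */
	else
		sync_inodes_sb(sb);		/* write out and wait */
	return 0;
}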
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 2b6a8d9de73d..e28cecf179f5 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -31,6 +31,7 @@ static const struct address_space_operations sysfs_aops = {
31}; 31};
32 32
33static struct backing_dev_info sysfs_backing_dev_info = { 33static struct backing_dev_info sysfs_backing_dev_info = {
34 .name = "sysfs",
34 .ra_pages = 0, /* No readahead */ 35 .ra_pages = 0, /* No readahead */
35 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
36}; 37};
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index eaf6d891d46f..1c8991b0db13 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -65,26 +65,14 @@
65static int shrink_liability(struct ubifs_info *c, int nr_to_write) 65static int shrink_liability(struct ubifs_info *c, int nr_to_write)
66{ 66{
67 int nr_written; 67 int nr_written;
68 struct writeback_control wbc = {
69 .sync_mode = WB_SYNC_NONE,
70 .range_end = LLONG_MAX,
71 .nr_to_write = nr_to_write,
72 };
73
74 generic_sync_sb_inodes(c->vfs_sb, &wbc);
75 nr_written = nr_to_write - wbc.nr_to_write;
76 68
69 nr_written = writeback_inodes_sb(c->vfs_sb);
77 if (!nr_written) { 70 if (!nr_written) {
78 /* 71 /*
79 * Retry, but wait on pages/inodes which are being 72 * Retry, but wait on pages/inodes which are being
80 * written back concurrently (e.g., by the flusher threads). 73 * written back concurrently (e.g., by the flusher threads).
81 */ 74 */
82 memset(&wbc, 0, sizeof(struct writeback_control)); 75 nr_written = sync_inodes_sb(c->vfs_sb);
83 wbc.sync_mode = WB_SYNC_ALL;
84 wbc.range_end = LLONG_MAX;
85 wbc.nr_to_write = nr_to_write;
86 generic_sync_sb_inodes(c->vfs_sb, &wbc);
87 nr_written = nr_to_write - wbc.nr_to_write;
88 } 76 }
89 77
90 dbg_budg("%d pages were written back", nr_written); 78 dbg_budg("%d pages were written back", nr_written);
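
For contrast with the two-call form that shrink_liability() collapses to above, here is the old calling convention condensed into one sketch. Nothing here is new API: generic_sync_sb_inodes() and the writeback_control fields are exactly what this patch deletes, and progress had to be inferred from the nr_to_write delta rather than a return value:

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/writeback.h>

/* Old convention, removed by this series: the caller fills in a
 * writeback_control and infers progress from what is left over in
 * wbc.nr_to_write after generic_sync_sb_inodes() returns. */
static long old_style_shrink(struct super_block *sb, long nr_to_write)
{
	struct writeback_control wbc = {
		.sync_mode   = WB_SYNC_NONE,
		.range_end   = LLONG_MAX,
		.nr_to_write = nr_to_write,
	};

	generic_sync_sb_inodes(sb, &wbc);
	return nr_to_write - wbc.nr_to_write;	/* pages actually written */
}

The replacement helpers return the page count directly, which is why the callers above lose their writeback_control boilerplate entirely.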
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 26d2e0d80465..51763aa8f4de 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -438,12 +438,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
438{ 438{
439 int i, err; 439 int i, err;
440 struct ubifs_info *c = sb->s_fs_info; 440 struct ubifs_info *c = sb->s_fs_info;
441 struct writeback_control wbc = {
442 .sync_mode = WB_SYNC_ALL,
443 .range_start = 0,
444 .range_end = LLONG_MAX,
445 .nr_to_write = LONG_MAX,
446 };
447 441
448 /* 442 /*
449 * Zero @wait is just an advisory thing to help the file system shove 443 * Zero @wait is just an advisory thing to help the file system shove
@@ -462,7 +456,7 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
462 * the user be able to get more accurate results of 'statfs()' after 456 * the user be able to get more accurate results of 'statfs()' after
463 * they synchronize the file system. 457 * they synchronize the file system.
464 */ 458 */
465 generic_sync_sb_inodes(sb, &wbc); 459 sync_inodes_sb(sb);
466 460
467 /* 461 /*
468 * Synchronize write buffers, because 'ubifs_run_commit()' does not 462 * Synchronize write buffers, because 'ubifs_run_commit()' does not
@@ -1971,6 +1965,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1971 * 1965 *
1972 * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 1966 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
1973 */ 1967 */
 1968 c->bdi.name = "ubifs";
1974 c->bdi.capabilities = BDI_CAP_MAP_COPY; 1969 c->bdi.capabilities = BDI_CAP_MAP_COPY;
1975 c->bdi.unplug_io_fn = default_unplug_io_fn; 1970 c->bdi.unplug_io_fn = default_unplug_io_fn;
1976 err = bdi_init(&c->bdi); 1971 err = bdi_init(&c->bdi);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 1d52425a6118..f169bcb90b58 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -13,6 +13,8 @@
13#include <linux/proportions.h> 13#include <linux/proportions.h>
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/sched.h>
17#include <linux/writeback.h>
16#include <asm/atomic.h> 18#include <asm/atomic.h>
17 19
18struct page; 20struct page;
@@ -23,9 +25,11 @@ struct dentry;
23 * Bits in backing_dev_info.state 25 * Bits in backing_dev_info.state
24 */ 26 */
25enum bdi_state { 27enum bdi_state {
26 BDI_pdflush, /* A pdflush thread is working this device */ 28 BDI_pending, /* On its way to being activated */
29 BDI_wb_alloc, /* Default embedded wb allocated */
27 BDI_async_congested, /* The async (write) queue is getting full */ 30 BDI_async_congested, /* The async (write) queue is getting full */
28 BDI_sync_congested, /* The sync queue is getting full */ 31 BDI_sync_congested, /* The sync queue is getting full */
32 BDI_registered, /* bdi_register() was done */
29 BDI_unused, /* Available bits start here */ 33 BDI_unused, /* Available bits start here */
30}; 34};
31 35
@@ -39,7 +43,22 @@ enum bdi_stat_item {
39 43
40#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) 44#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
41 45
46struct bdi_writeback {
47 struct list_head list; /* hangs off the bdi */
48
49 struct backing_dev_info *bdi; /* our parent bdi */
50 unsigned int nr;
51
52 unsigned long last_old_flush; /* last old data flush */
53
54 struct task_struct *task; /* writeback task */
55 struct list_head b_dirty; /* dirty inodes */
56 struct list_head b_io; /* parked for writeback */
57 struct list_head b_more_io; /* parked for more writeback */
58};
59
42struct backing_dev_info { 60struct backing_dev_info {
61 struct list_head bdi_list;
43 unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ 62 unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
44 unsigned long state; /* Always use atomic bitops on this */ 63 unsigned long state; /* Always use atomic bitops on this */
45 unsigned int capabilities; /* Device capabilities */ 64 unsigned int capabilities; /* Device capabilities */
@@ -48,6 +67,8 @@ struct backing_dev_info {
48 void (*unplug_io_fn)(struct backing_dev_info *, struct page *); 67 void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
49 void *unplug_io_data; 68 void *unplug_io_data;
50 69
70 char *name;
71
51 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; 72 struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
52 73
53 struct prop_local_percpu completions; 74 struct prop_local_percpu completions;
@@ -56,6 +77,14 @@ struct backing_dev_info {
56 unsigned int min_ratio; 77 unsigned int min_ratio;
57 unsigned int max_ratio, max_prop_frac; 78 unsigned int max_ratio, max_prop_frac;
58 79
80 struct bdi_writeback wb; /* default writeback info for this bdi */
81 spinlock_t wb_lock; /* protects update side of wb_list */
82 struct list_head wb_list; /* the flusher threads hanging off this bdi */
83 unsigned long wb_mask; /* bitmask of registered tasks */
84 unsigned int wb_cnt; /* number of registered tasks */
85
86 struct list_head work_list;
87
59 struct device *dev; 88 struct device *dev;
60 89
61#ifdef CONFIG_DEBUG_FS 90#ifdef CONFIG_DEBUG_FS
@@ -71,6 +100,19 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
71 const char *fmt, ...); 100 const char *fmt, ...);
72int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); 101int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
73void bdi_unregister(struct backing_dev_info *bdi); 102void bdi_unregister(struct backing_dev_info *bdi);
103void bdi_start_writeback(struct writeback_control *wbc);
104int bdi_writeback_task(struct bdi_writeback *wb);
105int bdi_has_dirty_io(struct backing_dev_info *bdi);
106
107extern spinlock_t bdi_lock;
108extern struct list_head bdi_list;
109
110static inline int wb_has_dirty_io(struct bdi_writeback *wb)
111{
112 return !list_empty(&wb->b_dirty) ||
113 !list_empty(&wb->b_io) ||
114 !list_empty(&wb->b_more_io);
115}
74 116
75static inline void __add_bdi_stat(struct backing_dev_info *bdi, 117static inline void __add_bdi_stat(struct backing_dev_info *bdi,
76 enum bdi_stat_item item, s64 amount) 118 enum bdi_stat_item item, s64 amount)
@@ -261,6 +303,11 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
261 return bdi->capabilities & BDI_CAP_SWAP_BACKED; 303 return bdi->capabilities & BDI_CAP_SWAP_BACKED;
262} 304}
263 305
306static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
307{
308 return bdi == &default_backing_dev_info;
309}
310
264static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) 311static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
265{ 312{
266 return bdi_cap_writeback_dirty(mapping->backing_dev_info); 313 return bdi_cap_writeback_dirty(mapping->backing_dev_info);
@@ -276,4 +323,10 @@ static inline bool mapping_cap_swap_backed(struct address_space *mapping)
276 return bdi_cap_swap_backed(mapping->backing_dev_info); 323 return bdi_cap_swap_backed(mapping->backing_dev_info);
277} 324}
278 325
326static inline int bdi_sched_wait(void *word)
327{
328 schedule();
329 return 0;
330}
331
279#endif /* _LINUX_BACKING_DEV_H */ 332#endif /* _LINUX_BACKING_DEV_H */
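
Several hunks in this patch add a .name to a backing_dev_info before it is initialized and registered. A hypothetical driver-private bdi set up under the new rules might look like the sketch below; the "example" identifiers are invented, while bdi_init(), bdi_register() and bdi_destroy() are the interfaces visible elsewhere in this patch:

#include <linux/backing-dev.h>

static struct backing_dev_info example_bdi = {
	.name		= "example",	/* set up front, as the hunks above do */
	.ra_pages	= 0,		/* no readahead */
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};

static int __init example_init(void)
{
	int err = bdi_init(&example_bdi);

	if (err)
		return err;

	err = bdi_register(&example_bdi, NULL, "example");
	if (err)
		bdi_destroy(&example_bdi);
	return err;
}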
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c1f993515f51..26da98f61116 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -715,7 +715,7 @@ struct posix_acl;
715 715
716struct inode { 716struct inode {
717 struct hlist_node i_hash; 717 struct hlist_node i_hash;
718 struct list_head i_list; 718 struct list_head i_list; /* backing dev IO list */
719 struct list_head i_sb_list; 719 struct list_head i_sb_list;
720 struct list_head i_dentry; 720 struct list_head i_dentry;
721 unsigned long i_ino; 721 unsigned long i_ino;
@@ -1336,9 +1336,6 @@ struct super_block {
1336 struct xattr_handler **s_xattr; 1336 struct xattr_handler **s_xattr;
1337 1337
1338 struct list_head s_inodes; /* all inodes */ 1338 struct list_head s_inodes; /* all inodes */
1339 struct list_head s_dirty; /* dirty inodes */
1340 struct list_head s_io; /* parked for writeback */
1341 struct list_head s_more_io; /* parked for more writeback */
1342 struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ 1339 struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
1343 struct list_head s_files; 1340 struct list_head s_files;
1344 /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ 1341 /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
@@ -1789,6 +1786,7 @@ extern int get_sb_pseudo(struct file_system_type *, char *,
1789 struct vfsmount *mnt); 1786 struct vfsmount *mnt);
1790extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); 1787extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
1791int __put_super_and_need_restart(struct super_block *sb); 1788int __put_super_and_need_restart(struct super_block *sb);
1789void put_super(struct super_block *sb);
1792 1790
1793/* Alas, no aliases. Too much hassle with bringing module.h everywhere */ 1791/* Alas, no aliases. Too much hassle with bringing module.h everywhere */
1794#define fops_get(fops) \ 1792#define fops_get(fops) \
@@ -2071,8 +2069,6 @@ static inline void invalidate_remote_inode(struct inode *inode)
2071extern int invalidate_inode_pages2(struct address_space *mapping); 2069extern int invalidate_inode_pages2(struct address_space *mapping);
2072extern int invalidate_inode_pages2_range(struct address_space *mapping, 2070extern int invalidate_inode_pages2_range(struct address_space *mapping,
2073 pgoff_t start, pgoff_t end); 2071 pgoff_t start, pgoff_t end);
2074extern void generic_sync_sb_inodes(struct super_block *sb,
2075 struct writeback_control *wbc);
2076extern int write_inode_now(struct inode *, int); 2072extern int write_inode_now(struct inode *, int);
2077extern int filemap_fdatawrite(struct address_space *); 2073extern int filemap_fdatawrite(struct address_space *);
2078extern int filemap_flush(struct address_space *); 2074extern int filemap_flush(struct address_space *);
@@ -2187,7 +2183,6 @@ extern int bdev_read_only(struct block_device *);
2187extern int set_blocksize(struct block_device *, int); 2183extern int set_blocksize(struct block_device *, int);
2188extern int sb_set_blocksize(struct super_block *, int); 2184extern int sb_set_blocksize(struct super_block *, int);
2189extern int sb_min_blocksize(struct super_block *, int); 2185extern int sb_min_blocksize(struct super_block *, int);
2190extern int sb_has_dirty_inodes(struct super_block *);
2191 2186
2192extern int generic_file_mmap(struct file *, struct vm_area_struct *); 2187extern int generic_file_mmap(struct file *, struct vm_area_struct *);
2193extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); 2188extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3224820c8514..78b1e4684cc9 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -14,17 +14,6 @@ extern struct list_head inode_in_use;
14extern struct list_head inode_unused; 14extern struct list_head inode_unused;
15 15
16/* 16/*
17 * Yes, writeback.h requires sched.h
18 * No, sched.h is not included from here.
19 */
20static inline int task_is_pdflush(struct task_struct *task)
21{
22 return task->flags & PF_FLUSHER;
23}
24
25#define current_is_pdflush() task_is_pdflush(current)
26
27/*
28 * fs/fs-writeback.c 17 * fs/fs-writeback.c
29 */ 18 */
30enum writeback_sync_modes { 19enum writeback_sync_modes {
@@ -40,6 +29,8 @@ enum writeback_sync_modes {
40struct writeback_control { 29struct writeback_control {
41 struct backing_dev_info *bdi; /* If !NULL, only write back this 30 struct backing_dev_info *bdi; /* If !NULL, only write back this
42 queue */ 31 queue */
32 struct super_block *sb; /* if !NULL, only write inodes from
33 this super_block */
43 enum writeback_sync_modes sync_mode; 34 enum writeback_sync_modes sync_mode;
44 unsigned long *older_than_this; /* If !NULL, only write back inodes 35 unsigned long *older_than_this; /* If !NULL, only write back inodes
45 older than this */ 36 older than this */
@@ -76,9 +67,13 @@ struct writeback_control {
76/* 67/*
77 * fs/fs-writeback.c 68 * fs/fs-writeback.c
78 */ 69 */
79void writeback_inodes(struct writeback_control *wbc); 70struct bdi_writeback;
80int inode_wait(void *); 71int inode_wait(void *);
81void sync_inodes_sb(struct super_block *, int wait); 72long writeback_inodes_sb(struct super_block *);
73long sync_inodes_sb(struct super_block *);
74void writeback_inodes_wbc(struct writeback_control *wbc);
75long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
76void wakeup_flusher_threads(long nr_pages);
82 77
83/* writeback.h requires fs.h; it, too, is not included from here. */ 78/* writeback.h requires fs.h; it, too, is not included from here. */
84static inline void wait_on_inode(struct inode *inode) 79static inline void wait_on_inode(struct inode *inode)
@@ -98,7 +93,6 @@ static inline void inode_sync_wait(struct inode *inode)
98/* 93/*
99 * mm/page-writeback.c 94 * mm/page-writeback.c
100 */ 95 */
101int wakeup_pdflush(long nr_pages);
102void laptop_io_completion(void); 96void laptop_io_completion(void);
103void laptop_sync_completion(void); 97void laptop_sync_completion(void);
104void throttle_vm_writeout(gfp_t gfp_mask); 98void throttle_vm_writeout(gfp_t gfp_mask);
@@ -150,7 +144,6 @@ balance_dirty_pages_ratelimited(struct address_space *mapping)
150typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, 144typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
151 void *data); 145 void *data);
152 146
153int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
154int generic_writepages(struct address_space *mapping, 147int generic_writepages(struct address_space *mapping,
155 struct writeback_control *wbc); 148 struct writeback_control *wbc);
156int write_cache_pages(struct address_space *mapping, 149int write_cache_pages(struct address_space *mapping,
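
The call-site conversion implied by this header change, condensed from the fs/sync.c and mm/vmscan.c hunks: wakeup_pdflush() could return -1 when every pdflush thread was busy, so writeback was silently skipped, whereas the per-bdi flusher threads always exist and the new call has no failure mode at the call site. A sketch:

#include <linux/writeback.h>

static void example_kick_writeback(long nr_pages)
{
	/* Judging by the call sites, nr_pages == 0 keeps the old
	 * wakeup_pdflush() meaning of "write back everything". */
	wakeup_flusher_threads(nr_pages);
}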
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b6eadfe30e7b..c7ece8f027f2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -600,6 +600,7 @@ static struct inode_operations cgroup_dir_inode_operations;
600static struct file_operations proc_cgroupstats_operations; 600static struct file_operations proc_cgroupstats_operations;
601 601
602static struct backing_dev_info cgroup_backing_dev_info = { 602static struct backing_dev_info cgroup_backing_dev_info = {
603 .name = "cgroup",
603 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 604 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
604}; 605};
605 606
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..147a7a7873c4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
8 vmalloc.o 8 vmalloc.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o pdflush.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c86edd244294..d3ca0dac1111 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
1 1
2#include <linux/wait.h> 2#include <linux/wait.h>
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/kthread.h>
5#include <linux/freezer.h>
4#include <linux/fs.h> 6#include <linux/fs.h>
5#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/mm.h>
6#include <linux/sched.h> 9#include <linux/sched.h>
7#include <linux/module.h> 10#include <linux/module.h>
8#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
14EXPORT_SYMBOL(default_unplug_io_fn); 17EXPORT_SYMBOL(default_unplug_io_fn);
15 18
16struct backing_dev_info default_backing_dev_info = { 19struct backing_dev_info default_backing_dev_info = {
20 .name = "default",
17 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 21 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
18 .state = 0, 22 .state = 0,
19 .capabilities = BDI_CAP_MAP_COPY, 23 .capabilities = BDI_CAP_MAP_COPY,
@@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = {
22EXPORT_SYMBOL_GPL(default_backing_dev_info); 26EXPORT_SYMBOL_GPL(default_backing_dev_info);
23 27
24static struct class *bdi_class; 28static struct class *bdi_class;
29DEFINE_SPINLOCK(bdi_lock);
30LIST_HEAD(bdi_list);
31LIST_HEAD(bdi_pending_list);
32
33static struct task_struct *sync_supers_tsk;
34static struct timer_list sync_supers_timer;
35
36static int bdi_sync_supers(void *);
37static void sync_supers_timer_fn(unsigned long);
38static void arm_supers_timer(void);
39
40static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
25 41
26#ifdef CONFIG_DEBUG_FS 42#ifdef CONFIG_DEBUG_FS
27#include <linux/debugfs.h> 43#include <linux/debugfs.h>
@@ -37,9 +53,29 @@ static void bdi_debug_init(void)
37static int bdi_debug_stats_show(struct seq_file *m, void *v) 53static int bdi_debug_stats_show(struct seq_file *m, void *v)
38{ 54{
39 struct backing_dev_info *bdi = m->private; 55 struct backing_dev_info *bdi = m->private;
56 struct bdi_writeback *wb;
40 unsigned long background_thresh; 57 unsigned long background_thresh;
41 unsigned long dirty_thresh; 58 unsigned long dirty_thresh;
42 unsigned long bdi_thresh; 59 unsigned long bdi_thresh;
60 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
61 struct inode *inode;
62
63 /*
64 * inode lock is enough here, the bdi->wb_list is protected by
65 * RCU on the reader side
66 */
67 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
68 spin_lock(&inode_lock);
69 list_for_each_entry(wb, &bdi->wb_list, list) {
70 nr_wb++;
71 list_for_each_entry(inode, &wb->b_dirty, i_list)
72 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_list)
74 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_list)
76 nr_more_io++;
77 }
78 spin_unlock(&inode_lock);
43 79
44 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 80 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
45 81
@@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
49 "BdiReclaimable: %8lu kB\n" 85 "BdiReclaimable: %8lu kB\n"
50 "BdiDirtyThresh: %8lu kB\n" 86 "BdiDirtyThresh: %8lu kB\n"
51 "DirtyThresh: %8lu kB\n" 87 "DirtyThresh: %8lu kB\n"
52 "BackgroundThresh: %8lu kB\n", 88 "BackgroundThresh: %8lu kB\n"
89 "WriteBack threads:%8lu\n"
90 "b_dirty: %8lu\n"
91 "b_io: %8lu\n"
92 "b_more_io: %8lu\n"
93 "bdi_list: %8u\n"
94 "state: %8lx\n"
95 "wb_mask: %8lx\n"
96 "wb_list: %8u\n"
97 "wb_cnt: %8u\n",
53 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 98 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
54 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 99 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
55 K(bdi_thresh), 100 K(bdi_thresh), K(dirty_thresh),
56 K(dirty_thresh), 101 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
57 K(background_thresh)); 102 !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
103 !list_empty(&bdi->wb_list), bdi->wb_cnt);
58#undef K 104#undef K
59 105
60 return 0; 106 return 0;
@@ -185,6 +231,13 @@ static int __init default_bdi_init(void)
185{ 231{
186 int err; 232 int err;
187 233
234 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
235 BUG_ON(IS_ERR(sync_supers_tsk));
236
237 init_timer(&sync_supers_timer);
238 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
239 arm_supers_timer();
240
188 err = bdi_init(&default_backing_dev_info); 241 err = bdi_init(&default_backing_dev_info);
189 if (!err) 242 if (!err)
190 bdi_register(&default_backing_dev_info, NULL, "default"); 243 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +246,248 @@ static int __init default_bdi_init(void)
193} 246}
194subsys_initcall(default_bdi_init); 247subsys_initcall(default_bdi_init);
195 248
249static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
250{
251 memset(wb, 0, sizeof(*wb));
252
253 wb->bdi = bdi;
254 wb->last_old_flush = jiffies;
255 INIT_LIST_HEAD(&wb->b_dirty);
256 INIT_LIST_HEAD(&wb->b_io);
257 INIT_LIST_HEAD(&wb->b_more_io);
258}
259
260static void bdi_task_init(struct backing_dev_info *bdi,
261 struct bdi_writeback *wb)
262{
263 struct task_struct *tsk = current;
264
265 spin_lock(&bdi->wb_lock);
266 list_add_tail_rcu(&wb->list, &bdi->wb_list);
267 spin_unlock(&bdi->wb_lock);
268
269 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
270 set_freezable();
271
272 /*
273 * Our parent may run at a different priority, just set us to normal
274 */
275 set_user_nice(tsk, 0);
276}
277
278static int bdi_start_fn(void *ptr)
279{
280 struct bdi_writeback *wb = ptr;
281 struct backing_dev_info *bdi = wb->bdi;
282 int ret;
283
284 /*
285 * Add us to the active bdi_list
286 */
287 spin_lock(&bdi_lock);
288 list_add(&bdi->bdi_list, &bdi_list);
289 spin_unlock(&bdi_lock);
290
291 bdi_task_init(bdi, wb);
292
293 /*
 294 * Clear the pending bit and wake up anybody waiting to tear us down
295 */
296 clear_bit(BDI_pending, &bdi->state);
297 smp_mb__after_clear_bit();
298 wake_up_bit(&bdi->state, BDI_pending);
299
300 ret = bdi_writeback_task(wb);
301
302 /*
303 * Remove us from the list
304 */
305 spin_lock(&bdi->wb_lock);
306 list_del_rcu(&wb->list);
307 spin_unlock(&bdi->wb_lock);
308
309 /*
310 * Flush any work that raced with us exiting. No new work
311 * will be added, since this bdi isn't discoverable anymore.
312 */
313 if (!list_empty(&bdi->work_list))
314 wb_do_writeback(wb, 1);
315
316 wb->task = NULL;
317 return ret;
318}
319
320int bdi_has_dirty_io(struct backing_dev_info *bdi)
321{
322 return wb_has_dirty_io(&bdi->wb);
323}
324
325static void bdi_flush_io(struct backing_dev_info *bdi)
326{
327 struct writeback_control wbc = {
328 .bdi = bdi,
329 .sync_mode = WB_SYNC_NONE,
330 .older_than_this = NULL,
331 .range_cyclic = 1,
332 .nr_to_write = 1024,
333 };
334
335 writeback_inodes_wbc(&wbc);
336}
337
338/*
339 * kupdated() used to do this. We cannot do it from the bdi_forker_task()
340 * or we risk deadlocking on ->s_umount. The longer term solution would be
341 * to implement sync_supers_bdi() or similar and simply do it from the
342 * bdi writeback tasks individually.
343 */
344static int bdi_sync_supers(void *unused)
345{
346 set_user_nice(current, 0);
347
348 while (!kthread_should_stop()) {
349 set_current_state(TASK_INTERRUPTIBLE);
350 schedule();
351
352 /*
353 * Do this periodically, like kupdated() did before.
354 */
355 sync_supers();
356 }
357
358 return 0;
359}
360
361static void arm_supers_timer(void)
362{
363 unsigned long next;
364
365 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
366 mod_timer(&sync_supers_timer, round_jiffies_up(next));
367}
368
369static void sync_supers_timer_fn(unsigned long unused)
370{
371 wake_up_process(sync_supers_tsk);
372 arm_supers_timer();
373}
374
375static int bdi_forker_task(void *ptr)
376{
377 struct bdi_writeback *me = ptr;
378
379 bdi_task_init(me->bdi, me);
380
381 for (;;) {
382 struct backing_dev_info *bdi, *tmp;
383 struct bdi_writeback *wb;
384
385 /*
386 * Temporary measure, we want to make sure we don't see
387 * dirty data on the default backing_dev_info
388 */
389 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
390 wb_do_writeback(me, 0);
391
392 spin_lock(&bdi_lock);
393
394 /*
395 * Check if any existing bdi's have dirty data without
396 * a thread registered. If so, set that up.
397 */
398 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
399 if (bdi->wb.task)
400 continue;
401 if (list_empty(&bdi->work_list) &&
402 !bdi_has_dirty_io(bdi))
403 continue;
404
405 bdi_add_default_flusher_task(bdi);
406 }
407
408 set_current_state(TASK_INTERRUPTIBLE);
409
410 if (list_empty(&bdi_pending_list)) {
411 unsigned long wait;
412
413 spin_unlock(&bdi_lock);
414 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
415 schedule_timeout(wait);
416 try_to_freeze();
417 continue;
418 }
419
420 __set_current_state(TASK_RUNNING);
421
422 /*
423 * This is our real job - check for pending entries in
424 * bdi_pending_list, and create the tasks that got added
425 */
426 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
427 bdi_list);
428 list_del_init(&bdi->bdi_list);
429 spin_unlock(&bdi_lock);
430
431 wb = &bdi->wb;
432 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
433 dev_name(bdi->dev));
434 /*
 435 * If task creation fails, then re-add the bdi to
436 * the pending list and force writeout of the bdi
437 * from this forker thread. That will free some memory
438 * and we can try again.
439 */
440 if (IS_ERR(wb->task)) {
441 wb->task = NULL;
442
443 /*
444 * Add this 'bdi' to the back, so we get
445 * a chance to flush other bdi's to free
446 * memory.
447 */
448 spin_lock(&bdi_lock);
449 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
450 spin_unlock(&bdi_lock);
451
452 bdi_flush_io(bdi);
453 }
454 }
455
456 return 0;
457}
458
459/*
460 * Add the default flusher task that gets created for any bdi
461 * that has dirty data pending writeout
462 */
 463static void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
464{
465 if (!bdi_cap_writeback_dirty(bdi))
466 return;
467
468 if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
469 printk(KERN_ERR "bdi %p/%s is not registered!\n",
470 bdi, bdi->name);
471 return;
472 }
473
474 /*
 475 * Check whether we should proceed with adding a task. We only bail
 476 * out if two or more simultaneous calls to
 477 * bdi_add_default_flusher_task() occurred; further additions will block
 478 * waiting for previous additions to finish.
479 */
480 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
481 list_move_tail(&bdi->bdi_list, &bdi_pending_list);
482
483 /*
 484 * We are now on the pending list; wake up bdi_forker_task()
485 * to finish the job and add us back to the active bdi_list
486 */
487 wake_up_process(default_backing_dev_info.wb.task);
488 }
489}
490
196int bdi_register(struct backing_dev_info *bdi, struct device *parent, 491int bdi_register(struct backing_dev_info *bdi, struct device *parent,
197 const char *fmt, ...) 492 const char *fmt, ...)
198{ 493{
@@ -211,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
211 goto exit; 506 goto exit;
212 } 507 }
213 508
509 spin_lock(&bdi_lock);
510 list_add_tail(&bdi->bdi_list, &bdi_list);
511 spin_unlock(&bdi_lock);
512
214 bdi->dev = dev; 513 bdi->dev = dev;
215 bdi_debug_register(bdi, dev_name(dev));
216 514
515 /*
516 * Just start the forker thread for our default backing_dev_info,
517 * and add other bdi's to the list. They will get a thread created
518 * on-demand when they need it.
519 */
520 if (bdi_cap_flush_forker(bdi)) {
521 struct bdi_writeback *wb = &bdi->wb;
522
523 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
524 dev_name(dev));
525 if (IS_ERR(wb->task)) {
526 wb->task = NULL;
527 ret = -ENOMEM;
528
529 spin_lock(&bdi_lock);
530 list_del(&bdi->bdi_list);
531 spin_unlock(&bdi_lock);
532 goto exit;
533 }
534 }
535
536 bdi_debug_register(bdi, dev_name(dev));
537 set_bit(BDI_registered, &bdi->state);
217exit: 538exit:
218 return ret; 539 return ret;
219} 540}
@@ -225,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
225} 546}
226EXPORT_SYMBOL(bdi_register_dev); 547EXPORT_SYMBOL(bdi_register_dev);
227 548
549/*
550 * Remove bdi from the global list and shutdown any threads we have running
551 */
552static void bdi_wb_shutdown(struct backing_dev_info *bdi)
553{
554 struct bdi_writeback *wb;
555
556 if (!bdi_cap_writeback_dirty(bdi))
557 return;
558
559 /*
560 * If setup is pending, wait for that to complete first
561 */
562 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
563 TASK_UNINTERRUPTIBLE);
564
565 /*
566 * Make sure nobody finds us on the bdi_list anymore
567 */
568 spin_lock(&bdi_lock);
569 list_del(&bdi->bdi_list);
570 spin_unlock(&bdi_lock);
571
572 /*
573 * Finally, kill the kernel threads. We don't need to be RCU
574 * safe anymore, since the bdi is gone from visibility.
575 */
576 list_for_each_entry(wb, &bdi->wb_list, list)
577 kthread_stop(wb->task);
578}
579
228void bdi_unregister(struct backing_dev_info *bdi) 580void bdi_unregister(struct backing_dev_info *bdi)
229{ 581{
230 if (bdi->dev) { 582 if (bdi->dev) {
583 if (!bdi_cap_flush_forker(bdi))
584 bdi_wb_shutdown(bdi);
231 bdi_debug_unregister(bdi); 585 bdi_debug_unregister(bdi);
232 device_unregister(bdi->dev); 586 device_unregister(bdi->dev);
233 bdi->dev = NULL; 587 bdi->dev = NULL;
@@ -237,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister);
237 591
238int bdi_init(struct backing_dev_info *bdi) 592int bdi_init(struct backing_dev_info *bdi)
239{ 593{
240 int i; 594 int i, err;
241 int err;
242 595
243 bdi->dev = NULL; 596 bdi->dev = NULL;
244 597
245 bdi->min_ratio = 0; 598 bdi->min_ratio = 0;
246 bdi->max_ratio = 100; 599 bdi->max_ratio = 100;
247 bdi->max_prop_frac = PROP_FRAC_BASE; 600 bdi->max_prop_frac = PROP_FRAC_BASE;
601 spin_lock_init(&bdi->wb_lock);
602 INIT_LIST_HEAD(&bdi->bdi_list);
603 INIT_LIST_HEAD(&bdi->wb_list);
604 INIT_LIST_HEAD(&bdi->work_list);
605
606 bdi_wb_init(&bdi->wb, bdi);
607
608 /*
609 * Just one thread support for now, hard code mask and count
610 */
611 bdi->wb_mask = 1;
612 bdi->wb_cnt = 1;
248 613
249 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 614 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
250 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 615 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -269,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
269{ 634{
270 int i; 635 int i;
271 636
637 WARN_ON(bdi_has_dirty_io(bdi));
638
272 bdi_unregister(bdi); 639 bdi_unregister(bdi);
273 640
274 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 641 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
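
The forker machinery above synchronizes through the BDI_pending bit. Condensed to its three parties, the handshake looks like this; it is a sketch only, using the same primitives as the hunks above, and is not buildable outside a kernel tree:

#include <linux/backing-dev.h>
#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/wait.h>

/* Party 1: whoever notices dirty data without a flusher thread. Only
 * the first caller wins the bit and wakes the forker; concurrent
 * callers back off, as in bdi_add_default_flusher_task() above. */
static void example_request_flusher(struct backing_dev_info *bdi)
{
	if (!test_and_set_bit(BDI_pending, &bdi->state))
		wake_up_process(default_backing_dev_info.wb.task);
}

/* Party 2: the freshly forked flusher thread. Clearing the bit must be
 * ordered before the wakeup so that waiters observe it, hence the
 * barrier, as in bdi_start_fn() above. */
static void example_flusher_ready(struct backing_dev_info *bdi)
{
	clear_bit(BDI_pending, &bdi->state);
	smp_mb__after_clear_bit();
	wake_up_bit(&bdi->state, BDI_pending);
}

/* Party 3: teardown. bdi_wb_shutdown() must not race a setup still in
 * flight, so it sleeps until the bit clears. */
static void example_wait_for_setup(struct backing_dev_info *bdi)
{
	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
		    TASK_UNINTERRUPTIBLE);
}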
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..25e7770309b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37 37
38/* 38/*
39 * The maximum number of pages to writeout in a single bdflush/kupdate
40 * operation. We do this so we don't hold I_SYNC against an inode for
41 * enormous amounts of time, which would block a userspace task which has
42 * been forced to throttle against that inode. Also, the code reevaluates
43 * the dirty each time it has written this many pages.
44 */
45#define MAX_WRITEBACK_PAGES 1024
46
47/*
48 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
49 * will look to see if it needs to force writeback or throttling. 40 * will look to see if it needs to force writeback or throttling.
50 */ 41 */
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
117/* End of sysctl-exported parameters */ 108/* End of sysctl-exported parameters */
118 109
119 110
120static void background_writeout(unsigned long _min_pages);
121
122/* 111/*
123 * Scale the writeback cache size proportional to the relative writeout speeds. 112 * Scale the writeback cache size proportional to the relative writeout speeds.
124 * 113 *
@@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
320/* 309/*
321 * 310 *
322 */ 311 */
323static DEFINE_SPINLOCK(bdi_lock);
324static unsigned int bdi_min_ratio; 312static unsigned int bdi_min_ratio;
325 313
326int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) 314int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
327{ 315{
328 int ret = 0; 316 int ret = 0;
329 unsigned long flags;
330 317
331 spin_lock_irqsave(&bdi_lock, flags); 318 spin_lock(&bdi_lock);
332 if (min_ratio > bdi->max_ratio) { 319 if (min_ratio > bdi->max_ratio) {
333 ret = -EINVAL; 320 ret = -EINVAL;
334 } else { 321 } else {
@@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
340 ret = -EINVAL; 327 ret = -EINVAL;
341 } 328 }
342 } 329 }
343 spin_unlock_irqrestore(&bdi_lock, flags); 330 spin_unlock(&bdi_lock);
344 331
345 return ret; 332 return ret;
346} 333}
347 334
348int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) 335int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
349{ 336{
350 unsigned long flags;
351 int ret = 0; 337 int ret = 0;
352 338
353 if (max_ratio > 100) 339 if (max_ratio > 100)
354 return -EINVAL; 340 return -EINVAL;
355 341
356 spin_lock_irqsave(&bdi_lock, flags); 342 spin_lock(&bdi_lock);
357 if (bdi->min_ratio > max_ratio) { 343 if (bdi->min_ratio > max_ratio) {
358 ret = -EINVAL; 344 ret = -EINVAL;
359 } else { 345 } else {
360 bdi->max_ratio = max_ratio; 346 bdi->max_ratio = max_ratio;
361 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 347 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
362 } 348 }
363 spin_unlock_irqrestore(&bdi_lock, flags); 349 spin_unlock(&bdi_lock);
364 350
365 return ret; 351 return ret;
366} 352}
@@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
546 * up. 532 * up.
547 */ 533 */
548 if (bdi_nr_reclaimable > bdi_thresh) { 534 if (bdi_nr_reclaimable > bdi_thresh) {
549 writeback_inodes(&wbc); 535 writeback_inodes_wbc(&wbc);
550 pages_written += write_chunk - wbc.nr_to_write; 536 pages_written += write_chunk - wbc.nr_to_write;
551 get_dirty_limits(&background_thresh, &dirty_thresh, 537 get_dirty_limits(&background_thresh, &dirty_thresh,
552 &bdi_thresh, bdi); 538 &bdi_thresh, bdi);
@@ -575,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
575 if (pages_written >= write_chunk) 561 if (pages_written >= write_chunk)
576 break; /* We've done our duty */ 562 break; /* We've done our duty */
577 563
578 congestion_wait(BLK_RW_ASYNC, HZ/10); 564 schedule_timeout(1);
579 } 565 }
580 566
581 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 567 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -594,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping)
594 * background_thresh, to keep the amount of dirty memory low. 580 * background_thresh, to keep the amount of dirty memory low.
595 */ 581 */
596 if ((laptop_mode && pages_written) || 582 if ((laptop_mode && pages_written) ||
597 (!laptop_mode && (global_page_state(NR_FILE_DIRTY) 583 (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
598 + global_page_state(NR_UNSTABLE_NFS) 584 + global_page_state(NR_UNSTABLE_NFS))
599 > background_thresh))) 585 > background_thresh))) {
600 pdflush_operation(background_writeout, 0); 586 struct writeback_control wbc = {
587 .bdi = bdi,
588 .sync_mode = WB_SYNC_NONE,
589 .nr_to_write = nr_writeback,
590 };
591
592
593 bdi_start_writeback(&wbc);
594 }
601} 595}
602 596
603void set_page_dirty_balance(struct page *page, int page_mkwrite) 597void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -681,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
681 } 675 }
682} 676}
683 677
684/*
685 * writeback at least _min_pages, and keep writing until the amount of dirty
686 * memory is less than the background threshold, or until we're all clean.
687 */
688static void background_writeout(unsigned long _min_pages)
689{
690 long min_pages = _min_pages;
691 struct writeback_control wbc = {
692 .bdi = NULL,
693 .sync_mode = WB_SYNC_NONE,
694 .older_than_this = NULL,
695 .nr_to_write = 0,
696 .nonblocking = 1,
697 .range_cyclic = 1,
698 };
699
700 for ( ; ; ) {
701 unsigned long background_thresh;
702 unsigned long dirty_thresh;
703
704 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
705 if (global_page_state(NR_FILE_DIRTY) +
706 global_page_state(NR_UNSTABLE_NFS) < background_thresh
707 && min_pages <= 0)
708 break;
709 wbc.more_io = 0;
710 wbc.encountered_congestion = 0;
711 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
712 wbc.pages_skipped = 0;
713 writeback_inodes(&wbc);
714 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
715 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
716 /* Wrote less than expected */
717 if (wbc.encountered_congestion || wbc.more_io)
718 congestion_wait(BLK_RW_ASYNC, HZ/10);
719 else
720 break;
721 }
722 }
723}
724
725/*
726 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
727 * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
728 * -1 if all pdflush threads were busy.
729 */
730int wakeup_pdflush(long nr_pages)
731{
732 if (nr_pages == 0)
733 nr_pages = global_page_state(NR_FILE_DIRTY) +
734 global_page_state(NR_UNSTABLE_NFS);
735 return pdflush_operation(background_writeout, nr_pages);
736}
737
738static void wb_timer_fn(unsigned long unused);
739static void laptop_timer_fn(unsigned long unused); 678static void laptop_timer_fn(unsigned long unused);
740 679
741static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
742static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); 680static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
743 681
744/* 682/*
745 * Periodic writeback of "old" data.
746 *
747 * Define "old": the first time one of an inode's pages is dirtied, we mark the
748 * dirtying-time in the inode's address_space. So this periodic writeback code
749 * just walks the superblock inode list, writing back any inodes which are
750 * older than a specific point in time.
751 *
752 * Try to run once per dirty_writeback_interval. But if a writeback event
753 * takes longer than a dirty_writeback_interval interval, then leave a
754 * one-second gap.
755 *
756 * older_than_this takes precedence over nr_to_write. So we'll only write back
757 * all dirty pages if they are all attached to "old" mappings.
758 */
759static void wb_kupdate(unsigned long arg)
760{
761 unsigned long oldest_jif;
762 unsigned long start_jif;
763 unsigned long next_jif;
764 long nr_to_write;
765 struct writeback_control wbc = {
766 .bdi = NULL,
767 .sync_mode = WB_SYNC_NONE,
768 .older_than_this = &oldest_jif,
769 .nr_to_write = 0,
770 .nonblocking = 1,
771 .for_kupdate = 1,
772 .range_cyclic = 1,
773 };
774
775 sync_supers();
776
777 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
778 start_jif = jiffies;
779 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
780 nr_to_write = global_page_state(NR_FILE_DIRTY) +
781 global_page_state(NR_UNSTABLE_NFS) +
782 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
783 while (nr_to_write > 0) {
784 wbc.more_io = 0;
785 wbc.encountered_congestion = 0;
786 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
787 writeback_inodes(&wbc);
788 if (wbc.nr_to_write > 0) {
789 if (wbc.encountered_congestion || wbc.more_io)
790 congestion_wait(BLK_RW_ASYNC, HZ/10);
791 else
792 break; /* All the old data is written */
793 }
794 nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
795 }
796 if (time_before(next_jif, jiffies + HZ))
797 next_jif = jiffies + HZ;
798 if (dirty_writeback_interval)
799 mod_timer(&wb_timer, next_jif);
800}
801
802/*
803 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 683 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
804 */ 684 */
805int dirty_writeback_centisecs_handler(ctl_table *table, int write, 685int dirty_writeback_centisecs_handler(ctl_table *table, int write,
806 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 686 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
807{ 687{
808 proc_dointvec(table, write, file, buffer, length, ppos); 688 proc_dointvec(table, write, file, buffer, length, ppos);
809 if (dirty_writeback_interval)
810 mod_timer(&wb_timer, jiffies +
811 msecs_to_jiffies(dirty_writeback_interval * 10));
812 else
813 del_timer(&wb_timer);
814 return 0; 689 return 0;
815} 690}
816 691
817static void wb_timer_fn(unsigned long unused) 692static void do_laptop_sync(struct work_struct *work)
818{
819 if (pdflush_operation(wb_kupdate, 0) < 0)
820 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
821}
822
823static void laptop_flush(unsigned long unused)
824{ 693{
825 sys_sync(); 694 wakeup_flusher_threads(0);
695 kfree(work);
826} 696}
827 697
828static void laptop_timer_fn(unsigned long unused) 698static void laptop_timer_fn(unsigned long unused)
829{ 699{
830 pdflush_operation(laptop_flush, 0); 700 struct work_struct *work;
701
702 work = kmalloc(sizeof(*work), GFP_ATOMIC);
703 if (work) {
704 INIT_WORK(work, do_laptop_sync);
705 schedule_work(work);
706 }
831} 707}
832 708
833/* 709/*
@@ -910,8 +786,6 @@ void __init page_writeback_init(void)
910{ 786{
911 int shift; 787 int shift;
912 788
913 mod_timer(&wb_timer,
914 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
915 writeback_set_ratelimit(); 789 writeback_set_ratelimit();
916 register_cpu_notifier(&ratelimit_nb); 790 register_cpu_notifier(&ratelimit_nb);
917 791
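
One detail worth pulling out of the laptop-mode hunk above: timer callbacks run in atomic context and may not sleep, so the actual flush is deferred to a workqueue, with the work_struct allocated GFP_ATOMIC and freed by its own handler. The same shape, condensed into a self-contained sketch:

#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/writeback.h>

/* The handler runs in process context, so it may sleep; it also owns
 * (and frees) its heap-allocated work_struct. */
static void example_deferred_sync(struct work_struct *work)
{
	wakeup_flusher_threads(0);	/* the part that may sleep */
	kfree(work);
}

static void example_timer_fn(unsigned long unused)
{
	/* GFP_ATOMIC: timers must not sleep in the allocator either */
	struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

	if (work) {
		INIT_WORK(work, example_deferred_sync);
		schedule_work(work);
	}
}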
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 235ac440c44e..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,269 +0,0 @@
1/*
2 * mm/pdflush.c - worker threads for writing back filesystem data
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 *
6 * 09Apr2002 Andrew Morton
7 * Initial version
8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing
10 * up stack space with nested calls to kernel_thread.
11 */
12
13#include <linux/sched.h>
14#include <linux/list.h>
15#include <linux/signal.h>
16#include <linux/spinlock.h>
17#include <linux/gfp.h>
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/fs.h> /* Needed by writeback.h */
21#include <linux/writeback.h> /* Prototypes pdflush_operation() */
22#include <linux/kthread.h>
23#include <linux/cpuset.h>
24#include <linux/freezer.h>
25
26
27/*
28 * Minimum and maximum number of pdflush instances
29 */
30#define MIN_PDFLUSH_THREADS 2
31#define MAX_PDFLUSH_THREADS 8
32
33static void start_one_pdflush_thread(void);
34
35
36/*
37 * The pdflush threads are worker threads for writing back dirty data.
38 * Ideally, we'd like one thread per active disk spindle. But the disk
39 * topology is very hard to divine at this level. Instead, we take
40 * care in various places to prevent more than one pdflush thread from
41 * performing writeback against a single filesystem. pdflush threads
42 * have the PF_FLUSHER flag set in current->flags to aid in this.
43 */
44
45/*
46 * All the pdflush threads. Protected by pdflush_lock
47 */
48static LIST_HEAD(pdflush_list);
49static DEFINE_SPINLOCK(pdflush_lock);
50
51/*
52 * The count of currently-running pdflush threads. Protected
53 * by pdflush_lock.
54 *
55 * Readable by sysctl, but not writable. Published to userspace at
56 * /proc/sys/vm/nr_pdflush_threads.
57 */
58int nr_pdflush_threads = 0;
59
60/*
61 * The time at which the pdflush thread pool last went empty
62 */
63static unsigned long last_empty_jifs;
64
65/*
66 * The pdflush thread.
67 *
68 * Thread pool management algorithm:
69 *
70 * - The minimum and maximum number of pdflush instances are bound
71 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
72 *
73 * - If there have been no idle pdflush instances for 1 second, create
74 * a new one.
75 *
76 * - If the least-recently-went-to-sleep pdflush thread has been asleep
77 * for more than one second, terminate a thread.
78 */
79
80/*
81 * A structure for passing work to a pdflush thread. Also for passing
82 * state information between pdflush threads. Protected by pdflush_lock.
83 */
84struct pdflush_work {
85 struct task_struct *who; /* The thread */
86 void (*fn)(unsigned long); /* A callback function */
87 unsigned long arg0; /* An argument to the callback */
88 struct list_head list; /* On pdflush_list, when idle */
89 unsigned long when_i_went_to_sleep;
90};
91
92static int __pdflush(struct pdflush_work *my_work)
93{
94 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
95 set_freezable();
96 my_work->fn = NULL;
97 my_work->who = current;
98 INIT_LIST_HEAD(&my_work->list);
99
100 spin_lock_irq(&pdflush_lock);
101 for ( ; ; ) {
102 struct pdflush_work *pdf;
103
104 set_current_state(TASK_INTERRUPTIBLE);
105 list_move(&my_work->list, &pdflush_list);
106 my_work->when_i_went_to_sleep = jiffies;
107 spin_unlock_irq(&pdflush_lock);
108 schedule();
109 try_to_freeze();
110 spin_lock_irq(&pdflush_lock);
111 if (!list_empty(&my_work->list)) {
112 /*
113 * Someone woke us up, but without removing our control
114 * structure from the global list. swsusp will do this
115 * in try_to_freeze()->refrigerator(). Handle it.
116 */
117 my_work->fn = NULL;
118 continue;
119 }
120 if (my_work->fn == NULL) {
121 printk("pdflush: bogus wakeup\n");
122 continue;
123 }
124 spin_unlock_irq(&pdflush_lock);
125
126 (*my_work->fn)(my_work->arg0);
127
128 spin_lock_irq(&pdflush_lock);
129
130 /*
131 * Thread creation: For how long have there been zero
132 * available threads?
133 *
134 * To throttle creation, we reset last_empty_jifs.
135 */
136 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
137 if (list_empty(&pdflush_list)) {
138 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
139 last_empty_jifs = jiffies;
140 nr_pdflush_threads++;
141 spin_unlock_irq(&pdflush_lock);
142 start_one_pdflush_thread();
143 spin_lock_irq(&pdflush_lock);
144 }
145 }
146 }
147
148 my_work->fn = NULL;
149
150 /*
151 * Thread destruction: For how long has the sleepiest
152 * thread slept?
153 */
154 if (list_empty(&pdflush_list))
155 continue;
156 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
157 continue;
158 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
159 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
160 /* Limit exit rate */
161 pdf->when_i_went_to_sleep = jiffies;
162 break; /* exeunt */
163 }
164 }
165 nr_pdflush_threads--;
166 spin_unlock_irq(&pdflush_lock);
167 return 0;
168}
169
170/*
171 * Of course, my_work wants to be just a local in __pdflush(). It is
172 * separated out in this manner to hopefully prevent the compiler from
173 * performing unfortunate optimisations against the auto variables. Because
174 * these are visible to other tasks and CPUs. (No problem has actually
175 * been observed. This is just paranoia).
176 */
177static int pdflush(void *dummy)
178{
179 struct pdflush_work my_work;
180 cpumask_var_t cpus_allowed;
181
182 /*
183 * Since the caller doesn't even check kthread_run() worked, let's not
184 * freak out too much if this fails.
185 */
186 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
187 printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
188 return 0;
189 }
190
191 /*
192 * pdflush can spend a lot of time doing encryption via dm-crypt. We
193 * don't want to do that at keventd's priority.
194 */
195 set_user_nice(current, 0);
196
197 /*
198 * Some configs put our parent kthread in a limited cpuset,
199 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
200 * Our needs are more modest - cut back to our cpusets cpus_allowed.
201 * This is needed as pdflush's are dynamically created and destroyed.
202 * The boottime pdflush's are easily placed w/o these 2 lines.
203 */
204 cpuset_cpus_allowed(current, cpus_allowed);
205 set_cpus_allowed_ptr(current, cpus_allowed);
206 free_cpumask_var(cpus_allowed);
207
208 return __pdflush(&my_work);
209}
210
211/*
212 * Attempt to wake up a pdflush thread, and get it to do some work for you.
213 * Returns zero if it indeed managed to find a worker thread, and passed your
214 * payload to it.
215 */
216int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
217{
218 unsigned long flags;
219 int ret = 0;
220
221 BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
222
223 spin_lock_irqsave(&pdflush_lock, flags);
224 if (list_empty(&pdflush_list)) {
225 ret = -1;
226 } else {
227 struct pdflush_work *pdf;
228
229 pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
230 list_del_init(&pdf->list);
231 if (list_empty(&pdflush_list))
232 last_empty_jifs = jiffies;
233 pdf->fn = fn;
234 pdf->arg0 = arg0;
235 wake_up_process(pdf->who);
236 }
237 spin_unlock_irqrestore(&pdflush_lock, flags);
238
239 return ret;
240}
241
242static void start_one_pdflush_thread(void)
243{
244 struct task_struct *k;
245
246 k = kthread_run(pdflush, NULL, "pdflush");
247 if (unlikely(IS_ERR(k))) {
248 spin_lock_irq(&pdflush_lock);
249 nr_pdflush_threads--;
250 spin_unlock_irq(&pdflush_lock);
251 }
252}
253
254static int __init pdflush_init(void)
255{
256 int i;
257
258 /*
259 * Pre-set nr_pdflush_threads... If we fail to create,
260 * the count will be decremented.
261 */
262 nr_pdflush_threads = MIN_PDFLUSH_THREADS;
263
264 for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
265 start_one_pdflush_thread();
266 return 0;
267}
268
269module_init(pdflush_init);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 42cd38eba79f..5ae6b8b78c80 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
34}; 34};
35 35
36static struct backing_dev_info swap_backing_dev_info = { 36static struct backing_dev_info swap_backing_dev_info = {
37 .name = "swap",
37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 38 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
38 .unplug_io_fn = swap_unplug_io_fn, 39 .unplug_io_fn = swap_unplug_io_fn,
39}; 40};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 94e86dd6954c..ba8228e0a806 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1720,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1720 */ 1720 */
1721 if (total_scanned > sc->swap_cluster_max + 1721 if (total_scanned > sc->swap_cluster_max +
1722 sc->swap_cluster_max / 2) { 1722 sc->swap_cluster_max / 2) {
1723 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1723 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1724 sc->may_writepage = 1; 1724 sc->may_writepage = 1;
1725 } 1725 }
1726 1726