author		Ken Chen <kenchen@google.com>			2007-10-17 02:30:38 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-17 11:43:02 -0400
commit		0e0f4fc22ece8e593167eccbb1a4154565c11faa
tree		564ab2eabb31ab945c334706662854bb227f45e9
parent		670e4def6ef5f44315d62748134e535b479c784f
writeback: fix periodic superblock dirty inode flushing
The current -mm tree has a bucketful of bug fixes in the periodic writeback path.
However, we still hit a glitch where dirty pages on a given inode aren't
completely flushed to disk, and the system accumulates a large amount of
dirty pages beyond what dirty_expire_interval is designed to allow.
The problem is that __sync_single_inode() moves an inode to the sb->s_dirty list
even when that inode still has pending dirty pages.  If there is another inode
with only a small number of dirty pages, we hit a case where the loop in
wb_kupdate() terminates prematurely because wbc.nr_to_write is still > 0.
The inode with the large amount of dirty pages is left behind and has to wait
another dirty_writeback_interval before we flush it again, so we effectively
write out only MAX_WRITEBACK_PAGES per dirty_writeback_interval.  If the rate
of dirtying is sufficiently high, the system starts accumulating a large
number of dirty pages.
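For context, a simplified paraphrase of the wb_kupdate() flush loop from kernels
of this era (not the verbatim source; setup, timer re-arming and statistics are
elided).  When sync_sb_inodes() bails out early on an inode that was dirtied too
recently, wbc.nr_to_write stays above zero with no congestion reported, so the
loop takes the break path and the inode with the remaining dirty pages waits for
the next interval:

	/* simplified sketch of the wb_kupdate() inner loop, not verbatim */
	while (nr_to_write > 0) {
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		writeback_inodes(&wbc);
		if (wbc.nr_to_write > 0) {
			if (wbc.encountered_congestion)
				congestion_wait(WRITE, HZ / 10);
			else
				break;	/* assumes all expired dirty data was written */
		}
		nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
	}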
Fix it by adding another list, sb->s_more_io, on which to park such an inode
while we iterate through sb->s_io, so that each dirty inode on that sb gets an
equal chance to flush some of its dirty pages.
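The core of the new scheme, assembled from the hunks below: an inode that still
has dirty pages is parked on s_more_io instead of being pushed back toward
s_dirty, and the parked inodes are spliced back onto s_io only once the current
s_io pass is exhausted, so every inode queued for this pass gets a turn before
any inode gets a second one:

	/* park an inode for re-scanning after the current s_io pass completes */
	static void requeue_io(struct inode *inode)
	{
		list_move(&inode->i_list, &inode->i_sb->s_more_io);
	}

	/* at the end of sync_sb_inodes(): feed the parked inodes into the next pass */
	if (list_empty(&sb->s_io))
		list_splice_init(&sb->s_more_io, &sb->s_io);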
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	fs/fs-writeback.c	| 36
-rw-r--r--	fs/super.c		|  1
-rw-r--r--	include/linux/fs.h	|  1
3 files changed, 16 insertions, 22 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 39fadfad86f7..c9d105ff7970 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -165,25 +165,11 @@ static void redirty_tail(struct inode *inode)
 }
 
 /*
- * Redirty an inode, but mark it as the very next-to-be-written inode on its
- * superblock's dirty-inode list.
- * We need to preserve s_dirty's reverse-time-orderedness, so we cheat by
- * setting this inode's dirtied_when to the same value as that of the inode
- * which is presently head-of-list, if present head-of-list is newer than this
- * inode. (head-of-list is the least-recently-dirtied inode: the oldest one).
+ * requeue inode for re-scanning after sb->s_io list is exhausted.
  */
-static void redirty_head(struct inode *inode)
+static void requeue_io(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
-
-	if (!list_empty(&sb->s_dirty)) {
-		struct inode *head_inode;
-
-		head_inode = list_entry(sb->s_dirty.prev, struct inode, i_list);
-		if (time_after(inode->dirtied_when, head_inode->dirtied_when))
-			inode->dirtied_when = head_inode->dirtied_when;
-	}
-	list_move_tail(&inode->i_list, &sb->s_dirty);
+	list_move(&inode->i_list, &inode->i_sb->s_more_io);
 }
 
 /*
@@ -255,7 +241,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * uncongested.
 			 */
 			inode->i_state |= I_DIRTY_PAGES;
-			redirty_head(inode);
+			requeue_io(inode);
 		} else {
 			/*
 			 * Otherwise fully redirty the inode so that
@@ -315,7 +301,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
		 * on s_io. We'll have another go at writing back this inode
		 * when the s_dirty iodes get moved back onto s_io.
		 */
-		redirty_head(inode);
+		requeue_io(inode);
 
		/*
		 * Even if we don't actually write the inode itself here,
@@ -410,14 +396,14 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 			wbc->encountered_congestion = 1;
 			if (!sb_is_blkdev_sb(sb))
 				break;		/* Skip a congested fs */
-			redirty_head(inode);
+			requeue_io(inode);
 			continue;		/* Skip a congested blockdev */
 		}
 
 		if (wbc->bdi && bdi != wbc->bdi) {
 			if (!sb_is_blkdev_sb(sb))
 				break;		/* fs has the wrong queue */
-			redirty_head(inode);
+			requeue_io(inode);
 			continue;		/* blockdev has wrong queue */
 		}
 
@@ -427,8 +413,10 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 
 		/* Was this inode dirtied too recently? */
 		if (wbc->older_than_this && time_after(inode->dirtied_when,
-						*wbc->older_than_this))
+						*wbc->older_than_this)) {
+			list_splice_init(&sb->s_io, sb->s_dirty.prev);
 			break;
+		}
 
 		/* Is another pdflush already flushing this queue? */
 		if (current_is_pdflush() && !writeback_acquire(bdi))
@@ -458,6 +446,10 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 		if (wbc->nr_to_write <= 0)
 			break;
 	}
+
+	if (list_empty(&sb->s_io))
+		list_splice_init(&sb->s_more_io, &sb->s_io);
+
 	return;		/* Leave any unwritten inodes on s_io */
 }
 
diff --git a/fs/super.c b/fs/super.c
index fc8ebedc6bed..1bfcca2104be 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -67,6 +67,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 	}
 	INIT_LIST_HEAD(&s->s_dirty);
 	INIT_LIST_HEAD(&s->s_io);
+	INIT_LIST_HEAD(&s->s_more_io);
 	INIT_LIST_HEAD(&s->s_files);
 	INIT_LIST_HEAD(&s->s_instances);
 	INIT_HLIST_HEAD(&s->s_anon);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 30aca3399450..0b38a897c114 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1002,6 +1002,7 @@ struct super_block {
 	struct list_head	s_inodes;	/* all inodes */
 	struct list_head	s_dirty;	/* dirty inodes */
 	struct list_head	s_io;		/* parked for writeback */
+	struct list_head	s_more_io;	/* parked for more writeback */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
 	struct list_head	s_files;
 