author		Jens Axboe <jens.axboe@oracle.com>	2009-09-02 03:19:46 -0400
committer	Jens Axboe <jens.axboe@oracle.com>	2009-09-11 03:20:25 -0400
commit		66f3b8e2e103a0b93b945764d98e9ba46cb926dd (patch)
tree		442bf5664214f0a1448e4010b09868cc58fdd3d1 /fs/fs-writeback.c
parent		d8a8559cd7a9ccac98d5f6f13297a2ff68a43627 (diff)
writeback: move dirty inodes from super_block to backing_dev_info
This is a first step at introducing per-bdi flusher threads. We should
have no change in behaviour, although sb_has_dirty_inodes() is now
ridiculously expensive, as there's no easy way to answer that question.
Not a huge problem, since it'll be deleted in subsequent patches.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
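In outline, the three per-superblock staging lists (s_dirty, s_io, s_more_io) become per-device lists (b_dirty, b_io, b_more_io) on struct backing_dev_info, reached from an inode via i_mapping. Below is a minimal userspace sketch of that shape; the struct layouts and the toy list routines are illustrative stand-ins for the kernel's <linux/list.h> and real definitions, not kernel code:

#include <stdio.h>

/* Toy stand-in for the kernel's intrusive struct list_head. */
struct list_head {
	struct list_head *next, *prev;
};
#define LIST_HEAD_INIT(name) { &(name), &(name) }

/* After the patch: the staging lists live with the backing device. */
struct backing_dev_info {
	struct list_head b_dirty;	/* dirty, waiting for expiry */
	struct list_head b_io;		/* parked for writeback */
	struct list_head b_more_io;	/* parked for more writeback */
};

struct address_space {
	struct backing_dev_info *backing_dev_info;
};

struct inode {
	struct list_head i_list;
	struct address_space *i_mapping;
};

/* Mirrors the patch's inode_to_bdi(): pure pointer chasing, no locking. */
#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)

/* What list_move() does: unlink, then re-insert right after the head. */
static void list_move(struct list_head *e, struct list_head *h)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
	e->next = h->next;
	e->prev = h;
	h->next->prev = e;
	h->next = e;
}

int main(void)
{
	struct backing_dev_info bdi = {
		LIST_HEAD_INIT(bdi.b_dirty),
		LIST_HEAD_INIT(bdi.b_io),
		LIST_HEAD_INIT(bdi.b_more_io),
	};
	struct address_space mapping = { &bdi };
	struct inode ino = { LIST_HEAD_INIT(ino.i_list), &mapping };

	/* __mark_inode_dirty() after the patch: queue on the bdi, not the sb. */
	list_move(&ino.i_list, &inode_to_bdi(&ino)->b_dirty);
	printf("b_dirty non-empty: %d\n", bdi.b_dirty.next != &bdi.b_dirty);
	return 0;
}

The point is only the pointer chain, inode to i_mapping to backing_dev_info: that is what lets writeback be organized per device rather than per filesystem.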
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--	fs/fs-writeback.c	197
1 file changed, 127 insertions(+), 70 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 271e5f44e871..45ad4bb700e6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -25,6 +25,7 @@
 #include <linux/buffer_head.h>
 #include "internal.h"
 
+#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
 /**
  * writeback_acquire - attempt to get exclusive writeback access to a device
@@ -165,12 +166,13 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			goto out;
 
 		/*
-		 * If the inode was already on s_dirty/s_io/s_more_io, don't
-		 * reposition it (that would break s_dirty time-ordering).
+		 * If the inode was already on b_dirty/b_io/b_more_io, don't
+		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &sb->s_dirty);
+			list_move(&inode->i_list,
+					&inode_to_bdi(inode)->b_dirty);
 		}
 	}
 out:
@@ -191,31 +193,30 @@ static int write_inode(struct inode *inode, int sync)
  * furthest end of its superblock's dirty-inode list.
  *
  * Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the s_dirty list.  If that is
+ * already the most-recently-dirtied inode on the b_dirty list.  If that is
  * the case then the inode must have been redirtied while it was being written
  * out and we don't reset its dirtied_when.
  */
 static void redirty_tail(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
 
-	if (!list_empty(&sb->s_dirty)) {
-		struct inode *tail_inode;
+	if (!list_empty(&bdi->b_dirty)) {
+		struct inode *tail;
 
-		tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
-		if (time_before(inode->dirtied_when,
-				tail_inode->dirtied_when))
+		tail = list_entry(bdi->b_dirty.next, struct inode, i_list);
+		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &sb->s_dirty);
+	list_move(&inode->i_list, &bdi->b_dirty);
 }
 
 /*
- * requeue inode for re-scanning after sb->s_io list is exhausted.
+ * requeue inode for re-scanning after bdi->b_io list is exhausted.
  */
 static void requeue_io(struct inode *inode)
 {
-	list_move(&inode->i_list, &inode->i_sb->s_more_io);
+	list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
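redirty_tail() compares dirtied_when values with time_before(), which stays correct even when the jiffies counter wraps; that is why it can simply re-stamp dirtied_when rather than sort the list. A standalone demo of the wrap-safe comparison (the macro body mirrors the definition in <linux/jiffies.h>; the sample values are made up):

#include <stdio.h>

/* Same trick as the kernel's time_before() in <linux/jiffies.h>:
 * signed subtraction keeps the comparison valid across a wrap. */
#define time_before(a, b)	((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long before_wrap = (unsigned long)-10; /* jiffies about to wrap */
	unsigned long after_wrap  = 5;                  /* jiffies after wrapping */

	/* A plain '<' is fooled by the wrap; time_before() is not. */
	printf("naive <      : %d\n", before_wrap < after_wrap);             /* 0 */
	printf("time_before(): %d\n", time_before(before_wrap, after_wrap)); /* 1 */
	return 0;
}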
@@ -262,18 +263,50 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
  * Queue all expired dirty inodes for io, eldest first.
  */
-static void queue_io(struct super_block *sb,
+static void queue_io(struct backing_dev_info *bdi,
 		     unsigned long *older_than_this)
+{
+	list_splice_init(&bdi->b_more_io, bdi->b_io.prev);
+	move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this);
+}
+
+static int sb_on_inode_list(struct super_block *sb, struct list_head *list)
 {
-	list_splice_init(&sb->s_more_io, sb->s_io.prev);
-	move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+	struct inode *inode;
+	int ret = 0;
+
+	spin_lock(&inode_lock);
+	list_for_each_entry(inode, list, i_list) {
+		if (inode->i_sb == sb) {
+			ret = 1;
+			break;
+		}
+	}
+	spin_unlock(&inode_lock);
+	return ret;
 }
 
 int sb_has_dirty_inodes(struct super_block *sb)
 {
-	return !list_empty(&sb->s_dirty) ||
-		!list_empty(&sb->s_io) ||
-		!list_empty(&sb->s_more_io);
+	struct backing_dev_info *bdi;
+	int ret = 0;
+
+	/*
+	 * This is REALLY expensive right now, but it'll go away
+	 * when the bdi writeback is introduced
+	 */
+	mutex_lock(&bdi_lock);
+	list_for_each_entry(bdi, &bdi_list, bdi_list) {
+		if (sb_on_inode_list(sb, &bdi->b_dirty) ||
+		    sb_on_inode_list(sb, &bdi->b_io) ||
+		    sb_on_inode_list(sb, &bdi->b_more_io)) {
+			ret = 1;
+			break;
+		}
+	}
+	mutex_unlock(&bdi_lock);
+
+	return ret;
 }
 EXPORT_SYMBOL(sb_has_dirty_inodes);
 
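Two things are worth noting in this hunk. queue_io() appends b_more_io at the tail of b_io before pulling in expired b_dirty inodes, so the scan still sees the eldest inodes first; and sb_has_dirty_inodes() now walks every inode on every bdi list, which is the "ridiculously expensive" cost called out in the changelog. A toy model of the tail splice, with a minimal list implementation standing in for the kernel's <linux/list.h>:

#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
	const char *name;	/* toy payload, not in the kernel's struct */
};

static void init(struct list_head *h) { h->next = h->prev = h; }

static void add_tail(struct list_head *e, struct list_head *h)
{
	e->prev = h->prev;
	e->next = h;
	h->prev->next = e;
	h->prev = e;
}

/* Modelled on list_splice_init(list, prev): transplant all of 'list' so it
 * sits right after 'prev', then reinitialise 'list' as empty. */
static void splice_init(struct list_head *list, struct list_head *prev)
{
	if (list->next != list) {		/* non-empty */
		struct list_head *first = list->next;
		struct list_head *last = list->prev;
		struct list_head *at = prev->next;

		first->prev = prev;
		prev->next = first;
		last->next = at;
		at->prev = last;
		init(list);
	}
}

int main(void)
{
	struct list_head b_io, b_more_io;
	struct list_head a = { .name = "expired-A" };
	struct list_head b = { .name = "requeued-B" };
	struct list_head *p;

	init(&b_io);
	init(&b_more_io);
	add_tail(&a, &b_io);		/* already queued for io */
	add_tail(&b, &b_more_io);	/* was parked by requeue_io() */

	/* queue_io(): append b_more_io at the tail of b_io */
	splice_init(&b_more_io, b_io.prev);

	for (p = b_io.next; p != &b_io; p = p->next)
		printf("%s\n", p->name);	/* expired-A, then requeued-B */
	return 0;
}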
@@ -322,11 +355,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	if (inode->i_state & I_SYNC) {
 		/*
 		 * If this inode is locked for writeback and we are not doing
-		 * writeback-for-data-integrity, move it to s_more_io so that
+		 * writeback-for-data-integrity, move it to b_more_io so that
 		 * writeback can proceed with the other inodes on s_io.
 		 *
 		 * We'll have another go at writing back this inode when we
-		 * completed a full scan of s_io.
+		 * completed a full scan of b_io.
 		 */
 		if (!wait) {
 			requeue_io(inode);
@@ -371,11 +404,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	/*
 	 * We didn't write back all the pages.  nfs_writepages()
 	 * sometimes bales out without doing anything. Redirty
-	 * the inode; Move it from s_io onto s_more_io/s_dirty.
+	 * the inode; Move it from b_io onto b_more_io/b_dirty.
 	 */
 	/*
 	 * akpm: if the caller was the kupdate function we put
-	 * this inode at the head of s_dirty so it gets first
+	 * this inode at the head of b_dirty so it gets first
 	 * consideration.  Otherwise, move it to the tail, for
 	 * the reasons described there.  I'm not really sure
 	 * how much sense this makes.  Presumably I had a good
@@ -385,7 +418,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		if (wbc->for_kupdate) {
 			/*
 			 * For the kupdate function we move the inode
-			 * to s_more_io so it will get more writeout as
+			 * to b_more_io so it will get more writeout as
 			 * soon as the queue becomes uncongested.
 			 */
 			inode->i_state |= I_DIRTY_PAGES;
@@ -433,51 +466,34 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
-/*
- * Write out a superblock's list of dirty inodes.  A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdflush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched.  For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * FIXME: this linear search could get expensive with many filesystems.  But
- * how to fix?  We need to go from an address_space to all inodes which share
- * a queue with that address_space.  (Easy: have a global "dirty superblocks"
- * list).
- *
- * The inodes to be written are parked on sb->s_io.  They are moved back onto
- * sb->s_dirty as they are selected for writing.  This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
- */
-static void generic_sync_sb_inodes(struct super_block *sb,
-				   struct writeback_control *wbc)
+static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
+				    struct writeback_control *wbc,
+				    struct super_block *sb)
 {
+	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 	const unsigned long start = jiffies;	/* livelock avoidance */
-	int sync = wbc->sync_mode == WB_SYNC_ALL;
 
 	spin_lock(&inode_lock);
-	if (!wbc->for_kupdate || list_empty(&sb->s_io))
-		queue_io(sb, wbc->older_than_this);
 
-	while (!list_empty(&sb->s_io)) {
-		struct inode *inode = list_entry(sb->s_io.prev,
+	if (!wbc->for_kupdate || list_empty(&bdi->b_io))
+		queue_io(bdi, wbc->older_than_this);
+
+	while (!list_empty(&bdi->b_io)) {
+		struct inode *inode = list_entry(bdi->b_io.prev,
 					struct inode, i_list);
-		struct address_space *mapping = inode->i_mapping;
-		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		long pages_skipped;
 
+		/*
+		 * super block given and doesn't match, skip this inode
+		 */
+		if (sb && sb != inode->i_sb) {
+			redirty_tail(inode);
+			continue;
+		}
+
 		if (!bdi_cap_writeback_dirty(bdi)) {
 			redirty_tail(inode);
-			if (sb_is_blkdev_sb(sb)) {
+			if (is_blkdev_sb) {
 				/*
 				 * Dirty memory-backed blockdev: the ramdisk
 				 * driver does this. Skip just this inode
@@ -499,14 +515,14 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 
 		if (wbc->nonblocking && bdi_write_congested(bdi)) {
 			wbc->encountered_congestion = 1;
-			if (!sb_is_blkdev_sb(sb))
+			if (!is_blkdev_sb)
 				break;		/* Skip a congested fs */
 			requeue_io(inode);
 			continue;		/* Skip a congested blockdev */
 		}
 
 		if (wbc->bdi && bdi != wbc->bdi) {
-			if (!sb_is_blkdev_sb(sb))
+			if (!is_blkdev_sb)
 				break;		/* fs has the wrong queue */
 			requeue_io(inode);
 			continue;		/* blockdev has wrong queue */
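Both checks above follow one pattern, which is why hoisting sb_is_blkdev_sb() into the is_blkdev_sb flag is safe: a normal filesystem's inodes all share one queue, so a mismatch ends the scan, while the blockdev superblock mixes inodes from many queues, so only the offending inode is requeued. A condensed sketch of that decision; the enum and function names here are mine, not kernel identifiers:

#include <stdio.h>
#include <stdbool.h>

enum action { WRITE_IT, STOP_SCAN, REQUEUE_AND_CONTINUE };

/* Condensed decision logic from generic_sync_bdi_inodes(): a congested or
 * wrong queue stops the scan for a normal fs, but the blockdev superblock
 * mixes inodes from many devices, so only the offending inode is requeued. */
static enum action classify(bool queue_ok, bool is_blkdev_sb)
{
	if (queue_ok)
		return WRITE_IT;
	return is_blkdev_sb ? REQUEUE_AND_CONTINUE : STOP_SCAN;
}

int main(void)
{
	printf("normal fs, congested: %d (STOP_SCAN)\n", classify(false, false));
	printf("blockdev,  congested: %d (REQUEUE_AND_CONTINUE)\n", classify(false, true));
	printf("normal fs, queue ok : %d (WRITE_IT)\n", classify(true, false));
	return 0;
}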
@@ -544,13 +560,57 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 			wbc->more_io = 1;
 			break;
 		}
-		if (!list_empty(&sb->s_more_io))
+		if (!list_empty(&bdi->b_more_io))
 			wbc->more_io = 1;
 	}
 
-	if (sync) {
+	spin_unlock(&inode_lock);
+	/* Leave any unwritten inodes on b_io */
+}
+
+/*
+ * Write out a superblock's list of dirty inodes.  A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
+ *
+ * If we're a pdflush thread, then implement pdflush collision avoidance
+ * against the entire list.
+ *
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched.  For other superblocks,
+ * assume that all inodes are backed by the same queue.
+ *
+ * FIXME: this linear search could get expensive with many filesystems.  But
+ * how to fix?  We need to go from an address_space to all inodes which share
+ * a queue with that address_space.  (Easy: have a global "dirty superblocks"
+ * list).
+ *
+ * The inodes to be written are parked on bdi->b_io.  They are moved back onto
+ * bdi->b_dirty as they are selected for writing.  This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
+ */
+static void generic_sync_sb_inodes(struct super_block *sb,
+				   struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi;
+
+	if (!wbc->bdi) {
+		mutex_lock(&bdi_lock);
+		list_for_each_entry(bdi, &bdi_list, bdi_list)
+			generic_sync_bdi_inodes(bdi, wbc, sb);
+		mutex_unlock(&bdi_lock);
+	} else
+		generic_sync_bdi_inodes(wbc->bdi, wbc, sb);
+
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		struct inode *inode, *old_inode = NULL;
 
+		spin_lock(&inode_lock);
+
 		/*
 		 * Data integrity sync. Must wait for all pages under writeback,
 		 * because there may have been pages dirtied before our sync
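generic_sync_sb_inodes() is now a thin dispatcher: with no target bdi in the writeback_control it walks the global bdi_list under bdi_lock, otherwise it syncs only the requested queue, and the WB_SYNC_ALL wait loop then runs once after all queues were processed. A userspace sketch of the same dispatch shape (the array stands in for bdi_list and the pthread mutex for bdi_lock; all names are illustrative):

#include <stdio.h>
#include <stddef.h>
#include <pthread.h>

struct bdi { const char *name; };

/* Stand-ins for the kernel's global bdi_list and bdi_lock. */
static struct bdi bdi_list[] = { { "sda" }, { "sdb" }, { "ram0" } };
static pthread_mutex_t bdi_lock = PTHREAD_MUTEX_INITIALIZER;

static void sync_one_bdi(struct bdi *bdi)
{
	printf("writeback on %s\n", bdi->name);
}

/* Same dispatch shape as the patched generic_sync_sb_inodes(): sync one
 * specific queue if the caller named it, else walk them all under the lock. */
static void sync_inodes(struct bdi *target)
{
	size_t i;

	if (!target) {
		pthread_mutex_lock(&bdi_lock);
		for (i = 0; i < sizeof(bdi_list) / sizeof(bdi_list[0]); i++)
			sync_one_bdi(&bdi_list[i]);
		pthread_mutex_unlock(&bdi_lock);
	} else
		sync_one_bdi(target);
}

int main(void)
{
	sync_inodes(NULL);		/* all queues */
	sync_inodes(&bdi_list[2]);	/* just one */
	return 0;
}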
@@ -588,10 +648,7 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 		}
 		spin_unlock(&inode_lock);
 		iput(old_inode);
-	} else
-		spin_unlock(&inode_lock);
-
-	return;		/* Leave any unwritten inodes on s_io */
+	}
 }
 
 /*
@@ -599,8 +656,8 @@ static void generic_sync_sb_inodes(struct super_block *sb,
  *
  * Note:
  * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
+ * ->b_dirty it hadn't been killed yet and kill_super() won't proceed
+ * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all
  * empty. Since __sync_single_inode() regains inode_lock before it finally moves
  * inode from superblock lists we are OK.
  *