author:     Jens Axboe <jens.axboe@oracle.com>    2009-09-02 03:19:46 -0400
committer:  Jens Axboe <jens.axboe@oracle.com>    2009-09-11 03:20:25 -0400
commit:     66f3b8e2e103a0b93b945764d98e9ba46cb926dd (patch)
tree:       442bf5664214f0a1448e4010b09868cc58fdd3d1 /fs/fs-writeback.c
parent:     d8a8559cd7a9ccac98d5f6f13297a2ff68a43627 (diff)
writeback: move dirty inodes from super_block to backing_dev_info
This is a first step toward introducing per-bdi flusher threads. We should
have no change in behaviour, although sb_has_dirty_inodes() is now
ridiculously expensive, as there's no easy way to answer that question.
Not a huge problem, since it'll be deleted in subsequent patches.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--  fs/fs-writeback.c  197
1 files changed, 127 insertions, 70 deletions
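The hunks below move the dirty-inode lists from the super_block (s_dirty/s_io/s_more_io) onto the backing_dev_info (b_dirty/b_io/b_more_io). The matching field additions land in include/linux/backing-dev.h, which is outside this diffstat; what follows is only a rough sketch of the layout the new code appears to rely on, with the list names taken from the hunks below and everything else assumed:

/*
 * Sketch only -- the real declaration lives in include/linux/backing-dev.h
 * and is not shown in this diff. The field names mirror the b_* lists and
 * the bdi_list iteration used in the patch.
 */
#include <linux/list.h>

struct backing_dev_info {
        /* ... existing fields ... */
        struct list_head bdi_list;   /* entry on the global bdi_list, protected by bdi_lock */
        struct list_head b_dirty;    /* dirty inodes, kept in dirtied_when order */
        struct list_head b_io;       /* inodes parked for writeback */
        struct list_head b_more_io;  /* inodes needing another writeback pass */
};

With the lists keyed by bdi rather than by super_block, sb_has_dirty_inodes() can no longer just test three list heads on the sb: it has to take bdi_lock, walk every registered bdi, and scan b_dirty, b_io and b_more_io for an inode belonging to that sb. That is the cost the commit message calls ridiculously expensive.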
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 271e5f44e871..45ad4bb700e6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -25,6 +25,7 @@
 #include <linux/buffer_head.h>
 #include "internal.h"
 
+#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
 
 /**
  * writeback_acquire - attempt to get exclusive writeback access to a device
@@ -165,12 +166,13 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			goto out;
 
 		/*
-		 * If the inode was already on s_dirty/s_io/s_more_io, don't
-		 * reposition it (that would break s_dirty time-ordering).
+		 * If the inode was already on b_dirty/b_io/b_more_io, don't
+		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &sb->s_dirty);
+			list_move(&inode->i_list,
+					&inode_to_bdi(inode)->b_dirty);
 		}
 	}
 out:
@@ -191,31 +193,30 @@ static int write_inode(struct inode *inode, int sync)
  * furthest end of its superblock's dirty-inode list.
  *
  * Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the s_dirty list. If that is
+ * already the most-recently-dirtied inode on the b_dirty list. If that is
  * the case then the inode must have been redirtied while it was being written
  * out and we don't reset its dirtied_when.
  */
 static void redirty_tail(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
 
-	if (!list_empty(&sb->s_dirty)) {
-		struct inode *tail_inode;
+	if (!list_empty(&bdi->b_dirty)) {
+		struct inode *tail;
 
-		tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
-		if (time_before(inode->dirtied_when,
-				tail_inode->dirtied_when))
+		tail = list_entry(bdi->b_dirty.next, struct inode, i_list);
+		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &sb->s_dirty);
+	list_move(&inode->i_list, &bdi->b_dirty);
 }
 
 /*
- * requeue inode for re-scanning after sb->s_io list is exhausted.
+ * requeue inode for re-scanning after bdi->b_io list is exhausted.
  */
 static void requeue_io(struct inode *inode)
 {
-	list_move(&inode->i_list, &inode->i_sb->s_more_io);
+	list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -262,18 +263,50 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
  * Queue all expired dirty inodes for io, eldest first.
  */
-static void queue_io(struct super_block *sb,
-		     unsigned long *older_than_this)
+static void queue_io(struct backing_dev_info *bdi,
+		     unsigned long *older_than_this)
+{
+	list_splice_init(&bdi->b_more_io, bdi->b_io.prev);
+	move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this);
+}
+
+static int sb_on_inode_list(struct super_block *sb, struct list_head *list)
 {
-	list_splice_init(&sb->s_more_io, sb->s_io.prev);
-	move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+	struct inode *inode;
+	int ret = 0;
+
+	spin_lock(&inode_lock);
+	list_for_each_entry(inode, list, i_list) {
+		if (inode->i_sb == sb) {
+			ret = 1;
+			break;
+		}
+	}
+	spin_unlock(&inode_lock);
+	return ret;
 }
 
 int sb_has_dirty_inodes(struct super_block *sb)
 {
-	return !list_empty(&sb->s_dirty) ||
-	       !list_empty(&sb->s_io) ||
-	       !list_empty(&sb->s_more_io);
+	struct backing_dev_info *bdi;
+	int ret = 0;
+
+	/*
+	 * This is REALLY expensive right now, but it'll go away
+	 * when the bdi writeback is introduced
+	 */
+	mutex_lock(&bdi_lock);
+	list_for_each_entry(bdi, &bdi_list, bdi_list) {
+		if (sb_on_inode_list(sb, &bdi->b_dirty) ||
+		    sb_on_inode_list(sb, &bdi->b_io) ||
+		    sb_on_inode_list(sb, &bdi->b_more_io)) {
+			ret = 1;
+			break;
+		}
+	}
+	mutex_unlock(&bdi_lock);
+
+	return ret;
 }
 EXPORT_SYMBOL(sb_has_dirty_inodes);
 
@@ -322,11 +355,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	if (inode->i_state & I_SYNC) {
 		/*
 		 * If this inode is locked for writeback and we are not doing
-		 * writeback-for-data-integrity, move it to s_more_io so that
+		 * writeback-for-data-integrity, move it to b_more_io so that
 		 * writeback can proceed with the other inodes on s_io.
 		 *
 		 * We'll have another go at writing back this inode when we
-		 * completed a full scan of s_io.
+		 * completed a full scan of b_io.
 		 */
 		if (!wait) {
 			requeue_io(inode);
@@ -371,11 +404,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			/*
 			 * We didn't write back all the pages. nfs_writepages()
 			 * sometimes bales out without doing anything. Redirty
-			 * the inode; Move it from s_io onto s_more_io/s_dirty.
+			 * the inode; Move it from b_io onto b_more_io/b_dirty.
 			 */
 			/*
 			 * akpm: if the caller was the kupdate function we put
-			 * this inode at the head of s_dirty so it gets first
+			 * this inode at the head of b_dirty so it gets first
 			 * consideration. Otherwise, move it to the tail, for
 			 * the reasons described there. I'm not really sure
 			 * how much sense this makes. Presumably I had a good
@@ -385,7 +418,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			if (wbc->for_kupdate) {
 				/*
 				 * For the kupdate function we move the inode
-				 * to s_more_io so it will get more writeout as
+				 * to b_more_io so it will get more writeout as
 				 * soon as the queue becomes uncongested.
 				 */
 				inode->i_state |= I_DIRTY_PAGES;
@@ -433,51 +466,34 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
-/*
- * Write out a superblock's list of dirty inodes. A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdflush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched. For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * FIXME: this linear search could get expensive with many fileystems. But
- * how to fix? We need to go from an address_space to all inodes which share
- * a queue with that address_space. (Easy: have a global "dirty superblocks"
- * list).
- *
- * The inodes to be written are parked on sb->s_io. They are moved back onto
- * sb->s_dirty as they are selected for writing. This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
- */
-static void generic_sync_sb_inodes(struct super_block *sb,
-					struct writeback_control *wbc)
+static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
+				    struct writeback_control *wbc,
+				    struct super_block *sb)
 {
+	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 	const unsigned long start = jiffies;	/* livelock avoidance */
-	int sync = wbc->sync_mode == WB_SYNC_ALL;
 
 	spin_lock(&inode_lock);
-	if (!wbc->for_kupdate || list_empty(&sb->s_io))
-		queue_io(sb, wbc->older_than_this);
 
-	while (!list_empty(&sb->s_io)) {
-		struct inode *inode = list_entry(sb->s_io.prev,
+	if (!wbc->for_kupdate || list_empty(&bdi->b_io))
+		queue_io(bdi, wbc->older_than_this);
+
+	while (!list_empty(&bdi->b_io)) {
+		struct inode *inode = list_entry(bdi->b_io.prev,
 				struct inode, i_list);
-		struct address_space *mapping = inode->i_mapping;
-		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		long pages_skipped;
 
+		/*
+		 * super block given and doesn't match, skip this inode
+		 */
+		if (sb && sb != inode->i_sb) {
+			redirty_tail(inode);
+			continue;
+		}
+
 		if (!bdi_cap_writeback_dirty(bdi)) {
 			redirty_tail(inode);
-			if (sb_is_blkdev_sb(sb)) {
+			if (is_blkdev_sb) {
 				/*
 				 * Dirty memory-backed blockdev: the ramdisk
 				 * driver does this. Skip just this inode
@@ -499,14 +515,14 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 
 		if (wbc->nonblocking && bdi_write_congested(bdi)) {
 			wbc->encountered_congestion = 1;
-			if (!sb_is_blkdev_sb(sb))
+			if (!is_blkdev_sb)
 				break;		/* Skip a congested fs */
 			requeue_io(inode);
 			continue;		/* Skip a congested blockdev */
 		}
 
 		if (wbc->bdi && bdi != wbc->bdi) {
-			if (!sb_is_blkdev_sb(sb))
+			if (!is_blkdev_sb)
 				break;		/* fs has the wrong queue */
 			requeue_io(inode);
 			continue;		/* blockdev has wrong queue */
@@ -544,13 +560,57 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 			wbc->more_io = 1;
 			break;
 		}
-		if (!list_empty(&sb->s_more_io))
+		if (!list_empty(&bdi->b_more_io))
 			wbc->more_io = 1;
 	}
 
-	if (sync) {
+	spin_unlock(&inode_lock);
+	/* Leave any unwritten inodes on b_io */
+}
+
+/*
+ * Write out a superblock's list of dirty inodes. A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
+ *
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * against the entire list.
+ *
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched. For other superblocks,
+ * assume that all inodes are backed by the same queue.
+ *
+ * FIXME: this linear search could get expensive with many fileystems. But
+ * how to fix? We need to go from an address_space to all inodes which share
+ * a queue with that address_space. (Easy: have a global "dirty superblocks"
+ * list).
+ *
+ * The inodes to be written are parked on bdi->b_io. They are moved back onto
+ * bdi->b_dirty as they are selected for writing. This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
+ */
+static void generic_sync_sb_inodes(struct super_block *sb,
+				   struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi;
+
+	if (!wbc->bdi) {
+		mutex_lock(&bdi_lock);
+		list_for_each_entry(bdi, &bdi_list, bdi_list)
+			generic_sync_bdi_inodes(bdi, wbc, sb);
+		mutex_unlock(&bdi_lock);
+	} else
+		generic_sync_bdi_inodes(wbc->bdi, wbc, sb);
+
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		struct inode *inode, *old_inode = NULL;
 
+		spin_lock(&inode_lock);
+
 		/*
 		 * Data integrity sync. Must wait for all pages under writeback,
 		 * because there may have been pages dirtied before our sync
@@ -588,10 +648,7 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 		}
 		spin_unlock(&inode_lock);
 		iput(old_inode);
-	} else
-		spin_unlock(&inode_lock);
-
-	return;		/* Leave any unwritten inodes on s_io */
+	}
 }
 
 /*
@@ -599,8 +656,8 @@ static void generic_sync_sb_inodes(struct super_block *sb,
  *
  * Note:
  * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
+ * ->b_dirty it's hadn't been killed yet and kill_super() won't proceed
+ * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all
  * empty. Since __sync_single_inode() regains inode_lock before it finally moves
  * inode from superblock lists we are OK.
  *