diff options
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_sync.c')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.c | 230 |
1 files changed, 117 insertions, 113 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 594cd822d84d..e4f9c1b0836c 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include "xfs_log.h" | 22 | #include "xfs_log.h" |
23 | #include "xfs_inum.h" | 23 | #include "xfs_inum.h" |
24 | #include "xfs_trans.h" | 24 | #include "xfs_trans.h" |
25 | #include "xfs_trans_priv.h" | ||
25 | #include "xfs_sb.h" | 26 | #include "xfs_sb.h" |
26 | #include "xfs_ag.h" | 27 | #include "xfs_ag.h" |
27 | #include "xfs_mount.h" | 28 | #include "xfs_mount.h" |
@@ -39,6 +40,8 @@ | |||
39 | #include <linux/kthread.h> | 40 | #include <linux/kthread.h> |
40 | #include <linux/freezer.h> | 41 | #include <linux/freezer.h> |
41 | 42 | ||
43 | struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ | ||
44 | |||
42 | /* | 45 | /* |
43 | * The inode lookup is done in batches to keep the amount of lock traffic and | 46 | * The inode lookup is done in batches to keep the amount of lock traffic and |
44 | * radix tree lookups to a minimum. The batch size is a trade off between | 47 | * radix tree lookups to a minimum. The batch size is a trade off between |
@@ -401,7 +404,7 @@ xfs_quiesce_fs( | |||
401 | /* | 404 | /* |
402 | * Second stage of a quiesce. The data is already synced, now we have to take | 405 | * Second stage of a quiesce. The data is already synced, now we have to take |
403 | * care of the metadata. New transactions are already blocked, so we need to | 406 | * care of the metadata. New transactions are already blocked, so we need to |
404 | * wait for any remaining transactions to drain out before proceding. | 407 | * wait for any remaining transactions to drain out before proceeding. |
405 | */ | 408 | */ |
406 | void | 409 | void |
407 | xfs_quiesce_attr( | 410 | xfs_quiesce_attr( |
@@ -431,62 +434,12 @@ xfs_quiesce_attr( | |||
431 | xfs_unmountfs_writesb(mp); | 434 | xfs_unmountfs_writesb(mp); |
432 | } | 435 | } |
433 | 436 | ||
434 | /* | 437 | static void |
435 | * Enqueue a work item to be picked up by the vfs xfssyncd thread. | 438 | xfs_syncd_queue_sync( |
436 | * Doing this has two advantages: | 439 | struct xfs_mount *mp) |
437 | * - It saves on stack space, which is tight in certain situations | ||
438 | * - It can be used (with care) as a mechanism to avoid deadlocks. | ||
439 | * Flushing while allocating in a full filesystem requires both. | ||
440 | */ | ||
441 | STATIC void | ||
442 | xfs_syncd_queue_work( | ||
443 | struct xfs_mount *mp, | ||
444 | void *data, | ||
445 | void (*syncer)(struct xfs_mount *, void *), | ||
446 | struct completion *completion) | ||
447 | { | ||
448 | struct xfs_sync_work *work; | ||
449 | |||
450 | work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP); | ||
451 | INIT_LIST_HEAD(&work->w_list); | ||
452 | work->w_syncer = syncer; | ||
453 | work->w_data = data; | ||
454 | work->w_mount = mp; | ||
455 | work->w_completion = completion; | ||
456 | spin_lock(&mp->m_sync_lock); | ||
457 | list_add_tail(&work->w_list, &mp->m_sync_list); | ||
458 | spin_unlock(&mp->m_sync_lock); | ||
459 | wake_up_process(mp->m_sync_task); | ||
460 | } | ||
461 | |||
462 | /* | ||
463 | * Flush delayed allocate data, attempting to free up reserved space | ||
464 | * from existing allocations. At this point a new allocation attempt | ||
465 | * has failed with ENOSPC and we are in the process of scratching our | ||
466 | * heads, looking about for more room... | ||
467 | */ | ||
468 | STATIC void | ||
469 | xfs_flush_inodes_work( | ||
470 | struct xfs_mount *mp, | ||
471 | void *arg) | ||
472 | { | ||
473 | struct inode *inode = arg; | ||
474 | xfs_sync_data(mp, SYNC_TRYLOCK); | ||
475 | xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); | ||
476 | iput(inode); | ||
477 | } | ||
478 | |||
479 | void | ||
480 | xfs_flush_inodes( | ||
481 | xfs_inode_t *ip) | ||
482 | { | 440 | { |
483 | struct inode *inode = VFS_I(ip); | 441 | queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, |
484 | DECLARE_COMPLETION_ONSTACK(completion); | 442 | msecs_to_jiffies(xfs_syncd_centisecs * 10)); |
485 | |||
486 | igrab(inode); | ||
487 | xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); | ||
488 | wait_for_completion(&completion); | ||
489 | xfs_log_force(ip->i_mount, XFS_LOG_SYNC); | ||
490 | } | 443 | } |
491 | 444 | ||
492 | /* | 445 | /* |
@@ -496,9 +449,10 @@ xfs_flush_inodes( | |||
496 | */ | 449 | */ |
497 | STATIC void | 450 | STATIC void |
498 | xfs_sync_worker( | 451 | xfs_sync_worker( |
499 | struct xfs_mount *mp, | 452 | struct work_struct *work) |
500 | void *unused) | ||
501 | { | 453 | { |
454 | struct xfs_mount *mp = container_of(to_delayed_work(work), | ||
455 | struct xfs_mount, m_sync_work); | ||
502 | int error; | 456 | int error; |
503 | 457 | ||
504 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { | 458 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { |
@@ -508,73 +462,106 @@ xfs_sync_worker( | |||
508 | error = xfs_fs_log_dummy(mp); | 462 | error = xfs_fs_log_dummy(mp); |
509 | else | 463 | else |
510 | xfs_log_force(mp, 0); | 464 | xfs_log_force(mp, 0); |
511 | xfs_reclaim_inodes(mp, 0); | ||
512 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); | 465 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); |
466 | |||
467 | /* start pushing all the metadata that is currently dirty */ | ||
468 | xfs_ail_push_all(mp->m_ail); | ||
513 | } | 469 | } |
514 | mp->m_sync_seq++; | 470 | |
515 | wake_up(&mp->m_wait_single_sync_task); | 471 | /* queue us up again */ |
472 | xfs_syncd_queue_sync(mp); | ||
516 | } | 473 | } |
517 | 474 | ||
518 | STATIC int | 475 | /* |
519 | xfssyncd( | 476 | * Queue a new inode reclaim pass if there are reclaimable inodes and there |
520 | void *arg) | 477 | * isn't a reclaim pass already in progress. By default it runs every 5s based |
478 | * on the xfs syncd work default of 30s. Perhaps this should have it's own | ||
479 | * tunable, but that can be done if this method proves to be ineffective or too | ||
480 | * aggressive. | ||
481 | */ | ||
482 | static void | ||
483 | xfs_syncd_queue_reclaim( | ||
484 | struct xfs_mount *mp) | ||
521 | { | 485 | { |
522 | struct xfs_mount *mp = arg; | ||
523 | long timeleft; | ||
524 | xfs_sync_work_t *work, *n; | ||
525 | LIST_HEAD (tmp); | ||
526 | |||
527 | set_freezable(); | ||
528 | timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); | ||
529 | for (;;) { | ||
530 | if (list_empty(&mp->m_sync_list)) | ||
531 | timeleft = schedule_timeout_interruptible(timeleft); | ||
532 | /* swsusp */ | ||
533 | try_to_freeze(); | ||
534 | if (kthread_should_stop() && list_empty(&mp->m_sync_list)) | ||
535 | break; | ||
536 | 486 | ||
537 | spin_lock(&mp->m_sync_lock); | 487 | /* |
538 | /* | 488 | * We can have inodes enter reclaim after we've shut down the syncd |
539 | * We can get woken by laptop mode, to do a sync - | 489 | * workqueue during unmount, so don't allow reclaim work to be queued |
540 | * that's the (only!) case where the list would be | 490 | * during unmount. |
541 | * empty with time remaining. | 491 | */ |
542 | */ | 492 | if (!(mp->m_super->s_flags & MS_ACTIVE)) |
543 | if (!timeleft || list_empty(&mp->m_sync_list)) { | 493 | return; |
544 | if (!timeleft) | ||
545 | timeleft = xfs_syncd_centisecs * | ||
546 | msecs_to_jiffies(10); | ||
547 | INIT_LIST_HEAD(&mp->m_sync_work.w_list); | ||
548 | list_add_tail(&mp->m_sync_work.w_list, | ||
549 | &mp->m_sync_list); | ||
550 | } | ||
551 | list_splice_init(&mp->m_sync_list, &tmp); | ||
552 | spin_unlock(&mp->m_sync_lock); | ||
553 | 494 | ||
554 | list_for_each_entry_safe(work, n, &tmp, w_list) { | 495 | rcu_read_lock(); |
555 | (*work->w_syncer)(mp, work->w_data); | 496 | if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { |
556 | list_del(&work->w_list); | 497 | queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, |
557 | if (work == &mp->m_sync_work) | 498 | msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); |
558 | continue; | ||
559 | if (work->w_completion) | ||
560 | complete(work->w_completion); | ||
561 | kmem_free(work); | ||
562 | } | ||
563 | } | 499 | } |
500 | rcu_read_unlock(); | ||
501 | } | ||
564 | 502 | ||
565 | return 0; | 503 | /* |
504 | * This is a fast pass over the inode cache to try to get reclaim moving on as | ||
505 | * many inodes as possible in a short period of time. It kicks itself every few | ||
506 | * seconds, as well as being kicked by the inode cache shrinker when memory | ||
507 | * goes low. It scans as quickly as possible avoiding locked inodes or those | ||
508 | * already being flushed, and once done schedules a future pass. | ||
509 | */ | ||
510 | STATIC void | ||
511 | xfs_reclaim_worker( | ||
512 | struct work_struct *work) | ||
513 | { | ||
514 | struct xfs_mount *mp = container_of(to_delayed_work(work), | ||
515 | struct xfs_mount, m_reclaim_work); | ||
516 | |||
517 | xfs_reclaim_inodes(mp, SYNC_TRYLOCK); | ||
518 | xfs_syncd_queue_reclaim(mp); | ||
519 | } | ||
520 | |||
521 | /* | ||
522 | * Flush delayed allocate data, attempting to free up reserved space | ||
523 | * from existing allocations. At this point a new allocation attempt | ||
524 | * has failed with ENOSPC and we are in the process of scratching our | ||
525 | * heads, looking about for more room. | ||
526 | * | ||
527 | * Queue a new data flush if there isn't one already in progress and | ||
528 | * wait for completion of the flush. This means that we only ever have one | ||
529 | * inode flush in progress no matter how many ENOSPC events are occurring and | ||
530 | * so will prevent the system from bogging down due to every concurrent | ||
531 | * ENOSPC event scanning all the active inodes in the system for writeback. | ||
532 | */ | ||
533 | void | ||
534 | xfs_flush_inodes( | ||
535 | struct xfs_inode *ip) | ||
536 | { | ||
537 | struct xfs_mount *mp = ip->i_mount; | ||
538 | |||
539 | queue_work(xfs_syncd_wq, &mp->m_flush_work); | ||
540 | flush_work_sync(&mp->m_flush_work); | ||
541 | } | ||
542 | |||
543 | STATIC void | ||
544 | xfs_flush_worker( | ||
545 | struct work_struct *work) | ||
546 | { | ||
547 | struct xfs_mount *mp = container_of(work, | ||
548 | struct xfs_mount, m_flush_work); | ||
549 | |||
550 | xfs_sync_data(mp, SYNC_TRYLOCK); | ||
551 | xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); | ||
566 | } | 552 | } |
567 | 553 | ||
568 | int | 554 | int |
569 | xfs_syncd_init( | 555 | xfs_syncd_init( |
570 | struct xfs_mount *mp) | 556 | struct xfs_mount *mp) |
571 | { | 557 | { |
572 | mp->m_sync_work.w_syncer = xfs_sync_worker; | 558 | INIT_WORK(&mp->m_flush_work, xfs_flush_worker); |
573 | mp->m_sync_work.w_mount = mp; | 559 | INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); |
574 | mp->m_sync_work.w_completion = NULL; | 560 | INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); |
575 | mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); | 561 | |
576 | if (IS_ERR(mp->m_sync_task)) | 562 | xfs_syncd_queue_sync(mp); |
577 | return -PTR_ERR(mp->m_sync_task); | 563 | xfs_syncd_queue_reclaim(mp); |
564 | |||
578 | return 0; | 565 | return 0; |
579 | } | 566 | } |
580 | 567 | ||
@@ -582,7 +569,9 @@ void | |||
582 | xfs_syncd_stop( | 569 | xfs_syncd_stop( |
583 | struct xfs_mount *mp) | 570 | struct xfs_mount *mp) |
584 | { | 571 | { |
585 | kthread_stop(mp->m_sync_task); | 572 | cancel_delayed_work_sync(&mp->m_sync_work); |
573 | cancel_delayed_work_sync(&mp->m_reclaim_work); | ||
574 | cancel_work_sync(&mp->m_flush_work); | ||
586 | } | 575 | } |
587 | 576 | ||
588 | void | 577 | void |
@@ -601,6 +590,10 @@ __xfs_inode_set_reclaim_tag( | |||
601 | XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), | 590 | XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), |
602 | XFS_ICI_RECLAIM_TAG); | 591 | XFS_ICI_RECLAIM_TAG); |
603 | spin_unlock(&ip->i_mount->m_perag_lock); | 592 | spin_unlock(&ip->i_mount->m_perag_lock); |
593 | |||
594 | /* schedule periodic background inode reclaim */ | ||
595 | xfs_syncd_queue_reclaim(ip->i_mount); | ||
596 | |||
604 | trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, | 597 | trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, |
605 | -1, _RET_IP_); | 598 | -1, _RET_IP_); |
606 | } | 599 | } |
@@ -1017,7 +1010,13 @@ xfs_reclaim_inodes( | |||
1017 | } | 1010 | } |
1018 | 1011 | ||
1019 | /* | 1012 | /* |
1020 | * Shrinker infrastructure. | 1013 | * Inode cache shrinker. |
1014 | * | ||
1015 | * When called we make sure that there is a background (fast) inode reclaim in | ||
1016 | * progress, while we will throttle the speed of reclaim via doiing synchronous | ||
1017 | * reclaim of inodes. That means if we come across dirty inodes, we wait for | ||
1018 | * them to be cleaned, which we hope will not be very long due to the | ||
1019 | * background walker having already kicked the IO off on those dirty inodes. | ||
1021 | */ | 1020 | */ |
1022 | static int | 1021 | static int |
1023 | xfs_reclaim_inode_shrink( | 1022 | xfs_reclaim_inode_shrink( |
@@ -1032,10 +1031,15 @@ xfs_reclaim_inode_shrink( | |||
1032 | 1031 | ||
1033 | mp = container_of(shrink, struct xfs_mount, m_inode_shrink); | 1032 | mp = container_of(shrink, struct xfs_mount, m_inode_shrink); |
1034 | if (nr_to_scan) { | 1033 | if (nr_to_scan) { |
1034 | /* kick background reclaimer and push the AIL */ | ||
1035 | xfs_syncd_queue_reclaim(mp); | ||
1036 | xfs_ail_push_all(mp->m_ail); | ||
1037 | |||
1035 | if (!(gfp_mask & __GFP_FS)) | 1038 | if (!(gfp_mask & __GFP_FS)) |
1036 | return -1; | 1039 | return -1; |
1037 | 1040 | ||
1038 | xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); | 1041 | xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, |
1042 | &nr_to_scan); | ||
1039 | /* terminate if we don't exhaust the scan */ | 1043 | /* terminate if we don't exhaust the scan */ |
1040 | if (nr_to_scan > 0) | 1044 | if (nr_to_scan > 0) |
1041 | return -1; | 1045 | return -1; |