Diffstat (limited to 'fs/xfs/linux-2.6/xfs_sync.c')
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.c	265
1 file changed, 146 insertions(+), 119 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index e22f0057d21f..e4f9c1b0836c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
+#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -39,6 +40,8 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
+struct workqueue_struct *xfs_syncd_wq;	/* sync workqueue */
+
 /*
  * The inode lookup is done in batches to keep the amount of lock traffic and
  * radix tree lookups to a minimum. The batch size is a trade off between
@@ -401,7 +404,7 @@ xfs_quiesce_fs(
 /*
  * Second stage of a quiesce. The data is already synced, now we have to take
  * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceding.
+ * wait for any remaining transactions to drain out before proceeding.
  */
 void
 xfs_quiesce_attr(
@@ -425,69 +428,18 @@ xfs_quiesce_attr(
 	/* Push the superblock and write an unmount record */
 	error = xfs_log_sbcount(mp, 1);
 	if (error)
-		xfs_fs_cmn_err(CE_WARN, mp,
-				"xfs_attr_quiesce: failed to log sb changes. "
+		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
 				"Frozen image may not be consistent.");
 	xfs_log_unmount_write(mp);
 	xfs_unmountfs_writesb(mp);
 }
 
-/*
- * Enqueue a work item to be picked up by the vfs xfssyncd thread.
- * Doing this has two advantages:
- * - It saves on stack space, which is tight in certain situations
- * - It can be used (with care) as a mechanism to avoid deadlocks.
- * Flushing while allocating in a full filesystem requires both.
- */
-STATIC void
-xfs_syncd_queue_work(
-	struct xfs_mount *mp,
-	void		*data,
-	void		(*syncer)(struct xfs_mount *, void *),
-	struct completion *completion)
+static void
+xfs_syncd_queue_sync(
+	struct xfs_mount	*mp)
 {
-	struct xfs_sync_work *work;
-
-	work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
-	INIT_LIST_HEAD(&work->w_list);
-	work->w_syncer = syncer;
-	work->w_data = data;
-	work->w_mount = mp;
-	work->w_completion = completion;
-	spin_lock(&mp->m_sync_lock);
-	list_add_tail(&work->w_list, &mp->m_sync_list);
-	spin_unlock(&mp->m_sync_lock);
-	wake_up_process(mp->m_sync_task);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations. At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room...
- */
-STATIC void
-xfs_flush_inodes_work(
-	struct xfs_mount *mp,
-	void		*arg)
-{
-	struct inode	*inode = arg;
-	xfs_sync_data(mp, SYNC_TRYLOCK);
-	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-	iput(inode);
-}
-
-void
-xfs_flush_inodes(
-	xfs_inode_t	*ip)
-{
-	struct inode	*inode = VFS_I(ip);
-	DECLARE_COMPLETION_ONSTACK(completion);
-
-	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
-	wait_for_completion(&completion);
-	xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
+	queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
+				msecs_to_jiffies(xfs_syncd_centisecs * 10));
 }
 
 /*
@@ -497,9 +449,10 @@ xfs_flush_inodes(
  */
 STATIC void
 xfs_sync_worker(
-	struct xfs_mount *mp,
-	void		*unused)
+	struct work_struct *work)
 {
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_sync_work);
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
@@ -509,73 +462,106 @@ xfs_sync_worker(
 			error = xfs_fs_log_dummy(mp);
 		else
 			xfs_log_force(mp, 0);
-		xfs_reclaim_inodes(mp, 0);
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
+
+		/* start pushing all the metadata that is currently dirty */
+		xfs_ail_push_all(mp->m_ail);
 	}
-	mp->m_sync_seq++;
-	wake_up(&mp->m_wait_single_sync_task);
+
+	/* queue us up again */
+	xfs_syncd_queue_sync(mp);
 }
 
-STATIC int
-xfssyncd(
-	void			*arg)
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs syncd work default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_syncd_queue_reclaim(
+	struct xfs_mount	*mp)
 {
-	struct xfs_mount	*mp = arg;
-	long			timeleft;
-	xfs_sync_work_t		*work, *n;
-	LIST_HEAD		(tmp);
-
-	set_freezable();
-	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
-	for (;;) {
-		if (list_empty(&mp->m_sync_list))
-			timeleft = schedule_timeout_interruptible(timeleft);
-		/* swsusp */
-		try_to_freeze();
-		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
-			break;
 
-		spin_lock(&mp->m_sync_lock);
-		/*
-		 * We can get woken by laptop mode, to do a sync -
-		 * that's the (only!) case where the list would be
-		 * empty with time remaining.
-		 */
-		if (!timeleft || list_empty(&mp->m_sync_list)) {
-			if (!timeleft)
-				timeleft = xfs_syncd_centisecs *
-						msecs_to_jiffies(10);
-			INIT_LIST_HEAD(&mp->m_sync_work.w_list);
-			list_add_tail(&mp->m_sync_work.w_list,
-					&mp->m_sync_list);
-		}
-		list_splice_init(&mp->m_sync_list, &tmp);
-		spin_unlock(&mp->m_sync_lock);
+	/*
+	 * We can have inodes enter reclaim after we've shut down the syncd
+	 * workqueue during unmount, so don't allow reclaim work to be queued
+	 * during unmount.
+	 */
+	if (!(mp->m_super->s_flags & MS_ACTIVE))
+		return;
 
-		list_for_each_entry_safe(work, n, &tmp, w_list) {
-			(*work->w_syncer)(mp, work->w_data);
-			list_del(&work->w_list);
-			if (work == &mp->m_sync_work)
-				continue;
-			if (work->w_completion)
-				complete(work->w_completion);
-			kmem_free(work);
-		}
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 	}
+	rcu_read_unlock();
+}
 
-	return 0;
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+STATIC void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+
+	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+	xfs_syncd_queue_reclaim(mp);
+}
+
+/*
+ * Flush delayed allocate data, attempting to free up reserved space
+ * from existing allocations. At this point a new allocation attempt
+ * has failed with ENOSPC and we are in the process of scratching our
+ * heads, looking about for more room.
+ *
+ * Queue a new data flush if there isn't one already in progress and
+ * wait for completion of the flush. This means that we only ever have one
+ * inode flush in progress no matter how many ENOSPC events are occurring and
+ * so will prevent the system from bogging down due to every concurrent
+ * ENOSPC event scanning all the active inodes in the system for writeback.
+ */
+void
+xfs_flush_inodes(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	queue_work(xfs_syncd_wq, &mp->m_flush_work);
+	flush_work_sync(&mp->m_flush_work);
+}
+
+STATIC void
+xfs_flush_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(work,
+					struct xfs_mount, m_flush_work);
+
+	xfs_sync_data(mp, SYNC_TRYLOCK);
+	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
 }
 
 int
 xfs_syncd_init(
 	struct xfs_mount	*mp)
 {
-	mp->m_sync_work.w_syncer = xfs_sync_worker;
-	mp->m_sync_work.w_mount = mp;
-	mp->m_sync_work.w_completion = NULL;
-	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
-	if (IS_ERR(mp->m_sync_task))
-		return -PTR_ERR(mp->m_sync_task);
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+	xfs_syncd_queue_sync(mp);
+	xfs_syncd_queue_reclaim(mp);
+
 	return 0;
 }
 
@@ -583,7 +569,9 @@ void
 xfs_syncd_stop(
 	struct xfs_mount	*mp)
 {
-	kthread_stop(mp->m_sync_task);
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
 }
 
 void
@@ -602,6 +590,10 @@ __xfs_inode_set_reclaim_tag(
 				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
 				XFS_ICI_RECLAIM_TAG);
 		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		/* schedule periodic background inode reclaim */
+		xfs_syncd_queue_reclaim(ip->i_mount);
+
 		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
 							-1, _RET_IP_);
 	}
@@ -762,8 +754,10 @@ xfs_reclaim_inode(
 	struct xfs_perag	*pag,
 	int			sync_mode)
 {
-	int	error = 0;
+	int	error;
 
+restart:
+	error = 0;
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if (!xfs_iflock_nowait(ip)) {
 		if (!(sync_mode & SYNC_WAIT))
@@ -789,9 +783,31 @@ xfs_reclaim_inode(
 	if (xfs_inode_clean(ip))
 		goto reclaim;
 
-	/* Now we have an inode that needs flushing */
-	error = xfs_iflush(ip, sync_mode);
+	/*
+	 * Now we have an inode that needs flushing.
+	 *
+	 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
+	 * reclaim as we can deadlock with inode cluster removal.
+	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
+	 * ip->i_lock, and we are doing the exact opposite here. As a result,
+	 * doing a blocking xfs_itobp() to get the cluster buffer will result
+	 * in an ABBA deadlock with xfs_ifree_cluster().
+	 *
+	 * As xfs_ifree_cluster() must gather all inodes that are active in the
+	 * cache to mark them stale, if we hit this case we don't actually want
+	 * to do IO here - we want the inode marked stale so we can simply
+	 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
+	 * just unlock the inode, back off and try again. Hopefully the next
+	 * pass through will see the stale flag set on the inode.
+	 */
+	error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
 	if (sync_mode & SYNC_WAIT) {
+		if (error == EAGAIN) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			/* backoff longer than in xfs_ifree_cluster */
+			delay(2);
+			goto restart;
+		}
 		xfs_iflock(ip);
 		goto reclaim;
 	}
@@ -806,7 +822,7 @@ xfs_reclaim_inode(
 	 * pass on the error.
 	 */
 	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+		xfs_warn(ip->i_mount,
 			"inode 0x%llx background reclaim flush failed with %d",
 			(long long)ip->i_ino, error);
 	}
@@ -994,7 +1010,13 @@ xfs_reclaim_inodes(
 }
 
 /*
- * Shrinker infrastructure.
+ * Inode cache shrinker.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
  */
 static int
 xfs_reclaim_inode_shrink(
@@ -1009,10 +1031,15 @@ xfs_reclaim_inode_shrink(
 
 	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
 	if (nr_to_scan) {
+		/* kick background reclaimer and push the AIL */
+		xfs_syncd_queue_reclaim(mp);
+		xfs_ail_push_all(mp->m_ail);
+
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 
-		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
+		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
+					&nr_to_scan);
 		/* terminate if we don't exhaust the scan */
 		if (nr_to_scan > 0)
 			return -1;
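
The patch above replaces the per-mount xfssyncd kthread with work items on a dedicated workqueue: a self-rearming delayed work for the periodic sync, a self-rearming delayed work for background inode reclaim, and a plain work item for ENOSPC data flushing. As a rough, hypothetical illustration of the core pattern only (not part of the patch; the demo_* names, the module wrapper, and the 3000 ms period are invented for this sketch), a self-rearming delayed work with synchronous cancellation on teardown looks like this:

/*
 * Hypothetical demo module, not part of the patch: the same self-rearming
 * delayed-work pattern used by xfs_sync_worker()/xfs_syncd_queue_sync(),
 * with the work item embedded in a context structure and recovered via
 * container_of(to_delayed_work(work), ...).
 */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *demo_wq;	/* plays the role of xfs_syncd_wq */

struct demo_ctx {
	struct delayed_work	sync_work;	/* plays the role of mp->m_sync_work */
};

static struct demo_ctx demo;

static void demo_worker(struct work_struct *work)
{
	struct demo_ctx *ctx = container_of(to_delayed_work(work),
					    struct demo_ctx, sync_work);

	/* ... periodic background work on ctx would go here ... */

	/* queue us up again, as xfs_sync_worker() does */
	queue_delayed_work(demo_wq, &ctx->sync_work, msecs_to_jiffies(3000));
}

static int __init demo_init(void)
{
	demo_wq = alloc_workqueue("demo", 0, 0);
	if (!demo_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&demo.sync_work, demo_worker);
	queue_delayed_work(demo_wq, &demo.sync_work, msecs_to_jiffies(3000));
	return 0;
}

static void __exit demo_exit(void)
{
	/*
	 * As in xfs_syncd_stop(): cancel the work and wait for any running
	 * instance before tearing down the workqueue.
	 */
	cancel_delayed_work_sync(&demo.sync_work);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

cancel_delayed_work_sync() cancels a pending instance and waits for a running one, and copes with work that requeues itself, which is why xfs_syncd_stop() above can simply cancel the three work items. The MS_ACTIVE check in xfs_syncd_queue_reclaim() additionally prevents reclaim work from being queued from other paths once unmount has begun.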