diff options
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_sync.c')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.c | 105 |
1 files changed, 76 insertions, 29 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 525260c7617f..a9f6d20aff41 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c | |||
@@ -270,8 +270,7 @@ xfs_sync_inode_attr( | |||
270 | goto out_unlock; | 270 | goto out_unlock; |
271 | } | 271 | } |
272 | 272 | ||
273 | error = xfs_iflush(ip, (flags & SYNC_WAIT) ? | 273 | error = xfs_iflush(ip, flags); |
274 | XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI); | ||
275 | 274 | ||
276 | out_unlock: | 275 | out_unlock: |
277 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 276 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
@@ -460,16 +459,18 @@ xfs_quiesce_fs( | |||
460 | { | 459 | { |
461 | int count = 0, pincount; | 460 | int count = 0, pincount; |
462 | 461 | ||
462 | xfs_reclaim_inodes(mp, 0); | ||
463 | xfs_flush_buftarg(mp->m_ddev_targp, 0); | 463 | xfs_flush_buftarg(mp->m_ddev_targp, 0); |
464 | xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); | ||
465 | 464 | ||
466 | /* | 465 | /* |
467 | * This loop must run at least twice. The first instance of the loop | 466 | * This loop must run at least twice. The first instance of the loop |
468 | * will flush most meta data but that will generate more meta data | 467 | * will flush most meta data but that will generate more meta data |
469 | * (typically directory updates). Which then must be flushed and | 468 | * (typically directory updates). Which then must be flushed and |
469 | * logged before we can write the unmount record. | 469 | * logged before we can write the unmount record. We also do sync |
470 | * reclaim of inodes to catch any that the above delwri flush skipped. | ||
471 | */ | 471 | */ |
472 | do { | 472 | do { |
473 | xfs_reclaim_inodes(mp, SYNC_WAIT); | ||
473 | xfs_sync_attr(mp, SYNC_WAIT); | 474 | xfs_sync_attr(mp, SYNC_WAIT); |
474 | pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); | 475 | pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); |
475 | if (!pincount) { | 476 | if (!pincount) { |
@@ -585,7 +586,7 @@ xfs_sync_worker( | |||
585 | 586 | ||
586 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { | 587 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { |
587 | xfs_log_force(mp, 0); | 588 | xfs_log_force(mp, 0); |
588 | xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); | 589 | xfs_reclaim_inodes(mp, 0); |
589 | /* dgc: errors ignored here */ | 590 | /* dgc: errors ignored here */ |
590 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); | 591 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); |
591 | error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); | 592 | error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); |
@@ -719,21 +720,42 @@ __xfs_inode_clear_reclaim_tag( | |||
719 | * shutdown EIO unpin and reclaim | 720 | * shutdown EIO unpin and reclaim |
720 | * clean, unpinned 0 reclaim | 721 | * clean, unpinned 0 reclaim |
721 | * stale, unpinned 0 reclaim | 722 | * stale, unpinned 0 reclaim |
722 | * clean, pinned(*) 0 unpin and reclaim | 723 | * clean, pinned(*) 0 requeue |
723 | * stale, pinned 0 unpin and reclaim | 724 | * stale, pinned EAGAIN requeue |
724 | * dirty, async 0 block on flush lock, reclaim | 725 | * dirty, delwri ok 0 requeue |
725 | * dirty, sync flush 0 block on flush lock, reclaim | 726 | * dirty, delwri blocked EAGAIN requeue |
727 | * dirty, sync flush 0 reclaim | ||
726 | * | 728 | * |
727 | * (*) dgc: I don't think the clean, pinned state is possible but it gets | 729 | * (*) dgc: I don't think the clean, pinned state is possible but it gets |
728 | * handled anyway given the order of checks implemented. | 730 | * handled anyway given the order of checks implemented. |
729 | * | 731 | * |
732 | * As can be seen from the table, the return value of xfs_iflush() is not | ||
733 | * sufficient to correctly decide the reclaim action here. The checks in | ||
734 | * xfs_iflush() might look like duplicates, but they are not. | ||
735 | * | ||
736 | * Also, because we get the flush lock first, we know that any inode that has | ||
737 | * been flushed delwri has had the flush completed by the time we check that | ||
738 | * the inode is clean. The clean inode check needs to be done before flushing | ||
739 | * the inode delwri otherwise we would loop forever requeuing clean inodes as | ||
740 | * we cannot tell apart a successful delwri flush and a clean inode from the | ||
741 | * return value of xfs_iflush(). | ||
742 | * | ||
743 | * Note that because the inode is flushed delayed write by background | ||
744 | * writeback, the flush lock may already be held here and waiting on it can | ||
745 | * result in very long latencies. Hence for sync reclaims, where we wait on the | ||
746 | * flush lock, the caller should push out delayed write inodes first before | ||
747 | * trying to reclaim them to minimise the amount of time spent waiting. For | ||
748 | * background reclaim, we just requeue the inode for the next pass. | ||
749 | * | ||
730 | * Hence the order of actions after gaining the locks should be: | 750 | * Hence the order of actions after gaining the locks should be: |
731 | * bad => reclaim | 751 | * bad => reclaim |
732 | * shutdown => unpin and reclaim | 752 | * shutdown => unpin and reclaim |
733 | * pinned => unpin | 753 | * pinned, delwri => requeue |
754 | * pinned, sync => unpin | ||
734 | * stale => reclaim | 755 | * stale => reclaim |
735 | * clean => reclaim | 756 | * clean => reclaim |
736 | * dirty => flush, wait and reclaim | 757 | * dirty, delwri => flush and requeue |
758 | * dirty, sync => flush, wait and reclaim | ||
737 | */ | 759 | */ |
738 | STATIC int | 760 | STATIC int |
739 | xfs_reclaim_inode( | 761 | xfs_reclaim_inode( |
@@ -741,7 +763,7 @@ xfs_reclaim_inode( | |||
741 | struct xfs_perag *pag, | 763 | struct xfs_perag *pag, |
742 | int sync_mode) | 764 | int sync_mode) |
743 | { | 765 | { |
744 | int error; | 766 | int error = 0; |
745 | 767 | ||
746 | /* | 768 | /* |
747 | * The radix tree lock here protects a thread in xfs_iget from racing | 769 | * The radix tree lock here protects a thread in xfs_iget from racing |
@@ -761,7 +783,11 @@ xfs_reclaim_inode( | |||
761 | write_unlock(&pag->pag_ici_lock); | 783 | write_unlock(&pag->pag_ici_lock); |
762 | 784 | ||
763 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 785 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
764 | xfs_iflock(ip); | 786 | if (!xfs_iflock_nowait(ip)) { |
787 | if (!(sync_mode & SYNC_WAIT)) | ||
788 | goto out; | ||
789 | xfs_iflock(ip); | ||
790 | } | ||
765 | 791 | ||
766 | if (is_bad_inode(VFS_I(ip))) | 792 | if (is_bad_inode(VFS_I(ip))) |
767 | goto reclaim; | 793 | goto reclaim; |
@@ -769,8 +795,13 @@ xfs_reclaim_inode( | |||
769 | xfs_iunpin_wait(ip); | 795 | xfs_iunpin_wait(ip); |
770 | goto reclaim; | 796 | goto reclaim; |
771 | } | 797 | } |
772 | if (xfs_ipincount(ip)) | 798 | if (xfs_ipincount(ip)) { |
799 | if (!(sync_mode & SYNC_WAIT)) { | ||
800 | xfs_ifunlock(ip); | ||
801 | goto out; | ||
802 | } | ||
773 | xfs_iunpin_wait(ip); | 803 | xfs_iunpin_wait(ip); |
804 | } | ||
774 | if (xfs_iflags_test(ip, XFS_ISTALE)) | 805 | if (xfs_iflags_test(ip, XFS_ISTALE)) |
775 | goto reclaim; | 806 | goto reclaim; |
776 | if (xfs_inode_clean(ip)) | 807 | if (xfs_inode_clean(ip)) |
@@ -778,27 +809,43 @@ xfs_reclaim_inode( | |||
778 | 809 | ||
779 | /* Now we have an inode that needs flushing */ | 810 | /* Now we have an inode that needs flushing */ |
780 | error = xfs_iflush(ip, sync_mode); | 811 | error = xfs_iflush(ip, sync_mode); |
781 | if (!error) { | 812 | if (sync_mode & SYNC_WAIT) { |
782 | switch(sync_mode) { | 813 | xfs_iflock(ip); |
783 | case XFS_IFLUSH_DELWRI_ELSE_ASYNC: | 814 | goto reclaim; |
784 | case XFS_IFLUSH_DELWRI: | ||
785 | case XFS_IFLUSH_ASYNC: | ||
786 | case XFS_IFLUSH_DELWRI_ELSE_SYNC: | ||
787 | case XFS_IFLUSH_SYNC: | ||
788 | /* IO issued, synchronise with IO completion */ | ||
789 | xfs_iflock(ip); | ||
790 | break; | ||
791 | default: | ||
792 | ASSERT(0); | ||
793 | break; | ||
794 | } | ||
795 | } | 815 | } |
796 | 816 | ||
817 | /* | ||
818 | * When we have to flush an inode but don't have SYNC_WAIT set, we | ||
819 | * flush the inode out using a delwri buffer and wait for the next | ||
820 | * call into reclaim to find it in a clean state instead of waiting for | ||
821 | * it now. We also don't return errors here - if the error is transient | ||
822 | * then the next reclaim pass will flush the inode, and if the error | ||
823 | * is permanent then the next sync reclaim will reclaim the inode and | ||
824 | * pass on the error. | ||
825 | */ | ||
826 | if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { | ||
827 | xfs_fs_cmn_err(CE_WARN, ip->i_mount, | ||
828 | "inode 0x%llx background reclaim flush failed with %d", | ||
829 | (long long)ip->i_ino, error); | ||
830 | } | ||
831 | out: | ||
832 | xfs_iflags_clear(ip, XFS_IRECLAIM); | ||
833 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
834 | /* | ||
835 | * We could return EAGAIN here to make reclaim rescan the inode tree in | ||
836 | * a short while. However, this just burns CPU time scanning the tree | ||
837 | * waiting for IO to complete and xfssyncd never goes back to the idle | ||
838 | * state. Instead, return 0 to let the next scheduled background reclaim | ||
839 | * attempt to reclaim the inode again. | ||
840 | */ | ||
841 | return 0; | ||
842 | |||
797 | reclaim: | 843 | reclaim: |
798 | xfs_ifunlock(ip); | 844 | xfs_ifunlock(ip); |
799 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 845 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
800 | xfs_ireclaim(ip); | 846 | xfs_ireclaim(ip); |
801 | return 0; | 847 | return error; |
848 | |||
802 | } | 849 | } |
803 | 850 | ||
804 | int | 851 | int |