aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c105
-rw-r--r--fs/xfs/xfs_inode.c75
-rw-r--r--fs/xfs/xfs_inode.h10
-rw-r--r--fs/xfs/xfs_inode_item.c10
-rw-r--r--fs/xfs/xfs_mount.c13
6 files changed, 102 insertions, 115 deletions
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 6ce828e0e17b..3b5b46b8e3b9 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1064,7 +1064,7 @@ xfs_fs_write_inode(
1064 xfs_ilock(ip, XFS_ILOCK_SHARED); 1064 xfs_ilock(ip, XFS_ILOCK_SHARED);
1065 xfs_iflock(ip); 1065 xfs_iflock(ip);
1066 1066
1067 error = xfs_iflush(ip, XFS_IFLUSH_SYNC); 1067 error = xfs_iflush(ip, SYNC_WAIT);
1068 } else { 1068 } else {
1069 error = EAGAIN; 1069 error = EAGAIN;
1070 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) 1070 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
@@ -1072,7 +1072,7 @@ xfs_fs_write_inode(
1072 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) 1072 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1073 goto out_unlock; 1073 goto out_unlock;
1074 1074
1075 error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK); 1075 error = xfs_iflush(ip, 0);
1076 } 1076 }
1077 1077
1078 out_unlock: 1078 out_unlock:
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 525260c7617f..a9f6d20aff41 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -270,8 +270,7 @@ xfs_sync_inode_attr(
270 goto out_unlock; 270 goto out_unlock;
271 } 271 }
272 272
273 error = xfs_iflush(ip, (flags & SYNC_WAIT) ? 273 error = xfs_iflush(ip, flags);
274 XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
275 274
276 out_unlock: 275 out_unlock:
277 xfs_iunlock(ip, XFS_ILOCK_SHARED); 276 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -460,16 +459,18 @@ xfs_quiesce_fs(
460{ 459{
461 int count = 0, pincount; 460 int count = 0, pincount;
462 461
462 xfs_reclaim_inodes(mp, 0);
463 xfs_flush_buftarg(mp->m_ddev_targp, 0); 463 xfs_flush_buftarg(mp->m_ddev_targp, 0);
464 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
465 464
466 /* 465 /*
467 * This loop must run at least twice. The first instance of the loop 466 * This loop must run at least twice. The first instance of the loop
468 * will flush most meta data but that will generate more meta data 467 * will flush most meta data but that will generate more meta data
469 * (typically directory updates). Which then must be flushed and 468 * (typically directory updates). Which then must be flushed and
470 * logged before we can write the unmount record. 469 * logged before we can write the unmount record. We also do sync
470 * reclaim of inodes to catch any that the above delwri flush skipped.
471 */ 471 */
472 do { 472 do {
473 xfs_reclaim_inodes(mp, SYNC_WAIT);
473 xfs_sync_attr(mp, SYNC_WAIT); 474 xfs_sync_attr(mp, SYNC_WAIT);
474 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); 475 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
475 if (!pincount) { 476 if (!pincount) {
@@ -585,7 +586,7 @@ xfs_sync_worker(
585 586
586 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 587 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
587 xfs_log_force(mp, 0); 588 xfs_log_force(mp, 0);
588 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); 589 xfs_reclaim_inodes(mp, 0);
589 /* dgc: errors ignored here */ 590 /* dgc: errors ignored here */
590 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 591 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
591 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 592 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -719,21 +720,42 @@ __xfs_inode_clear_reclaim_tag(
719 * shutdown EIO unpin and reclaim 720 * shutdown EIO unpin and reclaim
720 * clean, unpinned 0 reclaim 721 * clean, unpinned 0 reclaim
721 * stale, unpinned 0 reclaim 722 * stale, unpinned 0 reclaim
722 * clean, pinned(*) 0 unpin and reclaim 723 * clean, pinned(*) 0 requeue
723 * stale, pinned 0 unpin and reclaim 724 * stale, pinned EAGAIN requeue
724 * dirty, async 0 block on flush lock, reclaim 725 * dirty, delwri ok 0 requeue
725 * dirty, sync flush 0 block on flush lock, reclaim 726 * dirty, delwri blocked EAGAIN requeue
727 * dirty, sync flush 0 reclaim
726 * 728 *
727 * (*) dgc: I don't think the clean, pinned state is possible but it gets 729 * (*) dgc: I don't think the clean, pinned state is possible but it gets
728 * handled anyway given the order of checks implemented. 730 * handled anyway given the order of checks implemented.
729 * 731 *
732 * As can be seen from the table, the return value of xfs_iflush() is not
733 * sufficient to correctly decide the reclaim action here. The checks in
734 * xfs_iflush() might look like duplicates, but they are not.
735 *
736 * Also, because we get the flush lock first, we know that any inode that has
737 * been flushed delwri has had the flush completed by the time we check that
738 * the inode is clean. The clean inode check needs to be done before flushing
739 * the inode delwri otherwise we would loop forever requeuing clean inodes as
740 * we cannot tell apart a successful delwri flush and a clean inode from the
741 * return value of xfs_iflush().
742 *
743 * Note that because the inode is flushed delayed write by background
744 * writeback, the flush lock may already be held here and waiting on it can
745 * result in very long latencies. Hence for sync reclaims, where we wait on the
746 * flush lock, the caller should push out delayed write inodes first before
747 * trying to reclaim them to minimise the amount of time spent waiting. For
748 * background reclaim, we just requeue the inode for the next pass.
749 *
730 * Hence the order of actions after gaining the locks should be: 750 * Hence the order of actions after gaining the locks should be:
731 * bad => reclaim 751 * bad => reclaim
732 * shutdown => unpin and reclaim 752 * shutdown => unpin and reclaim
733 * pinned => unpin 753 * pinned, delwri => requeue
754 * pinned, sync => unpin
734 * stale => reclaim 755 * stale => reclaim
735 * clean => reclaim 756 * clean => reclaim
736 * dirty => flush, wait and reclaim 757 * dirty, delwri => flush and requeue
758 * dirty, sync => flush, wait and reclaim
737 */ 759 */
738STATIC int 760STATIC int
739xfs_reclaim_inode( 761xfs_reclaim_inode(
@@ -741,7 +763,7 @@ xfs_reclaim_inode(
741 struct xfs_perag *pag, 763 struct xfs_perag *pag,
742 int sync_mode) 764 int sync_mode)
743{ 765{
744 int error; 766 int error = 0;
745 767
746 /* 768 /*
747 * The radix tree lock here protects a thread in xfs_iget from racing 769 * The radix tree lock here protects a thread in xfs_iget from racing
@@ -761,7 +783,11 @@ xfs_reclaim_inode(
761 write_unlock(&pag->pag_ici_lock); 783 write_unlock(&pag->pag_ici_lock);
762 784
763 xfs_ilock(ip, XFS_ILOCK_EXCL); 785 xfs_ilock(ip, XFS_ILOCK_EXCL);
764 xfs_iflock(ip); 786 if (!xfs_iflock_nowait(ip)) {
787 if (!(sync_mode & SYNC_WAIT))
788 goto out;
789 xfs_iflock(ip);
790 }
765 791
766 if (is_bad_inode(VFS_I(ip))) 792 if (is_bad_inode(VFS_I(ip)))
767 goto reclaim; 793 goto reclaim;
@@ -769,8 +795,13 @@ xfs_reclaim_inode(
769 xfs_iunpin_wait(ip); 795 xfs_iunpin_wait(ip);
770 goto reclaim; 796 goto reclaim;
771 } 797 }
772 if (xfs_ipincount(ip)) 798 if (xfs_ipincount(ip)) {
799 if (!(sync_mode & SYNC_WAIT)) {
800 xfs_ifunlock(ip);
801 goto out;
802 }
773 xfs_iunpin_wait(ip); 803 xfs_iunpin_wait(ip);
804 }
774 if (xfs_iflags_test(ip, XFS_ISTALE)) 805 if (xfs_iflags_test(ip, XFS_ISTALE))
775 goto reclaim; 806 goto reclaim;
776 if (xfs_inode_clean(ip)) 807 if (xfs_inode_clean(ip))
@@ -778,27 +809,43 @@ xfs_reclaim_inode(
778 809
779 /* Now we have an inode that needs flushing */ 810 /* Now we have an inode that needs flushing */
780 error = xfs_iflush(ip, sync_mode); 811 error = xfs_iflush(ip, sync_mode);
781 if (!error) { 812 if (sync_mode & SYNC_WAIT) {
782 switch(sync_mode) { 813 xfs_iflock(ip);
783 case XFS_IFLUSH_DELWRI_ELSE_ASYNC: 814 goto reclaim;
784 case XFS_IFLUSH_DELWRI:
785 case XFS_IFLUSH_ASYNC:
786 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
787 case XFS_IFLUSH_SYNC:
788 /* IO issued, synchronise with IO completion */
789 xfs_iflock(ip);
790 break;
791 default:
792 ASSERT(0);
793 break;
794 }
795 } 815 }
796 816
817 /*
818 * When we have to flush an inode but don't have SYNC_WAIT set, we
819 * flush the inode out using a delwri buffer and wait for the next
820 * call into reclaim to find it in a clean state instead of waiting for
821 * it now. We also don't return errors here - if the error is transient
822 * then the next reclaim pass will flush the inode, and if the error
823 * is permanent then the next sync reclaim will reclaim the inode and
824 * pass on the error.
825 */
826 if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
827 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
828 "inode 0x%llx background reclaim flush failed with %d",
829 (long long)ip->i_ino, error);
830 }
831out:
832 xfs_iflags_clear(ip, XFS_IRECLAIM);
833 xfs_iunlock(ip, XFS_ILOCK_EXCL);
834 /*
835 * We could return EAGAIN here to make reclaim rescan the inode tree in
836 * a short while. However, this just burns CPU time scanning the tree
837 * waiting for IO to complete and xfssyncd never goes back to the idle
838 * state. Instead, return 0 to let the next scheduled background reclaim
839 * attempt to reclaim the inode again.
840 */
841 return 0;
842
797reclaim: 843reclaim:
798 xfs_ifunlock(ip); 844 xfs_ifunlock(ip);
799 xfs_iunlock(ip, XFS_ILOCK_EXCL); 845 xfs_iunlock(ip, XFS_ILOCK_EXCL);
800 xfs_ireclaim(ip); 846 xfs_ireclaim(ip);
801 return 0; 847 return error;
848
802} 849}
803 850
804int 851int
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8d0666dd170a..fa31360046d4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,8 +2835,6 @@ xfs_iflush(
2835 xfs_dinode_t *dip; 2835 xfs_dinode_t *dip;
2836 xfs_mount_t *mp; 2836 xfs_mount_t *mp;
2837 int error; 2837 int error;
2838 int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
2839 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
2840 2838
2841 XFS_STATS_INC(xs_iflush_count); 2839 XFS_STATS_INC(xs_iflush_count);
2842 2840
@@ -2859,7 +2857,7 @@ xfs_iflush(
2859 * in the same cluster are dirty, they will probably write the inode 2857 * in the same cluster are dirty, they will probably write the inode
2860 * out for us if they occur after the log force completes. 2858 * out for us if they occur after the log force completes.
2861 */ 2859 */
2862 if (noblock && xfs_ipincount(ip)) { 2860 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2863 xfs_iunpin_nowait(ip); 2861 xfs_iunpin_nowait(ip);
2864 xfs_ifunlock(ip); 2862 xfs_ifunlock(ip);
2865 return EAGAIN; 2863 return EAGAIN;
@@ -2893,60 +2891,10 @@ xfs_iflush(
2893 } 2891 }
2894 2892
2895 /* 2893 /*
2896 * Decide how buffer will be flushed out. This is done before
2897 * the call to xfs_iflush_int because this field is zeroed by it.
2898 */
2899 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
2900 /*
2901 * Flush out the inode buffer according to the directions
2902 * of the caller. In the cases where the caller has given
2903 * us a choice choose the non-delwri case. This is because
2904 * the inode is in the AIL and we need to get it out soon.
2905 */
2906 switch (flags) {
2907 case XFS_IFLUSH_SYNC:
2908 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2909 flags = 0;
2910 break;
2911 case XFS_IFLUSH_ASYNC_NOBLOCK:
2912 case XFS_IFLUSH_ASYNC:
2913 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2914 flags = INT_ASYNC;
2915 break;
2916 case XFS_IFLUSH_DELWRI:
2917 flags = INT_DELWRI;
2918 break;
2919 default:
2920 ASSERT(0);
2921 flags = 0;
2922 break;
2923 }
2924 } else {
2925 switch (flags) {
2926 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2927 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2928 case XFS_IFLUSH_DELWRI:
2929 flags = INT_DELWRI;
2930 break;
2931 case XFS_IFLUSH_ASYNC_NOBLOCK:
2932 case XFS_IFLUSH_ASYNC:
2933 flags = INT_ASYNC;
2934 break;
2935 case XFS_IFLUSH_SYNC:
2936 flags = 0;
2937 break;
2938 default:
2939 ASSERT(0);
2940 flags = 0;
2941 break;
2942 }
2943 }
2944
2945 /*
2946 * Get the buffer containing the on-disk inode. 2894 * Get the buffer containing the on-disk inode.
2947 */ 2895 */
2948 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2896 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2949 noblock ? XBF_TRYLOCK : XBF_LOCK); 2897 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
2950 if (error || !bp) { 2898 if (error || !bp) {
2951 xfs_ifunlock(ip); 2899 xfs_ifunlock(ip);
2952 return error; 2900 return error;
@@ -2974,13 +2922,10 @@ xfs_iflush(
2974 if (error) 2922 if (error)
2975 goto cluster_corrupt_out; 2923 goto cluster_corrupt_out;
2976 2924
2977 if (flags & INT_DELWRI) { 2925 if (flags & SYNC_WAIT)
2978 xfs_bdwrite(mp, bp);
2979 } else if (flags & INT_ASYNC) {
2980 error = xfs_bawrite(mp, bp);
2981 } else {
2982 error = xfs_bwrite(mp, bp); 2926 error = xfs_bwrite(mp, bp);
2983 } 2927 else
2928 xfs_bdwrite(mp, bp);
2984 return error; 2929 return error;
2985 2930
2986corrupt_out: 2931corrupt_out:
@@ -3015,16 +2960,6 @@ xfs_iflush_int(
3015 iip = ip->i_itemp; 2960 iip = ip->i_itemp;
3016 mp = ip->i_mount; 2961 mp = ip->i_mount;
3017 2962
3018
3019 /*
3020 * If the inode isn't dirty, then just release the inode
3021 * flush lock and do nothing.
3022 */
3023 if (xfs_inode_clean(ip)) {
3024 xfs_ifunlock(ip);
3025 return 0;
3026 }
3027
3028 /* set *dip = inode's place in the buffer */ 2963 /* set *dip = inode's place in the buffer */
3029 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2964 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3030 2965
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 8b618ea4d692..6c912b027596 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -420,16 +420,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
421 421
422/* 422/*
423 * Flags for xfs_iflush()
424 */
425#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1
426#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2
427#define XFS_IFLUSH_SYNC 3
428#define XFS_IFLUSH_ASYNC 4
429#define XFS_IFLUSH_DELWRI 5
430#define XFS_IFLUSH_ASYNC_NOBLOCK 6
431
432/*
433 * Flags for xfs_itruncate_start(). 423 * Flags for xfs_itruncate_start().
434 */ 424 */
435#define XFS_ITRUNC_DEFINITE 0x1 425#define XFS_ITRUNC_DEFINITE 0x1
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 48ec1c0b23ce..207553e82954 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -866,10 +866,14 @@ xfs_inode_item_push(
866 iip->ili_format.ilf_fields != 0); 866 iip->ili_format.ilf_fields != 0);
867 867
868 /* 868 /*
869 * Write out the inode. The completion routine ('iflush_done') will 869 * Push the inode to its backing buffer. This will not remove the
870 * pull it from the AIL, mark it clean, unlock the flush lock. 870 * inode from the AIL - a further push will be required to trigger a
871 * buffer push. However, this allows all the dirty inodes to be pushed
872 * to the buffer before it is pushed to disk. The buffer IO completion
873 * will pull the inode from the AIL, mark it clean and unlock the flush
874 * lock.
871 */ 875 */
872 (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); 876 (void) xfs_iflush(ip, 0);
873 xfs_iunlock(ip, XFS_ILOCK_SHARED); 877 xfs_iunlock(ip, XFS_ILOCK_SHARED);
874 878
875 return; 879 return;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5061149b2cc4..6afaaeb2950a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1468,7 +1468,18 @@ xfs_unmountfs(
1468 * need to force the log first. 1468 * need to force the log first.
1469 */ 1469 */
1470 xfs_log_force(mp, XFS_LOG_SYNC); 1470 xfs_log_force(mp, XFS_LOG_SYNC);
1471 xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC); 1471
1472 /*
1473 * Do a delwri reclaim pass first so that as many dirty inodes are
1474 * queued up for IO as possible. Then flush the buffers before making
1475 * a synchronous pass to ensure all the remaining inodes are reclaimed.
1476 * This makes the reclaim process as quick as possible by avoiding
1477 * synchronous writeout and blocking on inodes already in the delwri
1478 * state as much as possible.
1479 */
1480 xfs_reclaim_inodes(mp, 0);
1481 XFS_bflush(mp->m_ddev_targp);
1482 xfs_reclaim_inodes(mp, SYNC_WAIT);
1472 1483
1473 xfs_qm_unmount(mp); 1484 xfs_qm_unmount(mp);
1474 1485