author    Dave Chinner <david@fromorbit.com>    2010-02-05 20:39:36 -0500
committer Dave Chinner <david@fromorbit.com>    2010-02-05 20:39:36 -0500
commit    c854363e80b49dd04a4de18ebc379eb8c8806674 (patch)
tree      8c8d0dec26d961631a3cd8b6c402b5d1444336e5
parent    777df5afdb26c71634edd60582be620ff94e87a0 (diff)
xfs: Use delayed write for inodes rather than async V2
We currently do background inode flush asynchronously, resulting in
inodes being written in whatever order the background writeback issues
them. Not only that, there are also blocking and non-blocking
asynchronous inode flushes, depending on where the flush comes from.

This patch completely removes asynchronous inode writeback. It removes
all the strange writeback modes and replaces them with either a
synchronous flush or a non-blocking delayed write flush. That is, inode
flushes will only issue IO directly if they are synchronous, and
background flushing may do nothing if the operation would block (e.g.
on a pinned inode or buffer lock).

Delayed write flushes will now result in the inode buffer sitting in
the delwri queue of the buffer cache to be flushed by either an AIL
push or by the xfsbufd timing out the buffer. This will allow
accumulation of dirty inode buffers in memory and allow optimisation
of inode cluster writeback at the xfsbufd level where we have much
greater queue depths than the block layer elevators. We will also get
adjacent inode cluster buffer IO merging for free when a later patch
in the series allows sorting of the delayed write buffers before
dispatch.

This effectively means that any inode that is written back by
background writeback will be seen as flush locked during AIL pushing,
and will result in the buffers being pushed from there. This writeback
path is currently non-optimal, but the next patch in the series will
fix that problem.

A side effect of this delayed write mechanism is that background inode
reclaim will no longer directly flush inodes, nor can it wait on the
flush lock. The result is that inode reclaim must leave the inode in
the reclaimable state until it is clean. Hence attempts to reclaim a
dirty inode in the background will simply skip the inode until it is
clean and this allows other mechanisms (i.e. xfsbufd) to do more
optimal writeback of the dirty buffers. As a result, the inode reclaim
code has been rewritten so that it no longer relies on the ambiguous
return values of xfs_iflush() to determine whether it is safe to
reclaim an inode.

Portions of this patch are derived from patches by Christoph Hellwig.

Version 2:
- cleanup reclaim code as suggested by Christoph
- log background reclaim inode flush errors
- just pass sync flags to xfs_iflush

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
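In short, the six XFS_IFLUSH_* modes collapse into a two-value
interface. A minimal sketch of the caller-facing contract after this
patch (condensed from the hunks below; not a new API, just the two
surviving call shapes):

	/* Synchronous flush: take the inode cluster buffer lock, issue
	 * the write immediately and return the IO error. */
	error = xfs_iflush(ip, SYNC_WAIT);

	/* Non-blocking delayed write flush: queue the inode buffer on
	 * the delwri queue for xfsbufd/AIL pushing; returns EAGAIN
	 * rather than blocking on a pinned inode or a contended buffer
	 * lock. */
	error = xfs_iflush(ip, 0);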
-rw-r--r--	fs/xfs/linux-2.6/xfs_super.c	  4
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.c	105
-rw-r--r--	fs/xfs/xfs_inode.c		 75
-rw-r--r--	fs/xfs/xfs_inode.h		 10
-rw-r--r--	fs/xfs/xfs_inode_item.c		 10
-rw-r--r--	fs/xfs/xfs_mount.c		 13
6 files changed, 102 insertions, 115 deletions
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 6ce828e0e17b..3b5b46b8e3b9 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1064,7 +1064,7 @@ xfs_fs_write_inode(
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
 		xfs_iflock(ip);
 
-		error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+		error = xfs_iflush(ip, SYNC_WAIT);
 	} else {
 		error = EAGAIN;
 		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
@@ -1072,7 +1072,7 @@ xfs_fs_write_inode(
 		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
 			goto out_unlock;
 
-		error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+		error = xfs_iflush(ip, 0);
 	}
 
  out_unlock:
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 525260c7617f..a9f6d20aff41 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -270,8 +270,7 @@ xfs_sync_inode_attr(
 		goto out_unlock;
 	}
 
-	error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
-			   XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
+	error = xfs_iflush(ip, flags);
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -460,16 +459,18 @@ xfs_quiesce_fs(
 {
 	int	count = 0, pincount;
 
+	xfs_reclaim_inodes(mp, 0);
 	xfs_flush_buftarg(mp->m_ddev_targp, 0);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 
 	/*
 	 * This loop must run at least twice. The first instance of the loop
 	 * will flush most meta data but that will generate more meta data
 	 * (typically directory updates). Which then must be flushed and
-	 * logged before we can write the unmount record.
+	 * logged before we can write the unmount record. We also do sync
+	 * reclaim of inodes to catch any that the above delwri flush skipped.
 	 */
 	do {
+		xfs_reclaim_inodes(mp, SYNC_WAIT);
 		xfs_sync_attr(mp, SYNC_WAIT);
 		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
 		if (!pincount) {
@@ -585,7 +586,7 @@ xfs_sync_worker(
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
 		xfs_log_force(mp, 0);
-		xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+		xfs_reclaim_inodes(mp, 0);
 		/* dgc: errors ignored here */
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
 		error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -719,21 +720,42 @@ __xfs_inode_clear_reclaim_tag(
  *	shutdown		EIO		unpin and reclaim
  *	clean, unpinned		0		reclaim
  *	stale, unpinned		0		reclaim
- *	clean, pinned(*)	0		unpin and reclaim
- *	stale, pinned		0		unpin and reclaim
- *	dirty, async		0		block on flush lock, reclaim
- *	dirty, sync flush	0		block on flush lock, reclaim
+ *	clean, pinned(*)	0		requeue
+ *	stale, pinned		EAGAIN		requeue
+ *	dirty, delwri ok	0		requeue
+ *	dirty, delwri blocked	EAGAIN		requeue
+ *	dirty, sync flush	0		reclaim
  *
  * (*) dgc: I don't think the clean, pinned state is possible but it gets
  * handled anyway given the order of checks implemented.
  *
+ * As can be seen from the table, the return value of xfs_iflush() is not
+ * sufficient to correctly decide the reclaim action here. The checks in
+ * xfs_iflush() might look like duplicates, but they are not.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean. The clean inode check needs to be done before flushing
+ * the inode delwri otherwise we would loop forever requeuing clean inodes as
+ * we cannot tell apart a successful delwri flush and a clean inode from the
+ * return value of xfs_iflush().
+ *
+ * Note that because the inode is flushed delayed write by background
+ * writeback, the flush lock may already be held here and waiting on it can
+ * result in very long latencies. Hence for sync reclaims, where we wait on the
+ * flush lock, the caller should push out delayed write inodes first before
+ * trying to reclaim them to minimise the amount of time spent waiting. For
+ * background reclaim, we just requeue the inode for the next pass.
+ *
  * Hence the order of actions after gaining the locks should be:
  *	bad		=> reclaim
  *	shutdown	=> unpin and reclaim
- *	pinned		=> unpin
+ *	pinned, delwri	=> requeue
+ *	pinned, sync	=> unpin
  *	stale		=> reclaim
  *	clean		=> reclaim
- *	dirty		=> flush, wait and reclaim
+ *	dirty, delwri	=> flush and requeue
+ *	dirty, sync	=> flush, wait and reclaim
  */
 STATIC int
 xfs_reclaim_inode(
@@ -741,7 +763,7 @@ xfs_reclaim_inode(
 	struct xfs_perag	*pag,
 	int			sync_mode)
 {
-	int	error;
+	int	error = 0;
 
 	/*
 	 * The radix tree lock here protects a thread in xfs_iget from racing
@@ -761,7 +783,11 @@
 	write_unlock(&pag->pag_ici_lock);
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_iflock(ip);
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out;
+		xfs_iflock(ip);
+	}
 
 	if (is_bad_inode(VFS_I(ip)))
 		goto reclaim;
@@ -769,8 +795,13 @@
 		xfs_iunpin_wait(ip);
 		goto reclaim;
 	}
-	if (xfs_ipincount(ip))
+	if (xfs_ipincount(ip)) {
+		if (!(sync_mode & SYNC_WAIT)) {
+			xfs_ifunlock(ip);
+			goto out;
+		}
 		xfs_iunpin_wait(ip);
+	}
 	if (xfs_iflags_test(ip, XFS_ISTALE))
 		goto reclaim;
 	if (xfs_inode_clean(ip))
@@ -778,27 +809,43 @@ xfs_reclaim_inode(
 
 	/* Now we have an inode that needs flushing */
 	error = xfs_iflush(ip, sync_mode);
-	if (!error) {
-		switch(sync_mode) {
-		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-		case XFS_IFLUSH_DELWRI:
-		case XFS_IFLUSH_ASYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-		case XFS_IFLUSH_SYNC:
-			/* IO issued, synchronise with IO completion */
-			xfs_iflock(ip);
-			break;
-		default:
-			ASSERT(0);
-			break;
-		}
+	if (sync_mode & SYNC_WAIT) {
+		xfs_iflock(ip);
+		goto reclaim;
 	}
 
+	/*
+	 * When we have to flush an inode but don't have SYNC_WAIT set, we
+	 * flush the inode out using a delwri buffer and wait for the next
+	 * call into reclaim to find it in a clean state instead of waiting for
+	 * it now. We also don't return errors here - if the error is transient
+	 * then the next reclaim pass will flush the inode, and if the error
+	 * is permanent then the next sync reclaim will reclaim the inode and
+	 * pass on the error.
+	 */
+	if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"inode 0x%llx background reclaim flush failed with %d",
+			(long long)ip->i_ino, error);
+	}
+out:
+	xfs_iflags_clear(ip, XFS_IRECLAIM);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * a short while. However, this just burns CPU time scanning the tree
+	 * waiting for IO to complete and xfssyncd never goes back to the idle
+	 * state. Instead, return 0 to let the next scheduled background reclaim
+	 * attempt to reclaim the inode again.
+	 */
+	return 0;
+
 reclaim:
 	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	xfs_ireclaim(ip);
-	return 0;
+	return error;
+
 }
 
 int
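The blocking/non-blocking split above gives rise to a two-pass reclaim
pattern, used by xfs_quiesce_fs() here and by xfs_unmountfs() in the
xfs_mount.c hunk further down: queue as many dirty inodes as possible
for delayed write, flush the buffer delwri queue, then make a
synchronous pass to catch the stragglers. A condensed sketch of the
quiesce variant (in quiesce the blocking pass actually sits inside the
metadata-flush loop, and the unmount path uses XFS_bflush() for the
middle step):

	xfs_reclaim_inodes(mp, 0);		/* pass 1: delwri flush,
						 * skip anything that
						 * would block */
	xfs_flush_buftarg(mp->m_ddev_targp, 0);	/* push delwri buffers */
	xfs_reclaim_inodes(mp, SYNC_WAIT);	/* pass 2: block on and
						 * reclaim the rest */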
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8d0666dd170a..fa31360046d4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,8 +2835,6 @@ xfs_iflush(
 	xfs_dinode_t	*dip;
 	xfs_mount_t	*mp;
 	int		error;
-	int		noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
-	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
 
 	XFS_STATS_INC(xs_iflush_count);
 
@@ -2859,7 +2857,7 @@ xfs_iflush(
 	 * in the same cluster are dirty, they will probably write the inode
 	 * out for us if they occur after the log force completes.
 	 */
-	if (noblock && xfs_ipincount(ip)) {
+	if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
 		xfs_iunpin_nowait(ip);
 		xfs_ifunlock(ip);
 		return EAGAIN;
@@ -2893,60 +2891,10 @@ xfs_iflush(
 	}
 
 	/*
-	 * Decide how buffer will be flushed out. This is done before
-	 * the call to xfs_iflush_int because this field is zeroed by it.
-	 */
-	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
-		/*
-		 * Flush out the inode buffer according to the directions
-		 * of the caller. In the cases where the caller has given
-		 * us a choice choose the non-delwri case. This is because
-		 * the inode is in the AIL and we need to get it out soon.
-		 */
-		switch (flags) {
-		case XFS_IFLUSH_SYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-			flags = 0;
-			break;
-		case XFS_IFLUSH_ASYNC_NOBLOCK:
-		case XFS_IFLUSH_ASYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-			flags = INT_ASYNC;
-			break;
-		case XFS_IFLUSH_DELWRI:
-			flags = INT_DELWRI;
-			break;
-		default:
-			ASSERT(0);
-			flags = 0;
-			break;
-		}
-	} else {
-		switch (flags) {
-		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-		case XFS_IFLUSH_DELWRI:
-			flags = INT_DELWRI;
-			break;
-		case XFS_IFLUSH_ASYNC_NOBLOCK:
-		case XFS_IFLUSH_ASYNC:
-			flags = INT_ASYNC;
-			break;
-		case XFS_IFLUSH_SYNC:
-			flags = 0;
-			break;
-		default:
-			ASSERT(0);
-			flags = 0;
-			break;
-		}
-	}
-
-	/*
 	 * Get the buffer containing the on-disk inode.
 	 */
 	error = xfs_itobp(mp, NULL, ip, &dip, &bp,
-			noblock ? XBF_TRYLOCK : XBF_LOCK);
+				(flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
 	if (error || !bp) {
 		xfs_ifunlock(ip);
 		return error;
@@ -2974,13 +2922,10 @@ xfs_iflush(
 	if (error)
 		goto cluster_corrupt_out;
 
-	if (flags & INT_DELWRI) {
-		xfs_bdwrite(mp, bp);
-	} else if (flags & INT_ASYNC) {
-		error = xfs_bawrite(mp, bp);
-	} else {
+	if (flags & SYNC_WAIT)
 		error = xfs_bwrite(mp, bp);
-	}
+	else
+		xfs_bdwrite(mp, bp);
 	return error;
 
 corrupt_out:
@@ -3015,16 +2960,6 @@ xfs_iflush_int(
 	iip = ip->i_itemp;
 	mp = ip->i_mount;
 
-
-	/*
-	 * If the inode isn't dirty, then just release the inode
-	 * flush lock and do nothing.
-	 */
-	if (xfs_inode_clean(ip)) {
-		xfs_ifunlock(ip);
-		return 0;
-	}
-
 	/* set *dip = inode's place in the buffer */
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
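For reference, the call-site conversions in this patch imply the
following mapping from the retired modes onto the new interface. A
hypothetical compat shim, for illustration only - it is not part of
the patch, the XFS_IFLUSH_* macros themselves are deleted in the
xfs_inode.h hunk below, and XFS_IFLUSH_DELWRI_ELSE_SYNC has no caller
left in this diff to convert:

	/* Illustration only: how the retired flush modes line up with
	 * the new two-value interface, inferred from the conversions
	 * in this patch. */
	static inline int
	xfs_iflush_compat_flags(int old_mode)
	{
		switch (old_mode) {
		case XFS_IFLUSH_SYNC:
			/* the only blocking mode: issue IO and wait */
			return SYNC_WAIT;
		default:
			/* async/delwri variants all become
			 * non-blocking delayed writes */
			return 0;
		}
	}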
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 8b618ea4d692..6c912b027596 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -420,16 +420,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 #define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
 
 /*
- * Flags for xfs_iflush()
- */
-#define	XFS_IFLUSH_DELWRI_ELSE_SYNC	1
-#define	XFS_IFLUSH_DELWRI_ELSE_ASYNC	2
-#define	XFS_IFLUSH_SYNC			3
-#define	XFS_IFLUSH_ASYNC		4
-#define	XFS_IFLUSH_DELWRI		5
-#define	XFS_IFLUSH_ASYNC_NOBLOCK	6
-
-/*
  * Flags for xfs_itruncate_start().
  */
 #define	XFS_ITRUNC_DEFINITE	0x1
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 48ec1c0b23ce..207553e82954 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -866,10 +866,14 @@ xfs_inode_item_push(
 	       iip->ili_format.ilf_fields != 0);
 
 	/*
-	 * Write out the inode. The completion routine ('iflush_done') will
-	 * pull it from the AIL, mark it clean, unlock the flush lock.
+	 * Push the inode to its backing buffer. This will not remove the
+	 * inode from the AIL - a further push will be required to trigger a
+	 * buffer push. However, this allows all the dirty inodes to be pushed
+	 * to the buffer before it is pushed to disk. The buffer IO completion
+	 * will pull the inode from the AIL, mark it clean and unlock the flush
+	 * lock.
 	 */
-	(void) xfs_iflush(ip, XFS_IFLUSH_ASYNC);
+	(void) xfs_iflush(ip, 0);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	return;
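The two-stage writeback this comment describes can be summarised as
the following flow (a simplified sketch; xfs_iflush_done() is the
completion routine named in the old comment):

	/*
	 * AIL push #1: xfs_inode_item_push() -> xfs_iflush(ip, 0)
	 *	The inode is copied into its cluster buffer and the
	 *	buffer is queued delwri; the inode stays flush locked
	 *	in the AIL.
	 * AIL push #2 (or xfsbufd timeout):
	 *	The buffer itself is pushed and its IO is issued.
	 * Buffer IO completion:
	 *	xfs_iflush_done() pulls the inode from the AIL, marks
	 *	it clean and drops the flush lock.
	 */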
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5061149b2cc4..6afaaeb2950a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1468,7 +1468,18 @@ xfs_unmountfs(
 	 * need to force the log first.
 	 */
 	xfs_log_force(mp, XFS_LOG_SYNC);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
+
+	/*
+	 * Do a delwri reclaim pass first so that as many dirty inodes are
+	 * queued up for IO as possible. Then flush the buffers before making
+	 * a synchronous pass to catch all the remaining inodes and reclaim
+	 * them. This makes the reclaim process as quick as possible by
+	 * avoiding synchronous writeout and blocking on inodes already in the
+	 * delwri state as much as possible.
+	 */
+	xfs_reclaim_inodes(mp, 0);
+	XFS_bflush(mp->m_ddev_targp);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
 	xfs_qm_unmount(mp);
 