aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_inode.c
diff options
context:
space:
mode:
author Dave Chinner <david@fromorbit.com> 2010-02-05 20:39:36 -0500
committer Dave Chinner <david@fromorbit.com> 2010-02-05 20:39:36 -0500
commit c854363e80b49dd04a4de18ebc379eb8c8806674 (patch)
tree 8c8d0dec26d961631a3cd8b6c402b5d1444336e5 /fs/xfs/xfs_inode.c
parent 777df5afdb26c71634edd60582be620ff94e87a0 (diff)
xfs: Use delayed write for inodes rather than async V2
We currently do background inode flush asynchronously, resulting in inodes being written in whatever order the background writeback issues them. Not only that, there are also blocking and non-blocking asynchronous inode flushes, depending on where the flush comes from. This patch completely removes asynchronous inode writeback. It removes all the strange writeback modes and replaces them with either a synchronous flush or a non-blocking delayed write flush. That is, inode flushes will only issue IO directly if they are synchronous, and background flushing may do nothing if the operation would block (e.g. on a pinned inode or buffer lock). Delayed write flushes will now result in the inode buffer sitting in the delwri queue of the buffer cache to be flushed by either an AIL push or by the xfsbufd timing out the buffer. This will allow accumulation of dirty inode buffers in memory and allow optimisation of inode cluster writeback at the xfsbufd level where we have much greater queue depths than the block layer elevators. We will also get adjacent inode cluster buffer IO merging for free when a later patch in the series allows sorting of the delayed write buffers before dispatch. This effectively means that any inode that is written back by background writeback will be seen as flush locked during AIL pushing, and will result in the buffers being pushed from there. This writeback path is currently non-optimal, but the next patch in the series will fix that problem. A side effect of this delayed write mechanism is that background inode reclaim will no longer directly flush inodes, nor can it wait on the flush lock. The result is that inode reclaim must leave the inode in the reclaimable state until it is clean. Hence attempts to reclaim a dirty inode in the background will simply skip the inode until it is clean and this allows other mechanisms (i.e. xfsbufd) to do more optimal writeback of the dirty buffers. 
As a result, the inode reclaim code has been rewritten so that it no longer relies on the ambiguous return values of xfs_iflush() to determine whether it is safe to reclaim an inode.

Portions of this patch are derived from patches by Christoph Hellwig.

Version 2:
- cleanup reclaim code as suggested by Christoph
- log background reclaim inode flush errors
- just pass sync flags to xfs_iflush

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r-- fs/xfs/xfs_inode.c 75
1 files changed, 5 insertions, 70 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8d0666dd170a..fa31360046d4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,8 +2835,6 @@ xfs_iflush(
2835 xfs_dinode_t *dip; 2835 xfs_dinode_t *dip;
2836 xfs_mount_t *mp; 2836 xfs_mount_t *mp;
2837 int error; 2837 int error;
2838 int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
2839 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
2840 2838
2841 XFS_STATS_INC(xs_iflush_count); 2839 XFS_STATS_INC(xs_iflush_count);
2842 2840
@@ -2859,7 +2857,7 @@ xfs_iflush(
2859 * in the same cluster are dirty, they will probably write the inode 2857 * in the same cluster are dirty, they will probably write the inode
2860 * out for us if they occur after the log force completes. 2858 * out for us if they occur after the log force completes.
2861 */ 2859 */
2862 if (noblock && xfs_ipincount(ip)) { 2860 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2863 xfs_iunpin_nowait(ip); 2861 xfs_iunpin_nowait(ip);
2864 xfs_ifunlock(ip); 2862 xfs_ifunlock(ip);
2865 return EAGAIN; 2863 return EAGAIN;
@@ -2893,60 +2891,10 @@ xfs_iflush(
2893 } 2891 }
2894 2892
2895 /* 2893 /*
2896 * Decide how buffer will be flushed out. This is done before
2897 * the call to xfs_iflush_int because this field is zeroed by it.
2898 */
2899 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
2900 /*
2901 * Flush out the inode buffer according to the directions
2902 * of the caller. In the cases where the caller has given
2903 * us a choice choose the non-delwri case. This is because
2904 * the inode is in the AIL and we need to get it out soon.
2905 */
2906 switch (flags) {
2907 case XFS_IFLUSH_SYNC:
2908 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2909 flags = 0;
2910 break;
2911 case XFS_IFLUSH_ASYNC_NOBLOCK:
2912 case XFS_IFLUSH_ASYNC:
2913 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2914 flags = INT_ASYNC;
2915 break;
2916 case XFS_IFLUSH_DELWRI:
2917 flags = INT_DELWRI;
2918 break;
2919 default:
2920 ASSERT(0);
2921 flags = 0;
2922 break;
2923 }
2924 } else {
2925 switch (flags) {
2926 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2927 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2928 case XFS_IFLUSH_DELWRI:
2929 flags = INT_DELWRI;
2930 break;
2931 case XFS_IFLUSH_ASYNC_NOBLOCK:
2932 case XFS_IFLUSH_ASYNC:
2933 flags = INT_ASYNC;
2934 break;
2935 case XFS_IFLUSH_SYNC:
2936 flags = 0;
2937 break;
2938 default:
2939 ASSERT(0);
2940 flags = 0;
2941 break;
2942 }
2943 }
2944
2945 /*
2946 * Get the buffer containing the on-disk inode. 2894 * Get the buffer containing the on-disk inode.
2947 */ 2895 */
2948 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2896 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2949 noblock ? XBF_TRYLOCK : XBF_LOCK); 2897 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
2950 if (error || !bp) { 2898 if (error || !bp) {
2951 xfs_ifunlock(ip); 2899 xfs_ifunlock(ip);
2952 return error; 2900 return error;
@@ -2974,13 +2922,10 @@ xfs_iflush(
2974 if (error) 2922 if (error)
2975 goto cluster_corrupt_out; 2923 goto cluster_corrupt_out;
2976 2924
2977 if (flags & INT_DELWRI) { 2925 if (flags & SYNC_WAIT)
2978 xfs_bdwrite(mp, bp);
2979 } else if (flags & INT_ASYNC) {
2980 error = xfs_bawrite(mp, bp);
2981 } else {
2982 error = xfs_bwrite(mp, bp); 2926 error = xfs_bwrite(mp, bp);
2983 } 2927 else
2928 xfs_bdwrite(mp, bp);
2984 return error; 2929 return error;
2985 2930
2986corrupt_out: 2931corrupt_out:
@@ -3015,16 +2960,6 @@ xfs_iflush_int(
3015 iip = ip->i_itemp; 2960 iip = ip->i_itemp;
3016 mp = ip->i_mount; 2961 mp = ip->i_mount;
3017 2962
3018
3019 /*
3020 * If the inode isn't dirty, then just release the inode
3021 * flush lock and do nothing.
3022 */
3023 if (xfs_inode_clean(ip)) {
3024 xfs_ifunlock(ip);
3025 return 0;
3026 }
3027
3028 /* set *dip = inode's place in the buffer */ 2963 /* set *dip = inode's place in the buffer */
3029 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2964 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3030 2965