xfs: log changed inodes instead of writing them synchronously

When an inode has already been flushed delayed write, xfs_inode_clean() returns true and hence xfs_fs_write_inode() can return on a synchronous inode write without having written the inode. Currently these synchronous writes only come from sync(1), unmount, a synchronous NFS export and cachefiles, so they should be relatively rare and out of common performance paths. Realistically, a synchronous inode write is not necessary here; we can avoid writing the inode by logging any non-transactional changes that are pending. This needs to be done with synchronous transactions, but it avoids seeking between the log and inode clusters as we do now. We don't force the log if the inode is pinned, though, so this differs from the fsync case. For normal sys_sync and unmount behaviour this is fine because we do a synchronous log force in xfs_sync_data which is called from the ->sync_fs code. It does however break the NFS synchronous export guarantees for now, but work is under way to fix this at a higher level or for the higher level to provide an additional flag in the writeback control to tell us that a log force is needed. Portions of this patch are based on work from Dave Chinner. Signed-off-by: Christoph Hellwig <hch@infradead.org> Reviewed-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Alex Elder <aelder@sgi.com>
author: Christoph Hellwig <hch@infradead.org> 2010-02-08 19:43:49 -0500
committer: Dave Chinner <david@fromorbit.com> 2010-02-08 19:43:49 -0500
commit: 07fec73625dc0db6f9aed68019918208a2ca53f5 (patch)
tree: 04f4ad3f829c3d64d6b2a3c0bddf11a4374eb93d /fs/xfs
parent: e8b217e7530c6a073ac69f1c85b922d93fdf5647 (diff)
1 files changed, 82 insertions, 29 deletions
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 3b5b46b8e3b9..25ea2408118f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1021,12 +1021,45 @@ xfs_fs_dirty_inode(
        XFS_I(inode)->i_update_core = 1;
 }
-/*
+STATIC int
- * Attempt to flush the inode, this will actually fail
+xfs_log_inode(
- * if the inode is pinned, but we dirty the inode again
+        struct xfs_inode        *ip)
- * at the point when it is unpinned after a log write,
+{
- * since this is when the inode itself becomes flushable.
+        struct xfs_mount        *mp = ip->i_mount;
- */
+        struct xfs_trans        *tp;
+        int                     error;
+        xfs_iunlock(ip, XFS_ILOCK_SHARED);
+        tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+        error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+        if (error) {
+                xfs_trans_cancel(tp, 0);
+                /* we need to return with the lock hold shared */
+                xfs_ilock(ip, XFS_ILOCK_SHARED);
+                return error;
+        }
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        /*
+         * Note - it's possible that we might have pushed ourselves out of the
+         * way during trans_reserve which would flush the inode.  But there's
+         * no guarantee that the inode buffer has actually gone out yet (it's
+         * delwri).  Plus the buffer could be pinned anyway if it's part of
+         * an inode in another recent transaction.  So we play it safe and
+         * fire off the transaction anyway.
+         */
+        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+        xfs_trans_ihold(tp, ip);
+        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+        xfs_trans_set_sync(tp);
+        error = xfs_trans_commit(tp, 0);
+        xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+        return error;
+}
 STATIC int
 xfs_fs_write_inode(
        struct inode            *inode,
@@ -1034,7 +1067,7 @@ xfs_fs_write_inode(
 {
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
-        int                     error = 0;
+        int                     error = EAGAIN;
        xfs_itrace_entry(ip);
@@ -1045,35 +1078,55 @@ xfs_fs_write_inode(
                error = xfs_wait_on_pages(ip, 0, -1);
                if (error)
                        goto out;
-        }
-        /*
-         * Bypass inodes which have already been cleaned by
-         * the inode flush clustering code inside xfs_iflush
-         */
-        if (xfs_inode_clean(ip))
-                goto out;
-        /*
+                /*
-         * We make this non-blocking if the inode is contended, return
+                 * Make sure the inode has hit stable storage.  By using the
-         * EAGAIN to indicate to the caller that they did not succeed.
+                 * log and the fsync transactions we reduce the IOs we have
-         * This prevents the flush path from blocking on inodes inside
+                 * to do here from two (log and inode) to just the log.
-         * another operation right now, they get caught later by xfs_sync.
+                 *
-         */
+                 * Note: We still need to do a delwri write of the inode after
-        if (sync) {
+                 * this to flush it to the backing buffer so that bulkstat
+                 * works properly if this is the first time the inode has been
+                 * written.  Because we hold the ilock atomically over the
+                 * transaction commit and the inode flush we are guaranteed
+                 * that the inode is not pinned when it returns. If the flush
+                 * lock is already held, then the inode has already been
+                 * flushed once and we don't need to flush it again.  Hence
+                 * the code will only flush the inode if it isn't already
+                 * being flushed.
+                 */
                xfs_ilock(ip, XFS_ILOCK_SHARED);
-                xfs_iflock(ip);
+                if (ip->i_update_core) {
+                        error = xfs_log_inode(ip);
-                error = xfs_iflush(ip, SYNC_WAIT);
+                        if (error)
+                                goto out_unlock;
+                }
        } else {
-                error = EAGAIN;
+                /*
+                 * We make this non-blocking if the inode is contended, return
+                 * EAGAIN to indicate to the caller that they did not succeed.
+                 * This prevents the flush path from blocking on inodes inside
+                 * another operation right now, they get caught later by xfs_sync.
+                 */
                if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
                        goto out;
-                if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+        }
-                        goto out_unlock;
+        if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+                goto out_unlock;
-                error = xfs_iflush(ip, 0);
+        /*
+         * Now we have the flush lock and the inode is not pinned, we can check
+         * if the inode is really clean as we know that there are no pending
+         * transaction completions, it is not waiting on the delayed write
+         * queue and there is no IO in progress.
+         */
+        if (xfs_inode_clean(ip)) {
+                xfs_ifunlock(ip);
+                error = 0;
+                goto out_unlock;
        }
+        error = xfs_iflush(ip, 0);
 out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
author	Christoph Hellwig <hch@infradead.org>	2010-02-08 19:43:49 -0500
committer	Dave Chinner <david@fromorbit.com>	2010-02-08 19:43:49 -0500
commit	07fec73625dc0db6f9aed68019918208a2ca53f5 (patch)
tree	04f4ad3f829c3d64d6b2a3c0bddf11a4374eb93d /fs/xfs
parent	e8b217e7530c6a073ac69f1c85b922d93fdf5647 (diff)

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 3b5b46b8e3b9..25ea2408118f 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1021,12 +1021,45 @@ xfs_fs_dirty_inode(
1021	XFS_I(inode)->i_update_core = 1;	1021	XFS_I(inode)->i_update_core = 1;
1022	}	1022	}
1023		1023
1024	/*	1024	STATIC int
1025	* Attempt to flush the inode, this will actually fail	1025	xfs_log_inode(
1026	* if the inode is pinned, but we dirty the inode again	1026	struct xfs_inode *ip)
1027	* at the point when it is unpinned after a log write,	1027	{
1028	* since this is when the inode itself becomes flushable.	1028	struct xfs_mount *mp = ip->i_mount;
1029	*/	1029	struct xfs_trans *tp;
		1030	int error;
		1031
		1032	xfs_iunlock(ip, XFS_ILOCK_SHARED);
		1033	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
		1034	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
		1035
		1036	if (error) {
		1037	xfs_trans_cancel(tp, 0);
		1038	/* we need to return with the lock hold shared */
		1039	xfs_ilock(ip, XFS_ILOCK_SHARED);
		1040	return error;
		1041	}
		1042
		1043	xfs_ilock(ip, XFS_ILOCK_EXCL);
		1044
		1045	/*
		1046	* Note - it's possible that we might have pushed ourselves out of the
		1047	* way during trans_reserve which would flush the inode. But there's
		1048	* no guarantee that the inode buffer has actually gone out yet (it's
		1049	* delwri). Plus the buffer could be pinned anyway if it's part of
		1050	* an inode in another recent transaction. So we play it safe and
		1051	* fire off the transaction anyway.
		1052	*/
		1053	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		1054	xfs_trans_ihold(tp, ip);
		1055	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		1056	xfs_trans_set_sync(tp);
		1057	error = xfs_trans_commit(tp, 0);
		1058	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
		1059
		1060	return error;
		1061	}
		1062
1030	STATIC int	1063	STATIC int
1031	xfs_fs_write_inode(	1064	xfs_fs_write_inode(
1032	struct inode *inode,	1065	struct inode *inode,
@@ -1034,7 +1067,7 @@ xfs_fs_write_inode(
1034	{	1067	{
1035	struct xfs_inode *ip = XFS_I(inode);	1068	struct xfs_inode *ip = XFS_I(inode);
1036	struct xfs_mount *mp = ip->i_mount;	1069	struct xfs_mount *mp = ip->i_mount;
1037	int error = 0;	1070	int error = EAGAIN;
1038		1071
1039	xfs_itrace_entry(ip);	1072	xfs_itrace_entry(ip);
1040		1073
@@ -1045,35 +1078,55 @@ xfs_fs_write_inode(
1045	error = xfs_wait_on_pages(ip, 0, -1);	1078	error = xfs_wait_on_pages(ip, 0, -1);
1046	if (error)	1079	if (error)
1047	goto out;	1080	goto out;
1048	}
1049
1050	/*
1051	* Bypass inodes which have already been cleaned by
1052	* the inode flush clustering code inside xfs_iflush
1053	*/
1054	if (xfs_inode_clean(ip))
1055	goto out;
1056		1081
1057	/*	1082	/*
1058	* We make this non-blocking if the inode is contended, return	1083	* Make sure the inode has hit stable storage. By using the
1059	* EAGAIN to indicate to the caller that they did not succeed.	1084	* log and the fsync transactions we reduce the IOs we have
1060	* This prevents the flush path from blocking on inodes inside	1085	* to do here from two (log and inode) to just the log.
1061	* another operation right now, they get caught later by xfs_sync.	1086	*
1062	*/	1087	* Note: We still need to do a delwri write of the inode after
1063	if (sync) {	1088	* this to flush it to the backing buffer so that bulkstat
		1089	* works properly if this is the first time the inode has been
		1090	* written. Because we hold the ilock atomically over the
		1091	* transaction commit and the inode flush we are guaranteed
		1092	* that the inode is not pinned when it returns. If the flush
		1093	* lock is already held, then the inode has already been
		1094	* flushed once and we don't need to flush it again. Hence
		1095	* the code will only flush the inode if it isn't already
		1096	* being flushed.
		1097	*/
1064	xfs_ilock(ip, XFS_ILOCK_SHARED);	1098	xfs_ilock(ip, XFS_ILOCK_SHARED);
1065	xfs_iflock(ip);	1099	if (ip->i_update_core) {
1066		1100	error = xfs_log_inode(ip);
1067	error = xfs_iflush(ip, SYNC_WAIT);	1101	if (error)
		1102	goto out_unlock;
		1103	}
1068	} else {	1104	} else {
1069	error = EAGAIN;	1105	/*
		1106	* We make this non-blocking if the inode is contended, return
		1107	* EAGAIN to indicate to the caller that they did not succeed.
		1108	* This prevents the flush path from blocking on inodes inside
		1109	* another operation right now, they get caught later by xfs_sync.
		1110	*/
1070	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))	1111	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1071	goto out;	1112	goto out;
1072	if (xfs_ipincount(ip) \|\| !xfs_iflock_nowait(ip))	1113	}
1073	goto out_unlock;	1114
		1115	if (xfs_ipincount(ip) \|\| !xfs_iflock_nowait(ip))
		1116	goto out_unlock;
1074		1117
1075	error = xfs_iflush(ip, 0);	1118	/*
		1119	* Now we have the flush lock and the inode is not pinned, we can check
		1120	* if the inode is really clean as we know that there are no pending
		1121	* transaction completions, it is not waiting on the delayed write
		1122	* queue and there is no IO in progress.
		1123	*/
		1124	if (xfs_inode_clean(ip)) {
		1125	xfs_ifunlock(ip);
		1126	error = 0;
		1127	goto out_unlock;
1076	}	1128	}
		1129	error = xfs_iflush(ip, 0);
1077		1130
1078	out_unlock:	1131	out_unlock:
1079	xfs_iunlock(ip, XFS_ILOCK_SHARED);	1132	xfs_iunlock(ip, XFS_ILOCK_SHARED);