[XFS] Fix fsync() b0rkage.

xfs_fsync() fails to wait for data I/O completion before checking if the inode is dirty or clean to decide whether to log the inode or not. This misses inode size updates when the data flushed by the fsync() is extending the file. Hence, like fdatasync(), we need to wait for I/o completion first, then check the inode for cleanliness. Doing so makes the behaviour of xfs_fsync() identical for fsync and fdatasync and we *always* use synchronous semantics if the inode is dirty. Therefore also kill the differences and remove the unused flags from the xfs_fsync function and callers. SGI-PV: 981296 SGI-Modid: xfs-linux-melb:xfs-kern:31033a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
author: David Chinner <dgc@sgi.com> 2008-05-19 02:29:46 -0400
committer: Lachlan McIlroy <lachlan@redback.melbourne.sgi.com> 2008-05-23 01:25:25 -0400
commit: 978b7237123d007b9fa983af6e0e2fa8f97f9934 (patch)
tree: ed4c8af42502efeb7ae79b166bb5890347b3de93
parent: c1e554aeea12d2dab5183e011c27dee6142dc927 (diff)
4 files changed, 54 insertions, 86 deletions
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 65e78c13d4ae..5f60363b9343 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -184,19 +184,24 @@ xfs_file_release(
        return -xfs_release(XFS_I(inode));
 }
+/*
+ * We ignore the datasync flag here because a datasync is effectively
+ * identical to an fsync. That is, datasync implies that we need to write
+ * only the metadata needed to be able to access the data that is written
+ * if we crash after the call completes. Hence if we are writing beyond
+ * EOF we have to log the inode size change as well, which makes it a
+ * full fsync. If we don't write beyond EOF, the inode core will be
+ * clean in memory and so we don't need to log the inode, just like
+ * fsync.
+ */
 STATIC int
 xfs_file_fsync(
        struct file     *filp,
        struct dentry   *dentry,
        int             datasync)
 {
-        int             flags = FSYNC_WAIT;
-        if (datasync)
-                flags |= FSYNC_DATA;
        xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
-        return -xfs_fsync(XFS_I(dentry->d_inode), flags,
+        return -xfs_fsync(XFS_I(dentry->d_inode));
-                        (xfs_off_t)0, (xfs_off_t)-1);
 }
 /*
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 9d73cb5c0fc7..25eb2a9e8d9b 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -230,14 +230,6 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
 #define ATTR_NOSIZETOK  0x400   /* Don't get the SIZE token */
 /*
- * Flags to vop_fsync/reclaim.
- */
-#define FSYNC_NOWAIT    0       /* asynchronous flush */
-#define FSYNC_WAIT      0x1     /* synchronous fsync or forced reclaim */
-#define FSYNC_INVAL     0x2     /* flush and invalidate cached data */
-#define FSYNC_DATA      0x4     /* synchronous fsync of data only */
-/*
 * Tracking vnode activity.
 */
 #if defined(XFS_INODE_TRACE)
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 70702a60b4bb..e475e3717eb3 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -856,18 +856,14 @@ xfs_readlink(
 /*
 * xfs_fsync
 *
- * This is called to sync the inode and its data out to disk.
+ * This is called to sync the inode and its data out to disk.  We need to hold
- * We need to hold the I/O lock while flushing the data, and
+ * the I/O lock while flushing the data, and the inode lock while flushing the
- * the inode lock while flushing the inode.  The inode lock CANNOT
+ * inode.  The inode lock CANNOT be held while flushing the data, so acquire
- * be held while flushing the data, so acquire after we're done
+ * after we're done with that.
- * with that.
 */
 int
 xfs_fsync(
-        xfs_inode_t     *ip,
+        xfs_inode_t     *ip)
-        int             flag,
-        xfs_off_t       start,
-        xfs_off_t       stop)
 {
        xfs_trans_t     *tp;
        int             error;
@@ -875,103 +871,79 @@ xfs_fsync(
        xfs_itrace_entry(ip);
-        ASSERT(start >= 0 && stop >= -1);
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return XFS_ERROR(EIO);
-        if (flag & FSYNC_DATA)
+        /* capture size updates in I/O completion before writing the inode. */
-                filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+        error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+        if (error)
+                return XFS_ERROR(error);
        /*
-         * We always need to make sure that the required inode state
+         * We always need to make sure that the required inode state is safe on
-         * is safe on disk.  The vnode might be clean but because
+         * disk.  The vnode might be clean but we still might need to force the
-         * of committed transactions that haven't hit the disk yet.
+         * log because of committed transactions that haven't hit the disk yet.
-         * Likewise, there could be unflushed non-transactional
+         * Likewise, there could be unflushed non-transactional changes to the
-         * changes to the inode core that have to go to disk.
+         * inode core that have to go to disk and this requires us to issue
+         * a synchronous transaction to capture these changes correctly.
         *
-         * The following code depends on one assumption:  that
+         * This code relies on the assumption that if the update_* fields
-         * any transaction that changes an inode logs the core
+         * of the inode are clear and the inode is unpinned then it is clean
-         * because it has to change some field in the inode core
+         * and no action is required.
-         * (typically nextents or nblocks).  That assumption
-         * implies that any transactions against an inode will
-         * catch any non-transactional updates.  If inode-altering
-         * transactions exist that violate this assumption, the
-         * code breaks.  Right now, it figures that if the involved
-         * update_* field is clear and the inode is unpinned, the
-         * inode is clean.  Either it's been flushed or it's been
-         * committed and the commit has hit the disk unpinning the inode.
-         * (Note that xfs_inode_item_format() called at commit clears
-         * the update_* fields.)
         */
        xfs_ilock(ip, XFS_ILOCK_SHARED);
-        /* If we are flushing data then we care about update_size
+        if (!(ip->i_update_size || ip->i_update_core)) {
-         * being set, otherwise we care about update_core
-         */
-        if ((flag & FSYNC_DATA) ?
-                        (ip->i_update_size == 0) :
-                        (ip->i_update_core == 0)) {
                /*
-                 * Timestamps/size haven't changed since last inode
+                 * Timestamps/size haven't changed since last inode flush or
-                 * flush or inode transaction commit.  That means
+                 * inode transaction commit.  That means either nothing got
-                 * either nothing got written or a transaction
+                 * written or a transaction committed which caught the updates.
-                 * committed which caught the updates.  If the
+                 * If the latter happened and the transaction hasn't hit the
-                 * latter happened and the transaction hasn't
+                 * disk yet, the inode will be still be pinned.  If it is,
-                 * hit the disk yet, the inode will be still
+                 * force the log.
-                 * be pinned.  If it is, force the log.
                 */
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                if (xfs_ipincount(ip)) {
-                        _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+                        error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
-                                      XFS_LOG_FORCE |
+                                      XFS_LOG_FORCE | XFS_LOG_SYNC,
-                                      ((flag & FSYNC_WAIT)
-                                       ? XFS_LOG_SYNC : 0),
                                      &log_flushed);
                } else {
                        /*
-                         * If the inode is not pinned and nothing
+                         * If the inode is not pinned and nothing has changed
-                         * has changed we don't need to flush the
+                         * we don't need to flush the cache.
-                         * cache.
                         */
                        changed = 0;
                }
-                error = 0;
        } else  {
                /*
-                 * Kick off a transaction to log the inode
+                 * Kick off a transaction to log the inode core to get the
-                 * core to get the updates.  Make it
+                 * updates.  The sync transaction will also force the log.
-                 * sync if FSYNC_WAIT is passed in (which
-                 * is done by everybody but specfs).  The
-                 * sync transaction will also force the log.
                 */
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
-                if ((error = xfs_trans_reserve(tp, 0,
+                error = xfs_trans_reserve(tp, 0,
-                                XFS_FSYNC_TS_LOG_RES(ip->i_mount),
+                                XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
-                                0, 0, 0)))  {
+                if (error) {
                        xfs_trans_cancel(tp, 0);
                        return error;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                /*
-                 * Note - it's possible that we might have pushed
+                 * Note - it's possible that we might have pushed ourselves out
-                 * ourselves out of the way during trans_reserve
+                 * of the way during trans_reserve which would flush the inode.
-                 * which would flush the inode.  But there's no
+                 * But there's no guarantee that the inode buffer has actually
-                 * guarantee that the inode buffer has actually
+                 * gone out yet (it's delwri).  Plus the buffer could be pinned
-                 * gone out yet (it's delwri).  Plus the buffer
+                 * anyway if it's part of an inode in another recent
-                 * could be pinned anyway if it's part of an
+                 * transaction.  So we play it safe and fire off the
-                 * inode in another recent transaction.  So we
+                 * transaction anyway.
-                 * play it safe and fire off the transaction anyway.
                 */
                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-                if (flag & FSYNC_WAIT)
+                xfs_trans_set_sync(tp);
-                        xfs_trans_set_sync(tp);
                error = _xfs_trans_commit(tp, 0, &log_flushed);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 8abe8f186e20..57335ba4ce53 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,8 +18,7 @@ int xfs_open(struct xfs_inode *ip);
 int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags,
                struct cred *credp);
 int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
+int xfs_fsync(struct xfs_inode *ip);
-                xfs_off_t stop);
 int xfs_release(struct xfs_inode *ip);
 int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
author	David Chinner <dgc@sgi.com>	2008-05-19 02:29:46 -0400
committer	Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>	2008-05-23 01:25:25 -0400
commit	978b7237123d007b9fa983af6e0e2fa8f97f9934 (patch)
tree	ed4c8af42502efeb7ae79b166bb5890347b3de93
parent	c1e554aeea12d2dab5183e011c27dee6142dc927 (diff)