aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Chinner <dgc@sgi.com>2008-05-19 02:29:46 -0400
committerLachlan McIlroy <lachlan@redback.melbourne.sgi.com>2008-05-23 01:25:25 -0400
commit978b7237123d007b9fa983af6e0e2fa8f97f9934 (patch)
treeed4c8af42502efeb7ae79b166bb5890347b3de93
parentc1e554aeea12d2dab5183e011c27dee6142dc927 (diff)
[XFS] Fix fsync() b0rkage.
xfs_fsync() fails to wait for data I/O completion before checking if the inode is dirty or clean to decide whether to log the inode or not. This misses inode size updates when the data flushed by the fsync() is extending the file. Hence, like fdatasync(), we need to wait for I/o completion first, then check the inode for cleanliness. Doing so makes the behaviour of xfs_fsync() identical for fsync and fdatasync and we *always* use synchronous semantics if the inode is dirty. Therefore also kill the differences and remove the unused flags from the xfs_fsync function and callers. SGI-PV: 981296 SGI-Modid: xfs-linux-melb:xfs-kern:31033a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c17
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h8
-rw-r--r--fs/xfs/xfs_vnodeops.c112
-rw-r--r--fs/xfs/xfs_vnodeops.h3
4 files changed, 54 insertions, 86 deletions
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 65e78c13d4ae..5f60363b9343 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -184,19 +184,24 @@ xfs_file_release(
184 return -xfs_release(XFS_I(inode)); 184 return -xfs_release(XFS_I(inode));
185} 185}
186 186
187/*
188 * We ignore the datasync flag here because a datasync is effectively
189 * identical to an fsync. That is, datasync implies that we need to write
190 * only the metadata needed to be able to access the data that is written
191 * if we crash after the call completes. Hence if we are writing beyond
192 * EOF we have to log the inode size change as well, which makes it a
193 * full fsync. If we don't write beyond EOF, the inode core will be
194 * clean in memory and so we don't need to log the inode, just like
195 * fsync.
196 */
187STATIC int 197STATIC int
188xfs_file_fsync( 198xfs_file_fsync(
189 struct file *filp, 199 struct file *filp,
190 struct dentry *dentry, 200 struct dentry *dentry,
191 int datasync) 201 int datasync)
192{ 202{
193 int flags = FSYNC_WAIT;
194
195 if (datasync)
196 flags |= FSYNC_DATA;
197 xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); 203 xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
198 return -xfs_fsync(XFS_I(dentry->d_inode), flags, 204 return -xfs_fsync(XFS_I(dentry->d_inode));
199 (xfs_off_t)0, (xfs_off_t)-1);
200} 205}
201 206
202/* 207/*
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 9d73cb5c0fc7..25eb2a9e8d9b 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -230,14 +230,6 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
230#define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */ 230#define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */
231 231
232/* 232/*
233 * Flags to vop_fsync/reclaim.
234 */
235#define FSYNC_NOWAIT 0 /* asynchronous flush */
236#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */
237#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */
238#define FSYNC_DATA 0x4 /* synchronous fsync of data only */
239
240/*
241 * Tracking vnode activity. 233 * Tracking vnode activity.
242 */ 234 */
243#if defined(XFS_INODE_TRACE) 235#if defined(XFS_INODE_TRACE)
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 70702a60b4bb..e475e3717eb3 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -856,18 +856,14 @@ xfs_readlink(
856/* 856/*
857 * xfs_fsync 857 * xfs_fsync
858 * 858 *
859 * This is called to sync the inode and its data out to disk. 859 * This is called to sync the inode and its data out to disk. We need to hold
860 * We need to hold the I/O lock while flushing the data, and 860 * the I/O lock while flushing the data, and the inode lock while flushing the
861 * the inode lock while flushing the inode. The inode lock CANNOT 861 * inode. The inode lock CANNOT be held while flushing the data, so acquire
862 * be held while flushing the data, so acquire after we're done 862 * after we're done with that.
863 * with that.
864 */ 863 */
865int 864int
866xfs_fsync( 865xfs_fsync(
867 xfs_inode_t *ip, 866 xfs_inode_t *ip)
868 int flag,
869 xfs_off_t start,
870 xfs_off_t stop)
871{ 867{
872 xfs_trans_t *tp; 868 xfs_trans_t *tp;
873 int error; 869 int error;
@@ -875,103 +871,79 @@ xfs_fsync(
875 871
876 xfs_itrace_entry(ip); 872 xfs_itrace_entry(ip);
877 873
878 ASSERT(start >= 0 && stop >= -1);
879
880 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 874 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
881 return XFS_ERROR(EIO); 875 return XFS_ERROR(EIO);
882 876
883 if (flag & FSYNC_DATA) 877 /* capture size updates in I/O completion before writing the inode. */
884 filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); 878 error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
879 if (error)
880 return XFS_ERROR(error);
885 881
886 /* 882 /*
887 * We always need to make sure that the required inode state 883 * We always need to make sure that the required inode state is safe on
888 * is safe on disk. The vnode might be clean but because 884 * disk. The vnode might be clean but we still might need to force the
889 * of committed transactions that haven't hit the disk yet. 885 * log because of committed transactions that haven't hit the disk yet.
890 * Likewise, there could be unflushed non-transactional 886 * Likewise, there could be unflushed non-transactional changes to the
891 * changes to the inode core that have to go to disk. 887 * inode core that have to go to disk and this requires us to issue
888 * a synchronous transaction to capture these changes correctly.
892 * 889 *
893 * The following code depends on one assumption: that 890 * This code relies on the assumption that if the update_* fields
894 * any transaction that changes an inode logs the core 891 * of the inode are clear and the inode is unpinned then it is clean
895 * because it has to change some field in the inode core 892 * and no action is required.
896 * (typically nextents or nblocks). That assumption
897 * implies that any transactions against an inode will
898 * catch any non-transactional updates. If inode-altering
899 * transactions exist that violate this assumption, the
900 * code breaks. Right now, it figures that if the involved
901 * update_* field is clear and the inode is unpinned, the
902 * inode is clean. Either it's been flushed or it's been
903 * committed and the commit has hit the disk unpinning the inode.
904 * (Note that xfs_inode_item_format() called at commit clears
905 * the update_* fields.)
906 */ 893 */
907 xfs_ilock(ip, XFS_ILOCK_SHARED); 894 xfs_ilock(ip, XFS_ILOCK_SHARED);
908 895
909 /* If we are flushing data then we care about update_size 896 if (!(ip->i_update_size || ip->i_update_core)) {
910 * being set, otherwise we care about update_core
911 */
912 if ((flag & FSYNC_DATA) ?
913 (ip->i_update_size == 0) :
914 (ip->i_update_core == 0)) {
915 /* 897 /*
916 * Timestamps/size haven't changed since last inode 898 * Timestamps/size haven't changed since last inode flush or
917 * flush or inode transaction commit. That means 899 * inode transaction commit. That means either nothing got
918 * either nothing got written or a transaction 900 * written or a transaction committed which caught the updates.
919 * committed which caught the updates. If the 901 * If the latter happened and the transaction hasn't hit the
920 * latter happened and the transaction hasn't 902 * disk yet, the inode will be still be pinned. If it is,
921 * hit the disk yet, the inode will be still 903 * force the log.
922 * be pinned. If it is, force the log.
923 */ 904 */
924 905
925 xfs_iunlock(ip, XFS_ILOCK_SHARED); 906 xfs_iunlock(ip, XFS_ILOCK_SHARED);
926 907
927 if (xfs_ipincount(ip)) { 908 if (xfs_ipincount(ip)) {
928 _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, 909 error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
929 XFS_LOG_FORCE | 910 XFS_LOG_FORCE | XFS_LOG_SYNC,
930 ((flag & FSYNC_WAIT)
931 ? XFS_LOG_SYNC : 0),
932 &log_flushed); 911 &log_flushed);
933 } else { 912 } else {
934 /* 913 /*
935 * If the inode is not pinned and nothing 914 * If the inode is not pinned and nothing has changed
936 * has changed we don't need to flush the 915 * we don't need to flush the cache.
937 * cache.
938 */ 916 */
939 changed = 0; 917 changed = 0;
940 } 918 }
941 error = 0;
942 } else { 919 } else {
943 /* 920 /*
944 * Kick off a transaction to log the inode 921 * Kick off a transaction to log the inode core to get the
945 * core to get the updates. Make it 922 * updates. The sync transaction will also force the log.
946 * sync if FSYNC_WAIT is passed in (which
947 * is done by everybody but specfs). The
948 * sync transaction will also force the log.
949 */ 923 */
950 xfs_iunlock(ip, XFS_ILOCK_SHARED); 924 xfs_iunlock(ip, XFS_ILOCK_SHARED);
951 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); 925 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
952 if ((error = xfs_trans_reserve(tp, 0, 926 error = xfs_trans_reserve(tp, 0,
953 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 927 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
954 0, 0, 0))) { 928 if (error) {
955 xfs_trans_cancel(tp, 0); 929 xfs_trans_cancel(tp, 0);
956 return error; 930 return error;
957 } 931 }
958 xfs_ilock(ip, XFS_ILOCK_EXCL); 932 xfs_ilock(ip, XFS_ILOCK_EXCL);
959 933
960 /* 934 /*
961 * Note - it's possible that we might have pushed 935 * Note - it's possible that we might have pushed ourselves out
962 * ourselves out of the way during trans_reserve 936 * of the way during trans_reserve which would flush the inode.
963 * which would flush the inode. But there's no 937 * But there's no guarantee that the inode buffer has actually
964 * guarantee that the inode buffer has actually 938 * gone out yet (it's delwri). Plus the buffer could be pinned
965 * gone out yet (it's delwri). Plus the buffer 939 * anyway if it's part of an inode in another recent
966 * could be pinned anyway if it's part of an 940 * transaction. So we play it safe and fire off the
967 * inode in another recent transaction. So we 941 * transaction anyway.
968 * play it safe and fire off the transaction anyway.
969 */ 942 */
970 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 943 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
971 xfs_trans_ihold(tp, ip); 944 xfs_trans_ihold(tp, ip);
972 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 945 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
973 if (flag & FSYNC_WAIT) 946 xfs_trans_set_sync(tp);
974 xfs_trans_set_sync(tp);
975 error = _xfs_trans_commit(tp, 0, &log_flushed); 947 error = _xfs_trans_commit(tp, 0, &log_flushed);
976 948
977 xfs_iunlock(ip, XFS_ILOCK_EXCL); 949 xfs_iunlock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 8abe8f186e20..57335ba4ce53 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,8 +18,7 @@ int xfs_open(struct xfs_inode *ip);
18int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags, 18int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags,
19 struct cred *credp); 19 struct cred *credp);
20int xfs_readlink(struct xfs_inode *ip, char *link); 20int xfs_readlink(struct xfs_inode *ip, char *link);
21int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start, 21int xfs_fsync(struct xfs_inode *ip);
22 xfs_off_t stop);
23int xfs_release(struct xfs_inode *ip); 22int xfs_release(struct xfs_inode *ip);
24int xfs_inactive(struct xfs_inode *ip); 23int xfs_inactive(struct xfs_inode *ip);
25int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, 24int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,