diff options
author | David Chinner <dgc@sgi.com> | 2008-05-19 02:29:46 -0400 |
---|---|---|
committer | Lachlan McIlroy <lachlan@redback.melbourne.sgi.com> | 2008-05-23 01:25:25 -0400 |
commit | 978b7237123d007b9fa983af6e0e2fa8f97f9934 (patch) | |
tree | ed4c8af42502efeb7ae79b166bb5890347b3de93 /fs/xfs/xfs_vnodeops.c | |
parent | c1e554aeea12d2dab5183e011c27dee6142dc927 (diff) |
[XFS] Fix fsync() b0rkage.
xfs_fsync() fails to wait for data I/O completion before checking if the
inode is dirty or clean to decide whether to log the inode or not. This
misses inode size updates when the data flushed by the fsync() is
extending the file.
Hence, like fdatasync(), we need to wait for I/o completion first, then
check the inode for cleanliness. Doing so makes the behaviour of
xfs_fsync() identical for fsync and fdatasync and we *always* use
synchronous semantics if the inode is dirty. Therefore also kill the
differences and remove the unused flags from the xfs_fsync function and
callers.
SGI-PV: 981296
SGI-Modid: xfs-linux-melb:xfs-kern:31033a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Diffstat (limited to 'fs/xfs/xfs_vnodeops.c')
-rw-r--r-- | fs/xfs/xfs_vnodeops.c | 112 |
1 files changed, 42 insertions, 70 deletions
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 70702a60b4bb..e475e3717eb3 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
@@ -856,18 +856,14 @@ xfs_readlink( | |||
856 | /* | 856 | /* |
857 | * xfs_fsync | 857 | * xfs_fsync |
858 | * | 858 | * |
859 | * This is called to sync the inode and its data out to disk. | 859 | * This is called to sync the inode and its data out to disk. We need to hold |
860 | * We need to hold the I/O lock while flushing the data, and | 860 | * the I/O lock while flushing the data, and the inode lock while flushing the |
861 | * the inode lock while flushing the inode. The inode lock CANNOT | 861 | * inode. The inode lock CANNOT be held while flushing the data, so acquire |
862 | * be held while flushing the data, so acquire after we're done | 862 | * after we're done with that. |
863 | * with that. | ||
864 | */ | 863 | */ |
865 | int | 864 | int |
866 | xfs_fsync( | 865 | xfs_fsync( |
867 | xfs_inode_t *ip, | 866 | xfs_inode_t *ip) |
868 | int flag, | ||
869 | xfs_off_t start, | ||
870 | xfs_off_t stop) | ||
871 | { | 867 | { |
872 | xfs_trans_t *tp; | 868 | xfs_trans_t *tp; |
873 | int error; | 869 | int error; |
@@ -875,103 +871,79 @@ xfs_fsync( | |||
875 | 871 | ||
876 | xfs_itrace_entry(ip); | 872 | xfs_itrace_entry(ip); |
877 | 873 | ||
878 | ASSERT(start >= 0 && stop >= -1); | ||
879 | |||
880 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) | 874 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) |
881 | return XFS_ERROR(EIO); | 875 | return XFS_ERROR(EIO); |
882 | 876 | ||
883 | if (flag & FSYNC_DATA) | 877 | /* capture size updates in I/O completion before writing the inode. */ |
884 | filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); | 878 | error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); |
879 | if (error) | ||
880 | return XFS_ERROR(error); | ||
885 | 881 | ||
886 | /* | 882 | /* |
887 | * We always need to make sure that the required inode state | 883 | * We always need to make sure that the required inode state is safe on |
888 | * is safe on disk. The vnode might be clean but because | 884 | * disk. The vnode might be clean but we still might need to force the |
889 | * of committed transactions that haven't hit the disk yet. | 885 | * log because of committed transactions that haven't hit the disk yet. |
890 | * Likewise, there could be unflushed non-transactional | 886 | * Likewise, there could be unflushed non-transactional changes to the |
891 | * changes to the inode core that have to go to disk. | 887 | * inode core that have to go to disk and this requires us to issue |
888 | * a synchronous transaction to capture these changes correctly. | ||
892 | * | 889 | * |
893 | * The following code depends on one assumption: that | 890 | * This code relies on the assumption that if the update_* fields |
894 | * any transaction that changes an inode logs the core | 891 | * of the inode are clear and the inode is unpinned then it is clean |
895 | * because it has to change some field in the inode core | 892 | * and no action is required. |
896 | * (typically nextents or nblocks). That assumption | ||
897 | * implies that any transactions against an inode will | ||
898 | * catch any non-transactional updates. If inode-altering | ||
899 | * transactions exist that violate this assumption, the | ||
900 | * code breaks. Right now, it figures that if the involved | ||
901 | * update_* field is clear and the inode is unpinned, the | ||
902 | * inode is clean. Either it's been flushed or it's been | ||
903 | * committed and the commit has hit the disk unpinning the inode. | ||
904 | * (Note that xfs_inode_item_format() called at commit clears | ||
905 | * the update_* fields.) | ||
906 | */ | 893 | */ |
907 | xfs_ilock(ip, XFS_ILOCK_SHARED); | 894 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
908 | 895 | ||
909 | /* If we are flushing data then we care about update_size | 896 | if (!(ip->i_update_size || ip->i_update_core)) { |
910 | * being set, otherwise we care about update_core | ||
911 | */ | ||
912 | if ((flag & FSYNC_DATA) ? | ||
913 | (ip->i_update_size == 0) : | ||
914 | (ip->i_update_core == 0)) { | ||
915 | /* | 897 | /* |
916 | * Timestamps/size haven't changed since last inode | 898 | * Timestamps/size haven't changed since last inode flush or |
917 | * flush or inode transaction commit. That means | 899 | * inode transaction commit. That means either nothing got |
918 | * either nothing got written or a transaction | 900 | * written or a transaction committed which caught the updates. |
919 | * committed which caught the updates. If the | 901 | * If the latter happened and the transaction hasn't hit the |
920 | * latter happened and the transaction hasn't | 902 | * disk yet, the inode will be still be pinned. If it is, |
921 | * hit the disk yet, the inode will be still | 903 | * force the log. |
922 | * be pinned. If it is, force the log. | ||
923 | */ | 904 | */ |
924 | 905 | ||
925 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 906 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
926 | 907 | ||
927 | if (xfs_ipincount(ip)) { | 908 | if (xfs_ipincount(ip)) { |
928 | _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, | 909 | error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, |
929 | XFS_LOG_FORCE | | 910 | XFS_LOG_FORCE | XFS_LOG_SYNC, |
930 | ((flag & FSYNC_WAIT) | ||
931 | ? XFS_LOG_SYNC : 0), | ||
932 | &log_flushed); | 911 | &log_flushed); |
933 | } else { | 912 | } else { |
934 | /* | 913 | /* |
935 | * If the inode is not pinned and nothing | 914 | * If the inode is not pinned and nothing has changed |
936 | * has changed we don't need to flush the | 915 | * we don't need to flush the cache. |
937 | * cache. | ||
938 | */ | 916 | */ |
939 | changed = 0; | 917 | changed = 0; |
940 | } | 918 | } |
941 | error = 0; | ||
942 | } else { | 919 | } else { |
943 | /* | 920 | /* |
944 | * Kick off a transaction to log the inode | 921 | * Kick off a transaction to log the inode core to get the |
945 | * core to get the updates. Make it | 922 | * updates. The sync transaction will also force the log. |
946 | * sync if FSYNC_WAIT is passed in (which | ||
947 | * is done by everybody but specfs). The | ||
948 | * sync transaction will also force the log. | ||
949 | */ | 923 | */ |
950 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 924 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
951 | tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); | 925 | tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); |
952 | if ((error = xfs_trans_reserve(tp, 0, | 926 | error = xfs_trans_reserve(tp, 0, |
953 | XFS_FSYNC_TS_LOG_RES(ip->i_mount), | 927 | XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); |
954 | 0, 0, 0))) { | 928 | if (error) { |
955 | xfs_trans_cancel(tp, 0); | 929 | xfs_trans_cancel(tp, 0); |
956 | return error; | 930 | return error; |
957 | } | 931 | } |
958 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 932 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
959 | 933 | ||
960 | /* | 934 | /* |
961 | * Note - it's possible that we might have pushed | 935 | * Note - it's possible that we might have pushed ourselves out |
962 | * ourselves out of the way during trans_reserve | 936 | * of the way during trans_reserve which would flush the inode. |
963 | * which would flush the inode. But there's no | 937 | * But there's no guarantee that the inode buffer has actually |
964 | * guarantee that the inode buffer has actually | 938 | * gone out yet (it's delwri). Plus the buffer could be pinned |
965 | * gone out yet (it's delwri). Plus the buffer | 939 | * anyway if it's part of an inode in another recent |
966 | * could be pinned anyway if it's part of an | 940 | * transaction. So we play it safe and fire off the |
967 | * inode in another recent transaction. So we | 941 | * transaction anyway. |
968 | * play it safe and fire off the transaction anyway. | ||
969 | */ | 942 | */ |
970 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); | 943 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
971 | xfs_trans_ihold(tp, ip); | 944 | xfs_trans_ihold(tp, ip); |
972 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 945 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
973 | if (flag & FSYNC_WAIT) | 946 | xfs_trans_set_sync(tp); |
974 | xfs_trans_set_sync(tp); | ||
975 | error = _xfs_trans_commit(tp, 0, &log_flushed); | 947 | error = _xfs_trans_commit(tp, 0, &log_flushed); |
976 | 948 | ||
977 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 949 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |