aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJeff Layton <jlayton@redhat.com>2014-01-27 13:46:15 -0500
committerTrond Myklebust <trond.myklebust@primarydata.com>2014-01-27 15:35:56 -0500
commitd529ef83c355f97027ff85298a9709fe06216a66 (patch)
treeb13587538729686d8f366ab734782de0aeba1fa7
parent7dd7d95916fe7c8494aa8708204d5a2b8689d270 (diff)
NFS: fix the handling of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping
There is a possible race in how the nfs_invalidate_mapping function is handled. Currently, we go and invalidate the pages in the file and then clear NFS_INO_INVALID_DATA. The problem is that it's possible for a stale page to creep into the mapping after the page was invalidated (i.e., via readahead). If another writer comes along and sets the flag after that happens but before invalidate_inode_pages2 returns then we could clear the flag without the cache having been properly invalidated. So, we must clear the flag first and then invalidate the pages. Doing this however, opens another race: It's possible to have two concurrent read() calls that end up in nfs_revalidate_mapping at the same time. The first one clears the NFS_INO_INVALID_DATA flag and then goes to call nfs_invalidate_mapping. Just before calling that though, the other task races in, checks the flag and finds it cleared. At that point, it trusts that the mapping is good and gets the lock on the page, allowing the read() to be satisfied from the cache even though the data is no longer valid. These effects are easily manifested by running diotest3 from the LTP test suite on NFS. That program does a series of DIO writes and buffered reads. The operations are serialized and page-aligned but the existing code fails the test since it occasionally allows a read to come out of the cache incorrectly. While mixing direct and buffered I/O isn't recommended, I believe it's possible to hit this in other ways that just use buffered I/O, though that situation is much harder to reproduce. The problem is that the checking/clearing of that flag and the invalidation of the mapping really need to be atomic. Fix this by serializing concurrent invalidations with a bitlock. At the same time, we also need to allow other places that check NFS_INO_INVALID_DATA to check whether we might be in the middle of invalidating the file, so fix up a couple of places that do that to look for the new NFS_INO_INVALIDATING flag. Doing this requires us to be careful not to set the bitlock unnecessarily, so this code only does that if it believes it will be doing an invalidation. Signed-off-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
-rw-r--r--fs/nfs/dir.c3
-rw-r--r--fs/nfs/inode.c42
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/write.c6
-rw-r--r--include/linux/nfs_fs.h1
5 files changed, 47 insertions, 6 deletions
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b266f734bd53..b39a0468829b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -288,7 +288,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
288 288
289 new_pos = desc->current_index + i; 289 new_pos = desc->current_index + i;
290 if (ctx->attr_gencount != nfsi->attr_gencount 290 if (ctx->attr_gencount != nfsi->attr_gencount
291 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { 291 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
292 || test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) {
292 ctx->duped = 0; 293 ctx->duped = 0;
293 ctx->attr_gencount = nfsi->attr_gencount; 294 ctx->attr_gencount = nfsi->attr_gencount;
294 } else if (new_pos < desc->ctx->pos) { 295 } else if (new_pos < desc->ctx->pos) {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c63e15224466..0a972ee9ccc1 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -977,11 +977,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
977 if (ret < 0) 977 if (ret < 0)
978 return ret; 978 return ret;
979 } 979 }
980 spin_lock(&inode->i_lock); 980 if (S_ISDIR(inode->i_mode)) {
981 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; 981 spin_lock(&inode->i_lock);
982 if (S_ISDIR(inode->i_mode))
983 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 982 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
984 spin_unlock(&inode->i_lock); 983 spin_unlock(&inode->i_lock);
984 }
985 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 985 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
986 nfs_fscache_wait_on_invalidate(inode); 986 nfs_fscache_wait_on_invalidate(inode);
987 987
@@ -1008,6 +1008,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
1008int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) 1008int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
1009{ 1009{
1010 struct nfs_inode *nfsi = NFS_I(inode); 1010 struct nfs_inode *nfsi = NFS_I(inode);
1011 unsigned long *bitlock = &nfsi->flags;
1011 int ret = 0; 1012 int ret = 0;
1012 1013
1013 /* swapfiles are not supposed to be shared. */ 1014 /* swapfiles are not supposed to be shared. */
@@ -1019,12 +1020,45 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
1019 if (ret < 0) 1020 if (ret < 0)
1020 goto out; 1021 goto out;
1021 } 1022 }
1023
1024 /*
1025 * We must clear NFS_INO_INVALID_DATA first to ensure that
1026 * invalidations that come in while we're shooting down the mappings
1027 * are respected. But, that leaves a race window where one revalidator
1028 * can clear the flag, and then another checks it before the mapping
1029 * gets invalidated. Fix that by serializing access to this part of
1030 * the function.
1031 *
1032 * At the same time, we need to allow other tasks to see whether we
1033 * might be in the middle of invalidating the pages, so we only set
1034 * the bit lock here if it looks like we're going to be doing that.
1035 */
1036 for (;;) {
1037 ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING,
1038 nfs_wait_bit_killable, TASK_KILLABLE);
1039 if (ret)
1040 goto out;
1041 if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
1042 goto out;
1043 if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
1044 break;
1045 }
1046
1047 spin_lock(&inode->i_lock);
1022 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { 1048 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
1049 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
1050 spin_unlock(&inode->i_lock);
1023 trace_nfs_invalidate_mapping_enter(inode); 1051 trace_nfs_invalidate_mapping_enter(inode);
1024 ret = nfs_invalidate_mapping(inode, mapping); 1052 ret = nfs_invalidate_mapping(inode, mapping);
1025 trace_nfs_invalidate_mapping_exit(inode, ret); 1053 trace_nfs_invalidate_mapping_exit(inode, ret);
1054 } else {
1055 /* something raced in and cleared the flag */
1056 spin_unlock(&inode->i_lock);
1026 } 1057 }
1027 1058
1059 clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
1060 smp_mb__after_clear_bit();
1061 wake_up_bit(bitlock, NFS_INO_INVALIDATING);
1028out: 1062out:
1029 return ret; 1063 return ret;
1030} 1064}
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 89fe741e58b1..59f838cdc009 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -36,6 +36,7 @@
36 __print_flags(v, "|", \ 36 __print_flags(v, "|", \
37 { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ 37 { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
38 { 1 << NFS_INO_STALE, "STALE" }, \ 38 { 1 << NFS_INO_STALE, "STALE" }, \
39 { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
39 { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ 40 { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
40 { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ 41 { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
41 { 1 << NFS_INO_COMMIT, "COMMIT" }, \ 42 { 1 << NFS_INO_COMMIT, "COMMIT" }, \
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a44a87268a6e..5511a4247190 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -909,9 +909,13 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
909 */ 909 */
910static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) 910static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
911{ 911{
912 struct nfs_inode *nfsi = NFS_I(inode);
913
912 if (nfs_have_delegated_attributes(inode)) 914 if (nfs_have_delegated_attributes(inode))
913 goto out; 915 goto out;
914 if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) 916 if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
917 return false;
918 if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
915 return false; 919 return false;
916out: 920out:
917 return PageUptodate(page) != 0; 921 return PageUptodate(page) != 0;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 48997374eaf0..18fb16f4b939 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -215,6 +215,7 @@ struct nfs_inode {
215#define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */ 215#define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */
216#define NFS_INO_STALE (1) /* possible stale inode */ 216#define NFS_INO_STALE (1) /* possible stale inode */
217#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ 217#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
218#define NFS_INO_INVALIDATING (3) /* inode is being invalidated */
218#define NFS_INO_FLUSHING (4) /* inode is flushing out data */ 219#define NFS_INO_FLUSHING (4) /* inode is flushing out data */
219#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ 220#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */
220#define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ 221#define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */