From d8b16b3d1c9d8d9124d647d05797383d35e2d645 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sat, 6 Nov 2010 12:41:16 -0700
Subject: ceph: fix bad pointer dereference in ceph_fill_trace

We dereference *in a few lines down, but only set it on rename.  It is
apparently pretty rare for this to trigger, but I have been hitting it
with a clustered MDSs.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/ceph/inode.c')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1d6a45b5a04c..cd0432c03d2f 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1055,7 +1055,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		ininfo = rinfo->targeti.in;
 		vino.ino = le64_to_cpu(ininfo->ino);
 		vino.snap = le64_to_cpu(ininfo->snapid);
-		if (!dn->d_inode) {
+		in = dn->d_inode;
+		if (!in) {
 			in = ceph_get_inode(sb, vino);
 			if (IS_ERR(in)) {
 				pr_err("fill_trace bad get_inode "
-- 
cgit v1.2.2


From 912a9b0319a8eb9e0834b19a25e01013ab2d6a9f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sun, 7 Nov 2010 09:37:25 -0800
Subject: ceph: only let auth caps update max_size

Only the auth MDS has a meaningful max_size value for us, so only update it
in fill_inode if we're being issued an auth cap.  Otherwise, a random
stat result from a non-auth MDS can clobber a meaningful max_size, get
the client<->mds cap state out of sync, and make writes hang.

Specifically, even if the client re-requests a larger max_size (which it
will), the MDS won't respond because as far as it knows we already have a
sufficiently large value.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs/ceph/inode.c')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index cd0432c03d2f..0a49ffde5bcb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -606,7 +606,14 @@ static int fill_inode(struct inode *inode,
 			    le32_to_cpu(info->time_warp_seq),
 			    &ctime, &mtime, &atime);
 
-	ci->i_max_size = le64_to_cpu(info->max_size);
+	/* only update max_size on auth cap */
+	if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+	    ci->i_max_size != le64_to_cpu(info->max_size)) {
+		dout("max_size %lld -> %llu\n", ci->i_max_size,
+		     le64_to_cpu(info->max_size));
+		ci->i_max_size = le64_to_cpu(info->max_size);
+	}
+
 	ci->i_layout = info->layout;
 	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
 
-- 
cgit v1.2.2


From cd045cb42a266882ac24bc21a3a8d03683c72954 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 4 Nov 2010 11:05:05 -0700
Subject: ceph: fix rdcache_gen usage and invalidate

We used to use rdcache_gen to indicate whether we "might" have cached
pages.  Now we just look at the mapping to determine that.  However, some
old behavior remains from that transition.

First, rdcache_gen == 0 no longer means we have no pages.  That can happen
at any time (presumably when we carry FILE_CACHE).  We should not reset it
to zero, and we should not check that it is zero.

That means that the only purpose for rdcache_revoking is to resolve races
between new issues of FILE_CACHE and an async invalidate.  If they are
equal, we should invalidate.  On success, we decrement rdcache_revoking,
so that it is no longer equal to rdcache_gen.  Similarly, if we success
in doing a sync invalidate, set revoking = gen - 1.  (This is a small
optimization to avoid doing unnecessary invalidate work and does not
affect correctness.)

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs/ceph/inode.c')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 0a49ffde5bcb..5a9f907b805e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1394,11 +1394,8 @@ static void ceph_invalidate_work(struct work_struct *work)
 	spin_lock(&inode->i_lock);
 	dout("invalidate_pages %p gen %d revoking %d\n", inode,
 	     ci->i_rdcache_gen, ci->i_rdcache_revoking);
-	if (ci->i_rdcache_gen == 0 ||
-	    ci->i_rdcache_revoking != ci->i_rdcache_gen) {
-		BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
+	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
 		/* nevermind! */
-		ci->i_rdcache_revoking = 0;
 		spin_unlock(&inode->i_lock);
 		goto out;
 	}
@@ -1408,15 +1405,16 @@ static void ceph_invalidate_work(struct work_struct *work)
 	ceph_invalidate_nondirty_pages(inode->i_mapping);
 
 	spin_lock(&inode->i_lock);
-	if (orig_gen == ci->i_rdcache_gen) {
+	if (orig_gen == ci->i_rdcache_gen &&
+	    orig_gen == ci->i_rdcache_revoking) {
 		dout("invalidate_pages %p gen %d successful\n", inode,
 		     ci->i_rdcache_gen);
-		ci->i_rdcache_gen = 0;
-		ci->i_rdcache_revoking = 0;
+		ci->i_rdcache_revoking--;
 		check = 1;
 	} else {
-		dout("invalidate_pages %p gen %d raced, gen now %d\n",
-		     inode, orig_gen, ci->i_rdcache_gen);
+		dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
+		     inode, orig_gen, ci->i_rdcache_gen,
+		     ci->i_rdcache_revoking);
 	}
 	spin_unlock(&inode->i_lock);
 
-- 
cgit v1.2.2


From 8bd59e0188c04f6540f00e13f633f22e4804ce06 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 8 Nov 2010 09:23:12 -0800
Subject: ceph: fix version check on racing inode updates

We may get updates on the same inode from multiple MDSs; generally we only
pay attention if the update is newer than what we already have.  The
exception is when an MDS sense unstable information, in which case we
always update.

The old > check got this wrong when our version was odd (e.g. 3) and the
reply version was even (e.g. 2): the older stale (v2) info would be
applied.  Fixed and clarified the comment.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs/ceph/inode.c')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5a9f907b805e..425c5b1f944e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -567,12 +567,17 @@ static int fill_inode(struct inode *inode,
 
 	/*
 	 * provided version will be odd if inode value is projected,
-	 * even if stable.  skip the update if we have a newer info
-	 * (e.g., due to inode info racing form multiple MDSs), or if
-	 * we are getting projected (unstable) inode info.
+	 * even if stable.  skip the update if we have newer stable
+	 * info (ours>=theirs, e.g. due to racing mds replies), unless
+	 * we are getting projected (unstable) info (in which case the
+	 * version is odd, and we want ours>theirs).
+	 *   us   them
+	 *   2    2     skip
+	 *   3    2     skip
+	 *   3    3     update
 	 */
 	if (le64_to_cpu(info->version) > 0 &&
-	    (ci->i_version & ~1) > le64_to_cpu(info->version))
+	    (ci->i_version & ~1) >= le64_to_cpu(info->version))
 		goto no_change;
 
 	issued = __ceph_caps_issued(ci, &implemented);
-- 
cgit v1.2.2


From d8672d64b88cdb7aa8139fb6d218f40b8cbf60af Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 8 Nov 2010 09:24:34 -0800
Subject: ceph: fix update of ctime from MDS

The client can have a newer ctime than the MDS due to AUTH_EXCL and
XATTR_EXCL caps as well; update the check in ceph_fill_file_time
appropriately.

This fixes cases where ctime/mtime goes backward under the right sequence
of local updates (e.g. chmod) and mds replies (e.g. subsequent stat that
goes to the MDS).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs/ceph/inode.c')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 425c5b1f944e..7bc0fbd26af2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -471,7 +471,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
 
 	if (issued & (CEPH_CAP_FILE_EXCL|
 		      CEPH_CAP_FILE_WR|
-		      CEPH_CAP_FILE_BUFFER)) {
+		      CEPH_CAP_FILE_BUFFER|
+		      CEPH_CAP_AUTH_EXCL|
+		      CEPH_CAP_XATTR_EXCL)) {
 		if (timespec_compare(ctime, &inode->i_ctime) > 0) {
 			dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
 			     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -511,7 +513,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
 			warn = 1;
 		}
 	} else {
-		/* we have no write caps; whatever the MDS says is true */
+		/* we have no write|excl caps; whatever the MDS says is true */
 		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
 			inode->i_ctime = *ctime;
 			inode->i_mtime = *mtime;
-- 
cgit v1.2.2


From b7495fc2ff941db6a118a93ab8d61149e3f4cef8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 9 Nov 2010 12:43:12 -0800
Subject: ceph: make page alignment explicit in osd interface

We used to infer alignment of IOs within a page based on the file offset,
which assumed they matched.  This broke with direct IO that was not aligned
to pages (e.g., 512-byte aligned IO).  We were also trusting the alignment
specified in the OSD reply, which could have been adjusted by the server.

Explicitly specify the page alignment when setting up OSD IO requests.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ceph/inode.c')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7bc0fbd26af2..8153ee5a8d74 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1752,7 +1752,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
 		return 0;
 	}
 
-	dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
+	dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
 	if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
 		return 0;
 
-- 
cgit v1.2.2