From d8b16b3d1c9d8d9124d647d05797383d35e2d645 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 6 Nov 2010 12:41:16 -0700 Subject: ceph: fix bad pointer dereference in ceph_fill_trace We dereference *in a few lines down, but only set it on rename. It is apparently pretty rare for this to trigger, but I have been hitting it with a clustered MDSs. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ceph/inode.c') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1d6a45b5a04c..cd0432c03d2f 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1055,7 +1055,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ininfo = rinfo->targeti.in; vino.ino = le64_to_cpu(ininfo->ino); vino.snap = le64_to_cpu(ininfo->snapid); - if (!dn->d_inode) { + in = dn->d_inode; + if (!in) { in = ceph_get_inode(sb, vino); if (IS_ERR(in)) { pr_err("fill_trace bad get_inode " -- cgit v1.2.2 From 912a9b0319a8eb9e0834b19a25e01013ab2d6a9f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 7 Nov 2010 09:37:25 -0800 Subject: ceph: only let auth caps update max_size Only the auth MDS has a meaningful max_size value for us, so only update it in fill_inode if we're being issued an auth cap. Otherwise, a random stat result from a non-auth MDS can clobber a meaningful max_size, get the client<->mds cap state out of sync, and make writes hang. Specifically, even if the client re-requests a larger max_size (which it will), the MDS won't respond because as far as it knows we already have a sufficiently large value. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/ceph/inode.c') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index cd0432c03d2f..0a49ffde5bcb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -606,7 +606,14 @@ static int fill_inode(struct inode *inode, le32_to_cpu(info->time_warp_seq), &ctime, &mtime, &atime); - ci->i_max_size = le64_to_cpu(info->max_size); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + ci->i_layout = info->layout; inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; -- cgit v1.2.2 From cd045cb42a266882ac24bc21a3a8d03683c72954 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Nov 2010 11:05:05 -0700 Subject: ceph: fix rdcache_gen usage and invalidate We used to use rdcache_gen to indicate whether we "might" have cached pages. Now we just look at the mapping to determine that. However, some old behavior remains from that transition. First, rdcache_gen == 0 no longer means we have no pages. That can happen at any time (presumably when we carry FILE_CACHE). We should not reset it to zero, and we should not check that it is zero. That means that the only purpose for rdcache_revoking is to resolve races between new issues of FILE_CACHE and an async invalidate. If they are equal, we should invalidate. On success, we decrement rdcache_revoking, so that it is no longer equal to rdcache_gen. Similarly, if we success in doing a sync invalidate, set revoking = gen - 1. (This is a small optimization to avoid doing unnecessary invalidate work and does not affect correctness.) Signed-off-by: Sage Weil --- fs/ceph/inode.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs/ceph/inode.c') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0a49ffde5bcb..5a9f907b805e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1394,11 +1394,8 @@ static void ceph_invalidate_work(struct work_struct *work) spin_lock(&inode->i_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); - if (ci->i_rdcache_gen == 0 || - ci->i_rdcache_revoking != ci->i_rdcache_gen) { - BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen); + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { /* nevermind! */ - ci->i_rdcache_revoking = 0; spin_unlock(&inode->i_lock); goto out; } @@ -1408,15 +1405,16 @@ static void ceph_invalidate_work(struct work_struct *work) ceph_invalidate_nondirty_pages(inode->i_mapping); spin_lock(&inode->i_lock); - if (orig_gen == ci->i_rdcache_gen) { + if (orig_gen == ci->i_rdcache_gen && + orig_gen == ci->i_rdcache_revoking) { dout("invalidate_pages %p gen %d successful\n", inode, ci->i_rdcache_gen); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + ci->i_rdcache_revoking--; check = 1; } else { - dout("invalidate_pages %p gen %d raced, gen now %d\n", - inode, orig_gen, ci->i_rdcache_gen); + dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", + inode, orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); } spin_unlock(&inode->i_lock); -- cgit v1.2.2 From 8bd59e0188c04f6540f00e13f633f22e4804ce06 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 8 Nov 2010 09:23:12 -0800 Subject: ceph: fix version check on racing inode updates We may get updates on the same inode from multiple MDSs; generally we only pay attention if the update is newer than what we already have. The exception is when an MDS sense unstable information, in which case we always update. The old > check got this wrong when our version was odd (e.g. 3) and the reply version was even (e.g. 2): the older stale (v2) info would be applied. Fixed and clarified the comment. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs/ceph/inode.c') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5a9f907b805e..425c5b1f944e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -567,12 +567,17 @@ static int fill_inode(struct inode *inode, /* * provided version will be odd if inode value is projected, - * even if stable. skip the update if we have a newer info - * (e.g., due to inode info racing form multiple MDSs), or if - * we are getting projected (unstable) inode info. + * even if stable. skip the update if we have newer stable + * info (ours>=theirs, e.g. due to racing mds replies), unless + * we are getting projected (unstable) info (in which case the + * version is odd, and we want ours>theirs). + * us them + * 2 2 skip + * 3 2 skip + * 3 3 update */ if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) > le64_to_cpu(info->version)) + (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; issued = __ceph_caps_issued(ci, &implemented); -- cgit v1.2.2 From d8672d64b88cdb7aa8139fb6d218f40b8cbf60af Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 8 Nov 2010 09:24:34 -0800 Subject: ceph: fix update of ctime from MDS The client can have a newer ctime than the MDS due to AUTH_EXCL and XATTR_EXCL caps as well; update the check in ceph_fill_file_time appropriately. This fixes cases where ctime/mtime goes backward under the right sequence of local updates (e.g. chmod) and mds replies (e.g. subsequent stat that goes to the MDS). Signed-off-by: Sage Weil --- fs/ceph/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/ceph/inode.c') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 425c5b1f944e..7bc0fbd26af2 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -471,7 +471,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, if (issued & (CEPH_CAP_FILE_EXCL| CEPH_CAP_FILE_WR| - CEPH_CAP_FILE_BUFFER)) { + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { if (timespec_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, @@ -511,7 +513,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, warn = 1; } } else { - /* we have no write caps; whatever the MDS says is true */ + /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { inode->i_ctime = *ctime; inode->i_mtime = *mtime; -- cgit v1.2.2 From b7495fc2ff941db6a118a93ab8d61149e3f4cef8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Nov 2010 12:43:12 -0800 Subject: ceph: make page alignment explicit in osd interface We used to infer alignment of IOs within a page based on the file offset, which assumed they matched. This broke with direct IO that was not aligned to pages (e.g., 512-byte aligned IO). We were also trusting the alignment specified in the OSD reply, which could have been adjusted by the server. Explicitly specify the page alignment when setting up OSD IO requests. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph/inode.c') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7bc0fbd26af2..8153ee5a8d74 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1752,7 +1752,7 @@ int ceph_do_getattr(struct inode *inode, int mask) return 0; } - dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); + dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; -- cgit v1.2.2