From 2f56f56ad991edd51ffd0baf1182245ee1277a04 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 27 Oct 2010 20:59:49 -0700 Subject: Revert "ceph: update issue_seq on cap grant" This reverts commit d91f2438d881514e4a923fd786dbd94b764a9440. The intent of issue_seq is to distinguish between mds->client messages that (re)create the cap and those that do not, which means we should _only_ be updating that value in the create paths. By updating it in handle_cap_grant, we reset it to zero, which then breaks release. The larger question is what workload/problem made me think it should be updated here... Signed-off-by: Sage Weil --- fs/ceph/caps.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 98ab13e2b71d..6e0942f33dd8 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2273,8 +2273,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; - unsigned seq = le32_to_cpu(grant->seq); - unsigned issue_seq = le32_to_cpu(grant->issue_seq); + int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); int issued, implemented, used, wanted, dirty; u64 size = le64_to_cpu(grant->size); @@ -2286,8 +2285,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, int revoked_rdcache = 0; int queue_invalidate = 0; - dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", - inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", + inode, cap, mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); @@ -2383,7 +2382,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } cap->seq = seq; - cap->issue_seq = issue_seq; /* file layout may have changed */ ci->i_layout = grant->layout; -- cgit v1.2.2 From df9f86faf3ee610527ed02031fe7dd3c8b752e44 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 1 Nov 2010 15:49:23 -0700 Subject: ceph: fix small seq message skipping If the client gets out of sync with the server message sequence number, we normally skip low seq messages (ones we already received). The skip code was also incrementing the expected seq, such that all subsequent messages also appeared old and got skipped, and an eventual timeout on the osd connection. This resulted in some lagging requests and console messages like [233480.882885] ceph: skipping osd22 10.138.138.13:6804 seq 2016, expected 2017 [233480.882919] ceph: skipping osd22 10.138.138.13:6804 seq 2017, expected 2018 [233480.882963] ceph: skipping osd22 10.138.138.13:6804 seq 2018, expected 2019 [233480.883488] ceph: skipping osd22 10.138.138.13:6804 seq 2019, expected 2020 [233485.219558] ceph: skipping osd22 10.138.138.13:6804 seq 2020, expected 2021 [233485.906595] ceph: skipping osd22 10.138.138.13:6804 seq 2021, expected 2022 [233490.379536] ceph: skipping osd22 10.138.138.13:6804 seq 2022, expected 2023 [233495.523260] ceph: skipping osd22 10.138.138.13:6804 seq 2023, expected 2024 [233495.923194] ceph: skipping osd22 10.138.138.13:6804 seq 2024, expected 2025 [233500.534614] ceph: tid 6023602 timed out on osd22, will reset osd Reported-by: Theodore Ts'o Signed-off-by: Sage Weil --- net/ceph/messenger.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0e8157ee5d43..d379abf873bc 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1532,14 +1532,13 @@ static int read_partial_message(struct ceph_connection *con) /* verify seq# */ seq = le64_to_cpu(con->in_hdr.seq); if ((s64)seq - (s64)con->in_seq < 1) { - pr_info("skipping %s%lld %s seq %lld, expected %lld\n", + pr_info("skipping %s%lld %s seq %lld expected %lld\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), seq, con->in_seq + 1); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; return 0; } else if ((s64)seq - (s64)con->in_seq > 1) { pr_err("read_partial_message bad seq %lld expected %lld\n", -- cgit v1.2.2 From d8b16b3d1c9d8d9124d647d05797383d35e2d645 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 6 Nov 2010 12:41:16 -0700 Subject: ceph: fix bad pointer dereference in ceph_fill_trace We dereference *in a few lines down, but only set it on rename. It is apparently pretty rare for this to trigger, but I have been hitting it with a clustered MDSs. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1d6a45b5a04c..cd0432c03d2f 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1055,7 +1055,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ininfo = rinfo->targeti.in; vino.ino = le64_to_cpu(ininfo->ino); vino.snap = le64_to_cpu(ininfo->snapid); - if (!dn->d_inode) { + in = dn->d_inode; + if (!in) { in = ceph_get_inode(sb, vino); if (IS_ERR(in)) { pr_err("fill_trace bad get_inode " -- cgit v1.2.2 From 7421ab8041d98363edfb85955fa3b9849ffae366 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 7 Nov 2010 09:07:15 -0800 Subject: ceph: fix open for write on clustered mds Normally when we open a file we already have a cap, and simply update the wanted set. However, if we open a file for write, but don't have an auth cap, that doesn't work; we need to open a new cap with the auth MDS. Only reuse existing caps if we are opening for read or the existing cap is auth. Signed-off-by: Sage Weil --- fs/ceph/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e77c28cf3690..87ee944724f8 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file) } /* - * No need to block if we have any caps. Update wanted set + * No need to block if we have caps on the auth MDS (for + * write) or any MDS (for read). Update wanted set * asynchronously. */ spin_lock(&inode->i_lock); - if (__ceph_is_any_real_caps(ci)) { + if (__ceph_is_any_real_caps(ci) && + (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { int mds_wanted = __ceph_caps_mds_wanted(ci); int issued = __ceph_caps_issued(ci, NULL); -- cgit v1.2.2 From 912a9b0319a8eb9e0834b19a25e01013ab2d6a9f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 7 Nov 2010 09:37:25 -0800 Subject: ceph: only let auth caps update max_size Only the auth MDS has a meaningful max_size value for us, so only update it in fill_inode if we're being issued an auth cap. Otherwise, a random stat result from a non-auth MDS can clobber a meaningful max_size, get the client<->mds cap state out of sync, and make writes hang. Specifically, even if the client re-requests a larger max_size (which it will), the MDS won't respond because as far as it knows we already have a sufficiently large value. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index cd0432c03d2f..0a49ffde5bcb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -606,7 +606,14 @@ static int fill_inode(struct inode *inode, le32_to_cpu(info->time_warp_seq), &ctime, &mtime, &atime); - ci->i_max_size = le64_to_cpu(info->max_size); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + ci->i_layout = info->layout; inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; -- cgit v1.2.2 From feb4cc9bb433bf1491ac5ffbba133f3258dacf06 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 7 Nov 2010 09:39:00 -0800 Subject: ceph: re-request max_size if cap auth changes If the auth cap migrates to another MDS, clear requested_max_size so that we resend any pending max_size increase requests. This fixes potential hangs on writes that extend a file and race with an cap migration between MDSs. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6e0942f33dd8..04b207b0c842 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2689,6 +2689,11 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, NULL /* no caps context */); try_flush_caps(inode, session, NULL); up_read(&mdsc->snap_rwsem); + + /* make sure we re-request max_size, if necessary */ + spin_lock(&inode->i_lock); + ci->i_requested_max_size = 0; + spin_unlock(&inode->i_lock); } /* -- cgit v1.2.2 From cd045cb42a266882ac24bc21a3a8d03683c72954 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Nov 2010 11:05:05 -0700 Subject: ceph: fix rdcache_gen usage and invalidate We used to use rdcache_gen to indicate whether we "might" have cached pages. Now we just look at the mapping to determine that. However, some old behavior remains from that transition. First, rdcache_gen == 0 no longer means we have no pages. That can happen at any time (presumably when we carry FILE_CACHE). We should not reset it to zero, and we should not check that it is zero. That means that the only purpose for rdcache_revoking is to resolve races between new issues of FILE_CACHE and an async invalidate. If they are equal, we should invalidate. On success, we decrement rdcache_revoking, so that it is no longer equal to rdcache_gen. Similarly, if we success in doing a sync invalidate, set revoking = gen - 1. (This is a small optimization to avoid doing unnecessary invalidate work and does not affect correctness.) Signed-off-by: Sage Weil --- fs/ceph/caps.c | 4 ++-- fs/ceph/inode.c | 16 +++++++--------- fs/ceph/super.h | 4 +--- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 04b207b0c842..60d27bc9eb83 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode) invalidating_gen == ci->i_rdcache_gen) { /* success. */ dout("try_nonblocking_invalidate %p success\n", inode); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + /* save any racing async invalidate some trouble */ + ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } dout("try_nonblocking_invalidate %p failed\n", inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0a49ffde5bcb..5a9f907b805e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1394,11 +1394,8 @@ static void ceph_invalidate_work(struct work_struct *work) spin_lock(&inode->i_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); - if (ci->i_rdcache_gen == 0 || - ci->i_rdcache_revoking != ci->i_rdcache_gen) { - BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen); + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { /* nevermind! */ - ci->i_rdcache_revoking = 0; spin_unlock(&inode->i_lock); goto out; } @@ -1408,15 +1405,16 @@ static void ceph_invalidate_work(struct work_struct *work) ceph_invalidate_nondirty_pages(inode->i_mapping); spin_lock(&inode->i_lock); - if (orig_gen == ci->i_rdcache_gen) { + if (orig_gen == ci->i_rdcache_gen && + orig_gen == ci->i_rdcache_revoking) { dout("invalidate_pages %p gen %d successful\n", inode, ci->i_rdcache_gen); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + ci->i_rdcache_revoking--; check = 1; } else { - dout("invalidate_pages %p gen %d raced, gen now %d\n", - inode, orig_gen, ci->i_rdcache_gen); + dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", + inode, orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); } spin_unlock(&inode->i_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1886294e12f7..7f01728a4657 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -293,9 +293,7 @@ struct ceph_inode_info { int i_rd_ref, i_rdcache_ref, i_wr_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; u32 i_shared_gen; /* increment each time we get FILE_SHARED */ - u32 i_rdcache_gen; /* we increment this each time we get - FILE_CACHE. If it's non-zero, we - _may_ have cached pages. */ + u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ struct list_head i_unsafe_writes; /* uncommitted sync writes */ -- cgit v1.2.2 From cb4276cca4695670916a82e359f2e3776f0a9138 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 8 Nov 2010 07:28:52 -0800 Subject: ceph: fix uid/gid on resent mds requests MDS requests can be rebuilt and resent in non-process context, but were filling in uid/gid from current_fsuid/gid. Put that information in the request struct on request setup. This fixes incorrect (and root) uid/gid getting set for requests that are forwarded between MDSs, usually due to metadata migrations. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 7 +++++-- fs/ceph/mds_client.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3142b15940c2..d22fbbef1959 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -529,6 +529,9 @@ static void __register_request(struct ceph_mds_client *mdsc, ceph_mdsc_get_request(req); __insert_request(mdsc, req); + req->r_uid = current_fsuid(); + req->r_gid = current_fsgid(); + if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -1588,8 +1591,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(current_fsuid()); - head->caller_gid = cpu_to_le32(current_fsgid()); + head->caller_uid = cpu_to_le32(req->r_uid); + head->caller_gid = cpu_to_le32(req->r_gid); head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index d66d63c72355..9341fd4f1432 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -170,6 +170,8 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ + uid_t r_uid; + gid_t r_gid; /* for choosing which mds to send this request to */ int r_direct_mode; -- cgit v1.2.2 From 8bd59e0188c04f6540f00e13f633f22e4804ce06 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 8 Nov 2010 09:23:12 -0800 Subject: ceph: fix version check on racing inode updates We may get updates on the same inode from multiple MDSs; generally we only pay attention if the update is newer than what we already have. The exception is when an MDS sense unstable information, in which case we always update. The old > check got this wrong when our version was odd (e.g. 3) and the reply version was even (e.g. 2): the older stale (v2) info would be applied. Fixed and clarified the comment. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5a9f907b805e..425c5b1f944e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -567,12 +567,17 @@ static int fill_inode(struct inode *inode, /* * provided version will be odd if inode value is projected, - * even if stable. skip the update if we have a newer info - * (e.g., due to inode info racing form multiple MDSs), or if - * we are getting projected (unstable) inode info. + * even if stable. skip the update if we have newer stable + * info (ours>=theirs, e.g. due to racing mds replies), unless + * we are getting projected (unstable) info (in which case the + * version is odd, and we want ours>theirs). + * us them + * 2 2 skip + * 3 2 skip + * 3 3 update */ if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) > le64_to_cpu(info->version)) + (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; issued = __ceph_caps_issued(ci, &implemented); -- cgit v1.2.2 From d8672d64b88cdb7aa8139fb6d218f40b8cbf60af Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 8 Nov 2010 09:24:34 -0800 Subject: ceph: fix update of ctime from MDS The client can have a newer ctime than the MDS due to AUTH_EXCL and XATTR_EXCL caps as well; update the check in ceph_fill_file_time appropriately. This fixes cases where ctime/mtime goes backward under the right sequence of local updates (e.g. chmod) and mds replies (e.g. subsequent stat that goes to the MDS). Signed-off-by: Sage Weil --- fs/ceph/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 425c5b1f944e..7bc0fbd26af2 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -471,7 +471,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, if (issued & (CEPH_CAP_FILE_EXCL| CEPH_CAP_FILE_WR| - CEPH_CAP_FILE_BUFFER)) { + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { if (timespec_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, @@ -511,7 +513,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, warn = 1; } } else { - /* we have no write caps; whatever the MDS says is true */ + /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { inode->i_ctime = *ctime; inode->i_mtime = *mtime; -- cgit v1.2.2 From e98b6fed84d0f0155d7b398e0dfeac74c792f2d0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Nov 2010 12:24:53 -0800 Subject: ceph: fix comment, remove extraneous args The offset/length arguments aren't used. Signed-off-by: Sage Weil --- fs/ceph/file.c | 20 +++++++++----------- include/linux/ceph/libceph.h | 3 +-- net/ceph/pagevec.c | 3 +-- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 87ee944724f8..603fd00af0a6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -376,21 +376,19 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, dout("sync_read on file %p %llu~%u %s\n", file, off, len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, off, len); - - /* - * flush any page cache pages in this range. this - * will make concurrent normal and O_DIRECT io slow, - * but it will at least behave sensibly when they are - * in sequence. - */ - } else { + if (file->f_flags & O_DIRECT) + pages = ceph_get_direct_page_vector(data, num_pages); + else pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); - } if (IS_ERR(pages)) return PTR_ERR(pages); + /* + * flush any page cache pages in this range. this + * will make concurrent normal and sync io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ ret = filemap_write_and_wait(inode->i_mapping); if (ret < 0) goto done; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index f22b2e941686..9e76d35670d2 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -227,8 +227,7 @@ extern int ceph_open_session(struct ceph_client *client); extern void ceph_release_page_vector(struct page **pages, int num_pages); extern struct page **ceph_get_direct_page_vector(const char __user *data, - int num_pages, - loff_t off, size_t len); + int num_pages); extern void ceph_put_page_vector(struct page **pages, int num_pages); extern void ceph_release_page_vector(struct page **pages, int num_pages); extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 54caf0687155..ac34feeb2b3a 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -13,8 +13,7 @@ * build a vector of user pages */ struct page **ceph_get_direct_page_vector(const char __user *data, - int num_pages, - loff_t off, size_t len) + int num_pages) { struct page **pages; int rc; -- cgit v1.2.2 From b7495fc2ff941db6a118a93ab8d61149e3f4cef8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Nov 2010 12:43:12 -0800 Subject: ceph: make page alignment explicit in osd interface We used to infer alignment of IOs within a page based on the file offset, which assumed they matched. This broke with direct IO that was not aligned to pages (e.g., 512-byte aligned IO). We were also trusting the alignment specified in the OSD reply, which could have been adjusted by the server. Explicitly specify the page alignment when setting up OSD IO requests. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 6 +++--- fs/ceph/file.c | 26 +++++++++++++++++++++----- fs/ceph/inode.c | 2 +- include/linux/ceph/osd_client.h | 7 +++++-- net/ceph/osd_client.c | 22 ++++++++++++++-------- 5 files changed, 44 insertions(+), 19 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 51bcc5ce3230..4aa857763037 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, page->index << PAGE_CACHE_SHIFT, &len, ci->i_truncate_seq, ci->i_truncate_size, - &page, 1); + &page, 1, 0); if (err == -ENOENT) err = 0; if (err < 0) { @@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, offset, &len, ci->i_truncate_seq, ci->i_truncate_size, - pages, nr_pages); + pages, nr_pages, 0); if (rc == -ENOENT) rc = 0; if (rc < 0) @@ -782,7 +782,7 @@ get_more_pages: snapc, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, true, 1); + &inode->i_mtime, true, 1, 0); max_pages = req->r_num_pages; alloc_page_vec(fsc, req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 603fd00af0a6..8d79b8912e31 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -282,11 +282,12 @@ int ceph_release(struct inode *inode, struct file *file) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof) + int *checkeof, bool align_to_pages) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; + int io_align, page_align; int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ int left, pages_left; int read; @@ -302,14 +303,19 @@ static int striped_read(struct inode *inode, page_pos = pages; pages_left = num_pages; read = 0; + io_align = off & ~PAGE_MASK; more: + if (align_to_pages) + page_align = (pos - io_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, - page_pos, pages_left); + page_pos, pages_left, page_align); hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; if (ret == -ENOENT) @@ -393,7 +399,8 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, if (ret < 0) goto done; - ret = striped_read(inode, off, len, pages, num_pages, checkeof); + ret = striped_read(inode, off, len, pages, num_pages, checkeof, + file->f_flags & O_DIRECT); if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) ret = ceph_copy_page_vector_to_user(pages, data, off, ret); @@ -448,6 +455,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int flags; int do_sync = 0; int check_caps = 0; + int page_align, io_align; int ret; struct timespec mtime = CURRENT_TIME; @@ -462,6 +470,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, else pos = *offset; + io_align = pos & ~PAGE_MASK; + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) return ret; @@ -486,20 +496,26 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, */ more: len = left; + if (file->f_flags & O_DIRECT) + /* write from beginning of first page, regardless of + io alignment */ + page_align = (pos - io_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), pos, &len, CEPH_OSD_OP_WRITE, flags, ci->i_snap_realm->cached_context, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, 2); + &mtime, false, 2, page_align); if (!req) return -ENOMEM; num_pages = calc_pages_for(pos, len); if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, pos, len); + pages = ceph_get_direct_page_vector(data, num_pages); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7bc0fbd26af2..8153ee5a8d74 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1752,7 +1752,7 @@ int ceph_do_getattr(struct inode *inode, int mask) return 0; } - dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); + dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 6c91fb032c39..a1af29648fb5 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -79,6 +79,7 @@ struct ceph_osd_request { struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ unsigned r_num_pages; /* size of page array (follows) */ + unsigned r_page_alignment; /* io offset in first page */ struct page **r_pages; /* pages for data payload */ int r_pages_from_pool; int r_own_pages; /* if true, i own page list */ @@ -194,7 +195,8 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, int do_sync, u32 truncate_seq, u64 truncate_size, struct timespec *mtime, - bool use_mempool, int num_reply); + bool use_mempool, int num_reply, + int page_align); static inline void ceph_osdc_get_request(struct ceph_osd_request *req) { @@ -218,7 +220,8 @@ extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, u64 off, u64 *plen, u32 truncate_seq, u64 truncate_size, - struct page **pages, int nr_pages); + struct page **pages, int nr_pages, + int page_align); extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 79391994b3ed..6c096239660c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -71,6 +71,7 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc, op->extent.length = objlen; } req->r_num_pages = calc_pages_for(off, *plen); + req->r_page_alignment = off & ~PAGE_MASK; if (op->op == CEPH_OSD_OP_WRITE) op->payload_len = *plen; @@ -419,7 +420,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, u32 truncate_seq, u64 truncate_size, struct timespec *mtime, - bool use_mempool, int num_reply) + bool use_mempool, int num_reply, + int page_align) { struct ceph_osd_req_op ops[3]; struct ceph_osd_request *req; @@ -447,6 +449,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, calc_layout(osdc, vino, layout, off, plen, req, ops); req->r_file_layout = *layout; /* keep a copy */ + /* in case it differs from natural alignment that calc_layout + filled in for us */ + req->r_page_alignment = page_align; + ceph_osdc_build_request(req, off, plen, ops, snapc, mtime, @@ -1489,7 +1495,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct ceph_vino vino, struct ceph_file_layout *layout, u64 off, u64 *plen, u32 truncate_seq, u64 truncate_size, - struct page **pages, int num_pages) + struct page **pages, int num_pages, int page_align) { struct ceph_osd_request *req; int rc = 0; @@ -1499,15 +1505,15 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, req = ceph_osdc_new_request(osdc, layout, vino, off, plen, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, truncate_seq, truncate_size, NULL, - false, 1); + false, 1, page_align); if (!req) return -ENOMEM; /* it may be a short read due to an object boundary */ req->r_pages = pages; - dout("readpages final extent is %llu~%llu (%d pages)\n", - off, *plen, req->r_num_pages); + dout("readpages final extent is %llu~%llu (%d pages align %d)\n", + off, *plen, req->r_num_pages, page_align); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -1533,6 +1539,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, { struct ceph_osd_request *req; int rc = 0; + int page_align = off & ~PAGE_MASK; BUG_ON(vino.snap != CEPH_NOSNAP); req = ceph_osdc_new_request(osdc, layout, vino, off, &len, @@ -1541,7 +1548,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, CEPH_OSD_FLAG_WRITE, snapc, do_sync, truncate_seq, truncate_size, mtime, - nofail, 1); + nofail, 1, page_align); if (!req) return -ENOMEM; @@ -1638,8 +1645,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - unsigned data_off = le16_to_cpu(hdr->data_off); - int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); + int want = calc_pages_for(req->r_page_alignment, data_len); if (unlikely(req->r_num_pages < want)) { pr_warning("tid %lld reply %d > expected %d pages\n", -- cgit v1.2.2 From c5c6b19d4b8f5431fca05f28ae9e141045022149 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Nov 2010 12:40:00 -0800 Subject: ceph: explicitly specify page alignment in network messages The alignment used for reading data into or out of pages used to be taken from the data_off field in the message header. This only worked as long as the page alignment matched the object offset, breaking direct io to non-page aligned offsets. Instead, explicitly specify the page alignment next to the page vector in the ceph_msg struct, and use that instead of the message header (which probably shouldn't be trusted). The alloc_msg callback is responsible for filling in this field properly when it sets up the page vector. Signed-off-by: Sage Weil --- include/linux/ceph/messenger.h | 1 + net/ceph/messenger.c | 10 +++++----- net/ceph/osd_client.c | 3 +++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 5956d62c3057..a108b425fee2 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -82,6 +82,7 @@ struct ceph_msg { struct ceph_buffer *middle; struct page **pages; /* data payload. NOT OWNER. */ unsigned nr_pages; /* size of page array */ + unsigned page_alignment; /* io offset in first page */ struct ceph_pagelist *pagelist; /* instead of pages */ struct list_head list_head; struct kref kref; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d379abf873bc..1c7a2ec4f3cc 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -540,8 +540,7 @@ static void prepare_write_message(struct ceph_connection *con) /* initialize page iterator */ con->out_msg_pos.page = 0; if (m->pages) - con->out_msg_pos.page_pos = - le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; + con->out_msg_pos.page_pos = m->page_alignment; else con->out_msg_pos.page_pos = 0; con->out_msg_pos.data_pos = 0; @@ -1491,7 +1490,7 @@ static int read_partial_message(struct ceph_connection *con) struct ceph_msg *m = con->in_msg; int ret; int to, left; - unsigned front_len, middle_len, data_len, data_off; + unsigned front_len, middle_len, data_len; int datacrc = con->msgr->nocrc; int skip; u64 seq; @@ -1527,7 +1526,6 @@ static int read_partial_message(struct ceph_connection *con) data_len = le32_to_cpu(con->in_hdr.data_len); if (data_len > CEPH_MSG_MAX_DATA_LEN) return -EIO; - data_off = le16_to_cpu(con->in_hdr.data_off); /* verify seq# */ seq = le64_to_cpu(con->in_hdr.seq); @@ -1575,7 +1573,7 @@ static int read_partial_message(struct ceph_connection *con) con->in_msg_pos.page = 0; if (m->pages) - con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; + con->in_msg_pos.page_pos = m->page_alignment; else con->in_msg_pos.page_pos = 0; con->in_msg_pos.data_pos = 0; @@ -2300,6 +2298,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) /* data */ m->nr_pages = 0; + m->page_alignment = 0; m->pages = NULL; m->pagelist = NULL; m->bio = NULL; @@ -2369,6 +2368,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, type, front_len); return NULL; } + msg->page_alignment = le16_to_cpu(hdr->data_off); } memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6c096239660c..3e20a122ffa2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -391,6 +391,8 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, req->r_request->hdr.data_len = cpu_to_le32(data_len); } + req->r_request->page_alignment = req->r_page_alignment; + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); msg_size = p - msg->front.iov_base; msg->front.iov_len = msg_size; @@ -1657,6 +1659,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } m->pages = req->r_pages; m->nr_pages = req->r_num_pages; + m->page_alignment = req->r_page_alignment; #ifdef CONFIG_BLOCK m->bio = req->r_bio; #endif -- cgit v1.2.2 From a1629c3b24f26ec1b0f534874af674a6b4c1540b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 11 Nov 2010 15:24:06 -0800 Subject: ceph: fix dangling pointer Clear fi->last_name when it's freed. The only caller is rewinddir() (or equivalent lseek). Signed-off-by: Sage Weil --- fs/ceph/dir.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e0a2dc6fcafc..1e11ed716f85 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -414,6 +414,7 @@ static void reset_readdir(struct ceph_file_info *fi) fi->last_readdir = NULL; } kfree(fi->last_name); + fi->last_name = NULL; fi->next_offset = 2; /* compensate for . and .. */ if (fi->dentry) { dput(fi->dentry); -- cgit v1.2.2 From 7b88dadc13e0004947de52df128dbd5b0754ed0a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 11 Nov 2010 16:48:59 -0800 Subject: ceph: fix frag offset for non-leftmost frags We start at offset 2 for the leftmost frag, and 0 for subsequent frags. When we reach the end (rightmost), we go back to 2. This fixes readdir on fragmented (large) directories. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1e11ed716f85..5f67728ba4d7 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -336,7 +336,10 @@ more: if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 2; + if (ceph_frag_is_rightmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; } else { rinfo = &req->r_reply_info; err = note_last_dentry(fi, -- cgit v1.2.2 From 3105c19c450ac7c18ab28c19d364b588767261b3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Nov 2010 09:15:07 -0800 Subject: ceph: fix readdir EOVERFLOW on 32-bit archs One of the readdir filldir_t callers was passing the raw ceph 64-bit ino instead of the hashed 32-bit one, producing an EOVERFLOW in the filler callback. Fix this by calling the ceph_vino_to_ino() helper to do the conversion. Reported-by: Jan Smets Tested-by: Jan Smets Signed-off-by: Sage Weil --- fs/ceph/dir.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 5f67728ba4d7..7d447af84ec4 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -358,18 +358,22 @@ more: u64 pos = ceph_make_fpos(frag, off); struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; + struct ceph_vino vino; + ino_t ino; + dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", off, off - fi->offset, rinfo->dir_nr, pos, rinfo->dir_dname_len[off - fi->offset], rinfo->dir_dname[off - fi->offset], in); BUG_ON(!in); ftype = le32_to_cpu(in->mode) >> 12; + vino.ino = le64_to_cpu(in->ino); + vino.snap = le64_to_cpu(in->snapid); + ino = ceph_vino_to_ino(vino); if (filldir(dirent, rinfo->dir_dname[off - fi->offset], rinfo->dir_dname_len[off - fi->offset], - pos, - le64_to_cpu(in->ino), - ftype) < 0) { + pos, ino, ftype) < 0) { dout("filldir stopping us...\n"); return 0; } -- cgit v1.2.2