From 6e8575faa8fa680d59404a4d58d12190667be815 Mon Sep 17 00:00:00 2001 From: Sam Lang Date: Fri, 28 Dec 2012 09:56:46 -0800 Subject: ceph: Check for created flag in response from mds The mds now sends back a created inode if the create request performed the create. If the file already existed, no inode is returned in the reply. This allows ceph to set the created flag in atomic_open so that permissions are properly checked in the case that the file wasn't created by the create call to the mds. To ensure compability with previous kernels, a feature for sending back the inode in the create reply was added, so that the mds will only send back the inode if the client indicates it supports the feature. Signed-off-by: Sam Lang Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index dad579b0c0e6..6b7c6acbb3bf 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -14,13 +14,16 @@ #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) /* bits 8-17 defined by user-space; not supported yet here */ #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) +/* bits 19-25 defined by user-space; not supported yet here */ +#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) /* * Features supported. */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ - CEPH_FEATURE_CRUSH_TUNABLES) + CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_REPLY_CREATE_INODE) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR) -- cgit v1.2.2 From 1604f488ac2dcce33c8218e75a000e8c5fb57e61 Mon Sep 17 00:00:00 2001 From: Jim Schutt Date: Fri, 30 Nov 2012 09:15:25 -0700 Subject: libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed Add libceph support for a new CRUSH tunable recently added to Ceph servers. Consider the CRUSH rule step chooseleaf firstn 0 type This rule means that replicas will be chosen in a manner such that each chosen leaf's branch will contain a unique instance of . When an object is re-replicated after a leaf failure, if the CRUSH map uses a chooseleaf rule the remapped replica ends up under the bucket that held the failed leaf. This causes uneven data distribution across the storage cluster, to the point that when all the leaves but one fail under a particular bucket, that remaining leaf holds all the data from its failed peers. This behavior also limits the number of peers that can participate in the re-replication of the data held by the failed leaf, which increases the time required to re-replicate after a failure. For a chooseleaf CRUSH rule, the tree descent has two steps: call them the inner and outer descents. If the tree descent down to is the outer descent, and the descent from down to a leaf is the inner descent, the issue is that a down leaf is detected on the inner descent, so only the inner descent is retried. In order to disperse re-replicated data as widely as possible across a storage cluster after a failure, we want to retry the outer descent. So, fix up crush_choose() to allow the inner descent to return immediately on choosing a failed leaf. Wire this up as a new CRUSH tunable. Note that after this change, for a chooseleaf rule, if the primary OSD in a placement group has failed, choosing a replacement may result in one of the other OSDs in the PG colliding with the new primary. This requires that OSD's data for that PG to need moving as well. This seems unavoidable but should be relatively rare. This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc. Signed-off-by: Jim Schutt Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 7 +++++-- include/linux/crush/crush.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 6b7c6acbb3bf..2160aab482f6 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -14,7 +14,9 @@ #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) /* bits 8-17 defined by user-space; not supported yet here */ #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) -/* bits 19-25 defined by user-space; not supported yet here */ +/* bits 19-24 defined by user-space; not supported yet here */ +#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) +/* bit 26 defined by user-space; not supported yet here */ #define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) /* @@ -22,7 +24,8 @@ */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ - CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE) #define CEPH_FEATURES_REQUIRED_DEFAULT \ diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 25baa287cff7..6a1101f24cfb 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -162,6 +162,8 @@ struct crush_map { __u32 choose_local_fallback_tries; /* choose attempts before giving up */ __u32 choose_total_tries; + /* attempt chooseleaf inner descent once; on failure retry outer descent */ + __u32 chooseleaf_descend_once; }; -- cgit v1.2.2 From dd5f049dbdf973d9bceebef1fd73647a5ede6732 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 08:39:27 -0500 Subject: ceph: define ceph_encode_8_safe() It's kind of a silly macro, but ceph_encode_8_safe() is the only one missing from an otherwise pretty complete set. It's not used, but neither are a couple of the others in this set. While in there, insert some whitespace to tidy up the alignment of the line-terminating backslashes in some of the macro definitions. Signed-off-by: Alex Elder Reviewed-by: Dan Mick --- include/linux/ceph/decode.h | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 4bbf2db45f46..cd679f2d348b 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end, size_t n) return end >= *p && n <= end - *p; } -#define ceph_decode_need(p, end, n, bad) \ - do { \ - if (!likely(ceph_has_room(p, end, n))) \ - goto bad; \ +#define ceph_decode_need(p, end, n, bad) \ + do { \ + if (!likely(ceph_has_room(p, end, n))) \ + goto bad; \ } while (0) #define ceph_decode_64_safe(p, end, v, bad) \ @@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n) * * There are two possible failures: * - converting the string would require accessing memory at or - * beyond the "end" pointer provided (-E - * - memory could not be allocated for the result + * beyond the "end" pointer provided (-ERANGE) + * - memory could not be allocated for the result (-ENOMEM) */ static inline char *ceph_extract_encoded_string(void **p, void *end, size_t *lenp, gfp_t gfp) @@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p, void *end, *p += len; } -#define ceph_encode_need(p, end, n, bad) \ - do { \ - if (!likely(ceph_has_room(p, end, n))) \ - goto bad; \ +#define ceph_encode_need(p, end, n, bad) \ + do { \ + if (!likely(ceph_has_room(p, end, n))) \ + goto bad; \ } while (0) #define ceph_encode_64_safe(p, end, v, bad) \ @@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p, void *end, #define ceph_encode_32_safe(p, end, v, bad) \ do { \ ceph_encode_need(p, end, sizeof(u32), bad); \ - ceph_encode_32(p, v); \ + ceph_encode_32(p, v); \ } while (0) #define ceph_encode_16_safe(p, end, v, bad) \ do { \ ceph_encode_need(p, end, sizeof(u16), bad); \ - ceph_encode_16(p, v); \ + ceph_encode_16(p, v); \ + } while (0) +#define ceph_encode_8_safe(p, end, v, bad) \ + do { \ + ceph_encode_need(p, end, sizeof(u8), bad); \ + ceph_encode_8(p, v); \ } while (0) #define ceph_encode_copy_safe(p, end, pv, n, bad) \ -- cgit v1.2.2 From af77f26caa35a95af09d1dac5c513b3901de7e37 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:15 -0600 Subject: rbd: drop oid parameters from ceph_osdc_build_request() The last two parameters to ceph_osd_build_request() describe the object id, but the values passed always come from the osd request structure whose address is also provided. Get rid of those last two parameters. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d9b880e977e6..f2e5d2cdca06 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -227,9 +227,7 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 *plen, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, - struct timespec *mtime, - const char *oid, - int oid_len); + struct timespec *mtime); extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, -- cgit v1.2.2 From c885837f7d4f8c4f5cb2a744cc6929bc078e9dc0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: libceph: always allow trail in osd request An osd request structure contains an optional trail portion, which if present will contain data to be passed in the payload portion of the message containing the request. The trail field is a ceph_pagelist pointer, and if null it indicates there is no trail. A ceph_pagelist structure contains a length field, and it can legitimately hold value 0. Make use of this to change the interpretation of the "trail" of an osd request so that every osd request has trailing data, it just might have length 0. This means we change the r_trail field in a ceph_osd_request structure from a pointer to a structure that is always initialized. Note that in ceph_osdc_start_request(), the trail pointer (or now address of that structure) is assigned to a ceph message's trail field. Here's why that's still OK (looking at net/ceph/messenger.c): - What would have resulted in a null pointer previously will now refer to a 0-length page list. That message trail pointer is used in two functions, write_partial_msg_pages() and out_msg_pos_next(). - In write_partial_msg_pages(), a null page list pointer is handled the same as a message with 0-length trail, and both result in a "in_trail" variable set to false. The trail pointer is only used if in_trail is true. - The only other place the message trail pointer is used is out_msg_pos_next(). That function is only called by write_partial_msg_pages() and only touches the trail pointer if the in_trail value it is passed is true. Therefore a null ceph_msg->trail pointer is equivalent to a non-null pointer referring to a 0-length page list structure. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index f2e5d2cdca06..61562c792855 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * Maximum object name size @@ -22,7 +23,6 @@ struct ceph_snap_context; struct ceph_osd_request; struct ceph_osd_client; struct ceph_authorizer; -struct ceph_pagelist; /* * completion callback for async writepages @@ -95,7 +95,7 @@ struct ceph_osd_request { struct bio *r_bio; /* instead of pages */ #endif - struct ceph_pagelist *r_trail; /* trailing part of the data */ + struct ceph_pagelist r_trail; /* trailing part of the data */ }; struct ceph_osd_event { -- cgit v1.2.2 From 0120be3c60d46d6d55f4bf7a3d654cc705eb0c54 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 09:38:19 -0600 Subject: libceph: pass length to ceph_osdc_build_request() The len argument to ceph_osdc_build_request() is set up to be passed by address, but that function never updates its value so there's no need to do this. Tighten up the interface by passing the length directly. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 61562c792855..4bfb4582439a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -224,7 +224,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * struct bio *bio); extern void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 *plen, + u64 off, u64 len, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, struct timespec *mtime); -- cgit v1.2.2 From e8afad656cbcd06d02a7bacd4b318fa0e2907de0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 09:38:19 -0600 Subject: libceph: pass length to ceph_calc_file_object_mapping() ceph_calc_file_object_mapping() takes (among other things) a "file" offset and length, and based on the layout, determines the object number ("bno") backing the affected portion of the file's data and the offset into that object where the desired range begins. It also computes the size that should be used for the request--either the amount requested or something less if that would exceed the end of the object. This patch changes the input length parameter in this function so it is used only for input. That is, the argument will be passed by value rather than by address, so the value provided won't get updated by the function. The value would only get updated if the length would surpass the current object, and in that case the value it got updated to would be exactly that returned in *oxlen. Only one of the two callers is affected by this change. Update ceph_calc_raw_layout() so it records any updated value. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osdmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 5ea57ba69320..1f653e2ff5cc 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -110,7 +110,7 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map); /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 *plen, + u64 off, u64 len, u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ -- cgit v1.2.2 From 4d6b250bf18d44571d69a0f4afec4b6a1969729f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: libceph: drop snapid in ceph_calc_raw_layout() A snapshot id must be provided to ceph_calc_raw_layout() even though it is not needed at all for calculating the layout. Where the snapshot id *is* needed is when building the request message for an osd operation. Drop the snapid parameter from ceph_calc_raw_layout() and pass that value instead in ceph_osdc_build_request(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4bfb4582439a..0e82a0a967ef 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -209,7 +209,6 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, - u64 snapid, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op); @@ -227,6 +226,7 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, + u64 snap_id, struct timespec *mtime); extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, -- cgit v1.2.2 From e75b45cf36565fd8ba206a9d80f670a86e61ba2f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:14 -0600 Subject: libceph: drop osdc from ceph_calc_raw_layout() The osdc parameter to ceph_calc_raw_layout() is not used, so get rid of it. Consequently, the corresponding parameter in calc_layout() becomes unused, so get rid of that as well. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0e82a0a967ef..fe3a6e8db1f9 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -207,8 +207,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); -extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, +extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op); -- cgit v1.2.2 From d178a9e74006e80f568d87e29f2a68f14fc7cbb1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: libceph: don't set flags in ceph_osdc_alloc_request() The only thing ceph_osdc_alloc_request() really does with the flags value it is passed is assign it to the newly-created osd request structure. Do that in the caller instead. Both callers subsequently call ceph_osdc_build_request(), so have that function (instead of ceph_osdc_alloc_request()) issue a warning if a request comes through with neither the read nor write flags set. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index fe3a6e8db1f9..6ddda5bbd1a6 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -213,7 +213,6 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, struct ceph_osd_req_op *op); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, - int flags, struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, -- cgit v1.2.2 From 54a5400721da7fa5a16cea151aade5bdfee74111 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: libceph: don't set pages or bio in ceph_osdc_alloc_request() Only one of the two callers of ceph_osdc_alloc_request() provides page or bio data for its payload. And essentially all that function was doing with those arguments was assigning them to fields in the osd request structure. Simplify ceph_osdc_alloc_request() by having the caller take care of making those assignments Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 6ddda5bbd1a6..75f56d372d44 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -216,9 +216,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, - gfp_t gfp_flags, - struct page **pages, - struct bio *bio); + gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, -- cgit v1.2.2 From ae7ca4a35b1f5df86e2c32b2cfc01a8d528c7b8c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: libceph: pass num_op with ops Both ceph_osdc_alloc_request() and ceph_osdc_build_request() are provided an array of ceph osd request operations. Rather than just passing the number of operations in the array, the caller is required append an additional zeroed operation structure to signal the end of the array. All callers know the number of operations at the time these functions are called, so drop the silly zero entry and supply that number directly. As a result, get_num_ops() is no longer needed. This also means that ceph_osdc_alloc_request() never uses its ops argument, so that can be dropped. Also rbd_create_rw_ops() no longer needs to add one to reserve room for the additional op. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 75f56d372d44..2b04d054e09d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -214,12 +214,13 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, - struct ceph_osd_req_op *ops, + unsigned int num_op, bool use_mempool, gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, + unsigned int num_op, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, -- cgit v1.2.2 From 2b5fc648af5eec2f4fe984cb6b926214e02c5cf4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 09:38:20 -0600 Subject: rbd: kill ceph_osd_req_op->flags The flags field of struct ceph_osd_req_op is never used, so just get rid of it. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2b04d054e09d..69287ccfe68a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -157,7 +157,6 @@ struct ceph_osd_client { struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ - u32 flags; /* CEPH_OSD_FLAG_* */ union { struct { u64 offset, length; -- cgit v1.2.2