aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
author: Jim Schutt <jaschut@sandia.gov> 2012-11-30 11:15:25 -0500
committer: Alex Elder <elder@inktank.com> 2013-01-17 13:42:39 -0500
commit: 1604f488ac2dcce33c8218e75a000e8c5fb57e61 (patch)
tree: 084b399c1c9be245e62543a024d727241f7a9ad4 /net
parent: 390306c38dd43908f7f7730229999790a773d1d5 (diff)
libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed
Add libceph support for a new CRUSH tunable recently added to Ceph servers.

Consider the CRUSH rule

    step chooseleaf firstn 0 type <node_type>

This rule means that <n> replicas will be chosen in a manner such that each chosen leaf's branch will contain a unique instance of <node_type>.

When an object is re-replicated after a leaf failure, if the CRUSH map uses a chooseleaf rule, the remapped replica ends up under the <node_type> bucket that held the failed leaf. This causes uneven data distribution across the storage cluster, to the point that when all the leaves but one fail under a particular <node_type> bucket, that remaining leaf holds all the data from its failed peers.

This behavior also limits the number of peers that can participate in the re-replication of the data held by the failed leaf, which increases the time required to re-replicate after a failure.

For a chooseleaf CRUSH rule, the tree descent has two steps: call them the inner and outer descents.

If the tree descent down to <node_type> is the outer descent, and the descent from <node_type> down to a leaf is the inner descent, the issue is that a down leaf is detected on the inner descent, so only the inner descent is retried.

In order to disperse re-replicated data as widely as possible across a storage cluster after a failure, we want to retry the outer descent. So, fix up crush_choose() to allow the inner descent to return immediately on choosing a failed leaf. Wire this up as a new CRUSH tunable.

Note that after this change, for a chooseleaf rule, if the primary OSD in a placement group has failed, choosing a replacement may result in one of the other OSDs in the PG colliding with the new primary. This requires that OSD's data for that PG be moved as well. This seems unavoidable but should be relatively rare.

This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc.

Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Sage Weil <sage@inktank.com>
Diffstat (limited to 'net')
-rw-r--r--  net/ceph/crush/mapper.c  13
-rw-r--r--  net/ceph/osdmap.c  6
2 files changed, 16 insertions, 3 deletions
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 35fce755ce10..96c8a58937db 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -287,6 +287,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
287 * @outpos: our position in that vector 287 * @outpos: our position in that vector
288 * @firstn: true if choosing "first n" items, false if choosing "indep" 288 * @firstn: true if choosing "first n" items, false if choosing "indep"
289 * @recurse_to_leaf: true if we want one device under each item of given type 289 * @recurse_to_leaf: true if we want one device under each item of given type
290 * @descend_once: true if we should only try one descent before giving up
290 * @out2: second output vector for leaf items (if @recurse_to_leaf) 291 * @out2: second output vector for leaf items (if @recurse_to_leaf)
291 */ 292 */
292static int crush_choose(const struct crush_map *map, 293static int crush_choose(const struct crush_map *map,
@@ -295,7 +296,7 @@ static int crush_choose(const struct crush_map *map,
295 int x, int numrep, int type, 296 int x, int numrep, int type,
296 int *out, int outpos, 297 int *out, int outpos,
297 int firstn, int recurse_to_leaf, 298 int firstn, int recurse_to_leaf,
298 int *out2) 299 int descend_once, int *out2)
299{ 300{
300 int rep; 301 int rep;
301 unsigned int ftotal, flocal; 302 unsigned int ftotal, flocal;
@@ -399,6 +400,7 @@ static int crush_choose(const struct crush_map *map,
399 x, outpos+1, 0, 400 x, outpos+1, 0,
400 out2, outpos, 401 out2, outpos,
401 firstn, 0, 402 firstn, 0,
403 map->chooseleaf_descend_once,
402 NULL) <= outpos) 404 NULL) <= outpos)
403 /* didn't get leaf */ 405 /* didn't get leaf */
404 reject = 1; 406 reject = 1;
@@ -422,7 +424,10 @@ reject:
422 ftotal++; 424 ftotal++;
423 flocal++; 425 flocal++;
424 426
425 if (collide && flocal <= map->choose_local_tries) 427 if (reject && descend_once)
428 /* let outer call try again */
429 skip_rep = 1;
430 else if (collide && flocal <= map->choose_local_tries)
426 /* retry locally a few times */ 431 /* retry locally a few times */
427 retry_bucket = 1; 432 retry_bucket = 1;
428 else if (map->choose_local_fallback_tries > 0 && 433 else if (map->choose_local_fallback_tries > 0 &&
@@ -485,6 +490,7 @@ int crush_do_rule(const struct crush_map *map,
485 int i, j; 490 int i, j;
486 int numrep; 491 int numrep;
487 int firstn; 492 int firstn;
493 const int descend_once = 0;
488 494
489 if ((__u32)ruleno >= map->max_rules) { 495 if ((__u32)ruleno >= map->max_rules) {
490 dprintk(" bad ruleno %d\n", ruleno); 496 dprintk(" bad ruleno %d\n", ruleno);
@@ -544,7 +550,8 @@ int crush_do_rule(const struct crush_map *map,
544 curstep->arg2, 550 curstep->arg2,
545 o+osize, j, 551 o+osize, j,
546 firstn, 552 firstn,
547 recurse_to_leaf, c+osize); 553 recurse_to_leaf,
554 descend_once, c+osize);
548 } 555 }
549 556
550 if (recurse_to_leaf) 557 if (recurse_to_leaf)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index de73214b5d26..ca05871635bc 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -170,6 +170,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
170 c->choose_local_tries = 2; 170 c->choose_local_tries = 2;
171 c->choose_local_fallback_tries = 5; 171 c->choose_local_fallback_tries = 5;
172 c->choose_total_tries = 19; 172 c->choose_total_tries = 19;
173 c->chooseleaf_descend_once = 0;
173 174
174 ceph_decode_need(p, end, 4*sizeof(u32), bad); 175 ceph_decode_need(p, end, 4*sizeof(u32), bad);
175 magic = ceph_decode_32(p); 176 magic = ceph_decode_32(p);
@@ -336,6 +337,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
336 dout("crush decode tunable choose_total_tries = %d", 337 dout("crush decode tunable choose_total_tries = %d",
337 c->choose_total_tries); 338 c->choose_total_tries);
338 339
340 ceph_decode_need(p, end, sizeof(u32), done);
341 c->chooseleaf_descend_once = ceph_decode_32(p);
342 dout("crush decode tunable chooseleaf_descend_once = %d",
343 c->chooseleaf_descend_once);
344
339done: 345done:
340 dout("crush_decode success\n"); 346 dout("crush_decode success\n");
341 return c; 347 return c;