diff options
author | Ilya Dryomov <ilya.dryomov@inktank.com> | 2014-03-19 10:58:37 -0400 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2014-04-05 00:07:26 -0400 |
commit | e2b149cc4ba00766aceb87950c6de72ea7fc8b2e (patch) | |
tree | 5f3d7b5dd55b7f75c412db786e1e6f4915ef9ed8 | |
parent | 6ed1002f368c63ef79d7f659fcb4368a90098132 (diff) |
crush: add chooseleaf_vary_r tunable
The current crush_choose_firstn code will re-use the same 'r' value for
the recursive call. That means that if we are hitting a collision or
rejection for some reason (say, an OSD that is marked out) and need to
retry, we will keep making the same (bad) choice in that recursive
selection.
Introduce a tunable that fixes that behavior by incorporating the parent
'r' value into the recursive starting point, so that a different path
will be taken in subsequent placement attempts.
Note that this was done from the get-go for the new crush_choose_indep
algorithm.
This was exposed by a user who was seeing PGs stuck in active+remapped
after reweight-by-utilization because the up set mapped to a single OSD.
Reflects ceph.git commit a8e6c9fbf88bad056dd05d3eb790e98a5e43451a.
Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
-rw-r--r-- | include/linux/crush/crush.h | 6 | ||||
-rw-r--r-- | net/ceph/crush/mapper.c | 30 |
2 files changed, 30 insertions, 6 deletions
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index acaa5615d634..75f36a6c7f67 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h | |||
@@ -173,6 +173,12 @@ struct crush_map { | |||
173 | * apply to a collision: in that case we will retry as we used | 173 | * apply to a collision: in that case we will retry as we used |
174 | * to. */ | 174 | * to. */ |
175 | __u32 chooseleaf_descend_once; | 175 | __u32 chooseleaf_descend_once; |
176 | |||
177 | /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1) | ||
178 | * bits. a value of 1 is best for new clusters. for legacy clusters | ||
179 | * that want to limit reshuffling, a value of 3 or 4 will make the | ||
180 | * mappings line up a bit better with previous mappings. */ | ||
181 | __u8 chooseleaf_vary_r; | ||
176 | }; | 182 | }; |
177 | 183 | ||
178 | 184 | ||
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index b3fb84903b30..947150cde297 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c | |||
@@ -295,7 +295,9 @@ static int is_out(const struct crush_map *map, | |||
295 | * @local_retries: localized retries | 295 | * @local_retries: localized retries |
296 | * @local_fallback_retries: localized fallback retries | 296 | * @local_fallback_retries: localized fallback retries |
297 | * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) | 297 | * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) |
298 | * @vary_r: pass r to recursive calls | ||
298 | * @out2: second output vector for leaf items (if @recurse_to_leaf) | 299 | * @out2: second output vector for leaf items (if @recurse_to_leaf) |
300 | * @parent_r: r value passed from the parent | ||
299 | */ | 301 | */ |
300 | static int crush_choose_firstn(const struct crush_map *map, | 302 | static int crush_choose_firstn(const struct crush_map *map, |
301 | struct crush_bucket *bucket, | 303 | struct crush_bucket *bucket, |
@@ -307,7 +309,9 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
307 | unsigned int local_retries, | 309 | unsigned int local_retries, |
308 | unsigned int local_fallback_retries, | 310 | unsigned int local_fallback_retries, |
309 | int recurse_to_leaf, | 311 | int recurse_to_leaf, |
310 | int *out2) | 312 | unsigned int vary_r, |
313 | int *out2, | ||
314 | int parent_r) | ||
311 | { | 315 | { |
312 | int rep; | 316 | int rep; |
313 | unsigned int ftotal, flocal; | 317 | unsigned int ftotal, flocal; |
@@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
319 | int itemtype; | 323 | int itemtype; |
320 | int collide, reject; | 324 | int collide, reject; |
321 | 325 | ||
322 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", | 326 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", |
323 | bucket->id, x, outpos, numrep); | 327 | recurse_to_leaf ? "_LEAF" : "", |
328 | bucket->id, x, outpos, numrep, | ||
329 | tries, recurse_tries, local_retries, local_fallback_retries, | ||
330 | parent_r); | ||
324 | 331 | ||
325 | for (rep = outpos; rep < numrep; rep++) { | 332 | for (rep = outpos; rep < numrep; rep++) { |
326 | /* keep trying until we get a non-out, non-colliding item */ | 333 | /* keep trying until we get a non-out, non-colliding item */ |
@@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
335 | do { | 342 | do { |
336 | collide = 0; | 343 | collide = 0; |
337 | retry_bucket = 0; | 344 | retry_bucket = 0; |
338 | r = rep; | 345 | r = rep + parent_r; |
339 | /* r' = r + f_total */ | 346 | /* r' = r + f_total */ |
340 | r += ftotal; | 347 | r += ftotal; |
341 | 348 | ||
@@ -387,6 +394,11 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
387 | reject = 0; | 394 | reject = 0; |
388 | if (!collide && recurse_to_leaf) { | 395 | if (!collide && recurse_to_leaf) { |
389 | if (item < 0) { | 396 | if (item < 0) { |
397 | int sub_r; | ||
398 | if (vary_r) | ||
399 | sub_r = r >> (vary_r-1); | ||
400 | else | ||
401 | sub_r = 0; | ||
390 | if (crush_choose_firstn(map, | 402 | if (crush_choose_firstn(map, |
391 | map->buckets[-1-item], | 403 | map->buckets[-1-item], |
392 | weight, weight_max, | 404 | weight, weight_max, |
@@ -396,7 +408,9 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
396 | local_retries, | 408 | local_retries, |
397 | local_fallback_retries, | 409 | local_fallback_retries, |
398 | 0, | 410 | 0, |
399 | NULL) <= outpos) | 411 | vary_r, |
412 | NULL, | ||
413 | sub_r) <= outpos) | ||
400 | /* didn't get leaf */ | 414 | /* didn't get leaf */ |
401 | reject = 1; | 415 | reject = 1; |
402 | } else { | 416 | } else { |
@@ -653,6 +667,8 @@ int crush_do_rule(const struct crush_map *map, | |||
653 | int choose_local_retries = map->choose_local_tries; | 667 | int choose_local_retries = map->choose_local_tries; |
654 | int choose_local_fallback_retries = map->choose_local_fallback_tries; | 668 | int choose_local_fallback_retries = map->choose_local_fallback_tries; |
655 | 669 | ||
670 | int vary_r = map->chooseleaf_vary_r; | ||
671 | |||
656 | if ((__u32)ruleno >= map->max_rules) { | 672 | if ((__u32)ruleno >= map->max_rules) { |
657 | dprintk(" bad ruleno %d\n", ruleno); | 673 | dprintk(" bad ruleno %d\n", ruleno); |
658 | return 0; | 674 | return 0; |
@@ -745,7 +761,9 @@ int crush_do_rule(const struct crush_map *map, | |||
745 | choose_local_retries, | 761 | choose_local_retries, |
746 | choose_local_fallback_retries, | 762 | choose_local_fallback_retries, |
747 | recurse_to_leaf, | 763 | recurse_to_leaf, |
748 | c+osize); | 764 | vary_r, |
765 | c+osize, | ||
766 | 0); | ||
749 | } else { | 767 | } else { |
750 | crush_choose_indep( | 768 | crush_choose_indep( |
751 | map, | 769 | map, |