diff options
author | Ilya Dryomov <idryomov@gmail.com> | 2016-01-31 08:36:07 -0500 |
---|---|---|
committer | Ilya Dryomov <idryomov@gmail.com> | 2016-02-04 12:25:55 -0500 |
commit | dc6ae6d8e7726bad4f1c87244b49cac851746c65 (patch) | |
tree | 3782e77f13157c488cd9a49db5726e7411f0a2d7 | |
parent | 56a4f3091dceb7dfc14dc3ef1d5f59fe39ba4447 (diff) |
crush: add chooseleaf_stable tunable
Add a tunable to fix a bug where chooseleaf may cause unnecessary PG
migrations when a device fails.
Reflects ceph.git commit fdb3f664448e80d984470f32f04e2e6f03ab52ec.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
-rw-r--r-- | include/linux/crush/crush.h | 8 | ||||
-rw-r--r-- | net/ceph/crush/mapper.c | 18 |
2 files changed, 21 insertions, 5 deletions
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 48b49305716b..be8f12b8f195 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h | |||
@@ -59,7 +59,8 @@ enum { | |||
59 | CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ | 59 | CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ |
60 | CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, | 60 | CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, |
61 | CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, | 61 | CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, |
62 | CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 | 62 | CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12, |
63 | CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13 | ||
63 | }; | 64 | }; |
64 | 65 | ||
65 | /* | 66 | /* |
@@ -205,6 +206,11 @@ struct crush_map { | |||
205 | * mappings line up a bit better with previous mappings. */ | 206 | * mappings line up a bit better with previous mappings. */ |
206 | __u8 chooseleaf_vary_r; | 207 | __u8 chooseleaf_vary_r; |
207 | 208 | ||
209 | /* if true, it makes chooseleaf firstn to return stable results (if | ||
210 | * no local retry) so that data migrations would be optimal when some | ||
211 | * device fails. */ | ||
212 | __u8 chooseleaf_stable; | ||
213 | |||
208 | #ifndef __KERNEL__ | 214 | #ifndef __KERNEL__ |
209 | /* | 215 | /* |
210 | * version 0 (original) of straw_calc has various flaws. version 1 | 216 | * version 0 (original) of straw_calc has various flaws. version 1 |
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index abb700621e4a..5fcfb98f309e 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c | |||
@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map, | |||
403 | * @local_retries: localized retries | 403 | * @local_retries: localized retries |
404 | * @local_fallback_retries: localized fallback retries | 404 | * @local_fallback_retries: localized fallback retries |
405 | * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) | 405 | * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) |
406 | * @stable: stable mode starts rep=0 in the recursive call for all replicas | ||
406 | * @vary_r: pass r to recursive calls | 407 | * @vary_r: pass r to recursive calls |
407 | * @out2: second output vector for leaf items (if @recurse_to_leaf) | 408 | * @out2: second output vector for leaf items (if @recurse_to_leaf) |
408 | * @parent_r: r value passed from the parent | 409 | * @parent_r: r value passed from the parent |
@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
419 | unsigned int local_fallback_retries, | 420 | unsigned int local_fallback_retries, |
420 | int recurse_to_leaf, | 421 | int recurse_to_leaf, |
421 | unsigned int vary_r, | 422 | unsigned int vary_r, |
423 | unsigned int stable, | ||
422 | int *out2, | 424 | int *out2, |
423 | int parent_r) | 425 | int parent_r) |
424 | { | 426 | { |
@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
433 | int collide, reject; | 435 | int collide, reject; |
434 | int count = out_size; | 436 | int count = out_size; |
435 | 437 | ||
436 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", | 438 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n", |
437 | recurse_to_leaf ? "_LEAF" : "", | 439 | recurse_to_leaf ? "_LEAF" : "", |
438 | bucket->id, x, outpos, numrep, | 440 | bucket->id, x, outpos, numrep, |
439 | tries, recurse_tries, local_retries, local_fallback_retries, | 441 | tries, recurse_tries, local_retries, local_fallback_retries, |
440 | parent_r); | 442 | parent_r, stable); |
441 | 443 | ||
442 | for (rep = outpos; rep < numrep && count > 0 ; rep++) { | 444 | for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) { |
443 | /* keep trying until we get a non-out, non-colliding item */ | 445 | /* keep trying until we get a non-out, non-colliding item */ |
444 | ftotal = 0; | 446 | ftotal = 0; |
445 | skip_rep = 0; | 447 | skip_rep = 0; |
@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
512 | if (crush_choose_firstn(map, | 514 | if (crush_choose_firstn(map, |
513 | map->buckets[-1-item], | 515 | map->buckets[-1-item], |
514 | weight, weight_max, | 516 | weight, weight_max, |
515 | x, outpos+1, 0, | 517 | x, stable ? 1 : outpos+1, 0, |
516 | out2, outpos, count, | 518 | out2, outpos, count, |
517 | recurse_tries, 0, | 519 | recurse_tries, 0, |
518 | local_retries, | 520 | local_retries, |
519 | local_fallback_retries, | 521 | local_fallback_retries, |
520 | 0, | 522 | 0, |
521 | vary_r, | 523 | vary_r, |
524 | stable, | ||
522 | NULL, | 525 | NULL, |
523 | sub_r) <= outpos) | 526 | sub_r) <= outpos) |
524 | /* didn't get leaf */ | 527 | /* didn't get leaf */ |
@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map, | |||
816 | int choose_local_fallback_retries = map->choose_local_fallback_tries; | 819 | int choose_local_fallback_retries = map->choose_local_fallback_tries; |
817 | 820 | ||
818 | int vary_r = map->chooseleaf_vary_r; | 821 | int vary_r = map->chooseleaf_vary_r; |
822 | int stable = map->chooseleaf_stable; | ||
819 | 823 | ||
820 | if ((__u32)ruleno >= map->max_rules) { | 824 | if ((__u32)ruleno >= map->max_rules) { |
821 | dprintk(" bad ruleno %d\n", ruleno); | 825 | dprintk(" bad ruleno %d\n", ruleno); |
@@ -870,6 +874,11 @@ int crush_do_rule(const struct crush_map *map, | |||
870 | vary_r = curstep->arg1; | 874 | vary_r = curstep->arg1; |
871 | break; | 875 | break; |
872 | 876 | ||
877 | case CRUSH_RULE_SET_CHOOSELEAF_STABLE: | ||
878 | if (curstep->arg1 >= 0) | ||
879 | stable = curstep->arg1; | ||
880 | break; | ||
881 | |||
873 | case CRUSH_RULE_CHOOSELEAF_FIRSTN: | 882 | case CRUSH_RULE_CHOOSELEAF_FIRSTN: |
874 | case CRUSH_RULE_CHOOSE_FIRSTN: | 883 | case CRUSH_RULE_CHOOSE_FIRSTN: |
875 | firstn = 1; | 884 | firstn = 1; |
@@ -932,6 +941,7 @@ int crush_do_rule(const struct crush_map *map, | |||
932 | choose_local_fallback_retries, | 941 | choose_local_fallback_retries, |
933 | recurse_to_leaf, | 942 | recurse_to_leaf, |
934 | vary_r, | 943 | vary_r, |
944 | stable, | ||
935 | c+osize, | 945 | c+osize, |
936 | 0); | 946 | 0); |
937 | } else { | 947 | } else { |