about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Ilya Dryomov <idryomov@gmail.com> 2016-01-31 08:36:07 -0500
committer Ilya Dryomov <idryomov@gmail.com> 2016-02-04 12:25:55 -0500
commit dc6ae6d8e7726bad4f1c87244b49cac851746c65 (patch)
tree 3782e77f13157c488cd9a49db5726e7411f0a2d7
parent 56a4f3091dceb7dfc14dc3ef1d5f59fe39ba4447 (diff)
crush: add chooseleaf_stable tunable
Add a tunable to fix the bug that chooseleaf may cause unnecessary pg migrations when some device fails.

Reflects ceph.git commit fdb3f664448e80d984470f32f04e2e6f03ab52ec.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
-rw-r--r-- include/linux/crush/crush.h 8
-rw-r--r-- net/ceph/crush/mapper.c 18
2 files changed, 21 insertions, 5 deletions
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 48b49305716b..be8f12b8f195 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -59,7 +59,8 @@ enum {
59 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ 59 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
60 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, 60 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
61 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, 61 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
62 CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 62 CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
63 CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
63}; 64};
64 65
65/* 66/*
@@ -205,6 +206,11 @@ struct crush_map {
205 * mappings line up a bit better with previous mappings. */ 206 * mappings line up a bit better with previous mappings. */
206 __u8 chooseleaf_vary_r; 207 __u8 chooseleaf_vary_r;
207 208
209 /* if true, it makes chooseleaf firstn to return stable results (if
210 * no local retry) so that data migrations would be optimal when some
211 * device fails. */
212 __u8 chooseleaf_stable;
213
208#ifndef __KERNEL__ 214#ifndef __KERNEL__
209 /* 215 /*
210 * version 0 (original) of straw_calc has various flaws. version 1 216 * version 0 (original) of straw_calc has various flaws. version 1
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index abb700621e4a..5fcfb98f309e 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map,
403 * @local_retries: localized retries 403 * @local_retries: localized retries
404 * @local_fallback_retries: localized fallback retries 404 * @local_fallback_retries: localized fallback retries
405 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 405 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
406 * @stable: stable mode starts rep=0 in the recursive call for all replicas
406 * @vary_r: pass r to recursive calls 407 * @vary_r: pass r to recursive calls
407 * @out2: second output vector for leaf items (if @recurse_to_leaf) 408 * @out2: second output vector for leaf items (if @recurse_to_leaf)
408 * @parent_r: r value passed from the parent 409 * @parent_r: r value passed from the parent
@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map,
419 unsigned int local_fallback_retries, 420 unsigned int local_fallback_retries,
420 int recurse_to_leaf, 421 int recurse_to_leaf,
421 unsigned int vary_r, 422 unsigned int vary_r,
423 unsigned int stable,
422 int *out2, 424 int *out2,
423 int parent_r) 425 int parent_r)
424{ 426{
@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map,
433 int collide, reject; 435 int collide, reject;
434 int count = out_size; 436 int count = out_size;
435 437
436 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", 438 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
437 recurse_to_leaf ? "_LEAF" : "", 439 recurse_to_leaf ? "_LEAF" : "",
438 bucket->id, x, outpos, numrep, 440 bucket->id, x, outpos, numrep,
439 tries, recurse_tries, local_retries, local_fallback_retries, 441 tries, recurse_tries, local_retries, local_fallback_retries,
440 parent_r); 442 parent_r, stable);
441 443
442 for (rep = outpos; rep < numrep && count > 0 ; rep++) { 444 for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
443 /* keep trying until we get a non-out, non-colliding item */ 445 /* keep trying until we get a non-out, non-colliding item */
444 ftotal = 0; 446 ftotal = 0;
445 skip_rep = 0; 447 skip_rep = 0;
@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map,
512 if (crush_choose_firstn(map, 514 if (crush_choose_firstn(map,
513 map->buckets[-1-item], 515 map->buckets[-1-item],
514 weight, weight_max, 516 weight, weight_max,
515 x, outpos+1, 0, 517 x, stable ? 1 : outpos+1, 0,
516 out2, outpos, count, 518 out2, outpos, count,
517 recurse_tries, 0, 519 recurse_tries, 0,
518 local_retries, 520 local_retries,
519 local_fallback_retries, 521 local_fallback_retries,
520 0, 522 0,
521 vary_r, 523 vary_r,
524 stable,
522 NULL, 525 NULL,
523 sub_r) <= outpos) 526 sub_r) <= outpos)
524 /* didn't get leaf */ 527 /* didn't get leaf */
@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map,
816 int choose_local_fallback_retries = map->choose_local_fallback_tries; 819 int choose_local_fallback_retries = map->choose_local_fallback_tries;
817 820
818 int vary_r = map->chooseleaf_vary_r; 821 int vary_r = map->chooseleaf_vary_r;
822 int stable = map->chooseleaf_stable;
819 823
820 if ((__u32)ruleno >= map->max_rules) { 824 if ((__u32)ruleno >= map->max_rules) {
821 dprintk(" bad ruleno %d\n", ruleno); 825 dprintk(" bad ruleno %d\n", ruleno);
@@ -870,6 +874,11 @@ int crush_do_rule(const struct crush_map *map,
870 vary_r = curstep->arg1; 874 vary_r = curstep->arg1;
871 break; 875 break;
872 876
877 case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
878 if (curstep->arg1 >= 0)
879 stable = curstep->arg1;
880 break;
881
873 case CRUSH_RULE_CHOOSELEAF_FIRSTN: 882 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
874 case CRUSH_RULE_CHOOSE_FIRSTN: 883 case CRUSH_RULE_CHOOSE_FIRSTN:
875 firstn = 1; 884 firstn = 1;
@@ -932,6 +941,7 @@ int crush_do_rule(const struct crush_map *map,
932 choose_local_fallback_retries, 941 choose_local_fallback_retries,
933 recurse_to_leaf, 942 recurse_to_leaf,
934 vary_r, 943 vary_r,
944 stable,
935 c+osize, 945 c+osize,
936 0); 946 0);
937 } else { 947 } else {