aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIlya Dryomov <ilya.dryomov@inktank.com>2014-03-19 10:58:37 -0400
committerSage Weil <sage@inktank.com>2014-04-05 00:07:26 -0400
commite2b149cc4ba00766aceb87950c6de72ea7fc8b2e (patch)
tree5f3d7b5dd55b7f75c412db786e1e6f4915ef9ed8
parent6ed1002f368c63ef79d7f659fcb4368a90098132 (diff)
crush: add chooseleaf_vary_r tunable
The current crush_choose_firstn code will re-use the same 'r' value for the recursive call. That means that if we are hitting a collision or rejection for some reason (say, an OSD that is marked out) and need to retry, we will keep making the same (bad) choice in that recursive selection. Introduce a tunable that fixes that behavior by incorporating the parent 'r' value into the recursive starting point, so that a different path will be taken in subsequent placement attempts. Note that this was done from the get-go for the new crush_choose_indep algorithm. This was exposed by a user who was seeing PGs stuck in active+remapped after reweight-by-utilization because the up set mapped to a single OSD. Reflects ceph.git commit a8e6c9fbf88bad056dd05d3eb790e98a5e43451a. Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com> Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
-rw-r--r--include/linux/crush/crush.h6
-rw-r--r--net/ceph/crush/mapper.c30
2 files changed, 30 insertions, 6 deletions
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index acaa5615d634..75f36a6c7f67 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -173,6 +173,12 @@ struct crush_map {
173 * apply to a collision: in that case we will retry as we used 173 * apply to a collision: in that case we will retry as we used
174 * to. */ 174 * to. */
175 __u32 chooseleaf_descend_once; 175 __u32 chooseleaf_descend_once;
176
177 /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
178 * bits. a value of 1 is best for new clusters. for legacy clusters
179 * that want to limit reshuffling, a value of 3 or 4 will make the
180 * mappings line up a bit better with previous mappings. */
181 __u8 chooseleaf_vary_r;
176}; 182};
177 183
178 184
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b3fb84903b30..947150cde297 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -295,7 +295,9 @@ static int is_out(const struct crush_map *map,
295 * @local_retries: localized retries 295 * @local_retries: localized retries
296 * @local_fallback_retries: localized fallback retries 296 * @local_fallback_retries: localized fallback retries
297 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 297 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
298 * @vary_r: pass r to recursive calls
298 * @out2: second output vector for leaf items (if @recurse_to_leaf) 299 * @out2: second output vector for leaf items (if @recurse_to_leaf)
300 * @parent_r: r value passed from the parent
299 */ 301 */
300static int crush_choose_firstn(const struct crush_map *map, 302static int crush_choose_firstn(const struct crush_map *map,
301 struct crush_bucket *bucket, 303 struct crush_bucket *bucket,
@@ -307,7 +309,9 @@ static int crush_choose_firstn(const struct crush_map *map,
307 unsigned int local_retries, 309 unsigned int local_retries,
308 unsigned int local_fallback_retries, 310 unsigned int local_fallback_retries,
309 int recurse_to_leaf, 311 int recurse_to_leaf,
310 int *out2) 312 unsigned int vary_r,
313 int *out2,
314 int parent_r)
311{ 315{
312 int rep; 316 int rep;
313 unsigned int ftotal, flocal; 317 unsigned int ftotal, flocal;
@@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map,
319 int itemtype; 323 int itemtype;
320 int collide, reject; 324 int collide, reject;
321 325
322 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 326 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
323 bucket->id, x, outpos, numrep); 327 recurse_to_leaf ? "_LEAF" : "",
328 bucket->id, x, outpos, numrep,
329 tries, recurse_tries, local_retries, local_fallback_retries,
330 parent_r);
324 331
325 for (rep = outpos; rep < numrep; rep++) { 332 for (rep = outpos; rep < numrep; rep++) {
326 /* keep trying until we get a non-out, non-colliding item */ 333 /* keep trying until we get a non-out, non-colliding item */
@@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map,
335 do { 342 do {
336 collide = 0; 343 collide = 0;
337 retry_bucket = 0; 344 retry_bucket = 0;
338 r = rep; 345 r = rep + parent_r;
339 /* r' = r + f_total */ 346 /* r' = r + f_total */
340 r += ftotal; 347 r += ftotal;
341 348
@@ -387,6 +394,11 @@ static int crush_choose_firstn(const struct crush_map *map,
387 reject = 0; 394 reject = 0;
388 if (!collide && recurse_to_leaf) { 395 if (!collide && recurse_to_leaf) {
389 if (item < 0) { 396 if (item < 0) {
397 int sub_r;
398 if (vary_r)
399 sub_r = r >> (vary_r-1);
400 else
401 sub_r = 0;
390 if (crush_choose_firstn(map, 402 if (crush_choose_firstn(map,
391 map->buckets[-1-item], 403 map->buckets[-1-item],
392 weight, weight_max, 404 weight, weight_max,
@@ -396,7 +408,9 @@ static int crush_choose_firstn(const struct crush_map *map,
396 local_retries, 408 local_retries,
397 local_fallback_retries, 409 local_fallback_retries,
398 0, 410 0,
399 NULL) <= outpos) 411 vary_r,
412 NULL,
413 sub_r) <= outpos)
400 /* didn't get leaf */ 414 /* didn't get leaf */
401 reject = 1; 415 reject = 1;
402 } else { 416 } else {
@@ -653,6 +667,8 @@ int crush_do_rule(const struct crush_map *map,
653 int choose_local_retries = map->choose_local_tries; 667 int choose_local_retries = map->choose_local_tries;
654 int choose_local_fallback_retries = map->choose_local_fallback_tries; 668 int choose_local_fallback_retries = map->choose_local_fallback_tries;
655 669
670 int vary_r = map->chooseleaf_vary_r;
671
656 if ((__u32)ruleno >= map->max_rules) { 672 if ((__u32)ruleno >= map->max_rules) {
657 dprintk(" bad ruleno %d\n", ruleno); 673 dprintk(" bad ruleno %d\n", ruleno);
658 return 0; 674 return 0;
@@ -745,7 +761,9 @@ int crush_do_rule(const struct crush_map *map,
745 choose_local_retries, 761 choose_local_retries,
746 choose_local_fallback_retries, 762 choose_local_fallback_retries,
747 recurse_to_leaf, 763 recurse_to_leaf,
748 c+osize); 764 vary_r,
765 c+osize,
766 0);
749 } else { 767 } else {
750 crush_choose_indep( 768 crush_choose_indep(
751 map, 769 map,