aboutsummaryrefslogtreecommitdiffstats
path: root/net/ceph/crush/mapper.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ceph/crush/mapper.c')
-rw-r--r--net/ceph/crush/mapper.c336
1 files changed, 269 insertions, 67 deletions
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index cbd06a91941c..b703790b4e44 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -189,7 +189,7 @@ static int terminal(int x)
189static int bucket_tree_choose(struct crush_bucket_tree *bucket, 189static int bucket_tree_choose(struct crush_bucket_tree *bucket,
190 int x, int r) 190 int x, int r)
191{ 191{
192 int n, l; 192 int n;
193 __u32 w; 193 __u32 w;
194 __u64 t; 194 __u64 t;
195 195
@@ -197,6 +197,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
197 n = bucket->num_nodes >> 1; 197 n = bucket->num_nodes >> 1;
198 198
199 while (!terminal(n)) { 199 while (!terminal(n)) {
200 int l;
200 /* pick point in [0, w) */ 201 /* pick point in [0, w) */
201 w = bucket->node_weights[n]; 202 w = bucket->node_weights[n];
202 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, 203 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
@@ -264,8 +265,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
264 * true if device is marked "out" (failed, fully offloaded) 265 * true if device is marked "out" (failed, fully offloaded)
265 * of the cluster 266 * of the cluster
266 */ 267 */
267static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) 268static int is_out(const struct crush_map *map,
269 const __u32 *weight, int weight_max,
270 int item, int x)
268{ 271{
272 if (item >= weight_max)
273 return 1;
269 if (weight[item] >= 0x10000) 274 if (weight[item] >= 0x10000)
270 return 0; 275 return 0;
271 if (weight[item] == 0) 276 if (weight[item] == 0)
@@ -277,7 +282,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
277} 282}
278 283
279/** 284/**
280 * crush_choose - choose numrep distinct items of given type 285 * crush_choose_firstn - choose numrep distinct items of given type
281 * @map: the crush_map 286 * @map: the crush_map
282 * @bucket: the bucket we are choose an item from 287 * @bucket: the bucket we are choose an item from
283 * @x: crush input value 288 * @x: crush input value
@@ -285,18 +290,24 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
285 * @type: the type of item to choose 290 * @type: the type of item to choose
286 * @out: pointer to output vector 291 * @out: pointer to output vector
287 * @outpos: our position in that vector 292 * @outpos: our position in that vector
288 * @firstn: true if choosing "first n" items, false if choosing "indep" 293 * @tries: number of attempts to make
289 * @recurse_to_leaf: true if we want one device under each item of given type 294 * @recurse_tries: number of attempts to have recursive chooseleaf make
290 * @descend_once: true if we should only try one descent before giving up 295 * @local_tries: localized retries
296 * @local_fallback_tries: localized fallback retries
297 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
291 * @out2: second output vector for leaf items (if @recurse_to_leaf) 298 * @out2: second output vector for leaf items (if @recurse_to_leaf)
292 */ 299 */
293static int crush_choose(const struct crush_map *map, 300static int crush_choose_firstn(const struct crush_map *map,
294 struct crush_bucket *bucket, 301 struct crush_bucket *bucket,
295 const __u32 *weight, 302 const __u32 *weight, int weight_max,
296 int x, int numrep, int type, 303 int x, int numrep, int type,
297 int *out, int outpos, 304 int *out, int outpos,
298 int firstn, int recurse_to_leaf, 305 unsigned int tries,
299 int descend_once, int *out2) 306 unsigned int recurse_tries,
307 unsigned int local_tries,
308 unsigned int local_fallback_tries,
309 int recurse_to_leaf,
310 int *out2)
300{ 311{
301 int rep; 312 int rep;
302 unsigned int ftotal, flocal; 313 unsigned int ftotal, flocal;
@@ -325,35 +336,17 @@ static int crush_choose(const struct crush_map *map,
325 collide = 0; 336 collide = 0;
326 retry_bucket = 0; 337 retry_bucket = 0;
327 r = rep; 338 r = rep;
328 if (in->alg == CRUSH_BUCKET_UNIFORM) { 339 /* r' = r + f_total */
329 /* be careful */ 340 r += ftotal;
330 if (firstn || (__u32)numrep >= in->size)
331 /* r' = r + f_total */
332 r += ftotal;
333 else if (in->size % numrep == 0)
334 /* r'=r+(n+1)*f_local */
335 r += (numrep+1) *
336 (flocal+ftotal);
337 else
338 /* r' = r + n*f_local */
339 r += numrep * (flocal+ftotal);
340 } else {
341 if (firstn)
342 /* r' = r + f_total */
343 r += ftotal;
344 else
345 /* r' = r + n*f_local */
346 r += numrep * (flocal+ftotal);
347 }
348 341
349 /* bucket choose */ 342 /* bucket choose */
350 if (in->size == 0) { 343 if (in->size == 0) {
351 reject = 1; 344 reject = 1;
352 goto reject; 345 goto reject;
353 } 346 }
354 if (map->choose_local_fallback_tries > 0 && 347 if (local_fallback_tries > 0 &&
355 flocal >= (in->size>>1) && 348 flocal >= (in->size>>1) &&
356 flocal > map->choose_local_fallback_tries) 349 flocal > local_fallback_tries)
357 item = bucket_perm_choose(in, x, r); 350 item = bucket_perm_choose(in, x, r);
358 else 351 else
359 item = crush_bucket_choose(in, x, r); 352 item = crush_bucket_choose(in, x, r);
@@ -394,13 +387,15 @@ static int crush_choose(const struct crush_map *map,
394 reject = 0; 387 reject = 0;
395 if (!collide && recurse_to_leaf) { 388 if (!collide && recurse_to_leaf) {
396 if (item < 0) { 389 if (item < 0) {
397 if (crush_choose(map, 390 if (crush_choose_firstn(map,
398 map->buckets[-1-item], 391 map->buckets[-1-item],
399 weight, 392 weight, weight_max,
400 x, outpos+1, 0, 393 x, outpos+1, 0,
401 out2, outpos, 394 out2, outpos,
402 firstn, 0, 395 recurse_tries, 0,
403 map->chooseleaf_descend_once, 396 local_tries,
397 local_fallback_tries,
398 0,
404 NULL) <= outpos) 399 NULL) <= outpos)
405 /* didn't get leaf */ 400 /* didn't get leaf */
406 reject = 1; 401 reject = 1;
@@ -414,6 +409,7 @@ static int crush_choose(const struct crush_map *map,
414 /* out? */ 409 /* out? */
415 if (itemtype == 0) 410 if (itemtype == 0)
416 reject = is_out(map, weight, 411 reject = is_out(map, weight,
412 weight_max,
417 item, x); 413 item, x);
418 else 414 else
419 reject = 0; 415 reject = 0;
@@ -424,17 +420,14 @@ reject:
424 ftotal++; 420 ftotal++;
425 flocal++; 421 flocal++;
426 422
427 if (reject && descend_once) 423 if (collide && flocal <= local_tries)
428 /* let outer call try again */
429 skip_rep = 1;
430 else if (collide && flocal <= map->choose_local_tries)
431 /* retry locally a few times */ 424 /* retry locally a few times */
432 retry_bucket = 1; 425 retry_bucket = 1;
433 else if (map->choose_local_fallback_tries > 0 && 426 else if (local_fallback_tries > 0 &&
434 flocal <= in->size + map->choose_local_fallback_tries) 427 flocal <= in->size + local_fallback_tries)
435 /* exhaustive bucket search */ 428 /* exhaustive bucket search */
436 retry_bucket = 1; 429 retry_bucket = 1;
437 else if (ftotal <= map->choose_total_tries) 430 else if (ftotal <= tries)
438 /* then retry descent */ 431 /* then retry descent */
439 retry_descent = 1; 432 retry_descent = 1;
440 else 433 else
@@ -464,21 +457,179 @@ reject:
464 457
465 458
466/** 459/**
460 * crush_choose_indep: alternative breadth-first positionally stable mapping
461 *
462 */
463static void crush_choose_indep(const struct crush_map *map,
464 struct crush_bucket *bucket,
465 const __u32 *weight, int weight_max,
466 int x, int left, int numrep, int type,
467 int *out, int outpos,
468 unsigned int tries,
469 unsigned int recurse_tries,
470 int recurse_to_leaf,
471 int *out2,
472 int parent_r)
473{
474 struct crush_bucket *in = bucket;
475 int endpos = outpos + left;
476 int rep;
477 unsigned int ftotal;
478 int r;
479 int i;
480 int item = 0;
481 int itemtype;
482 int collide;
483
484 dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
485 bucket->id, x, outpos, numrep);
486
487 /* initially my result is undefined */
488 for (rep = outpos; rep < endpos; rep++) {
489 out[rep] = CRUSH_ITEM_UNDEF;
490 if (out2)
491 out2[rep] = CRUSH_ITEM_UNDEF;
492 }
493
494 for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
495 for (rep = outpos; rep < endpos; rep++) {
496 if (out[rep] != CRUSH_ITEM_UNDEF)
497 continue;
498
499 in = bucket; /* initial bucket */
500
501 /* choose through intervening buckets */
502 for (;;) {
503 /* note: we base the choice on the position
504 * even in the nested call. that means that
505 * if the first layer chooses the same bucket
506 * in a different position, we will tend to
507 * choose a different item in that bucket.
508 * this will involve more devices in data
509 * movement and tend to distribute the load.
510 */
511 r = rep + parent_r;
512
513 /* be careful */
514 if (in->alg == CRUSH_BUCKET_UNIFORM &&
515 in->size % numrep == 0)
516 /* r'=r+(n+1)*f_total */
517 r += (numrep+1) * ftotal;
518 else
519 /* r' = r + n*f_total */
520 r += numrep * ftotal;
521
522 /* bucket choose */
523 if (in->size == 0) {
524 dprintk(" empty bucket\n");
525 break;
526 }
527
528 item = crush_bucket_choose(in, x, r);
529 if (item >= map->max_devices) {
530 dprintk(" bad item %d\n", item);
531 out[rep] = CRUSH_ITEM_NONE;
532 if (out2)
533 out2[rep] = CRUSH_ITEM_NONE;
534 left--;
535 break;
536 }
537
538 /* desired type? */
539 if (item < 0)
540 itemtype = map->buckets[-1-item]->type;
541 else
542 itemtype = 0;
543 dprintk(" item %d type %d\n", item, itemtype);
544
545 /* keep going? */
546 if (itemtype != type) {
547 if (item >= 0 ||
548 (-1-item) >= map->max_buckets) {
549 dprintk(" bad item type %d\n", type);
550 out[rep] = CRUSH_ITEM_NONE;
551 if (out2)
552 out2[rep] =
553 CRUSH_ITEM_NONE;
554 left--;
555 break;
556 }
557 in = map->buckets[-1-item];
558 continue;
559 }
560
561 /* collision? */
562 collide = 0;
563 for (i = outpos; i < endpos; i++) {
564 if (out[i] == item) {
565 collide = 1;
566 break;
567 }
568 }
569 if (collide)
570 break;
571
572 if (recurse_to_leaf) {
573 if (item < 0) {
574 crush_choose_indep(map,
575 map->buckets[-1-item],
576 weight, weight_max,
577 x, 1, numrep, 0,
578 out2, rep,
579 recurse_tries, 0,
580 0, NULL, r);
581 if (out2[rep] == CRUSH_ITEM_NONE) {
582 /* placed nothing; no leaf */
583 break;
584 }
585 } else {
586 /* we already have a leaf! */
587 out2[rep] = item;
588 }
589 }
590
591 /* out? */
592 if (itemtype == 0 &&
593 is_out(map, weight, weight_max, item, x))
594 break;
595
596 /* yay! */
597 out[rep] = item;
598 left--;
599 break;
600 }
601 }
602 }
603 for (rep = outpos; rep < endpos; rep++) {
604 if (out[rep] == CRUSH_ITEM_UNDEF) {
605 out[rep] = CRUSH_ITEM_NONE;
606 }
607 if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
608 out2[rep] = CRUSH_ITEM_NONE;
609 }
610 }
611}
612
613/**
467 * crush_do_rule - calculate a mapping with the given input and rule 614 * crush_do_rule - calculate a mapping with the given input and rule
468 * @map: the crush_map 615 * @map: the crush_map
469 * @ruleno: the rule id 616 * @ruleno: the rule id
470 * @x: hash input 617 * @x: hash input
471 * @result: pointer to result vector 618 * @result: pointer to result vector
472 * @result_max: maximum result size 619 * @result_max: maximum result size
620 * @weight: weight vector (for map leaves)
621 * @weight_max: size of weight vector
622 * @scratch: scratch vector for private use; must be >= 3 * result_max
473 */ 623 */
474int crush_do_rule(const struct crush_map *map, 624int crush_do_rule(const struct crush_map *map,
475 int ruleno, int x, int *result, int result_max, 625 int ruleno, int x, int *result, int result_max,
476 const __u32 *weight) 626 const __u32 *weight, int weight_max,
627 int *scratch)
477{ 628{
478 int result_len; 629 int result_len;
479 int a[CRUSH_MAX_SET]; 630 int *a = scratch;
480 int b[CRUSH_MAX_SET]; 631 int *b = scratch + result_max;
481 int c[CRUSH_MAX_SET]; 632 int *c = scratch + result_max*2;
482 int recurse_to_leaf; 633 int recurse_to_leaf;
483 int *w; 634 int *w;
484 int wsize = 0; 635 int wsize = 0;
@@ -489,8 +640,10 @@ int crush_do_rule(const struct crush_map *map,
489 __u32 step; 640 __u32 step;
490 int i, j; 641 int i, j;
491 int numrep; 642 int numrep;
492 int firstn; 643 int choose_tries = map->choose_total_tries;
493 const int descend_once = 0; 644 int choose_local_tries = map->choose_local_tries;
645 int choose_local_fallback_tries = map->choose_local_fallback_tries;
646 int choose_leaf_tries = 0;
494 647
495 if ((__u32)ruleno >= map->max_rules) { 648 if ((__u32)ruleno >= map->max_rules) {
496 dprintk(" bad ruleno %d\n", ruleno); 649 dprintk(" bad ruleno %d\n", ruleno);
@@ -503,29 +656,49 @@ int crush_do_rule(const struct crush_map *map,
503 o = b; 656 o = b;
504 657
505 for (step = 0; step < rule->len; step++) { 658 for (step = 0; step < rule->len; step++) {
659 int firstn = 0;
506 struct crush_rule_step *curstep = &rule->steps[step]; 660 struct crush_rule_step *curstep = &rule->steps[step];
507 661
508 firstn = 0;
509 switch (curstep->op) { 662 switch (curstep->op) {
510 case CRUSH_RULE_TAKE: 663 case CRUSH_RULE_TAKE:
511 w[0] = curstep->arg1; 664 w[0] = curstep->arg1;
512 wsize = 1; 665 wsize = 1;
513 break; 666 break;
514 667
515 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: 668 case CRUSH_RULE_SET_CHOOSE_TRIES:
669 if (curstep->arg1 > 0)
670 choose_tries = curstep->arg1;
671 break;
672
673 case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
674 if (curstep->arg1 > 0)
675 choose_leaf_tries = curstep->arg1;
676 break;
677
678 case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
679 if (curstep->arg1 > 0)
680 choose_local_tries = curstep->arg1;
681 break;
682
683 case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
684 if (curstep->arg1 > 0)
685 choose_local_fallback_tries = curstep->arg1;
686 break;
687
688 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
516 case CRUSH_RULE_CHOOSE_FIRSTN: 689 case CRUSH_RULE_CHOOSE_FIRSTN:
517 firstn = 1; 690 firstn = 1;
518 /* fall through */ 691 /* fall through */
519 case CRUSH_RULE_CHOOSE_LEAF_INDEP: 692 case CRUSH_RULE_CHOOSELEAF_INDEP:
520 case CRUSH_RULE_CHOOSE_INDEP: 693 case CRUSH_RULE_CHOOSE_INDEP:
521 if (wsize == 0) 694 if (wsize == 0)
522 break; 695 break;
523 696
524 recurse_to_leaf = 697 recurse_to_leaf =
525 curstep->op == 698 curstep->op ==
526 CRUSH_RULE_CHOOSE_LEAF_FIRSTN || 699 CRUSH_RULE_CHOOSELEAF_FIRSTN ||
527 curstep->op == 700 curstep->op ==
528 CRUSH_RULE_CHOOSE_LEAF_INDEP; 701 CRUSH_RULE_CHOOSELEAF_INDEP;
529 702
530 /* reset output */ 703 /* reset output */
531 osize = 0; 704 osize = 0;
@@ -543,22 +716,51 @@ int crush_do_rule(const struct crush_map *map,
543 continue; 716 continue;
544 } 717 }
545 j = 0; 718 j = 0;
546 osize += crush_choose(map, 719 if (firstn) {
547 map->buckets[-1-w[i]], 720 int recurse_tries;
548 weight, 721 if (choose_leaf_tries)
549 x, numrep, 722 recurse_tries =
550 curstep->arg2, 723 choose_leaf_tries;
551 o+osize, j, 724 else if (map->chooseleaf_descend_once)
552 firstn, 725 recurse_tries = 1;
553 recurse_to_leaf, 726 else
554 descend_once, c+osize); 727 recurse_tries = choose_tries;
728 osize += crush_choose_firstn(
729 map,
730 map->buckets[-1-w[i]],
731 weight, weight_max,
732 x, numrep,
733 curstep->arg2,
734 o+osize, j,
735 choose_tries,
736 recurse_tries,
737 choose_local_tries,
738 choose_local_fallback_tries,
739 recurse_to_leaf,
740 c+osize);
741 } else {
742 crush_choose_indep(
743 map,
744 map->buckets[-1-w[i]],
745 weight, weight_max,
746 x, numrep, numrep,
747 curstep->arg2,
748 o+osize, j,
749 choose_tries,
750 choose_leaf_tries ?
751 choose_leaf_tries : 1,
752 recurse_to_leaf,
753 c+osize,
754 0);
755 osize += numrep;
756 }
555 } 757 }
556 758
557 if (recurse_to_leaf) 759 if (recurse_to_leaf)
558 /* copy final _leaf_ values to output set */ 760 /* copy final _leaf_ values to output set */
559 memcpy(o, c, osize*sizeof(*o)); 761 memcpy(o, c, osize*sizeof(*o));
560 762
561 /* swap t and w arrays */ 763 /* swap o and w arrays */
562 tmp = o; 764 tmp = o;
563 o = w; 765 o = w;
564 w = tmp; 766 w = tmp;