diff options
Diffstat (limited to 'net/ceph/crush/mapper.c')
-rw-r--r-- | net/ceph/crush/mapper.c | 336 |
1 files changed, 269 insertions, 67 deletions
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index cbd06a91941c..b703790b4e44 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c | |||
@@ -189,7 +189,7 @@ static int terminal(int x) | |||
189 | static int bucket_tree_choose(struct crush_bucket_tree *bucket, | 189 | static int bucket_tree_choose(struct crush_bucket_tree *bucket, |
190 | int x, int r) | 190 | int x, int r) |
191 | { | 191 | { |
192 | int n, l; | 192 | int n; |
193 | __u32 w; | 193 | __u32 w; |
194 | __u64 t; | 194 | __u64 t; |
195 | 195 | ||
@@ -197,6 +197,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, | |||
197 | n = bucket->num_nodes >> 1; | 197 | n = bucket->num_nodes >> 1; |
198 | 198 | ||
199 | while (!terminal(n)) { | 199 | while (!terminal(n)) { |
200 | int l; | ||
200 | /* pick point in [0, w) */ | 201 | /* pick point in [0, w) */ |
201 | w = bucket->node_weights[n]; | 202 | w = bucket->node_weights[n]; |
202 | t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, | 203 | t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, |
@@ -264,8 +265,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) | |||
264 | * true if device is marked "out" (failed, fully offloaded) | 265 | * true if device is marked "out" (failed, fully offloaded) |
265 | * of the cluster | 266 | * of the cluster |
266 | */ | 267 | */ |
267 | static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) | 268 | static int is_out(const struct crush_map *map, |
269 | const __u32 *weight, int weight_max, | ||
270 | int item, int x) | ||
268 | { | 271 | { |
272 | if (item >= weight_max) | ||
273 | return 1; | ||
269 | if (weight[item] >= 0x10000) | 274 | if (weight[item] >= 0x10000) |
270 | return 0; | 275 | return 0; |
271 | if (weight[item] == 0) | 276 | if (weight[item] == 0) |
@@ -277,7 +282,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in | |||
277 | } | 282 | } |
278 | 283 | ||
279 | /** | 284 | /** |
280 | * crush_choose - choose numrep distinct items of given type | 285 | * crush_choose_firstn - choose numrep distinct items of given type |
281 | * @map: the crush_map | 286 | * @map: the crush_map |
282 | * @bucket: the bucket we are choose an item from | 287 | * @bucket: the bucket we are choose an item from |
283 | * @x: crush input value | 288 | * @x: crush input value |
@@ -285,18 +290,24 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in | |||
285 | * @type: the type of item to choose | 290 | * @type: the type of item to choose |
286 | * @out: pointer to output vector | 291 | * @out: pointer to output vector |
287 | * @outpos: our position in that vector | 292 | * @outpos: our position in that vector |
288 | * @firstn: true if choosing "first n" items, false if choosing "indep" | 293 | * @tries: number of attempts to make |
289 | * @recurse_to_leaf: true if we want one device under each item of given type | 294 | * @recurse_tries: number of attempts to have recursive chooseleaf make |
290 | * @descend_once: true if we should only try one descent before giving up | 295 | * @local_tries: localized retries |
296 | * @local_fallback_tries: localized fallback retries | ||
297 | * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) | ||
291 | * @out2: second output vector for leaf items (if @recurse_to_leaf) | 298 | * @out2: second output vector for leaf items (if @recurse_to_leaf) |
292 | */ | 299 | */ |
293 | static int crush_choose(const struct crush_map *map, | 300 | static int crush_choose_firstn(const struct crush_map *map, |
294 | struct crush_bucket *bucket, | 301 | struct crush_bucket *bucket, |
295 | const __u32 *weight, | 302 | const __u32 *weight, int weight_max, |
296 | int x, int numrep, int type, | 303 | int x, int numrep, int type, |
297 | int *out, int outpos, | 304 | int *out, int outpos, |
298 | int firstn, int recurse_to_leaf, | 305 | unsigned int tries, |
299 | int descend_once, int *out2) | 306 | unsigned int recurse_tries, |
307 | unsigned int local_tries, | ||
308 | unsigned int local_fallback_tries, | ||
309 | int recurse_to_leaf, | ||
310 | int *out2) | ||
300 | { | 311 | { |
301 | int rep; | 312 | int rep; |
302 | unsigned int ftotal, flocal; | 313 | unsigned int ftotal, flocal; |
@@ -325,35 +336,17 @@ static int crush_choose(const struct crush_map *map, | |||
325 | collide = 0; | 336 | collide = 0; |
326 | retry_bucket = 0; | 337 | retry_bucket = 0; |
327 | r = rep; | 338 | r = rep; |
328 | if (in->alg == CRUSH_BUCKET_UNIFORM) { | 339 | /* r' = r + f_total */ |
329 | /* be careful */ | 340 | r += ftotal; |
330 | if (firstn || (__u32)numrep >= in->size) | ||
331 | /* r' = r + f_total */ | ||
332 | r += ftotal; | ||
333 | else if (in->size % numrep == 0) | ||
334 | /* r'=r+(n+1)*f_local */ | ||
335 | r += (numrep+1) * | ||
336 | (flocal+ftotal); | ||
337 | else | ||
338 | /* r' = r + n*f_local */ | ||
339 | r += numrep * (flocal+ftotal); | ||
340 | } else { | ||
341 | if (firstn) | ||
342 | /* r' = r + f_total */ | ||
343 | r += ftotal; | ||
344 | else | ||
345 | /* r' = r + n*f_local */ | ||
346 | r += numrep * (flocal+ftotal); | ||
347 | } | ||
348 | 341 | ||
349 | /* bucket choose */ | 342 | /* bucket choose */ |
350 | if (in->size == 0) { | 343 | if (in->size == 0) { |
351 | reject = 1; | 344 | reject = 1; |
352 | goto reject; | 345 | goto reject; |
353 | } | 346 | } |
354 | if (map->choose_local_fallback_tries > 0 && | 347 | if (local_fallback_tries > 0 && |
355 | flocal >= (in->size>>1) && | 348 | flocal >= (in->size>>1) && |
356 | flocal > map->choose_local_fallback_tries) | 349 | flocal > local_fallback_tries) |
357 | item = bucket_perm_choose(in, x, r); | 350 | item = bucket_perm_choose(in, x, r); |
358 | else | 351 | else |
359 | item = crush_bucket_choose(in, x, r); | 352 | item = crush_bucket_choose(in, x, r); |
@@ -394,13 +387,15 @@ static int crush_choose(const struct crush_map *map, | |||
394 | reject = 0; | 387 | reject = 0; |
395 | if (!collide && recurse_to_leaf) { | 388 | if (!collide && recurse_to_leaf) { |
396 | if (item < 0) { | 389 | if (item < 0) { |
397 | if (crush_choose(map, | 390 | if (crush_choose_firstn(map, |
398 | map->buckets[-1-item], | 391 | map->buckets[-1-item], |
399 | weight, | 392 | weight, weight_max, |
400 | x, outpos+1, 0, | 393 | x, outpos+1, 0, |
401 | out2, outpos, | 394 | out2, outpos, |
402 | firstn, 0, | 395 | recurse_tries, 0, |
403 | map->chooseleaf_descend_once, | 396 | local_tries, |
397 | local_fallback_tries, | ||
398 | 0, | ||
404 | NULL) <= outpos) | 399 | NULL) <= outpos) |
405 | /* didn't get leaf */ | 400 | /* didn't get leaf */ |
406 | reject = 1; | 401 | reject = 1; |
@@ -414,6 +409,7 @@ static int crush_choose(const struct crush_map *map, | |||
414 | /* out? */ | 409 | /* out? */ |
415 | if (itemtype == 0) | 410 | if (itemtype == 0) |
416 | reject = is_out(map, weight, | 411 | reject = is_out(map, weight, |
412 | weight_max, | ||
417 | item, x); | 413 | item, x); |
418 | else | 414 | else |
419 | reject = 0; | 415 | reject = 0; |
@@ -424,17 +420,14 @@ reject: | |||
424 | ftotal++; | 420 | ftotal++; |
425 | flocal++; | 421 | flocal++; |
426 | 422 | ||
427 | if (reject && descend_once) | 423 | if (collide && flocal <= local_tries) |
428 | /* let outer call try again */ | ||
429 | skip_rep = 1; | ||
430 | else if (collide && flocal <= map->choose_local_tries) | ||
431 | /* retry locally a few times */ | 424 | /* retry locally a few times */ |
432 | retry_bucket = 1; | 425 | retry_bucket = 1; |
433 | else if (map->choose_local_fallback_tries > 0 && | 426 | else if (local_fallback_tries > 0 && |
434 | flocal <= in->size + map->choose_local_fallback_tries) | 427 | flocal <= in->size + local_fallback_tries) |
435 | /* exhaustive bucket search */ | 428 | /* exhaustive bucket search */ |
436 | retry_bucket = 1; | 429 | retry_bucket = 1; |
437 | else if (ftotal <= map->choose_total_tries) | 430 | else if (ftotal <= tries) |
438 | /* then retry descent */ | 431 | /* then retry descent */ |
439 | retry_descent = 1; | 432 | retry_descent = 1; |
440 | else | 433 | else |
@@ -464,21 +457,179 @@ reject: | |||
464 | 457 | ||
465 | 458 | ||
466 | /** | 459 | /** |
460 | * crush_choose_indep: alternative breadth-first positionally stable mapping | ||
461 | * | ||
462 | */ | ||
463 | static void crush_choose_indep(const struct crush_map *map, | ||
464 | struct crush_bucket *bucket, | ||
465 | const __u32 *weight, int weight_max, | ||
466 | int x, int left, int numrep, int type, | ||
467 | int *out, int outpos, | ||
468 | unsigned int tries, | ||
469 | unsigned int recurse_tries, | ||
470 | int recurse_to_leaf, | ||
471 | int *out2, | ||
472 | int parent_r) | ||
473 | { | ||
474 | struct crush_bucket *in = bucket; | ||
475 | int endpos = outpos + left; | ||
476 | int rep; | ||
477 | unsigned int ftotal; | ||
478 | int r; | ||
479 | int i; | ||
480 | int item = 0; | ||
481 | int itemtype; | ||
482 | int collide; | ||
483 | |||
484 | dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", | ||
485 | bucket->id, x, outpos, numrep); | ||
486 | |||
487 | /* initially my result is undefined */ | ||
488 | for (rep = outpos; rep < endpos; rep++) { | ||
489 | out[rep] = CRUSH_ITEM_UNDEF; | ||
490 | if (out2) | ||
491 | out2[rep] = CRUSH_ITEM_UNDEF; | ||
492 | } | ||
493 | |||
494 | for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { | ||
495 | for (rep = outpos; rep < endpos; rep++) { | ||
496 | if (out[rep] != CRUSH_ITEM_UNDEF) | ||
497 | continue; | ||
498 | |||
499 | in = bucket; /* initial bucket */ | ||
500 | |||
501 | /* choose through intervening buckets */ | ||
502 | for (;;) { | ||
503 | /* note: we base the choice on the position | ||
504 | * even in the nested call. that means that | ||
505 | * if the first layer chooses the same bucket | ||
506 | * in a different position, we will tend to | ||
507 | * choose a different item in that bucket. | ||
508 | * this will involve more devices in data | ||
509 | * movement and tend to distribute the load. | ||
510 | */ | ||
511 | r = rep + parent_r; | ||
512 | |||
513 | /* be careful */ | ||
514 | if (in->alg == CRUSH_BUCKET_UNIFORM && | ||
515 | in->size % numrep == 0) | ||
516 | /* r'=r+(n+1)*f_total */ | ||
517 | r += (numrep+1) * ftotal; | ||
518 | else | ||
519 | /* r' = r + n*f_total */ | ||
520 | r += numrep * ftotal; | ||
521 | |||
522 | /* bucket choose */ | ||
523 | if (in->size == 0) { | ||
524 | dprintk(" empty bucket\n"); | ||
525 | break; | ||
526 | } | ||
527 | |||
528 | item = crush_bucket_choose(in, x, r); | ||
529 | if (item >= map->max_devices) { | ||
530 | dprintk(" bad item %d\n", item); | ||
531 | out[rep] = CRUSH_ITEM_NONE; | ||
532 | if (out2) | ||
533 | out2[rep] = CRUSH_ITEM_NONE; | ||
534 | left--; | ||
535 | break; | ||
536 | } | ||
537 | |||
538 | /* desired type? */ | ||
539 | if (item < 0) | ||
540 | itemtype = map->buckets[-1-item]->type; | ||
541 | else | ||
542 | itemtype = 0; | ||
543 | dprintk(" item %d type %d\n", item, itemtype); | ||
544 | |||
545 | /* keep going? */ | ||
546 | if (itemtype != type) { | ||
547 | if (item >= 0 || | ||
548 | (-1-item) >= map->max_buckets) { | ||
549 | dprintk(" bad item type %d\n", type); | ||
550 | out[rep] = CRUSH_ITEM_NONE; | ||
551 | if (out2) | ||
552 | out2[rep] = | ||
553 | CRUSH_ITEM_NONE; | ||
554 | left--; | ||
555 | break; | ||
556 | } | ||
557 | in = map->buckets[-1-item]; | ||
558 | continue; | ||
559 | } | ||
560 | |||
561 | /* collision? */ | ||
562 | collide = 0; | ||
563 | for (i = outpos; i < endpos; i++) { | ||
564 | if (out[i] == item) { | ||
565 | collide = 1; | ||
566 | break; | ||
567 | } | ||
568 | } | ||
569 | if (collide) | ||
570 | break; | ||
571 | |||
572 | if (recurse_to_leaf) { | ||
573 | if (item < 0) { | ||
574 | crush_choose_indep(map, | ||
575 | map->buckets[-1-item], | ||
576 | weight, weight_max, | ||
577 | x, 1, numrep, 0, | ||
578 | out2, rep, | ||
579 | recurse_tries, 0, | ||
580 | 0, NULL, r); | ||
581 | if (out2[rep] == CRUSH_ITEM_NONE) { | ||
582 | /* placed nothing; no leaf */ | ||
583 | break; | ||
584 | } | ||
585 | } else { | ||
586 | /* we already have a leaf! */ | ||
587 | out2[rep] = item; | ||
588 | } | ||
589 | } | ||
590 | |||
591 | /* out? */ | ||
592 | if (itemtype == 0 && | ||
593 | is_out(map, weight, weight_max, item, x)) | ||
594 | break; | ||
595 | |||
596 | /* yay! */ | ||
597 | out[rep] = item; | ||
598 | left--; | ||
599 | break; | ||
600 | } | ||
601 | } | ||
602 | } | ||
603 | for (rep = outpos; rep < endpos; rep++) { | ||
604 | if (out[rep] == CRUSH_ITEM_UNDEF) { | ||
605 | out[rep] = CRUSH_ITEM_NONE; | ||
606 | } | ||
607 | if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { | ||
608 | out2[rep] = CRUSH_ITEM_NONE; | ||
609 | } | ||
610 | } | ||
611 | } | ||
612 | |||
613 | /** | ||
467 | * crush_do_rule - calculate a mapping with the given input and rule | 614 | * crush_do_rule - calculate a mapping with the given input and rule |
468 | * @map: the crush_map | 615 | * @map: the crush_map |
469 | * @ruleno: the rule id | 616 | * @ruleno: the rule id |
470 | * @x: hash input | 617 | * @x: hash input |
471 | * @result: pointer to result vector | 618 | * @result: pointer to result vector |
472 | * @result_max: maximum result size | 619 | * @result_max: maximum result size |
620 | * @weight: weight vector (for map leaves) | ||
621 | * @weight_max: size of weight vector | ||
622 | * @scratch: scratch vector for private use; must be >= 3 * result_max | ||
473 | */ | 623 | */ |
474 | int crush_do_rule(const struct crush_map *map, | 624 | int crush_do_rule(const struct crush_map *map, |
475 | int ruleno, int x, int *result, int result_max, | 625 | int ruleno, int x, int *result, int result_max, |
476 | const __u32 *weight) | 626 | const __u32 *weight, int weight_max, |
627 | int *scratch) | ||
477 | { | 628 | { |
478 | int result_len; | 629 | int result_len; |
479 | int a[CRUSH_MAX_SET]; | 630 | int *a = scratch; |
480 | int b[CRUSH_MAX_SET]; | 631 | int *b = scratch + result_max; |
481 | int c[CRUSH_MAX_SET]; | 632 | int *c = scratch + result_max*2; |
482 | int recurse_to_leaf; | 633 | int recurse_to_leaf; |
483 | int *w; | 634 | int *w; |
484 | int wsize = 0; | 635 | int wsize = 0; |
@@ -489,8 +640,10 @@ int crush_do_rule(const struct crush_map *map, | |||
489 | __u32 step; | 640 | __u32 step; |
490 | int i, j; | 641 | int i, j; |
491 | int numrep; | 642 | int numrep; |
492 | int firstn; | 643 | int choose_tries = map->choose_total_tries; |
493 | const int descend_once = 0; | 644 | int choose_local_tries = map->choose_local_tries; |
645 | int choose_local_fallback_tries = map->choose_local_fallback_tries; | ||
646 | int choose_leaf_tries = 0; | ||
494 | 647 | ||
495 | if ((__u32)ruleno >= map->max_rules) { | 648 | if ((__u32)ruleno >= map->max_rules) { |
496 | dprintk(" bad ruleno %d\n", ruleno); | 649 | dprintk(" bad ruleno %d\n", ruleno); |
@@ -503,29 +656,49 @@ int crush_do_rule(const struct crush_map *map, | |||
503 | o = b; | 656 | o = b; |
504 | 657 | ||
505 | for (step = 0; step < rule->len; step++) { | 658 | for (step = 0; step < rule->len; step++) { |
659 | int firstn = 0; | ||
506 | struct crush_rule_step *curstep = &rule->steps[step]; | 660 | struct crush_rule_step *curstep = &rule->steps[step]; |
507 | 661 | ||
508 | firstn = 0; | ||
509 | switch (curstep->op) { | 662 | switch (curstep->op) { |
510 | case CRUSH_RULE_TAKE: | 663 | case CRUSH_RULE_TAKE: |
511 | w[0] = curstep->arg1; | 664 | w[0] = curstep->arg1; |
512 | wsize = 1; | 665 | wsize = 1; |
513 | break; | 666 | break; |
514 | 667 | ||
515 | case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: | 668 | case CRUSH_RULE_SET_CHOOSE_TRIES: |
669 | if (curstep->arg1 > 0) | ||
670 | choose_tries = curstep->arg1; | ||
671 | break; | ||
672 | |||
673 | case CRUSH_RULE_SET_CHOOSELEAF_TRIES: | ||
674 | if (curstep->arg1 > 0) | ||
675 | choose_leaf_tries = curstep->arg1; | ||
676 | break; | ||
677 | |||
678 | case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: | ||
679 | if (curstep->arg1 > 0) | ||
680 | choose_local_tries = curstep->arg1; | ||
681 | break; | ||
682 | |||
683 | case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: | ||
684 | if (curstep->arg1 > 0) | ||
685 | choose_local_fallback_tries = curstep->arg1; | ||
686 | break; | ||
687 | |||
688 | case CRUSH_RULE_CHOOSELEAF_FIRSTN: | ||
516 | case CRUSH_RULE_CHOOSE_FIRSTN: | 689 | case CRUSH_RULE_CHOOSE_FIRSTN: |
517 | firstn = 1; | 690 | firstn = 1; |
518 | /* fall through */ | 691 | /* fall through */ |
519 | case CRUSH_RULE_CHOOSE_LEAF_INDEP: | 692 | case CRUSH_RULE_CHOOSELEAF_INDEP: |
520 | case CRUSH_RULE_CHOOSE_INDEP: | 693 | case CRUSH_RULE_CHOOSE_INDEP: |
521 | if (wsize == 0) | 694 | if (wsize == 0) |
522 | break; | 695 | break; |
523 | 696 | ||
524 | recurse_to_leaf = | 697 | recurse_to_leaf = |
525 | curstep->op == | 698 | curstep->op == |
526 | CRUSH_RULE_CHOOSE_LEAF_FIRSTN || | 699 | CRUSH_RULE_CHOOSELEAF_FIRSTN || |
527 | curstep->op == | 700 | curstep->op == |
528 | CRUSH_RULE_CHOOSE_LEAF_INDEP; | 701 | CRUSH_RULE_CHOOSELEAF_INDEP; |
529 | 702 | ||
530 | /* reset output */ | 703 | /* reset output */ |
531 | osize = 0; | 704 | osize = 0; |
@@ -543,22 +716,51 @@ int crush_do_rule(const struct crush_map *map, | |||
543 | continue; | 716 | continue; |
544 | } | 717 | } |
545 | j = 0; | 718 | j = 0; |
546 | osize += crush_choose(map, | 719 | if (firstn) { |
547 | map->buckets[-1-w[i]], | 720 | int recurse_tries; |
548 | weight, | 721 | if (choose_leaf_tries) |
549 | x, numrep, | 722 | recurse_tries = |
550 | curstep->arg2, | 723 | choose_leaf_tries; |
551 | o+osize, j, | 724 | else if (map->chooseleaf_descend_once) |
552 | firstn, | 725 | recurse_tries = 1; |
553 | recurse_to_leaf, | 726 | else |
554 | descend_once, c+osize); | 727 | recurse_tries = choose_tries; |
728 | osize += crush_choose_firstn( | ||
729 | map, | ||
730 | map->buckets[-1-w[i]], | ||
731 | weight, weight_max, | ||
732 | x, numrep, | ||
733 | curstep->arg2, | ||
734 | o+osize, j, | ||
735 | choose_tries, | ||
736 | recurse_tries, | ||
737 | choose_local_tries, | ||
738 | choose_local_fallback_tries, | ||
739 | recurse_to_leaf, | ||
740 | c+osize); | ||
741 | } else { | ||
742 | crush_choose_indep( | ||
743 | map, | ||
744 | map->buckets[-1-w[i]], | ||
745 | weight, weight_max, | ||
746 | x, numrep, numrep, | ||
747 | curstep->arg2, | ||
748 | o+osize, j, | ||
749 | choose_tries, | ||
750 | choose_leaf_tries ? | ||
751 | choose_leaf_tries : 1, | ||
752 | recurse_to_leaf, | ||
753 | c+osize, | ||
754 | 0); | ||
755 | osize += numrep; | ||
756 | } | ||
555 | } | 757 | } |
556 | 758 | ||
557 | if (recurse_to_leaf) | 759 | if (recurse_to_leaf) |
558 | /* copy final _leaf_ values to output set */ | 760 | /* copy final _leaf_ values to output set */ |
559 | memcpy(o, c, osize*sizeof(*o)); | 761 | memcpy(o, c, osize*sizeof(*o)); |
560 | 762 | ||
561 | /* swap t and w arrays */ | 763 | /* swap o and w arrays */ |
562 | tmp = o; | 764 | tmp = o; |
563 | o = w; | 765 | o = w; |
564 | w = tmp; | 766 | w = tmp; |