aboutsummaryrefslogtreecommitdiffstats
path: root/net/ceph/crush/mapper.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-05-30 14:17:19 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-05-30 14:17:19 -0400
commitaf56e0aa35f3ae2a4c1a6d1000702df1dd78cb76 (patch)
tree304bd85e5db2d07efa2913aa7c6313b918cfbfdb /net/ceph/crush/mapper.c
parent65a50c951a38e9827dd9655b6e686bde912e799b (diff)
parent6bd9adbdf9ca6a052b0b7455ac67b925eb38cfad (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull ceph updates from Sage Weil: "There are some updates and cleanups to the CRUSH placement code, a bug fix with incremental maps, several cleanups and fixes from Josh Durgin in the RBD block device code, a series of cleanups and bug fixes from Alex Elder in the messenger code, and some miscellaneous bounds checking and gfp cleanups/fixes." Fix up trivial conflicts in net/ceph/{messenger.c,osdmap.c} due to the networking people preferring "unsigned int" over just "unsigned". * git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (45 commits) libceph: fix pg_temp updates libceph: avoid unregistering osd request when not registered ceph: add auth buf in prepare_write_connect() ceph: rename prepare_connect_authorizer() ceph: return pointer from prepare_connect_authorizer() ceph: use info returned by get_authorizer ceph: have get_authorizer methods return pointers ceph: ensure auth ops are defined before use ceph: messenger: reduce args to create_authorizer ceph: define ceph_auth_handshake type ceph: messenger: check return from get_authorizer ceph: messenger: rework prepare_connect_authorizer() ceph: messenger: check prepare_write_connect() result ceph: don't set WRITE_PENDING too early ceph: drop msgr argument from prepare_write_connect() ceph: messenger: send banner in process_connect() ceph: messenger: reset connection kvec caller libceph: don't reset kvec in prepare_write_banner() ceph: ignore preferred_osd field ceph: fully initialize new layout ...
Diffstat (limited to 'net/ceph/crush/mapper.c')
-rw-r--r--net/ceph/crush/mapper.c124
1 files changed, 48 insertions, 76 deletions
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 363f8f7e6c3..d7edc24333b 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -33,9 +33,9 @@
33 * @type: storage ruleset type (user defined) 33 * @type: storage ruleset type (user defined)
34 * @size: output set size 34 * @size: output set size
35 */ 35 */
36int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) 36int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
37{ 37{
38 int i; 38 __u32 i;
39 39
40 for (i = 0; i < map->max_rules; i++) { 40 for (i = 0; i < map->max_rules; i++) {
41 if (map->rules[i] && 41 if (map->rules[i] &&
@@ -73,7 +73,7 @@ static int bucket_perm_choose(struct crush_bucket *bucket,
73 unsigned int i, s; 73 unsigned int i, s;
74 74
75 /* start a new permutation if @x has changed */ 75 /* start a new permutation if @x has changed */
76 if (bucket->perm_x != x || bucket->perm_n == 0) { 76 if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
77 dprintk("bucket %d new x=%d\n", bucket->id, x); 77 dprintk("bucket %d new x=%d\n", bucket->id, x);
78 bucket->perm_x = x; 78 bucket->perm_x = x;
79 79
@@ -153,8 +153,8 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
153 return bucket->h.items[i]; 153 return bucket->h.items[i];
154 } 154 }
155 155
156 BUG_ON(1); 156 dprintk("bad list sums for bucket %d\n", bucket->h.id);
157 return 0; 157 return bucket->h.items[0];
158} 158}
159 159
160 160
@@ -220,7 +220,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
220static int bucket_straw_choose(struct crush_bucket_straw *bucket, 220static int bucket_straw_choose(struct crush_bucket_straw *bucket,
221 int x, int r) 221 int x, int r)
222{ 222{
223 int i; 223 __u32 i;
224 int high = 0; 224 int high = 0;
225 __u64 high_draw = 0; 225 __u64 high_draw = 0;
226 __u64 draw; 226 __u64 draw;
@@ -240,6 +240,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
240static int crush_bucket_choose(struct crush_bucket *in, int x, int r) 240static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
241{ 241{
242 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); 242 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
243 BUG_ON(in->size == 0);
243 switch (in->alg) { 244 switch (in->alg) {
244 case CRUSH_BUCKET_UNIFORM: 245 case CRUSH_BUCKET_UNIFORM:
245 return bucket_uniform_choose((struct crush_bucket_uniform *)in, 246 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
@@ -254,7 +255,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
254 return bucket_straw_choose((struct crush_bucket_straw *)in, 255 return bucket_straw_choose((struct crush_bucket_straw *)in,
255 x, r); 256 x, r);
256 default: 257 default:
257 BUG_ON(1); 258 dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
258 return in->items[0]; 259 return in->items[0];
259 } 260 }
260} 261}
@@ -263,7 +264,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
263 * true if device is marked "out" (failed, fully offloaded) 264 * true if device is marked "out" (failed, fully offloaded)
264 * of the cluster 265 * of the cluster
265 */ 266 */
266static int is_out(struct crush_map *map, __u32 *weight, int item, int x) 267static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x)
267{ 268{
268 if (weight[item] >= 0x10000) 269 if (weight[item] >= 0x10000)
269 return 0; 270 return 0;
@@ -288,16 +289,16 @@ static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
288 * @recurse_to_leaf: true if we want one device under each item of given type 289 * @recurse_to_leaf: true if we want one device under each item of given type
289 * @out2: second output vector for leaf items (if @recurse_to_leaf) 290 * @out2: second output vector for leaf items (if @recurse_to_leaf)
290 */ 291 */
291static int crush_choose(struct crush_map *map, 292static int crush_choose(const struct crush_map *map,
292 struct crush_bucket *bucket, 293 struct crush_bucket *bucket,
293 __u32 *weight, 294 const __u32 *weight,
294 int x, int numrep, int type, 295 int x, int numrep, int type,
295 int *out, int outpos, 296 int *out, int outpos,
296 int firstn, int recurse_to_leaf, 297 int firstn, int recurse_to_leaf,
297 int *out2) 298 int *out2)
298{ 299{
299 int rep; 300 int rep;
300 int ftotal, flocal; 301 unsigned int ftotal, flocal;
301 int retry_descent, retry_bucket, skip_rep; 302 int retry_descent, retry_bucket, skip_rep;
302 struct crush_bucket *in = bucket; 303 struct crush_bucket *in = bucket;
303 int r; 304 int r;
@@ -305,7 +306,7 @@ static int crush_choose(struct crush_map *map,
305 int item = 0; 306 int item = 0;
306 int itemtype; 307 int itemtype;
307 int collide, reject; 308 int collide, reject;
308 const int orig_tries = 5; /* attempts before we fall back to search */ 309 const unsigned int orig_tries = 5; /* attempts before we fall back to search */
309 310
310 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 311 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
311 bucket->id, x, outpos, numrep); 312 bucket->id, x, outpos, numrep);
@@ -326,7 +327,7 @@ static int crush_choose(struct crush_map *map,
326 r = rep; 327 r = rep;
327 if (in->alg == CRUSH_BUCKET_UNIFORM) { 328 if (in->alg == CRUSH_BUCKET_UNIFORM) {
328 /* be careful */ 329 /* be careful */
329 if (firstn || numrep >= in->size) 330 if (firstn || (__u32)numrep >= in->size)
330 /* r' = r + f_total */ 331 /* r' = r + f_total */
331 r += ftotal; 332 r += ftotal;
332 else if (in->size % numrep == 0) 333 else if (in->size % numrep == 0)
@@ -355,7 +356,11 @@ static int crush_choose(struct crush_map *map,
355 item = bucket_perm_choose(in, x, r); 356 item = bucket_perm_choose(in, x, r);
356 else 357 else
357 item = crush_bucket_choose(in, x, r); 358 item = crush_bucket_choose(in, x, r);
358 BUG_ON(item >= map->max_devices); 359 if (item >= map->max_devices) {
360 dprintk(" bad item %d\n", item);
361 skip_rep = 1;
362 break;
363 }
359 364
360 /* desired type? */ 365 /* desired type? */
361 if (item < 0) 366 if (item < 0)
@@ -366,8 +371,12 @@ static int crush_choose(struct crush_map *map,
366 371
367 /* keep going? */ 372 /* keep going? */
368 if (itemtype != type) { 373 if (itemtype != type) {
369 BUG_ON(item >= 0 || 374 if (item >= 0 ||
370 (-1-item) >= map->max_buckets); 375 (-1-item) >= map->max_buckets) {
376 dprintk(" bad item type %d\n", type);
377 skip_rep = 1;
378 break;
379 }
371 in = map->buckets[-1-item]; 380 in = map->buckets[-1-item];
372 retry_bucket = 1; 381 retry_bucket = 1;
373 continue; 382 continue;
@@ -416,7 +425,7 @@ reject:
416 if (collide && flocal < 3) 425 if (collide && flocal < 3)
417 /* retry locally a few times */ 426 /* retry locally a few times */
418 retry_bucket = 1; 427 retry_bucket = 1;
419 else if (flocal < in->size + orig_tries) 428 else if (flocal <= in->size + orig_tries)
420 /* exhaustive bucket search */ 429 /* exhaustive bucket search */
421 retry_bucket = 1; 430 retry_bucket = 1;
422 else if (ftotal < 20) 431 else if (ftotal < 20)
@@ -426,7 +435,7 @@ reject:
426 /* else give up */ 435 /* else give up */
427 skip_rep = 1; 436 skip_rep = 1;
428 dprintk(" reject %d collide %d " 437 dprintk(" reject %d collide %d "
429 "ftotal %d flocal %d\n", 438 "ftotal %u flocal %u\n",
430 reject, collide, ftotal, 439 reject, collide, ftotal,
431 flocal); 440 flocal);
432 } 441 }
@@ -455,15 +464,12 @@ reject:
455 * @x: hash input 464 * @x: hash input
456 * @result: pointer to result vector 465 * @result: pointer to result vector
457 * @result_max: maximum result size 466 * @result_max: maximum result size
458 * @force: force initial replica choice; -1 for none
459 */ 467 */
460int crush_do_rule(struct crush_map *map, 468int crush_do_rule(const struct crush_map *map,
461 int ruleno, int x, int *result, int result_max, 469 int ruleno, int x, int *result, int result_max,
462 int force, __u32 *weight) 470 const __u32 *weight)
463{ 471{
464 int result_len; 472 int result_len;
465 int force_context[CRUSH_MAX_DEPTH];
466 int force_pos = -1;
467 int a[CRUSH_MAX_SET]; 473 int a[CRUSH_MAX_SET];
468 int b[CRUSH_MAX_SET]; 474 int b[CRUSH_MAX_SET];
469 int c[CRUSH_MAX_SET]; 475 int c[CRUSH_MAX_SET];
@@ -474,66 +480,44 @@ int crush_do_rule(struct crush_map *map,
474 int osize; 480 int osize;
475 int *tmp; 481 int *tmp;
476 struct crush_rule *rule; 482 struct crush_rule *rule;
477 int step; 483 __u32 step;
478 int i, j; 484 int i, j;
479 int numrep; 485 int numrep;
480 int firstn; 486 int firstn;
481 487
482 BUG_ON(ruleno >= map->max_rules); 488 if ((__u32)ruleno >= map->max_rules) {
489 dprintk(" bad ruleno %d\n", ruleno);
490 return 0;
491 }
483 492
484 rule = map->rules[ruleno]; 493 rule = map->rules[ruleno];
485 result_len = 0; 494 result_len = 0;
486 w = a; 495 w = a;
487 o = b; 496 o = b;
488 497
489 /*
490 * determine hierarchical context of force, if any. note
491 * that this may or may not correspond to the specific types
492 * referenced by the crush rule.
493 */
494 if (force >= 0 &&
495 force < map->max_devices &&
496 map->device_parents[force] != 0 &&
497 !is_out(map, weight, force, x)) {
498 while (1) {
499 force_context[++force_pos] = force;
500 if (force >= 0)
501 force = map->device_parents[force];
502 else
503 force = map->bucket_parents[-1-force];
504 if (force == 0)
505 break;
506 }
507 }
508
509 for (step = 0; step < rule->len; step++) { 498 for (step = 0; step < rule->len; step++) {
499 struct crush_rule_step *curstep = &rule->steps[step];
500
510 firstn = 0; 501 firstn = 0;
511 switch (rule->steps[step].op) { 502 switch (curstep->op) {
512 case CRUSH_RULE_TAKE: 503 case CRUSH_RULE_TAKE:
513 w[0] = rule->steps[step].arg1; 504 w[0] = curstep->arg1;
514
515 /* find position in force_context/hierarchy */
516 while (force_pos >= 0 &&
517 force_context[force_pos] != w[0])
518 force_pos--;
519 /* and move past it */
520 if (force_pos >= 0)
521 force_pos--;
522
523 wsize = 1; 505 wsize = 1;
524 break; 506 break;
525 507
526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: 508 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
527 case CRUSH_RULE_CHOOSE_FIRSTN: 509 case CRUSH_RULE_CHOOSE_FIRSTN:
528 firstn = 1; 510 firstn = 1;
511 /* fall through */
529 case CRUSH_RULE_CHOOSE_LEAF_INDEP: 512 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
530 case CRUSH_RULE_CHOOSE_INDEP: 513 case CRUSH_RULE_CHOOSE_INDEP:
531 BUG_ON(wsize == 0); 514 if (wsize == 0)
515 break;
532 516
533 recurse_to_leaf = 517 recurse_to_leaf =
534 rule->steps[step].op == 518 curstep->op ==
535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN || 519 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
536 rule->steps[step].op == 520 curstep->op ==
537 CRUSH_RULE_CHOOSE_LEAF_INDEP; 521 CRUSH_RULE_CHOOSE_LEAF_INDEP;
538 522
539 /* reset output */ 523 /* reset output */
@@ -545,32 +529,18 @@ int crush_do_rule(struct crush_map *map,
545 * basically, numrep <= 0 means relative to 529 * basically, numrep <= 0 means relative to
546 * the provided result_max 530 * the provided result_max
547 */ 531 */
548 numrep = rule->steps[step].arg1; 532 numrep = curstep->arg1;
549 if (numrep <= 0) { 533 if (numrep <= 0) {
550 numrep += result_max; 534 numrep += result_max;
551 if (numrep <= 0) 535 if (numrep <= 0)
552 continue; 536 continue;
553 } 537 }
554 j = 0; 538 j = 0;
555 if (osize == 0 && force_pos >= 0) {
556 /* skip any intermediate types */
557 while (force_pos &&
558 force_context[force_pos] < 0 &&
559 rule->steps[step].arg2 !=
560 map->buckets[-1 -
561 force_context[force_pos]]->type)
562 force_pos--;
563 o[osize] = force_context[force_pos];
564 if (recurse_to_leaf)
565 c[osize] = force_context[0];
566 j++;
567 force_pos--;
568 }
569 osize += crush_choose(map, 539 osize += crush_choose(map,
570 map->buckets[-1-w[i]], 540 map->buckets[-1-w[i]],
571 weight, 541 weight,
572 x, numrep, 542 x, numrep,
573 rule->steps[step].arg2, 543 curstep->arg2,
574 o+osize, j, 544 o+osize, j,
575 firstn, 545 firstn,
576 recurse_to_leaf, c+osize); 546 recurse_to_leaf, c+osize);
@@ -597,7 +567,9 @@ int crush_do_rule(struct crush_map *map,
597 break; 567 break;
598 568
599 default: 569 default:
600 BUG_ON(1); 570 dprintk(" unknown op %d at step %d\n",
571 curstep->op, step);
572 break;
601 } 573 }
602 } 574 }
603 return result_len; 575 return result_len;