diff options
Diffstat (limited to 'net/ceph/osdmap.c')
| -rw-r--r-- | net/ceph/osdmap.c | 290 |
1 files changed, 155 insertions, 135 deletions
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index de73214b5d26..69bc4bf89e3e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c | |||
| @@ -13,26 +13,18 @@ | |||
| 13 | 13 | ||
| 14 | char *ceph_osdmap_state_str(char *str, int len, int state) | 14 | char *ceph_osdmap_state_str(char *str, int len, int state) |
| 15 | { | 15 | { |
| 16 | int flag = 0; | ||
| 17 | |||
| 18 | if (!len) | 16 | if (!len) |
| 19 | goto done; | 17 | return str; |
| 20 | 18 | ||
| 21 | *str = '\0'; | 19 | if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP)) |
| 22 | if (state) { | 20 | snprintf(str, len, "exists, up"); |
| 23 | if (state & CEPH_OSD_EXISTS) { | 21 | else if (state & CEPH_OSD_EXISTS) |
| 24 | snprintf(str, len, "exists"); | 22 | snprintf(str, len, "exists"); |
| 25 | flag = 1; | 23 | else if (state & CEPH_OSD_UP) |
| 26 | } | 24 | snprintf(str, len, "up"); |
| 27 | if (state & CEPH_OSD_UP) { | 25 | else |
| 28 | snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""), | ||
| 29 | "up"); | ||
| 30 | flag = 1; | ||
| 31 | } | ||
| 32 | } else { | ||
| 33 | snprintf(str, len, "doesn't exist"); | 26 | snprintf(str, len, "doesn't exist"); |
| 34 | } | 27 | |
| 35 | done: | ||
| 36 | return str; | 28 | return str; |
| 37 | } | 29 | } |
| 38 | 30 | ||
| @@ -53,13 +45,8 @@ static int calc_bits_of(unsigned int t) | |||
| 53 | */ | 45 | */ |
| 54 | static void calc_pg_masks(struct ceph_pg_pool_info *pi) | 46 | static void calc_pg_masks(struct ceph_pg_pool_info *pi) |
| 55 | { | 47 | { |
| 56 | pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; | 48 | pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1; |
| 57 | pi->pgp_num_mask = | 49 | pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1; |
| 58 | (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; | ||
| 59 | pi->lpg_num_mask = | ||
| 60 | (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; | ||
| 61 | pi->lpgp_num_mask = | ||
| 62 | (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; | ||
| 63 | } | 50 | } |
| 64 | 51 | ||
| 65 | /* | 52 | /* |
| @@ -170,6 +157,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
| 170 | c->choose_local_tries = 2; | 157 | c->choose_local_tries = 2; |
| 171 | c->choose_local_fallback_tries = 5; | 158 | c->choose_local_fallback_tries = 5; |
| 172 | c->choose_total_tries = 19; | 159 | c->choose_total_tries = 19; |
| 160 | c->chooseleaf_descend_once = 0; | ||
| 173 | 161 | ||
| 174 | ceph_decode_need(p, end, 4*sizeof(u32), bad); | 162 | ceph_decode_need(p, end, 4*sizeof(u32), bad); |
| 175 | magic = ceph_decode_32(p); | 163 | magic = ceph_decode_32(p); |
| @@ -336,6 +324,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
| 336 | dout("crush decode tunable choose_total_tries = %d", | 324 | dout("crush decode tunable choose_total_tries = %d", |
| 337 | c->choose_total_tries); | 325 | c->choose_total_tries); |
| 338 | 326 | ||
| 327 | ceph_decode_need(p, end, sizeof(u32), done); | ||
| 328 | c->chooseleaf_descend_once = ceph_decode_32(p); | ||
| 329 | dout("crush decode tunable chooseleaf_descend_once = %d", | ||
| 330 | c->chooseleaf_descend_once); | ||
| 331 | |||
| 339 | done: | 332 | done: |
| 340 | dout("crush_decode success\n"); | 333 | dout("crush_decode success\n"); |
| 341 | return c; | 334 | return c; |
| @@ -354,12 +347,13 @@ bad: | |||
| 354 | */ | 347 | */ |
| 355 | static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) | 348 | static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) |
| 356 | { | 349 | { |
| 357 | u64 a = *(u64 *)&l; | 350 | if (l.pool < r.pool) |
| 358 | u64 b = *(u64 *)&r; | 351 | return -1; |
| 359 | 352 | if (l.pool > r.pool) | |
| 360 | if (a < b) | 353 | return 1; |
| 354 | if (l.seed < r.seed) | ||
| 361 | return -1; | 355 | return -1; |
| 362 | if (a > b) | 356 | if (l.seed > r.seed) |
| 363 | return 1; | 357 | return 1; |
| 364 | return 0; | 358 | return 0; |
| 365 | } | 359 | } |
| @@ -405,8 +399,8 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, | |||
| 405 | } else if (c > 0) { | 399 | } else if (c > 0) { |
| 406 | n = n->rb_right; | 400 | n = n->rb_right; |
| 407 | } else { | 401 | } else { |
| 408 | dout("__lookup_pg_mapping %llx got %p\n", | 402 | dout("__lookup_pg_mapping %lld.%x got %p\n", |
| 409 | *(u64 *)&pgid, pg); | 403 | pgid.pool, pgid.seed, pg); |
| 410 | return pg; | 404 | return pg; |
| 411 | } | 405 | } |
| 412 | } | 406 | } |
| @@ -418,12 +412,13 @@ static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid) | |||
| 418 | struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); | 412 | struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); |
| 419 | 413 | ||
| 420 | if (pg) { | 414 | if (pg) { |
| 421 | dout("__remove_pg_mapping %llx %p\n", *(u64 *)&pgid, pg); | 415 | dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed, |
| 416 | pg); | ||
| 422 | rb_erase(&pg->node, root); | 417 | rb_erase(&pg->node, root); |
| 423 | kfree(pg); | 418 | kfree(pg); |
| 424 | return 0; | 419 | return 0; |
| 425 | } | 420 | } |
| 426 | dout("__remove_pg_mapping %llx dne\n", *(u64 *)&pgid); | 421 | dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed); |
| 427 | return -ENOENT; | 422 | return -ENOENT; |
| 428 | } | 423 | } |
| 429 | 424 | ||
| @@ -452,7 +447,7 @@ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) | |||
| 452 | return 0; | 447 | return 0; |
| 453 | } | 448 | } |
| 454 | 449 | ||
| 455 | static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) | 450 | static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) |
| 456 | { | 451 | { |
| 457 | struct ceph_pg_pool_info *pi; | 452 | struct ceph_pg_pool_info *pi; |
| 458 | struct rb_node *n = root->rb_node; | 453 | struct rb_node *n = root->rb_node; |
| @@ -508,24 +503,57 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) | |||
| 508 | 503 | ||
| 509 | static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) | 504 | static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) |
| 510 | { | 505 | { |
| 511 | unsigned int n, m; | 506 | u8 ev, cv; |
| 507 | unsigned len, num; | ||
| 508 | void *pool_end; | ||
| 509 | |||
| 510 | ceph_decode_need(p, end, 2 + 4, bad); | ||
| 511 | ev = ceph_decode_8(p); /* encoding version */ | ||
| 512 | cv = ceph_decode_8(p); /* compat version */ | ||
| 513 | if (ev < 5) { | ||
| 514 | pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); | ||
| 515 | return -EINVAL; | ||
| 516 | } | ||
| 517 | if (cv > 7) { | ||
| 518 | pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv); | ||
| 519 | return -EINVAL; | ||
| 520 | } | ||
| 521 | len = ceph_decode_32(p); | ||
| 522 | ceph_decode_need(p, end, len, bad); | ||
| 523 | pool_end = *p + len; | ||
| 512 | 524 | ||
| 513 | ceph_decode_copy(p, &pi->v, sizeof(pi->v)); | 525 | pi->type = ceph_decode_8(p); |
| 514 | calc_pg_masks(pi); | 526 | pi->size = ceph_decode_8(p); |
| 527 | pi->crush_ruleset = ceph_decode_8(p); | ||
| 528 | pi->object_hash = ceph_decode_8(p); | ||
| 529 | |||
| 530 | pi->pg_num = ceph_decode_32(p); | ||
| 531 | pi->pgp_num = ceph_decode_32(p); | ||
| 532 | |||
| 533 | *p += 4 + 4; /* skip lpg* */ | ||
| 534 | *p += 4; /* skip last_change */ | ||
| 535 | *p += 8 + 4; /* skip snap_seq, snap_epoch */ | ||
| 515 | 536 | ||
| 516 | /* num_snaps * snap_info_t */ | 537 | /* skip snaps */ |
| 517 | n = le32_to_cpu(pi->v.num_snaps); | 538 | num = ceph_decode_32(p); |
| 518 | while (n--) { | 539 | while (num--) { |
| 519 | ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + | 540 | *p += 8; /* snapid key */ |
| 520 | sizeof(struct ceph_timespec), bad); | 541 | *p += 1 + 1; /* versions */ |
| 521 | *p += sizeof(u64) + /* key */ | 542 | len = ceph_decode_32(p); |
| 522 | 1 + sizeof(u64) + /* u8, snapid */ | 543 | *p += len; |
| 523 | sizeof(struct ceph_timespec); | ||
| 524 | m = ceph_decode_32(p); /* snap name */ | ||
| 525 | *p += m; | ||
| 526 | } | 544 | } |
| 527 | 545 | ||
| 528 | *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; | 546 | /* skip removed snaps */ |
| 547 | num = ceph_decode_32(p); | ||
| 548 | *p += num * (8 + 8); | ||
| 549 | |||
| 550 | *p += 8; /* skip auid */ | ||
| 551 | pi->flags = ceph_decode_64(p); | ||
| 552 | |||
| 553 | /* ignore the rest */ | ||
| 554 | |||
| 555 | *p = pool_end; | ||
| 556 | calc_pg_masks(pi); | ||
| 529 | return 0; | 557 | return 0; |
| 530 | 558 | ||
| 531 | bad: | 559 | bad: |
| @@ -535,14 +563,15 @@ bad: | |||
| 535 | static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) | 563 | static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) |
| 536 | { | 564 | { |
| 537 | struct ceph_pg_pool_info *pi; | 565 | struct ceph_pg_pool_info *pi; |
| 538 | u32 num, len, pool; | 566 | u32 num, len; |
| 567 | u64 pool; | ||
| 539 | 568 | ||
| 540 | ceph_decode_32_safe(p, end, num, bad); | 569 | ceph_decode_32_safe(p, end, num, bad); |
| 541 | dout(" %d pool names\n", num); | 570 | dout(" %d pool names\n", num); |
| 542 | while (num--) { | 571 | while (num--) { |
| 543 | ceph_decode_32_safe(p, end, pool, bad); | 572 | ceph_decode_64_safe(p, end, pool, bad); |
| 544 | ceph_decode_32_safe(p, end, len, bad); | 573 | ceph_decode_32_safe(p, end, len, bad); |
| 545 | dout(" pool %d len %d\n", pool, len); | 574 | dout(" pool %llu len %d\n", pool, len); |
| 546 | ceph_decode_need(p, end, len, bad); | 575 | ceph_decode_need(p, end, len, bad); |
| 547 | pi = __lookup_pg_pool(&map->pg_pools, pool); | 576 | pi = __lookup_pg_pool(&map->pg_pools, pool); |
| 548 | if (pi) { | 577 | if (pi) { |
| @@ -633,7 +662,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
| 633 | struct ceph_osdmap *map; | 662 | struct ceph_osdmap *map; |
| 634 | u16 version; | 663 | u16 version; |
| 635 | u32 len, max, i; | 664 | u32 len, max, i; |
| 636 | u8 ev; | ||
| 637 | int err = -EINVAL; | 665 | int err = -EINVAL; |
| 638 | void *start = *p; | 666 | void *start = *p; |
| 639 | struct ceph_pg_pool_info *pi; | 667 | struct ceph_pg_pool_info *pi; |
| @@ -646,9 +674,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
| 646 | map->pg_temp = RB_ROOT; | 674 | map->pg_temp = RB_ROOT; |
| 647 | 675 | ||
| 648 | ceph_decode_16_safe(p, end, version, bad); | 676 | ceph_decode_16_safe(p, end, version, bad); |
| 649 | if (version > CEPH_OSDMAP_VERSION) { | 677 | if (version > 6) { |
| 650 | pr_warning("got unknown v %d > %d of osdmap\n", version, | 678 | pr_warning("got unknown v %d > 6 of osdmap\n", version); |
| 651 | CEPH_OSDMAP_VERSION); | 679 | goto bad; |
| 680 | } | ||
| 681 | if (version < 6) { | ||
| 682 | pr_warning("got old v %d < 6 of osdmap\n", version); | ||
| 652 | goto bad; | 683 | goto bad; |
| 653 | } | 684 | } |
| 654 | 685 | ||
| @@ -660,20 +691,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
| 660 | 691 | ||
| 661 | ceph_decode_32_safe(p, end, max, bad); | 692 | ceph_decode_32_safe(p, end, max, bad); |
| 662 | while (max--) { | 693 | while (max--) { |
| 663 | ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); | 694 | ceph_decode_need(p, end, 8 + 2, bad); |
| 664 | err = -ENOMEM; | 695 | err = -ENOMEM; |
| 665 | pi = kzalloc(sizeof(*pi), GFP_NOFS); | 696 | pi = kzalloc(sizeof(*pi), GFP_NOFS); |
| 666 | if (!pi) | 697 | if (!pi) |
| 667 | goto bad; | 698 | goto bad; |
| 668 | pi->id = ceph_decode_32(p); | 699 | pi->id = ceph_decode_64(p); |
| 669 | err = -EINVAL; | ||
| 670 | ev = ceph_decode_8(p); /* encoding version */ | ||
| 671 | if (ev > CEPH_PG_POOL_VERSION) { | ||
| 672 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", | ||
| 673 | ev, CEPH_PG_POOL_VERSION); | ||
| 674 | kfree(pi); | ||
| 675 | goto bad; | ||
| 676 | } | ||
| 677 | err = __decode_pool(p, end, pi); | 700 | err = __decode_pool(p, end, pi); |
| 678 | if (err < 0) { | 701 | if (err < 0) { |
| 679 | kfree(pi); | 702 | kfree(pi); |
| @@ -682,12 +705,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
| 682 | __insert_pg_pool(&map->pg_pools, pi); | 705 | __insert_pg_pool(&map->pg_pools, pi); |
| 683 | } | 706 | } |
| 684 | 707 | ||
| 685 | if (version >= 5) { | 708 | err = __decode_pool_names(p, end, map); |
| 686 | err = __decode_pool_names(p, end, map); | 709 | if (err < 0) { |
| 687 | if (err < 0) { | 710 | dout("fail to decode pool names"); |
| 688 | dout("fail to decode pool names"); | 711 | goto bad; |
| 689 | goto bad; | ||
| 690 | } | ||
| 691 | } | 712 | } |
| 692 | 713 | ||
| 693 | ceph_decode_32_safe(p, end, map->pool_max, bad); | 714 | ceph_decode_32_safe(p, end, map->pool_max, bad); |
| @@ -724,10 +745,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
| 724 | for (i = 0; i < len; i++) { | 745 | for (i = 0; i < len; i++) { |
| 725 | int n, j; | 746 | int n, j; |
| 726 | struct ceph_pg pgid; | 747 | struct ceph_pg pgid; |
| 748 | struct ceph_pg_v1 pgid_v1; | ||
| 727 | struct ceph_pg_mapping *pg; | 749 | struct ceph_pg_mapping *pg; |
| 728 | 750 | ||
| 729 | ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); | 751 | ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); |
| 730 | ceph_decode_copy(p, &pgid, sizeof(pgid)); | 752 | ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1)); |
| 753 | pgid.pool = le32_to_cpu(pgid_v1.pool); | ||
| 754 | pgid.seed = le16_to_cpu(pgid_v1.ps); | ||
| 731 | n = ceph_decode_32(p); | 755 | n = ceph_decode_32(p); |
| 732 | err = -EINVAL; | 756 | err = -EINVAL; |
| 733 | if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) | 757 | if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) |
| @@ -745,7 +769,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
| 745 | err = __insert_pg_mapping(pg, &map->pg_temp); | 769 | err = __insert_pg_mapping(pg, &map->pg_temp); |
| 746 | if (err) | 770 | if (err) |
| 747 | goto bad; | 771 | goto bad; |
| 748 | dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); | 772 | dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed, |
| 773 | len); | ||
| 749 | } | 774 | } |
| 750 | 775 | ||
| 751 | /* crush */ | 776 | /* crush */ |
| @@ -784,16 +809,17 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
| 784 | struct ceph_fsid fsid; | 809 | struct ceph_fsid fsid; |
| 785 | u32 epoch = 0; | 810 | u32 epoch = 0; |
| 786 | struct ceph_timespec modified; | 811 | struct ceph_timespec modified; |
| 787 | u32 len, pool; | 812 | s32 len; |
| 788 | __s32 new_pool_max, new_flags, max; | 813 | u64 pool; |
| 814 | __s64 new_pool_max; | ||
| 815 | __s32 new_flags, max; | ||
| 789 | void *start = *p; | 816 | void *start = *p; |
| 790 | int err = -EINVAL; | 817 | int err = -EINVAL; |
| 791 | u16 version; | 818 | u16 version; |
| 792 | 819 | ||
| 793 | ceph_decode_16_safe(p, end, version, bad); | 820 | ceph_decode_16_safe(p, end, version, bad); |
| 794 | if (version > CEPH_OSDMAP_INC_VERSION) { | 821 | if (version > 6) { |
| 795 | pr_warning("got unknown v %d > %d of inc osdmap\n", version, | 822 | pr_warning("got unknown v %d > %d of inc osdmap\n", version, 6); |
| 796 | CEPH_OSDMAP_INC_VERSION); | ||
| 797 | goto bad; | 823 | goto bad; |
| 798 | } | 824 | } |
| 799 | 825 | ||
| @@ -803,7 +829,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
| 803 | epoch = ceph_decode_32(p); | 829 | epoch = ceph_decode_32(p); |
| 804 | BUG_ON(epoch != map->epoch+1); | 830 | BUG_ON(epoch != map->epoch+1); |
| 805 | ceph_decode_copy(p, &modified, sizeof(modified)); | 831 | ceph_decode_copy(p, &modified, sizeof(modified)); |
| 806 | new_pool_max = ceph_decode_32(p); | 832 | new_pool_max = ceph_decode_64(p); |
| 807 | new_flags = ceph_decode_32(p); | 833 | new_flags = ceph_decode_32(p); |
| 808 | 834 | ||
| 809 | /* full map? */ | 835 | /* full map? */ |
| @@ -853,18 +879,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
| 853 | /* new_pool */ | 879 | /* new_pool */ |
| 854 | ceph_decode_32_safe(p, end, len, bad); | 880 | ceph_decode_32_safe(p, end, len, bad); |
| 855 | while (len--) { | 881 | while (len--) { |
| 856 | __u8 ev; | ||
| 857 | struct ceph_pg_pool_info *pi; | 882 | struct ceph_pg_pool_info *pi; |
| 858 | 883 | ||
| 859 | ceph_decode_32_safe(p, end, pool, bad); | 884 | ceph_decode_64_safe(p, end, pool, bad); |
| 860 | ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); | ||
| 861 | ev = ceph_decode_8(p); /* encoding version */ | ||
| 862 | if (ev > CEPH_PG_POOL_VERSION) { | ||
| 863 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", | ||
| 864 | ev, CEPH_PG_POOL_VERSION); | ||
| 865 | err = -EINVAL; | ||
| 866 | goto bad; | ||
| 867 | } | ||
| 868 | pi = __lookup_pg_pool(&map->pg_pools, pool); | 885 | pi = __lookup_pg_pool(&map->pg_pools, pool); |
| 869 | if (!pi) { | 886 | if (!pi) { |
| 870 | pi = kzalloc(sizeof(*pi), GFP_NOFS); | 887 | pi = kzalloc(sizeof(*pi), GFP_NOFS); |
| @@ -890,7 +907,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
| 890 | while (len--) { | 907 | while (len--) { |
| 891 | struct ceph_pg_pool_info *pi; | 908 | struct ceph_pg_pool_info *pi; |
| 892 | 909 | ||
| 893 | ceph_decode_32_safe(p, end, pool, bad); | 910 | ceph_decode_64_safe(p, end, pool, bad); |
| 894 | pi = __lookup_pg_pool(&map->pg_pools, pool); | 911 | pi = __lookup_pg_pool(&map->pg_pools, pool); |
| 895 | if (pi) | 912 | if (pi) |
| 896 | __remove_pg_pool(&map->pg_pools, pi); | 913 | __remove_pg_pool(&map->pg_pools, pi); |
| @@ -946,10 +963,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
| 946 | while (len--) { | 963 | while (len--) { |
| 947 | struct ceph_pg_mapping *pg; | 964 | struct ceph_pg_mapping *pg; |
| 948 | int j; | 965 | int j; |
| 966 | struct ceph_pg_v1 pgid_v1; | ||
| 949 | struct ceph_pg pgid; | 967 | struct ceph_pg pgid; |
| 950 | u32 pglen; | 968 | u32 pglen; |
| 951 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); | 969 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); |
| 952 | ceph_decode_copy(p, &pgid, sizeof(pgid)); | 970 | ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1)); |
| 971 | pgid.pool = le32_to_cpu(pgid_v1.pool); | ||
| 972 | pgid.seed = le16_to_cpu(pgid_v1.ps); | ||
| 953 | pglen = ceph_decode_32(p); | 973 | pglen = ceph_decode_32(p); |
| 954 | 974 | ||
| 955 | if (pglen) { | 975 | if (pglen) { |
| @@ -975,8 +995,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
| 975 | kfree(pg); | 995 | kfree(pg); |
| 976 | goto bad; | 996 | goto bad; |
| 977 | } | 997 | } |
| 978 | dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, | 998 | dout(" added pg_temp %lld.%x len %d\n", pgid.pool, |
| 979 | pglen); | 999 | pgid.seed, pglen); |
| 980 | } else { | 1000 | } else { |
| 981 | /* remove */ | 1001 | /* remove */ |
| 982 | __remove_pg_mapping(&map->pg_temp, pgid); | 1002 | __remove_pg_mapping(&map->pg_temp, pgid); |
| @@ -1010,7 +1030,7 @@ bad: | |||
| 1010 | * pass a stride back to the caller. | 1030 | * pass a stride back to the caller. |
| 1011 | */ | 1031 | */ |
| 1012 | int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | 1032 | int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, |
| 1013 | u64 off, u64 *plen, | 1033 | u64 off, u64 len, |
| 1014 | u64 *ono, | 1034 | u64 *ono, |
| 1015 | u64 *oxoff, u64 *oxlen) | 1035 | u64 *oxoff, u64 *oxlen) |
| 1016 | { | 1036 | { |
| @@ -1021,7 +1041,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | |||
| 1021 | u32 su_per_object; | 1041 | u32 su_per_object; |
| 1022 | u64 t, su_offset; | 1042 | u64 t, su_offset; |
| 1023 | 1043 | ||
| 1024 | dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, | 1044 | dout("mapping %llu~%llu osize %u fl_su %u\n", off, len, |
| 1025 | osize, su); | 1045 | osize, su); |
| 1026 | if (su == 0 || sc == 0) | 1046 | if (su == 0 || sc == 0) |
| 1027 | goto invalid; | 1047 | goto invalid; |
| @@ -1054,11 +1074,10 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | |||
| 1054 | 1074 | ||
| 1055 | /* | 1075 | /* |
| 1056 | * Calculate the length of the extent being written to the selected | 1076 | * Calculate the length of the extent being written to the selected |
| 1057 | * object. This is the minimum of the full length requested (plen) or | 1077 | * object. This is the minimum of the full length requested (len) or |
| 1058 | * the remainder of the current stripe being written to. | 1078 | * the remainder of the current stripe being written to. |
| 1059 | */ | 1079 | */ |
| 1060 | *oxlen = min_t(u64, *plen, su - su_offset); | 1080 | *oxlen = min_t(u64, len, su - su_offset); |
| 1061 | *plen = *oxlen; | ||
| 1062 | 1081 | ||
| 1063 | dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); | 1082 | dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); |
| 1064 | return 0; | 1083 | return 0; |
| @@ -1076,33 +1095,24 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping); | |||
| 1076 | * calculate an object layout (i.e. pgid) from an oid, | 1095 | * calculate an object layout (i.e. pgid) from an oid, |
| 1077 | * file_layout, and osdmap | 1096 | * file_layout, and osdmap |
| 1078 | */ | 1097 | */ |
| 1079 | int ceph_calc_object_layout(struct ceph_object_layout *ol, | 1098 | int ceph_calc_object_layout(struct ceph_pg *pg, |
| 1080 | const char *oid, | 1099 | const char *oid, |
| 1081 | struct ceph_file_layout *fl, | 1100 | struct ceph_file_layout *fl, |
| 1082 | struct ceph_osdmap *osdmap) | 1101 | struct ceph_osdmap *osdmap) |
| 1083 | { | 1102 | { |
| 1084 | unsigned int num, num_mask; | 1103 | unsigned int num, num_mask; |
| 1085 | struct ceph_pg pgid; | ||
| 1086 | int poolid = le32_to_cpu(fl->fl_pg_pool); | ||
| 1087 | struct ceph_pg_pool_info *pool; | 1104 | struct ceph_pg_pool_info *pool; |
| 1088 | unsigned int ps; | ||
| 1089 | 1105 | ||
| 1090 | BUG_ON(!osdmap); | 1106 | BUG_ON(!osdmap); |
| 1091 | 1107 | pg->pool = le32_to_cpu(fl->fl_pg_pool); | |
| 1092 | pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); | 1108 | pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool); |
| 1093 | if (!pool) | 1109 | if (!pool) |
| 1094 | return -EIO; | 1110 | return -EIO; |
| 1095 | ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); | 1111 | pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid)); |
| 1096 | num = le32_to_cpu(pool->v.pg_num); | 1112 | num = pool->pg_num; |
| 1097 | num_mask = pool->pg_num_mask; | 1113 | num_mask = pool->pg_num_mask; |
| 1098 | 1114 | ||
| 1099 | pgid.ps = cpu_to_le16(ps); | 1115 | dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed); |
| 1100 | pgid.preferred = cpu_to_le16(-1); | ||
| 1101 | pgid.pool = fl->fl_pg_pool; | ||
| 1102 | dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); | ||
| 1103 | |||
| 1104 | ol->ol_pgid = pgid; | ||
| 1105 | ol->ol_stripe_unit = fl->fl_object_stripe_unit; | ||
| 1106 | return 0; | 1116 | return 0; |
| 1107 | } | 1117 | } |
| 1108 | EXPORT_SYMBOL(ceph_calc_object_layout); | 1118 | EXPORT_SYMBOL(ceph_calc_object_layout); |
| @@ -1117,19 +1127,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | |||
| 1117 | struct ceph_pg_mapping *pg; | 1127 | struct ceph_pg_mapping *pg; |
| 1118 | struct ceph_pg_pool_info *pool; | 1128 | struct ceph_pg_pool_info *pool; |
| 1119 | int ruleno; | 1129 | int ruleno; |
| 1120 | unsigned int poolid, ps, pps, t, r; | 1130 | int r; |
| 1121 | 1131 | u32 pps; | |
| 1122 | poolid = le32_to_cpu(pgid.pool); | ||
| 1123 | ps = le16_to_cpu(pgid.ps); | ||
| 1124 | 1132 | ||
| 1125 | pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); | 1133 | pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); |
| 1126 | if (!pool) | 1134 | if (!pool) |
| 1127 | return NULL; | 1135 | return NULL; |
| 1128 | 1136 | ||
| 1129 | /* pg_temp? */ | 1137 | /* pg_temp? */ |
| 1130 | t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), | 1138 | pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, |
| 1131 | pool->pgp_num_mask); | 1139 | pool->pgp_num_mask); |
| 1132 | pgid.ps = cpu_to_le16(t); | ||
| 1133 | pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); | 1140 | pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); |
| 1134 | if (pg) { | 1141 | if (pg) { |
| 1135 | *num = pg->len; | 1142 | *num = pg->len; |
| @@ -1137,26 +1144,39 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | |||
| 1137 | } | 1144 | } |
| 1138 | 1145 | ||
| 1139 | /* crush */ | 1146 | /* crush */ |
| 1140 | ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, | 1147 | ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, |
| 1141 | pool->v.type, pool->v.size); | 1148 | pool->type, pool->size); |
| 1142 | if (ruleno < 0) { | 1149 | if (ruleno < 0) { |
| 1143 | pr_err("no crush rule pool %d ruleset %d type %d size %d\n", | 1150 | pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", |
| 1144 | poolid, pool->v.crush_ruleset, pool->v.type, | 1151 | pgid.pool, pool->crush_ruleset, pool->type, |
| 1145 | pool->v.size); | 1152 | pool->size); |
| 1146 | return NULL; | 1153 | return NULL; |
| 1147 | } | 1154 | } |
| 1148 | 1155 | ||
| 1149 | pps = ceph_stable_mod(ps, | 1156 | if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { |
| 1150 | le32_to_cpu(pool->v.pgp_num), | 1157 | /* hash pool id and seed so that pool PGs do not overlap */ |
| 1151 | pool->pgp_num_mask); | 1158 | pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, |
| 1152 | pps += poolid; | 1159 | ceph_stable_mod(pgid.seed, pool->pgp_num, |
| 1160 | pool->pgp_num_mask), | ||
| 1161 | pgid.pool); | ||
| 1162 | } else { | ||
| 1163 | /* | ||
| 1164 | * legacy ehavior: add ps and pool together. this is | ||
| 1165 | * not a great approach because the PGs from each pool | ||
| 1166 | * will overlap on top of each other: 0.5 == 1.4 == | ||
| 1167 | * 2.3 == ... | ||
| 1168 | */ | ||
| 1169 | pps = ceph_stable_mod(pgid.seed, pool->pgp_num, | ||
| 1170 | pool->pgp_num_mask) + | ||
| 1171 | (unsigned)pgid.pool; | ||
| 1172 | } | ||
| 1153 | r = crush_do_rule(osdmap->crush, ruleno, pps, osds, | 1173 | r = crush_do_rule(osdmap->crush, ruleno, pps, osds, |
| 1154 | min_t(int, pool->v.size, *num), | 1174 | min_t(int, pool->size, *num), |
| 1155 | osdmap->osd_weight); | 1175 | osdmap->osd_weight); |
| 1156 | if (r < 0) { | 1176 | if (r < 0) { |
| 1157 | pr_err("error %d from crush rule: pool %d ruleset %d type %d" | 1177 | pr_err("error %d from crush rule: pool %lld ruleset %d type %d" |
| 1158 | " size %d\n", r, poolid, pool->v.crush_ruleset, | 1178 | " size %d\n", r, pgid.pool, pool->crush_ruleset, |
| 1159 | pool->v.type, pool->v.size); | 1179 | pool->type, pool->size); |
| 1160 | return NULL; | 1180 | return NULL; |
| 1161 | } | 1181 | } |
| 1162 | *num = r; | 1182 | *num = r; |
