diff options
author | Sage Weil <sage@newdream.net> | 2010-02-16 18:55:03 -0500 |
---|---|---|
committer | Sage Weil <sage@newdream.net> | 2010-02-17 13:02:49 -0500 |
commit | 4fc51be8fa7043ff9a1e34fef0e99214373332ac (patch) | |
tree | beba3bc83b5a07a16bb63c6e80713cb2b42bf4fb /fs | |
parent | 9794b146fa7b93f8ab74fb62d67fdefad760769f (diff) |
ceph: use rbtree for pg pools; decode new osdmap format
Since we can now create and destroy pg pools, the pool ids will be sparse,
and an array no longer makes sense for looking up by pool id. Use an
rbtree instead.
The OSDMap encoding also no longer has a max pool count (previously used to
allocate the array). There is a new pool_max, which is the largest pool id
we've ever used, although we don't actually need it in the client.
Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ceph/debugfs.c | 7 | ||||
-rw-r--r-- | fs/ceph/osdmap.c | 136 | ||||
-rw-r--r-- | fs/ceph/osdmap.h | 7 | ||||
-rw-r--r-- | fs/ceph/rados.h | 4 |
4 files changed, 104 insertions, 50 deletions
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 1a47b5c25b5f..e159f1415110 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c | |||
@@ -78,6 +78,7 @@ static int osdmap_show(struct seq_file *s, void *p) | |||
78 | { | 78 | { |
79 | int i; | 79 | int i; |
80 | struct ceph_client *client = s->private; | 80 | struct ceph_client *client = s->private; |
81 | struct rb_node *n; | ||
81 | 82 | ||
82 | if (client->osdc.osdmap == NULL) | 83 | if (client->osdc.osdmap == NULL) |
83 | return 0; | 84 | return 0; |
@@ -87,11 +88,11 @@ static int osdmap_show(struct seq_file *s, void *p) | |||
87 | " NEARFULL" : "", | 88 | " NEARFULL" : "", |
88 | (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? | 89 | (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? |
89 | " FULL" : ""); | 90 | " FULL" : ""); |
90 | for (i = 0; i < client->osdc.osdmap->num_pools; i++) { | 91 | for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { |
91 | struct ceph_pg_pool_info *pool = | 92 | struct ceph_pg_pool_info *pool = |
92 | &client->osdc.osdmap->pg_pool[i]; | 93 | rb_entry(n, struct ceph_pg_pool_info, node); |
93 | seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", | 94 | seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", |
94 | i, pool->v.pg_num, pool->pg_num_mask, | 95 | pool->id, pool->v.pg_num, pool->pg_num_mask, |
95 | pool->v.lpg_num, pool->lpg_num_mask); | 96 | pool->v.lpg_num, pool->lpg_num_mask); |
96 | } | 97 | } |
97 | for (i = 0; i < client->osdc.osdmap->max_osd; i++) { | 98 | for (i = 0; i < client->osdc.osdmap->max_osd; i++) { |
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 443fdcdb19c4..34b5696c84fd 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c | |||
@@ -328,9 +328,15 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) | |||
328 | rb_erase(&pg->node, &map->pg_temp); | 328 | rb_erase(&pg->node, &map->pg_temp); |
329 | kfree(pg); | 329 | kfree(pg); |
330 | } | 330 | } |
331 | while (!RB_EMPTY_ROOT(&map->pg_pools)) { | ||
332 | struct ceph_pg_pool_info *pi = | ||
333 | rb_entry(rb_first(&map->pg_pools), | ||
334 | struct ceph_pg_pool_info, node); | ||
335 | rb_erase(&pi->node, &map->pg_pools); | ||
336 | kfree(pi); | ||
337 | } | ||
331 | kfree(map->osd_state); | 338 | kfree(map->osd_state); |
332 | kfree(map->osd_weight); | 339 | kfree(map->osd_weight); |
333 | kfree(map->pg_pool); | ||
334 | kfree(map->osd_addr); | 340 | kfree(map->osd_addr); |
335 | kfree(map); | 341 | kfree(map); |
336 | } | 342 | } |
@@ -433,6 +439,48 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, | |||
433 | } | 439 | } |
434 | 440 | ||
435 | /* | 441 | /* |
442 | * rbtree of pg pool info | ||
443 | */ | ||
444 | static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) | ||
445 | { | ||
446 | struct rb_node **p = &root->rb_node; | ||
447 | struct rb_node *parent = NULL; | ||
448 | struct ceph_pg_pool_info *pi = NULL; | ||
449 | |||
450 | while (*p) { | ||
451 | parent = *p; | ||
452 | pi = rb_entry(parent, struct ceph_pg_pool_info, node); | ||
453 | if (new->id < pi->id) | ||
454 | p = &(*p)->rb_left; | ||
455 | else if (new->id > pi->id) | ||
456 | p = &(*p)->rb_right; | ||
457 | else | ||
458 | return -EEXIST; | ||
459 | } | ||
460 | |||
461 | rb_link_node(&new->node, parent, p); | ||
462 | rb_insert_color(&new->node, root); | ||
463 | return 0; | ||
464 | } | ||
465 | |||
466 | static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) | ||
467 | { | ||
468 | struct ceph_pg_pool_info *pi; | ||
469 | struct rb_node *n = root->rb_node; | ||
470 | |||
471 | while (n) { | ||
472 | pi = rb_entry(n, struct ceph_pg_pool_info, node); | ||
473 | if (id < pi->id) | ||
474 | n = n->rb_left; | ||
475 | else if (id > pi->id) | ||
476 | n = n->rb_right; | ||
477 | else | ||
478 | return pi; | ||
479 | } | ||
480 | return NULL; | ||
481 | } | ||
482 | |||
483 | /* | ||
436 | * decode a full map. | 484 | * decode a full map. |
437 | */ | 485 | */ |
438 | struct ceph_osdmap *osdmap_decode(void **p, void *end) | 486 | struct ceph_osdmap *osdmap_decode(void **p, void *end) |
@@ -443,6 +491,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
443 | u8 ev; | 491 | u8 ev; |
444 | int err = -EINVAL; | 492 | int err = -EINVAL; |
445 | void *start = *p; | 493 | void *start = *p; |
494 | struct ceph_pg_pool_info *pi; | ||
446 | 495 | ||
447 | dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); | 496 | dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); |
448 | 497 | ||
@@ -464,32 +513,27 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
464 | ceph_decode_copy(p, &map->created, sizeof(map->created)); | 513 | ceph_decode_copy(p, &map->created, sizeof(map->created)); |
465 | ceph_decode_copy(p, &map->modified, sizeof(map->modified)); | 514 | ceph_decode_copy(p, &map->modified, sizeof(map->modified)); |
466 | 515 | ||
467 | map->num_pools = ceph_decode_32(p); | ||
468 | map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool), | ||
469 | GFP_NOFS); | ||
470 | if (!map->pg_pool) { | ||
471 | err = -ENOMEM; | ||
472 | goto bad; | ||
473 | } | ||
474 | ceph_decode_32_safe(p, end, max, bad); | 516 | ceph_decode_32_safe(p, end, max, bad); |
475 | while (max--) { | 517 | while (max--) { |
476 | ceph_decode_need(p, end, 4+1+sizeof(map->pg_pool->v), bad); | 518 | ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); |
477 | i = ceph_decode_32(p); | 519 | pi = kmalloc(sizeof(*pi), GFP_NOFS); |
478 | if (i >= map->num_pools) | 520 | if (!pi) |
479 | goto bad; | 521 | goto bad; |
522 | pi->id = ceph_decode_32(p); | ||
480 | ev = ceph_decode_8(p); /* encoding version */ | 523 | ev = ceph_decode_8(p); /* encoding version */ |
481 | if (ev > CEPH_PG_POOL_VERSION) { | 524 | if (ev > CEPH_PG_POOL_VERSION) { |
482 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", | 525 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", |
483 | ev, CEPH_PG_POOL_VERSION); | 526 | ev, CEPH_PG_POOL_VERSION); |
484 | goto bad; | 527 | goto bad; |
485 | } | 528 | } |
486 | ceph_decode_copy(p, &map->pg_pool[i].v, | 529 | ceph_decode_copy(p, &pi->v, sizeof(pi->v)); |
487 | sizeof(map->pg_pool->v)); | 530 | __insert_pg_pool(&map->pg_pools, pi); |
488 | calc_pg_masks(&map->pg_pool[i]); | 531 | calc_pg_masks(pi); |
489 | p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64); | 532 | p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); |
490 | p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals) | 533 | p += le32_to_cpu(pi->v.num_removed_snap_intervals) |
491 | * sizeof(u64) * 2; | 534 | * sizeof(u64) * 2; |
492 | } | 535 | } |
536 | ceph_decode_32_safe(p, end, map->pool_max, bad); | ||
493 | 537 | ||
494 | ceph_decode_32_safe(p, end, map->flags, bad); | 538 | ceph_decode_32_safe(p, end, map->flags, bad); |
495 | 539 | ||
@@ -581,7 +625,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
581 | u32 epoch = 0; | 625 | u32 epoch = 0; |
582 | struct ceph_timespec modified; | 626 | struct ceph_timespec modified; |
583 | u32 len, pool; | 627 | u32 len, pool; |
584 | __s32 new_flags, max; | 628 | __s32 new_pool_max, new_flags, max; |
585 | void *start = *p; | 629 | void *start = *p; |
586 | int err = -EINVAL; | 630 | int err = -EINVAL; |
587 | u16 version; | 631 | u16 version; |
@@ -600,6 +644,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
600 | epoch = ceph_decode_32(p); | 644 | epoch = ceph_decode_32(p); |
601 | BUG_ON(epoch != map->epoch+1); | 645 | BUG_ON(epoch != map->epoch+1); |
602 | ceph_decode_copy(p, &modified, sizeof(modified)); | 646 | ceph_decode_copy(p, &modified, sizeof(modified)); |
647 | new_pool_max = ceph_decode_32(p); | ||
603 | new_flags = ceph_decode_32(p); | 648 | new_flags = ceph_decode_32(p); |
604 | 649 | ||
605 | /* full map? */ | 650 | /* full map? */ |
@@ -623,6 +668,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
623 | /* new flags? */ | 668 | /* new flags? */ |
624 | if (new_flags >= 0) | 669 | if (new_flags >= 0) |
625 | map->flags = new_flags; | 670 | map->flags = new_flags; |
671 | if (new_pool_max >= 0) | ||
672 | map->pool_max = new_pool_max; | ||
626 | 673 | ||
627 | ceph_decode_need(p, end, 5*sizeof(u32), bad); | 674 | ceph_decode_need(p, end, 5*sizeof(u32), bad); |
628 | 675 | ||
@@ -647,37 +694,42 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
647 | ceph_decode_32_safe(p, end, len, bad); | 694 | ceph_decode_32_safe(p, end, len, bad); |
648 | while (len--) { | 695 | while (len--) { |
649 | __u8 ev; | 696 | __u8 ev; |
697 | struct ceph_pg_pool_info *pi; | ||
650 | 698 | ||
651 | ceph_decode_32_safe(p, end, pool, bad); | 699 | ceph_decode_32_safe(p, end, pool, bad); |
652 | if (pool >= map->num_pools) { | 700 | ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); |
653 | void *pg_pool = kcalloc(pool + 1, | ||
654 | sizeof(*map->pg_pool), | ||
655 | GFP_NOFS); | ||
656 | if (!pg_pool) { | ||
657 | err = -ENOMEM; | ||
658 | goto bad; | ||
659 | } | ||
660 | memcpy(pg_pool, map->pg_pool, | ||
661 | map->num_pools * sizeof(*map->pg_pool)); | ||
662 | kfree(map->pg_pool); | ||
663 | map->pg_pool = pg_pool; | ||
664 | map->num_pools = pool+1; | ||
665 | } | ||
666 | ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad); | ||
667 | ev = ceph_decode_8(p); /* encoding version */ | 701 | ev = ceph_decode_8(p); /* encoding version */ |
668 | if (ev > CEPH_PG_POOL_VERSION) { | 702 | if (ev > CEPH_PG_POOL_VERSION) { |
669 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", | 703 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", |
670 | ev, CEPH_PG_POOL_VERSION); | 704 | ev, CEPH_PG_POOL_VERSION); |
671 | goto bad; | 705 | goto bad; |
672 | } | 706 | } |
673 | ceph_decode_copy(p, &map->pg_pool[pool].v, | 707 | pi = __lookup_pg_pool(&map->pg_pools, pool); |
674 | sizeof(map->pg_pool->v)); | 708 | if (!pi) { |
675 | calc_pg_masks(&map->pg_pool[pool]); | 709 | pi = kmalloc(sizeof(*pi), GFP_NOFS); |
710 | if (!pi) { | ||
711 | err = -ENOMEM; | ||
712 | goto bad; | ||
713 | } | ||
714 | pi->id = pool; | ||
715 | __insert_pg_pool(&map->pg_pools, pi); | ||
716 | } | ||
717 | ceph_decode_copy(p, &pi->v, sizeof(pi->v)); | ||
718 | calc_pg_masks(pi); | ||
676 | } | 719 | } |
677 | 720 | ||
678 | /* old_pool (ignore) */ | 721 | /* old_pool */ |
679 | ceph_decode_32_safe(p, end, len, bad); | 722 | ceph_decode_32_safe(p, end, len, bad); |
680 | *p += len * sizeof(u32); | 723 | while (len--) { |
724 | struct ceph_pg_pool_info *pi; | ||
725 | |||
726 | ceph_decode_32_safe(p, end, pool, bad); | ||
727 | pi = __lookup_pg_pool(&map->pg_pools, pool); | ||
728 | if (pi) { | ||
729 | rb_erase(&pi->node, &map->pg_pools); | ||
730 | kfree(pi); | ||
731 | } | ||
732 | } | ||
681 | 733 | ||
682 | /* new_up */ | 734 | /* new_up */ |
683 | err = -EINVAL; | 735 | err = -EINVAL; |
@@ -861,10 +913,10 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, | |||
861 | unsigned ps; | 913 | unsigned ps; |
862 | 914 | ||
863 | BUG_ON(!osdmap); | 915 | BUG_ON(!osdmap); |
864 | if (poolid >= osdmap->num_pools) | ||
865 | return -EIO; | ||
866 | 916 | ||
867 | pool = &osdmap->pg_pool[poolid]; | 917 | pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); |
918 | if (!pool) | ||
919 | return -EIO; | ||
868 | ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); | 920 | ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); |
869 | if (preferred >= 0) { | 921 | if (preferred >= 0) { |
870 | ps += preferred; | 922 | ps += preferred; |
@@ -919,9 +971,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | |||
919 | preferred >= osdmap->crush->max_devices) | 971 | preferred >= osdmap->crush->max_devices) |
920 | preferred = -1; | 972 | preferred = -1; |
921 | 973 | ||
922 | if (poolid >= osdmap->num_pools) | 974 | pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); |
975 | if (!pool) | ||
923 | return NULL; | 976 | return NULL; |
924 | pool = &osdmap->pg_pool[poolid]; | ||
925 | ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, | 977 | ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, |
926 | pool->v.type, pool->v.size); | 978 | pool->v.type, pool->v.size); |
927 | if (ruleno < 0) { | 979 | if (ruleno < 0) { |
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h index c4af8418aa00..1fb55afb2642 100644 --- a/fs/ceph/osdmap.h +++ b/fs/ceph/osdmap.h | |||
@@ -19,6 +19,8 @@ | |||
19 | * the change between two successive epochs, or as a fully encoded map. | 19 | * the change between two successive epochs, or as a fully encoded map. |
20 | */ | 20 | */ |
21 | struct ceph_pg_pool_info { | 21 | struct ceph_pg_pool_info { |
22 | struct rb_node node; | ||
23 | int id; | ||
22 | struct ceph_pg_pool v; | 24 | struct ceph_pg_pool v; |
23 | int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; | 25 | int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; |
24 | }; | 26 | }; |
@@ -44,9 +46,8 @@ struct ceph_osdmap { | |||
44 | struct ceph_entity_addr *osd_addr; | 46 | struct ceph_entity_addr *osd_addr; |
45 | 47 | ||
46 | struct rb_root pg_temp; | 48 | struct rb_root pg_temp; |
47 | 49 | struct rb_root pg_pools; | |
48 | u32 num_pools; | 50 | u32 pool_max; |
49 | struct ceph_pg_pool_info *pg_pool; | ||
50 | 51 | ||
51 | /* the CRUSH map specifies the mapping of placement groups to | 52 | /* the CRUSH map specifies the mapping of placement groups to |
52 | * the list of osds that store+replicate them. */ | 53 | * the list of osds that store+replicate them. */ |
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index 1f4c78640541..26ac8b89a676 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h | |||
@@ -11,8 +11,8 @@ | |||
11 | /* | 11 | /* |
12 | * osdmap encoding versions | 12 | * osdmap encoding versions |
13 | */ | 13 | */ |
14 | #define CEPH_OSDMAP_INC_VERSION 3 | 14 | #define CEPH_OSDMAP_INC_VERSION 4 |
15 | #define CEPH_OSDMAP_VERSION 3 | 15 | #define CEPH_OSDMAP_VERSION 4 |
16 | 16 | ||
17 | /* | 17 | /* |
18 | * fs id | 18 | * fs id |