aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author Sage Weil <sage@newdream.net> 2010-02-16 18:55:03 -0500
committer Sage Weil <sage@newdream.net> 2010-02-17 13:02:49 -0500
commit 4fc51be8fa7043ff9a1e34fef0e99214373332ac (patch)
tree beba3bc83b5a07a16bb63c6e80713cb2b42bf4fb /fs
parent 9794b146fa7b93f8ab74fb62d67fdefad760769f (diff)
ceph: use rbtree for pg pools; decode new osdmap format
Since we can now create and destroy pg pools, the pool ids will be sparse, and an array no longer makes sense for looking up by pool id. Use an rbtree instead. The OSDMap encoding also no longer has a max pool count (previously used to allocate the array). There is a new pool_max, which is the largest pool id we've ever used, although we don't actually need it in the client.

Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'fs')
-rw-r--r-- fs/ceph/debugfs.c 7
-rw-r--r-- fs/ceph/osdmap.c 136
-rw-r--r-- fs/ceph/osdmap.h 7
-rw-r--r-- fs/ceph/rados.h 4
4 files changed, 104 insertions, 50 deletions
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 1a47b5c25b5f..e159f1415110 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -78,6 +78,7 @@ static int osdmap_show(struct seq_file *s, void *p)
78{ 78{
79 int i; 79 int i;
80 struct ceph_client *client = s->private; 80 struct ceph_client *client = s->private;
81 struct rb_node *n;
81 82
82 if (client->osdc.osdmap == NULL) 83 if (client->osdc.osdmap == NULL)
83 return 0; 84 return 0;
@@ -87,11 +88,11 @@ static int osdmap_show(struct seq_file *s, void *p)
87 " NEARFULL" : "", 88 " NEARFULL" : "",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? 89 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
89 " FULL" : ""); 90 " FULL" : "");
90 for (i = 0; i < client->osdc.osdmap->num_pools; i++) { 91 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
91 struct ceph_pg_pool_info *pool = 92 struct ceph_pg_pool_info *pool =
92 &client->osdc.osdmap->pg_pool[i]; 93 rb_entry(n, struct ceph_pg_pool_info, node);
93 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", 94 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
94 i, pool->v.pg_num, pool->pg_num_mask, 95 pool->id, pool->v.pg_num, pool->pg_num_mask,
95 pool->v.lpg_num, pool->lpg_num_mask); 96 pool->v.lpg_num, pool->lpg_num_mask);
96 } 97 }
97 for (i = 0; i < client->osdc.osdmap->max_osd; i++) { 98 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 443fdcdb19c4..34b5696c84fd 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -328,9 +328,15 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
328 rb_erase(&pg->node, &map->pg_temp); 328 rb_erase(&pg->node, &map->pg_temp);
329 kfree(pg); 329 kfree(pg);
330 } 330 }
331 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
332 struct ceph_pg_pool_info *pi =
333 rb_entry(rb_first(&map->pg_pools),
334 struct ceph_pg_pool_info, node);
335 rb_erase(&pi->node, &map->pg_pools);
336 kfree(pi);
337 }
331 kfree(map->osd_state); 338 kfree(map->osd_state);
332 kfree(map->osd_weight); 339 kfree(map->osd_weight);
333 kfree(map->pg_pool);
334 kfree(map->osd_addr); 340 kfree(map->osd_addr);
335 kfree(map); 341 kfree(map);
336} 342}
@@ -433,6 +439,48 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
433} 439}
434 440
435/* 441/*
442 * rbtree of pg pool info
443 */
444static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
445{
446 struct rb_node **p = &root->rb_node;
447 struct rb_node *parent = NULL;
448 struct ceph_pg_pool_info *pi = NULL;
449
450 while (*p) {
451 parent = *p;
452 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
453 if (new->id < pi->id)
454 p = &(*p)->rb_left;
455 else if (new->id > pi->id)
456 p = &(*p)->rb_right;
457 else
458 return -EEXIST;
459 }
460
461 rb_link_node(&new->node, parent, p);
462 rb_insert_color(&new->node, root);
463 return 0;
464}
465
466static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
467{
468 struct ceph_pg_pool_info *pi;
469 struct rb_node *n = root->rb_node;
470
471 while (n) {
472 pi = rb_entry(n, struct ceph_pg_pool_info, node);
473 if (id < pi->id)
474 n = n->rb_left;
475 else if (id > pi->id)
476 n = n->rb_right;
477 else
478 return pi;
479 }
480 return NULL;
481}
482
483/*
436 * decode a full map. 484 * decode a full map.
437 */ 485 */
438struct ceph_osdmap *osdmap_decode(void **p, void *end) 486struct ceph_osdmap *osdmap_decode(void **p, void *end)
@@ -443,6 +491,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
443 u8 ev; 491 u8 ev;
444 int err = -EINVAL; 492 int err = -EINVAL;
445 void *start = *p; 493 void *start = *p;
494 struct ceph_pg_pool_info *pi;
446 495
447 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); 496 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
448 497
@@ -464,32 +513,27 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
464 ceph_decode_copy(p, &map->created, sizeof(map->created)); 513 ceph_decode_copy(p, &map->created, sizeof(map->created));
465 ceph_decode_copy(p, &map->modified, sizeof(map->modified)); 514 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
466 515
467 map->num_pools = ceph_decode_32(p);
468 map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool),
469 GFP_NOFS);
470 if (!map->pg_pool) {
471 err = -ENOMEM;
472 goto bad;
473 }
474 ceph_decode_32_safe(p, end, max, bad); 516 ceph_decode_32_safe(p, end, max, bad);
475 while (max--) { 517 while (max--) {
476 ceph_decode_need(p, end, 4+1+sizeof(map->pg_pool->v), bad); 518 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
477 i = ceph_decode_32(p); 519 pi = kmalloc(sizeof(*pi), GFP_NOFS);
478 if (i >= map->num_pools) 520 if (!pi)
479 goto bad; 521 goto bad;
522 pi->id = ceph_decode_32(p);
480 ev = ceph_decode_8(p); /* encoding version */ 523 ev = ceph_decode_8(p); /* encoding version */
481 if (ev > CEPH_PG_POOL_VERSION) { 524 if (ev > CEPH_PG_POOL_VERSION) {
482 pr_warning("got unknown v %d > %d of ceph_pg_pool\n", 525 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
483 ev, CEPH_PG_POOL_VERSION); 526 ev, CEPH_PG_POOL_VERSION);
484 goto bad; 527 goto bad;
485 } 528 }
486 ceph_decode_copy(p, &map->pg_pool[i].v, 529 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
487 sizeof(map->pg_pool->v)); 530 __insert_pg_pool(&map->pg_pools, pi);
488 calc_pg_masks(&map->pg_pool[i]); 531 calc_pg_masks(pi);
489 p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64); 532 p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
490 p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals) 533 p += le32_to_cpu(pi->v.num_removed_snap_intervals)
491 * sizeof(u64) * 2; 534 * sizeof(u64) * 2;
492 } 535 }
536 ceph_decode_32_safe(p, end, map->pool_max, bad);
493 537
494 ceph_decode_32_safe(p, end, map->flags, bad); 538 ceph_decode_32_safe(p, end, map->flags, bad);
495 539
@@ -581,7 +625,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
581 u32 epoch = 0; 625 u32 epoch = 0;
582 struct ceph_timespec modified; 626 struct ceph_timespec modified;
583 u32 len, pool; 627 u32 len, pool;
584 __s32 new_flags, max; 628 __s32 new_pool_max, new_flags, max;
585 void *start = *p; 629 void *start = *p;
586 int err = -EINVAL; 630 int err = -EINVAL;
587 u16 version; 631 u16 version;
@@ -600,6 +644,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
600 epoch = ceph_decode_32(p); 644 epoch = ceph_decode_32(p);
601 BUG_ON(epoch != map->epoch+1); 645 BUG_ON(epoch != map->epoch+1);
602 ceph_decode_copy(p, &modified, sizeof(modified)); 646 ceph_decode_copy(p, &modified, sizeof(modified));
647 new_pool_max = ceph_decode_32(p);
603 new_flags = ceph_decode_32(p); 648 new_flags = ceph_decode_32(p);
604 649
605 /* full map? */ 650 /* full map? */
@@ -623,6 +668,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
623 /* new flags? */ 668 /* new flags? */
624 if (new_flags >= 0) 669 if (new_flags >= 0)
625 map->flags = new_flags; 670 map->flags = new_flags;
671 if (new_pool_max >= 0)
672 map->pool_max = new_pool_max;
626 673
627 ceph_decode_need(p, end, 5*sizeof(u32), bad); 674 ceph_decode_need(p, end, 5*sizeof(u32), bad);
628 675
@@ -647,37 +694,42 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
647 ceph_decode_32_safe(p, end, len, bad); 694 ceph_decode_32_safe(p, end, len, bad);
648 while (len--) { 695 while (len--) {
649 __u8 ev; 696 __u8 ev;
697 struct ceph_pg_pool_info *pi;
650 698
651 ceph_decode_32_safe(p, end, pool, bad); 699 ceph_decode_32_safe(p, end, pool, bad);
652 if (pool >= map->num_pools) { 700 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
653 void *pg_pool = kcalloc(pool + 1,
654 sizeof(*map->pg_pool),
655 GFP_NOFS);
656 if (!pg_pool) {
657 err = -ENOMEM;
658 goto bad;
659 }
660 memcpy(pg_pool, map->pg_pool,
661 map->num_pools * sizeof(*map->pg_pool));
662 kfree(map->pg_pool);
663 map->pg_pool = pg_pool;
664 map->num_pools = pool+1;
665 }
666 ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad);
667 ev = ceph_decode_8(p); /* encoding version */ 701 ev = ceph_decode_8(p); /* encoding version */
668 if (ev > CEPH_PG_POOL_VERSION) { 702 if (ev > CEPH_PG_POOL_VERSION) {
669 pr_warning("got unknown v %d > %d of ceph_pg_pool\n", 703 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
670 ev, CEPH_PG_POOL_VERSION); 704 ev, CEPH_PG_POOL_VERSION);
671 goto bad; 705 goto bad;
672 } 706 }
673 ceph_decode_copy(p, &map->pg_pool[pool].v, 707 pi = __lookup_pg_pool(&map->pg_pools, pool);
674 sizeof(map->pg_pool->v)); 708 if (!pi) {
675 calc_pg_masks(&map->pg_pool[pool]); 709 pi = kmalloc(sizeof(*pi), GFP_NOFS);
710 if (!pi) {
711 err = -ENOMEM;
712 goto bad;
713 }
714 pi->id = pool;
715 __insert_pg_pool(&map->pg_pools, pi);
716 }
717 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
718 calc_pg_masks(pi);
676 } 719 }
677 720
678 /* old_pool (ignore) */ 721 /* old_pool */
679 ceph_decode_32_safe(p, end, len, bad); 722 ceph_decode_32_safe(p, end, len, bad);
680 *p += len * sizeof(u32); 723 while (len--) {
724 struct ceph_pg_pool_info *pi;
725
726 ceph_decode_32_safe(p, end, pool, bad);
727 pi = __lookup_pg_pool(&map->pg_pools, pool);
728 if (pi) {
729 rb_erase(&pi->node, &map->pg_pools);
730 kfree(pi);
731 }
732 }
681 733
682 /* new_up */ 734 /* new_up */
683 err = -EINVAL; 735 err = -EINVAL;
@@ -861,10 +913,10 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
861 unsigned ps; 913 unsigned ps;
862 914
863 BUG_ON(!osdmap); 915 BUG_ON(!osdmap);
864 if (poolid >= osdmap->num_pools)
865 return -EIO;
866 916
867 pool = &osdmap->pg_pool[poolid]; 917 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
918 if (!pool)
919 return -EIO;
868 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); 920 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
869 if (preferred >= 0) { 921 if (preferred >= 0) {
870 ps += preferred; 922 ps += preferred;
@@ -919,9 +971,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
919 preferred >= osdmap->crush->max_devices) 971 preferred >= osdmap->crush->max_devices)
920 preferred = -1; 972 preferred = -1;
921 973
922 if (poolid >= osdmap->num_pools) 974 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
975 if (!pool)
923 return NULL; 976 return NULL;
924 pool = &osdmap->pg_pool[poolid];
925 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, 977 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
926 pool->v.type, pool->v.size); 978 pool->v.type, pool->v.size);
927 if (ruleno < 0) { 979 if (ruleno < 0) {
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index c4af8418aa00..1fb55afb2642 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -19,6 +19,8 @@
19 * the change between two successive epochs, or as a fully encoded map. 19 * the change between two successive epochs, or as a fully encoded map.
20 */ 20 */
21struct ceph_pg_pool_info { 21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
22 struct ceph_pg_pool v; 24 struct ceph_pg_pool v;
23 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; 25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
24}; 26};
@@ -44,9 +46,8 @@ struct ceph_osdmap {
44 struct ceph_entity_addr *osd_addr; 46 struct ceph_entity_addr *osd_addr;
45 47
46 struct rb_root pg_temp; 48 struct rb_root pg_temp;
47 49 struct rb_root pg_pools;
48 u32 num_pools; 50 u32 pool_max;
49 struct ceph_pg_pool_info *pg_pool;
50 51
51 /* the CRUSH map specifies the mapping of placement groups to 52 /* the CRUSH map specifies the mapping of placement groups to
52 * the list of osds that store+replicate them. */ 53 * the list of osds that store+replicate them. */
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 1f4c78640541..26ac8b89a676 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -11,8 +11,8 @@
11/* 11/*
12 * osdmap encoding versions 12 * osdmap encoding versions
13 */ 13 */
14#define CEPH_OSDMAP_INC_VERSION 3 14#define CEPH_OSDMAP_INC_VERSION 4
15#define CEPH_OSDMAP_VERSION 3 15#define CEPH_OSDMAP_VERSION 4
16 16
17/* 17/*
18 * fs id 18 * fs id