From f24e9980eb860d8600cbe5ef3d2fd9295320d229 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 6 Oct 2009 11:31:10 -0700 Subject: ceph: OSD client The OSD client is responsible for reading and writing data from/to the object storage pool. This includes determining where objects are stored in the cluster, and ensuring that requests are retried or redirected in the event of a node failure or data migration. If an OSD does not respond before a timeout expires, keepalive messages are sent across the lossless, ordered communications channel to ensure that any break in the TCP is discovered. If the session does reset, a reconnection is attempted and affected requests are resent (by the message transport layer). Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 875 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 875 insertions(+) create mode 100644 fs/ceph/osdmap.c (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c new file mode 100644 index 000000000000..e38fe6309b1c --- /dev/null +++ b/fs/ceph/osdmap.c @@ -0,0 +1,875 @@ + +#include + +#include "super.h" +#include "osdmap.h" +#include "crush/hash.h" +#include "crush/mapper.h" +#include "decode.h" +#include "ceph_debug.h" + +char *ceph_osdmap_state_str(char *str, int len, int state) +{ + int flag = 0; + + if (!len) + goto done; + + *str = '\0'; + if (state) { + if (state & CEPH_OSD_EXISTS) { + snprintf(str, len, "exists"); + flag = 1; + } + if (state & CEPH_OSD_UP) { + snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""), + "up"); + flag = 1; + } + } else { + snprintf(str, len, "doesn't exist"); + } +done: + return str; +} + +/* maps */ + +static int calc_bits_of(unsigned t) +{ + int b = 0; + while (t) { + t = t >> 1; + b++; + } + return b; +} + +/* + * the foo_mask is the smallest value 2^n-1 that is >= foo. + */ +static void calc_pg_masks(struct ceph_pg_pool_info *pi) +{ + pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; + pi->pgp_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; + pi->lpg_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; + pi->lpgp_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; +} + +/* + * decode crush map + */ +static int crush_decode_uniform_bucket(void **p, void *end, + struct crush_bucket_uniform *b) +{ + dout("crush_decode_uniform_bucket %p to %p\n", *p, end); + ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); + ceph_decode_32(p, b->item_weight); + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_list_bucket(void **p, void *end, + struct crush_bucket_list *b) +{ + int j; + dout("crush_decode_list_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->sum_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) { + ceph_decode_32(p, b->item_weights[j]); + ceph_decode_32(p, b->sum_weights[j]); + } + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_tree_bucket(void **p, void *end, + struct crush_bucket_tree *b) +{ + int j; + dout("crush_decode_tree_bucket %p to %p\n", *p, end); + ceph_decode_32_safe(p, end, b->num_nodes, bad); + b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); + if (b->node_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); + for (j = 0; j < b->num_nodes; j++) + ceph_decode_32(p, b->node_weights[j]); + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_straw_bucket(void **p, void *end, + struct crush_bucket_straw *b) +{ + int j; + dout("crush_decode_straw_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->straws == NULL) + return -ENOMEM; + ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) { + ceph_decode_32(p, b->item_weights[j]); + ceph_decode_32(p, b->straws[j]); + } + return 0; +bad: + return -EINVAL; +} + +static struct crush_map *crush_decode(void *pbyval, void *end) +{ + struct crush_map *c; + int err = -EINVAL; + int i, j; + void **p = &pbyval; + void *start = pbyval; + u32 magic; + + dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); + + c = kzalloc(sizeof(*c), GFP_NOFS); + if (c == NULL) + return ERR_PTR(-ENOMEM); + + ceph_decode_need(p, end, 4*sizeof(u32), bad); + ceph_decode_32(p, magic); + if (magic != CRUSH_MAGIC) { + pr_err("crush_decode magic %x != current %x\n", + (unsigned)magic, (unsigned)CRUSH_MAGIC); + goto bad; + } + ceph_decode_32(p, c->max_buckets); + ceph_decode_32(p, c->max_rules); + ceph_decode_32(p, c->max_devices); + + c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS); + if (c->device_parents == NULL) + goto badmem; + c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS); + if (c->bucket_parents == NULL) + goto badmem; + + c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); + if (c->buckets == NULL) + goto badmem; + c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); + if (c->rules == NULL) + goto badmem; + + /* buckets */ + for (i = 0; i < c->max_buckets; i++) { + int size = 0; + u32 alg; + struct crush_bucket *b; + + ceph_decode_32_safe(p, end, alg, bad); + if (alg == 0) { + c->buckets[i] = NULL; + continue; + } + dout("crush_decode bucket %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + + switch (alg) { + case CRUSH_BUCKET_UNIFORM: + size = sizeof(struct crush_bucket_uniform); + break; + case CRUSH_BUCKET_LIST: + size = sizeof(struct crush_bucket_list); + break; + case CRUSH_BUCKET_TREE: + size = sizeof(struct crush_bucket_tree); + break; + case CRUSH_BUCKET_STRAW: + size = sizeof(struct crush_bucket_straw); + break; + default: + goto bad; + } + BUG_ON(size == 0); + b = c->buckets[i] = kzalloc(size, GFP_NOFS); + if (b == NULL) + goto badmem; + + ceph_decode_need(p, end, 4*sizeof(u32), bad); + ceph_decode_32(p, b->id); + ceph_decode_16(p, b->type); + ceph_decode_16(p, b->alg); + ceph_decode_32(p, b->weight); + ceph_decode_32(p, b->size); + + dout("crush_decode bucket size %d off %x %p to %p\n", + b->size, (int)(*p-start), *p, end); + + b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); + if (b->items == NULL) + goto badmem; + b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); + if (b->perm == NULL) + goto badmem; + b->perm_n = 0; + + ceph_decode_need(p, end, b->size*sizeof(u32), bad); + for (j = 0; j < b->size; j++) + ceph_decode_32(p, b->items[j]); + + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + err = crush_decode_uniform_bucket(p, end, + (struct crush_bucket_uniform *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_LIST: + err = crush_decode_list_bucket(p, end, + (struct crush_bucket_list *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_TREE: + err = crush_decode_tree_bucket(p, end, + (struct crush_bucket_tree *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_STRAW: + err = crush_decode_straw_bucket(p, end, + (struct crush_bucket_straw *)b); + if (err < 0) + goto bad; + break; + } + } + + /* rules */ + dout("rule vec is %p\n", c->rules); + for (i = 0; i < c->max_rules; i++) { + u32 yes; + struct crush_rule *r; + + ceph_decode_32_safe(p, end, yes, bad); + if (!yes) { + dout("crush_decode NO rule %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + c->rules[i] = NULL; + continue; + } + + dout("crush_decode rule %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + + /* len */ + ceph_decode_32_safe(p, end, yes, bad); +#if BITS_PER_LONG == 32 + if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) + goto bad; +#endif + r = c->rules[i] = kmalloc(sizeof(*r) + + yes*sizeof(struct crush_rule_step), + GFP_NOFS); + if (r == NULL) + goto badmem; + dout(" rule %d is at %p\n", i, r); + r->len = yes; + ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ + ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); + for (j = 0; j < r->len; j++) { + ceph_decode_32(p, r->steps[j].op); + ceph_decode_32(p, r->steps[j].arg1); + ceph_decode_32(p, r->steps[j].arg2); + } + } + + /* ignore trailing name maps. */ + + dout("crush_decode success\n"); + return c; + +badmem: + err = -ENOMEM; +bad: + dout("crush_decode fail %d\n", err); + crush_destroy(c); + return ERR_PTR(err); +} + + +/* + * osd map + */ +void ceph_osdmap_destroy(struct ceph_osdmap *map) +{ + dout("osdmap_destroy %p\n", map); + if (map->crush) + crush_destroy(map->crush); + while (!RB_EMPTY_ROOT(&map->pg_temp)) + rb_erase(rb_first(&map->pg_temp), &map->pg_temp); + kfree(map->osd_state); + kfree(map->osd_weight); + kfree(map->pg_pool); + kfree(map->osd_addr); + kfree(map); +} + +/* + * adjust max osd value. reallocate arrays. + */ +static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +{ + u8 *state; + struct ceph_entity_addr *addr; + u32 *weight; + + state = kcalloc(max, sizeof(*state), GFP_NOFS); + addr = kcalloc(max, sizeof(*addr), GFP_NOFS); + weight = kcalloc(max, sizeof(*weight), GFP_NOFS); + if (state == NULL || addr == NULL || weight == NULL) { + kfree(state); + kfree(addr); + kfree(weight); + return -ENOMEM; + } + + /* copy old? */ + if (map->osd_state) { + memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); + memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); + memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); + kfree(map->osd_state); + kfree(map->osd_addr); + kfree(map->osd_weight); + } + + map->osd_state = state; + map->osd_weight = weight; + map->osd_addr = addr; + map->max_osd = max; + return 0; +} + +/* + * Insert a new pg_temp mapping + */ +static void __insert_pg_mapping(struct ceph_pg_mapping *new, + struct rb_root *root) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_pg_mapping *pg = NULL; + + while (*p) { + parent = *p; + pg = rb_entry(parent, struct ceph_pg_mapping, node); + if (new->pgid < pg->pgid) + p = &(*p)->rb_left; + else if (new->pgid > pg->pgid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); +} + +/* + * decode a full map. + */ +struct ceph_osdmap *osdmap_decode(void **p, void *end) +{ + struct ceph_osdmap *map; + u16 version; + u32 len, max, i; + int err = -EINVAL; + void *start = *p; + + dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (map == NULL) + return ERR_PTR(-ENOMEM); + map->pg_temp = RB_ROOT; + + ceph_decode_16_safe(p, end, version, bad); + + ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); + ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); + ceph_decode_32(p, map->epoch); + ceph_decode_copy(p, &map->created, sizeof(map->created)); + ceph_decode_copy(p, &map->modified, sizeof(map->modified)); + + ceph_decode_32(p, map->num_pools); + map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool), + GFP_NOFS); + if (!map->pg_pool) { + err = -ENOMEM; + goto bad; + } + ceph_decode_32_safe(p, end, max, bad); + while (max--) { + ceph_decode_need(p, end, 4+sizeof(map->pg_pool->v), bad); + ceph_decode_32(p, i); + if (i >= map->num_pools) + goto bad; + ceph_decode_copy(p, &map->pg_pool[i].v, + sizeof(map->pg_pool->v)); + calc_pg_masks(&map->pg_pool[i]); + p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64); + p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals) + * sizeof(u64) * 2; + } + + ceph_decode_32_safe(p, end, map->flags, bad); + + ceph_decode_32(p, max); + + /* (re)alloc osd arrays */ + err = osdmap_set_max_osd(map, max); + if (err < 0) + goto bad; + dout("osdmap_decode max_osd = %d\n", map->max_osd); + + /* osds */ + err = -EINVAL; + ceph_decode_need(p, end, 3*sizeof(u32) + + map->max_osd*(1 + sizeof(*map->osd_weight) + + sizeof(*map->osd_addr)), bad); + *p += 4; /* skip length field (should match max) */ + ceph_decode_copy(p, map->osd_state, map->max_osd); + + *p += 4; /* skip length field (should match max) */ + for (i = 0; i < map->max_osd; i++) + ceph_decode_32(p, map->osd_weight[i]); + + *p += 4; /* skip length field (should match max) */ + ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); + + /* pg_temp */ + ceph_decode_32_safe(p, end, len, bad); + for (i = 0; i < len; i++) { + int n, j; + u64 pgid; + struct ceph_pg_mapping *pg; + + ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); + ceph_decode_64(p, pgid); + ceph_decode_32(p, n); + ceph_decode_need(p, end, n * sizeof(u32), bad); + pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); + if (!pg) { + err = -ENOMEM; + goto bad; + } + pg->pgid = pgid; + pg->len = n; + for (j = 0; j < n; j++) + ceph_decode_32(p, pg->osds[j]); + + __insert_pg_mapping(pg, &map->pg_temp); + dout(" added pg_temp %llx len %d\n", pgid, len); + } + + /* crush */ + ceph_decode_32_safe(p, end, len, bad); + dout("osdmap_decode crush len %d from off 0x%x\n", len, + (int)(*p - start)); + ceph_decode_need(p, end, len, bad); + map->crush = crush_decode(*p, end); + *p += len; + if (IS_ERR(map->crush)) { + err = PTR_ERR(map->crush); + map->crush = NULL; + goto bad; + } + + /* ignore the rest of the map */ + *p = end; + + dout("osdmap_decode done %p %p\n", *p, end); + return map; + +bad: + dout("osdmap_decode fail\n"); + ceph_osdmap_destroy(map); + return ERR_PTR(err); +} + +/* + * decode and apply an incremental map update. + */ +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, + struct ceph_osdmap *map, + struct ceph_messenger *msgr) +{ + struct ceph_osdmap *newmap = map; + struct crush_map *newcrush = NULL; + struct ceph_fsid fsid; + u32 epoch = 0; + struct ceph_timespec modified; + u32 len, pool; + __s32 new_flags, max; + void *start = *p; + int err = -EINVAL; + u16 version; + struct rb_node *rbp; + + ceph_decode_16_safe(p, end, version, bad); + + ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), + bad); + ceph_decode_copy(p, &fsid, sizeof(fsid)); + ceph_decode_32(p, epoch); + BUG_ON(epoch != map->epoch+1); + ceph_decode_copy(p, &modified, sizeof(modified)); + ceph_decode_32(p, new_flags); + + /* full map? */ + ceph_decode_32_safe(p, end, len, bad); + if (len > 0) { + dout("apply_incremental full map len %d, %p to %p\n", + len, *p, end); + newmap = osdmap_decode(p, min(*p+len, end)); + return newmap; /* error or not */ + } + + /* new crush? */ + ceph_decode_32_safe(p, end, len, bad); + if (len > 0) { + dout("apply_incremental new crush map len %d, %p to %p\n", + len, *p, end); + newcrush = crush_decode(*p, min(*p+len, end)); + if (IS_ERR(newcrush)) + return ERR_PTR(PTR_ERR(newcrush)); + } + + /* new flags? */ + if (new_flags >= 0) + map->flags = new_flags; + + ceph_decode_need(p, end, 5*sizeof(u32), bad); + + /* new max? */ + ceph_decode_32(p, max); + if (max >= 0) { + err = osdmap_set_max_osd(map, max); + if (err < 0) + goto bad; + } + + map->epoch++; + map->modified = map->modified; + if (newcrush) { + if (map->crush) + crush_destroy(map->crush); + map->crush = newcrush; + newcrush = NULL; + } + + /* new_pool */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + ceph_decode_32_safe(p, end, pool, bad); + if (pool >= map->num_pools) { + void *pg_pool = kcalloc(pool + 1, + sizeof(*map->pg_pool), + GFP_NOFS); + if (!pg_pool) { + err = -ENOMEM; + goto bad; + } + memcpy(pg_pool, map->pg_pool, + map->num_pools * sizeof(*map->pg_pool)); + kfree(map->pg_pool); + map->pg_pool = pg_pool; + map->num_pools = pool+1; + } + ceph_decode_copy(p, &map->pg_pool[pool].v, + sizeof(map->pg_pool->v)); + calc_pg_masks(&map->pg_pool[pool]); + } + + /* old_pool (ignore) */ + ceph_decode_32_safe(p, end, len, bad); + *p += len * sizeof(u32); + + /* new_up */ + err = -EINVAL; + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + u32 osd; + struct ceph_entity_addr addr; + ceph_decode_32_safe(p, end, osd, bad); + ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); + pr_info("osd%d up\n", osd); + BUG_ON(osd >= map->max_osd); + map->osd_state[osd] |= CEPH_OSD_UP; + map->osd_addr[osd] = addr; + } + + /* new_down */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + u32 osd; + ceph_decode_32_safe(p, end, osd, bad); + (*p)++; /* clean flag */ + pr_info("ceph osd%d down\n", osd); + if (osd < map->max_osd) + map->osd_state[osd] &= ~CEPH_OSD_UP; + } + + /* new_weight */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + u32 osd, off; + ceph_decode_need(p, end, sizeof(u32)*2, bad); + ceph_decode_32(p, osd); + ceph_decode_32(p, off); + pr_info("osd%d weight 0x%x %s\n", osd, off, + off == CEPH_OSD_IN ? "(in)" : + (off == CEPH_OSD_OUT ? "(out)" : "")); + if (osd < map->max_osd) + map->osd_weight[osd] = off; + } + + /* new_pg_temp */ + rbp = rb_first(&map->pg_temp); + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + struct ceph_pg_mapping *pg; + int j; + u64 pgid; + u32 pglen; + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); + ceph_decode_64(p, pgid); + ceph_decode_32(p, pglen); + + /* remove any? */ + while (rbp && rb_entry(rbp, struct ceph_pg_mapping, + node)->pgid <= pgid) { + struct rb_node *cur = rbp; + rbp = rb_next(rbp); + dout(" removed pg_temp %llx\n", + rb_entry(cur, struct ceph_pg_mapping, node)->pgid); + rb_erase(cur, &map->pg_temp); + } + + if (pglen) { + /* insert */ + ceph_decode_need(p, end, pglen*sizeof(u32), bad); + pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); + if (!pg) { + err = -ENOMEM; + goto bad; + } + pg->pgid = pgid; + pg->len = pglen; + for (j = 0; j < len; j++) + ceph_decode_32(p, pg->osds[j]); + __insert_pg_mapping(pg, &map->pg_temp); + dout(" added pg_temp %llx len %d\n", pgid, pglen); + } + } + while (rbp) { + struct rb_node *cur = rbp; + rbp = rb_next(rbp); + dout(" removed pg_temp %llx\n", + rb_entry(cur, struct ceph_pg_mapping, node)->pgid); + rb_erase(cur, &map->pg_temp); + } + + /* ignore the rest */ + *p = end; + return map; + +bad: + pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", + epoch, (int)(*p - start), *p, start, end); + if (newcrush) + crush_destroy(newcrush); + return ERR_PTR(err); +} + + + + +/* + * calculate file layout from given offset, length. + * fill in correct oid, logical length, and object extent + * offset, length. + * + * for now, we write only a single su, until we can + * pass a stride back to the caller. + */ +void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, + u64 off, u64 *plen, + u64 *bno, + u64 *oxoff, u64 *oxlen) +{ + u32 osize = le32_to_cpu(layout->fl_object_size); + u32 su = le32_to_cpu(layout->fl_stripe_unit); + u32 sc = le32_to_cpu(layout->fl_stripe_count); + u32 bl, stripeno, stripepos, objsetno; + u32 su_per_object; + u64 t; + + dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, + osize, su); + su_per_object = osize / le32_to_cpu(layout->fl_stripe_unit); + dout("osize %u / su %u = su_per_object %u\n", osize, su, + su_per_object); + + BUG_ON((su & ~PAGE_MASK) != 0); + /* bl = *off / su; */ + t = off; + do_div(t, su); + bl = t; + dout("off %llu / su %u = bl %u\n", off, su, bl); + + stripeno = bl / sc; + stripepos = bl % sc; + objsetno = stripeno / su_per_object; + + *bno = objsetno * sc + stripepos; + dout("objset %u * sc %u = bno %u\n", objsetno, sc, (unsigned)*bno); + /* *oxoff = *off / layout->fl_stripe_unit; */ + t = off; + *oxoff = do_div(t, su); + *oxlen = min_t(u64, *plen, su - *oxoff); + *plen = *oxlen; + + dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); +} + +/* + * calculate an object layout (i.e. pgid) from an oid, + * file_layout, and osdmap + */ +int ceph_calc_object_layout(struct ceph_object_layout *ol, + const char *oid, + struct ceph_file_layout *fl, + struct ceph_osdmap *osdmap) +{ + unsigned num, num_mask; + union ceph_pg pgid; + s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred); + int poolid = le32_to_cpu(fl->fl_pg_pool); + struct ceph_pg_pool_info *pool; + + if (poolid >= osdmap->num_pools) + return -EIO; + pool = &osdmap->pg_pool[poolid]; + + if (preferred >= 0) { + num = le32_to_cpu(pool->v.lpg_num); + num_mask = pool->lpg_num_mask; + } else { + num = le32_to_cpu(pool->v.pg_num); + num_mask = pool->pg_num_mask; + } + + pgid.pg64 = 0; /* start with it zeroed out */ + pgid.pg.ps = ceph_full_name_hash(oid, strlen(oid)); + pgid.pg.preferred = preferred; + pgid.pg.pool = le32_to_cpu(fl->fl_pg_pool); + if (preferred >= 0) + dout("calc_object_layout '%s' pgid %d.%xp%d (%llx)\n", oid, + pgid.pg.pool, pgid.pg.ps, (int)preferred, pgid.pg64); + else + dout("calc_object_layout '%s' pgid %d.%x (%llx)\n", oid, + pgid.pg.pool, pgid.pg.ps, pgid.pg64); + + ol->ol_pgid = cpu_to_le64(pgid.pg64); + ol->ol_stripe_unit = fl->fl_object_stripe_unit; + + return 0; +} + +/* + * Calculate raw osd vector for the given pgid. Return pointer to osd + * array, or NULL on failure. + */ +static int *calc_pg_raw(struct ceph_osdmap *osdmap, union ceph_pg pgid, + int *osds, int *num) +{ + struct rb_node *n = osdmap->pg_temp.rb_node; + struct ceph_pg_mapping *pg; + struct ceph_pg_pool_info *pool; + int ruleno; + unsigned pps; /* placement ps */ + + /* pg_temp? */ + while (n) { + pg = rb_entry(n, struct ceph_pg_mapping, node); + if (pgid.pg64 < pg->pgid) + n = n->rb_left; + else if (pgid.pg64 > pg->pgid) + n = n->rb_right; + else { + *num = pg->len; + return pg->osds; + } + } + + /* crush */ + if (pgid.pg.pool >= osdmap->num_pools) + return NULL; + pool = &osdmap->pg_pool[pgid.pg.pool]; + ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, + pool->v.type, pool->v.size); + if (ruleno < 0) { + pr_err("no crush rule pool %d type %d size %d\n", + pgid.pg.pool, pool->v.type, pool->v.size); + return NULL; + } + + if (pgid.pg.preferred >= 0) + pps = ceph_stable_mod(pgid.pg.ps, + le32_to_cpu(pool->v.lpgp_num), + pool->lpgp_num_mask); + else + pps = ceph_stable_mod(pgid.pg.ps, + le32_to_cpu(pool->v.pgp_num), + pool->pgp_num_mask); + pps += pgid.pg.pool; + *num = crush_do_rule(osdmap->crush, ruleno, pps, osds, + min_t(int, pool->v.size, *num), + pgid.pg.preferred, osdmap->osd_weight); + return osds; +} + +/* + * Return primary osd for given pgid, or -1 if none. + */ +int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, union ceph_pg pgid) +{ + int rawosds[10], *osds; + int i, num = ARRAY_SIZE(rawosds); + + osds = calc_pg_raw(osdmap, pgid, rawosds, &num); + if (!osds) + return -1; + + /* primary is first up osd */ + for (i = 0; i < num; i++) + if (ceph_osd_is_up(osdmap, osds[i])) { + return osds[i]; + break; + } + return -1; +} -- cgit v1.2.2 From 991abb6ecfc8edf9863aa6a3f43249e63f9d4d4e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 8 Oct 2009 22:22:37 -0700 Subject: ceph: fail gracefully on corrupt osdmap (bad pg_temp mapping) Return an error and report a corrupt map instead of crying BUG(). Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index e38fe6309b1c..342e5f80996b 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -366,8 +366,8 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) /* * Insert a new pg_temp mapping */ -static void __insert_pg_mapping(struct ceph_pg_mapping *new, - struct rb_root *root) +static int __insert_pg_mapping(struct ceph_pg_mapping *new, + struct rb_root *root) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -381,11 +381,12 @@ static void __insert_pg_mapping(struct ceph_pg_mapping *new, else if (new->pgid > pg->pgid) p = &(*p)->rb_right; else - BUG(); + return -EEXIST; } rb_link_node(&new->node, parent, p); rb_insert_color(&new->node, root); + return 0; } /* @@ -481,7 +482,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) for (j = 0; j < n; j++) ceph_decode_32(p, pg->osds[j]); - __insert_pg_mapping(pg, &map->pg_temp); + err = __insert_pg_mapping(pg, &map->pg_temp); + if (err) + goto bad; dout(" added pg_temp %llx len %d\n", pgid, len); } @@ -681,7 +684,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, pg->len = pglen; for (j = 0; j < len; j++) ceph_decode_32(p, pg->osds[j]); - __insert_pg_mapping(pg, &map->pg_temp); + err = __insert_pg_mapping(pg, &map->pg_temp); + if (err) + goto bad; dout(" added pg_temp %llx len %d\n", pgid, pglen); } } -- cgit v1.2.2 From c89136ea4253c73e89e97f5138bb22d97ad9f564 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 14 Oct 2009 09:59:09 -0700 Subject: ceph: convert encode/decode macros to inlines This avoids the fugly pass by reference and makes the code a bit easier to read. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 70 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 342e5f80996b..6f0aeff4185a 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -67,7 +67,7 @@ static int crush_decode_uniform_bucket(void **p, void *end, { dout("crush_decode_uniform_bucket %p to %p\n", *p, end); ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); - ceph_decode_32(p, b->item_weight); + b->item_weight = ceph_decode_32(p); return 0; bad: return -EINVAL; @@ -86,8 +86,8 @@ static int crush_decode_list_bucket(void **p, void *end, return -ENOMEM; ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); for (j = 0; j < b->h.size; j++) { - ceph_decode_32(p, b->item_weights[j]); - ceph_decode_32(p, b->sum_weights[j]); + b->item_weights[j] = ceph_decode_32(p); + b->sum_weights[j] = ceph_decode_32(p); } return 0; bad: @@ -105,7 +105,7 @@ static int crush_decode_tree_bucket(void **p, void *end, return -ENOMEM; ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); for (j = 0; j < b->num_nodes; j++) - ceph_decode_32(p, b->node_weights[j]); + b->node_weights[j] = ceph_decode_32(p); return 0; bad: return -EINVAL; @@ -124,8 +124,8 @@ static int crush_decode_straw_bucket(void **p, void *end, return -ENOMEM; ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); for (j = 0; j < b->h.size; j++) { - ceph_decode_32(p, b->item_weights[j]); - ceph_decode_32(p, b->straws[j]); + b->item_weights[j] = ceph_decode_32(p); + b->straws[j] = ceph_decode_32(p); } return 0; bad: @@ -148,15 +148,15 @@ static struct crush_map *crush_decode(void *pbyval, void *end) return ERR_PTR(-ENOMEM); ceph_decode_need(p, end, 4*sizeof(u32), bad); - ceph_decode_32(p, magic); + magic = ceph_decode_32(p); if (magic != CRUSH_MAGIC) { pr_err("crush_decode magic %x != current %x\n", (unsigned)magic, (unsigned)CRUSH_MAGIC); goto bad; } - ceph_decode_32(p, c->max_buckets); - ceph_decode_32(p, c->max_rules); - ceph_decode_32(p, c->max_devices); + c->max_buckets = ceph_decode_32(p); + c->max_rules = ceph_decode_32(p); + c->max_devices = ceph_decode_32(p); c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS); if (c->device_parents == NULL) @@ -208,11 +208,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) goto badmem; ceph_decode_need(p, end, 4*sizeof(u32), bad); - ceph_decode_32(p, b->id); - ceph_decode_16(p, b->type); - ceph_decode_16(p, b->alg); - ceph_decode_32(p, b->weight); - ceph_decode_32(p, b->size); + b->id = ceph_decode_32(p); + b->type = ceph_decode_16(p); + b->alg = ceph_decode_16(p); + b->weight = ceph_decode_32(p); + b->size = ceph_decode_32(p); dout("crush_decode bucket size %d off %x %p to %p\n", b->size, (int)(*p-start), *p, end); @@ -227,7 +227,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ceph_decode_need(p, end, b->size*sizeof(u32), bad); for (j = 0; j < b->size; j++) - ceph_decode_32(p, b->items[j]); + b->items[j] = ceph_decode_32(p); switch (b->alg) { case CRUSH_BUCKET_UNIFORM: @@ -290,9 +290,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); for (j = 0; j < r->len; j++) { - ceph_decode_32(p, r->steps[j].op); - ceph_decode_32(p, r->steps[j].arg1); - ceph_decode_32(p, r->steps[j].arg2); + r->steps[j].op = ceph_decode_32(p); + r->steps[j].arg1 = ceph_decode_32(p); + r->steps[j].arg2 = ceph_decode_32(p); } } @@ -411,11 +411,11 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); - ceph_decode_32(p, map->epoch); + map->epoch = ceph_decode_32(p); ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); - ceph_decode_32(p, map->num_pools); + map->num_pools = ceph_decode_32(p); map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool), GFP_NOFS); if (!map->pg_pool) { @@ -425,7 +425,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_32_safe(p, end, max, bad); while (max--) { ceph_decode_need(p, end, 4+sizeof(map->pg_pool->v), bad); - ceph_decode_32(p, i); + i = ceph_decode_32(p); if (i >= map->num_pools) goto bad; ceph_decode_copy(p, &map->pg_pool[i].v, @@ -438,7 +438,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_32_safe(p, end, map->flags, bad); - ceph_decode_32(p, max); + max = ceph_decode_32(p); /* (re)alloc osd arrays */ err = osdmap_set_max_osd(map, max); @@ -456,7 +456,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) *p += 4; /* skip length field (should match max) */ for (i = 0; i < map->max_osd; i++) - ceph_decode_32(p, map->osd_weight[i]); + map->osd_weight[i] = ceph_decode_32(p); *p += 4; /* skip length field (should match max) */ ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); @@ -469,8 +469,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) struct ceph_pg_mapping *pg; ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); - ceph_decode_64(p, pgid); - ceph_decode_32(p, n); + pgid = ceph_decode_64(p); + n = ceph_decode_32(p); ceph_decode_need(p, end, n * sizeof(u32), bad); pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); if (!pg) { @@ -480,7 +480,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) pg->pgid = pgid; pg->len = n; for (j = 0; j < n; j++) - ceph_decode_32(p, pg->osds[j]); + pg->osds[j] = ceph_decode_32(p); err = __insert_pg_mapping(pg, &map->pg_temp); if (err) @@ -537,10 +537,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), bad); ceph_decode_copy(p, &fsid, sizeof(fsid)); - ceph_decode_32(p, epoch); + epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); ceph_decode_copy(p, &modified, sizeof(modified)); - ceph_decode_32(p, new_flags); + new_flags = ceph_decode_32(p); /* full map? */ ceph_decode_32_safe(p, end, len, bad); @@ -568,7 +568,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ceph_decode_need(p, end, 5*sizeof(u32), bad); /* new max? */ - ceph_decode_32(p, max); + max = ceph_decode_32(p); if (max >= 0) { err = osdmap_set_max_osd(map, max); if (err < 0) @@ -641,8 +641,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, while (len--) { u32 osd, off; ceph_decode_need(p, end, sizeof(u32)*2, bad); - ceph_decode_32(p, osd); - ceph_decode_32(p, off); + osd = ceph_decode_32(p); + off = ceph_decode_32(p); pr_info("osd%d weight 0x%x %s\n", osd, off, off == CEPH_OSD_IN ? "(in)" : (off == CEPH_OSD_OUT ? "(out)" : "")); @@ -659,8 +659,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, u64 pgid; u32 pglen; ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); - ceph_decode_64(p, pgid); - ceph_decode_32(p, pglen); + pgid = ceph_decode_64(p); + pglen = ceph_decode_32(p); /* remove any? */ while (rbp && rb_entry(rbp, struct ceph_pg_mapping, @@ -683,7 +683,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, pg->pgid = pgid; pg->len = pglen; for (j = 0; j < len; j++) - ceph_decode_32(p, pg->osds[j]); + pg->osds[j] = ceph_decode_32(p); err = __insert_pg_mapping(pg, &map->pg_temp); if (err) goto bad; -- cgit v1.2.2 From ee7fdfaff7702bd209e3a013b2fc4643233f5465 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 19 Oct 2009 11:41:51 -0700 Subject: ceph: include preferred osd in placement seed Mix the preferred osd (if any) into the placement seed that is fed into the CRUSH object placement calculation. This prevents all the placement pgs from peering with the same osds. Rev the osd client protocol with this change. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 6f0aeff4185a..72d75a239ac2 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -791,6 +791,8 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, pgid.pg64 = 0; /* start with it zeroed out */ pgid.pg.ps = ceph_full_name_hash(oid, strlen(oid)); pgid.pg.preferred = preferred; + if (preferred >= 0) + pgid.pg.ps += preferred; pgid.pg.pool = le32_to_cpu(fl->fl_pg_pool); if (preferred >= 0) dout("calc_object_layout '%s' pgid %d.%xp%d (%llx)\n", oid, -- cgit v1.2.2 From 35e054a66e07f508aa7cfabc7db1757379093689 Mon Sep 17 00:00:00 2001 From: Noah Watkins Date: Wed, 28 Oct 2009 14:04:48 -0700 Subject: ceph: remove redundant use of le32_to_cpu Using stripe unit size calculated and saved on the stack to avoid a redundant call to le32_to_cpu. Signed-off-by: Noah Watkins Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 72d75a239ac2..60012e05bdfd 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -735,7 +735,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, osize, su); - su_per_object = osize / le32_to_cpu(layout->fl_stripe_unit); + su_per_object = osize / su; dout("osize %u / su %u = su_per_object %u\n", osize, su, su_per_object); -- cgit v1.2.2 From 5600f5ebd318f7af6f4b19a29f08d18bb85264e5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 28 Oct 2009 14:57:25 -0700 Subject: ceph: correct comment to match striping calculation The object extent offset is the file offset _modulo_ the stripe unit. The code was correct, the comment was wrong. Reported-by: Noah Watkins Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 60012e05bdfd..a9a4143234fa 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -752,7 +752,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, *bno = objsetno * sc + stripepos; dout("objset %u * sc %u = bno %u\n", objsetno, sc, (unsigned)*bno); - /* *oxoff = *off / layout->fl_stripe_unit; */ + /* *oxoff = *off % layout->fl_stripe_unit; */ t = off; *oxoff = do_div(t, su); *oxlen = min_t(u64, *plen, su - *oxoff); -- cgit v1.2.2 From 645a102581b3639836b17d147c35d574fd6e8267 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 28 Oct 2009 15:15:05 -0700 Subject: ceph: fix object striping calculation for non-default striping schemes We were incorrectly calculationing of object offset. If we have multiple stripe units per object, we need to shift to the start of the current su in addition to the offset within the su. Also rename bno to ono (object number) to avoid some variable naming confusion. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index a9a4143234fa..5a5520c5a2b3 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -723,7 +723,7 @@ bad: */ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 *plen, - u64 *bno, + u64 *ono, u64 *oxoff, u64 *oxlen) { u32 osize = le32_to_cpu(layout->fl_object_size); @@ -750,11 +750,14 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, stripepos = bl % sc; objsetno = stripeno / su_per_object; - *bno = objsetno * sc + stripepos; - dout("objset %u * sc %u = bno %u\n", objsetno, sc, (unsigned)*bno); - /* *oxoff = *off % layout->fl_stripe_unit; */ + *ono = objsetno * sc + stripepos; + dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono); + + /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ t = off; *oxoff = do_div(t, su); + *oxoff += (stripeno % su_per_object) * su; + *oxlen = min_t(u64, *plen, su - *oxoff); *plen = *oxlen; -- cgit v1.2.2 From ff1d1f7179363209b7f1493ea39b666f50d05cf4 Mon Sep 17 00:00:00 2001 From: Noah Watkins Date: Fri, 30 Oct 2009 12:57:30 -0700 Subject: ceph: fix intra strip unit length calculation Commit 645a102581b3639836b17d147c35d574fd6e8267 fixes calculation of object offset for layouts with multiple stripes per object. This updates the calculation of the length written to take into account multiple stripes per object. Signed-off-by: Noah Watkins Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 5a5520c5a2b3..d62e111b8a34 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -731,7 +731,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u32 sc = le32_to_cpu(layout->fl_stripe_count); u32 bl, stripeno, stripepos, objsetno; u32 su_per_object; - u64 t; + u64 t, su_offset; dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, osize, su); @@ -755,10 +755,15 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ t = off; - *oxoff = do_div(t, su); - *oxoff += (stripeno % su_per_object) * su; - - *oxlen = min_t(u64, *plen, su - *oxoff); + su_offset = do_div(t, su); + *oxoff = su_offset + (stripeno % su_per_object) * su; + + /* + * Calculate the length of the extent being written to the selected + * object. This is the minimum of the full length requested (plen) or + * the remainder of the current stripe being written to. + */ + *oxlen = min_t(u64, *plen, su - su_offset); *plen = *oxlen; dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); -- cgit v1.2.2 From 63f2d211954b790fea0a9caeae605c7956535af6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 3 Nov 2009 15:17:56 -0800 Subject: ceph: use fixed endian encoding for ceph_entity_addr We exchange struct ceph_entity_addr over the wire and store it on disk. The sockaddr_storage.ss_family field, however, is host endianness. So, fix ss_family endianness to big endian when sending/receiving over the wire. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index d62e111b8a34..cd7bb265d789 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -460,6 +460,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) *p += 4; /* skip length field (should match max) */ ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); + for (i = 0; i < map->max_osd; i++) + ceph_decode_addr(&map->osd_addr[i]); /* pg_temp */ ceph_decode_32_safe(p, end, len, bad); @@ -619,6 +621,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_entity_addr addr; ceph_decode_32_safe(p, end, osd, bad); ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); + ceph_decode_addr(&addr); pr_info("osd%d up\n", osd); BUG_ON(osd >= map->max_osd); map->osd_state[osd] |= CEPH_OSD_UP; -- cgit v1.2.2 From 51042122d4f85e0f8ee577a4230f172fcc57c456 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 4 Nov 2009 11:39:12 -0800 Subject: ceph: fix endian conversions for ceph_pg The endian conversions don't quite work with the old union ceph_pg. Just make it a regular struct, and make each field __le. This is simpler and it has the added bonus of actually working. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 100 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 39 deletions(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index cd7bb265d789..8b0cd1107507 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -366,19 +366,33 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) /* * Insert a new pg_temp mapping */ +static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) +{ + u64 a = *(u64 *)&l; + u64 b = *(u64 *)&r; + + if (a < b) + return -1; + if (a > b) + return 1; + return 0; +} + static int __insert_pg_mapping(struct ceph_pg_mapping *new, struct rb_root *root) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct ceph_pg_mapping *pg = NULL; + int c; while (*p) { parent = *p; pg = rb_entry(parent, struct ceph_pg_mapping, node); - if (new->pgid < pg->pgid) + c = pgid_cmp(new->pgid, pg->pgid); + if (c < 0) p = &(*p)->rb_left; - else if (new->pgid > pg->pgid) + else if (c > 0) p = &(*p)->rb_right; else return -EEXIST; @@ -467,11 +481,11 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_32_safe(p, end, len, bad); for (i = 0; i < len; i++) { int n, j; - u64 pgid; + struct ceph_pg pgid; struct ceph_pg_mapping *pg; ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); - pgid = ceph_decode_64(p); + ceph_decode_copy(p, &pgid, sizeof(pgid)); n = ceph_decode_32(p); ceph_decode_need(p, end, n * sizeof(u32), bad); pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); @@ -487,7 +501,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) err = __insert_pg_mapping(pg, &map->pg_temp); if (err) goto bad; - dout(" added pg_temp %llx len %d\n", pgid, len); + dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); } /* crush */ @@ -659,19 +673,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, while (len--) { struct ceph_pg_mapping *pg; int j; - u64 pgid; + struct ceph_pg pgid; u32 pglen; ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); - pgid = ceph_decode_64(p); + ceph_decode_copy(p, &pgid, sizeof(pgid)); pglen = ceph_decode_32(p); /* remove any? */ - while (rbp && rb_entry(rbp, struct ceph_pg_mapping, - node)->pgid <= pgid) { + while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, + node)->pgid, pgid) <= 0) { struct rb_node *cur = rbp; rbp = rb_next(rbp); dout(" removed pg_temp %llx\n", - rb_entry(cur, struct ceph_pg_mapping, node)->pgid); + *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, + node)->pgid); rb_erase(cur, &map->pg_temp); } @@ -690,14 +705,16 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, err = __insert_pg_mapping(pg, &map->pg_temp); if (err) goto bad; - dout(" added pg_temp %llx len %d\n", pgid, pglen); + dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, + pglen); } } while (rbp) { struct rb_node *cur = rbp; rbp = rb_next(rbp); dout(" removed pg_temp %llx\n", - rb_entry(cur, struct ceph_pg_mapping, node)->pgid); + *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, + node)->pgid); rb_erase(cur, &map->pg_temp); } @@ -782,16 +799,19 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, struct ceph_osdmap *osdmap) { unsigned num, num_mask; - union ceph_pg pgid; + struct ceph_pg pgid; s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred); int poolid = le32_to_cpu(fl->fl_pg_pool); struct ceph_pg_pool_info *pool; + unsigned ps; if (poolid >= osdmap->num_pools) return -EIO; - pool = &osdmap->pg_pool[poolid]; + pool = &osdmap->pg_pool[poolid]; + ps = ceph_full_name_hash(oid, strlen(oid)); if (preferred >= 0) { + ps += preferred; num = le32_to_cpu(pool->v.lpg_num); num_mask = pool->lpg_num_mask; } else { @@ -799,22 +819,17 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, num_mask = pool->pg_num_mask; } - pgid.pg64 = 0; /* start with it zeroed out */ - pgid.pg.ps = ceph_full_name_hash(oid, strlen(oid)); - pgid.pg.preferred = preferred; - if (preferred >= 0) - pgid.pg.ps += preferred; - pgid.pg.pool = le32_to_cpu(fl->fl_pg_pool); + pgid.ps = cpu_to_le16(ps); + pgid.preferred = cpu_to_le16(preferred); + pgid.pool = fl->fl_pg_pool; if (preferred >= 0) - dout("calc_object_layout '%s' pgid %d.%xp%d (%llx)\n", oid, - pgid.pg.pool, pgid.pg.ps, (int)preferred, pgid.pg64); + dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps, + (int)preferred); else - dout("calc_object_layout '%s' pgid %d.%x (%llx)\n", oid, - pgid.pg.pool, pgid.pg.ps, pgid.pg64); + dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); - ol->ol_pgid = cpu_to_le64(pgid.pg64); + ol->ol_pgid = pgid; ol->ol_stripe_unit = fl->fl_object_stripe_unit; - return 0; } @@ -822,21 +837,24 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, * Calculate raw osd vector for the given pgid. Return pointer to osd * array, or NULL on failure. */ -static int *calc_pg_raw(struct ceph_osdmap *osdmap, union ceph_pg pgid, +static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, int *osds, int *num) { struct rb_node *n = osdmap->pg_temp.rb_node; struct ceph_pg_mapping *pg; struct ceph_pg_pool_info *pool; int ruleno; - unsigned pps; /* placement ps */ + unsigned poolid, ps, pps; + int preferred; + int c; /* pg_temp? */ while (n) { pg = rb_entry(n, struct ceph_pg_mapping, node); - if (pgid.pg64 < pg->pgid) + c = pgid_cmp(pgid, pg->pgid); + if (c < 0) n = n->rb_left; - else if (pgid.pg64 > pg->pgid) + else if (c > 0) n = n->rb_right; else { *num = pg->len; @@ -845,36 +863,40 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, union ceph_pg pgid, } /* crush */ - if (pgid.pg.pool >= osdmap->num_pools) + poolid = le32_to_cpu(pgid.pool); + ps = le16_to_cpu(pgid.ps); + preferred = (s16)le16_to_cpu(pgid.preferred); + + if (poolid >= osdmap->num_pools) return NULL; - pool = &osdmap->pg_pool[pgid.pg.pool]; + pool = &osdmap->pg_pool[poolid]; ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, pool->v.type, pool->v.size); if (ruleno < 0) { pr_err("no crush rule pool %d type %d size %d\n", - pgid.pg.pool, pool->v.type, pool->v.size); + poolid, pool->v.type, pool->v.size); return NULL; } - if (pgid.pg.preferred >= 0) - pps = ceph_stable_mod(pgid.pg.ps, + if (preferred >= 0) + pps = ceph_stable_mod(ps, le32_to_cpu(pool->v.lpgp_num), pool->lpgp_num_mask); else - pps = ceph_stable_mod(pgid.pg.ps, + pps = ceph_stable_mod(ps, le32_to_cpu(pool->v.pgp_num), pool->pgp_num_mask); - pps += pgid.pg.pool; + pps += poolid; *num = crush_do_rule(osdmap->crush, ruleno, pps, osds, min_t(int, pool->v.size, *num), - pgid.pg.preferred, osdmap->osd_weight); + preferred, osdmap->osd_weight); return osds; } /* * Return primary osd for given pgid, or -1 if none. */ -int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, union ceph_pg pgid) +int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) { int rawosds[10], *osds; int i, num = ARRAY_SIZE(rawosds); -- cgit v1.2.2 From 1bdb70e59026838a79f77c440f8fe480a66e65e8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 6 Nov 2009 13:57:49 -0800 Subject: ceph: clean up 'osd%d down' console msg No ceph prefix. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 8b0cd1107507..a025555b70af 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -648,7 +648,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, u32 osd; ceph_decode_32_safe(p, end, osd, bad); (*p)++; /* clean flag */ - pr_info("ceph osd%d down\n", osd); + pr_info("osd%d down\n", osd); if (osd < map->max_osd) map->osd_state[osd] &= ~CEPH_OSD_UP; } -- cgit v1.2.2 From 1654dd0cf5ee1827322aca156af7d96d757201c7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 6 Nov 2009 21:55:25 -0800 Subject: ceph: make object hash a pg_pool property The object will be hashed to a placement seed (ps) based on the pg_pool's hash function. This allows new hashes to be introduced into an existing object store, or selection of a hash appropriate to the objects that will be stored in a particular pool. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index a025555b70af..68478270ad70 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -809,7 +809,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, return -EIO; pool = &osdmap->pg_pool[poolid]; - ps = ceph_full_name_hash(oid, strlen(oid)); + ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); if (preferred >= 0) { ps += preferred; num = le32_to_cpu(pool->v.lpg_num); -- cgit v1.2.2 From fb690390e305ea51e1883b105c7d3c52d7100ba5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 7 Nov 2009 20:18:22 -0800 Subject: ceph: make CRUSH hash function a bucket property Make the integer hash function a property of the bucket it is used on. This allows us to gracefully add support for new hash functions without starting from scatch. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 68478270ad70..8c994c714781 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -210,7 +210,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ceph_decode_need(p, end, 4*sizeof(u32), bad); b->id = ceph_decode_32(p); b->type = ceph_decode_16(p); - b->alg = ceph_decode_16(p); + b->alg = ceph_decode_8(p); + b->hash = ceph_decode_8(p); b->weight = ceph_decode_32(p); b->size = ceph_decode_32(p); -- cgit v1.2.2 From 767ea5c33a360ce88da24e296e802dace5821799 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 9 Dec 2009 12:34:01 -0800 Subject: ceph: do not feed bad device ids to crush Do not feed bad (large) device ids to CRUSH. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 8c994c714781..be5318aa7714 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -868,6 +868,11 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, ps = le16_to_cpu(pgid.ps); preferred = (s16)le16_to_cpu(pgid.preferred); + /* don't forcefeed bad device ids to crush */ + if (preferred >= osdmap->max_osd || + preferred >= osdmap->crush->max_devices) + preferred = -1; + if (poolid >= osdmap->num_pools) return NULL; pool = &osdmap->pg_pool[poolid]; -- cgit v1.2.2 From 9ec7cab14e6de732d4e7c355fe67c5810c32c758 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 14 Dec 2009 15:13:47 -0800 Subject: ceph: hex dump corrupt server data to KERN_DEBUG Also, print fsid using standard format, NOT hex dump. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index be5318aa7714..8c8ffe5ef7d4 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -726,6 +726,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bad: pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", epoch, (int)(*p - start), *p, start, end); + print_hex_dump(KERN_DEBUG, "osdmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); if (newcrush) crush_destroy(newcrush); return ERR_PTR(err); -- cgit v1.2.2 From 30dc6381bbac213987be6fe0b0fb89868ff1f2c0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 21 Dec 2009 14:49:37 -0800 Subject: ceph: fix error paths for corrupt osdmap messages Both osdmap_decode() and osdmap_apply_incremental() should never return NULL. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 8c8ffe5ef7d4..a9416308de6f 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -200,6 +200,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) size = sizeof(struct crush_bucket_straw); break; default: + err = -EINVAL; goto bad; } BUG_ON(size == 0); @@ -278,6 +279,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) /* len */ ceph_decode_32_safe(p, end, yes, bad); #if BITS_PER_LONG == 32 + err = -EINVAL; if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) goto bad; #endif @@ -489,11 +491,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_copy(p, &pgid, sizeof(pgid)); n = ceph_decode_32(p); ceph_decode_need(p, end, n * sizeof(u32), bad); + err = -ENOMEM; pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); - if (!pg) { - err = -ENOMEM; + if (!pg) goto bad; - } pg->pgid = pgid; pg->len = n; for (j = 0; j < n; j++) @@ -564,8 +565,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - newmap = osdmap_decode(p, min(*p+len, end)); - return newmap; /* error or not */ + return osdmap_decode(p, min(*p+len, end)); } /* new crush? */ @@ -809,6 +809,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, struct ceph_pg_pool_info *pool; unsigned ps; + BUG_ON(!osdmap); if (poolid >= osdmap->num_pools) return -EIO; -- cgit v1.2.2 From 7067f797b8409f1e10ec95ac2c1e17a200173d13 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 21 Dec 2009 16:02:37 -0800 Subject: ceph: fix incremental osdmap pg_temp decoding bug An incremental pg_temp wasn't being decoded properly (wrong bound on for loop). Also remove unused local variable, while we're at it. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index a9416308de6f..0dbd606e21c4 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -538,7 +538,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map, struct ceph_messenger *msgr) { - struct ceph_osdmap *newmap = map; struct crush_map *newcrush = NULL; struct ceph_fsid fsid; u32 epoch = 0; @@ -701,7 +700,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } pg->pgid = pgid; pg->len = pglen; - for (j = 0; j < len; j++) + for (j = 0; j < pglen; j++) pg->osds[j] = ceph_decode_32(p); err = __insert_pg_mapping(pg, &map->pg_temp); if (err) -- cgit v1.2.2 From 361be8601d78e488b5249032cc4e779b81d7928e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Jan 2010 16:03:02 -0800 Subject: ceph: precede encoded ceph_pg_pool struct with version Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 0dbd606e21c4..a143c51c2cfb 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -414,6 +414,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) struct ceph_osdmap *map; u16 version; u32 len, max, i; + u8 ev; int err = -EINVAL; void *start = *p; @@ -441,10 +442,11 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) } ceph_decode_32_safe(p, end, max, bad); while (max--) { - ceph_decode_need(p, end, 4+sizeof(map->pg_pool->v), bad); + ceph_decode_need(p, end, 4+1+sizeof(map->pg_pool->v), bad); i = ceph_decode_32(p); if (i >= map->num_pools) goto bad; + ev = ceph_decode_8(p); /* encoding version */ ceph_decode_copy(p, &map->pg_pool[i].v, sizeof(map->pg_pool->v)); calc_pg_masks(&map->pg_pool[i]); @@ -603,6 +605,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* new_pool */ ceph_decode_32_safe(p, end, len, bad); while (len--) { + __u8 ev; + ceph_decode_32_safe(p, end, pool, bad); if (pool >= map->num_pools) { void *pg_pool = kcalloc(pool + 1, @@ -618,6 +622,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, map->pg_pool = pg_pool; map->num_pools = pool+1; } + ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad); + ev = ceph_decode_8(p); /* encoding version */ ceph_decode_copy(p, &map->pg_pool[pool].v, sizeof(map->pg_pool->v)); calc_pg_masks(&map->pg_pool[pool]); -- cgit v1.2.2 From 02f90c61096ec3ad691e808a4aa7ca5a06e550ec Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 4 Feb 2010 16:18:10 -0800 Subject: ceph: add uid field to ceph_pg_pool Also verify encoding version as we go. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index a143c51c2cfb..a6afe3836f7e 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -426,6 +426,11 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) map->pg_temp = RB_ROOT; ceph_decode_16_safe(p, end, version, bad); + if (version > CEPH_OSDMAP_VERSION) { + pr_warning("got unknown v %d > %d of osdmap\n", version, + CEPH_OSDMAP_VERSION); + goto bad; + } ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); @@ -447,6 +452,11 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) if (i >= map->num_pools) goto bad; ev = ceph_decode_8(p); /* encoding version */ + if (ev > CEPH_PG_POOL_VERSION) { + pr_warning("got unknown v %d > %d of ceph_pg_pool\n", + ev, CEPH_PG_POOL_VERSION); + goto bad; + } ceph_decode_copy(p, &map->pg_pool[i].v, sizeof(map->pg_pool->v)); calc_pg_masks(&map->pg_pool[i]); @@ -552,6 +562,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct rb_node *rbp; ceph_decode_16_safe(p, end, version, bad); + if (version > CEPH_OSDMAP_INC_VERSION) { + pr_warning("got unknown v %d > %d of inc osdmap\n", version, + CEPH_OSDMAP_INC_VERSION); + goto bad; + } ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), bad); @@ -624,6 +639,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad); ev = ceph_decode_8(p); /* encoding version */ + if (ev > CEPH_PG_POOL_VERSION) { + pr_warning("got unknown v %d > %d of ceph_pg_pool\n", + ev, CEPH_PG_POOL_VERSION); + goto bad; + } ceph_decode_copy(p, &map->pg_pool[pool].v, sizeof(map->pg_pool->v)); calc_pg_masks(&map->pg_pool[pool]); -- cgit v1.2.2 From 9794b146fa7b93f8ab74fb62d67fdefad760769f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Feb 2010 15:53:32 -0800 Subject: ceph: fix memory leak when destroying osdmap with pg_temp mappings Also move _lookup_pg_mapping into a helper. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index a6afe3836f7e..443fdcdb19c4 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -321,8 +321,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) dout("osdmap_destroy %p\n", map); if (map->crush) crush_destroy(map->crush); - while (!RB_EMPTY_ROOT(&map->pg_temp)) - rb_erase(rb_first(&map->pg_temp), &map->pg_temp); + while (!RB_EMPTY_ROOT(&map->pg_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_temp); + kfree(pg); + } kfree(map->osd_state); kfree(map->osd_weight); kfree(map->pg_pool); @@ -367,7 +372,8 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) } /* - * Insert a new pg_temp mapping + * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid + * to a set of osds) */ static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) { @@ -406,6 +412,26 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new, return 0; } +static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, + struct ceph_pg pgid) +{ + struct rb_node *n = root->rb_node; + struct ceph_pg_mapping *pg; + int c; + + while (n) { + pg = rb_entry(n, struct ceph_pg_mapping, node); + c = pgid_cmp(pgid, pg->pgid); + if (c < 0) + n = n->rb_left; + else if (c > 0) + n = n->rb_right; + else + return pg; + } + return NULL; +} + /* * decode a full map. */ @@ -870,26 +896,17 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, int *osds, int *num) { - struct rb_node *n = osdmap->pg_temp.rb_node; struct ceph_pg_mapping *pg; struct ceph_pg_pool_info *pool; int ruleno; unsigned poolid, ps, pps; int preferred; - int c; /* pg_temp? */ - while (n) { - pg = rb_entry(n, struct ceph_pg_mapping, node); - c = pgid_cmp(pgid, pg->pgid); - if (c < 0) - n = n->rb_left; - else if (c > 0) - n = n->rb_right; - else { - *num = pg->len; - return pg->osds; - } + pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); + if (pg) { + *num = pg->len; + return pg->osds; } /* crush */ -- cgit v1.2.2 From 4fc51be8fa7043ff9a1e34fef0e99214373332ac Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Feb 2010 15:55:03 -0800 Subject: ceph: use rbtree for pg pools; decode new osdmap format Since we can now create and destroy pg pools, the pool ids will be sparse, and an array no longer makes sense for looking up by pool id. Use an rbtree instead. The OSDMap encoding also no longer has a max pool count (previously used to allocate the array). There is a new pool_max, that is the largest pool id we've ever used, although we don't actually need it in the client. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 136 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 94 insertions(+), 42 deletions(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 443fdcdb19c4..34b5696c84fd 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -328,9 +328,15 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) rb_erase(&pg->node, &map->pg_temp); kfree(pg); } + while (!RB_EMPTY_ROOT(&map->pg_pools)) { + struct ceph_pg_pool_info *pi = + rb_entry(rb_first(&map->pg_pools), + struct ceph_pg_pool_info, node); + rb_erase(&pi->node, &map->pg_pools); + kfree(pi); + } kfree(map->osd_state); kfree(map->osd_weight); - kfree(map->pg_pool); kfree(map->osd_addr); kfree(map); } @@ -432,6 +438,48 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, return NULL; } +/* + * rbtree of pg pool info + */ +static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_pg_pool_info *pi = NULL; + + while (*p) { + parent = *p; + pi = rb_entry(parent, struct ceph_pg_pool_info, node); + if (new->id < pi->id) + p = &(*p)->rb_left; + else if (new->id > pi->id) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); + return 0; +} + +static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) +{ + struct ceph_pg_pool_info *pi; + struct rb_node *n = root->rb_node; + + while (n) { + pi = rb_entry(n, struct ceph_pg_pool_info, node); + if (id < pi->id) + n = n->rb_left; + else if (id > pi->id) + n = n->rb_right; + else + return pi; + } + return NULL; +} + /* * decode a full map. */ @@ -443,6 +491,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) u8 ev; int err = -EINVAL; void *start = *p; + struct ceph_pg_pool_info *pi; dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); @@ -464,32 +513,27 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); - map->num_pools = ceph_decode_32(p); - map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool), - GFP_NOFS); - if (!map->pg_pool) { - err = -ENOMEM; - goto bad; - } ceph_decode_32_safe(p, end, max, bad); while (max--) { - ceph_decode_need(p, end, 4+1+sizeof(map->pg_pool->v), bad); - i = ceph_decode_32(p); - if (i >= map->num_pools) + ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); + pi = kmalloc(sizeof(*pi), GFP_NOFS); + if (!pi) goto bad; + pi->id = ceph_decode_32(p); ev = ceph_decode_8(p); /* encoding version */ if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); goto bad; } - ceph_decode_copy(p, &map->pg_pool[i].v, - sizeof(map->pg_pool->v)); - calc_pg_masks(&map->pg_pool[i]); - p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64); - p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals) + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); + __insert_pg_pool(&map->pg_pools, pi); + calc_pg_masks(pi); + p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); + p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; } + ceph_decode_32_safe(p, end, map->pool_max, bad); ceph_decode_32_safe(p, end, map->flags, bad); @@ -581,7 +625,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, u32 epoch = 0; struct ceph_timespec modified; u32 len, pool; - __s32 new_flags, max; + __s32 new_pool_max, new_flags, max; void *start = *p; int err = -EINVAL; u16 version; @@ -600,6 +644,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); ceph_decode_copy(p, &modified, sizeof(modified)); + new_pool_max = ceph_decode_32(p); new_flags = ceph_decode_32(p); /* full map? */ @@ -623,6 +668,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* new flags? */ if (new_flags >= 0) map->flags = new_flags; + if (new_pool_max >= 0) + map->pool_max = new_pool_max; ceph_decode_need(p, end, 5*sizeof(u32), bad); @@ -647,37 +694,42 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ceph_decode_32_safe(p, end, len, bad); while (len--) { __u8 ev; + struct ceph_pg_pool_info *pi; ceph_decode_32_safe(p, end, pool, bad); - if (pool >= map->num_pools) { - void *pg_pool = kcalloc(pool + 1, - sizeof(*map->pg_pool), - GFP_NOFS); - if (!pg_pool) { - err = -ENOMEM; - goto bad; - } - memcpy(pg_pool, map->pg_pool, - map->num_pools * sizeof(*map->pg_pool)); - kfree(map->pg_pool); - map->pg_pool = pg_pool; - map->num_pools = pool+1; - } - ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad); + ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); ev = ceph_decode_8(p); /* encoding version */ if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); goto bad; } - ceph_decode_copy(p, &map->pg_pool[pool].v, - sizeof(map->pg_pool->v)); - calc_pg_masks(&map->pg_pool[pool]); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (!pi) { + pi = kmalloc(sizeof(*pi), GFP_NOFS); + if (!pi) { + err = -ENOMEM; + goto bad; + } + pi->id = pool; + __insert_pg_pool(&map->pg_pools, pi); + } + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); + calc_pg_masks(pi); } - /* old_pool (ignore) */ + /* old_pool */ ceph_decode_32_safe(p, end, len, bad); - *p += len * sizeof(u32); + while (len--) { + struct ceph_pg_pool_info *pi; + + ceph_decode_32_safe(p, end, pool, bad); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (pi) { + rb_erase(&pi->node, &map->pg_pools); + kfree(pi); + } + } /* new_up */ err = -EINVAL; @@ -861,10 +913,10 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, unsigned ps; BUG_ON(!osdmap); - if (poolid >= osdmap->num_pools) - return -EIO; - pool = &osdmap->pg_pool[poolid]; + pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + if (!pool) + return -EIO; ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); if (preferred >= 0) { ps += preferred; @@ -919,9 +971,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, preferred >= osdmap->crush->max_devices) preferred = -1; - if (poolid >= osdmap->num_pools) + pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + if (!pool) return NULL; - pool = &osdmap->pg_pool[poolid]; ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, pool->v.type, pool->v.size); if (ruleno < 0) { -- cgit v1.2.2 From e53a8fd773065628b24605b289a9a40ee4a35d83 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 1 Mar 2010 14:50:05 -0800 Subject: ceph: fix osdmap decoding when pools include (removed) snaps Add missing pointer dereference (p is a void **). Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ceph/osdmap.c') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 34b5696c84fd..b83f2692b835 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -529,8 +529,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_copy(p, &pi->v, sizeof(pi->v)); __insert_pg_pool(&map->pg_pools, pi); calc_pg_masks(pi); - p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); - p += le32_to_cpu(pi->v.num_removed_snap_intervals) + *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); + *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; } ceph_decode_32_safe(p, end, map->pool_max, bad); -- cgit v1.2.2