about summary refs log tree commit diff stats
path: root/drivers/block
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-10-07 17:38:18 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-10-07 17:38:18 -0400
commit 7035cdf36d5c4d913f68ff97e1c2e5603500d946 (patch)
tree eee5680c16771cb23bc6d3a47bc7b6f350171b22 /drivers/block
parent 6432f2128414edbea5fd4f6c4fa4c28d0e1c6151 (diff)
parent 6285bc231277419255f3498d3eb5ddc9f8e7fe79 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull ceph updates from Sage Weil:
 "The bulk of this pull is a series from Alex that refactors and cleans
  up the RBD code to lay the groundwork for supporting the new image
  format and evolving feature set. There are also some cleanups in
  libceph, and for ceph there's fixed validation of file striping
  layouts and a bugfix in the code handling a shrinking MDS cluster."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (71 commits)
  ceph: avoid 32-bit page index overflow
  ceph: return EIO on invalid layout on GET_DATALOC ioctl
  rbd: BUG on invalid layout
  ceph: propagate layout error on osd request creation
  libceph: check for invalid mapping
  ceph: convert to use le32_add_cpu()
  ceph: Fix oops when handling mdsmap that decreases max_mds
  rbd: update remaining header fields for v2
  rbd: get snapshot name for a v2 image
  rbd: get the snapshot context for a v2 image
  rbd: get image features for a v2 image
  rbd: get the object prefix for a v2 rbd image
  rbd: add code to get the size of a v2 rbd image
  rbd: lay out header probe infrastructure
  rbd: encapsulate code that gets snapshot info
  rbd: add an rbd features field
  rbd: don't use index in __rbd_add_snap_dev()
  rbd: kill create_snap sysfs entry
  rbd: define rbd_dev_image_id()
  rbd: define some new format constants
  ...
Diffstat (limited to 'drivers/block')
-rw-r--r-- drivers/block/rbd.c 1784
-rw-r--r-- drivers/block/rbd_types.h 27
2 files changed, 1187 insertions, 624 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 54a55f03115d..bb3d9be3b1b4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -41,6 +41,8 @@
41 41
42#include "rbd_types.h" 42#include "rbd_types.h"
43 43
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
44/* 46/*
45 * The basic unit of block I/O is a sector. It is interpreted in a 47 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is 48 * number of contexts in Linux (blk, bio, genhd), but the default is
@@ -50,16 +52,24 @@
50#define SECTOR_SHIFT 9 52#define SECTOR_SHIFT 9
51#define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
52 54
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
53#define RBD_DRV_NAME "rbd" 59#define RBD_DRV_NAME "rbd"
54#define RBD_DRV_NAME_LONG "rbd (rados block device)" 60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
55 61
56#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
57 63
58#define RBD_MAX_SNAP_NAME_LEN 32 64#define RBD_MAX_SNAP_NAME_LEN 32
65#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
59#define RBD_MAX_OPT_LEN 1024 66#define RBD_MAX_OPT_LEN 1024
60 67
61#define RBD_SNAP_HEAD_NAME "-" 68#define RBD_SNAP_HEAD_NAME "-"
62 69
70#define RBD_IMAGE_ID_LEN_MAX 64
71#define RBD_OBJ_PREFIX_LEN_MAX 64
72
63/* 73/*
64 * An RBD device name will be "rbd#", where the "rbd" comes from 74 * An RBD device name will be "rbd#", where the "rbd" comes from
65 * RBD_DRV_NAME above, and # is a unique integer identifier. 75 * RBD_DRV_NAME above, and # is a unique integer identifier.
@@ -69,21 +79,22 @@
69#define DEV_NAME_LEN 32 79#define DEV_NAME_LEN 32
70#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 80#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
71 81
72#define RBD_NOTIFY_TIMEOUT_DEFAULT 10 82#define RBD_READ_ONLY_DEFAULT false
73 83
74/* 84/*
75 * block device image metadata (in-memory version) 85 * block device image metadata (in-memory version)
76 */ 86 */
77struct rbd_image_header { 87struct rbd_image_header {
78 u64 image_size; 88 /* These four fields never change for a given rbd image */
79 char *object_prefix; 89 char *object_prefix;
90 u64 features;
80 __u8 obj_order; 91 __u8 obj_order;
81 __u8 crypt_type; 92 __u8 crypt_type;
82 __u8 comp_type; 93 __u8 comp_type;
83 struct ceph_snap_context *snapc;
84 size_t snap_names_len;
85 u32 total_snaps;
86 94
95 /* The remaining fields need to be updated occasionally */
96 u64 image_size;
97 struct ceph_snap_context *snapc;
87 char *snap_names; 98 char *snap_names;
88 u64 *snap_sizes; 99 u64 *snap_sizes;
89 100
@@ -91,7 +102,7 @@ struct rbd_image_header {
91}; 102};
92 103
93struct rbd_options { 104struct rbd_options {
94 int notify_timeout; 105 bool read_only;
95}; 106};
96 107
97/* 108/*
@@ -99,7 +110,6 @@ struct rbd_options {
99 */ 110 */
100struct rbd_client { 111struct rbd_client {
101 struct ceph_client *client; 112 struct ceph_client *client;
102 struct rbd_options *rbd_opts;
103 struct kref kref; 113 struct kref kref;
104 struct list_head node; 114 struct list_head node;
105}; 115};
@@ -141,6 +151,16 @@ struct rbd_snap {
141 u64 size; 151 u64 size;
142 struct list_head node; 152 struct list_head node;
143 u64 id; 153 u64 id;
154 u64 features;
155};
156
157struct rbd_mapping {
158 char *snap_name;
159 u64 snap_id;
160 u64 size;
161 u64 features;
162 bool snap_exists;
163 bool read_only;
144}; 164};
145 165
146/* 166/*
@@ -151,8 +171,9 @@ struct rbd_device {
151 171
152 int major; /* blkdev assigned major */ 172 int major; /* blkdev assigned major */
153 struct gendisk *disk; /* blkdev's gendisk and rq */ 173 struct gendisk *disk; /* blkdev's gendisk and rq */
154 struct request_queue *q;
155 174
175 u32 image_format; /* Either 1 or 2 */
176 struct rbd_options rbd_opts;
156 struct rbd_client *rbd_client; 177 struct rbd_client *rbd_client;
157 178
158 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 179 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
@@ -160,6 +181,8 @@ struct rbd_device {
160 spinlock_t lock; /* queue lock */ 181 spinlock_t lock; /* queue lock */
161 182
162 struct rbd_image_header header; 183 struct rbd_image_header header;
184 char *image_id;
185 size_t image_id_len;
163 char *image_name; 186 char *image_name;
164 size_t image_name_len; 187 size_t image_name_len;
165 char *header_name; 188 char *header_name;
@@ -171,13 +194,8 @@ struct rbd_device {
171 194
172 /* protects updating the header */ 195 /* protects updating the header */
173 struct rw_semaphore header_rwsem; 196 struct rw_semaphore header_rwsem;
174 /* name of the snapshot this device reads from */ 197
175 char *snap_name; 198 struct rbd_mapping mapping;
176 /* id of the snapshot this device reads from */
177 u64 snap_id; /* current snapshot id */
178 /* whether the snap_id this device reads from still exists */
179 bool snap_exists;
180 int read_only;
181 199
182 struct list_head node; 200 struct list_head node;
183 201
@@ -196,12 +214,10 @@ static DEFINE_SPINLOCK(rbd_dev_list_lock);
196static LIST_HEAD(rbd_client_list); /* clients */ 214static LIST_HEAD(rbd_client_list); /* clients */
197static DEFINE_SPINLOCK(rbd_client_list_lock); 215static DEFINE_SPINLOCK(rbd_client_list_lock);
198 216
199static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); 217static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
218static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
219
200static void rbd_dev_release(struct device *dev); 220static void rbd_dev_release(struct device *dev);
201static ssize_t rbd_snap_add(struct device *dev,
202 struct device_attribute *attr,
203 const char *buf,
204 size_t count);
205static void __rbd_remove_snap_dev(struct rbd_snap *snap); 221static void __rbd_remove_snap_dev(struct rbd_snap *snap);
206 222
207static ssize_t rbd_add(struct bus_type *bus, const char *buf, 223static ssize_t rbd_add(struct bus_type *bus, const char *buf,
@@ -229,6 +245,18 @@ static struct device rbd_root_dev = {
229 .release = rbd_root_dev_release, 245 .release = rbd_root_dev_release,
230}; 246};
231 247
248#ifdef RBD_DEBUG
249#define rbd_assert(expr) \
250 if (unlikely(!(expr))) { \
251 printk(KERN_ERR "\nAssertion failure in %s() " \
252 "at line %d:\n\n" \
253 "\trbd_assert(%s);\n\n", \
254 __func__, __LINE__, #expr); \
255 BUG(); \
256 }
257#else /* !RBD_DEBUG */
258# define rbd_assert(expr) ((void) 0)
259#endif /* !RBD_DEBUG */
232 260
233static struct device *rbd_get_dev(struct rbd_device *rbd_dev) 261static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
234{ 262{
@@ -246,11 +274,11 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
246{ 274{
247 struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 275 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
248 276
249 if ((mode & FMODE_WRITE) && rbd_dev->read_only) 277 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
250 return -EROFS; 278 return -EROFS;
251 279
252 rbd_get_dev(rbd_dev); 280 rbd_get_dev(rbd_dev);
253 set_device_ro(bdev, rbd_dev->read_only); 281 set_device_ro(bdev, rbd_dev->mapping.read_only);
254 282
255 return 0; 283 return 0;
256} 284}
@@ -274,8 +302,7 @@ static const struct block_device_operations rbd_bd_ops = {
274 * Initialize an rbd client instance. 302 * Initialize an rbd client instance.
275 * We own *ceph_opts. 303 * We own *ceph_opts.
276 */ 304 */
277static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts, 305static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
278 struct rbd_options *rbd_opts)
279{ 306{
280 struct rbd_client *rbdc; 307 struct rbd_client *rbdc;
281 int ret = -ENOMEM; 308 int ret = -ENOMEM;
@@ -299,8 +326,6 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
299 if (ret < 0) 326 if (ret < 0)
300 goto out_err; 327 goto out_err;
301 328
302 rbdc->rbd_opts = rbd_opts;
303
304 spin_lock(&rbd_client_list_lock); 329 spin_lock(&rbd_client_list_lock);
305 list_add_tail(&rbdc->node, &rbd_client_list); 330 list_add_tail(&rbdc->node, &rbd_client_list);
306 spin_unlock(&rbd_client_list_lock); 331 spin_unlock(&rbd_client_list_lock);
@@ -322,36 +347,52 @@ out_opt:
322} 347}
323 348
324/* 349/*
325 * Find a ceph client with specific addr and configuration. 350 * Find a ceph client with specific addr and configuration. If
351 * found, bump its reference count.
326 */ 352 */
327static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts) 353static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
328{ 354{
329 struct rbd_client *client_node; 355 struct rbd_client *client_node;
356 bool found = false;
330 357
331 if (ceph_opts->flags & CEPH_OPT_NOSHARE) 358 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
332 return NULL; 359 return NULL;
333 360
334 list_for_each_entry(client_node, &rbd_client_list, node) 361 spin_lock(&rbd_client_list_lock);
335 if (!ceph_compare_options(ceph_opts, client_node->client)) 362 list_for_each_entry(client_node, &rbd_client_list, node) {
336 return client_node; 363 if (!ceph_compare_options(ceph_opts, client_node->client)) {
337 return NULL; 364 kref_get(&client_node->kref);
365 found = true;
366 break;
367 }
368 }
369 spin_unlock(&rbd_client_list_lock);
370
371 return found ? client_node : NULL;
338} 372}
339 373
340/* 374/*
341 * mount options 375 * mount options
342 */ 376 */
343enum { 377enum {
344 Opt_notify_timeout,
345 Opt_last_int, 378 Opt_last_int,
346 /* int args above */ 379 /* int args above */
347 Opt_last_string, 380 Opt_last_string,
348 /* string args above */ 381 /* string args above */
382 Opt_read_only,
383 Opt_read_write,
384 /* Boolean args above */
385 Opt_last_bool,
349}; 386};
350 387
351static match_table_t rbd_opts_tokens = { 388static match_table_t rbd_opts_tokens = {
352 {Opt_notify_timeout, "notify_timeout=%d"},
353 /* int args above */ 389 /* int args above */
354 /* string args above */ 390 /* string args above */
391 {Opt_read_only, "mapping.read_only"},
392 {Opt_read_only, "ro"}, /* Alternate spelling */
393 {Opt_read_write, "read_write"},
394 {Opt_read_write, "rw"}, /* Alternate spelling */
395 /* Boolean args above */
355 {-1, NULL} 396 {-1, NULL}
356}; 397};
357 398
@@ -376,16 +417,22 @@ static int parse_rbd_opts_token(char *c, void *private)
376 } else if (token > Opt_last_int && token < Opt_last_string) { 417 } else if (token > Opt_last_int && token < Opt_last_string) {
377 dout("got string token %d val %s\n", token, 418 dout("got string token %d val %s\n", token,
378 argstr[0].from); 419 argstr[0].from);
420 } else if (token > Opt_last_string && token < Opt_last_bool) {
421 dout("got Boolean token %d\n", token);
379 } else { 422 } else {
380 dout("got token %d\n", token); 423 dout("got token %d\n", token);
381 } 424 }
382 425
383 switch (token) { 426 switch (token) {
384 case Opt_notify_timeout: 427 case Opt_read_only:
385 rbd_opts->notify_timeout = intval; 428 rbd_opts->read_only = true;
429 break;
430 case Opt_read_write:
431 rbd_opts->read_only = false;
386 break; 432 break;
387 default: 433 default:
388 BUG_ON(token); 434 rbd_assert(false);
435 break;
389 } 436 }
390 return 0; 437 return 0;
391} 438}
@@ -394,48 +441,33 @@ static int parse_rbd_opts_token(char *c, void *private)
394 * Get a ceph client with specific addr and configuration, if one does 441 * Get a ceph client with specific addr and configuration, if one does
395 * not exist create it. 442 * not exist create it.
396 */ 443 */
397static struct rbd_client *rbd_get_client(const char *mon_addr, 444static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
398 size_t mon_addr_len, 445 size_t mon_addr_len, char *options)
399 char *options)
400{ 446{
401 struct rbd_client *rbdc; 447 struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
402 struct ceph_options *ceph_opts; 448 struct ceph_options *ceph_opts;
403 struct rbd_options *rbd_opts; 449 struct rbd_client *rbdc;
404
405 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
406 if (!rbd_opts)
407 return ERR_PTR(-ENOMEM);
408 450
409 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; 451 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
410 452
411 ceph_opts = ceph_parse_options(options, mon_addr, 453 ceph_opts = ceph_parse_options(options, mon_addr,
412 mon_addr + mon_addr_len, 454 mon_addr + mon_addr_len,
413 parse_rbd_opts_token, rbd_opts); 455 parse_rbd_opts_token, rbd_opts);
414 if (IS_ERR(ceph_opts)) { 456 if (IS_ERR(ceph_opts))
415 kfree(rbd_opts); 457 return PTR_ERR(ceph_opts);
416 return ERR_CAST(ceph_opts);
417 }
418 458
419 spin_lock(&rbd_client_list_lock); 459 rbdc = rbd_client_find(ceph_opts);
420 rbdc = __rbd_client_find(ceph_opts);
421 if (rbdc) { 460 if (rbdc) {
422 /* using an existing client */ 461 /* using an existing client */
423 kref_get(&rbdc->kref);
424 spin_unlock(&rbd_client_list_lock);
425
426 ceph_destroy_options(ceph_opts); 462 ceph_destroy_options(ceph_opts);
427 kfree(rbd_opts); 463 } else {
428 464 rbdc = rbd_client_create(ceph_opts);
429 return rbdc; 465 if (IS_ERR(rbdc))
466 return PTR_ERR(rbdc);
430 } 467 }
431 spin_unlock(&rbd_client_list_lock); 468 rbd_dev->rbd_client = rbdc;
432
433 rbdc = rbd_client_create(ceph_opts, rbd_opts);
434 469
435 if (IS_ERR(rbdc)) 470 return 0;
436 kfree(rbd_opts);
437
438 return rbdc;
439} 471}
440 472
441/* 473/*
@@ -453,7 +485,6 @@ static void rbd_client_release(struct kref *kref)
453 spin_unlock(&rbd_client_list_lock); 485 spin_unlock(&rbd_client_list_lock);
454 486
455 ceph_destroy_client(rbdc->client); 487 ceph_destroy_client(rbdc->client);
456 kfree(rbdc->rbd_opts);
457 kfree(rbdc); 488 kfree(rbdc);
458} 489}
459 490
@@ -479,10 +510,38 @@ static void rbd_coll_release(struct kref *kref)
479 kfree(coll); 510 kfree(coll);
480} 511}
481 512
513static bool rbd_image_format_valid(u32 image_format)
514{
515 return image_format == 1 || image_format == 2;
516}
517
482static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 518static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
483{ 519{
484 return !memcmp(&ondisk->text, 520 size_t size;
485 RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)); 521 u32 snap_count;
522
523 /* The header has to start with the magic rbd header text */
524 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
525 return false;
526
527 /*
528 * The size of a snapshot header has to fit in a size_t, and
529 * that limits the number of snapshots.
530 */
531 snap_count = le32_to_cpu(ondisk->snap_count);
532 size = SIZE_MAX - sizeof (struct ceph_snap_context);
533 if (snap_count > size / sizeof (__le64))
534 return false;
535
536 /*
537 * Not only that, but the size of the entire the snapshot
538 * header must also be representable in a size_t.
539 */
540 size -= snap_count * sizeof (__le64);
541 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
542 return false;
543
544 return true;
486} 545}
487 546
488/* 547/*
@@ -490,179 +549,203 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
490 * header. 549 * header.
491 */ 550 */
492static int rbd_header_from_disk(struct rbd_image_header *header, 551static int rbd_header_from_disk(struct rbd_image_header *header,
493 struct rbd_image_header_ondisk *ondisk, 552 struct rbd_image_header_ondisk *ondisk)
494 u32 allocated_snaps)
495{ 553{
496 u32 snap_count; 554 u32 snap_count;
555 size_t len;
556 size_t size;
557 u32 i;
497 558
498 if (!rbd_dev_ondisk_valid(ondisk)) 559 memset(header, 0, sizeof (*header));
499 return -ENXIO;
500 560
501 snap_count = le32_to_cpu(ondisk->snap_count); 561 snap_count = le32_to_cpu(ondisk->snap_count);
502 if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context)) 562
503 / sizeof (u64)) 563 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
504 return -EINVAL; 564 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
505 header->snapc = kmalloc(sizeof(struct ceph_snap_context) + 565 if (!header->object_prefix)
506 snap_count * sizeof(u64),
507 GFP_KERNEL);
508 if (!header->snapc)
509 return -ENOMEM; 566 return -ENOMEM;
567 memcpy(header->object_prefix, ondisk->object_prefix, len);
568 header->object_prefix[len] = '\0';
510 569
511 if (snap_count) { 570 if (snap_count) {
512 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); 571 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
513 header->snap_names = kmalloc(header->snap_names_len, 572
514 GFP_KERNEL); 573 /* Save a copy of the snapshot names */
574
575 if (snap_names_len > (u64) SIZE_MAX)
576 return -EIO;
577 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
515 if (!header->snap_names) 578 if (!header->snap_names)
516 goto err_snapc; 579 goto out_err;
517 header->snap_sizes = kmalloc(snap_count * sizeof(u64), 580 /*
518 GFP_KERNEL); 581 * Note that rbd_dev_v1_header_read() guarantees
582 * the ondisk buffer we're working with has
583 * snap_names_len bytes beyond the end of the
584 * snapshot id array, this memcpy() is safe.
585 */
586 memcpy(header->snap_names, &ondisk->snaps[snap_count],
587 snap_names_len);
588
589 /* Record each snapshot's size */
590
591 size = snap_count * sizeof (*header->snap_sizes);
592 header->snap_sizes = kmalloc(size, GFP_KERNEL);
519 if (!header->snap_sizes) 593 if (!header->snap_sizes)
520 goto err_names; 594 goto out_err;
595 for (i = 0; i < snap_count; i++)
596 header->snap_sizes[i] =
597 le64_to_cpu(ondisk->snaps[i].image_size);
521 } else { 598 } else {
522 WARN_ON(ondisk->snap_names_len); 599 WARN_ON(ondisk->snap_names_len);
523 header->snap_names_len = 0;
524 header->snap_names = NULL; 600 header->snap_names = NULL;
525 header->snap_sizes = NULL; 601 header->snap_sizes = NULL;
526 } 602 }
527 603
528 header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1, 604 header->features = 0; /* No features support in v1 images */
529 GFP_KERNEL);
530 if (!header->object_prefix)
531 goto err_sizes;
532
533 memcpy(header->object_prefix, ondisk->block_name,
534 sizeof(ondisk->block_name));
535 header->object_prefix[sizeof (ondisk->block_name)] = '\0';
536
537 header->image_size = le64_to_cpu(ondisk->image_size);
538 header->obj_order = ondisk->options.order; 605 header->obj_order = ondisk->options.order;
539 header->crypt_type = ondisk->options.crypt_type; 606 header->crypt_type = ondisk->options.crypt_type;
540 header->comp_type = ondisk->options.comp_type; 607 header->comp_type = ondisk->options.comp_type;
541 608
609 /* Allocate and fill in the snapshot context */
610
611 header->image_size = le64_to_cpu(ondisk->image_size);
612 size = sizeof (struct ceph_snap_context);
613 size += snap_count * sizeof (header->snapc->snaps[0]);
614 header->snapc = kzalloc(size, GFP_KERNEL);
615 if (!header->snapc)
616 goto out_err;
617
542 atomic_set(&header->snapc->nref, 1); 618 atomic_set(&header->snapc->nref, 1);
543 header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 619 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
544 header->snapc->num_snaps = snap_count; 620 header->snapc->num_snaps = snap_count;
545 header->total_snaps = snap_count; 621 for (i = 0; i < snap_count; i++)
546 622 header->snapc->snaps[i] =
547 if (snap_count && allocated_snaps == snap_count) { 623 le64_to_cpu(ondisk->snaps[i].id);
548 int i;
549
550 for (i = 0; i < snap_count; i++) {
551 header->snapc->snaps[i] =
552 le64_to_cpu(ondisk->snaps[i].id);
553 header->snap_sizes[i] =
554 le64_to_cpu(ondisk->snaps[i].image_size);
555 }
556
557 /* copy snapshot names */
558 memcpy(header->snap_names, &ondisk->snaps[snap_count],
559 header->snap_names_len);
560 }
561 624
562 return 0; 625 return 0;
563 626
564err_sizes: 627out_err:
565 kfree(header->snap_sizes); 628 kfree(header->snap_sizes);
566 header->snap_sizes = NULL; 629 header->snap_sizes = NULL;
567err_names:
568 kfree(header->snap_names); 630 kfree(header->snap_names);
569 header->snap_names = NULL; 631 header->snap_names = NULL;
570err_snapc: 632 kfree(header->object_prefix);
571 kfree(header->snapc); 633 header->object_prefix = NULL;
572 header->snapc = NULL;
573 634
574 return -ENOMEM; 635 return -ENOMEM;
575} 636}
576 637
577static int snap_by_name(struct rbd_image_header *header, const char *snap_name, 638static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
578 u64 *seq, u64 *size)
579{ 639{
580 int i;
581 char *p = header->snap_names;
582 640
583 for (i = 0; i < header->total_snaps; i++) { 641 struct rbd_snap *snap;
584 if (!strcmp(snap_name, p)) {
585 642
586 /* Found it. Pass back its id and/or size */ 643 list_for_each_entry(snap, &rbd_dev->snaps, node) {
644 if (!strcmp(snap_name, snap->name)) {
645 rbd_dev->mapping.snap_id = snap->id;
646 rbd_dev->mapping.size = snap->size;
647 rbd_dev->mapping.features = snap->features;
587 648
588 if (seq) 649 return 0;
589 *seq = header->snapc->snaps[i];
590 if (size)
591 *size = header->snap_sizes[i];
592 return i;
593 } 650 }
594 p += strlen(p) + 1; /* Skip ahead to the next name */
595 } 651 }
652
596 return -ENOENT; 653 return -ENOENT;
597} 654}
598 655
599static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) 656static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
600{ 657{
601 int ret; 658 int ret;
602 659
603 down_write(&rbd_dev->header_rwsem); 660 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
604
605 if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
606 sizeof (RBD_SNAP_HEAD_NAME))) { 661 sizeof (RBD_SNAP_HEAD_NAME))) {
607 rbd_dev->snap_id = CEPH_NOSNAP; 662 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
608 rbd_dev->snap_exists = false; 663 rbd_dev->mapping.size = rbd_dev->header.image_size;
609 rbd_dev->read_only = 0; 664 rbd_dev->mapping.features = rbd_dev->header.features;
610 if (size) 665 rbd_dev->mapping.snap_exists = false;
611 *size = rbd_dev->header.image_size; 666 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
667 ret = 0;
612 } else { 668 } else {
613 u64 snap_id = 0; 669 ret = snap_by_name(rbd_dev, snap_name);
614
615 ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
616 &snap_id, size);
617 if (ret < 0) 670 if (ret < 0)
618 goto done; 671 goto done;
619 rbd_dev->snap_id = snap_id; 672 rbd_dev->mapping.snap_exists = true;
620 rbd_dev->snap_exists = true; 673 rbd_dev->mapping.read_only = true;
621 rbd_dev->read_only = 1;
622 } 674 }
623 675 rbd_dev->mapping.snap_name = snap_name;
624 ret = 0;
625done: 676done:
626 up_write(&rbd_dev->header_rwsem);
627 return ret; 677 return ret;
628} 678}
629 679
630static void rbd_header_free(struct rbd_image_header *header) 680static void rbd_header_free(struct rbd_image_header *header)
631{ 681{
632 kfree(header->object_prefix); 682 kfree(header->object_prefix);
683 header->object_prefix = NULL;
633 kfree(header->snap_sizes); 684 kfree(header->snap_sizes);
685 header->snap_sizes = NULL;
634 kfree(header->snap_names); 686 kfree(header->snap_names);
687 header->snap_names = NULL;
635 ceph_put_snap_context(header->snapc); 688 ceph_put_snap_context(header->snapc);
689 header->snapc = NULL;
636} 690}
637 691
638/* 692static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
639 * get the actual striped segment name, offset and length 693{
640 */ 694 char *name;
641static u64 rbd_get_segment(struct rbd_image_header *header, 695 u64 segment;
642 const char *object_prefix, 696 int ret;
643 u64 ofs, u64 len, 697
644 char *seg_name, u64 *segofs) 698 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
699 if (!name)
700 return NULL;
701 segment = offset >> rbd_dev->header.obj_order;
702 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
703 rbd_dev->header.object_prefix, segment);
704 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
705 pr_err("error formatting segment name for #%llu (%d)\n",
706 segment, ret);
707 kfree(name);
708 name = NULL;
709 }
710
711 return name;
712}
713
714static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
645{ 715{
646 u64 seg = ofs >> header->obj_order; 716 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
647 717
648 if (seg_name) 718 return offset & (segment_size - 1);
649 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, 719}
650 "%s.%012llx", object_prefix, seg); 720
721static u64 rbd_segment_length(struct rbd_device *rbd_dev,
722 u64 offset, u64 length)
723{
724 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
651 725
652 ofs = ofs & ((1 << header->obj_order) - 1); 726 offset &= segment_size - 1;
653 len = min_t(u64, len, (1 << header->obj_order) - ofs);
654 727
655 if (segofs) 728 rbd_assert(length <= U64_MAX - offset);
656 *segofs = ofs; 729 if (offset + length > segment_size)
730 length = segment_size - offset;
657 731
658 return len; 732 return length;
659} 733}
660 734
661static int rbd_get_num_segments(struct rbd_image_header *header, 735static int rbd_get_num_segments(struct rbd_image_header *header,
662 u64 ofs, u64 len) 736 u64 ofs, u64 len)
663{ 737{
664 u64 start_seg = ofs >> header->obj_order; 738 u64 start_seg;
665 u64 end_seg = (ofs + len - 1) >> header->obj_order; 739 u64 end_seg;
740
741 if (!len)
742 return 0;
743 if (len - 1 > U64_MAX - ofs)
744 return -ERANGE;
745
746 start_seg = ofs >> header->obj_order;
747 end_seg = (ofs + len - 1) >> header->obj_order;
748
666 return end_seg - start_seg + 1; 749 return end_seg - start_seg + 1;
667} 750}
668 751
@@ -724,7 +807,9 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
724 struct bio_pair **bp, 807 struct bio_pair **bp,
725 int len, gfp_t gfpmask) 808 int len, gfp_t gfpmask)
726{ 809{
727 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL; 810 struct bio *old_chain = *old;
811 struct bio *new_chain = NULL;
812 struct bio *tail;
728 int total = 0; 813 int total = 0;
729 814
730 if (*bp) { 815 if (*bp) {
@@ -733,9 +818,12 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
733 } 818 }
734 819
735 while (old_chain && (total < len)) { 820 while (old_chain && (total < len)) {
821 struct bio *tmp;
822
736 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); 823 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
737 if (!tmp) 824 if (!tmp)
738 goto err_out; 825 goto err_out;
826 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
739 827
740 if (total + old_chain->bi_size > len) { 828 if (total + old_chain->bi_size > len) {
741 struct bio_pair *bp; 829 struct bio_pair *bp;
@@ -763,24 +851,18 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
763 } 851 }
764 852
765 tmp->bi_bdev = NULL; 853 tmp->bi_bdev = NULL;
766 gfpmask &= ~__GFP_WAIT;
767 tmp->bi_next = NULL; 854 tmp->bi_next = NULL;
768 855 if (new_chain)
769 if (!new_chain) {
770 new_chain = tail = tmp;
771 } else {
772 tail->bi_next = tmp; 856 tail->bi_next = tmp;
773 tail = tmp; 857 else
774 } 858 new_chain = tmp;
859 tail = tmp;
775 old_chain = old_chain->bi_next; 860 old_chain = old_chain->bi_next;
776 861
777 total += tmp->bi_size; 862 total += tmp->bi_size;
778 } 863 }
779 864
780 BUG_ON(total < len); 865 rbd_assert(total == len);
781
782 if (tail)
783 tail->bi_next = NULL;
784 866
785 *old = old_chain; 867 *old = old_chain;
786 868
@@ -938,8 +1020,9 @@ static int rbd_do_request(struct request *rq,
938 layout->fl_stripe_count = cpu_to_le32(1); 1020 layout->fl_stripe_count = cpu_to_le32(1);
939 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 1021 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
940 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); 1022 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
941 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, 1023 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
942 req, ops); 1024 req, ops);
1025 rbd_assert(ret == 0);
943 1026
944 ceph_osdc_build_request(req, ofs, &len, 1027 ceph_osdc_build_request(req, ofs, &len,
945 ops, 1028 ops,
@@ -1030,8 +1113,8 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1030 int flags, 1113 int flags,
1031 struct ceph_osd_req_op *ops, 1114 struct ceph_osd_req_op *ops,
1032 const char *object_name, 1115 const char *object_name,
1033 u64 ofs, u64 len, 1116 u64 ofs, u64 inbound_size,
1034 char *buf, 1117 char *inbound,
1035 struct ceph_osd_request **linger_req, 1118 struct ceph_osd_request **linger_req,
1036 u64 *ver) 1119 u64 *ver)
1037{ 1120{
@@ -1039,15 +1122,15 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1039 struct page **pages; 1122 struct page **pages;
1040 int num_pages; 1123 int num_pages;
1041 1124
1042 BUG_ON(ops == NULL); 1125 rbd_assert(ops != NULL);
1043 1126
1044 num_pages = calc_pages_for(ofs , len); 1127 num_pages = calc_pages_for(ofs, inbound_size);
1045 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1128 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1046 if (IS_ERR(pages)) 1129 if (IS_ERR(pages))
1047 return PTR_ERR(pages); 1130 return PTR_ERR(pages);
1048 1131
1049 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, 1132 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1050 object_name, ofs, len, NULL, 1133 object_name, ofs, inbound_size, NULL,
1051 pages, num_pages, 1134 pages, num_pages,
1052 flags, 1135 flags,
1053 ops, 1136 ops,
@@ -1057,8 +1140,8 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1057 if (ret < 0) 1140 if (ret < 0)
1058 goto done; 1141 goto done;
1059 1142
1060 if ((flags & CEPH_OSD_FLAG_READ) && buf) 1143 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1061 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); 1144 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
1062 1145
1063done: 1146done:
1064 ceph_release_page_vector(pages, num_pages); 1147 ceph_release_page_vector(pages, num_pages);
@@ -1085,14 +1168,11 @@ static int rbd_do_op(struct request *rq,
1085 struct ceph_osd_req_op *ops; 1168 struct ceph_osd_req_op *ops;
1086 u32 payload_len; 1169 u32 payload_len;
1087 1170
1088 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 1171 seg_name = rbd_segment_name(rbd_dev, ofs);
1089 if (!seg_name) 1172 if (!seg_name)
1090 return -ENOMEM; 1173 return -ENOMEM;
1091 1174 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1092 seg_len = rbd_get_segment(&rbd_dev->header, 1175 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1093 rbd_dev->header.object_prefix,
1094 ofs, len,
1095 seg_name, &seg_ofs);
1096 1176
1097 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); 1177 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1098 1178
@@ -1104,7 +1184,7 @@ static int rbd_do_op(struct request *rq,
1104 /* we've taken care of segment sizes earlier when we 1184 /* we've taken care of segment sizes earlier when we
1105 cloned the bios. We should never have a segment 1185 cloned the bios. We should never have a segment
1106 truncated at this point */ 1186 truncated at this point */
1107 BUG_ON(seg_len < len); 1187 rbd_assert(seg_len == len);
1108 1188
1109 ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1189 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1110 seg_name, seg_ofs, seg_len, 1190 seg_name, seg_ofs, seg_len,
@@ -1306,89 +1386,36 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
1306 return ret; 1386 return ret;
1307} 1387}
1308 1388
1309struct rbd_notify_info {
1310 struct rbd_device *rbd_dev;
1311};
1312
1313static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1314{
1315 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1316 if (!rbd_dev)
1317 return;
1318
1319 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1320 rbd_dev->header_name, (unsigned long long) notify_id,
1321 (unsigned int) opcode);
1322}
1323
1324/* 1389/*
1325 * Request sync osd notify 1390 * Synchronous osd object method call
1326 */
1327static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
1328{
1329 struct ceph_osd_req_op *ops;
1330 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1331 struct ceph_osd_event *event;
1332 struct rbd_notify_info info;
1333 int payload_len = sizeof(u32) + sizeof(u32);
1334 int ret;
1335
1336 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
1337 if (!ops)
1338 return -ENOMEM;
1339
1340 info.rbd_dev = rbd_dev;
1341
1342 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1343 (void *)&info, &event);
1344 if (ret < 0)
1345 goto fail;
1346
1347 ops[0].watch.ver = 1;
1348 ops[0].watch.flag = 1;
1349 ops[0].watch.cookie = event->cookie;
1350 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1351 ops[0].watch.timeout = 12;
1352
1353 ret = rbd_req_sync_op(rbd_dev, NULL,
1354 CEPH_NOSNAP,
1355 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1356 ops,
1357 rbd_dev->header_name,
1358 0, 0, NULL, NULL, NULL);
1359 if (ret < 0)
1360 goto fail_event;
1361
1362 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1363 dout("ceph_osdc_wait_event returned %d\n", ret);
1364 rbd_destroy_ops(ops);
1365 return 0;
1366
1367fail_event:
1368 ceph_osdc_cancel_event(event);
1369fail:
1370 rbd_destroy_ops(ops);
1371 return ret;
1372}
1373
1374/*
1375 * Request sync osd read
1376 */ 1391 */
1377static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1392static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1378 const char *object_name, 1393 const char *object_name,
1379 const char *class_name, 1394 const char *class_name,
1380 const char *method_name, 1395 const char *method_name,
1381 const char *data, 1396 const char *outbound,
1382 int len, 1397 size_t outbound_size,
1398 char *inbound,
1399 size_t inbound_size,
1400 int flags,
1383 u64 *ver) 1401 u64 *ver)
1384{ 1402{
1385 struct ceph_osd_req_op *ops; 1403 struct ceph_osd_req_op *ops;
1386 int class_name_len = strlen(class_name); 1404 int class_name_len = strlen(class_name);
1387 int method_name_len = strlen(method_name); 1405 int method_name_len = strlen(method_name);
1406 int payload_size;
1388 int ret; 1407 int ret;
1389 1408
1390 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, 1409 /*
1391 class_name_len + method_name_len + len); 1410 * Any input parameters required by the method we're calling
1411 * will be sent along with the class and method names as
1412 * part of the message payload. That data and its size are
1413 * supplied via the indata and indata_len fields (named from
1414 * the perspective of the server side) in the OSD request
1415 * operation.
1416 */
1417 payload_size = class_name_len + method_name_len + outbound_size;
1418 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
1392 if (!ops) 1419 if (!ops)
1393 return -ENOMEM; 1420 return -ENOMEM;
1394 1421
@@ -1397,14 +1424,14 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1397 ops[0].cls.method_name = method_name; 1424 ops[0].cls.method_name = method_name;
1398 ops[0].cls.method_len = (__u8) method_name_len; 1425 ops[0].cls.method_len = (__u8) method_name_len;
1399 ops[0].cls.argc = 0; 1426 ops[0].cls.argc = 0;
1400 ops[0].cls.indata = data; 1427 ops[0].cls.indata = outbound;
1401 ops[0].cls.indata_len = len; 1428 ops[0].cls.indata_len = outbound_size;
1402 1429
1403 ret = rbd_req_sync_op(rbd_dev, NULL, 1430 ret = rbd_req_sync_op(rbd_dev, NULL,
1404 CEPH_NOSNAP, 1431 CEPH_NOSNAP,
1405 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 1432 flags, ops,
1406 ops, 1433 object_name, 0, inbound_size, inbound,
1407 object_name, 0, 0, NULL, NULL, ver); 1434 NULL, ver);
1408 1435
1409 rbd_destroy_ops(ops); 1436 rbd_destroy_ops(ops);
1410 1437
@@ -1446,10 +1473,6 @@ static void rbd_rq_fn(struct request_queue *q)
1446 struct rbd_req_coll *coll; 1473 struct rbd_req_coll *coll;
1447 struct ceph_snap_context *snapc; 1474 struct ceph_snap_context *snapc;
1448 1475
1449 /* peek at request from block layer */
1450 if (!rq)
1451 break;
1452
1453 dout("fetched request\n"); 1476 dout("fetched request\n");
1454 1477
1455 /* filter out block requests we don't understand */ 1478 /* filter out block requests we don't understand */
@@ -1464,7 +1487,7 @@ static void rbd_rq_fn(struct request_queue *q)
1464 size = blk_rq_bytes(rq); 1487 size = blk_rq_bytes(rq);
1465 ofs = blk_rq_pos(rq) * SECTOR_SIZE; 1488 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1466 rq_bio = rq->bio; 1489 rq_bio = rq->bio;
1467 if (do_write && rbd_dev->read_only) { 1490 if (do_write && rbd_dev->mapping.read_only) {
1468 __blk_end_request_all(rq, -EROFS); 1491 __blk_end_request_all(rq, -EROFS);
1469 continue; 1492 continue;
1470 } 1493 }
@@ -1473,7 +1496,8 @@ static void rbd_rq_fn(struct request_queue *q)
1473 1496
1474 down_read(&rbd_dev->header_rwsem); 1497 down_read(&rbd_dev->header_rwsem);
1475 1498
1476 if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) { 1499 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1500 !rbd_dev->mapping.snap_exists) {
1477 up_read(&rbd_dev->header_rwsem); 1501 up_read(&rbd_dev->header_rwsem);
1478 dout("request for non-existent snapshot"); 1502 dout("request for non-existent snapshot");
1479 spin_lock_irq(q->queue_lock); 1503 spin_lock_irq(q->queue_lock);
@@ -1490,6 +1514,12 @@ static void rbd_rq_fn(struct request_queue *q)
1490 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); 1514 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1491 1515
1492 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 1516 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1517 if (num_segs <= 0) {
1518 spin_lock_irq(q->queue_lock);
1519 __blk_end_request_all(rq, num_segs);
1520 ceph_put_snap_context(snapc);
1521 continue;
1522 }
1493 coll = rbd_alloc_coll(num_segs); 1523 coll = rbd_alloc_coll(num_segs);
1494 if (!coll) { 1524 if (!coll) {
1495 spin_lock_irq(q->queue_lock); 1525 spin_lock_irq(q->queue_lock);
@@ -1501,10 +1531,7 @@ static void rbd_rq_fn(struct request_queue *q)
1501 do { 1531 do {
1502 /* a bio clone to be passed down to OSD req */ 1532 /* a bio clone to be passed down to OSD req */
1503 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); 1533 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
1504 op_size = rbd_get_segment(&rbd_dev->header, 1534 op_size = rbd_segment_length(rbd_dev, ofs, size);
1505 rbd_dev->header.object_prefix,
1506 ofs, size,
1507 NULL, NULL);
1508 kref_get(&coll->kref); 1535 kref_get(&coll->kref);
1509 bio = bio_chain_clone(&rq_bio, &next_bio, &bp, 1536 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1510 op_size, GFP_ATOMIC); 1537 op_size, GFP_ATOMIC);
@@ -1524,7 +1551,7 @@ static void rbd_rq_fn(struct request_queue *q)
1524 coll, cur_seg); 1551 coll, cur_seg);
1525 else 1552 else
1526 rbd_req_read(rq, rbd_dev, 1553 rbd_req_read(rq, rbd_dev,
1527 rbd_dev->snap_id, 1554 rbd_dev->mapping.snap_id,
1528 ofs, 1555 ofs,
1529 op_size, bio, 1556 op_size, bio,
1530 coll, cur_seg); 1557 coll, cur_seg);
@@ -1580,8 +1607,6 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
1580 if (!disk) 1607 if (!disk)
1581 return; 1608 return;
1582 1609
1583 rbd_header_free(&rbd_dev->header);
1584
1585 if (disk->flags & GENHD_FL_UP) 1610 if (disk->flags & GENHD_FL_UP)
1586 del_gendisk(disk); 1611 del_gendisk(disk);
1587 if (disk->queue) 1612 if (disk->queue)
@@ -1590,105 +1615,96 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
1590} 1615}
1591 1616
1592/* 1617/*
1593 * reload the ondisk the header 1618 * Read the complete header for the given rbd device.
1619 *
1620 * Returns a pointer to a dynamically-allocated buffer containing
1621 * the complete and validated header. Caller can pass the address
1622 * of a variable that will be filled in with the version of the
1623 * header object at the time it was read.
1624 *
1625 * Returns a pointer-coded errno if a failure occurs.
1594 */ 1626 */
1595static int rbd_read_header(struct rbd_device *rbd_dev, 1627static struct rbd_image_header_ondisk *
1596 struct rbd_image_header *header) 1628rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1597{ 1629{
1598 ssize_t rc; 1630 struct rbd_image_header_ondisk *ondisk = NULL;
1599 struct rbd_image_header_ondisk *dh;
1600 u32 snap_count = 0; 1631 u32 snap_count = 0;
1601 u64 ver; 1632 u64 names_size = 0;
1602 size_t len; 1633 u32 want_count;
1634 int ret;
1603 1635
1604 /* 1636 /*
1605 * First reads the fixed-size header to determine the number 1637 * The complete header will include an array of its 64-bit
1606 * of snapshots, then re-reads it, along with all snapshot 1638 * snapshot ids, followed by the names of those snapshots as
1607 * records as well as their stored names. 1639 * a contiguous block of NUL-terminated strings. Note that
1640 * the number of snapshots could change by the time we read
1641 * it in, in which case we re-read it.
1608 */ 1642 */
1609 len = sizeof (*dh); 1643 do {
1610 while (1) { 1644 size_t size;
1611 dh = kmalloc(len, GFP_KERNEL); 1645
1612 if (!dh) 1646 kfree(ondisk);
1613 return -ENOMEM; 1647
1614 1648 size = sizeof (*ondisk);
1615 rc = rbd_req_sync_read(rbd_dev, 1649 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1616 CEPH_NOSNAP, 1650 size += names_size;
1651 ondisk = kmalloc(size, GFP_KERNEL);
1652 if (!ondisk)
1653 return ERR_PTR(-ENOMEM);
1654
1655 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1617 rbd_dev->header_name, 1656 rbd_dev->header_name,
1618 0, len, 1657 0, size,
1619 (char *)dh, &ver); 1658 (char *) ondisk, version);
1620 if (rc < 0) 1659
1621 goto out_dh; 1660 if (ret < 0)
1622 1661 goto out_err;
1623 rc = rbd_header_from_disk(header, dh, snap_count); 1662 if (WARN_ON((size_t) ret < size)) {
1624 if (rc < 0) { 1663 ret = -ENXIO;
1625 if (rc == -ENXIO) 1664 pr_warning("short header read for image %s"
1626 pr_warning("unrecognized header format" 1665 " (want %zd got %d)\n",
1627 " for image %s\n", 1666 rbd_dev->image_name, size, ret);
1628 rbd_dev->image_name); 1667 goto out_err;
1629 goto out_dh; 1668 }
1669 if (!rbd_dev_ondisk_valid(ondisk)) {
1670 ret = -ENXIO;
1671 pr_warning("invalid header for image %s\n",
1672 rbd_dev->image_name);
1673 goto out_err;
1630 } 1674 }
1631 1675
1632 if (snap_count == header->total_snaps) 1676 names_size = le64_to_cpu(ondisk->snap_names_len);
1633 break; 1677 want_count = snap_count;
1678 snap_count = le32_to_cpu(ondisk->snap_count);
1679 } while (snap_count != want_count);
1634 1680
1635 snap_count = header->total_snaps; 1681 return ondisk;
1636 len = sizeof (*dh) +
1637 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1638 header->snap_names_len;
1639 1682
1640 rbd_header_free(header); 1683out_err:
1641 kfree(dh); 1684 kfree(ondisk);
1642 }
1643 header->obj_version = ver;
1644 1685
1645out_dh: 1686 return ERR_PTR(ret);
1646 kfree(dh);
1647 return rc;
1648} 1687}
1649 1688
1650/* 1689/*
1651 * create a snapshot 1690 * reload the ondisk the header
1652 */ 1691 */
1653static int rbd_header_add_snap(struct rbd_device *rbd_dev, 1692static int rbd_read_header(struct rbd_device *rbd_dev,
1654 const char *snap_name, 1693 struct rbd_image_header *header)
1655 gfp_t gfp_flags)
1656{ 1694{
1657 int name_len = strlen(snap_name); 1695 struct rbd_image_header_ondisk *ondisk;
1658 u64 new_snapid; 1696 u64 ver = 0;
1659 int ret; 1697 int ret;
1660 void *data, *p, *e;
1661 struct ceph_mon_client *monc;
1662
1663 /* we should create a snapshot only if we're pointing at the head */
1664 if (rbd_dev->snap_id != CEPH_NOSNAP)
1665 return -EINVAL;
1666 1698
1667 monc = &rbd_dev->rbd_client->client->monc; 1699 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1668 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid); 1700 if (IS_ERR(ondisk))
1669 dout("created snapid=%llu\n", (unsigned long long) new_snapid); 1701 return PTR_ERR(ondisk);
1670 if (ret < 0) 1702 ret = rbd_header_from_disk(header, ondisk);
1671 return ret; 1703 if (ret >= 0)
1672 1704 header->obj_version = ver;
1673 data = kmalloc(name_len + 16, gfp_flags); 1705 kfree(ondisk);
1674 if (!data)
1675 return -ENOMEM;
1676 1706
1677 p = data; 1707 return ret;
1678 e = data + name_len + 16;
1679
1680 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1681 ceph_encode_64_safe(&p, e, new_snapid, bad);
1682
1683 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
1684 "rbd", "snap_add",
1685 data, p - data, NULL);
1686
1687 kfree(data);
1688
1689 return ret < 0 ? ret : 0;
1690bad:
1691 return -ERANGE;
1692} 1708}
1693 1709
1694static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) 1710static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
@@ -1715,11 +1731,15 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1715 down_write(&rbd_dev->header_rwsem); 1731 down_write(&rbd_dev->header_rwsem);
1716 1732
1717 /* resized? */ 1733 /* resized? */
1718 if (rbd_dev->snap_id == CEPH_NOSNAP) { 1734 if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
1719 sector_t size = (sector_t) h.image_size / SECTOR_SIZE; 1735 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1720 1736
1721 dout("setting size to %llu sectors", (unsigned long long) size); 1737 if (size != (sector_t) rbd_dev->mapping.size) {
1722 set_capacity(rbd_dev->disk, size); 1738 dout("setting size to %llu sectors",
1739 (unsigned long long) size);
1740 rbd_dev->mapping.size = (u64) size;
1741 set_capacity(rbd_dev->disk, size);
1742 }
1723 } 1743 }
1724 1744
1725 /* rbd_dev->header.object_prefix shouldn't change */ 1745 /* rbd_dev->header.object_prefix shouldn't change */
@@ -1732,16 +1752,16 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1732 *hver = h.obj_version; 1752 *hver = h.obj_version;
1733 rbd_dev->header.obj_version = h.obj_version; 1753 rbd_dev->header.obj_version = h.obj_version;
1734 rbd_dev->header.image_size = h.image_size; 1754 rbd_dev->header.image_size = h.image_size;
1735 rbd_dev->header.total_snaps = h.total_snaps;
1736 rbd_dev->header.snapc = h.snapc; 1755 rbd_dev->header.snapc = h.snapc;
1737 rbd_dev->header.snap_names = h.snap_names; 1756 rbd_dev->header.snap_names = h.snap_names;
1738 rbd_dev->header.snap_names_len = h.snap_names_len;
1739 rbd_dev->header.snap_sizes = h.snap_sizes; 1757 rbd_dev->header.snap_sizes = h.snap_sizes;
1740 /* Free the extra copy of the object prefix */ 1758 /* Free the extra copy of the object prefix */
1741 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 1759 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1742 kfree(h.object_prefix); 1760 kfree(h.object_prefix);
1743 1761
1744 ret = __rbd_init_snaps_header(rbd_dev); 1762 ret = rbd_dev_snaps_update(rbd_dev);
1763 if (!ret)
1764 ret = rbd_dev_snaps_register(rbd_dev);
1745 1765
1746 up_write(&rbd_dev->header_rwsem); 1766 up_write(&rbd_dev->header_rwsem);
1747 1767
@@ -1763,29 +1783,12 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
1763{ 1783{
1764 struct gendisk *disk; 1784 struct gendisk *disk;
1765 struct request_queue *q; 1785 struct request_queue *q;
1766 int rc;
1767 u64 segment_size; 1786 u64 segment_size;
1768 u64 total_size = 0;
1769
1770 /* contact OSD, request size info about the object being mapped */
1771 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1772 if (rc)
1773 return rc;
1774
1775 /* no need to lock here, as rbd_dev is not registered yet */
1776 rc = __rbd_init_snaps_header(rbd_dev);
1777 if (rc)
1778 return rc;
1779
1780 rc = rbd_header_set_snap(rbd_dev, &total_size);
1781 if (rc)
1782 return rc;
1783 1787
1784 /* create gendisk info */ 1788 /* create gendisk info */
1785 rc = -ENOMEM;
1786 disk = alloc_disk(RBD_MINORS_PER_MAJOR); 1789 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1787 if (!disk) 1790 if (!disk)
1788 goto out; 1791 return -ENOMEM;
1789 1792
1790 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 1793 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1791 rbd_dev->dev_id); 1794 rbd_dev->dev_id);
@@ -1795,7 +1798,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
1795 disk->private_data = rbd_dev; 1798 disk->private_data = rbd_dev;
1796 1799
1797 /* init rq */ 1800 /* init rq */
1798 rc = -ENOMEM;
1799 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); 1801 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1800 if (!q) 1802 if (!q)
1801 goto out_disk; 1803 goto out_disk;
@@ -1816,20 +1818,14 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
1816 q->queuedata = rbd_dev; 1818 q->queuedata = rbd_dev;
1817 1819
1818 rbd_dev->disk = disk; 1820 rbd_dev->disk = disk;
1819 rbd_dev->q = q;
1820 1821
1821 /* finally, announce the disk to the world */ 1822 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1822 set_capacity(disk, total_size / SECTOR_SIZE);
1823 add_disk(disk);
1824 1823
1825 pr_info("%s: added with size 0x%llx\n",
1826 disk->disk_name, (unsigned long long)total_size);
1827 return 0; 1824 return 0;
1828
1829out_disk: 1825out_disk:
1830 put_disk(disk); 1826 put_disk(disk);
1831out: 1827
1832 return rc; 1828 return -ENOMEM;
1833} 1829}
1834 1830
1835/* 1831/*
@@ -1854,6 +1850,19 @@ static ssize_t rbd_size_show(struct device *dev,
1854 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 1850 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
1855} 1851}
1856 1852
1853/*
1854 * Note this shows the features for whatever's mapped, which is not
1855 * necessarily the base image.
1856 */
1857static ssize_t rbd_features_show(struct device *dev,
1858 struct device_attribute *attr, char *buf)
1859{
1860 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1861
1862 return sprintf(buf, "0x%016llx\n",
1863 (unsigned long long) rbd_dev->mapping.features);
1864}
1865
1857static ssize_t rbd_major_show(struct device *dev, 1866static ssize_t rbd_major_show(struct device *dev,
1858 struct device_attribute *attr, char *buf) 1867 struct device_attribute *attr, char *buf)
1859{ 1868{
@@ -1895,13 +1904,25 @@ static ssize_t rbd_name_show(struct device *dev,
1895 return sprintf(buf, "%s\n", rbd_dev->image_name); 1904 return sprintf(buf, "%s\n", rbd_dev->image_name);
1896} 1905}
1897 1906
1907static ssize_t rbd_image_id_show(struct device *dev,
1908 struct device_attribute *attr, char *buf)
1909{
1910 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1911
1912 return sprintf(buf, "%s\n", rbd_dev->image_id);
1913}
1914
1915/*
1916 * Shows the name of the currently-mapped snapshot (or
1917 * RBD_SNAP_HEAD_NAME for the base image).
1918 */
1898static ssize_t rbd_snap_show(struct device *dev, 1919static ssize_t rbd_snap_show(struct device *dev,
1899 struct device_attribute *attr, 1920 struct device_attribute *attr,
1900 char *buf) 1921 char *buf)
1901{ 1922{
1902 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1923 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1903 1924
1904 return sprintf(buf, "%s\n", rbd_dev->snap_name); 1925 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
1905} 1926}
1906 1927
1907static ssize_t rbd_image_refresh(struct device *dev, 1928static ssize_t rbd_image_refresh(struct device *dev,
@@ -1918,25 +1939,27 @@ static ssize_t rbd_image_refresh(struct device *dev,
1918} 1939}
1919 1940
1920static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 1941static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1942static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
1921static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 1943static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1922static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 1944static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1923static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 1945static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
1924static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 1946static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
1925static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 1947static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1948static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
1926static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 1949static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1927static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 1950static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1928static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
1929 1951
1930static struct attribute *rbd_attrs[] = { 1952static struct attribute *rbd_attrs[] = {
1931 &dev_attr_size.attr, 1953 &dev_attr_size.attr,
1954 &dev_attr_features.attr,
1932 &dev_attr_major.attr, 1955 &dev_attr_major.attr,
1933 &dev_attr_client_id.attr, 1956 &dev_attr_client_id.attr,
1934 &dev_attr_pool.attr, 1957 &dev_attr_pool.attr,
1935 &dev_attr_pool_id.attr, 1958 &dev_attr_pool_id.attr,
1936 &dev_attr_name.attr, 1959 &dev_attr_name.attr,
1960 &dev_attr_image_id.attr,
1937 &dev_attr_current_snap.attr, 1961 &dev_attr_current_snap.attr,
1938 &dev_attr_refresh.attr, 1962 &dev_attr_refresh.attr,
1939 &dev_attr_create_snap.attr,
1940 NULL 1963 NULL
1941}; 1964};
1942 1965
@@ -1982,12 +2005,24 @@ static ssize_t rbd_snap_id_show(struct device *dev,
1982 return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2005 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
1983} 2006}
1984 2007
2008static ssize_t rbd_snap_features_show(struct device *dev,
2009 struct device_attribute *attr,
2010 char *buf)
2011{
2012 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2013
2014 return sprintf(buf, "0x%016llx\n",
2015 (unsigned long long) snap->features);
2016}
2017
1985static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2018static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1986static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 2019static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2020static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
1987 2021
1988static struct attribute *rbd_snap_attrs[] = { 2022static struct attribute *rbd_snap_attrs[] = {
1989 &dev_attr_snap_size.attr, 2023 &dev_attr_snap_size.attr,
1990 &dev_attr_snap_id.attr, 2024 &dev_attr_snap_id.attr,
2025 &dev_attr_snap_features.attr,
1991 NULL, 2026 NULL,
1992}; 2027};
1993 2028
@@ -2012,10 +2047,21 @@ static struct device_type rbd_snap_device_type = {
2012 .release = rbd_snap_dev_release, 2047 .release = rbd_snap_dev_release,
2013}; 2048};
2014 2049
2050static bool rbd_snap_registered(struct rbd_snap *snap)
2051{
2052 bool ret = snap->dev.type == &rbd_snap_device_type;
2053 bool reg = device_is_registered(&snap->dev);
2054
2055 rbd_assert(!ret ^ reg);
2056
2057 return ret;
2058}
2059
2015static void __rbd_remove_snap_dev(struct rbd_snap *snap) 2060static void __rbd_remove_snap_dev(struct rbd_snap *snap)
2016{ 2061{
2017 list_del(&snap->node); 2062 list_del(&snap->node);
2018 device_unregister(&snap->dev); 2063 if (device_is_registered(&snap->dev))
2064 device_unregister(&snap->dev);
2019} 2065}
2020 2066
2021static int rbd_register_snap_dev(struct rbd_snap *snap, 2067static int rbd_register_snap_dev(struct rbd_snap *snap,
@@ -2028,13 +2074,17 @@ static int rbd_register_snap_dev(struct rbd_snap *snap,
2028 dev->parent = parent; 2074 dev->parent = parent;
2029 dev->release = rbd_snap_dev_release; 2075 dev->release = rbd_snap_dev_release;
2030 dev_set_name(dev, "snap_%s", snap->name); 2076 dev_set_name(dev, "snap_%s", snap->name);
2077 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2078
2031 ret = device_register(dev); 2079 ret = device_register(dev);
2032 2080
2033 return ret; 2081 return ret;
2034} 2082}
2035 2083
2036static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2084static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2037 int i, const char *name) 2085 const char *snap_name,
2086 u64 snap_id, u64 snap_size,
2087 u64 snap_features)
2038{ 2088{
2039 struct rbd_snap *snap; 2089 struct rbd_snap *snap;
2040 int ret; 2090 int ret;
@@ -2044,17 +2094,13 @@ static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2044 return ERR_PTR(-ENOMEM); 2094 return ERR_PTR(-ENOMEM);
2045 2095
2046 ret = -ENOMEM; 2096 ret = -ENOMEM;
2047 snap->name = kstrdup(name, GFP_KERNEL); 2097 snap->name = kstrdup(snap_name, GFP_KERNEL);
2048 if (!snap->name) 2098 if (!snap->name)
2049 goto err; 2099 goto err;
2050 2100
2051 snap->size = rbd_dev->header.snap_sizes[i]; 2101 snap->id = snap_id;
2052 snap->id = rbd_dev->header.snapc->snaps[i]; 2102 snap->size = snap_size;
2053 if (device_is_registered(&rbd_dev->dev)) { 2103 snap->features = snap_features;
2054 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2055 if (ret < 0)
2056 goto err;
2057 }
2058 2104
2059 return snap; 2105 return snap;
2060 2106
@@ -2065,128 +2111,439 @@ err:
2065 return ERR_PTR(ret); 2111 return ERR_PTR(ret);
2066} 2112}
2067 2113
2114static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2115 u64 *snap_size, u64 *snap_features)
2116{
2117 char *snap_name;
2118
2119 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2120
2121 *snap_size = rbd_dev->header.snap_sizes[which];
2122 *snap_features = 0; /* No features for v1 */
2123
2124 /* Skip over names until we find the one we are looking for */
2125
2126 snap_name = rbd_dev->header.snap_names;
2127 while (which--)
2128 snap_name += strlen(snap_name) + 1;
2129
2130 return snap_name;
2131}
2132
2068/* 2133/*
2069 * search for the previous snap in a null delimited string list 2134 * Get the size and object order for an image snapshot, or if
2135 * snap_id is CEPH_NOSNAP, gets this information for the base
2136 * image.
2070 */ 2137 */
2071const char *rbd_prev_snap_name(const char *name, const char *start) 2138static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2139 u8 *order, u64 *snap_size)
2072{ 2140{
2073 if (name < start + 2) 2141 __le64 snapid = cpu_to_le64(snap_id);
2074 return NULL; 2142 int ret;
2143 struct {
2144 u8 order;
2145 __le64 size;
2146 } __attribute__ ((packed)) size_buf = { 0 };
2147
2148 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2149 "rbd", "get_size",
2150 (char *) &snapid, sizeof (snapid),
2151 (char *) &size_buf, sizeof (size_buf),
2152 CEPH_OSD_FLAG_READ, NULL);
2153 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2154 if (ret < 0)
2155 return ret;
2156
2157 *order = size_buf.order;
2158 *snap_size = le64_to_cpu(size_buf.size);
2159
2160 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2161 (unsigned long long) snap_id, (unsigned int) *order,
2162 (unsigned long long) *snap_size);
2163
2164 return 0;
2165}
2166
2167static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2168{
2169 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2170 &rbd_dev->header.obj_order,
2171 &rbd_dev->header.image_size);
2172}
2173
2174static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2175{
2176 void *reply_buf;
2177 int ret;
2178 void *p;
2179
2180 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2181 if (!reply_buf)
2182 return -ENOMEM;
2183
2184 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2185 "rbd", "get_object_prefix",
2186 NULL, 0,
2187 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2188 CEPH_OSD_FLAG_READ, NULL);
2189 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2190 if (ret < 0)
2191 goto out;
2192
2193 p = reply_buf;
2194 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2195 p + RBD_OBJ_PREFIX_LEN_MAX,
2196 NULL, GFP_NOIO);
2197
2198 if (IS_ERR(rbd_dev->header.object_prefix)) {
2199 ret = PTR_ERR(rbd_dev->header.object_prefix);
2200 rbd_dev->header.object_prefix = NULL;
2201 } else {
2202 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2203 }
2075 2204
2076 name -= 2; 2205out:
2077 while (*name) { 2206 kfree(reply_buf);
2078 if (name == start) 2207
2079 return start; 2208 return ret;
2080 name--; 2209}
2210
2211static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2212 u64 *snap_features)
2213{
2214 __le64 snapid = cpu_to_le64(snap_id);
2215 struct {
2216 __le64 features;
2217 __le64 incompat;
2218 } features_buf = { 0 };
2219 int ret;
2220
2221 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2222 "rbd", "get_features",
2223 (char *) &snapid, sizeof (snapid),
2224 (char *) &features_buf, sizeof (features_buf),
2225 CEPH_OSD_FLAG_READ, NULL);
2226 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2227 if (ret < 0)
2228 return ret;
2229 *snap_features = le64_to_cpu(features_buf.features);
2230
2231 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2232 (unsigned long long) snap_id,
2233 (unsigned long long) *snap_features,
2234 (unsigned long long) le64_to_cpu(features_buf.incompat));
2235
2236 return 0;
2237}
2238
2239static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2240{
2241 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2242 &rbd_dev->header.features);
2243}
2244
2245static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
2246{
2247 size_t size;
2248 int ret;
2249 void *reply_buf;
2250 void *p;
2251 void *end;
2252 u64 seq;
2253 u32 snap_count;
2254 struct ceph_snap_context *snapc;
2255 u32 i;
2256
2257 /*
2258 * We'll need room for the seq value (maximum snapshot id),
2259 * snapshot count, and array of that many snapshot ids.
2260 * For now we have a fixed upper limit on the number we're
2261 * prepared to receive.
2262 */
2263 size = sizeof (__le64) + sizeof (__le32) +
2264 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2265 reply_buf = kzalloc(size, GFP_KERNEL);
2266 if (!reply_buf)
2267 return -ENOMEM;
2268
2269 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2270 "rbd", "get_snapcontext",
2271 NULL, 0,
2272 reply_buf, size,
2273 CEPH_OSD_FLAG_READ, ver);
2274 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2275 if (ret < 0)
2276 goto out;
2277
2278 ret = -ERANGE;
2279 p = reply_buf;
2280 end = (char *) reply_buf + size;
2281 ceph_decode_64_safe(&p, end, seq, out);
2282 ceph_decode_32_safe(&p, end, snap_count, out);
2283
2284 /*
2285 * Make sure the reported number of snapshot ids wouldn't go
2286 * beyond the end of our buffer. But before checking that,
2287 * make sure the computed size of the snapshot context we
2288 * allocate is representable in a size_t.
2289 */
2290 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2291 / sizeof (u64)) {
2292 ret = -EINVAL;
2293 goto out;
2081 } 2294 }
2082 return name + 1; 2295 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2296 goto out;
2297
2298 size = sizeof (struct ceph_snap_context) +
2299 snap_count * sizeof (snapc->snaps[0]);
2300 snapc = kmalloc(size, GFP_KERNEL);
2301 if (!snapc) {
2302 ret = -ENOMEM;
2303 goto out;
2304 }
2305
2306 atomic_set(&snapc->nref, 1);
2307 snapc->seq = seq;
2308 snapc->num_snaps = snap_count;
2309 for (i = 0; i < snap_count; i++)
2310 snapc->snaps[i] = ceph_decode_64(&p);
2311
2312 rbd_dev->header.snapc = snapc;
2313
2314 dout(" snap context seq = %llu, snap_count = %u\n",
2315 (unsigned long long) seq, (unsigned int) snap_count);
2316
2317out:
2318 kfree(reply_buf);
2319
2320 return 0;
2083} 2321}
2084 2322
2085/* 2323static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2086 * compare the old list of snapshots that we have to what's in the header
2087 * and update it accordingly. Note that the header holds the snapshots
2088 * in a reverse order (from newest to oldest) and we need to go from
2089 * older to new so that we don't get a duplicate snap name when
2090 * doing the process (e.g., removed snapshot and recreated a new
2091 * one with the same name.
2092 */
2093static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2094{ 2324{
2095 const char *name, *first_name; 2325 size_t size;
2096 int i = rbd_dev->header.total_snaps; 2326 void *reply_buf;
2097 struct rbd_snap *snap, *old_snap = NULL; 2327 __le64 snap_id;
2098 struct list_head *p, *n; 2328 int ret;
2329 void *p;
2330 void *end;
2331 size_t snap_name_len;
2332 char *snap_name;
2333
2334 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2335 reply_buf = kmalloc(size, GFP_KERNEL);
2336 if (!reply_buf)
2337 return ERR_PTR(-ENOMEM);
2099 2338
2100 first_name = rbd_dev->header.snap_names; 2339 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2101 name = first_name + rbd_dev->header.snap_names_len; 2340 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2341 "rbd", "get_snapshot_name",
2342 (char *) &snap_id, sizeof (snap_id),
2343 reply_buf, size,
2344 CEPH_OSD_FLAG_READ, NULL);
2345 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2346 if (ret < 0)
2347 goto out;
2102 2348
2103 list_for_each_prev_safe(p, n, &rbd_dev->snaps) { 2349 p = reply_buf;
2104 u64 cur_id; 2350 end = (char *) reply_buf + size;
2351 snap_name_len = 0;
2352 snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
2353 GFP_KERNEL);
2354 if (IS_ERR(snap_name)) {
2355 ret = PTR_ERR(snap_name);
2356 goto out;
2357 } else {
2358 dout(" snap_id 0x%016llx snap_name = %s\n",
2359 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2360 }
2361 kfree(reply_buf);
2105 2362
2106 old_snap = list_entry(p, struct rbd_snap, node); 2363 return snap_name;
2364out:
2365 kfree(reply_buf);
2107 2366
2108 if (i) 2367 return ERR_PTR(ret);
2109 cur_id = rbd_dev->header.snapc->snaps[i - 1]; 2368}
2110 2369
2111 if (!i || old_snap->id < cur_id) { 2370static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2112 /* 2371 u64 *snap_size, u64 *snap_features)
2113 * old_snap->id was skipped, thus was 2372{
2114 * removed. If this rbd_dev is mapped to 2373 __le64 snap_id;
2115 * the removed snapshot, record that it no 2374 u8 order;
2116 * longer exists, to prevent further I/O. 2375 int ret;
2117 */ 2376
2118 if (rbd_dev->snap_id == old_snap->id) 2377 snap_id = rbd_dev->header.snapc->snaps[which];
2119 rbd_dev->snap_exists = false; 2378 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2120 __rbd_remove_snap_dev(old_snap); 2379 if (ret)
2121 continue; 2380 return ERR_PTR(ret);
2122 } 2381 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2123 if (old_snap->id == cur_id) { 2382 if (ret)
2124 /* we have this snapshot already */ 2383 return ERR_PTR(ret);
2125 i--; 2384
2126 name = rbd_prev_snap_name(name, first_name); 2385 return rbd_dev_v2_snap_name(rbd_dev, which);
2386}
2387
2388static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2389 u64 *snap_size, u64 *snap_features)
2390{
2391 if (rbd_dev->image_format == 1)
2392 return rbd_dev_v1_snap_info(rbd_dev, which,
2393 snap_size, snap_features);
2394 if (rbd_dev->image_format == 2)
2395 return rbd_dev_v2_snap_info(rbd_dev, which,
2396 snap_size, snap_features);
2397 return ERR_PTR(-EINVAL);
2398}
2399
2400/*
2401 * Scan the rbd device's current snapshot list and compare it to the
2402 * newly-received snapshot context. Remove any existing snapshots
2403 * not present in the new snapshot context. Add a new snapshot for
2404 * any snaphots in the snapshot context not in the current list.
2405 * And verify there are no changes to snapshots we already know
2406 * about.
2407 *
2408 * Assumes the snapshots in the snapshot context are sorted by
2409 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2410 * are also maintained in that order.)
2411 */
2412static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2413{
2414 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2415 const u32 snap_count = snapc->num_snaps;
2416 struct list_head *head = &rbd_dev->snaps;
2417 struct list_head *links = head->next;
2418 u32 index = 0;
2419
2420 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
2421 while (index < snap_count || links != head) {
2422 u64 snap_id;
2423 struct rbd_snap *snap;
2424 char *snap_name;
2425 u64 snap_size = 0;
2426 u64 snap_features = 0;
2427
2428 snap_id = index < snap_count ? snapc->snaps[index]
2429 : CEPH_NOSNAP;
2430 snap = links != head ? list_entry(links, struct rbd_snap, node)
2431 : NULL;
2432 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2433
2434 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2435 struct list_head *next = links->next;
2436
2437 /* Existing snapshot not in the new snap context */
2438
2439 if (rbd_dev->mapping.snap_id == snap->id)
2440 rbd_dev->mapping.snap_exists = false;
2441 __rbd_remove_snap_dev(snap);
2442 dout("%ssnap id %llu has been removed\n",
2443 rbd_dev->mapping.snap_id == snap->id ?
2444 "mapped " : "",
2445 (unsigned long long) snap->id);
2446
2447 /* Done with this list entry; advance */
2448
2449 links = next;
2127 continue; 2450 continue;
2128 } 2451 }
2129 for (; i > 0; 2452
2130 i--, name = rbd_prev_snap_name(name, first_name)) { 2453 snap_name = rbd_dev_snap_info(rbd_dev, index,
2131 if (!name) { 2454 &snap_size, &snap_features);
2132 WARN_ON(1); 2455 if (IS_ERR(snap_name))
2133 return -EINVAL; 2456 return PTR_ERR(snap_name);
2457
2458 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2459 (unsigned long long) snap_id);
2460 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2461 struct rbd_snap *new_snap;
2462
2463 /* We haven't seen this snapshot before */
2464
2465 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
2466 snap_id, snap_size, snap_features);
2467 if (IS_ERR(new_snap)) {
2468 int err = PTR_ERR(new_snap);
2469
2470 dout(" failed to add dev, error %d\n", err);
2471
2472 return err;
2134 } 2473 }
2135 cur_id = rbd_dev->header.snapc->snaps[i]; 2474
2136 /* snapshot removal? handle it above */ 2475 /* New goes before existing, or at end of list */
2137 if (cur_id >= old_snap->id) 2476
2138 break; 2477 dout(" added dev%s\n", snap ? "" : " at end\n");
2139 /* a new snapshot */ 2478 if (snap)
2140 snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); 2479 list_add_tail(&new_snap->node, &snap->node);
2141 if (IS_ERR(snap)) 2480 else
2142 return PTR_ERR(snap); 2481 list_add_tail(&new_snap->node, head);
2143 2482 } else {
2144 /* note that we add it backward so using n and not p */ 2483 /* Already have this one */
2145 list_add(&snap->node, n); 2484
2146 p = &snap->node; 2485 dout(" already present\n");
2486
2487 rbd_assert(snap->size == snap_size);
2488 rbd_assert(!strcmp(snap->name, snap_name));
2489 rbd_assert(snap->features == snap_features);
2490
2491 /* Done with this list entry; advance */
2492
2493 links = links->next;
2147 } 2494 }
2495
2496 /* Advance to the next entry in the snapshot context */
2497
2498 index++;
2148 } 2499 }
2149 /* we're done going over the old snap list, just add what's left */ 2500 dout("%s: done\n", __func__);
2150 for (; i > 0; i--) { 2501
2151 name = rbd_prev_snap_name(name, first_name); 2502 return 0;
2152 if (!name) { 2503}
2153 WARN_ON(1); 2504
2154 return -EINVAL; 2505/*
2506 * Scan the list of snapshots and register the devices for any that
2507 * have not already been registered.
2508 */
2509static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2510{
2511 struct rbd_snap *snap;
2512 int ret = 0;
2513
2514 dout("%s called\n", __func__);
2515 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2516 return -EIO;
2517
2518 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2519 if (!rbd_snap_registered(snap)) {
2520 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2521 if (ret < 0)
2522 break;
2155 } 2523 }
2156 snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
2157 if (IS_ERR(snap))
2158 return PTR_ERR(snap);
2159 list_add(&snap->node, &rbd_dev->snaps);
2160 } 2524 }
2525 dout("%s: returning %d\n", __func__, ret);
2161 2526
2162 return 0; 2527 return ret;
2163} 2528}
2164 2529
2165static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 2530static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2166{ 2531{
2167 int ret;
2168 struct device *dev; 2532 struct device *dev;
2169 struct rbd_snap *snap; 2533 int ret;
2170 2534
2171 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2535 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2172 dev = &rbd_dev->dev;
2173 2536
2537 dev = &rbd_dev->dev;
2174 dev->bus = &rbd_bus_type; 2538 dev->bus = &rbd_bus_type;
2175 dev->type = &rbd_device_type; 2539 dev->type = &rbd_device_type;
2176 dev->parent = &rbd_root_dev; 2540 dev->parent = &rbd_root_dev;
2177 dev->release = rbd_dev_release; 2541 dev->release = rbd_dev_release;
2178 dev_set_name(dev, "%d", rbd_dev->dev_id); 2542 dev_set_name(dev, "%d", rbd_dev->dev_id);
2179 ret = device_register(dev); 2543 ret = device_register(dev);
2180 if (ret < 0)
2181 goto out;
2182 2544
2183 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2184 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2185 if (ret < 0)
2186 break;
2187 }
2188out:
2189 mutex_unlock(&ctl_mutex); 2545 mutex_unlock(&ctl_mutex);
2546
2190 return ret; 2547 return ret;
2191} 2548}
2192 2549
@@ -2211,33 +2568,37 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2211 return ret; 2568 return ret;
2212} 2569}
2213 2570
2214static atomic64_t rbd_id_max = ATOMIC64_INIT(0); 2571static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
2215 2572
2216/* 2573/*
2217 * Get a unique rbd identifier for the given new rbd_dev, and add 2574 * Get a unique rbd identifier for the given new rbd_dev, and add
2218 * the rbd_dev to the global list. The minimum rbd id is 1. 2575 * the rbd_dev to the global list. The minimum rbd id is 1.
2219 */ 2576 */
2220static void rbd_id_get(struct rbd_device *rbd_dev) 2577static void rbd_dev_id_get(struct rbd_device *rbd_dev)
2221{ 2578{
2222 rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max); 2579 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
2223 2580
2224 spin_lock(&rbd_dev_list_lock); 2581 spin_lock(&rbd_dev_list_lock);
2225 list_add_tail(&rbd_dev->node, &rbd_dev_list); 2582 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2226 spin_unlock(&rbd_dev_list_lock); 2583 spin_unlock(&rbd_dev_list_lock);
2584 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2585 (unsigned long long) rbd_dev->dev_id);
2227} 2586}
2228 2587
2229/* 2588/*
2230 * Remove an rbd_dev from the global list, and record that its 2589 * Remove an rbd_dev from the global list, and record that its
2231 * identifier is no longer in use. 2590 * identifier is no longer in use.
2232 */ 2591 */
2233static void rbd_id_put(struct rbd_device *rbd_dev) 2592static void rbd_dev_id_put(struct rbd_device *rbd_dev)
2234{ 2593{
2235 struct list_head *tmp; 2594 struct list_head *tmp;
2236 int rbd_id = rbd_dev->dev_id; 2595 int rbd_id = rbd_dev->dev_id;
2237 int max_id; 2596 int max_id;
2238 2597
2239 BUG_ON(rbd_id < 1); 2598 rbd_assert(rbd_id > 0);
2240 2599
2600 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2601 (unsigned long long) rbd_dev->dev_id);
2241 spin_lock(&rbd_dev_list_lock); 2602 spin_lock(&rbd_dev_list_lock);
2242 list_del_init(&rbd_dev->node); 2603 list_del_init(&rbd_dev->node);
2243 2604
@@ -2245,7 +2606,7 @@ static void rbd_id_put(struct rbd_device *rbd_dev)
2245 * If the id being "put" is not the current maximum, there 2606 * If the id being "put" is not the current maximum, there
2246 * is nothing special we need to do. 2607 * is nothing special we need to do.
2247 */ 2608 */
2248 if (rbd_id != atomic64_read(&rbd_id_max)) { 2609 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
2249 spin_unlock(&rbd_dev_list_lock); 2610 spin_unlock(&rbd_dev_list_lock);
2250 return; 2611 return;
2251 } 2612 }
@@ -2266,12 +2627,13 @@ static void rbd_id_put(struct rbd_device *rbd_dev)
2266 spin_unlock(&rbd_dev_list_lock); 2627 spin_unlock(&rbd_dev_list_lock);
2267 2628
2268 /* 2629 /*
2269 * The max id could have been updated by rbd_id_get(), in 2630 * The max id could have been updated by rbd_dev_id_get(), in
2270 * which case it now accurately reflects the new maximum. 2631 * which case it now accurately reflects the new maximum.
2271 * Be careful not to overwrite the maximum value in that 2632 * Be careful not to overwrite the maximum value in that
2272 * case. 2633 * case.
2273 */ 2634 */
2274 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id); 2635 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2636 dout(" max dev id has been reset\n");
2275} 2637}
2276 2638
2277/* 2639/*
@@ -2360,28 +2722,31 @@ static inline char *dup_token(const char **buf, size_t *lenp)
2360} 2722}
2361 2723
2362/* 2724/*
2363 * This fills in the pool_name, image_name, image_name_len, snap_name, 2725 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2364 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based 2726 * rbd_md_name, and name fields of the given rbd_dev, based on the
2365 * on the list of monitor addresses and other options provided via 2727 * list of monitor addresses and other options provided via
2366 * /sys/bus/rbd/add. 2728 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2729 * copy of the snapshot name to map if successful, or a
2730 * pointer-coded error otherwise.
2367 * 2731 *
2368 * Note: rbd_dev is assumed to have been initially zero-filled. 2732 * Note: rbd_dev is assumed to have been initially zero-filled.
2369 */ 2733 */
2370static int rbd_add_parse_args(struct rbd_device *rbd_dev, 2734static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
2371 const char *buf, 2735 const char *buf,
2372 const char **mon_addrs, 2736 const char **mon_addrs,
2373 size_t *mon_addrs_size, 2737 size_t *mon_addrs_size,
2374 char *options, 2738 char *options,
2375 size_t options_size) 2739 size_t options_size)
2376{ 2740{
2377 size_t len; 2741 size_t len;
2378 int ret; 2742 char *err_ptr = ERR_PTR(-EINVAL);
2743 char *snap_name;
2379 2744
2380 /* The first four tokens are required */ 2745 /* The first four tokens are required */
2381 2746
2382 len = next_token(&buf); 2747 len = next_token(&buf);
2383 if (!len) 2748 if (!len)
2384 return -EINVAL; 2749 return err_ptr;
2385 *mon_addrs_size = len + 1; 2750 *mon_addrs_size = len + 1;
2386 *mon_addrs = buf; 2751 *mon_addrs = buf;
2387 2752
@@ -2389,9 +2754,9 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2389 2754
2390 len = copy_token(&buf, options, options_size); 2755 len = copy_token(&buf, options, options_size);
2391 if (!len || len >= options_size) 2756 if (!len || len >= options_size)
2392 return -EINVAL; 2757 return err_ptr;
2393 2758
2394 ret = -ENOMEM; 2759 err_ptr = ERR_PTR(-ENOMEM);
2395 rbd_dev->pool_name = dup_token(&buf, NULL); 2760 rbd_dev->pool_name = dup_token(&buf, NULL);
2396 if (!rbd_dev->pool_name) 2761 if (!rbd_dev->pool_name)
2397 goto out_err; 2762 goto out_err;
@@ -2400,41 +2765,227 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2400 if (!rbd_dev->image_name) 2765 if (!rbd_dev->image_name)
2401 goto out_err; 2766 goto out_err;
2402 2767
2403 /* Create the name of the header object */ 2768 /* Snapshot name is optional */
2769 len = next_token(&buf);
2770 if (!len) {
2771 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2772 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
2773 }
2774 snap_name = kmalloc(len + 1, GFP_KERNEL);
2775 if (!snap_name)
2776 goto out_err;
2777 memcpy(snap_name, buf, len);
2778 *(snap_name + len) = '\0';
2404 2779
2405 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len 2780dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);
2406 + sizeof (RBD_SUFFIX), 2781
2407 GFP_KERNEL); 2782 return snap_name;
2408 if (!rbd_dev->header_name) 2783
2784out_err:
2785 kfree(rbd_dev->image_name);
2786 rbd_dev->image_name = NULL;
2787 rbd_dev->image_name_len = 0;
2788 kfree(rbd_dev->pool_name);
2789 rbd_dev->pool_name = NULL;
2790
2791 return err_ptr;
2792}
2793
2794/*
2795 * An rbd format 2 image has a unique identifier, distinct from the
2796 * name given to it by the user. Internally, that identifier is
2797 * what's used to specify the names of objects related to the image.
2798 *
2799 * A special "rbd id" object is used to map an rbd image name to its
2800 * id. If that object doesn't exist, then there is no v2 rbd image
2801 * with the supplied name.
2802 *
2803 * This function will record the given rbd_dev's image_id field if
2804 * it can be determined, and in that case will return 0. If any
2805 * errors occur a negative errno will be returned and the rbd_dev's
2806 * image_id field will be unchanged (and should be NULL).
2807 */
2808static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2809{
2810 int ret;
2811 size_t size;
2812 char *object_name;
2813 void *response;
2814 void *p;
2815
2816 /*
2817 * First, see if the format 2 image id file exists, and if
2818 * so, get the image's persistent id from it.
2819 */
2820 size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
2821 object_name = kmalloc(size, GFP_NOIO);
2822 if (!object_name)
2823 return -ENOMEM;
2824 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
2825 dout("rbd id object name is %s\n", object_name);
2826
2827 /* Response will be an encoded string, which includes a length */
2828
2829 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
2830 response = kzalloc(size, GFP_NOIO);
2831 if (!response) {
2832 ret = -ENOMEM;
2833 goto out;
2834 }
2835
2836 ret = rbd_req_sync_exec(rbd_dev, object_name,
2837 "rbd", "get_id",
2838 NULL, 0,
2839 response, RBD_IMAGE_ID_LEN_MAX,
2840 CEPH_OSD_FLAG_READ, NULL);
2841 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2842 if (ret < 0)
2843 goto out;
2844
2845 p = response;
2846 rbd_dev->image_id = ceph_extract_encoded_string(&p,
2847 p + RBD_IMAGE_ID_LEN_MAX,
2848 &rbd_dev->image_id_len,
2849 GFP_NOIO);
2850 if (IS_ERR(rbd_dev->image_id)) {
2851 ret = PTR_ERR(rbd_dev->image_id);
2852 rbd_dev->image_id = NULL;
2853 } else {
2854 dout("image_id is %s\n", rbd_dev->image_id);
2855 }
2856out:
2857 kfree(response);
2858 kfree(object_name);
2859
2860 return ret;
2861}
2862
2863static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2864{
2865 int ret;
2866 size_t size;
2867
2868 /* Version 1 images have no id; empty string is used */
2869
2870 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2871 if (!rbd_dev->image_id)
2872 return -ENOMEM;
2873 rbd_dev->image_id_len = 0;
2874
2875 /* Record the header object name for this rbd image. */
2876
2877 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
2878 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2879 if (!rbd_dev->header_name) {
2880 ret = -ENOMEM;
2409 goto out_err; 2881 goto out_err;
2882 }
2410 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); 2883 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2411 2884
2885 /* Populate rbd image metadata */
2886
2887 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
2888 if (ret < 0)
2889 goto out_err;
2890 rbd_dev->image_format = 1;
2891
2892 dout("discovered version 1 image, header name is %s\n",
2893 rbd_dev->header_name);
2894
2895 return 0;
2896
2897out_err:
2898 kfree(rbd_dev->header_name);
2899 rbd_dev->header_name = NULL;
2900 kfree(rbd_dev->image_id);
2901 rbd_dev->image_id = NULL;
2902
2903 return ret;
2904}
2905
2906static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
2907{
2908 size_t size;
2909 int ret;
2910 u64 ver = 0;
2911
2412 /* 2912 /*
2413 * The snapshot name is optional. If none is is supplied, 2913 * Image id was filled in by the caller. Record the header
2414 * we use the default value. 2914 * object name for this rbd image.
2415 */ 2915 */
2416 rbd_dev->snap_name = dup_token(&buf, &len); 2916 size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
2417 if (!rbd_dev->snap_name) 2917 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2918 if (!rbd_dev->header_name)
2919 return -ENOMEM;
2920 sprintf(rbd_dev->header_name, "%s%s",
2921 RBD_HEADER_PREFIX, rbd_dev->image_id);
2922
2923 /* Get the size and object order for the image */
2924
2925 ret = rbd_dev_v2_image_size(rbd_dev);
2926 if (ret < 0)
2418 goto out_err; 2927 goto out_err;
2419 if (!len) {
2420 /* Replace the empty name with the default */
2421 kfree(rbd_dev->snap_name);
2422 rbd_dev->snap_name
2423 = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2424 if (!rbd_dev->snap_name)
2425 goto out_err;
2426 2928
2427 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, 2929 /* Get the object prefix (a.k.a. block_name) for the image */
2428 sizeof (RBD_SNAP_HEAD_NAME));
2429 }
2430 2930
2431 return 0; 2931 ret = rbd_dev_v2_object_prefix(rbd_dev);
2932 if (ret < 0)
2933 goto out_err;
2934
2935 /* Get the features for the image */
2432 2936
2937 ret = rbd_dev_v2_features(rbd_dev);
2938 if (ret < 0)
2939 goto out_err;
2940
2941 /* crypto and compression type aren't (yet) supported for v2 images */
2942
2943 rbd_dev->header.crypt_type = 0;
2944 rbd_dev->header.comp_type = 0;
2945
2946 /* Get the snapshot context, plus the header version */
2947
2948 ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
2949 if (ret)
2950 goto out_err;
2951 rbd_dev->header.obj_version = ver;
2952
2953 rbd_dev->image_format = 2;
2954
2955 dout("discovered version 2 image, header name is %s\n",
2956 rbd_dev->header_name);
2957
2958 return -ENOTSUPP;
2433out_err: 2959out_err:
2434 kfree(rbd_dev->header_name); 2960 kfree(rbd_dev->header_name);
2435 kfree(rbd_dev->image_name); 2961 rbd_dev->header_name = NULL;
2436 kfree(rbd_dev->pool_name); 2962 kfree(rbd_dev->header.object_prefix);
2437 rbd_dev->pool_name = NULL; 2963 rbd_dev->header.object_prefix = NULL;
2964
2965 return ret;
2966}
2967
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 *
 * Returns the result of whichever format-specific probe was run.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If that fails for any
	 * reason (ENOENT is what a non-format-2 image is expected to
	 * produce), assume it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev) ? rbd_dev_v1_probe(rbd_dev)
					: rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
@@ -2449,16 +3000,17 @@ static ssize_t rbd_add(struct bus_type *bus,
2449 size_t mon_addrs_size = 0; 3000 size_t mon_addrs_size = 0;
2450 struct ceph_osd_client *osdc; 3001 struct ceph_osd_client *osdc;
2451 int rc = -ENOMEM; 3002 int rc = -ENOMEM;
3003 char *snap_name;
2452 3004
2453 if (!try_module_get(THIS_MODULE)) 3005 if (!try_module_get(THIS_MODULE))
2454 return -ENODEV; 3006 return -ENODEV;
2455 3007
2456 options = kmalloc(count, GFP_KERNEL); 3008 options = kmalloc(count, GFP_KERNEL);
2457 if (!options) 3009 if (!options)
2458 goto err_nomem; 3010 goto err_out_mem;
2459 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 3011 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2460 if (!rbd_dev) 3012 if (!rbd_dev)
2461 goto err_nomem; 3013 goto err_out_mem;
2462 3014
2463 /* static rbd_device initialization */ 3015 /* static rbd_device initialization */
2464 spin_lock_init(&rbd_dev->lock); 3016 spin_lock_init(&rbd_dev->lock);
@@ -2466,27 +3018,18 @@ static ssize_t rbd_add(struct bus_type *bus,
2466 INIT_LIST_HEAD(&rbd_dev->snaps); 3018 INIT_LIST_HEAD(&rbd_dev->snaps);
2467 init_rwsem(&rbd_dev->header_rwsem); 3019 init_rwsem(&rbd_dev->header_rwsem);
2468 3020
2469 /* generate unique id: find highest unique id, add one */
2470 rbd_id_get(rbd_dev);
2471
2472 /* Fill in the device name, now that we have its id. */
2473 BUILD_BUG_ON(DEV_NAME_LEN
2474 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2475 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2476
2477 /* parse add command */ 3021 /* parse add command */
2478 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, 3022 snap_name = rbd_add_parse_args(rbd_dev, buf,
2479 options, count); 3023 &mon_addrs, &mon_addrs_size, options, count);
2480 if (rc) 3024 if (IS_ERR(snap_name)) {
2481 goto err_put_id; 3025 rc = PTR_ERR(snap_name);
2482 3026 goto err_out_mem;
2483 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2484 options);
2485 if (IS_ERR(rbd_dev->rbd_client)) {
2486 rc = PTR_ERR(rbd_dev->rbd_client);
2487 goto err_put_id;
2488 } 3027 }
2489 3028
3029 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
3030 if (rc < 0)
3031 goto err_out_args;
3032
2490 /* pick the pool */ 3033 /* pick the pool */
2491 osdc = &rbd_dev->rbd_client->client->osdc; 3034 osdc = &rbd_dev->rbd_client->client->osdc;
2492 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); 3035 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
@@ -2494,23 +3037,53 @@ static ssize_t rbd_add(struct bus_type *bus,
2494 goto err_out_client; 3037 goto err_out_client;
2495 rbd_dev->pool_id = rc; 3038 rbd_dev->pool_id = rc;
2496 3039
2497 /* register our block device */ 3040 rc = rbd_dev_probe(rbd_dev);
2498 rc = register_blkdev(0, rbd_dev->name);
2499 if (rc < 0) 3041 if (rc < 0)
2500 goto err_out_client; 3042 goto err_out_client;
3043 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3044
3045 /* no need to lock here, as rbd_dev is not registered yet */
3046 rc = rbd_dev_snaps_update(rbd_dev);
3047 if (rc)
3048 goto err_out_header;
3049
3050 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
3051 if (rc)
3052 goto err_out_header;
3053
3054 /* generate unique id: find highest unique id, add one */
3055 rbd_dev_id_get(rbd_dev);
3056
3057 /* Fill in the device name, now that we have its id. */
3058 BUILD_BUG_ON(DEV_NAME_LEN
3059 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3060 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3061
3062 /* Get our block major device number. */
3063
3064 rc = register_blkdev(0, rbd_dev->name);
3065 if (rc < 0)
3066 goto err_out_id;
2501 rbd_dev->major = rc; 3067 rbd_dev->major = rc;
2502 3068
2503 rc = rbd_bus_add_dev(rbd_dev); 3069 /* Set up the blkdev mapping. */
3070
3071 rc = rbd_init_disk(rbd_dev);
2504 if (rc) 3072 if (rc)
2505 goto err_out_blkdev; 3073 goto err_out_blkdev;
2506 3074
3075 rc = rbd_bus_add_dev(rbd_dev);
3076 if (rc)
3077 goto err_out_disk;
3078
2507 /* 3079 /*
2508 * At this point cleanup in the event of an error is the job 3080 * At this point cleanup in the event of an error is the job
2509 * of the sysfs code (initiated by rbd_bus_del_dev()). 3081 * of the sysfs code (initiated by rbd_bus_del_dev()).
2510 *
2511 * Set up and announce blkdev mapping.
2512 */ 3082 */
2513 rc = rbd_init_disk(rbd_dev); 3083
3084 down_write(&rbd_dev->header_rwsem);
3085 rc = rbd_dev_snaps_register(rbd_dev);
3086 up_write(&rbd_dev->header_rwsem);
2514 if (rc) 3087 if (rc)
2515 goto err_out_bus; 3088 goto err_out_bus;
2516 3089
@@ -2518,6 +3091,13 @@ static ssize_t rbd_add(struct bus_type *bus,
2518 if (rc) 3091 if (rc)
2519 goto err_out_bus; 3092 goto err_out_bus;
2520 3093
3094 /* Everything's ready. Announce the disk to the world. */
3095
3096 add_disk(rbd_dev->disk);
3097
3098 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3099 (unsigned long long) rbd_dev->mapping.size);
3100
2521 return count; 3101 return count;
2522 3102
2523err_out_bus: 3103err_out_bus:
@@ -2527,19 +3107,23 @@ err_out_bus:
2527 kfree(options); 3107 kfree(options);
2528 return rc; 3108 return rc;
2529 3109
3110err_out_disk:
3111 rbd_free_disk(rbd_dev);
2530err_out_blkdev: 3112err_out_blkdev:
2531 unregister_blkdev(rbd_dev->major, rbd_dev->name); 3113 unregister_blkdev(rbd_dev->major, rbd_dev->name);
3114err_out_id:
3115 rbd_dev_id_put(rbd_dev);
3116err_out_header:
3117 rbd_header_free(&rbd_dev->header);
2532err_out_client: 3118err_out_client:
3119 kfree(rbd_dev->header_name);
2533 rbd_put_client(rbd_dev); 3120 rbd_put_client(rbd_dev);
2534err_put_id: 3121 kfree(rbd_dev->image_id);
2535 if (rbd_dev->pool_name) { 3122err_out_args:
2536 kfree(rbd_dev->snap_name); 3123 kfree(rbd_dev->mapping.snap_name);
2537 kfree(rbd_dev->header_name); 3124 kfree(rbd_dev->image_name);
2538 kfree(rbd_dev->image_name); 3125 kfree(rbd_dev->pool_name);
2539 kfree(rbd_dev->pool_name); 3126err_out_mem:
2540 }
2541 rbd_id_put(rbd_dev);
2542err_nomem:
2543 kfree(rbd_dev); 3127 kfree(rbd_dev);
2544 kfree(options); 3128 kfree(options);
2545 3129
@@ -2585,12 +3169,16 @@ static void rbd_dev_release(struct device *dev)
2585 rbd_free_disk(rbd_dev); 3169 rbd_free_disk(rbd_dev);
2586 unregister_blkdev(rbd_dev->major, rbd_dev->name); 3170 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2587 3171
3172 /* release allocated disk header fields */
3173 rbd_header_free(&rbd_dev->header);
3174
2588 /* done with the id, and with the rbd_dev */ 3175 /* done with the id, and with the rbd_dev */
2589 kfree(rbd_dev->snap_name); 3176 kfree(rbd_dev->mapping.snap_name);
3177 kfree(rbd_dev->image_id);
2590 kfree(rbd_dev->header_name); 3178 kfree(rbd_dev->header_name);
2591 kfree(rbd_dev->pool_name); 3179 kfree(rbd_dev->pool_name);
2592 kfree(rbd_dev->image_name); 3180 kfree(rbd_dev->image_name);
2593 rbd_id_put(rbd_dev); 3181 rbd_dev_id_put(rbd_dev);
2594 kfree(rbd_dev); 3182 kfree(rbd_dev);
2595 3183
2596 /* release module ref */ 3184 /* release module ref */
@@ -2628,47 +3216,7 @@ static ssize_t rbd_remove(struct bus_type *bus,
2628 3216
2629done: 3217done:
2630 mutex_unlock(&ctl_mutex); 3218 mutex_unlock(&ctl_mutex);
2631 return ret;
2632}
2633 3219
2634static ssize_t rbd_snap_add(struct device *dev,
2635 struct device_attribute *attr,
2636 const char *buf,
2637 size_t count)
2638{
2639 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2640 int ret;
2641 char *name = kmalloc(count + 1, GFP_KERNEL);
2642 if (!name)
2643 return -ENOMEM;
2644
2645 snprintf(name, count, "%s", buf);
2646
2647 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2648
2649 ret = rbd_header_add_snap(rbd_dev,
2650 name, GFP_KERNEL);
2651 if (ret < 0)
2652 goto err_unlock;
2653
2654 ret = __rbd_refresh_header(rbd_dev, NULL);
2655 if (ret < 0)
2656 goto err_unlock;
2657
2658 /* shouldn't hold ctl_mutex when notifying.. notify might
2659 trigger a watch callback that would need to get that mutex */
2660 mutex_unlock(&ctl_mutex);
2661
2662 /* make a best effort, don't error if failed */
2663 rbd_req_sync_notify(rbd_dev);
2664
2665 ret = count;
2666 kfree(name);
2667 return ret;
2668
2669err_unlock:
2670 mutex_unlock(&ctl_mutex);
2671 kfree(name);
2672 return ret; 3220 return ret;
2673} 3221}
2674 3222
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index 0924e9e41a60..cbe77fa105ba 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -15,15 +15,30 @@
15 15
16#include <linux/types.h> 16#include <linux/types.h>
17 17
18/* For format version 2, rbd image 'foo' consists of objects
19 * rbd_id.foo - id of image
20 * rbd_header.<id> - image metadata
21 * rbd_data.<id>.0000000000000000
22 * rbd_data.<id>.0000000000000001
23 * ... - data
24 * Clients do not access header data directly in rbd format 2.
25 */
26
27#define RBD_HEADER_PREFIX "rbd_header."
28#define RBD_DATA_PREFIX "rbd_data."
29#define RBD_ID_PREFIX "rbd_id."
30
18/* 31/*
19 * rbd image 'foo' consists of objects 32 * For format version 1, rbd image 'foo' consists of objects
20 * foo.rbd - image metadata 33 * foo.rbd - image metadata
21 * foo.00000000 34 * rb.<idhi>.<idlo>.00000000
22 * foo.00000001 35 * rb.<idhi>.<idlo>.00000001
23 * ... - data 36 * ... - data
37 * There is no notion of a persistent image id in rbd format 1.
24 */ 38 */
25 39
26#define RBD_SUFFIX ".rbd" 40#define RBD_SUFFIX ".rbd"
41
27#define RBD_DIRECTORY "rbd_directory" 42#define RBD_DIRECTORY "rbd_directory"
28#define RBD_INFO "rbd_info" 43#define RBD_INFO "rbd_info"
29 44
@@ -47,7 +62,7 @@ struct rbd_image_snap_ondisk {
47 62
48struct rbd_image_header_ondisk { 63struct rbd_image_header_ondisk {
49 char text[40]; 64 char text[40];
50 char block_name[24]; 65 char object_prefix[24];
51 char signature[4]; 66 char signature[4];
52 char version[8]; 67 char version[8];
53 struct { 68 struct {