diff options
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/rbd.c | 1389 | ||||
-rw-r--r-- | drivers/block/rbd_types.h | 2 |
2 files changed, 963 insertions, 428 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bb3d9be3b1b4..89576a0b3f2e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c | |||
@@ -61,15 +61,29 @@ | |||
61 | 61 | ||
62 | #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ | 62 | #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ |
63 | 63 | ||
64 | #define RBD_MAX_SNAP_NAME_LEN 32 | 64 | #define RBD_SNAP_DEV_NAME_PREFIX "snap_" |
65 | #define RBD_MAX_SNAP_NAME_LEN \ | ||
66 | (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) | ||
67 | |||
65 | #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ | 68 | #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ |
66 | #define RBD_MAX_OPT_LEN 1024 | 69 | #define RBD_MAX_OPT_LEN 1024 |
67 | 70 | ||
68 | #define RBD_SNAP_HEAD_NAME "-" | 71 | #define RBD_SNAP_HEAD_NAME "-" |
69 | 72 | ||
73 | /* This allows a single page to hold an image name sent by OSD */ | ||
74 | #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) | ||
70 | #define RBD_IMAGE_ID_LEN_MAX 64 | 75 | #define RBD_IMAGE_ID_LEN_MAX 64 |
76 | |||
71 | #define RBD_OBJ_PREFIX_LEN_MAX 64 | 77 | #define RBD_OBJ_PREFIX_LEN_MAX 64 |
72 | 78 | ||
79 | /* Feature bits */ | ||
80 | |||
81 | #define RBD_FEATURE_LAYERING 1 | ||
82 | |||
83 | /* Features supported by this (client software) implementation. */ | ||
84 | |||
85 | #define RBD_FEATURES_ALL (0) | ||
86 | |||
73 | /* | 87 | /* |
74 | * An RBD device name will be "rbd#", where the "rbd" comes from | 88 | * An RBD device name will be "rbd#", where the "rbd" comes from |
75 | * RBD_DRV_NAME above, and # is a unique integer identifier. | 89 | * RBD_DRV_NAME above, and # is a unique integer identifier. |
@@ -101,6 +115,27 @@ struct rbd_image_header { | |||
101 | u64 obj_version; | 115 | u64 obj_version; |
102 | }; | 116 | }; |
103 | 117 | ||
118 | /* | ||
119 | * An rbd image specification. | ||
120 | * | ||
121 | * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely | ||
122 | * identify an image. | ||
123 | */ | ||
124 | struct rbd_spec { | ||
125 | u64 pool_id; | ||
126 | char *pool_name; | ||
127 | |||
128 | char *image_id; | ||
129 | size_t image_id_len; | ||
130 | char *image_name; | ||
131 | size_t image_name_len; | ||
132 | |||
133 | u64 snap_id; | ||
134 | char *snap_name; | ||
135 | |||
136 | struct kref kref; | ||
137 | }; | ||
138 | |||
104 | struct rbd_options { | 139 | struct rbd_options { |
105 | bool read_only; | 140 | bool read_only; |
106 | }; | 141 | }; |
@@ -155,11 +190,8 @@ struct rbd_snap { | |||
155 | }; | 190 | }; |
156 | 191 | ||
157 | struct rbd_mapping { | 192 | struct rbd_mapping { |
158 | char *snap_name; | ||
159 | u64 snap_id; | ||
160 | u64 size; | 193 | u64 size; |
161 | u64 features; | 194 | u64 features; |
162 | bool snap_exists; | ||
163 | bool read_only; | 195 | bool read_only; |
164 | }; | 196 | }; |
165 | 197 | ||
@@ -173,7 +205,6 @@ struct rbd_device { | |||
173 | struct gendisk *disk; /* blkdev's gendisk and rq */ | 205 | struct gendisk *disk; /* blkdev's gendisk and rq */ |
174 | 206 | ||
175 | u32 image_format; /* Either 1 or 2 */ | 207 | u32 image_format; /* Either 1 or 2 */ |
176 | struct rbd_options rbd_opts; | ||
177 | struct rbd_client *rbd_client; | 208 | struct rbd_client *rbd_client; |
178 | 209 | ||
179 | char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ | 210 | char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ |
@@ -181,17 +212,17 @@ struct rbd_device { | |||
181 | spinlock_t lock; /* queue lock */ | 212 | spinlock_t lock; /* queue lock */ |
182 | 213 | ||
183 | struct rbd_image_header header; | 214 | struct rbd_image_header header; |
184 | char *image_id; | 215 | bool exists; |
185 | size_t image_id_len; | 216 | struct rbd_spec *spec; |
186 | char *image_name; | 217 | |
187 | size_t image_name_len; | ||
188 | char *header_name; | 218 | char *header_name; |
189 | char *pool_name; | ||
190 | int pool_id; | ||
191 | 219 | ||
192 | struct ceph_osd_event *watch_event; | 220 | struct ceph_osd_event *watch_event; |
193 | struct ceph_osd_request *watch_request; | 221 | struct ceph_osd_request *watch_request; |
194 | 222 | ||
223 | struct rbd_spec *parent_spec; | ||
224 | u64 parent_overlap; | ||
225 | |||
195 | /* protects updating the header */ | 226 | /* protects updating the header */ |
196 | struct rw_semaphore header_rwsem; | 227 | struct rw_semaphore header_rwsem; |
197 | 228 | ||
@@ -204,6 +235,7 @@ struct rbd_device { | |||
204 | 235 | ||
205 | /* sysfs related */ | 236 | /* sysfs related */ |
206 | struct device dev; | 237 | struct device dev; |
238 | unsigned long open_count; | ||
207 | }; | 239 | }; |
208 | 240 | ||
209 | static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ | 241 | static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ |
@@ -218,7 +250,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); | |||
218 | static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); | 250 | static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); |
219 | 251 | ||
220 | static void rbd_dev_release(struct device *dev); | 252 | static void rbd_dev_release(struct device *dev); |
221 | static void __rbd_remove_snap_dev(struct rbd_snap *snap); | 253 | static void rbd_remove_snap_dev(struct rbd_snap *snap); |
222 | 254 | ||
223 | static ssize_t rbd_add(struct bus_type *bus, const char *buf, | 255 | static ssize_t rbd_add(struct bus_type *bus, const char *buf, |
224 | size_t count); | 256 | size_t count); |
@@ -258,17 +290,8 @@ static struct device rbd_root_dev = { | |||
258 | # define rbd_assert(expr) ((void) 0) | 290 | # define rbd_assert(expr) ((void) 0) |
259 | #endif /* !RBD_DEBUG */ | 291 | #endif /* !RBD_DEBUG */ |
260 | 292 | ||
261 | static struct device *rbd_get_dev(struct rbd_device *rbd_dev) | 293 | static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); |
262 | { | 294 | static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); |
263 | return get_device(&rbd_dev->dev); | ||
264 | } | ||
265 | |||
266 | static void rbd_put_dev(struct rbd_device *rbd_dev) | ||
267 | { | ||
268 | put_device(&rbd_dev->dev); | ||
269 | } | ||
270 | |||
271 | static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver); | ||
272 | 295 | ||
273 | static int rbd_open(struct block_device *bdev, fmode_t mode) | 296 | static int rbd_open(struct block_device *bdev, fmode_t mode) |
274 | { | 297 | { |
@@ -277,8 +300,11 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) | |||
277 | if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) | 300 | if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) |
278 | return -EROFS; | 301 | return -EROFS; |
279 | 302 | ||
280 | rbd_get_dev(rbd_dev); | 303 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
304 | (void) get_device(&rbd_dev->dev); | ||
281 | set_device_ro(bdev, rbd_dev->mapping.read_only); | 305 | set_device_ro(bdev, rbd_dev->mapping.read_only); |
306 | rbd_dev->open_count++; | ||
307 | mutex_unlock(&ctl_mutex); | ||
282 | 308 | ||
283 | return 0; | 309 | return 0; |
284 | } | 310 | } |
@@ -287,7 +313,11 @@ static int rbd_release(struct gendisk *disk, fmode_t mode) | |||
287 | { | 313 | { |
288 | struct rbd_device *rbd_dev = disk->private_data; | 314 | struct rbd_device *rbd_dev = disk->private_data; |
289 | 315 | ||
290 | rbd_put_dev(rbd_dev); | 316 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
317 | rbd_assert(rbd_dev->open_count > 0); | ||
318 | rbd_dev->open_count--; | ||
319 | put_device(&rbd_dev->dev); | ||
320 | mutex_unlock(&ctl_mutex); | ||
291 | 321 | ||
292 | return 0; | 322 | return 0; |
293 | } | 323 | } |
@@ -388,7 +418,7 @@ enum { | |||
388 | static match_table_t rbd_opts_tokens = { | 418 | static match_table_t rbd_opts_tokens = { |
389 | /* int args above */ | 419 | /* int args above */ |
390 | /* string args above */ | 420 | /* string args above */ |
391 | {Opt_read_only, "mapping.read_only"}, | 421 | {Opt_read_only, "read_only"}, |
392 | {Opt_read_only, "ro"}, /* Alternate spelling */ | 422 | {Opt_read_only, "ro"}, /* Alternate spelling */ |
393 | {Opt_read_write, "read_write"}, | 423 | {Opt_read_write, "read_write"}, |
394 | {Opt_read_write, "rw"}, /* Alternate spelling */ | 424 | {Opt_read_write, "rw"}, /* Alternate spelling */ |
@@ -441,33 +471,17 @@ static int parse_rbd_opts_token(char *c, void *private) | |||
441 | * Get a ceph client with specific addr and configuration, if one does | 471 | * Get a ceph client with specific addr and configuration, if one does |
442 | * not exist create it. | 472 | * not exist create it. |
443 | */ | 473 | */ |
444 | static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, | 474 | static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) |
445 | size_t mon_addr_len, char *options) | ||
446 | { | 475 | { |
447 | struct rbd_options *rbd_opts = &rbd_dev->rbd_opts; | ||
448 | struct ceph_options *ceph_opts; | ||
449 | struct rbd_client *rbdc; | 476 | struct rbd_client *rbdc; |
450 | 477 | ||
451 | rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; | ||
452 | |||
453 | ceph_opts = ceph_parse_options(options, mon_addr, | ||
454 | mon_addr + mon_addr_len, | ||
455 | parse_rbd_opts_token, rbd_opts); | ||
456 | if (IS_ERR(ceph_opts)) | ||
457 | return PTR_ERR(ceph_opts); | ||
458 | |||
459 | rbdc = rbd_client_find(ceph_opts); | 478 | rbdc = rbd_client_find(ceph_opts); |
460 | if (rbdc) { | 479 | if (rbdc) /* using an existing client */ |
461 | /* using an existing client */ | ||
462 | ceph_destroy_options(ceph_opts); | 480 | ceph_destroy_options(ceph_opts); |
463 | } else { | 481 | else |
464 | rbdc = rbd_client_create(ceph_opts); | 482 | rbdc = rbd_client_create(ceph_opts); |
465 | if (IS_ERR(rbdc)) | ||
466 | return PTR_ERR(rbdc); | ||
467 | } | ||
468 | rbd_dev->rbd_client = rbdc; | ||
469 | 483 | ||
470 | return 0; | 484 | return rbdc; |
471 | } | 485 | } |
472 | 486 | ||
473 | /* | 487 | /* |
@@ -492,10 +506,10 @@ static void rbd_client_release(struct kref *kref) | |||
492 | * Drop reference to ceph client node. If it's not referenced anymore, release | 506 | * Drop reference to ceph client node. If it's not referenced anymore, release |
493 | * it. | 507 | * it. |
494 | */ | 508 | */ |
495 | static void rbd_put_client(struct rbd_device *rbd_dev) | 509 | static void rbd_put_client(struct rbd_client *rbdc) |
496 | { | 510 | { |
497 | kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); | 511 | if (rbdc) |
498 | rbd_dev->rbd_client = NULL; | 512 | kref_put(&rbdc->kref, rbd_client_release); |
499 | } | 513 | } |
500 | 514 | ||
501 | /* | 515 | /* |
@@ -524,6 +538,16 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) | |||
524 | if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) | 538 | if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) |
525 | return false; | 539 | return false; |
526 | 540 | ||
541 | /* The bio layer requires at least sector-sized I/O */ | ||
542 | |||
543 | if (ondisk->options.order < SECTOR_SHIFT) | ||
544 | return false; | ||
545 | |||
546 | /* If we use u64 in a few spots we may be able to loosen this */ | ||
547 | |||
548 | if (ondisk->options.order > 8 * sizeof (int) - 1) | ||
549 | return false; | ||
550 | |||
527 | /* | 551 | /* |
528 | * The size of a snapshot header has to fit in a size_t, and | 552 | * The size of a snapshot header has to fit in a size_t, and |
529 | * that limits the number of snapshots. | 553 | * that limits the number of snapshots. |
@@ -635,6 +659,20 @@ out_err: | |||
635 | return -ENOMEM; | 659 | return -ENOMEM; |
636 | } | 660 | } |
637 | 661 | ||
662 | static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) | ||
663 | { | ||
664 | struct rbd_snap *snap; | ||
665 | |||
666 | if (snap_id == CEPH_NOSNAP) | ||
667 | return RBD_SNAP_HEAD_NAME; | ||
668 | |||
669 | list_for_each_entry(snap, &rbd_dev->snaps, node) | ||
670 | if (snap_id == snap->id) | ||
671 | return snap->name; | ||
672 | |||
673 | return NULL; | ||
674 | } | ||
675 | |||
638 | static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) | 676 | static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) |
639 | { | 677 | { |
640 | 678 | ||
@@ -642,7 +680,7 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) | |||
642 | 680 | ||
643 | list_for_each_entry(snap, &rbd_dev->snaps, node) { | 681 | list_for_each_entry(snap, &rbd_dev->snaps, node) { |
644 | if (!strcmp(snap_name, snap->name)) { | 682 | if (!strcmp(snap_name, snap->name)) { |
645 | rbd_dev->mapping.snap_id = snap->id; | 683 | rbd_dev->spec->snap_id = snap->id; |
646 | rbd_dev->mapping.size = snap->size; | 684 | rbd_dev->mapping.size = snap->size; |
647 | rbd_dev->mapping.features = snap->features; | 685 | rbd_dev->mapping.features = snap->features; |
648 | 686 | ||
@@ -653,26 +691,23 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) | |||
653 | return -ENOENT; | 691 | return -ENOENT; |
654 | } | 692 | } |
655 | 693 | ||
656 | static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) | 694 | static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) |
657 | { | 695 | { |
658 | int ret; | 696 | int ret; |
659 | 697 | ||
660 | if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME, | 698 | if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, |
661 | sizeof (RBD_SNAP_HEAD_NAME))) { | 699 | sizeof (RBD_SNAP_HEAD_NAME))) { |
662 | rbd_dev->mapping.snap_id = CEPH_NOSNAP; | 700 | rbd_dev->spec->snap_id = CEPH_NOSNAP; |
663 | rbd_dev->mapping.size = rbd_dev->header.image_size; | 701 | rbd_dev->mapping.size = rbd_dev->header.image_size; |
664 | rbd_dev->mapping.features = rbd_dev->header.features; | 702 | rbd_dev->mapping.features = rbd_dev->header.features; |
665 | rbd_dev->mapping.snap_exists = false; | ||
666 | rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; | ||
667 | ret = 0; | 703 | ret = 0; |
668 | } else { | 704 | } else { |
669 | ret = snap_by_name(rbd_dev, snap_name); | 705 | ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); |
670 | if (ret < 0) | 706 | if (ret < 0) |
671 | goto done; | 707 | goto done; |
672 | rbd_dev->mapping.snap_exists = true; | ||
673 | rbd_dev->mapping.read_only = true; | 708 | rbd_dev->mapping.read_only = true; |
674 | } | 709 | } |
675 | rbd_dev->mapping.snap_name = snap_name; | 710 | rbd_dev->exists = true; |
676 | done: | 711 | done: |
677 | return ret; | 712 | return ret; |
678 | } | 713 | } |
@@ -695,13 +730,13 @@ static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) | |||
695 | u64 segment; | 730 | u64 segment; |
696 | int ret; | 731 | int ret; |
697 | 732 | ||
698 | name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); | 733 | name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); |
699 | if (!name) | 734 | if (!name) |
700 | return NULL; | 735 | return NULL; |
701 | segment = offset >> rbd_dev->header.obj_order; | 736 | segment = offset >> rbd_dev->header.obj_order; |
702 | ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx", | 737 | ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", |
703 | rbd_dev->header.object_prefix, segment); | 738 | rbd_dev->header.object_prefix, segment); |
704 | if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) { | 739 | if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { |
705 | pr_err("error formatting segment name for #%llu (%d)\n", | 740 | pr_err("error formatting segment name for #%llu (%d)\n", |
706 | segment, ret); | 741 | segment, ret); |
707 | kfree(name); | 742 | kfree(name); |
@@ -800,77 +835,144 @@ static void zero_bio_chain(struct bio *chain, int start_ofs) | |||
800 | } | 835 | } |
801 | 836 | ||
802 | /* | 837 | /* |
803 | * bio_chain_clone - clone a chain of bios up to a certain length. | 838 | * Clone a portion of a bio, starting at the given byte offset |
804 | * might return a bio_pair that will need to be released. | 839 | * and continuing for the number of bytes indicated. |
805 | */ | 840 | */ |
806 | static struct bio *bio_chain_clone(struct bio **old, struct bio **next, | 841 | static struct bio *bio_clone_range(struct bio *bio_src, |
807 | struct bio_pair **bp, | 842 | unsigned int offset, |
808 | int len, gfp_t gfpmask) | 843 | unsigned int len, |
809 | { | 844 | gfp_t gfpmask) |
810 | struct bio *old_chain = *old; | 845 | { |
811 | struct bio *new_chain = NULL; | 846 | struct bio_vec *bv; |
812 | struct bio *tail; | 847 | unsigned int resid; |
813 | int total = 0; | 848 | unsigned short idx; |
814 | 849 | unsigned int voff; | |
815 | if (*bp) { | 850 | unsigned short end_idx; |
816 | bio_pair_release(*bp); | 851 | unsigned short vcnt; |
817 | *bp = NULL; | 852 | struct bio *bio; |
818 | } | ||
819 | 853 | ||
820 | while (old_chain && (total < len)) { | 854 | /* Handle the easy case for the caller */ |
821 | struct bio *tmp; | ||
822 | 855 | ||
823 | tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); | 856 | if (!offset && len == bio_src->bi_size) |
824 | if (!tmp) | 857 | return bio_clone(bio_src, gfpmask); |
825 | goto err_out; | ||
826 | gfpmask &= ~__GFP_WAIT; /* can't wait after the first */ | ||
827 | 858 | ||
828 | if (total + old_chain->bi_size > len) { | 859 | if (WARN_ON_ONCE(!len)) |
829 | struct bio_pair *bp; | 860 | return NULL; |
861 | if (WARN_ON_ONCE(len > bio_src->bi_size)) | ||
862 | return NULL; | ||
863 | if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) | ||
864 | return NULL; | ||
830 | 865 | ||
831 | /* | 866 | /* Find first affected segment... */ |
832 | * this split can only happen with a single paged bio, | ||
833 | * split_bio will BUG_ON if this is not the case | ||
834 | */ | ||
835 | dout("bio_chain_clone split! total=%d remaining=%d" | ||
836 | "bi_size=%u\n", | ||
837 | total, len - total, old_chain->bi_size); | ||
838 | 867 | ||
839 | /* split the bio. We'll release it either in the next | 868 | resid = offset; |
840 | call, or it will have to be released outside */ | 869 | __bio_for_each_segment(bv, bio_src, idx, 0) { |
841 | bp = bio_split(old_chain, (len - total) / SECTOR_SIZE); | 870 | if (resid < bv->bv_len) |
842 | if (!bp) | 871 | break; |
843 | goto err_out; | 872 | resid -= bv->bv_len; |
873 | } | ||
874 | voff = resid; | ||
844 | 875 | ||
845 | __bio_clone(tmp, &bp->bio1); | 876 | /* ...and the last affected segment */ |
846 | 877 | ||
847 | *next = &bp->bio2; | 878 | resid += len; |
848 | } else { | 879 | __bio_for_each_segment(bv, bio_src, end_idx, idx) { |
849 | __bio_clone(tmp, old_chain); | 880 | if (resid <= bv->bv_len) |
850 | *next = old_chain->bi_next; | 881 | break; |
851 | } | 882 | resid -= bv->bv_len; |
883 | } | ||
884 | vcnt = end_idx - idx + 1; | ||
885 | |||
886 | /* Build the clone */ | ||
852 | 887 | ||
853 | tmp->bi_bdev = NULL; | 888 | bio = bio_alloc(gfpmask, (unsigned int) vcnt); |
854 | tmp->bi_next = NULL; | 889 | if (!bio) |
855 | if (new_chain) | 890 | return NULL; /* ENOMEM */ |
856 | tail->bi_next = tmp; | ||
857 | else | ||
858 | new_chain = tmp; | ||
859 | tail = tmp; | ||
860 | old_chain = old_chain->bi_next; | ||
861 | 891 | ||
862 | total += tmp->bi_size; | 892 | bio->bi_bdev = bio_src->bi_bdev; |
893 | bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); | ||
894 | bio->bi_rw = bio_src->bi_rw; | ||
895 | bio->bi_flags |= 1 << BIO_CLONED; | ||
896 | |||
897 | /* | ||
898 | * Copy over our part of the bio_vec, then update the first | ||
899 | * and last (or only) entries. | ||
900 | */ | ||
901 | memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], | ||
902 | vcnt * sizeof (struct bio_vec)); | ||
903 | bio->bi_io_vec[0].bv_offset += voff; | ||
904 | if (vcnt > 1) { | ||
905 | bio->bi_io_vec[0].bv_len -= voff; | ||
906 | bio->bi_io_vec[vcnt - 1].bv_len = resid; | ||
907 | } else { | ||
908 | bio->bi_io_vec[0].bv_len = len; | ||
863 | } | 909 | } |
864 | 910 | ||
865 | rbd_assert(total == len); | 911 | bio->bi_vcnt = vcnt; |
912 | bio->bi_size = len; | ||
913 | bio->bi_idx = 0; | ||
914 | |||
915 | return bio; | ||
916 | } | ||
917 | |||
918 | /* | ||
919 | * Clone a portion of a bio chain, starting at the given byte offset | ||
920 | * into the first bio in the source chain and continuing for the | ||
921 | * number of bytes indicated. The result is another bio chain of | ||
922 | * exactly the given length, or a null pointer on error. | ||
923 | * | ||
924 | * The bio_src and offset parameters are both in-out. On entry they | ||
925 | * refer to the first source bio and the offset into that bio where | ||
926 | * the start of data to be cloned is located. | ||
927 | * | ||
928 | * On return, bio_src is updated to refer to the bio in the source | ||
929 | * chain that contains first un-cloned byte, and *offset will | ||
930 | * contain the offset of that byte within that bio. | ||
931 | */ | ||
932 | static struct bio *bio_chain_clone_range(struct bio **bio_src, | ||
933 | unsigned int *offset, | ||
934 | unsigned int len, | ||
935 | gfp_t gfpmask) | ||
936 | { | ||
937 | struct bio *bi = *bio_src; | ||
938 | unsigned int off = *offset; | ||
939 | struct bio *chain = NULL; | ||
940 | struct bio **end; | ||
941 | |||
942 | /* Build up a chain of clone bios up to the limit */ | ||
943 | |||
944 | if (!bi || off >= bi->bi_size || !len) | ||
945 | return NULL; /* Nothing to clone */ | ||
866 | 946 | ||
867 | *old = old_chain; | 947 | end = &chain; |
948 | while (len) { | ||
949 | unsigned int bi_size; | ||
950 | struct bio *bio; | ||
951 | |||
952 | if (!bi) | ||
953 | goto out_err; /* EINVAL; ran out of bio's */ | ||
954 | bi_size = min_t(unsigned int, bi->bi_size - off, len); | ||
955 | bio = bio_clone_range(bi, off, bi_size, gfpmask); | ||
956 | if (!bio) | ||
957 | goto out_err; /* ENOMEM */ | ||
958 | |||
959 | *end = bio; | ||
960 | end = &bio->bi_next; | ||
961 | |||
962 | off += bi_size; | ||
963 | if (off == bi->bi_size) { | ||
964 | bi = bi->bi_next; | ||
965 | off = 0; | ||
966 | } | ||
967 | len -= bi_size; | ||
968 | } | ||
969 | *bio_src = bi; | ||
970 | *offset = off; | ||
868 | 971 | ||
869 | return new_chain; | 972 | return chain; |
973 | out_err: | ||
974 | bio_chain_put(chain); | ||
870 | 975 | ||
871 | err_out: | ||
872 | dout("bio_chain_clone with err\n"); | ||
873 | bio_chain_put(new_chain); | ||
874 | return NULL; | 976 | return NULL; |
875 | } | 977 | } |
876 | 978 | ||
@@ -988,8 +1090,9 @@ static int rbd_do_request(struct request *rq, | |||
988 | req_data->coll_index = coll_index; | 1090 | req_data->coll_index = coll_index; |
989 | } | 1091 | } |
990 | 1092 | ||
991 | dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name, | 1093 | dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", |
992 | (unsigned long long) ofs, (unsigned long long) len); | 1094 | object_name, (unsigned long long) ofs, |
1095 | (unsigned long long) len, coll, coll_index); | ||
993 | 1096 | ||
994 | osdc = &rbd_dev->rbd_client->client->osdc; | 1097 | osdc = &rbd_dev->rbd_client->client->osdc; |
995 | req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, | 1098 | req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, |
@@ -1019,7 +1122,7 @@ static int rbd_do_request(struct request *rq, | |||
1019 | layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); | 1122 | layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); |
1020 | layout->fl_stripe_count = cpu_to_le32(1); | 1123 | layout->fl_stripe_count = cpu_to_le32(1); |
1021 | layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); | 1124 | layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); |
1022 | layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); | 1125 | layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); |
1023 | ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, | 1126 | ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, |
1024 | req, ops); | 1127 | req, ops); |
1025 | rbd_assert(ret == 0); | 1128 | rbd_assert(ret == 0); |
@@ -1154,8 +1257,6 @@ done: | |||
1154 | static int rbd_do_op(struct request *rq, | 1257 | static int rbd_do_op(struct request *rq, |
1155 | struct rbd_device *rbd_dev, | 1258 | struct rbd_device *rbd_dev, |
1156 | struct ceph_snap_context *snapc, | 1259 | struct ceph_snap_context *snapc, |
1157 | u64 snapid, | ||
1158 | int opcode, int flags, | ||
1159 | u64 ofs, u64 len, | 1260 | u64 ofs, u64 len, |
1160 | struct bio *bio, | 1261 | struct bio *bio, |
1161 | struct rbd_req_coll *coll, | 1262 | struct rbd_req_coll *coll, |
@@ -1167,6 +1268,9 @@ static int rbd_do_op(struct request *rq, | |||
1167 | int ret; | 1268 | int ret; |
1168 | struct ceph_osd_req_op *ops; | 1269 | struct ceph_osd_req_op *ops; |
1169 | u32 payload_len; | 1270 | u32 payload_len; |
1271 | int opcode; | ||
1272 | int flags; | ||
1273 | u64 snapid; | ||
1170 | 1274 | ||
1171 | seg_name = rbd_segment_name(rbd_dev, ofs); | 1275 | seg_name = rbd_segment_name(rbd_dev, ofs); |
1172 | if (!seg_name) | 1276 | if (!seg_name) |
@@ -1174,7 +1278,18 @@ static int rbd_do_op(struct request *rq, | |||
1174 | seg_len = rbd_segment_length(rbd_dev, ofs, len); | 1278 | seg_len = rbd_segment_length(rbd_dev, ofs, len); |
1175 | seg_ofs = rbd_segment_offset(rbd_dev, ofs); | 1279 | seg_ofs = rbd_segment_offset(rbd_dev, ofs); |
1176 | 1280 | ||
1177 | payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); | 1281 | if (rq_data_dir(rq) == WRITE) { |
1282 | opcode = CEPH_OSD_OP_WRITE; | ||
1283 | flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; | ||
1284 | snapid = CEPH_NOSNAP; | ||
1285 | payload_len = seg_len; | ||
1286 | } else { | ||
1287 | opcode = CEPH_OSD_OP_READ; | ||
1288 | flags = CEPH_OSD_FLAG_READ; | ||
1289 | snapc = NULL; | ||
1290 | snapid = rbd_dev->spec->snap_id; | ||
1291 | payload_len = 0; | ||
1292 | } | ||
1178 | 1293 | ||
1179 | ret = -ENOMEM; | 1294 | ret = -ENOMEM; |
1180 | ops = rbd_create_rw_ops(1, opcode, payload_len); | 1295 | ops = rbd_create_rw_ops(1, opcode, payload_len); |
@@ -1202,41 +1317,6 @@ done: | |||
1202 | } | 1317 | } |
1203 | 1318 | ||
1204 | /* | 1319 | /* |
1205 | * Request async osd write | ||
1206 | */ | ||
1207 | static int rbd_req_write(struct request *rq, | ||
1208 | struct rbd_device *rbd_dev, | ||
1209 | struct ceph_snap_context *snapc, | ||
1210 | u64 ofs, u64 len, | ||
1211 | struct bio *bio, | ||
1212 | struct rbd_req_coll *coll, | ||
1213 | int coll_index) | ||
1214 | { | ||
1215 | return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, | ||
1216 | CEPH_OSD_OP_WRITE, | ||
1217 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | ||
1218 | ofs, len, bio, coll, coll_index); | ||
1219 | } | ||
1220 | |||
1221 | /* | ||
1222 | * Request async osd read | ||
1223 | */ | ||
1224 | static int rbd_req_read(struct request *rq, | ||
1225 | struct rbd_device *rbd_dev, | ||
1226 | u64 snapid, | ||
1227 | u64 ofs, u64 len, | ||
1228 | struct bio *bio, | ||
1229 | struct rbd_req_coll *coll, | ||
1230 | int coll_index) | ||
1231 | { | ||
1232 | return rbd_do_op(rq, rbd_dev, NULL, | ||
1233 | snapid, | ||
1234 | CEPH_OSD_OP_READ, | ||
1235 | CEPH_OSD_FLAG_READ, | ||
1236 | ofs, len, bio, coll, coll_index); | ||
1237 | } | ||
1238 | |||
1239 | /* | ||
1240 | * Request sync osd read | 1320 | * Request sync osd read |
1241 | */ | 1321 | */ |
1242 | static int rbd_req_sync_read(struct rbd_device *rbd_dev, | 1322 | static int rbd_req_sync_read(struct rbd_device *rbd_dev, |
@@ -1304,7 +1384,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) | |||
1304 | dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", | 1384 | dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", |
1305 | rbd_dev->header_name, (unsigned long long) notify_id, | 1385 | rbd_dev->header_name, (unsigned long long) notify_id, |
1306 | (unsigned int) opcode); | 1386 | (unsigned int) opcode); |
1307 | rc = rbd_refresh_header(rbd_dev, &hver); | 1387 | rc = rbd_dev_refresh(rbd_dev, &hver); |
1308 | if (rc) | 1388 | if (rc) |
1309 | pr_warning(RBD_DRV_NAME "%d got notification but failed to " | 1389 | pr_warning(RBD_DRV_NAME "%d got notification but failed to " |
1310 | " update snaps: %d\n", rbd_dev->major, rc); | 1390 | " update snaps: %d\n", rbd_dev->major, rc); |
@@ -1460,18 +1540,16 @@ static void rbd_rq_fn(struct request_queue *q) | |||
1460 | { | 1540 | { |
1461 | struct rbd_device *rbd_dev = q->queuedata; | 1541 | struct rbd_device *rbd_dev = q->queuedata; |
1462 | struct request *rq; | 1542 | struct request *rq; |
1463 | struct bio_pair *bp = NULL; | ||
1464 | 1543 | ||
1465 | while ((rq = blk_fetch_request(q))) { | 1544 | while ((rq = blk_fetch_request(q))) { |
1466 | struct bio *bio; | 1545 | struct bio *bio; |
1467 | struct bio *rq_bio, *next_bio = NULL; | ||
1468 | bool do_write; | 1546 | bool do_write; |
1469 | unsigned int size; | 1547 | unsigned int size; |
1470 | u64 op_size = 0; | ||
1471 | u64 ofs; | 1548 | u64 ofs; |
1472 | int num_segs, cur_seg = 0; | 1549 | int num_segs, cur_seg = 0; |
1473 | struct rbd_req_coll *coll; | 1550 | struct rbd_req_coll *coll; |
1474 | struct ceph_snap_context *snapc; | 1551 | struct ceph_snap_context *snapc; |
1552 | unsigned int bio_offset; | ||
1475 | 1553 | ||
1476 | dout("fetched request\n"); | 1554 | dout("fetched request\n"); |
1477 | 1555 | ||
@@ -1483,10 +1561,6 @@ static void rbd_rq_fn(struct request_queue *q) | |||
1483 | 1561 | ||
1484 | /* deduce our operation (read, write) */ | 1562 | /* deduce our operation (read, write) */ |
1485 | do_write = (rq_data_dir(rq) == WRITE); | 1563 | do_write = (rq_data_dir(rq) == WRITE); |
1486 | |||
1487 | size = blk_rq_bytes(rq); | ||
1488 | ofs = blk_rq_pos(rq) * SECTOR_SIZE; | ||
1489 | rq_bio = rq->bio; | ||
1490 | if (do_write && rbd_dev->mapping.read_only) { | 1564 | if (do_write && rbd_dev->mapping.read_only) { |
1491 | __blk_end_request_all(rq, -EROFS); | 1565 | __blk_end_request_all(rq, -EROFS); |
1492 | continue; | 1566 | continue; |
@@ -1496,8 +1570,8 @@ static void rbd_rq_fn(struct request_queue *q) | |||
1496 | 1570 | ||
1497 | down_read(&rbd_dev->header_rwsem); | 1571 | down_read(&rbd_dev->header_rwsem); |
1498 | 1572 | ||
1499 | if (rbd_dev->mapping.snap_id != CEPH_NOSNAP && | 1573 | if (!rbd_dev->exists) { |
1500 | !rbd_dev->mapping.snap_exists) { | 1574 | rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); |
1501 | up_read(&rbd_dev->header_rwsem); | 1575 | up_read(&rbd_dev->header_rwsem); |
1502 | dout("request for non-existent snapshot"); | 1576 | dout("request for non-existent snapshot"); |
1503 | spin_lock_irq(q->queue_lock); | 1577 | spin_lock_irq(q->queue_lock); |
@@ -1509,6 +1583,10 @@ static void rbd_rq_fn(struct request_queue *q) | |||
1509 | 1583 | ||
1510 | up_read(&rbd_dev->header_rwsem); | 1584 | up_read(&rbd_dev->header_rwsem); |
1511 | 1585 | ||
1586 | size = blk_rq_bytes(rq); | ||
1587 | ofs = blk_rq_pos(rq) * SECTOR_SIZE; | ||
1588 | bio = rq->bio; | ||
1589 | |||
1512 | dout("%s 0x%x bytes at 0x%llx\n", | 1590 | dout("%s 0x%x bytes at 0x%llx\n", |
1513 | do_write ? "write" : "read", | 1591 | do_write ? "write" : "read", |
1514 | size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); | 1592 | size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); |
@@ -1528,45 +1606,37 @@ static void rbd_rq_fn(struct request_queue *q) | |||
1528 | continue; | 1606 | continue; |
1529 | } | 1607 | } |
1530 | 1608 | ||
1609 | bio_offset = 0; | ||
1531 | do { | 1610 | do { |
1532 | /* a bio clone to be passed down to OSD req */ | 1611 | u64 limit = rbd_segment_length(rbd_dev, ofs, size); |
1612 | unsigned int chain_size; | ||
1613 | struct bio *bio_chain; | ||
1614 | |||
1615 | BUG_ON(limit > (u64) UINT_MAX); | ||
1616 | chain_size = (unsigned int) limit; | ||
1533 | dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); | 1617 | dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); |
1534 | op_size = rbd_segment_length(rbd_dev, ofs, size); | 1618 | |
1535 | kref_get(&coll->kref); | 1619 | kref_get(&coll->kref); |
1536 | bio = bio_chain_clone(&rq_bio, &next_bio, &bp, | ||
1537 | op_size, GFP_ATOMIC); | ||
1538 | if (!bio) { | ||
1539 | rbd_coll_end_req_index(rq, coll, cur_seg, | ||
1540 | -ENOMEM, op_size); | ||
1541 | goto next_seg; | ||
1542 | } | ||
1543 | 1620 | ||
1621 | /* Pass a cloned bio chain via an osd request */ | ||
1544 | 1622 | ||
1545 | /* init OSD command: write or read */ | 1623 | bio_chain = bio_chain_clone_range(&bio, |
1546 | if (do_write) | 1624 | &bio_offset, chain_size, |
1547 | rbd_req_write(rq, rbd_dev, | 1625 | GFP_ATOMIC); |
1548 | snapc, | 1626 | if (bio_chain) |
1549 | ofs, | 1627 | (void) rbd_do_op(rq, rbd_dev, snapc, |
1550 | op_size, bio, | 1628 | ofs, chain_size, |
1551 | coll, cur_seg); | 1629 | bio_chain, coll, cur_seg); |
1552 | else | 1630 | else |
1553 | rbd_req_read(rq, rbd_dev, | 1631 | rbd_coll_end_req_index(rq, coll, cur_seg, |
1554 | rbd_dev->mapping.snap_id, | 1632 | -ENOMEM, chain_size); |
1555 | ofs, | 1633 | size -= chain_size; |
1556 | op_size, bio, | 1634 | ofs += chain_size; |
1557 | coll, cur_seg); | ||
1558 | |||
1559 | next_seg: | ||
1560 | size -= op_size; | ||
1561 | ofs += op_size; | ||
1562 | 1635 | ||
1563 | cur_seg++; | 1636 | cur_seg++; |
1564 | rq_bio = next_bio; | ||
1565 | } while (size > 0); | 1637 | } while (size > 0); |
1566 | kref_put(&coll->kref, rbd_coll_release); | 1638 | kref_put(&coll->kref, rbd_coll_release); |
1567 | 1639 | ||
1568 | if (bp) | ||
1569 | bio_pair_release(bp); | ||
1570 | spin_lock_irq(q->queue_lock); | 1640 | spin_lock_irq(q->queue_lock); |
1571 | 1641 | ||
1572 | ceph_put_snap_context(snapc); | 1642 | ceph_put_snap_context(snapc); |
@@ -1576,28 +1646,47 @@ next_seg: | |||
1576 | /* | 1646 | /* |
1577 | * a queue callback. Makes sure that we don't create a bio that spans across | 1647 | * a queue callback. Makes sure that we don't create a bio that spans across |
1578 | * multiple osd objects. One exception would be with a single page bios, | 1648 | * multiple osd objects. One exception would be with a single page bios, |
1579 | * which we handle later at bio_chain_clone | 1649 | * which we handle later at bio_chain_clone_range() |
1580 | */ | 1650 | */ |
1581 | static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, | 1651 | static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, |
1582 | struct bio_vec *bvec) | 1652 | struct bio_vec *bvec) |
1583 | { | 1653 | { |
1584 | struct rbd_device *rbd_dev = q->queuedata; | 1654 | struct rbd_device *rbd_dev = q->queuedata; |
1585 | unsigned int chunk_sectors; | 1655 | sector_t sector_offset; |
1586 | sector_t sector; | 1656 | sector_t sectors_per_obj; |
1587 | unsigned int bio_sectors; | 1657 | sector_t obj_sector_offset; |
1588 | int max; | 1658 | int ret; |
1589 | 1659 | ||
1590 | chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); | 1660 | /* |
1591 | sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); | 1661 | * Find how far into its rbd object the partition-relative |
1592 | bio_sectors = bmd->bi_size >> SECTOR_SHIFT; | 1662 | * bio start sector is to offset relative to the enclosing |
1663 | * device. | ||
1664 | */ | ||
1665 | sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; | ||
1666 | sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); | ||
1667 | obj_sector_offset = sector_offset & (sectors_per_obj - 1); | ||
1668 | |||
1669 | /* | ||
1670 | * Compute the number of bytes from that offset to the end | ||
1671 | * of the object. Account for what's already used by the bio. | ||
1672 | */ | ||
1673 | ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; | ||
1674 | if (ret > bmd->bi_size) | ||
1675 | ret -= bmd->bi_size; | ||
1676 | else | ||
1677 | ret = 0; | ||
1593 | 1678 | ||
1594 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) | 1679 | /* |
1595 | + bio_sectors)) << SECTOR_SHIFT; | 1680 | * Don't send back more than was asked for. And if the bio |
1596 | if (max < 0) | 1681 | * was empty, let the whole thing through because: "Note |
1597 | max = 0; /* bio_add cannot handle a negative return */ | 1682 | * that a block device *must* allow a single page to be |
1598 | if (max <= bvec->bv_len && bio_sectors == 0) | 1683 | * added to an empty bio." |
1599 | return bvec->bv_len; | 1684 | */ |
1600 | return max; | 1685 | rbd_assert(bvec->bv_len <= PAGE_SIZE); |
1686 | if (ret > (int) bvec->bv_len || !bmd->bi_size) | ||
1687 | ret = (int) bvec->bv_len; | ||
1688 | |||
1689 | return ret; | ||
1601 | } | 1690 | } |
1602 | 1691 | ||
1603 | static void rbd_free_disk(struct rbd_device *rbd_dev) | 1692 | static void rbd_free_disk(struct rbd_device *rbd_dev) |
@@ -1663,13 +1752,13 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) | |||
1663 | ret = -ENXIO; | 1752 | ret = -ENXIO; |
1664 | pr_warning("short header read for image %s" | 1753 | pr_warning("short header read for image %s" |
1665 | " (want %zd got %d)\n", | 1754 | " (want %zd got %d)\n", |
1666 | rbd_dev->image_name, size, ret); | 1755 | rbd_dev->spec->image_name, size, ret); |
1667 | goto out_err; | 1756 | goto out_err; |
1668 | } | 1757 | } |
1669 | if (!rbd_dev_ondisk_valid(ondisk)) { | 1758 | if (!rbd_dev_ondisk_valid(ondisk)) { |
1670 | ret = -ENXIO; | 1759 | ret = -ENXIO; |
1671 | pr_warning("invalid header for image %s\n", | 1760 | pr_warning("invalid header for image %s\n", |
1672 | rbd_dev->image_name); | 1761 | rbd_dev->spec->image_name); |
1673 | goto out_err; | 1762 | goto out_err; |
1674 | } | 1763 | } |
1675 | 1764 | ||
@@ -1707,19 +1796,32 @@ static int rbd_read_header(struct rbd_device *rbd_dev, | |||
1707 | return ret; | 1796 | return ret; |
1708 | } | 1797 | } |
1709 | 1798 | ||
1710 | static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) | 1799 | static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) |
1711 | { | 1800 | { |
1712 | struct rbd_snap *snap; | 1801 | struct rbd_snap *snap; |
1713 | struct rbd_snap *next; | 1802 | struct rbd_snap *next; |
1714 | 1803 | ||
1715 | list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) | 1804 | list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) |
1716 | __rbd_remove_snap_dev(snap); | 1805 | rbd_remove_snap_dev(snap); |
1806 | } | ||
1807 | |||
1808 | static void rbd_update_mapping_size(struct rbd_device *rbd_dev) | ||
1809 | { | ||
1810 | sector_t size; | ||
1811 | |||
1812 | if (rbd_dev->spec->snap_id != CEPH_NOSNAP) | ||
1813 | return; | ||
1814 | |||
1815 | size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; | ||
1816 | dout("setting size to %llu sectors", (unsigned long long) size); | ||
1817 | rbd_dev->mapping.size = (u64) size; | ||
1818 | set_capacity(rbd_dev->disk, size); | ||
1717 | } | 1819 | } |
1718 | 1820 | ||
1719 | /* | 1821 | /* |
1720 | * only read the first part of the ondisk header, without the snaps info | 1822 | * only read the first part of the ondisk header, without the snaps info |
1721 | */ | 1823 | */ |
1722 | static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) | 1824 | static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) |
1723 | { | 1825 | { |
1724 | int ret; | 1826 | int ret; |
1725 | struct rbd_image_header h; | 1827 | struct rbd_image_header h; |
@@ -1730,17 +1832,9 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) | |||
1730 | 1832 | ||
1731 | down_write(&rbd_dev->header_rwsem); | 1833 | down_write(&rbd_dev->header_rwsem); |
1732 | 1834 | ||
1733 | /* resized? */ | 1835 | /* Update image size, and check for resize of mapped image */ |
1734 | if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) { | 1836 | rbd_dev->header.image_size = h.image_size; |
1735 | sector_t size = (sector_t) h.image_size / SECTOR_SIZE; | 1837 | rbd_update_mapping_size(rbd_dev); |
1736 | |||
1737 | if (size != (sector_t) rbd_dev->mapping.size) { | ||
1738 | dout("setting size to %llu sectors", | ||
1739 | (unsigned long long) size); | ||
1740 | rbd_dev->mapping.size = (u64) size; | ||
1741 | set_capacity(rbd_dev->disk, size); | ||
1742 | } | ||
1743 | } | ||
1744 | 1838 | ||
1745 | /* rbd_dev->header.object_prefix shouldn't change */ | 1839 | /* rbd_dev->header.object_prefix shouldn't change */ |
1746 | kfree(rbd_dev->header.snap_sizes); | 1840 | kfree(rbd_dev->header.snap_sizes); |
@@ -1768,12 +1862,16 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) | |||
1768 | return ret; | 1862 | return ret; |
1769 | } | 1863 | } |
1770 | 1864 | ||
1771 | static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) | 1865 | static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) |
1772 | { | 1866 | { |
1773 | int ret; | 1867 | int ret; |
1774 | 1868 | ||
1869 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); | ||
1775 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 1870 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
1776 | ret = __rbd_refresh_header(rbd_dev, hver); | 1871 | if (rbd_dev->image_format == 1) |
1872 | ret = rbd_dev_v1_refresh(rbd_dev, hver); | ||
1873 | else | ||
1874 | ret = rbd_dev_v2_refresh(rbd_dev, hver); | ||
1777 | mutex_unlock(&ctl_mutex); | 1875 | mutex_unlock(&ctl_mutex); |
1778 | 1876 | ||
1779 | return ret; | 1877 | return ret; |
@@ -1885,7 +1983,7 @@ static ssize_t rbd_pool_show(struct device *dev, | |||
1885 | { | 1983 | { |
1886 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 1984 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
1887 | 1985 | ||
1888 | return sprintf(buf, "%s\n", rbd_dev->pool_name); | 1986 | return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); |
1889 | } | 1987 | } |
1890 | 1988 | ||
1891 | static ssize_t rbd_pool_id_show(struct device *dev, | 1989 | static ssize_t rbd_pool_id_show(struct device *dev, |
@@ -1893,7 +1991,8 @@ static ssize_t rbd_pool_id_show(struct device *dev, | |||
1893 | { | 1991 | { |
1894 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 1992 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
1895 | 1993 | ||
1896 | return sprintf(buf, "%d\n", rbd_dev->pool_id); | 1994 | return sprintf(buf, "%llu\n", |
1995 | (unsigned long long) rbd_dev->spec->pool_id); | ||
1897 | } | 1996 | } |
1898 | 1997 | ||
1899 | static ssize_t rbd_name_show(struct device *dev, | 1998 | static ssize_t rbd_name_show(struct device *dev, |
@@ -1901,7 +2000,10 @@ static ssize_t rbd_name_show(struct device *dev, | |||
1901 | { | 2000 | { |
1902 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 2001 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
1903 | 2002 | ||
1904 | return sprintf(buf, "%s\n", rbd_dev->image_name); | 2003 | if (rbd_dev->spec->image_name) |
2004 | return sprintf(buf, "%s\n", rbd_dev->spec->image_name); | ||
2005 | |||
2006 | return sprintf(buf, "(unknown)\n"); | ||
1905 | } | 2007 | } |
1906 | 2008 | ||
1907 | static ssize_t rbd_image_id_show(struct device *dev, | 2009 | static ssize_t rbd_image_id_show(struct device *dev, |
@@ -1909,7 +2011,7 @@ static ssize_t rbd_image_id_show(struct device *dev, | |||
1909 | { | 2011 | { |
1910 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 2012 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
1911 | 2013 | ||
1912 | return sprintf(buf, "%s\n", rbd_dev->image_id); | 2014 | return sprintf(buf, "%s\n", rbd_dev->spec->image_id); |
1913 | } | 2015 | } |
1914 | 2016 | ||
1915 | /* | 2017 | /* |
@@ -1922,7 +2024,50 @@ static ssize_t rbd_snap_show(struct device *dev, | |||
1922 | { | 2024 | { |
1923 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 2025 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
1924 | 2026 | ||
1925 | return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name); | 2027 | return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); |
2028 | } | ||
2029 | |||
2030 | /* | ||
2031 | * For an rbd v2 image, shows the pool id, image id, and snapshot id | ||
2032 | * for the parent image. If there is no parent, simply shows | ||
2033 | * "(no parent image)". | ||
2034 | */ | ||
2035 | static ssize_t rbd_parent_show(struct device *dev, | ||
2036 | struct device_attribute *attr, | ||
2037 | char *buf) | ||
2038 | { | ||
2039 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | ||
2040 | struct rbd_spec *spec = rbd_dev->parent_spec; | ||
2041 | int count; | ||
2042 | char *bufp = buf; | ||
2043 | |||
2044 | if (!spec) | ||
2045 | return sprintf(buf, "(no parent image)\n"); | ||
2046 | |||
2047 | count = sprintf(bufp, "pool_id %llu\npool_name %s\n", | ||
2048 | (unsigned long long) spec->pool_id, spec->pool_name); | ||
2049 | if (count < 0) | ||
2050 | return count; | ||
2051 | bufp += count; | ||
2052 | |||
2053 | count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, | ||
2054 | spec->image_name ? spec->image_name : "(unknown)"); | ||
2055 | if (count < 0) | ||
2056 | return count; | ||
2057 | bufp += count; | ||
2058 | |||
2059 | count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", | ||
2060 | (unsigned long long) spec->snap_id, spec->snap_name); | ||
2061 | if (count < 0) | ||
2062 | return count; | ||
2063 | bufp += count; | ||
2064 | |||
2065 | count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); | ||
2066 | if (count < 0) | ||
2067 | return count; | ||
2068 | bufp += count; | ||
2069 | |||
2070 | return (ssize_t) (bufp - buf); | ||
1926 | } | 2071 | } |
1927 | 2072 | ||
1928 | static ssize_t rbd_image_refresh(struct device *dev, | 2073 | static ssize_t rbd_image_refresh(struct device *dev, |
@@ -1933,7 +2078,7 @@ static ssize_t rbd_image_refresh(struct device *dev, | |||
1933 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 2078 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
1934 | int ret; | 2079 | int ret; |
1935 | 2080 | ||
1936 | ret = rbd_refresh_header(rbd_dev, NULL); | 2081 | ret = rbd_dev_refresh(rbd_dev, NULL); |
1937 | 2082 | ||
1938 | return ret < 0 ? ret : size; | 2083 | return ret < 0 ? ret : size; |
1939 | } | 2084 | } |
@@ -1948,6 +2093,7 @@ static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); | |||
1948 | static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); | 2093 | static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); |
1949 | static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); | 2094 | static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); |
1950 | static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); | 2095 | static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); |
2096 | static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); | ||
1951 | 2097 | ||
1952 | static struct attribute *rbd_attrs[] = { | 2098 | static struct attribute *rbd_attrs[] = { |
1953 | &dev_attr_size.attr, | 2099 | &dev_attr_size.attr, |
@@ -1959,6 +2105,7 @@ static struct attribute *rbd_attrs[] = { | |||
1959 | &dev_attr_name.attr, | 2105 | &dev_attr_name.attr, |
1960 | &dev_attr_image_id.attr, | 2106 | &dev_attr_image_id.attr, |
1961 | &dev_attr_current_snap.attr, | 2107 | &dev_attr_current_snap.attr, |
2108 | &dev_attr_parent.attr, | ||
1962 | &dev_attr_refresh.attr, | 2109 | &dev_attr_refresh.attr, |
1963 | NULL | 2110 | NULL |
1964 | }; | 2111 | }; |
@@ -2047,6 +2194,74 @@ static struct device_type rbd_snap_device_type = { | |||
2047 | .release = rbd_snap_dev_release, | 2194 | .release = rbd_snap_dev_release, |
2048 | }; | 2195 | }; |
2049 | 2196 | ||
2197 | static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) | ||
2198 | { | ||
2199 | kref_get(&spec->kref); | ||
2200 | |||
2201 | return spec; | ||
2202 | } | ||
2203 | |||
2204 | static void rbd_spec_free(struct kref *kref); | ||
2205 | static void rbd_spec_put(struct rbd_spec *spec) | ||
2206 | { | ||
2207 | if (spec) | ||
2208 | kref_put(&spec->kref, rbd_spec_free); | ||
2209 | } | ||
2210 | |||
2211 | static struct rbd_spec *rbd_spec_alloc(void) | ||
2212 | { | ||
2213 | struct rbd_spec *spec; | ||
2214 | |||
2215 | spec = kzalloc(sizeof (*spec), GFP_KERNEL); | ||
2216 | if (!spec) | ||
2217 | return NULL; | ||
2218 | kref_init(&spec->kref); | ||
2219 | |||
2220 | rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */ | ||
2221 | |||
2222 | return spec; | ||
2223 | } | ||
2224 | |||
2225 | static void rbd_spec_free(struct kref *kref) | ||
2226 | { | ||
2227 | struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); | ||
2228 | |||
2229 | kfree(spec->pool_name); | ||
2230 | kfree(spec->image_id); | ||
2231 | kfree(spec->image_name); | ||
2232 | kfree(spec->snap_name); | ||
2233 | kfree(spec); | ||
2234 | } | ||
2235 | |||
2236 | struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, | ||
2237 | struct rbd_spec *spec) | ||
2238 | { | ||
2239 | struct rbd_device *rbd_dev; | ||
2240 | |||
2241 | rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); | ||
2242 | if (!rbd_dev) | ||
2243 | return NULL; | ||
2244 | |||
2245 | spin_lock_init(&rbd_dev->lock); | ||
2246 | INIT_LIST_HEAD(&rbd_dev->node); | ||
2247 | INIT_LIST_HEAD(&rbd_dev->snaps); | ||
2248 | init_rwsem(&rbd_dev->header_rwsem); | ||
2249 | |||
2250 | rbd_dev->spec = spec; | ||
2251 | rbd_dev->rbd_client = rbdc; | ||
2252 | |||
2253 | return rbd_dev; | ||
2254 | } | ||
2255 | |||
2256 | static void rbd_dev_destroy(struct rbd_device *rbd_dev) | ||
2257 | { | ||
2258 | rbd_spec_put(rbd_dev->parent_spec); | ||
2259 | kfree(rbd_dev->header_name); | ||
2260 | rbd_put_client(rbd_dev->rbd_client); | ||
2261 | rbd_spec_put(rbd_dev->spec); | ||
2262 | kfree(rbd_dev); | ||
2263 | } | ||
2264 | |||
2050 | static bool rbd_snap_registered(struct rbd_snap *snap) | 2265 | static bool rbd_snap_registered(struct rbd_snap *snap) |
2051 | { | 2266 | { |
2052 | bool ret = snap->dev.type == &rbd_snap_device_type; | 2267 | bool ret = snap->dev.type == &rbd_snap_device_type; |
@@ -2057,7 +2272,7 @@ static bool rbd_snap_registered(struct rbd_snap *snap) | |||
2057 | return ret; | 2272 | return ret; |
2058 | } | 2273 | } |
2059 | 2274 | ||
2060 | static void __rbd_remove_snap_dev(struct rbd_snap *snap) | 2275 | static void rbd_remove_snap_dev(struct rbd_snap *snap) |
2061 | { | 2276 | { |
2062 | list_del(&snap->node); | 2277 | list_del(&snap->node); |
2063 | if (device_is_registered(&snap->dev)) | 2278 | if (device_is_registered(&snap->dev)) |
@@ -2073,7 +2288,7 @@ static int rbd_register_snap_dev(struct rbd_snap *snap, | |||
2073 | dev->type = &rbd_snap_device_type; | 2288 | dev->type = &rbd_snap_device_type; |
2074 | dev->parent = parent; | 2289 | dev->parent = parent; |
2075 | dev->release = rbd_snap_dev_release; | 2290 | dev->release = rbd_snap_dev_release; |
2076 | dev_set_name(dev, "snap_%s", snap->name); | 2291 | dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); |
2077 | dout("%s: registering device for snapshot %s\n", __func__, snap->name); | 2292 | dout("%s: registering device for snapshot %s\n", __func__, snap->name); |
2078 | 2293 | ||
2079 | ret = device_register(dev); | 2294 | ret = device_register(dev); |
@@ -2189,6 +2404,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) | |||
2189 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); | 2404 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); |
2190 | if (ret < 0) | 2405 | if (ret < 0) |
2191 | goto out; | 2406 | goto out; |
2407 | ret = 0; /* rbd_req_sync_exec() can return positive */ | ||
2192 | 2408 | ||
2193 | p = reply_buf; | 2409 | p = reply_buf; |
2194 | rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, | 2410 | rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, |
@@ -2216,6 +2432,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, | |||
2216 | __le64 features; | 2432 | __le64 features; |
2217 | __le64 incompat; | 2433 | __le64 incompat; |
2218 | } features_buf = { 0 }; | 2434 | } features_buf = { 0 }; |
2435 | u64 incompat; | ||
2219 | int ret; | 2436 | int ret; |
2220 | 2437 | ||
2221 | ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, | 2438 | ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, |
@@ -2226,6 +2443,11 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, | |||
2226 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); | 2443 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); |
2227 | if (ret < 0) | 2444 | if (ret < 0) |
2228 | return ret; | 2445 | return ret; |
2446 | |||
2447 | incompat = le64_to_cpu(features_buf.incompat); | ||
2448 | if (incompat & ~RBD_FEATURES_ALL) | ||
2449 | return -ENXIO; | ||
2450 | |||
2229 | *snap_features = le64_to_cpu(features_buf.features); | 2451 | *snap_features = le64_to_cpu(features_buf.features); |
2230 | 2452 | ||
2231 | dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", | 2453 | dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", |
@@ -2242,6 +2464,183 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev) | |||
2242 | &rbd_dev->header.features); | 2464 | &rbd_dev->header.features); |
2243 | } | 2465 | } |
2244 | 2466 | ||
2467 | static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) | ||
2468 | { | ||
2469 | struct rbd_spec *parent_spec; | ||
2470 | size_t size; | ||
2471 | void *reply_buf = NULL; | ||
2472 | __le64 snapid; | ||
2473 | void *p; | ||
2474 | void *end; | ||
2475 | char *image_id; | ||
2476 | u64 overlap; | ||
2477 | size_t len = 0; | ||
2478 | int ret; | ||
2479 | |||
2480 | parent_spec = rbd_spec_alloc(); | ||
2481 | if (!parent_spec) | ||
2482 | return -ENOMEM; | ||
2483 | |||
2484 | size = sizeof (__le64) + /* pool_id */ | ||
2485 | sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ | ||
2486 | sizeof (__le64) + /* snap_id */ | ||
2487 | sizeof (__le64); /* overlap */ | ||
2488 | reply_buf = kmalloc(size, GFP_KERNEL); | ||
2489 | if (!reply_buf) { | ||
2490 | ret = -ENOMEM; | ||
2491 | goto out_err; | ||
2492 | } | ||
2493 | |||
2494 | snapid = cpu_to_le64(CEPH_NOSNAP); | ||
2495 | ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, | ||
2496 | "rbd", "get_parent", | ||
2497 | (char *) &snapid, sizeof (snapid), | ||
2498 | (char *) reply_buf, size, | ||
2499 | CEPH_OSD_FLAG_READ, NULL); | ||
2500 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); | ||
2501 | if (ret < 0) | ||
2502 | goto out_err; | ||
2503 | |||
2504 | ret = -ERANGE; | ||
2505 | p = reply_buf; | ||
2506 | end = (char *) reply_buf + size; | ||
2507 | ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); | ||
2508 | if (parent_spec->pool_id == CEPH_NOPOOL) | ||
2509 | goto out; /* No parent? No problem. */ | ||
2510 | |||
2511 | image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); | ||
2512 | if (IS_ERR(image_id)) { | ||
2513 | ret = PTR_ERR(image_id); | ||
2514 | goto out_err; | ||
2515 | } | ||
2516 | parent_spec->image_id = image_id; | ||
2517 | parent_spec->image_id_len = len; | ||
2518 | ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); | ||
2519 | ceph_decode_64_safe(&p, end, overlap, out_err); | ||
2520 | |||
2521 | rbd_dev->parent_overlap = overlap; | ||
2522 | rbd_dev->parent_spec = parent_spec; | ||
2523 | parent_spec = NULL; /* rbd_dev now owns this */ | ||
2524 | out: | ||
2525 | ret = 0; | ||
2526 | out_err: | ||
2527 | kfree(reply_buf); | ||
2528 | rbd_spec_put(parent_spec); | ||
2529 | |||
2530 | return ret; | ||
2531 | } | ||
2532 | |||
2533 | static char *rbd_dev_image_name(struct rbd_device *rbd_dev) | ||
2534 | { | ||
2535 | size_t image_id_size; | ||
2536 | char *image_id; | ||
2537 | void *p; | ||
2538 | void *end; | ||
2539 | size_t size; | ||
2540 | void *reply_buf = NULL; | ||
2541 | size_t len = 0; | ||
2542 | char *image_name = NULL; | ||
2543 | int ret; | ||
2544 | |||
2545 | rbd_assert(!rbd_dev->spec->image_name); | ||
2546 | |||
2547 | image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len; | ||
2548 | image_id = kmalloc(image_id_size, GFP_KERNEL); | ||
2549 | if (!image_id) | ||
2550 | return NULL; | ||
2551 | |||
2552 | p = image_id; | ||
2553 | end = (char *) image_id + image_id_size; | ||
2554 | ceph_encode_string(&p, end, rbd_dev->spec->image_id, | ||
2555 | (u32) rbd_dev->spec->image_id_len); | ||
2556 | |||
2557 | size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; | ||
2558 | reply_buf = kmalloc(size, GFP_KERNEL); | ||
2559 | if (!reply_buf) | ||
2560 | goto out; | ||
2561 | |||
2562 | ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, | ||
2563 | "rbd", "dir_get_name", | ||
2564 | image_id, image_id_size, | ||
2565 | (char *) reply_buf, size, | ||
2566 | CEPH_OSD_FLAG_READ, NULL); | ||
2567 | if (ret < 0) | ||
2568 | goto out; | ||
2569 | p = reply_buf; | ||
2570 | end = (char *) reply_buf + size; | ||
2571 | image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); | ||
2572 | if (IS_ERR(image_name)) | ||
2573 | image_name = NULL; | ||
2574 | else | ||
2575 | dout("%s: name is %s len is %zd\n", __func__, image_name, len); | ||
2576 | out: | ||
2577 | kfree(reply_buf); | ||
2578 | kfree(image_id); | ||
2579 | |||
2580 | return image_name; | ||
2581 | } | ||
2582 | |||
2583 | /* | ||
2584 | * When a parent image gets probed, we only have the pool, image, | ||
2585 | * and snapshot ids but not the names of any of them. This call | ||
2586 | * is made later to fill in those names. It has to be done after | ||
2587 | * rbd_dev_snaps_update() has completed because some of the | ||
2588 | * information (in particular, snapshot name) is not available | ||
2589 | * until then. | ||
2590 | */ | ||
2591 | static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) | ||
2592 | { | ||
2593 | struct ceph_osd_client *osdc; | ||
2594 | const char *name; | ||
2595 | void *reply_buf = NULL; | ||
2596 | int ret; | ||
2597 | |||
2598 | if (rbd_dev->spec->pool_name) | ||
2599 | return 0; /* Already have the names */ | ||
2600 | |||
2601 | /* Look up the pool name */ | ||
2602 | |||
2603 | osdc = &rbd_dev->rbd_client->client->osdc; | ||
2604 | name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); | ||
2605 | if (!name) | ||
2606 | return -EIO; /* pool id too large (>= 2^31) */ | ||
2607 | |||
2608 | rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); | ||
2609 | if (!rbd_dev->spec->pool_name) | ||
2610 | return -ENOMEM; | ||
2611 | |||
2612 | /* Fetch the image name; tolerate failure here */ | ||
2613 | |||
2614 | name = rbd_dev_image_name(rbd_dev); | ||
2615 | if (name) { | ||
2616 | rbd_dev->spec->image_name_len = strlen(name); | ||
2617 | rbd_dev->spec->image_name = (char *) name; | ||
2618 | } else { | ||
2619 | pr_warning(RBD_DRV_NAME "%d " | ||
2620 | "unable to get image name for image id %s\n", | ||
2621 | rbd_dev->major, rbd_dev->spec->image_id); | ||
2622 | } | ||
2623 | |||
2624 | /* Look up the snapshot name. */ | ||
2625 | |||
2626 | name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); | ||
2627 | if (!name) { | ||
2628 | ret = -EIO; | ||
2629 | goto out_err; | ||
2630 | } | ||
2631 | rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); | ||
2632 | if(!rbd_dev->spec->snap_name) | ||
2633 | goto out_err; | ||
2634 | |||
2635 | return 0; | ||
2636 | out_err: | ||
2637 | kfree(reply_buf); | ||
2638 | kfree(rbd_dev->spec->pool_name); | ||
2639 | rbd_dev->spec->pool_name = NULL; | ||
2640 | |||
2641 | return ret; | ||
2642 | } | ||
2643 | |||
2245 | static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) | 2644 | static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) |
2246 | { | 2645 | { |
2247 | size_t size; | 2646 | size_t size; |
@@ -2328,7 +2727,6 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) | |||
2328 | int ret; | 2727 | int ret; |
2329 | void *p; | 2728 | void *p; |
2330 | void *end; | 2729 | void *end; |
2331 | size_t snap_name_len; | ||
2332 | char *snap_name; | 2730 | char *snap_name; |
2333 | 2731 | ||
2334 | size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; | 2732 | size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; |
@@ -2348,9 +2746,7 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) | |||
2348 | 2746 | ||
2349 | p = reply_buf; | 2747 | p = reply_buf; |
2350 | end = (char *) reply_buf + size; | 2748 | end = (char *) reply_buf + size; |
2351 | snap_name_len = 0; | 2749 | snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); |
2352 | snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len, | ||
2353 | GFP_KERNEL); | ||
2354 | if (IS_ERR(snap_name)) { | 2750 | if (IS_ERR(snap_name)) { |
2355 | ret = PTR_ERR(snap_name); | 2751 | ret = PTR_ERR(snap_name); |
2356 | goto out; | 2752 | goto out; |
@@ -2397,6 +2793,41 @@ static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, | |||
2397 | return ERR_PTR(-EINVAL); | 2793 | return ERR_PTR(-EINVAL); |
2398 | } | 2794 | } |
2399 | 2795 | ||
2796 | static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) | ||
2797 | { | ||
2798 | int ret; | ||
2799 | __u8 obj_order; | ||
2800 | |||
2801 | down_write(&rbd_dev->header_rwsem); | ||
2802 | |||
2803 | /* Grab old order first, to see if it changes */ | ||
2804 | |||
2805 | obj_order = rbd_dev->header.obj_order, | ||
2806 | ret = rbd_dev_v2_image_size(rbd_dev); | ||
2807 | if (ret) | ||
2808 | goto out; | ||
2809 | if (rbd_dev->header.obj_order != obj_order) { | ||
2810 | ret = -EIO; | ||
2811 | goto out; | ||
2812 | } | ||
2813 | rbd_update_mapping_size(rbd_dev); | ||
2814 | |||
2815 | ret = rbd_dev_v2_snap_context(rbd_dev, hver); | ||
2816 | dout("rbd_dev_v2_snap_context returned %d\n", ret); | ||
2817 | if (ret) | ||
2818 | goto out; | ||
2819 | ret = rbd_dev_snaps_update(rbd_dev); | ||
2820 | dout("rbd_dev_snaps_update returned %d\n", ret); | ||
2821 | if (ret) | ||
2822 | goto out; | ||
2823 | ret = rbd_dev_snaps_register(rbd_dev); | ||
2824 | dout("rbd_dev_snaps_register returned %d\n", ret); | ||
2825 | out: | ||
2826 | up_write(&rbd_dev->header_rwsem); | ||
2827 | |||
2828 | return ret; | ||
2829 | } | ||
2830 | |||
2400 | /* | 2831 | /* |
2401 | * Scan the rbd device's current snapshot list and compare it to the | 2832 | * Scan the rbd device's current snapshot list and compare it to the |
2402 | * newly-received snapshot context. Remove any existing snapshots | 2833 | * newly-received snapshot context. Remove any existing snapshots |
@@ -2436,12 +2867,12 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) | |||
2436 | 2867 | ||
2437 | /* Existing snapshot not in the new snap context */ | 2868 | /* Existing snapshot not in the new snap context */ |
2438 | 2869 | ||
2439 | if (rbd_dev->mapping.snap_id == snap->id) | 2870 | if (rbd_dev->spec->snap_id == snap->id) |
2440 | rbd_dev->mapping.snap_exists = false; | 2871 | rbd_dev->exists = false; |
2441 | __rbd_remove_snap_dev(snap); | 2872 | rbd_remove_snap_dev(snap); |
2442 | dout("%ssnap id %llu has been removed\n", | 2873 | dout("%ssnap id %llu has been removed\n", |
2443 | rbd_dev->mapping.snap_id == snap->id ? | 2874 | rbd_dev->spec->snap_id == snap->id ? |
2444 | "mapped " : "", | 2875 | "mapped " : "", |
2445 | (unsigned long long) snap->id); | 2876 | (unsigned long long) snap->id); |
2446 | 2877 | ||
2447 | /* Done with this list entry; advance */ | 2878 | /* Done with this list entry; advance */ |
@@ -2559,7 +2990,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) | |||
2559 | do { | 2990 | do { |
2560 | ret = rbd_req_sync_watch(rbd_dev); | 2991 | ret = rbd_req_sync_watch(rbd_dev); |
2561 | if (ret == -ERANGE) { | 2992 | if (ret == -ERANGE) { |
2562 | rc = rbd_refresh_header(rbd_dev, NULL); | 2993 | rc = rbd_dev_refresh(rbd_dev, NULL); |
2563 | if (rc < 0) | 2994 | if (rc < 0) |
2564 | return rc; | 2995 | return rc; |
2565 | } | 2996 | } |
@@ -2621,8 +3052,8 @@ static void rbd_dev_id_put(struct rbd_device *rbd_dev) | |||
2621 | struct rbd_device *rbd_dev; | 3052 | struct rbd_device *rbd_dev; |
2622 | 3053 | ||
2623 | rbd_dev = list_entry(tmp, struct rbd_device, node); | 3054 | rbd_dev = list_entry(tmp, struct rbd_device, node); |
2624 | if (rbd_id > max_id) | 3055 | if (rbd_dev->dev_id > max_id) |
2625 | max_id = rbd_id; | 3056 | max_id = rbd_dev->dev_id; |
2626 | } | 3057 | } |
2627 | spin_unlock(&rbd_dev_list_lock); | 3058 | spin_unlock(&rbd_dev_list_lock); |
2628 | 3059 | ||
@@ -2722,73 +3153,140 @@ static inline char *dup_token(const char **buf, size_t *lenp) | |||
2722 | } | 3153 | } |
2723 | 3154 | ||
2724 | /* | 3155 | /* |
2725 | * This fills in the pool_name, image_name, image_name_len, rbd_dev, | 3156 | * Parse the options provided for an "rbd add" (i.e., rbd image |
2726 | * rbd_md_name, and name fields of the given rbd_dev, based on the | 3157 | * mapping) request. These arrive via a write to /sys/bus/rbd/add, |
2727 | * list of monitor addresses and other options provided via | 3158 | * and the data written is passed here via a NUL-terminated buffer. |
2728 | * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated | 3159 | * Returns 0 if successful or an error code otherwise. |
2729 | * copy of the snapshot name to map if successful, or a | 3160 | * |
2730 | * pointer-coded error otherwise. | 3161 | * The information extracted from these options is recorded in |
3162 | * the other parameters which return dynamically-allocated | ||
3163 | * structures: | ||
3164 | * ceph_opts | ||
3165 | * The address of a pointer that will refer to a ceph options | ||
3166 | * structure. Caller must release the returned pointer using | ||
3167 | * ceph_destroy_options() when it is no longer needed. | ||
3168 | * rbd_opts | ||
3169 | * Address of an rbd options pointer. Fully initialized by | ||
3170 | * this function; caller must release with kfree(). | ||
3171 | * spec | ||
3172 | * Address of an rbd image specification pointer. Fully | ||
3173 | * initialized by this function based on parsed options. | ||
3174 | * Caller must release with rbd_spec_put(). | ||
2731 | * | 3175 | * |
2732 | * Note: rbd_dev is assumed to have been initially zero-filled. | 3176 | * The options passed take this form: |
3177 | * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] | ||
3178 | * where: | ||
3179 | * <mon_addrs> | ||
3180 | * A comma-separated list of one or more monitor addresses. | ||
3181 | * A monitor address is an ip address, optionally followed | ||
3182 | * by a port number (separated by a colon). | ||
3183 | * I.e.: ip1[:port1][,ip2[:port2]...] | ||
3184 | * <options> | ||
3185 | * A comma-separated list of ceph and/or rbd options. | ||
3186 | * <pool_name> | ||
3187 | * The name of the rados pool containing the rbd image. | ||
3188 | * <image_name> | ||
3189 | * The name of the image in that pool to map. | ||
3190 | * <snap_id> | ||
3191 | * An optional snapshot id. If provided, the mapping will | ||
3192 | * present data from the image at the time that snapshot was | ||
3193 | * created. The image head is used if no snapshot id is | ||
3194 | * provided. Snapshot mappings are always read-only. | ||
2733 | */ | 3195 | */ |
2734 | static char *rbd_add_parse_args(struct rbd_device *rbd_dev, | 3196 | static int rbd_add_parse_args(const char *buf, |
2735 | const char *buf, | 3197 | struct ceph_options **ceph_opts, |
2736 | const char **mon_addrs, | 3198 | struct rbd_options **opts, |
2737 | size_t *mon_addrs_size, | 3199 | struct rbd_spec **rbd_spec) |
2738 | char *options, | ||
2739 | size_t options_size) | ||
2740 | { | 3200 | { |
2741 | size_t len; | 3201 | size_t len; |
2742 | char *err_ptr = ERR_PTR(-EINVAL); | 3202 | char *options; |
2743 | char *snap_name; | 3203 | const char *mon_addrs; |
3204 | size_t mon_addrs_size; | ||
3205 | struct rbd_spec *spec = NULL; | ||
3206 | struct rbd_options *rbd_opts = NULL; | ||
3207 | struct ceph_options *copts; | ||
3208 | int ret; | ||
2744 | 3209 | ||
2745 | /* The first four tokens are required */ | 3210 | /* The first four tokens are required */ |
2746 | 3211 | ||
2747 | len = next_token(&buf); | 3212 | len = next_token(&buf); |
2748 | if (!len) | 3213 | if (!len) |
2749 | return err_ptr; | 3214 | return -EINVAL; /* Missing monitor address(es) */ |
2750 | *mon_addrs_size = len + 1; | 3215 | mon_addrs = buf; |
2751 | *mon_addrs = buf; | 3216 | mon_addrs_size = len + 1; |
2752 | |||
2753 | buf += len; | 3217 | buf += len; |
2754 | 3218 | ||
2755 | len = copy_token(&buf, options, options_size); | 3219 | ret = -EINVAL; |
2756 | if (!len || len >= options_size) | 3220 | options = dup_token(&buf, NULL); |
2757 | return err_ptr; | 3221 | if (!options) |
3222 | return -ENOMEM; | ||
3223 | if (!*options) | ||
3224 | goto out_err; /* Missing options */ | ||
2758 | 3225 | ||
2759 | err_ptr = ERR_PTR(-ENOMEM); | 3226 | spec = rbd_spec_alloc(); |
2760 | rbd_dev->pool_name = dup_token(&buf, NULL); | 3227 | if (!spec) |
2761 | if (!rbd_dev->pool_name) | 3228 | goto out_mem; |
2762 | goto out_err; | ||
2763 | 3229 | ||
2764 | rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len); | 3230 | spec->pool_name = dup_token(&buf, NULL); |
2765 | if (!rbd_dev->image_name) | 3231 | if (!spec->pool_name) |
2766 | goto out_err; | 3232 | goto out_mem; |
3233 | if (!*spec->pool_name) | ||
3234 | goto out_err; /* Missing pool name */ | ||
2767 | 3235 | ||
2768 | /* Snapshot name is optional */ | 3236 | spec->image_name = dup_token(&buf, &spec->image_name_len); |
3237 | if (!spec->image_name) | ||
3238 | goto out_mem; | ||
3239 | if (!*spec->image_name) | ||
3240 | goto out_err; /* Missing image name */ | ||
3241 | |||
3242 | /* | ||
3243 | * Snapshot name is optional; default is to use "-" | ||
3244 | * (indicating the head/no snapshot). | ||
3245 | */ | ||
2769 | len = next_token(&buf); | 3246 | len = next_token(&buf); |
2770 | if (!len) { | 3247 | if (!len) { |
2771 | buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ | 3248 | buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ |
2772 | len = sizeof (RBD_SNAP_HEAD_NAME) - 1; | 3249 | len = sizeof (RBD_SNAP_HEAD_NAME) - 1; |
2773 | } | 3250 | } else if (len > RBD_MAX_SNAP_NAME_LEN) { |
2774 | snap_name = kmalloc(len + 1, GFP_KERNEL); | 3251 | ret = -ENAMETOOLONG; |
2775 | if (!snap_name) | ||
2776 | goto out_err; | 3252 | goto out_err; |
2777 | memcpy(snap_name, buf, len); | 3253 | } |
2778 | *(snap_name + len) = '\0'; | 3254 | spec->snap_name = kmalloc(len + 1, GFP_KERNEL); |
3255 | if (!spec->snap_name) | ||
3256 | goto out_mem; | ||
3257 | memcpy(spec->snap_name, buf, len); | ||
3258 | *(spec->snap_name + len) = '\0'; | ||
2779 | 3259 | ||
2780 | dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len); | 3260 | /* Initialize all rbd options to the defaults */ |
2781 | 3261 | ||
2782 | return snap_name; | 3262 | rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); |
3263 | if (!rbd_opts) | ||
3264 | goto out_mem; | ||
3265 | |||
3266 | rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; | ||
3267 | |||
3268 | copts = ceph_parse_options(options, mon_addrs, | ||
3269 | mon_addrs + mon_addrs_size - 1, | ||
3270 | parse_rbd_opts_token, rbd_opts); | ||
3271 | if (IS_ERR(copts)) { | ||
3272 | ret = PTR_ERR(copts); | ||
3273 | goto out_err; | ||
3274 | } | ||
3275 | kfree(options); | ||
2783 | 3276 | ||
3277 | *ceph_opts = copts; | ||
3278 | *opts = rbd_opts; | ||
3279 | *rbd_spec = spec; | ||
3280 | |||
3281 | return 0; | ||
3282 | out_mem: | ||
3283 | ret = -ENOMEM; | ||
2784 | out_err: | 3284 | out_err: |
2785 | kfree(rbd_dev->image_name); | 3285 | kfree(rbd_opts); |
2786 | rbd_dev->image_name = NULL; | 3286 | rbd_spec_put(spec); |
2787 | rbd_dev->image_name_len = 0; | 3287 | kfree(options); |
2788 | kfree(rbd_dev->pool_name); | ||
2789 | rbd_dev->pool_name = NULL; | ||
2790 | 3288 | ||
2791 | return err_ptr; | 3289 | return ret; |
2792 | } | 3290 | } |
2793 | 3291 | ||
2794 | /* | 3292 | /* |
@@ -2814,14 +3312,22 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) | |||
2814 | void *p; | 3312 | void *p; |
2815 | 3313 | ||
2816 | /* | 3314 | /* |
3315 | * When probing a parent image, the image id is already | ||
3316 | * known (and the image name likely is not). There's no | ||
3317 | * need to fetch the image id again in this case. | ||
3318 | */ | ||
3319 | if (rbd_dev->spec->image_id) | ||
3320 | return 0; | ||
3321 | |||
3322 | /* | ||
2817 | * First, see if the format 2 image id file exists, and if | 3323 | * First, see if the format 2 image id file exists, and if |
2818 | * so, get the image's persistent id from it. | 3324 | * so, get the image's persistent id from it. |
2819 | */ | 3325 | */ |
2820 | size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len; | 3326 | size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len; |
2821 | object_name = kmalloc(size, GFP_NOIO); | 3327 | object_name = kmalloc(size, GFP_NOIO); |
2822 | if (!object_name) | 3328 | if (!object_name) |
2823 | return -ENOMEM; | 3329 | return -ENOMEM; |
2824 | sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name); | 3330 | sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); |
2825 | dout("rbd id object name is %s\n", object_name); | 3331 | dout("rbd id object name is %s\n", object_name); |
2826 | 3332 | ||
2827 | /* Response will be an encoded string, which includes a length */ | 3333 | /* Response will be an encoded string, which includes a length */ |
@@ -2841,17 +3347,18 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) | |||
2841 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); | 3347 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); |
2842 | if (ret < 0) | 3348 | if (ret < 0) |
2843 | goto out; | 3349 | goto out; |
3350 | ret = 0; /* rbd_req_sync_exec() can return positive */ | ||
2844 | 3351 | ||
2845 | p = response; | 3352 | p = response; |
2846 | rbd_dev->image_id = ceph_extract_encoded_string(&p, | 3353 | rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, |
2847 | p + RBD_IMAGE_ID_LEN_MAX, | 3354 | p + RBD_IMAGE_ID_LEN_MAX, |
2848 | &rbd_dev->image_id_len, | 3355 | &rbd_dev->spec->image_id_len, |
2849 | GFP_NOIO); | 3356 | GFP_NOIO); |
2850 | if (IS_ERR(rbd_dev->image_id)) { | 3357 | if (IS_ERR(rbd_dev->spec->image_id)) { |
2851 | ret = PTR_ERR(rbd_dev->image_id); | 3358 | ret = PTR_ERR(rbd_dev->spec->image_id); |
2852 | rbd_dev->image_id = NULL; | 3359 | rbd_dev->spec->image_id = NULL; |
2853 | } else { | 3360 | } else { |
2854 | dout("image_id is %s\n", rbd_dev->image_id); | 3361 | dout("image_id is %s\n", rbd_dev->spec->image_id); |
2855 | } | 3362 | } |
2856 | out: | 3363 | out: |
2857 | kfree(response); | 3364 | kfree(response); |
@@ -2867,26 +3374,33 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) | |||
2867 | 3374 | ||
2868 | /* Version 1 images have no id; empty string is used */ | 3375 | /* Version 1 images have no id; empty string is used */ |
2869 | 3376 | ||
2870 | rbd_dev->image_id = kstrdup("", GFP_KERNEL); | 3377 | rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); |
2871 | if (!rbd_dev->image_id) | 3378 | if (!rbd_dev->spec->image_id) |
2872 | return -ENOMEM; | 3379 | return -ENOMEM; |
2873 | rbd_dev->image_id_len = 0; | 3380 | rbd_dev->spec->image_id_len = 0; |
2874 | 3381 | ||
2875 | /* Record the header object name for this rbd image. */ | 3382 | /* Record the header object name for this rbd image. */ |
2876 | 3383 | ||
2877 | size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX); | 3384 | size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX); |
2878 | rbd_dev->header_name = kmalloc(size, GFP_KERNEL); | 3385 | rbd_dev->header_name = kmalloc(size, GFP_KERNEL); |
2879 | if (!rbd_dev->header_name) { | 3386 | if (!rbd_dev->header_name) { |
2880 | ret = -ENOMEM; | 3387 | ret = -ENOMEM; |
2881 | goto out_err; | 3388 | goto out_err; |
2882 | } | 3389 | } |
2883 | sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); | 3390 | sprintf(rbd_dev->header_name, "%s%s", |
3391 | rbd_dev->spec->image_name, RBD_SUFFIX); | ||
2884 | 3392 | ||
2885 | /* Populate rbd image metadata */ | 3393 | /* Populate rbd image metadata */ |
2886 | 3394 | ||
2887 | ret = rbd_read_header(rbd_dev, &rbd_dev->header); | 3395 | ret = rbd_read_header(rbd_dev, &rbd_dev->header); |
2888 | if (ret < 0) | 3396 | if (ret < 0) |
2889 | goto out_err; | 3397 | goto out_err; |
3398 | |||
3399 | /* Version 1 images have no parent (no layering) */ | ||
3400 | |||
3401 | rbd_dev->parent_spec = NULL; | ||
3402 | rbd_dev->parent_overlap = 0; | ||
3403 | |||
2890 | rbd_dev->image_format = 1; | 3404 | rbd_dev->image_format = 1; |
2891 | 3405 | ||
2892 | dout("discovered version 1 image, header name is %s\n", | 3406 | dout("discovered version 1 image, header name is %s\n", |
@@ -2897,8 +3411,8 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) | |||
2897 | out_err: | 3411 | out_err: |
2898 | kfree(rbd_dev->header_name); | 3412 | kfree(rbd_dev->header_name); |
2899 | rbd_dev->header_name = NULL; | 3413 | rbd_dev->header_name = NULL; |
2900 | kfree(rbd_dev->image_id); | 3414 | kfree(rbd_dev->spec->image_id); |
2901 | rbd_dev->image_id = NULL; | 3415 | rbd_dev->spec->image_id = NULL; |
2902 | 3416 | ||
2903 | return ret; | 3417 | return ret; |
2904 | } | 3418 | } |
@@ -2913,12 +3427,12 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) | |||
2913 | * Image id was filled in by the caller. Record the header | 3427 | * Image id was filled in by the caller. Record the header |
2914 | * object name for this rbd image. | 3428 | * object name for this rbd image. |
2915 | */ | 3429 | */ |
2916 | size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len; | 3430 | size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len; |
2917 | rbd_dev->header_name = kmalloc(size, GFP_KERNEL); | 3431 | rbd_dev->header_name = kmalloc(size, GFP_KERNEL); |
2918 | if (!rbd_dev->header_name) | 3432 | if (!rbd_dev->header_name) |
2919 | return -ENOMEM; | 3433 | return -ENOMEM; |
2920 | sprintf(rbd_dev->header_name, "%s%s", | 3434 | sprintf(rbd_dev->header_name, "%s%s", |
2921 | RBD_HEADER_PREFIX, rbd_dev->image_id); | 3435 | RBD_HEADER_PREFIX, rbd_dev->spec->image_id); |
2922 | 3436 | ||
2923 | /* Get the size and object order for the image */ | 3437 | /* Get the size and object order for the image */ |
2924 | 3438 | ||
@@ -2932,12 +3446,20 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) | |||
2932 | if (ret < 0) | 3446 | if (ret < 0) |
2933 | goto out_err; | 3447 | goto out_err; |
2934 | 3448 | ||
2935 | /* Get the features for the image */ | 3449 | /* Get the and check features for the image */ |
2936 | 3450 | ||
2937 | ret = rbd_dev_v2_features(rbd_dev); | 3451 | ret = rbd_dev_v2_features(rbd_dev); |
2938 | if (ret < 0) | 3452 | if (ret < 0) |
2939 | goto out_err; | 3453 | goto out_err; |
2940 | 3454 | ||
3455 | /* If the image supports layering, get the parent info */ | ||
3456 | |||
3457 | if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { | ||
3458 | ret = rbd_dev_v2_parent_info(rbd_dev); | ||
3459 | if (ret < 0) | ||
3460 | goto out_err; | ||
3461 | } | ||
3462 | |||
2941 | /* crypto and compression type aren't (yet) supported for v2 images */ | 3463 | /* crypto and compression type aren't (yet) supported for v2 images */ |
2942 | 3464 | ||
2943 | rbd_dev->header.crypt_type = 0; | 3465 | rbd_dev->header.crypt_type = 0; |
@@ -2955,8 +3477,11 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) | |||
2955 | dout("discovered version 2 image, header name is %s\n", | 3477 | dout("discovered version 2 image, header name is %s\n", |
2956 | rbd_dev->header_name); | 3478 | rbd_dev->header_name); |
2957 | 3479 | ||
2958 | return -ENOTSUPP; | 3480 | return 0; |
2959 | out_err: | 3481 | out_err: |
3482 | rbd_dev->parent_overlap = 0; | ||
3483 | rbd_spec_put(rbd_dev->parent_spec); | ||
3484 | rbd_dev->parent_spec = NULL; | ||
2960 | kfree(rbd_dev->header_name); | 3485 | kfree(rbd_dev->header_name); |
2961 | rbd_dev->header_name = NULL; | 3486 | rbd_dev->header_name = NULL; |
2962 | kfree(rbd_dev->header.object_prefix); | 3487 | kfree(rbd_dev->header.object_prefix); |
@@ -2965,91 +3490,22 @@ out_err: | |||
2965 | return ret; | 3490 | return ret; |
2966 | } | 3491 | } |
2967 | 3492 | ||
2968 | /* | 3493 | static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) |
2969 | * Probe for the existence of the header object for the given rbd | ||
2970 | * device. For format 2 images this includes determining the image | ||
2971 | * id. | ||
2972 | */ | ||
2973 | static int rbd_dev_probe(struct rbd_device *rbd_dev) | ||
2974 | { | 3494 | { |
2975 | int ret; | 3495 | int ret; |
2976 | 3496 | ||
2977 | /* | 3497 | /* no need to lock here, as rbd_dev is not registered yet */ |
2978 | * Get the id from the image id object. If it's not a | 3498 | ret = rbd_dev_snaps_update(rbd_dev); |
2979 | * format 2 image, we'll get ENOENT back, and we'll assume | ||
2980 | * it's a format 1 image. | ||
2981 | */ | ||
2982 | ret = rbd_dev_image_id(rbd_dev); | ||
2983 | if (ret) | ||
2984 | ret = rbd_dev_v1_probe(rbd_dev); | ||
2985 | else | ||
2986 | ret = rbd_dev_v2_probe(rbd_dev); | ||
2987 | if (ret) | 3499 | if (ret) |
2988 | dout("probe failed, returning %d\n", ret); | 3500 | return ret; |
2989 | |||
2990 | return ret; | ||
2991 | } | ||
2992 | |||
2993 | static ssize_t rbd_add(struct bus_type *bus, | ||
2994 | const char *buf, | ||
2995 | size_t count) | ||
2996 | { | ||
2997 | char *options; | ||
2998 | struct rbd_device *rbd_dev = NULL; | ||
2999 | const char *mon_addrs = NULL; | ||
3000 | size_t mon_addrs_size = 0; | ||
3001 | struct ceph_osd_client *osdc; | ||
3002 | int rc = -ENOMEM; | ||
3003 | char *snap_name; | ||
3004 | |||
3005 | if (!try_module_get(THIS_MODULE)) | ||
3006 | return -ENODEV; | ||
3007 | |||
3008 | options = kmalloc(count, GFP_KERNEL); | ||
3009 | if (!options) | ||
3010 | goto err_out_mem; | ||
3011 | rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); | ||
3012 | if (!rbd_dev) | ||
3013 | goto err_out_mem; | ||
3014 | |||
3015 | /* static rbd_device initialization */ | ||
3016 | spin_lock_init(&rbd_dev->lock); | ||
3017 | INIT_LIST_HEAD(&rbd_dev->node); | ||
3018 | INIT_LIST_HEAD(&rbd_dev->snaps); | ||
3019 | init_rwsem(&rbd_dev->header_rwsem); | ||
3020 | |||
3021 | /* parse add command */ | ||
3022 | snap_name = rbd_add_parse_args(rbd_dev, buf, | ||
3023 | &mon_addrs, &mon_addrs_size, options, count); | ||
3024 | if (IS_ERR(snap_name)) { | ||
3025 | rc = PTR_ERR(snap_name); | ||
3026 | goto err_out_mem; | ||
3027 | } | ||
3028 | |||
3029 | rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options); | ||
3030 | if (rc < 0) | ||
3031 | goto err_out_args; | ||
3032 | |||
3033 | /* pick the pool */ | ||
3034 | osdc = &rbd_dev->rbd_client->client->osdc; | ||
3035 | rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); | ||
3036 | if (rc < 0) | ||
3037 | goto err_out_client; | ||
3038 | rbd_dev->pool_id = rc; | ||
3039 | |||
3040 | rc = rbd_dev_probe(rbd_dev); | ||
3041 | if (rc < 0) | ||
3042 | goto err_out_client; | ||
3043 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); | ||
3044 | 3501 | ||
3045 | /* no need to lock here, as rbd_dev is not registered yet */ | 3502 | ret = rbd_dev_probe_update_spec(rbd_dev); |
3046 | rc = rbd_dev_snaps_update(rbd_dev); | 3503 | if (ret) |
3047 | if (rc) | 3504 | goto err_out_snaps; |
3048 | goto err_out_header; | ||
3049 | 3505 | ||
3050 | rc = rbd_dev_set_mapping(rbd_dev, snap_name); | 3506 | ret = rbd_dev_set_mapping(rbd_dev); |
3051 | if (rc) | 3507 | if (ret) |
3052 | goto err_out_header; | 3508 | goto err_out_snaps; |
3053 | 3509 | ||
3054 | /* generate unique id: find highest unique id, add one */ | 3510 | /* generate unique id: find highest unique id, add one */ |
3055 | rbd_dev_id_get(rbd_dev); | 3511 | rbd_dev_id_get(rbd_dev); |
@@ -3061,34 +3517,33 @@ static ssize_t rbd_add(struct bus_type *bus, | |||
3061 | 3517 | ||
3062 | /* Get our block major device number. */ | 3518 | /* Get our block major device number. */ |
3063 | 3519 | ||
3064 | rc = register_blkdev(0, rbd_dev->name); | 3520 | ret = register_blkdev(0, rbd_dev->name); |
3065 | if (rc < 0) | 3521 | if (ret < 0) |
3066 | goto err_out_id; | 3522 | goto err_out_id; |
3067 | rbd_dev->major = rc; | 3523 | rbd_dev->major = ret; |
3068 | 3524 | ||
3069 | /* Set up the blkdev mapping. */ | 3525 | /* Set up the blkdev mapping. */ |
3070 | 3526 | ||
3071 | rc = rbd_init_disk(rbd_dev); | 3527 | ret = rbd_init_disk(rbd_dev); |
3072 | if (rc) | 3528 | if (ret) |
3073 | goto err_out_blkdev; | 3529 | goto err_out_blkdev; |
3074 | 3530 | ||
3075 | rc = rbd_bus_add_dev(rbd_dev); | 3531 | ret = rbd_bus_add_dev(rbd_dev); |
3076 | if (rc) | 3532 | if (ret) |
3077 | goto err_out_disk; | 3533 | goto err_out_disk; |
3078 | 3534 | ||
3079 | /* | 3535 | /* |
3080 | * At this point cleanup in the event of an error is the job | 3536 | * At this point cleanup in the event of an error is the job |
3081 | * of the sysfs code (initiated by rbd_bus_del_dev()). | 3537 | * of the sysfs code (initiated by rbd_bus_del_dev()). |
3082 | */ | 3538 | */ |
3083 | |||
3084 | down_write(&rbd_dev->header_rwsem); | 3539 | down_write(&rbd_dev->header_rwsem); |
3085 | rc = rbd_dev_snaps_register(rbd_dev); | 3540 | ret = rbd_dev_snaps_register(rbd_dev); |
3086 | up_write(&rbd_dev->header_rwsem); | 3541 | up_write(&rbd_dev->header_rwsem); |
3087 | if (rc) | 3542 | if (ret) |
3088 | goto err_out_bus; | 3543 | goto err_out_bus; |
3089 | 3544 | ||
3090 | rc = rbd_init_watch_dev(rbd_dev); | 3545 | ret = rbd_init_watch_dev(rbd_dev); |
3091 | if (rc) | 3546 | if (ret) |
3092 | goto err_out_bus; | 3547 | goto err_out_bus; |
3093 | 3548 | ||
3094 | /* Everything's ready. Announce the disk to the world. */ | 3549 | /* Everything's ready. Announce the disk to the world. */ |
@@ -3098,37 +3553,119 @@ static ssize_t rbd_add(struct bus_type *bus, | |||
3098 | pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, | 3553 | pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, |
3099 | (unsigned long long) rbd_dev->mapping.size); | 3554 | (unsigned long long) rbd_dev->mapping.size); |
3100 | 3555 | ||
3101 | return count; | 3556 | return ret; |
3102 | |||
3103 | err_out_bus: | 3557 | err_out_bus: |
3104 | /* this will also clean up rest of rbd_dev stuff */ | 3558 | /* this will also clean up rest of rbd_dev stuff */ |
3105 | 3559 | ||
3106 | rbd_bus_del_dev(rbd_dev); | 3560 | rbd_bus_del_dev(rbd_dev); |
3107 | kfree(options); | ||
3108 | return rc; | ||
3109 | 3561 | ||
3562 | return ret; | ||
3110 | err_out_disk: | 3563 | err_out_disk: |
3111 | rbd_free_disk(rbd_dev); | 3564 | rbd_free_disk(rbd_dev); |
3112 | err_out_blkdev: | 3565 | err_out_blkdev: |
3113 | unregister_blkdev(rbd_dev->major, rbd_dev->name); | 3566 | unregister_blkdev(rbd_dev->major, rbd_dev->name); |
3114 | err_out_id: | 3567 | err_out_id: |
3115 | rbd_dev_id_put(rbd_dev); | 3568 | rbd_dev_id_put(rbd_dev); |
3116 | err_out_header: | 3569 | err_out_snaps: |
3117 | rbd_header_free(&rbd_dev->header); | 3570 | rbd_remove_all_snaps(rbd_dev); |
3571 | |||
3572 | return ret; | ||
3573 | } | ||
3574 | |||
3575 | /* | ||
3576 | * Probe for the existence of the header object for the given rbd | ||
3577 | * device. For format 2 images this includes determining the image | ||
3578 | * id. | ||
3579 | */ | ||
3580 | static int rbd_dev_probe(struct rbd_device *rbd_dev) | ||
3581 | { | ||
3582 | int ret; | ||
3583 | |||
3584 | /* | ||
3585 | * Get the id from the image id object. If it's not a | ||
3586 | * format 2 image, we'll get ENOENT back, and we'll assume | ||
3587 | * it's a format 1 image. | ||
3588 | */ | ||
3589 | ret = rbd_dev_image_id(rbd_dev); | ||
3590 | if (ret) | ||
3591 | ret = rbd_dev_v1_probe(rbd_dev); | ||
3592 | else | ||
3593 | ret = rbd_dev_v2_probe(rbd_dev); | ||
3594 | if (ret) { | ||
3595 | dout("probe failed, returning %d\n", ret); | ||
3596 | |||
3597 | return ret; | ||
3598 | } | ||
3599 | |||
3600 | ret = rbd_dev_probe_finish(rbd_dev); | ||
3601 | if (ret) | ||
3602 | rbd_header_free(&rbd_dev->header); | ||
3603 | |||
3604 | return ret; | ||
3605 | } | ||
3606 | |||
3607 | static ssize_t rbd_add(struct bus_type *bus, | ||
3608 | const char *buf, | ||
3609 | size_t count) | ||
3610 | { | ||
3611 | struct rbd_device *rbd_dev = NULL; | ||
3612 | struct ceph_options *ceph_opts = NULL; | ||
3613 | struct rbd_options *rbd_opts = NULL; | ||
3614 | struct rbd_spec *spec = NULL; | ||
3615 | struct rbd_client *rbdc; | ||
3616 | struct ceph_osd_client *osdc; | ||
3617 | int rc = -ENOMEM; | ||
3618 | |||
3619 | if (!try_module_get(THIS_MODULE)) | ||
3620 | return -ENODEV; | ||
3621 | |||
3622 | /* parse add command */ | ||
3623 | rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); | ||
3624 | if (rc < 0) | ||
3625 | goto err_out_module; | ||
3626 | |||
3627 | rbdc = rbd_get_client(ceph_opts); | ||
3628 | if (IS_ERR(rbdc)) { | ||
3629 | rc = PTR_ERR(rbdc); | ||
3630 | goto err_out_args; | ||
3631 | } | ||
3632 | ceph_opts = NULL; /* rbd_dev client now owns this */ | ||
3633 | |||
3634 | /* pick the pool */ | ||
3635 | osdc = &rbdc->client->osdc; | ||
3636 | rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); | ||
3637 | if (rc < 0) | ||
3638 | goto err_out_client; | ||
3639 | spec->pool_id = (u64) rc; | ||
3640 | |||
3641 | rbd_dev = rbd_dev_create(rbdc, spec); | ||
3642 | if (!rbd_dev) | ||
3643 | goto err_out_client; | ||
3644 | rbdc = NULL; /* rbd_dev now owns this */ | ||
3645 | spec = NULL; /* rbd_dev now owns this */ | ||
3646 | |||
3647 | rbd_dev->mapping.read_only = rbd_opts->read_only; | ||
3648 | kfree(rbd_opts); | ||
3649 | rbd_opts = NULL; /* done with this */ | ||
3650 | |||
3651 | rc = rbd_dev_probe(rbd_dev); | ||
3652 | if (rc < 0) | ||
3653 | goto err_out_rbd_dev; | ||
3654 | |||
3655 | return count; | ||
3656 | err_out_rbd_dev: | ||
3657 | rbd_dev_destroy(rbd_dev); | ||
3118 | err_out_client: | 3658 | err_out_client: |
3119 | kfree(rbd_dev->header_name); | 3659 | rbd_put_client(rbdc); |
3120 | rbd_put_client(rbd_dev); | ||
3121 | kfree(rbd_dev->image_id); | ||
3122 | err_out_args: | 3660 | err_out_args: |
3123 | kfree(rbd_dev->mapping.snap_name); | 3661 | if (ceph_opts) |
3124 | kfree(rbd_dev->image_name); | 3662 | ceph_destroy_options(ceph_opts); |
3125 | kfree(rbd_dev->pool_name); | 3663 | kfree(rbd_opts); |
3126 | err_out_mem: | 3664 | rbd_spec_put(spec); |
3127 | kfree(rbd_dev); | 3665 | err_out_module: |
3128 | kfree(options); | 3666 | module_put(THIS_MODULE); |
3129 | 3667 | ||
3130 | dout("Error adding device %s\n", buf); | 3668 | dout("Error adding device %s\n", buf); |
3131 | module_put(THIS_MODULE); | ||
3132 | 3669 | ||
3133 | return (ssize_t) rc; | 3670 | return (ssize_t) rc; |
3134 | } | 3671 | } |
@@ -3163,7 +3700,6 @@ static void rbd_dev_release(struct device *dev) | |||
3163 | if (rbd_dev->watch_event) | 3700 | if (rbd_dev->watch_event) |
3164 | rbd_req_sync_unwatch(rbd_dev); | 3701 | rbd_req_sync_unwatch(rbd_dev); |
3165 | 3702 | ||
3166 | rbd_put_client(rbd_dev); | ||
3167 | 3703 | ||
3168 | /* clean up and free blkdev */ | 3704 | /* clean up and free blkdev */ |
3169 | rbd_free_disk(rbd_dev); | 3705 | rbd_free_disk(rbd_dev); |
@@ -3173,13 +3709,9 @@ static void rbd_dev_release(struct device *dev) | |||
3173 | rbd_header_free(&rbd_dev->header); | 3709 | rbd_header_free(&rbd_dev->header); |
3174 | 3710 | ||
3175 | /* done with the id, and with the rbd_dev */ | 3711 | /* done with the id, and with the rbd_dev */ |
3176 | kfree(rbd_dev->mapping.snap_name); | ||
3177 | kfree(rbd_dev->image_id); | ||
3178 | kfree(rbd_dev->header_name); | ||
3179 | kfree(rbd_dev->pool_name); | ||
3180 | kfree(rbd_dev->image_name); | ||
3181 | rbd_dev_id_put(rbd_dev); | 3712 | rbd_dev_id_put(rbd_dev); |
3182 | kfree(rbd_dev); | 3713 | rbd_assert(rbd_dev->rbd_client != NULL); |
3714 | rbd_dev_destroy(rbd_dev); | ||
3183 | 3715 | ||
3184 | /* release module ref */ | 3716 | /* release module ref */ |
3185 | module_put(THIS_MODULE); | 3717 | module_put(THIS_MODULE); |
@@ -3211,7 +3743,12 @@ static ssize_t rbd_remove(struct bus_type *bus, | |||
3211 | goto done; | 3743 | goto done; |
3212 | } | 3744 | } |
3213 | 3745 | ||
3214 | __rbd_remove_all_snaps(rbd_dev); | 3746 | if (rbd_dev->open_count) { |
3747 | ret = -EBUSY; | ||
3748 | goto done; | ||
3749 | } | ||
3750 | |||
3751 | rbd_remove_all_snaps(rbd_dev); | ||
3215 | rbd_bus_del_dev(rbd_dev); | 3752 | rbd_bus_del_dev(rbd_dev); |
3216 | 3753 | ||
3217 | done: | 3754 | done: |
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index cbe77fa105ba..49d77cbcf8bd 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h | |||
@@ -46,8 +46,6 @@ | |||
46 | #define RBD_MIN_OBJ_ORDER 16 | 46 | #define RBD_MIN_OBJ_ORDER 16 |
47 | #define RBD_MAX_OBJ_ORDER 30 | 47 | #define RBD_MAX_OBJ_ORDER 30 |
48 | 48 | ||
49 | #define RBD_MAX_SEG_NAME_LEN 128 | ||
50 | |||
51 | #define RBD_COMP_NONE 0 | 49 | #define RBD_COMP_NONE 0 |
52 | #define RBD_CRYPT_NONE 0 | 50 | #define RBD_CRYPT_NONE 0 |
53 | 51 | ||