Diffstat (limited to 'drivers')
-rw-r--r--  drivers/block/rbd.c              | 1389
-rw-r--r--  drivers/block/rbd_types.h        |    2
-rw-r--r--  drivers/iommu/amd_iommu.c        |  196
-rw-r--r--  drivers/iommu/amd_iommu_types.h  |    1
-rw-r--r--  drivers/iommu/intel-iommu.c      |   42
-rw-r--r--  drivers/iommu/omap-iommu.c       |   68
-rw-r--r--  drivers/iommu/omap-iommu.h       |    3
-rw-r--r--  drivers/iommu/omap-iommu2.c      |   36
-rw-r--r--  drivers/iommu/tegra-gart.c       |    2
-rw-r--r--  drivers/iommu/tegra-smmu.c       |    6
10 files changed, 1187 insertions(+), 558 deletions(-)
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bb3d9be3b1b4..89576a0b3f2e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -61,15 +61,29 @@
 
 #define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
 
-#define RBD_MAX_SNAP_NAME_LEN	32
+#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
+#define RBD_MAX_SNAP_NAME_LEN	\
+			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
+
 #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
 #define RBD_MAX_OPT_LEN		1024
 
 #define RBD_SNAP_HEAD_NAME	"-"
 
+/* This allows a single page to hold an image name sent by OSD */
+#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
 #define RBD_IMAGE_ID_LEN_MAX	64
+
 #define RBD_OBJ_PREFIX_LEN_MAX	64
 
+/* Feature bits */
+
+#define RBD_FEATURE_LAYERING	1
+
+/* Features supported by this (client software) implementation. */
+
+#define RBD_FEATURES_ALL	(0)
+
 /*
  * An RBD device name will be "rbd#", where the "rbd" comes from
  * RBD_DRV_NAME above, and # is a unique integer identifier.
@@ -101,6 +115,27 @@ struct rbd_image_header {
 	u64 obj_version;
 };
 
+/*
+ * An rbd image specification.
+ *
+ * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
+ * identify an image.
+ */
+struct rbd_spec {
+	u64		pool_id;
+	char		*pool_name;
+
+	char		*image_id;
+	size_t		image_id_len;
+	char		*image_name;
+	size_t		image_name_len;
+
+	u64		snap_id;
+	char		*snap_name;
+
+	struct kref	kref;
+};
+
 struct rbd_options {
 	bool	read_only;
 };
@@ -155,11 +190,8 @@ struct rbd_snap {
 };
 
 struct rbd_mapping {
-	char			*snap_name;
-	u64			snap_id;
 	u64			size;
 	u64			features;
-	bool			snap_exists;
 	bool			read_only;
 };
 
@@ -173,7 +205,6 @@ struct rbd_device {
 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
 
 	u32			image_format;	/* Either 1 or 2 */
-	struct rbd_options	rbd_opts;
 	struct rbd_client	*rbd_client;
 
 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
@@ -181,17 +212,17 @@ struct rbd_device {
 	spinlock_t		lock;		/* queue lock */
 
 	struct rbd_image_header	header;
-	char			*image_id;
-	size_t			image_id_len;
-	char			*image_name;
-	size_t			image_name_len;
+	bool			exists;
+	struct rbd_spec		*spec;
+
 	char			*header_name;
-	char			*pool_name;
-	int			pool_id;
 
 	struct ceph_osd_event	*watch_event;
 	struct ceph_osd_request	*watch_request;
 
+	struct rbd_spec		*parent_spec;
+	u64			parent_overlap;
+
 	/* protects updating the header */
 	struct rw_semaphore	header_rwsem;
 
@@ -204,6 +235,7 @@ struct rbd_device {
 
 	/* sysfs related */
 	struct device		dev;
+	unsigned long		open_count;
 };
 
 static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
@@ -218,7 +250,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
 
 static void rbd_dev_release(struct device *dev);
-static void __rbd_remove_snap_dev(struct rbd_snap *snap);
+static void rbd_remove_snap_dev(struct rbd_snap *snap);
 
 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 		       size_t count);
@@ -258,17 +290,8 @@ static struct device rbd_root_dev = {
 #  define rbd_assert(expr)	((void) 0)
 #endif /* !RBD_DEBUG */
 
-static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
-{
-	return get_device(&rbd_dev->dev);
-}
-
-static void rbd_put_dev(struct rbd_device *rbd_dev)
-{
-	put_device(&rbd_dev->dev);
-}
-
-static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
+static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
+static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 
 static int rbd_open(struct block_device *bdev, fmode_t mode)
 {
@@ -277,8 +300,11 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 		return -EROFS;
 
-	rbd_get_dev(rbd_dev);
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	(void) get_device(&rbd_dev->dev);
 	set_device_ro(bdev, rbd_dev->mapping.read_only);
+	rbd_dev->open_count++;
+	mutex_unlock(&ctl_mutex);
 
 	return 0;
 }
@@ -287,7 +313,11 @@ static int rbd_release(struct gendisk *disk, fmode_t mode)
 {
 	struct rbd_device *rbd_dev = disk->private_data;
 
-	rbd_put_dev(rbd_dev);
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	rbd_assert(rbd_dev->open_count > 0);
+	rbd_dev->open_count--;
+	put_device(&rbd_dev->dev);
+	mutex_unlock(&ctl_mutex);
 
 	return 0;
 }
@@ -388,7 +418,7 @@ enum {
 static match_table_t rbd_opts_tokens = {
 	/* int args above */
 	/* string args above */
-	{Opt_read_only, "mapping.read_only"},
+	{Opt_read_only, "read_only"},
 	{Opt_read_only, "ro"},		/* Alternate spelling */
 	{Opt_read_write, "read_write"},
 	{Opt_read_write, "rw"},		/* Alternate spelling */
@@ -441,33 +471,17 @@ static int parse_rbd_opts_token(char *c, void *private)
  * Get a ceph client with specific addr and configuration, if one does
  * not exist create it.
  */
-static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
-			  size_t mon_addr_len, char *options)
+static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 {
-	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
-	struct ceph_options *ceph_opts;
 	struct rbd_client *rbdc;
 
-	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
-
-	ceph_opts = ceph_parse_options(options, mon_addr,
-					mon_addr + mon_addr_len,
-					parse_rbd_opts_token, rbd_opts);
-	if (IS_ERR(ceph_opts))
-		return PTR_ERR(ceph_opts);
-
 	rbdc = rbd_client_find(ceph_opts);
-	if (rbdc) {
-		/* using an existing client */
+	if (rbdc)	/* using an existing client */
 		ceph_destroy_options(ceph_opts);
-	} else {
+	else
 		rbdc = rbd_client_create(ceph_opts);
-		if (IS_ERR(rbdc))
-			return PTR_ERR(rbdc);
-	}
-	rbd_dev->rbd_client = rbdc;
 
-	return 0;
+	return rbdc;
 }
 
 /*
@@ -492,10 +506,10 @@ static void rbd_client_release(struct kref *kref)
  * Drop reference to ceph client node. If it's not referenced anymore, release
  * it.
  */
-static void rbd_put_client(struct rbd_device *rbd_dev)
+static void rbd_put_client(struct rbd_client *rbdc)
 {
-	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
-	rbd_dev->rbd_client = NULL;
+	if (rbdc)
+		kref_put(&rbdc->kref, rbd_client_release);
 }
 
 /*
@@ -524,6 +538,16 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
 		return false;
 
+	/* The bio layer requires at least sector-sized I/O */
+
+	if (ondisk->options.order < SECTOR_SHIFT)
+		return false;
+
+	/* If we use u64 in a few spots we may be able to loosen this */
+
+	if (ondisk->options.order > 8 * sizeof (int) - 1)
+		return false;
+
 	/*
 	 * The size of a snapshot header has to fit in a size_t, and
 	 * that limits the number of snapshots.
@@ -635,6 +659,20 @@ out_err:
 	return -ENOMEM;
 }
 
+static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
+{
+	struct rbd_snap *snap;
+
+	if (snap_id == CEPH_NOSNAP)
+		return RBD_SNAP_HEAD_NAME;
+
+	list_for_each_entry(snap, &rbd_dev->snaps, node)
+		if (snap_id == snap->id)
+			return snap->name;
+
+	return NULL;
+}
+
 static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 {
 
@@ -642,7 +680,7 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 
 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
 		if (!strcmp(snap_name, snap->name)) {
-			rbd_dev->mapping.snap_id = snap->id;
+			rbd_dev->spec->snap_id = snap->id;
 			rbd_dev->mapping.size = snap->size;
 			rbd_dev->mapping.features = snap->features;
 
@@ -653,26 +691,23 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 	return -ENOENT;
 }
 
-static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
+static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 {
 	int ret;
 
-	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
+	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
 		    sizeof (RBD_SNAP_HEAD_NAME))) {
-		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
+		rbd_dev->spec->snap_id = CEPH_NOSNAP;
 		rbd_dev->mapping.size = rbd_dev->header.image_size;
 		rbd_dev->mapping.features = rbd_dev->header.features;
-		rbd_dev->mapping.snap_exists = false;
-		rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
 		ret = 0;
 	} else {
-		ret = snap_by_name(rbd_dev, snap_name);
+		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
 		if (ret < 0)
 			goto done;
-		rbd_dev->mapping.snap_exists = true;
 		rbd_dev->mapping.read_only = true;
 	}
-	rbd_dev->mapping.snap_name = snap_name;
+	rbd_dev->exists = true;
 done:
 	return ret;
 }
@@ -695,13 +730,13 @@ static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 	u64 segment;
 	int ret;
 
-	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
 	if (!name)
 		return NULL;
 	segment = offset >> rbd_dev->header.obj_order;
-	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
+	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
 			rbd_dev->header.object_prefix, segment);
-	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
+	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
 		pr_err("error formatting segment name for #%llu (%d)\n",
 			segment, ret);
 		kfree(name);
@@ -800,77 +835,144 @@ static void zero_bio_chain(struct bio *chain, int start_ofs)
 }
 
 /*
- * bio_chain_clone - clone a chain of bios up to a certain length.
- * might return a bio_pair that will need to be released.
+ * Clone a portion of a bio, starting at the given byte offset
+ * and continuing for the number of bytes indicated.
  */
-static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
-				   struct bio_pair **bp,
-				   int len, gfp_t gfpmask)
-{
-	struct bio *old_chain = *old;
-	struct bio *new_chain = NULL;
-	struct bio *tail;
-	int total = 0;
-
-	if (*bp) {
-		bio_pair_release(*bp);
-		*bp = NULL;
-	}
-
-	while (old_chain && (total < len)) {
-		struct bio *tmp;
-
-		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
-		if (!tmp)
-			goto err_out;
-		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */
-
-		if (total + old_chain->bi_size > len) {
-			struct bio_pair *bp;
-
-			/*
-			 * this split can only happen with a single paged bio,
-			 * split_bio will BUG_ON if this is not the case
-			 */
-			dout("bio_chain_clone split! total=%d remaining=%d"
-			     "bi_size=%u\n",
-			     total, len - total, old_chain->bi_size);
-
-			/* split the bio. We'll release it either in the next
-			   call, or it will have to be released outside */
-			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
-			if (!bp)
-				goto err_out;
-
-			__bio_clone(tmp, &bp->bio1);
-
-			*next = &bp->bio2;
-		} else {
-			__bio_clone(tmp, old_chain);
-			*next = old_chain->bi_next;
-		}
-
-		tmp->bi_bdev = NULL;
-		tmp->bi_next = NULL;
-		if (new_chain)
-			tail->bi_next = tmp;
-		else
-			new_chain = tmp;
-		tail = tmp;
-		old_chain = old_chain->bi_next;
-
-		total += tmp->bi_size;
-	}
-
-	rbd_assert(total == len);
-
-	*old = old_chain;
-
-	return new_chain;
-
-err_out:
-	dout("bio_chain_clone with err\n");
-	bio_chain_put(new_chain);
+static struct bio *bio_clone_range(struct bio *bio_src,
+					unsigned int offset,
+					unsigned int len,
+					gfp_t gfpmask)
+{
+	struct bio_vec *bv;
+	unsigned int resid;
+	unsigned short idx;
+	unsigned int voff;
+	unsigned short end_idx;
+	unsigned short vcnt;
+	struct bio *bio;
+
+	/* Handle the easy case for the caller */
+
+	if (!offset && len == bio_src->bi_size)
+		return bio_clone(bio_src, gfpmask);
+
+	if (WARN_ON_ONCE(!len))
+		return NULL;
+	if (WARN_ON_ONCE(len > bio_src->bi_size))
+		return NULL;
+	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
+		return NULL;
+
+	/* Find first affected segment... */
+
+	resid = offset;
+	__bio_for_each_segment(bv, bio_src, idx, 0) {
+		if (resid < bv->bv_len)
+			break;
+		resid -= bv->bv_len;
+	}
+	voff = resid;
+
+	/* ...and the last affected segment */
+
+	resid += len;
+	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
+		if (resid <= bv->bv_len)
+			break;
+		resid -= bv->bv_len;
+	}
+	vcnt = end_idx - idx + 1;
+
+	/* Build the clone */
+
+	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
+	if (!bio)
+		return NULL;	/* ENOMEM */
+
+	bio->bi_bdev = bio_src->bi_bdev;
+	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
+	bio->bi_rw = bio_src->bi_rw;
+	bio->bi_flags |= 1 << BIO_CLONED;
+
+	/*
+	 * Copy over our part of the bio_vec, then update the first
+	 * and last (or only) entries.
+	 */
+	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
+			vcnt * sizeof (struct bio_vec));
+	bio->bi_io_vec[0].bv_offset += voff;
+	if (vcnt > 1) {
+		bio->bi_io_vec[0].bv_len -= voff;
+		bio->bi_io_vec[vcnt - 1].bv_len = resid;
+	} else {
+		bio->bi_io_vec[0].bv_len = len;
+	}
+
+	bio->bi_vcnt = vcnt;
+	bio->bi_size = len;
+	bio->bi_idx = 0;
+
+	return bio;
+}
+
+/*
+ * Clone a portion of a bio chain, starting at the given byte offset
+ * into the first bio in the source chain and continuing for the
+ * number of bytes indicated.  The result is another bio chain of
+ * exactly the given length, or a null pointer on error.
+ *
+ * The bio_src and offset parameters are both in-out.  On entry they
+ * refer to the first source bio and the offset into that bio where
+ * the start of data to be cloned is located.
+ *
+ * On return, bio_src is updated to refer to the bio in the source
+ * chain that contains first un-cloned byte, and *offset will
+ * contain the offset of that byte within that bio.
+ */
+static struct bio *bio_chain_clone_range(struct bio **bio_src,
+					unsigned int *offset,
+					unsigned int len,
+					gfp_t gfpmask)
+{
+	struct bio *bi = *bio_src;
+	unsigned int off = *offset;
+	struct bio *chain = NULL;
+	struct bio **end;
+
+	/* Build up a chain of clone bios up to the limit */
+
+	if (!bi || off >= bi->bi_size || !len)
+		return NULL;		/* Nothing to clone */
+
+	end = &chain;
+	while (len) {
+		unsigned int bi_size;
+		struct bio *bio;
+
+		if (!bi)
+			goto out_err;	/* EINVAL; ran out of bio's */
+		bi_size = min_t(unsigned int, bi->bi_size - off, len);
+		bio = bio_clone_range(bi, off, bi_size, gfpmask);
+		if (!bio)
+			goto out_err;	/* ENOMEM */
+
+		*end = bio;
+		end = &bio->bi_next;
+
+		off += bi_size;
+		if (off == bi->bi_size) {
+			bi = bi->bi_next;
+			off = 0;
+		}
+		len -= bi_size;
+	}
+	*bio_src = bi;
+	*offset = off;
+
+	return chain;
+out_err:
+	bio_chain_put(chain);
 
 	return NULL;
 }
 
@@ -988,8 +1090,9 @@ static int rbd_do_request(struct request *rq,
 		req_data->coll_index = coll_index;
 	}
 
-	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
-		(unsigned long long) ofs, (unsigned long long) len);
+	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
+		object_name, (unsigned long long) ofs,
+		(unsigned long long) len, coll, coll_index);
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
@@ -1019,7 +1122,7 @@ static int rbd_do_request(struct request *rq,
 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_stripe_count = cpu_to_le32(1);
 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
+	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
 	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
 				req, ops);
 	rbd_assert(ret == 0);
@@ -1154,8 +1257,6 @@ done:
 static int rbd_do_op(struct request *rq,
 		     struct rbd_device *rbd_dev,
 		     struct ceph_snap_context *snapc,
-		     u64 snapid,
-		     int opcode, int flags,
 		     u64 ofs, u64 len,
 		     struct bio *bio,
 		     struct rbd_req_coll *coll,
@@ -1167,6 +1268,9 @@ static int rbd_do_op(struct request *rq,
 	int ret;
 	struct ceph_osd_req_op *ops;
 	u32 payload_len;
+	int opcode;
+	int flags;
+	u64 snapid;
 
 	seg_name = rbd_segment_name(rbd_dev, ofs);
 	if (!seg_name)
@@ -1174,7 +1278,18 @@ static int rbd_do_op(struct request *rq,
 	seg_len = rbd_segment_length(rbd_dev, ofs, len);
 	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
 
-	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+	if (rq_data_dir(rq) == WRITE) {
+		opcode = CEPH_OSD_OP_WRITE;
+		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
+		snapid = CEPH_NOSNAP;
+		payload_len = seg_len;
+	} else {
+		opcode = CEPH_OSD_OP_READ;
+		flags = CEPH_OSD_FLAG_READ;
+		snapc = NULL;
+		snapid = rbd_dev->spec->snap_id;
+		payload_len = 0;
+	}
 
 	ret = -ENOMEM;
 	ops = rbd_create_rw_ops(1, opcode, payload_len);
@@ -1202,41 +1317,6 @@ done:
 }
 
 /*
- * Request async osd write
- */
-static int rbd_req_write(struct request *rq,
-			 struct rbd_device *rbd_dev,
-			 struct ceph_snap_context *snapc,
-			 u64 ofs, u64 len,
-			 struct bio *bio,
-			 struct rbd_req_coll *coll,
-			 int coll_index)
-{
-	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
-			 CEPH_OSD_OP_WRITE,
-			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-			 ofs, len, bio, coll, coll_index);
-}
-
-/*
- * Request async osd read
- */
-static int rbd_req_read(struct request *rq,
-			 struct rbd_device *rbd_dev,
-			 u64 snapid,
-			 u64 ofs, u64 len,
-			 struct bio *bio,
-			 struct rbd_req_coll *coll,
-			 int coll_index)
-{
-	return rbd_do_op(rq, rbd_dev, NULL,
-			snapid,
-			CEPH_OSD_OP_READ,
-			CEPH_OSD_FLAG_READ,
-			ofs, len, bio, coll, coll_index);
-}
-
-/*
  * Request sync osd read
  */
 static int rbd_req_sync_read(struct rbd_device *rbd_dev,
@@ -1304,7 +1384,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
 		rbd_dev->header_name, (unsigned long long) notify_id,
 		(unsigned int) opcode);
-	rc = rbd_refresh_header(rbd_dev, &hver);
+	rc = rbd_dev_refresh(rbd_dev, &hver);
 	if (rc)
 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
 			   " update snaps: %d\n", rbd_dev->major, rc);
@@ -1460,18 +1540,16 @@ static void rbd_rq_fn(struct request_queue *q)
 {
 	struct rbd_device *rbd_dev = q->queuedata;
 	struct request *rq;
-	struct bio_pair *bp = NULL;
 
 	while ((rq = blk_fetch_request(q))) {
 		struct bio *bio;
-		struct bio *rq_bio, *next_bio = NULL;
 		bool do_write;
 		unsigned int size;
-		u64 op_size = 0;
 		u64 ofs;
 		int num_segs, cur_seg = 0;
 		struct rbd_req_coll *coll;
 		struct ceph_snap_context *snapc;
+		unsigned int bio_offset;
 
 		dout("fetched request\n");
 
@@ -1483,10 +1561,6 @@ static void rbd_rq_fn(struct request_queue *q)
 
 		/* deduce our operation (read, write) */
 		do_write = (rq_data_dir(rq) == WRITE);
-
-		size = blk_rq_bytes(rq);
-		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
-		rq_bio = rq->bio;
 		if (do_write && rbd_dev->mapping.read_only) {
 			__blk_end_request_all(rq, -EROFS);
 			continue;
@@ -1496,8 +1570,8 @@ static void rbd_rq_fn(struct request_queue *q)
 
 		down_read(&rbd_dev->header_rwsem);
 
-		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
-		    !rbd_dev->mapping.snap_exists) {
+		if (!rbd_dev->exists) {
+			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
 			up_read(&rbd_dev->header_rwsem);
 			dout("request for non-existent snapshot");
 			spin_lock_irq(q->queue_lock);
@@ -1509,6 +1583,10 @@ static void rbd_rq_fn(struct request_queue *q)
 
 		up_read(&rbd_dev->header_rwsem);
 
+		size = blk_rq_bytes(rq);
+		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
+		bio = rq->bio;
+
 		dout("%s 0x%x bytes at 0x%llx\n",
 		     do_write ? "write" : "read",
 		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
@@ -1528,45 +1606,37 @@ static void rbd_rq_fn(struct request_queue *q)
 			continue;
 		}
 
+		bio_offset = 0;
 		do {
-			/* a bio clone to be passed down to OSD req */
+			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
+			unsigned int chain_size;
+			struct bio *bio_chain;
+
+			BUG_ON(limit > (u64) UINT_MAX);
+			chain_size = (unsigned int) limit;
 			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
-			op_size = rbd_segment_length(rbd_dev, ofs, size);
+
 			kref_get(&coll->kref);
-			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
-					      op_size, GFP_ATOMIC);
-			if (!bio) {
-				rbd_coll_end_req_index(rq, coll, cur_seg,
-						       -ENOMEM, op_size);
-				goto next_seg;
-			}
 
+			/* Pass a cloned bio chain via an osd request */
 
-			/* init OSD command: write or read */
-			if (do_write)
-				rbd_req_write(rq, rbd_dev,
-					      snapc,
-					      ofs,
-					      op_size, bio,
-					      coll, cur_seg);
+			bio_chain = bio_chain_clone_range(&bio,
+						&bio_offset, chain_size,
+						GFP_ATOMIC);
+			if (bio_chain)
+				(void) rbd_do_op(rq, rbd_dev, snapc,
+						ofs, chain_size,
+						bio_chain, coll, cur_seg);
 			else
-				rbd_req_read(rq, rbd_dev,
-					     rbd_dev->mapping.snap_id,
-					     ofs,
-					     op_size, bio,
-					     coll, cur_seg);
-
-next_seg:
-			size -= op_size;
-			ofs += op_size;
+				rbd_coll_end_req_index(rq, coll, cur_seg,
+						-ENOMEM, chain_size);
+			size -= chain_size;
+			ofs += chain_size;
 
 			cur_seg++;
-			rq_bio = next_bio;
 		} while (size > 0);
 		kref_put(&coll->kref, rbd_coll_release);
 
-		if (bp)
-			bio_pair_release(bp);
 		spin_lock_irq(q->queue_lock);
 
 		ceph_put_snap_context(snapc);
@@ -1576,28 +1646,47 @@ next_seg:
 /*
  * a queue callback. Makes sure that we don't create a bio that spans across
  * multiple osd objects. One exception would be with a single page bios,
- * which we handle later at bio_chain_clone
+ * which we handle later at bio_chain_clone_range()
  */
 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
 			  struct bio_vec *bvec)
 {
 	struct rbd_device *rbd_dev = q->queuedata;
-	unsigned int chunk_sectors;
-	sector_t sector;
-	unsigned int bio_sectors;
-	int max;
+	sector_t sector_offset;
+	sector_t sectors_per_obj;
+	sector_t obj_sector_offset;
+	int ret;
 
-	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
-	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
-	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
+	/*
+	 * Find how far into its rbd object the partition-relative
+	 * bio start sector is to offset relative to the enclosing
+	 * device.
+	 */
+	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
+	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
+	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
+
+	/*
+	 * Compute the number of bytes from that offset to the end
+	 * of the object.  Account for what's already used by the bio.
+	 */
+	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
+	if (ret > bmd->bi_size)
+		ret -= bmd->bi_size;
+	else
+		ret = 0;
 
-	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
-				 + bio_sectors)) << SECTOR_SHIFT;
-	if (max < 0)
-		max = 0; /* bio_add cannot handle a negative return */
-	if (max <= bvec->bv_len && bio_sectors == 0)
-		return bvec->bv_len;
-	return max;
+	/*
+	 * Don't send back more than was asked for.  And if the bio
+	 * was empty, let the whole thing through because:  "Note
+	 * that a block device *must* allow a single page to be
+	 * added to an empty bio."
+	 */
+	rbd_assert(bvec->bv_len <= PAGE_SIZE);
+	if (ret > (int) bvec->bv_len || !bmd->bi_size)
+		ret = (int) bvec->bv_len;
+
+	return ret;
 }
 
 static void rbd_free_disk(struct rbd_device *rbd_dev)
@@ -1663,13 +1752,13 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
 		ret = -ENXIO;
 		pr_warning("short header read for image %s"
 				" (want %zd got %d)\n",
-			rbd_dev->image_name, size, ret);
+			rbd_dev->spec->image_name, size, ret);
 		goto out_err;
 	}
 	if (!rbd_dev_ondisk_valid(ondisk)) {
 		ret = -ENXIO;
 		pr_warning("invalid header for image %s\n",
-			rbd_dev->image_name);
+			rbd_dev->spec->image_name);
 		goto out_err;
 	}
 
@@ -1707,19 +1796,32 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
 	return ret;
 }
 
-static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
+static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
 {
 	struct rbd_snap *snap;
 	struct rbd_snap *next;
 
 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
-		__rbd_remove_snap_dev(snap);
+		rbd_remove_snap_dev(snap);
+}
+
+static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
+{
+	sector_t size;
+
+	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
+		return;
+
+	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
+	dout("setting size to %llu sectors", (unsigned long long) size);
+	rbd_dev->mapping.size = (u64) size;
+	set_capacity(rbd_dev->disk, size);
 }
 
 /*
  * only read the first part of the ondisk header, without the snaps info
  */
-static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
+static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
 {
 	int ret;
 	struct rbd_image_header h;
@@ -1730,17 +1832,9 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
 
 	down_write(&rbd_dev->header_rwsem);
 
-	/* resized? */
-	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
-		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
-
-		if (size != (sector_t) rbd_dev->mapping.size) {
-			dout("setting size to %llu sectors",
-				(unsigned long long) size);
-			rbd_dev->mapping.size = (u64) size;
-			set_capacity(rbd_dev->disk, size);
-		}
-	}
+	/* Update image size, and check for resize of mapped image */
+	rbd_dev->header.image_size = h.image_size;
+	rbd_update_mapping_size(rbd_dev);
 
 	/* rbd_dev->header.object_prefix shouldn't change */
 	kfree(rbd_dev->header.snap_sizes);
@@ -1768,12 +1862,16 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
 	return ret;
 }
 
-static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
+static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
 {
 	int ret;
 
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	ret = __rbd_refresh_header(rbd_dev, hver);
+	if (rbd_dev->image_format == 1)
+		ret = rbd_dev_v1_refresh(rbd_dev, hver);
+	else
+		ret = rbd_dev_v2_refresh(rbd_dev, hver);
 	mutex_unlock(&ctl_mutex);
 
 	return ret;
@@ -1885,7 +1983,7 @@ static ssize_t rbd_pool_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->pool_name);
+	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
 }
 
 static ssize_t rbd_pool_id_show(struct device *dev,
@@ -1893,7 +1991,8 @@ static ssize_t rbd_pool_id_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%d\n", rbd_dev->pool_id);
+	return sprintf(buf, "%llu\n",
+		(unsigned long long) rbd_dev->spec->pool_id);
 }
 
 static ssize_t rbd_name_show(struct device *dev,
@@ -1901,7 +2000,10 @@ static ssize_t rbd_name_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->image_name);
+	if (rbd_dev->spec->image_name)
+		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
+
+	return sprintf(buf, "(unknown)\n");
 }
 
 static ssize_t rbd_image_id_show(struct device *dev,
@@ -1909,7 +2011,7 @@ static ssize_t rbd_image_id_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->image_id);
+	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
 }
 
 /*
@@ -1922,7 +2024,50 @@ static ssize_t rbd_snap_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
-	return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
+	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
+}
+
+/*
+ * For an rbd v2 image, shows the pool id, image id, and snapshot id
+ * for the parent image.  If there is no parent, simply shows
+ * "(no parent image)".
+ */
+static ssize_t rbd_parent_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+	struct rbd_spec *spec = rbd_dev->parent_spec;
+	int count;
+	char *bufp = buf;
+
+	if (!spec)
+		return sprintf(buf, "(no parent image)\n");
+
+	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
+			(unsigned long long) spec->pool_id, spec->pool_name);
+	if (count < 0)
+		return count;
+	bufp += count;
+
+	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
+			spec->image_name ? spec->image_name : "(unknown)");
+	if (count < 0)
+		return count;
+	bufp += count;
+
+	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
+			(unsigned long long) spec->snap_id, spec->snap_name);
+	if (count < 0)
+		return count;
+	bufp += count;
+
+	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
+	if (count < 0)
+		return count;
+	bufp += count;
+
+	return (ssize_t) (bufp - buf);
 }
 
 static ssize_t rbd_image_refresh(struct device *dev,
@@ -1933,7 +2078,7 @@ static ssize_t rbd_image_refresh(struct device *dev,
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 	int ret;
 
-	ret = rbd_refresh_header(rbd_dev, NULL);
+	ret = rbd_dev_refresh(rbd_dev, NULL);
 
 	return ret < 0 ? ret : size;
 }
@@ -1948,6 +2093,7 @@ static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
 static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
+static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
 
 static struct attribute *rbd_attrs[] = {
 	&dev_attr_size.attr,
@@ -1959,6 +2105,7 @@ static struct attribute *rbd_attrs[] = {
 	&dev_attr_name.attr,
 	&dev_attr_image_id.attr,
 	&dev_attr_current_snap.attr,
+	&dev_attr_parent.attr,
 	&dev_attr_refresh.attr,
 	NULL
 };
@@ -2047,6 +2194,74 @@ static struct device_type rbd_snap_device_type = {
 	.release	= rbd_snap_dev_release,
 };
 
+static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
+{
+	kref_get(&spec->kref);
+
+	return spec;
+}
+
+static void rbd_spec_free(struct kref *kref);
+static void rbd_spec_put(struct rbd_spec *spec)
+{
+	if (spec)
+		kref_put(&spec->kref, rbd_spec_free);
+}
+
+static struct rbd_spec *rbd_spec_alloc(void)
+{
+	struct rbd_spec *spec;
+
+	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
+	if (!spec)
+		return NULL;
+	kref_init(&spec->kref);
+
+	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */
+
+	return spec;
+}
+
+static void rbd_spec_free(struct kref *kref)
+{
+	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
+
+	kfree(spec->pool_name);
+	kfree(spec->image_id);
+	kfree(spec->image_name);
+	kfree(spec->snap_name);
+	kfree(spec);
+}
+
+struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
+				struct rbd_spec *spec)
+{
+	struct rbd_device *rbd_dev;
+
+	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
+	if (!rbd_dev)
+		return NULL;
+
+	spin_lock_init(&rbd_dev->lock);
+	INIT_LIST_HEAD(&rbd_dev->node);
+	INIT_LIST_HEAD(&rbd_dev->snaps);
+	init_rwsem(&rbd_dev->header_rwsem);
+
+	rbd_dev->spec = spec;
+	rbd_dev->rbd_client = rbdc;
+
+	return rbd_dev;
+}
+
+static void rbd_dev_destroy(struct rbd_device *rbd_dev)
+{
+	rbd_spec_put(rbd_dev->parent_spec);
+	kfree(rbd_dev->header_name);
+	rbd_put_client(rbd_dev->rbd_client);
+	rbd_spec_put(rbd_dev->spec);
+	kfree(rbd_dev);
+}
+
 static bool rbd_snap_registered(struct rbd_snap *snap)
 {
 	bool ret = snap->dev.type == &rbd_snap_device_type;
@@ -2057,7 +2272,7 @@ static bool rbd_snap_registered(struct rbd_snap *snap)
 	return ret;
 }
 
-static void __rbd_remove_snap_dev(struct rbd_snap *snap)
+static void rbd_remove_snap_dev(struct rbd_snap *snap)
 {
 	list_del(&snap->node);
 	if (device_is_registered(&snap->dev))
@@ -2073,7 +2288,7 @@ static int rbd_register_snap_dev(struct rbd_snap *snap,
 	dev->type = &rbd_snap_device_type;
 	dev->parent = parent;
 	dev->release = rbd_snap_dev_release;
-	dev_set_name(dev, "snap_%s", snap->name);
+	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
 
 	ret = device_register(dev);
@@ -2189,6 +2404,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
+	ret = 0;    /* rbd_req_sync_exec() can return positive */
 
 	p = reply_buf;
 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
@@ -2216,6 +2432,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 		__le64 features;
 		__le64 incompat;
 	} features_buf = { 0 };
+	u64 incompat;
 	int ret;
 
 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
@@ -2226,6 +2443,11 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
 	if (ret < 0)
 		return ret;
+
+	incompat = le64_to_cpu(features_buf.incompat);
+	if (incompat & ~RBD_FEATURES_ALL)
+		return -ENXIO;
+
 	*snap_features = le64_to_cpu(features_buf.features);
 
 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
@@ -2242,6 +2464,183 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
 			&rbd_dev->header.features);
 }
 
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+{
+	struct rbd_spec *parent_spec;
+	size_t size;
+	void *reply_buf = NULL;
+	__le64 snapid;
+	void *p;
+	void *end;
+	char *image_id;
+	u64 overlap;
+	size_t len = 0;
+	int ret;
+
+	parent_spec = rbd_spec_alloc();
+	if (!parent_spec)
+		return -ENOMEM;
+
+	size = sizeof (__le64) +				/* pool_id */
+		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
+		sizeof (__le64) +				/* snap_id */
+		sizeof (__le64);				/* overlap */
+	reply_buf = kmalloc(size, GFP_KERNEL);
+	if (!reply_buf) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	snapid = cpu_to_le64(CEPH_NOSNAP);
+	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
+				"rbd", "get_parent",
+				(char *) &snapid, sizeof (snapid),
+				(char *) reply_buf, size,
+				CEPH_OSD_FLAG_READ, NULL);
+	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
+	if (ret < 0)
+		goto out_err;
+
+	ret = -ERANGE;
+	p = reply_buf;
+	end = (char *) reply_buf + size;
+	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
+	if (parent_spec->pool_id == CEPH_NOPOOL)
+		goto out;	/* No parent?  No problem. */
+
+	image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
+	if (IS_ERR(image_id)) {
+		ret = PTR_ERR(image_id);
+		goto out_err;
+	}
+	parent_spec->image_id = image_id;
+	parent_spec->image_id_len = len;
+	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
+	ceph_decode_64_safe(&p, end, overlap, out_err);
+
+	rbd_dev->parent_overlap = overlap;
+	rbd_dev->parent_spec = parent_spec;
+	parent_spec = NULL;	/* rbd_dev now owns this */
+out:
+	ret = 0;
+out_err:
+	kfree(reply_buf);
+	rbd_spec_put(parent_spec);
+
+	return ret;
+}
+
+static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
+{
+	size_t image_id_size;
+	char *image_id;
+	void *p;
+	void *end;
+	size_t size;
+	void *reply_buf = NULL;
+	size_t len = 0;
+	char *image_name = NULL;
+	int ret;
+
+	rbd_assert(!rbd_dev->spec->image_name);
+
+	image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
+	image_id = kmalloc(image_id_size, GFP_KERNEL);
+	if (!image_id)
+		return NULL;
+
+	p = image_id;
+	end = (char *) image_id + image_id_size;
+	ceph_encode_string(&p, end, rbd_dev->spec->image_id,
+				(u32) rbd_dev->spec->image_id_len);
+
+	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
+	reply_buf = kmalloc(size, GFP_KERNEL);
+	if (!reply_buf)
+		goto out;
+
+	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
+				"rbd", "dir_get_name",
+				image_id, image_id_size,
+				(char *) reply_buf, size,
+				CEPH_OSD_FLAG_READ, NULL);
+	if (ret < 0)
+		goto out;
+	p = reply_buf;
+	end = (char *) reply_buf + size;
+	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
+	if (IS_ERR(image_name))
+		image_name = NULL;
+	else
+		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
+out:
+	kfree(reply_buf);
+	kfree(image_id);
+
+	return image_name;
+}
+
+/*
+ * When a parent image gets probed, we only have the pool, image,
+ * and snapshot ids but not the names of any of them.  This call
+ * is made later to fill in those names.  It has to be done after
+ * rbd_dev_snaps_update() has completed because some of the
+ * information (in particular, snapshot name) is not available
+ * until then.
+ */
+static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc;
+	const char *name;
+	void *reply_buf = NULL;
+	int ret;
+
+	if (rbd_dev->spec->pool_name)
+		return 0;	/* Already have the names */
+
+	/* Look up the pool name */
+
+	osdc = &rbd_dev->rbd_client->client->osdc;
+	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
+	if (!name)
+		return -EIO;	/* pool id too large (>= 2^31) */
+
+	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
+	if (!rbd_dev->spec->pool_name)
+		return -ENOMEM;
+
+	/* Fetch the image name; tolerate failure here */
+
+	name = rbd_dev_image_name(rbd_dev);
+	if (name) {
+		rbd_dev->spec->image_name_len = strlen(name);
+		rbd_dev->spec->image_name = (char *) name;
+	} else {
+		pr_warning(RBD_DRV_NAME "%d "
+			"unable to get image name for image id %s\n",
+			rbd_dev->major, rbd_dev->spec->image_id);
+	}
+
+	/* Look up the snapshot name. */
+
+	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
+	if (!name) {
+		ret = -EIO;
+		goto out_err;
+	}
+	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
+	if (!rbd_dev->spec->snap_name)
+		goto out_err;
+
+	return 0;
+out_err:
+	kfree(reply_buf);
+	kfree(rbd_dev->spec->pool_name);
+	rbd_dev->spec->pool_name = NULL;
+
+	return ret;
+}
+
 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
 {
 	size_t size;
@@ -2328,7 +2727,6 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
 	int ret;
 	void *p;
 	void *end;
-	size_t snap_name_len;
 	char *snap_name;
 
 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
@@ -2348,9 +2746,7 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
 
 	p = reply_buf;
 	end = (char *) reply_buf + size;
-	snap_name_len = 0;
-	snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
-						GFP_KERNEL);
+	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
 	if (IS_ERR(snap_name)) {
 		ret = PTR_ERR(snap_name);
 		goto out;
@@ -2397,6 +2793,41 @@ static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
 	return ERR_PTR(-EINVAL);
 }
 
+static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
+{
+	int ret;
+	__u8 obj_order;
+
+	down_write(&rbd_dev->header_rwsem);
+
+	/* Grab old order first, to see if it changes */
+
+	obj_order = rbd_dev->header.obj_order,
+	ret = rbd_dev_v2_image_size(rbd_dev);
+	if (ret)
+		goto out;
+	if (rbd_dev->header.obj_order != obj_order) {
+		ret = -EIO;
+		goto out;
+	}
+	rbd_update_mapping_size(rbd_dev);
+
+	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
+	dout("rbd_dev_v2_snap_context returned %d\n", ret);
+	if (ret)
+		goto out;
+	ret = rbd_dev_snaps_update(rbd_dev);
+	dout("rbd_dev_snaps_update returned %d\n", ret);
+	if (ret)
+		goto out;
+	ret = rbd_dev_snaps_register(rbd_dev);
+	dout("rbd_dev_snaps_register returned %d\n", ret);
+out:
+	up_write(&rbd_dev->header_rwsem);
+
+	return ret;
+}
+
 /*
  * Scan the rbd device's current snapshot list and compare it to the
  * newly-received snapshot context.  Remove any existing snapshots
@@ -2436,12 +2867,12 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 
 			/* Existing snapshot not in the new snap context */
 
-			if (rbd_dev->mapping.snap_id == snap->id)
-				rbd_dev->mapping.snap_exists = false;
-			__rbd_remove_snap_dev(snap);
+			if (rbd_dev->spec->snap_id == snap->id)
+				rbd_dev->exists = false;
+			rbd_remove_snap_dev(snap);
 			dout("%ssnap id %llu has been removed\n",
-				rbd_dev->mapping.snap_id == snap->id ?
+				rbd_dev->spec->snap_id == snap->id ?
 								"mapped " : "",
 				(unsigned long long) snap->id);
 
 			/* Done with this list entry; advance */
@@ -2559,7 +2990,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 	do {
 		ret = rbd_req_sync_watch(rbd_dev);
 		if (ret == -ERANGE) {
-			rc = rbd_refresh_header(rbd_dev, NULL);
+			rc = rbd_dev_refresh(rbd_dev, NULL);
 			if (rc < 0)
 				return rc;
 		}
@@ -2621,8 +3052,8 @@ static void rbd_dev_id_put(struct rbd_device *rbd_dev)
 		struct rbd_device *rbd_dev;
 
 		rbd_dev = list_entry(tmp, struct rbd_device, node);
-		if (rbd_id > max_id)
-			max_id = rbd_id;
+		if (rbd_dev->dev_id > max_id)
+			max_id = rbd_dev->dev_id;
 	}
 	spin_unlock(&rbd_dev_list_lock);
 
@@ -2722,73 +3153,140 @@ static inline char *dup_token(const char **buf, size_t *lenp)
2722} 3153}
2723 3154
2724/* 3155/*
2725 * This fills in the pool_name, image_name, image_name_len, rbd_dev, 3156 * Parse the options provided for an "rbd add" (i.e., rbd image
2726 * rbd_md_name, and name fields of the given rbd_dev, based on the 3157 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
2727 * list of monitor addresses and other options provided via 3158 * and the data written is passed here via a NUL-terminated buffer.
2728 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated 3159 * Returns 0 if successful or an error code otherwise.
2729 * copy of the snapshot name to map if successful, or a 3160 *
2730 * pointer-coded error otherwise. 3161 * The information extracted from these options is recorded in
3162 * the other parameters which return dynamically-allocated
3163 * structures:
3164 * ceph_opts
3165 * The address of a pointer that will refer to a ceph options
3166 * structure. Caller must release the returned pointer using
3167 * ceph_destroy_options() when it is no longer needed.
3168 * rbd_opts
3169 * Address of an rbd options pointer. Fully initialized by
3170 * this function; caller must release with kfree().
3171 * spec
3172 * Address of an rbd image specification pointer. Fully
3173 * initialized by this function based on parsed options.
3174 * Caller must release with rbd_spec_put().
2731 * 3175 *
2732 * Note: rbd_dev is assumed to have been initially zero-filled. 3176 * The options passed take this form:
3177 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3178 * where:
3179 * <mon_addrs>
3180 * A comma-separated list of one or more monitor addresses.
3181 * A monitor address is an ip address, optionally followed
3182 * by a port number (separated by a colon).
3183 * I.e.: ip1[:port1][,ip2[:port2]...]
3184 * <options>
3185 * A comma-separated list of ceph and/or rbd options.
3186 * <pool_name>
3187 * The name of the rados pool containing the rbd image.
3188 * <image_name>
3189 * The name of the image in that pool to map.
3190 * <snap_name>
3191 * An optional snapshot name. If provided, the mapping will
3192 * present data from the image at the time that snapshot was
3193 * created. The image head is used if no snapshot name is
3194 * provided. Snapshot mappings are always read-only.
2733 */ 3195 */
2734static char *rbd_add_parse_args(struct rbd_device *rbd_dev, 3196static int rbd_add_parse_args(const char *buf,
2735 const char *buf, 3197 struct ceph_options **ceph_opts,
2736 const char **mon_addrs, 3198 struct rbd_options **opts,
2737 size_t *mon_addrs_size, 3199 struct rbd_spec **rbd_spec)
2738 char *options,
2739 size_t options_size)
2740{ 3200{
2741 size_t len; 3201 size_t len;
2742 char *err_ptr = ERR_PTR(-EINVAL); 3202 char *options;
2743 char *snap_name; 3203 const char *mon_addrs;
3204 size_t mon_addrs_size;
3205 struct rbd_spec *spec = NULL;
3206 struct rbd_options *rbd_opts = NULL;
3207 struct ceph_options *copts;
3208 int ret;
2744 3209
2745 /* The first four tokens are required */ 3210 /* The first four tokens are required */
2746 3211
2747 len = next_token(&buf); 3212 len = next_token(&buf);
2748 if (!len) 3213 if (!len)
2749 return err_ptr; 3214 return -EINVAL; /* Missing monitor address(es) */
2750 *mon_addrs_size = len + 1; 3215 mon_addrs = buf;
2751 *mon_addrs = buf; 3216 mon_addrs_size = len + 1;
2752
2753 buf += len; 3217 buf += len;
2754 3218
2755 len = copy_token(&buf, options, options_size); 3219 ret = -EINVAL;
2756 if (!len || len >= options_size) 3220 options = dup_token(&buf, NULL);
2757 return err_ptr; 3221 if (!options)
3222 return -ENOMEM;
3223 if (!*options)
3224 goto out_err; /* Missing options */
2758 3225
2759 err_ptr = ERR_PTR(-ENOMEM); 3226 spec = rbd_spec_alloc();
2760 rbd_dev->pool_name = dup_token(&buf, NULL); 3227 if (!spec)
2761 if (!rbd_dev->pool_name) 3228 goto out_mem;
2762 goto out_err;
2763 3229
2764 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len); 3230 spec->pool_name = dup_token(&buf, NULL);
2765 if (!rbd_dev->image_name) 3231 if (!spec->pool_name)
2766 goto out_err; 3232 goto out_mem;
3233 if (!*spec->pool_name)
3234 goto out_err; /* Missing pool name */
2767 3235
2768 /* Snapshot name is optional */ 3236 spec->image_name = dup_token(&buf, &spec->image_name_len);
3237 if (!spec->image_name)
3238 goto out_mem;
3239 if (!*spec->image_name)
3240 goto out_err; /* Missing image name */
3241
3242 /*
3243 * Snapshot name is optional; default is to use "-"
3244 * (indicating the head/no snapshot).
3245 */
2769 len = next_token(&buf); 3246 len = next_token(&buf);
2770 if (!len) { 3247 if (!len) {
2771 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 3248 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2772 len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 3249 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
2773 } 3250 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
2774 snap_name = kmalloc(len + 1, GFP_KERNEL); 3251 ret = -ENAMETOOLONG;
2775 if (!snap_name)
2776 goto out_err; 3252 goto out_err;
2777 memcpy(snap_name, buf, len); 3253 }
2778 *(snap_name + len) = '\0'; 3254 spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
3255 if (!spec->snap_name)
3256 goto out_mem;
3257 memcpy(spec->snap_name, buf, len);
3258 *(spec->snap_name + len) = '\0';
2779 3259
2780dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len); 3260 /* Initialize all rbd options to the defaults */
2781 3261
2782 return snap_name; 3262 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3263 if (!rbd_opts)
3264 goto out_mem;
3265
3266 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3267
3268 copts = ceph_parse_options(options, mon_addrs,
3269 mon_addrs + mon_addrs_size - 1,
3270 parse_rbd_opts_token, rbd_opts);
3271 if (IS_ERR(copts)) {
3272 ret = PTR_ERR(copts);
3273 goto out_err;
3274 }
3275 kfree(options);
2783 3276
3277 *ceph_opts = copts;
3278 *opts = rbd_opts;
3279 *rbd_spec = spec;
3280
3281 return 0;
3282out_mem:
3283 ret = -ENOMEM;
2784out_err: 3284out_err:
2785 kfree(rbd_dev->image_name); 3285 kfree(rbd_opts);
2786 rbd_dev->image_name = NULL; 3286 rbd_spec_put(spec);
2787 rbd_dev->image_name_len = 0; 3287 kfree(options);
2788 kfree(rbd_dev->pool_name);
2789 rbd_dev->pool_name = NULL;
2790 3288
2791 return err_ptr; 3289 return ret;
2792} 3290}
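
For illustration, here is what a complete, well-formed buffer written to /sys/bus/rbd/add could look like given the format documented above rbd_add_parse_args() (the addresses, key, and names are all hypothetical):

        1.2.3.4:6789,1.2.3.5:6789 name=admin,secret=<base64-key> mypool myimage mysnap

The first token is <mon_addrs>, the second is the <options> string handed to ceph_parse_options() (with rbd-specific tokens diverted to parse_rbd_opts_token()), "mypool" and "myimage" select the image, and the trailing "mysnap" is the optional snapshot name; omitting it maps the image head.
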
2793 3291
2794/* 3292/*
@@ -2814,14 +3312,22 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2814 void *p; 3312 void *p;
2815 3313
2816 /* 3314 /*
3315 * When probing a parent image, the image id is already
3316 * known (and the image name likely is not). There's no
3317 * need to fetch the image id again in this case.
3318 */
3319 if (rbd_dev->spec->image_id)
3320 return 0;
3321
3322 /*
2817 * First, see if the format 2 image id file exists, and if 3323 * First, see if the format 2 image id file exists, and if
2818 * so, get the image's persistent id from it. 3324 * so, get the image's persistent id from it.
2819 */ 3325 */
2820 size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len; 3326 size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
2821 object_name = kmalloc(size, GFP_NOIO); 3327 object_name = kmalloc(size, GFP_NOIO);
2822 if (!object_name) 3328 if (!object_name)
2823 return -ENOMEM; 3329 return -ENOMEM;
2824 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name); 3330 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
2825 dout("rbd id object name is %s\n", object_name); 3331 dout("rbd id object name is %s\n", object_name);
2826 3332
2827 /* Response will be an encoded string, which includes a length */ 3333 /* Response will be an encoded string, which includes a length */
@@ -2841,17 +3347,18 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2841 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 3347 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2842 if (ret < 0) 3348 if (ret < 0)
2843 goto out; 3349 goto out;
3350 ret = 0; /* rbd_req_sync_exec() can return positive */
2844 3351
2845 p = response; 3352 p = response;
2846 rbd_dev->image_id = ceph_extract_encoded_string(&p, 3353 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
2847 p + RBD_IMAGE_ID_LEN_MAX, 3354 p + RBD_IMAGE_ID_LEN_MAX,
2848 &rbd_dev->image_id_len, 3355 &rbd_dev->spec->image_id_len,
2849 GFP_NOIO); 3356 GFP_NOIO);
2850 if (IS_ERR(rbd_dev->image_id)) { 3357 if (IS_ERR(rbd_dev->spec->image_id)) {
2851 ret = PTR_ERR(rbd_dev->image_id); 3358 ret = PTR_ERR(rbd_dev->spec->image_id);
2852 rbd_dev->image_id = NULL; 3359 rbd_dev->spec->image_id = NULL;
2853 } else { 3360 } else {
2854 dout("image_id is %s\n", rbd_dev->image_id); 3361 dout("image_id is %s\n", rbd_dev->spec->image_id);
2855 } 3362 }
2856out: 3363out:
2857 kfree(response); 3364 kfree(response);
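
For reference, the encoding that ceph_extract_encoded_string() undoes here is simply a little-endian 32-bit length followed by that many bytes. A minimal userspace sketch of the same decode, assuming a little-endian host (an illustration only, not the kernel helper):

        #include <stddef.h>
        #include <stdint.h>
        #include <stdlib.h>
        #include <string.h>

        /* Decode one length-prefixed string from [*p, end); NULL on bad input. */
        static char *extract_encoded_string(const uint8_t **p, const uint8_t *end,
                                            size_t *lenp)
        {
                uint32_t len;
                char *s;

                if (end - *p < (ptrdiff_t) sizeof(len))
                        return NULL;                    /* no room for the length */
                memcpy(&len, *p, sizeof(len));          /* wire format is __le32 */
                *p += sizeof(len);
                if ((size_t)(end - *p) < len)
                        return NULL;                    /* truncated payload */
                s = malloc((size_t) len + 1);
                if (!s)
                        return NULL;
                memcpy(s, *p, len);
                s[len] = '\0';                          /* callers get a C string */
                *p += len;
                if (lenp)
                        *lenp = len;
                return s;
        }
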
@@ -2867,26 +3374,33 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2867 3374
2868 /* Version 1 images have no id; empty string is used */ 3375 /* Version 1 images have no id; empty string is used */
2869 3376
2870 rbd_dev->image_id = kstrdup("", GFP_KERNEL); 3377 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
2871 if (!rbd_dev->image_id) 3378 if (!rbd_dev->spec->image_id)
2872 return -ENOMEM; 3379 return -ENOMEM;
2873 rbd_dev->image_id_len = 0; 3380 rbd_dev->spec->image_id_len = 0;
2874 3381
2875 /* Record the header object name for this rbd image. */ 3382 /* Record the header object name for this rbd image. */
2876 3383
2877 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX); 3384 size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
2878 rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3385 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2879 if (!rbd_dev->header_name) { 3386 if (!rbd_dev->header_name) {
2880 ret = -ENOMEM; 3387 ret = -ENOMEM;
2881 goto out_err; 3388 goto out_err;
2882 } 3389 }
2883 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); 3390 sprintf(rbd_dev->header_name, "%s%s",
3391 rbd_dev->spec->image_name, RBD_SUFFIX);
2884 3392
2885 /* Populate rbd image metadata */ 3393 /* Populate rbd image metadata */
2886 3394
2887 ret = rbd_read_header(rbd_dev, &rbd_dev->header); 3395 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
2888 if (ret < 0) 3396 if (ret < 0)
2889 goto out_err; 3397 goto out_err;
3398
3399 /* Version 1 images have no parent (no layering) */
3400
3401 rbd_dev->parent_spec = NULL;
3402 rbd_dev->parent_overlap = 0;
3403
2890 rbd_dev->image_format = 1; 3404 rbd_dev->image_format = 1;
2891 3405
2892 dout("discovered version 1 image, header name is %s\n", 3406 dout("discovered version 1 image, header name is %s\n",
@@ -2897,8 +3411,8 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2897out_err: 3411out_err:
2898 kfree(rbd_dev->header_name); 3412 kfree(rbd_dev->header_name);
2899 rbd_dev->header_name = NULL; 3413 rbd_dev->header_name = NULL;
2900 kfree(rbd_dev->image_id); 3414 kfree(rbd_dev->spec->image_id);
2901 rbd_dev->image_id = NULL; 3415 rbd_dev->spec->image_id = NULL;
2902 3416
2903 return ret; 3417 return ret;
2904} 3418}
@@ -2913,12 +3427,12 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
2913 * Image id was filled in by the caller. Record the header 3427 * Image id was filled in by the caller. Record the header
2914 * object name for this rbd image. 3428 * object name for this rbd image.
2915 */ 3429 */
2916 size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len; 3430 size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
2917 rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3431 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2918 if (!rbd_dev->header_name) 3432 if (!rbd_dev->header_name)
2919 return -ENOMEM; 3433 return -ENOMEM;
2920 sprintf(rbd_dev->header_name, "%s%s", 3434 sprintf(rbd_dev->header_name, "%s%s",
2921 RBD_HEADER_PREFIX, rbd_dev->image_id); 3435 RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
2922 3436
2923 /* Get the size and object order for the image */ 3437 /* Get the size and object order for the image */
2924 3438
@@ -2932,12 +3446,20 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
2932 if (ret < 0) 3446 if (ret < 0)
2933 goto out_err; 3447 goto out_err;
2934 3448
2935 /* Get the features for the image */ 3449 /* Get and check the features for the image */
2936 3450
2937 ret = rbd_dev_v2_features(rbd_dev); 3451 ret = rbd_dev_v2_features(rbd_dev);
2938 if (ret < 0) 3452 if (ret < 0)
2939 goto out_err; 3453 goto out_err;
2940 3454
3455 /* If the image supports layering, get the parent info */
3456
3457 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
3458 ret = rbd_dev_v2_parent_info(rbd_dev);
3459 if (ret < 0)
3460 goto out_err;
3461 }
3462
2941 /* crypto and compression type aren't (yet) supported for v2 images */ 3463 /* crypto and compression type aren't (yet) supported for v2 images */
2942 3464
2943 rbd_dev->header.crypt_type = 0; 3465 rbd_dev->header.crypt_type = 0;
@@ -2955,8 +3477,11 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
2955 dout("discovered version 2 image, header name is %s\n", 3477 dout("discovered version 2 image, header name is %s\n",
2956 rbd_dev->header_name); 3478 rbd_dev->header_name);
2957 3479
2958 return -ENOTSUPP; 3480 return 0;
2959out_err: 3481out_err:
3482 rbd_dev->parent_overlap = 0;
3483 rbd_spec_put(rbd_dev->parent_spec);
3484 rbd_dev->parent_spec = NULL;
2960 kfree(rbd_dev->header_name); 3485 kfree(rbd_dev->header_name);
2961 rbd_dev->header_name = NULL; 3486 rbd_dev->header_name = NULL;
2962 kfree(rbd_dev->header.object_prefix); 3487 kfree(rbd_dev->header.object_prefix);
@@ -2965,91 +3490,22 @@ out_err:
2965 return ret; 3490 return ret;
2966} 3491}
2967 3492
2968/* 3493static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
2969 * Probe for the existence of the header object for the given rbd
2970 * device. For format 2 images this includes determining the image
2971 * id.
2972 */
2973static int rbd_dev_probe(struct rbd_device *rbd_dev)
2974{ 3494{
2975 int ret; 3495 int ret;
2976 3496
2977 /* 3497 /* no need to lock here, as rbd_dev is not registered yet */
2978 * Get the id from the image id object. If it's not a 3498 ret = rbd_dev_snaps_update(rbd_dev);
2979 * format 2 image, we'll get ENOENT back, and we'll assume
2980 * it's a format 1 image.
2981 */
2982 ret = rbd_dev_image_id(rbd_dev);
2983 if (ret)
2984 ret = rbd_dev_v1_probe(rbd_dev);
2985 else
2986 ret = rbd_dev_v2_probe(rbd_dev);
2987 if (ret) 3499 if (ret)
2988 dout("probe failed, returning %d\n", ret); 3500 return ret;
2989
2990 return ret;
2991}
2992
2993static ssize_t rbd_add(struct bus_type *bus,
2994 const char *buf,
2995 size_t count)
2996{
2997 char *options;
2998 struct rbd_device *rbd_dev = NULL;
2999 const char *mon_addrs = NULL;
3000 size_t mon_addrs_size = 0;
3001 struct ceph_osd_client *osdc;
3002 int rc = -ENOMEM;
3003 char *snap_name;
3004
3005 if (!try_module_get(THIS_MODULE))
3006 return -ENODEV;
3007
3008 options = kmalloc(count, GFP_KERNEL);
3009 if (!options)
3010 goto err_out_mem;
3011 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
3012 if (!rbd_dev)
3013 goto err_out_mem;
3014
3015 /* static rbd_device initialization */
3016 spin_lock_init(&rbd_dev->lock);
3017 INIT_LIST_HEAD(&rbd_dev->node);
3018 INIT_LIST_HEAD(&rbd_dev->snaps);
3019 init_rwsem(&rbd_dev->header_rwsem);
3020
3021 /* parse add command */
3022 snap_name = rbd_add_parse_args(rbd_dev, buf,
3023 &mon_addrs, &mon_addrs_size, options, count);
3024 if (IS_ERR(snap_name)) {
3025 rc = PTR_ERR(snap_name);
3026 goto err_out_mem;
3027 }
3028
3029 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
3030 if (rc < 0)
3031 goto err_out_args;
3032
3033 /* pick the pool */
3034 osdc = &rbd_dev->rbd_client->client->osdc;
3035 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
3036 if (rc < 0)
3037 goto err_out_client;
3038 rbd_dev->pool_id = rc;
3039
3040 rc = rbd_dev_probe(rbd_dev);
3041 if (rc < 0)
3042 goto err_out_client;
3043 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3044 3501
3045 /* no need to lock here, as rbd_dev is not registered yet */ 3502 ret = rbd_dev_probe_update_spec(rbd_dev);
3046 rc = rbd_dev_snaps_update(rbd_dev); 3503 if (ret)
3047 if (rc) 3504 goto err_out_snaps;
3048 goto err_out_header;
3049 3505
3050 rc = rbd_dev_set_mapping(rbd_dev, snap_name); 3506 ret = rbd_dev_set_mapping(rbd_dev);
3051 if (rc) 3507 if (ret)
3052 goto err_out_header; 3508 goto err_out_snaps;
3053 3509
3054 /* generate unique id: find highest unique id, add one */ 3510 /* generate unique id: find highest unique id, add one */
3055 rbd_dev_id_get(rbd_dev); 3511 rbd_dev_id_get(rbd_dev);
@@ -3061,34 +3517,33 @@ static ssize_t rbd_add(struct bus_type *bus,
3061 3517
3062 /* Get our block major device number. */ 3518 /* Get our block major device number. */
3063 3519
3064 rc = register_blkdev(0, rbd_dev->name); 3520 ret = register_blkdev(0, rbd_dev->name);
3065 if (rc < 0) 3521 if (ret < 0)
3066 goto err_out_id; 3522 goto err_out_id;
3067 rbd_dev->major = rc; 3523 rbd_dev->major = ret;
3068 3524
3069 /* Set up the blkdev mapping. */ 3525 /* Set up the blkdev mapping. */
3070 3526
3071 rc = rbd_init_disk(rbd_dev); 3527 ret = rbd_init_disk(rbd_dev);
3072 if (rc) 3528 if (ret)
3073 goto err_out_blkdev; 3529 goto err_out_blkdev;
3074 3530
3075 rc = rbd_bus_add_dev(rbd_dev); 3531 ret = rbd_bus_add_dev(rbd_dev);
3076 if (rc) 3532 if (ret)
3077 goto err_out_disk; 3533 goto err_out_disk;
3078 3534
3079 /* 3535 /*
3080 * At this point cleanup in the event of an error is the job 3536 * At this point cleanup in the event of an error is the job
3081 * of the sysfs code (initiated by rbd_bus_del_dev()). 3537 * of the sysfs code (initiated by rbd_bus_del_dev()).
3082 */ 3538 */
3083
3084 down_write(&rbd_dev->header_rwsem); 3539 down_write(&rbd_dev->header_rwsem);
3085 rc = rbd_dev_snaps_register(rbd_dev); 3540 ret = rbd_dev_snaps_register(rbd_dev);
3086 up_write(&rbd_dev->header_rwsem); 3541 up_write(&rbd_dev->header_rwsem);
3087 if (rc) 3542 if (ret)
3088 goto err_out_bus; 3543 goto err_out_bus;
3089 3544
3090 rc = rbd_init_watch_dev(rbd_dev); 3545 ret = rbd_init_watch_dev(rbd_dev);
3091 if (rc) 3546 if (ret)
3092 goto err_out_bus; 3547 goto err_out_bus;
3093 3548
3094 /* Everything's ready. Announce the disk to the world. */ 3549 /* Everything's ready. Announce the disk to the world. */
@@ -3098,37 +3553,119 @@ static ssize_t rbd_add(struct bus_type *bus,
3098 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 3553 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3099 (unsigned long long) rbd_dev->mapping.size); 3554 (unsigned long long) rbd_dev->mapping.size);
3100 3555
3101 return count; 3556 return ret;
3102
3103err_out_bus: 3557err_out_bus:
3104 /* this will also clean up rest of rbd_dev stuff */ 3558 /* this will also clean up rest of rbd_dev stuff */
3105 3559
3106 rbd_bus_del_dev(rbd_dev); 3560 rbd_bus_del_dev(rbd_dev);
3107 kfree(options);
3108 return rc;
3109 3561
3562 return ret;
3110err_out_disk: 3563err_out_disk:
3111 rbd_free_disk(rbd_dev); 3564 rbd_free_disk(rbd_dev);
3112err_out_blkdev: 3565err_out_blkdev:
3113 unregister_blkdev(rbd_dev->major, rbd_dev->name); 3566 unregister_blkdev(rbd_dev->major, rbd_dev->name);
3114err_out_id: 3567err_out_id:
3115 rbd_dev_id_put(rbd_dev); 3568 rbd_dev_id_put(rbd_dev);
3116err_out_header: 3569err_out_snaps:
3117 rbd_header_free(&rbd_dev->header); 3570 rbd_remove_all_snaps(rbd_dev);
3571
3572 return ret;
3573}
3574
3575/*
3576 * Probe for the existence of the header object for the given rbd
3577 * device. For format 2 images this includes determining the image
3578 * id.
3579 */
3580static int rbd_dev_probe(struct rbd_device *rbd_dev)
3581{
3582 int ret;
3583
3584 /*
3585 * Get the id from the image id object. If it's not a
3586 * format 2 image, we'll get ENOENT back, and we'll assume
3587 * it's a format 1 image.
3588 */
3589 ret = rbd_dev_image_id(rbd_dev);
3590 if (ret)
3591 ret = rbd_dev_v1_probe(rbd_dev);
3592 else
3593 ret = rbd_dev_v2_probe(rbd_dev);
3594 if (ret) {
3595 dout("probe failed, returning %d\n", ret);
3596
3597 return ret;
3598 }
3599
3600 ret = rbd_dev_probe_finish(rbd_dev);
3601 if (ret)
3602 rbd_header_free(&rbd_dev->header);
3603
3604 return ret;
3605}
3606
3607static ssize_t rbd_add(struct bus_type *bus,
3608 const char *buf,
3609 size_t count)
3610{
3611 struct rbd_device *rbd_dev = NULL;
3612 struct ceph_options *ceph_opts = NULL;
3613 struct rbd_options *rbd_opts = NULL;
3614 struct rbd_spec *spec = NULL;
3615 struct rbd_client *rbdc;
3616 struct ceph_osd_client *osdc;
3617 int rc = -ENOMEM;
3618
3619 if (!try_module_get(THIS_MODULE))
3620 return -ENODEV;
3621
3622 /* parse add command */
3623 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
3624 if (rc < 0)
3625 goto err_out_module;
3626
3627 rbdc = rbd_get_client(ceph_opts);
3628 if (IS_ERR(rbdc)) {
3629 rc = PTR_ERR(rbdc);
3630 goto err_out_args;
3631 }
3632 ceph_opts = NULL; /* rbd_dev client now owns this */
3633
3634 /* pick the pool */
3635 osdc = &rbdc->client->osdc;
3636 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
3637 if (rc < 0)
3638 goto err_out_client;
3639 spec->pool_id = (u64) rc;
3640
3641 rbd_dev = rbd_dev_create(rbdc, spec);
3642 if (!rbd_dev)
3643 goto err_out_client;
3644 rbdc = NULL; /* rbd_dev now owns this */
3645 spec = NULL; /* rbd_dev now owns this */
3646
3647 rbd_dev->mapping.read_only = rbd_opts->read_only;
3648 kfree(rbd_opts);
3649 rbd_opts = NULL; /* done with this */
3650
3651 rc = rbd_dev_probe(rbd_dev);
3652 if (rc < 0)
3653 goto err_out_rbd_dev;
3654
3655 return count;
3656err_out_rbd_dev:
3657 rbd_dev_destroy(rbd_dev);
3118err_out_client: 3658err_out_client:
3119 kfree(rbd_dev->header_name); 3659 rbd_put_client(rbdc);
3120 rbd_put_client(rbd_dev);
3121 kfree(rbd_dev->image_id);
3122err_out_args: 3660err_out_args:
3123 kfree(rbd_dev->mapping.snap_name); 3661 if (ceph_opts)
3124 kfree(rbd_dev->image_name); 3662 ceph_destroy_options(ceph_opts);
3125 kfree(rbd_dev->pool_name); 3663 kfree(rbd_opts);
3126err_out_mem: 3664 rbd_spec_put(spec);
3127 kfree(rbd_dev); 3665err_out_module:
3128 kfree(options); 3666 module_put(THIS_MODULE);
3129 3667
3130 dout("Error adding device %s\n", buf); 3668 dout("Error adding device %s\n", buf);
3131 module_put(THIS_MODULE);
3132 3669
3133 return (ssize_t) rc; 3670 return (ssize_t) rc;
3134} 3671}
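
One pattern worth calling out in the rewritten rbd_add() above: each time a resource changes owner (ceph_opts into the client, then rbdc and spec into the rbd_dev), the local pointer is immediately set to NULL, so the single shared error path can release whatever it still sees without ever double-freeing. A minimal self-contained sketch of the idiom (illustrative userspace C, not the driver's code):

        #include <stdlib.h>

        struct wrapper { void *inner; };

        static struct wrapper *wrap(void *inner)
        {
                struct wrapper *w = malloc(sizeof(*w));

                if (w)
                        w->inner = inner;
                return w;
        }

        static void wrapper_destroy(struct wrapper *w)
        {
                if (w)
                        free(w->inner);
                free(w);
        }

        static int build(struct wrapper **out)
        {
                void *inner = malloc(64);
                struct wrapper *w = NULL;

                if (!inner)
                        goto err;
                w = wrap(inner);
                if (!w)
                        goto err;
                inner = NULL;   /* w owns it now; cf. "rbd_dev now owns this" */

                *out = w;
                return 0;
        err:
                wrapper_destroy(w);
                free(inner);    /* NULL after handoff, so never a double free */
                return -1;
        }
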
@@ -3163,7 +3700,6 @@ static void rbd_dev_release(struct device *dev)
3163 if (rbd_dev->watch_event) 3700 if (rbd_dev->watch_event)
3164 rbd_req_sync_unwatch(rbd_dev); 3701 rbd_req_sync_unwatch(rbd_dev);
3165 3702
3166 rbd_put_client(rbd_dev);
3167 3703
3168 /* clean up and free blkdev */ 3704 /* clean up and free blkdev */
3169 rbd_free_disk(rbd_dev); 3705 rbd_free_disk(rbd_dev);
@@ -3173,13 +3709,9 @@ static void rbd_dev_release(struct device *dev)
3173 rbd_header_free(&rbd_dev->header); 3709 rbd_header_free(&rbd_dev->header);
3174 3710
3175 /* done with the id, and with the rbd_dev */ 3711 /* done with the id, and with the rbd_dev */
3176 kfree(rbd_dev->mapping.snap_name);
3177 kfree(rbd_dev->image_id);
3178 kfree(rbd_dev->header_name);
3179 kfree(rbd_dev->pool_name);
3180 kfree(rbd_dev->image_name);
3181 rbd_dev_id_put(rbd_dev); 3712 rbd_dev_id_put(rbd_dev);
3182 kfree(rbd_dev); 3713 rbd_assert(rbd_dev->rbd_client != NULL);
3714 rbd_dev_destroy(rbd_dev);
3183 3715
3184 /* release module ref */ 3716 /* release module ref */
3185 module_put(THIS_MODULE); 3717 module_put(THIS_MODULE);
@@ -3211,7 +3743,12 @@ static ssize_t rbd_remove(struct bus_type *bus,
3211 goto done; 3743 goto done;
3212 } 3744 }
3213 3745
3214 __rbd_remove_all_snaps(rbd_dev); 3746 if (rbd_dev->open_count) {
3747 ret = -EBUSY;
3748 goto done;
3749 }
3750
3751 rbd_remove_all_snaps(rbd_dev);
3215 rbd_bus_del_dev(rbd_dev); 3752 rbd_bus_del_dev(rbd_dev);
3216 3753
3217done: 3754done:
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index cbe77fa105ba..49d77cbcf8bd 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -46,8 +46,6 @@
46#define RBD_MIN_OBJ_ORDER 16 46#define RBD_MIN_OBJ_ORDER 16
47#define RBD_MAX_OBJ_ORDER 30 47#define RBD_MAX_OBJ_ORDER 30
48 48
49#define RBD_MAX_SEG_NAME_LEN 128
50
51#define RBD_COMP_NONE 0 49#define RBD_COMP_NONE 0
52#define RBD_CRYPT_NONE 0 50#define RBD_CRYPT_NONE 0
53 51
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 55074cba20eb..c1c74e030a58 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -57,17 +57,9 @@
57 * physically contiguous memory regions it is mapping into page sizes 57 * physically contiguous memory regions it is mapping into page sizes
58 * that we support. 58 * that we support.
59 * 59 *
60 * Traditionally the IOMMU core just handed us the mappings directly, 60 * 512GB pages are not supported due to a hardware bug
61 * after making sure the size is an order of a 4KiB page and that the
62 * mapping has natural alignment.
63 *
64 * To retain this behavior, we currently advertise that we support
65 * all page sizes that are an order of 4KiB.
66 *
67 * If at some point we'd like to utilize the IOMMU core's new behavior,
68 * we could change this to advertise the real page sizes we support.
69 */ 61 */
70#define AMD_IOMMU_PGSIZES (~0xFFFUL) 62#define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38))
71 63
72static DEFINE_RWLOCK(amd_iommu_devtable_lock); 64static DEFINE_RWLOCK(amd_iommu_devtable_lock);
73 65
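
A quick reading of the new mask, since the bit arithmetic is easy to misparse: in this bitmap, bit n set means a page size of 2^n bytes is advertised. ~0xFFFUL sets every bit from 12 upward, i.e. 4 KiB and all larger powers of two, and 2ULL << 38 equals 1ULL << 39, the bit for 2^39 bytes (512 GB), so the additional & ~(2ULL << 38) clears exactly that one buggy page size and nothing else.
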
@@ -140,6 +132,9 @@ static void free_dev_data(struct iommu_dev_data *dev_data)
140 list_del(&dev_data->dev_data_list); 132 list_del(&dev_data->dev_data_list);
141 spin_unlock_irqrestore(&dev_data_list_lock, flags); 133 spin_unlock_irqrestore(&dev_data_list_lock, flags);
142 134
135 if (dev_data->group)
136 iommu_group_put(dev_data->group);
137
143 kfree(dev_data); 138 kfree(dev_data);
144} 139}
145 140
@@ -274,41 +269,23 @@ static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
274 *from = to; 269 *from = to;
275} 270}
276 271
277#define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF) 272static struct pci_bus *find_hosted_bus(struct pci_bus *bus)
278
279static int iommu_init_device(struct device *dev)
280{ 273{
281 struct pci_dev *dma_pdev = NULL, *pdev = to_pci_dev(dev); 274 while (!bus->self) {
282 struct iommu_dev_data *dev_data; 275 if (!pci_is_root_bus(bus))
283 struct iommu_group *group; 276 bus = bus->parent;
284 u16 alias; 277 else
285 int ret; 278 return ERR_PTR(-ENODEV);
286 279 }
287 if (dev->archdata.iommu)
288 return 0;
289
290 dev_data = find_dev_data(get_device_id(dev));
291 if (!dev_data)
292 return -ENOMEM;
293
294 alias = amd_iommu_alias_table[dev_data->devid];
295 if (alias != dev_data->devid) {
296 struct iommu_dev_data *alias_data;
297 280
298 alias_data = find_dev_data(alias); 281 return bus;
299 if (alias_data == NULL) { 282}
300 pr_err("AMD-Vi: Warning: Unhandled device %s\n",
301 dev_name(dev));
302 free_dev_data(dev_data);
303 return -ENOTSUPP;
304 }
305 dev_data->alias_data = alias_data;
306 283
307 dma_pdev = pci_get_bus_and_slot(alias >> 8, alias & 0xff); 284#define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
308 }
309 285
310 if (dma_pdev == NULL) 286static struct pci_dev *get_isolation_root(struct pci_dev *pdev)
311 dma_pdev = pci_dev_get(pdev); 287{
288 struct pci_dev *dma_pdev = pdev;
312 289
313 /* Account for quirked devices */ 290 /* Account for quirked devices */
314 swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev)); 291 swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
@@ -330,14 +307,9 @@ static int iommu_init_device(struct device *dev)
330 * Finding the next device may require skipping virtual buses. 307 * Finding the next device may require skipping virtual buses.
331 */ 308 */
332 while (!pci_is_root_bus(dma_pdev->bus)) { 309 while (!pci_is_root_bus(dma_pdev->bus)) {
333 struct pci_bus *bus = dma_pdev->bus; 310 struct pci_bus *bus = find_hosted_bus(dma_pdev->bus);
334 311 if (IS_ERR(bus))
335 while (!bus->self) { 312 break;
336 if (!pci_is_root_bus(bus))
337 bus = bus->parent;
338 else
339 goto root_bus;
340 }
341 313
342 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS)) 314 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
343 break; 315 break;
@@ -345,19 +317,137 @@ static int iommu_init_device(struct device *dev)
345 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self)); 317 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
346 } 318 }
347 319
348root_bus: 320 return dma_pdev;
349 group = iommu_group_get(&dma_pdev->dev); 321}
350 pci_dev_put(dma_pdev); 322
323static int use_pdev_iommu_group(struct pci_dev *pdev, struct device *dev)
324{
325 struct iommu_group *group = iommu_group_get(&pdev->dev);
326 int ret;
327
351 if (!group) { 328 if (!group) {
352 group = iommu_group_alloc(); 329 group = iommu_group_alloc();
353 if (IS_ERR(group)) 330 if (IS_ERR(group))
354 return PTR_ERR(group); 331 return PTR_ERR(group);
332
333 WARN_ON(&pdev->dev != dev);
355 } 334 }
356 335
357 ret = iommu_group_add_device(group, dev); 336 ret = iommu_group_add_device(group, dev);
358
359 iommu_group_put(group); 337 iommu_group_put(group);
338 return ret;
339}
340
341static int use_dev_data_iommu_group(struct iommu_dev_data *dev_data,
342 struct device *dev)
343{
344 if (!dev_data->group) {
345 struct iommu_group *group = iommu_group_alloc();
346 if (IS_ERR(group))
347 return PTR_ERR(group);
348
349 dev_data->group = group;
350 }
351
352 return iommu_group_add_device(dev_data->group, dev);
353}
354
355static int init_iommu_group(struct device *dev)
356{
357 struct iommu_dev_data *dev_data;
358 struct iommu_group *group;
359 struct pci_dev *dma_pdev;
360 int ret;
361
362 group = iommu_group_get(dev);
363 if (group) {
364 iommu_group_put(group);
365 return 0;
366 }
367
368 dev_data = find_dev_data(get_device_id(dev));
369 if (!dev_data)
370 return -ENOMEM;
371
372 if (dev_data->alias_data) {
373 u16 alias;
374 struct pci_bus *bus;
375
376 if (dev_data->alias_data->group)
377 goto use_group;
378
379 /*
380 * If the alias device exists, it's effectively just a first
381 * level quirk for finding the DMA source.
382 */
383 alias = amd_iommu_alias_table[dev_data->devid];
384 dma_pdev = pci_get_bus_and_slot(alias >> 8, alias & 0xff);
385 if (dma_pdev) {
386 dma_pdev = get_isolation_root(dma_pdev);
387 goto use_pdev;
388 }
389
390 /*
391 * If the alias is virtual, try to find a parent device
392 * and test whether the IOMMU group is actually rooted above
393 * the alias. Be careful to also test the parent device if
394 * we think the alias is the root of the group.
395 */
396 bus = pci_find_bus(0, alias >> 8);
397 if (!bus)
398 goto use_group;
399
400 bus = find_hosted_bus(bus);
401 if (IS_ERR(bus) || !bus->self)
402 goto use_group;
403
404 dma_pdev = get_isolation_root(pci_dev_get(bus->self));
405 if (dma_pdev != bus->self || (dma_pdev->multifunction &&
406 !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)))
407 goto use_pdev;
408
409 pci_dev_put(dma_pdev);
410 goto use_group;
411 }
412
413 dma_pdev = get_isolation_root(pci_dev_get(to_pci_dev(dev)));
414use_pdev:
415 ret = use_pdev_iommu_group(dma_pdev, dev);
416 pci_dev_put(dma_pdev);
417 return ret;
418use_group:
419 return use_dev_data_iommu_group(dev_data->alias_data, dev);
420}
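
(For reference when reading the alias handling above: the 16-bit IDs in amd_iommu_alias_table are PCI requester IDs, with the bus number in the high byte and the device/function in the low byte, which is why the lookups extract them as alias >> 8 and alias & 0xff.)
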
421
422static int iommu_init_device(struct device *dev)
423{
424 struct pci_dev *pdev = to_pci_dev(dev);
425 struct iommu_dev_data *dev_data;
426 u16 alias;
427 int ret;
428
429 if (dev->archdata.iommu)
430 return 0;
431
432 dev_data = find_dev_data(get_device_id(dev));
433 if (!dev_data)
434 return -ENOMEM;
435
436 alias = amd_iommu_alias_table[dev_data->devid];
437 if (alias != dev_data->devid) {
438 struct iommu_dev_data *alias_data;
439
440 alias_data = find_dev_data(alias);
441 if (alias_data == NULL) {
442 pr_err("AMD-Vi: Warning: Unhandled device %s\n",
443 dev_name(dev));
444 free_dev_data(dev_data);
445 return -ENOTSUPP;
446 }
447 dev_data->alias_data = alias_data;
448 }
360 449
450 ret = init_iommu_group(dev);
361 if (ret) 451 if (ret)
362 return ret; 452 return ret;
363 453
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index c9aa3d079ff0..e38ab438bb34 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -426,6 +426,7 @@ struct iommu_dev_data {
426 struct iommu_dev_data *alias_data;/* The alias dev_data */ 426 struct iommu_dev_data *alias_data;/* The alias dev_data */
427 struct protection_domain *domain; /* Domain the device is bound to */ 427 struct protection_domain *domain; /* Domain the device is bound to */
428 atomic_t bind; /* Domain attach reference count */ 428 atomic_t bind; /* Domain attach reference count */
429 struct iommu_group *group; /* IOMMU group for virtual aliases */
429 u16 devid; /* PCI Device ID */ 430 u16 devid; /* PCI Device ID */
430 bool iommu_v2; /* Device can make use of IOMMUv2 */ 431 bool iommu_v2; /* Device can make use of IOMMUv2 */
431 bool passthrough; /* Default for device is pt_domain */ 432 bool passthrough; /* Default for device is pt_domain */
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 0badfa48b32b..c2c07a4a7f21 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1827,10 +1827,17 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1827 if (!pte) 1827 if (!pte)
1828 return -ENOMEM; 1828 return -ENOMEM;
1829 /* It is a large page */ 1829 /* It is a large page */
1830 if (largepage_lvl > 1) 1830 if (largepage_lvl > 1) {
1831 pteval |= DMA_PTE_LARGE_PAGE; 1831 pteval |= DMA_PTE_LARGE_PAGE;
1832 else 1832 /* Ensure that old small page tables are removed to make room
1833 for superpage, if they exist. */
1834 dma_pte_clear_range(domain, iov_pfn,
1835 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1836 dma_pte_free_pagetable(domain, iov_pfn,
1837 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1838 } else {
1833 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; 1839 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1840 }
1834 1841
1835 } 1842 }
1836 /* We don't need lock here, nobody else 1843 /* We don't need lock here, nobody else
@@ -2320,8 +2327,39 @@ static int domain_add_dev_info(struct dmar_domain *domain,
2320 return 0; 2327 return 0;
2321} 2328}
2322 2329
2330static bool device_has_rmrr(struct pci_dev *dev)
2331{
2332 struct dmar_rmrr_unit *rmrr;
2333 int i;
2334
2335 for_each_rmrr_units(rmrr) {
2336 for (i = 0; i < rmrr->devices_cnt; i++) {
2337 /*
2338 * Return TRUE if this RMRR contains the device that
2339 * is passed in.
2340 */
2341 if (rmrr->devices[i] == dev)
2342 return true;
2343 }
2344 }
2345 return false;
2346}
2347
2323static int iommu_should_identity_map(struct pci_dev *pdev, int startup) 2348static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2324{ 2349{
2350
2351 /*
2352 * We want to prevent any device associated with an RMRR from
2353 * getting placed into the SI Domain. This is done because
2354 * problems exist when devices are moved in and out of domains
2355 * and their respective RMRR info is lost. We exempt USB devices
2356 * from this process due to their usage of RMRRs that are known
2357 * to not be needed after BIOS hand-off to the OS.
2358 */
2359 if (device_has_rmrr(pdev) &&
2360 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2361 return 0;
2362
2325 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) 2363 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2326 return 1; 2364 return 1;
2327 2365
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index badc17c2bcb4..18108c1405e2 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -16,13 +16,13 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/interrupt.h> 17#include <linux/interrupt.h>
18#include <linux/ioport.h> 18#include <linux/ioport.h>
19#include <linux/clk.h>
20#include <linux/platform_device.h> 19#include <linux/platform_device.h>
21#include <linux/iommu.h> 20#include <linux/iommu.h>
22#include <linux/omap-iommu.h> 21#include <linux/omap-iommu.h>
23#include <linux/mutex.h> 22#include <linux/mutex.h>
24#include <linux/spinlock.h> 23#include <linux/spinlock.h>
25#include <linux/io.h> 24#include <linux/io.h>
25#include <linux/pm_runtime.h>
26 26
27#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
28 28
@@ -143,31 +143,44 @@ EXPORT_SYMBOL_GPL(omap_iommu_arch_version);
143static int iommu_enable(struct omap_iommu *obj) 143static int iommu_enable(struct omap_iommu *obj)
144{ 144{
145 int err; 145 int err;
146 struct platform_device *pdev = to_platform_device(obj->dev);
147 struct iommu_platform_data *pdata = pdev->dev.platform_data;
146 148
147 if (!obj) 149 if (!obj || !pdata)
148 return -EINVAL; 150 return -EINVAL;
149 151
150 if (!arch_iommu) 152 if (!arch_iommu)
151 return -ENODEV; 153 return -ENODEV;
152 154
153 clk_enable(obj->clk); 155 if (pdata->deassert_reset) {
156 err = pdata->deassert_reset(pdev, pdata->reset_name);
157 if (err) {
158 dev_err(obj->dev, "deassert_reset failed: %d\n", err);
159 return err;
160 }
161 }
162
163 pm_runtime_get_sync(obj->dev);
154 164
155 err = arch_iommu->enable(obj); 165 err = arch_iommu->enable(obj);
156 166
157 clk_disable(obj->clk);
158 return err; 167 return err;
159} 168}
160 169
161static void iommu_disable(struct omap_iommu *obj) 170static void iommu_disable(struct omap_iommu *obj)
162{ 171{
163 if (!obj) 172 struct platform_device *pdev = to_platform_device(obj->dev);
164 return; 173 struct iommu_platform_data *pdata = pdev->dev.platform_data;
165 174
166 clk_enable(obj->clk); 175 if (!obj || !pdata)
176 return;
167 177
168 arch_iommu->disable(obj); 178 arch_iommu->disable(obj);
169 179
170 clk_disable(obj->clk); 180 pm_runtime_put_sync(obj->dev);
181
182 if (pdata->assert_reset)
183 pdata->assert_reset(pdev, pdata->reset_name);
171} 184}
172 185
173/* 186/*
@@ -290,7 +303,7 @@ static int load_iotlb_entry(struct omap_iommu *obj, struct iotlb_entry *e)
290 if (!obj || !obj->nr_tlb_entries || !e) 303 if (!obj || !obj->nr_tlb_entries || !e)
291 return -EINVAL; 304 return -EINVAL;
292 305
293 clk_enable(obj->clk); 306 pm_runtime_get_sync(obj->dev);
294 307
295 iotlb_lock_get(obj, &l); 308 iotlb_lock_get(obj, &l);
296 if (l.base == obj->nr_tlb_entries) { 309 if (l.base == obj->nr_tlb_entries) {
@@ -320,7 +333,7 @@ static int load_iotlb_entry(struct omap_iommu *obj, struct iotlb_entry *e)
320 333
321 cr = iotlb_alloc_cr(obj, e); 334 cr = iotlb_alloc_cr(obj, e);
322 if (IS_ERR(cr)) { 335 if (IS_ERR(cr)) {
323 clk_disable(obj->clk); 336 pm_runtime_put_sync(obj->dev);
324 return PTR_ERR(cr); 337 return PTR_ERR(cr);
325 } 338 }
326 339
@@ -334,7 +347,7 @@ static int load_iotlb_entry(struct omap_iommu *obj, struct iotlb_entry *e)
334 l.vict = l.base; 347 l.vict = l.base;
335 iotlb_lock_set(obj, &l); 348 iotlb_lock_set(obj, &l);
336out: 349out:
337 clk_disable(obj->clk); 350 pm_runtime_put_sync(obj->dev);
338 return err; 351 return err;
339} 352}
340 353
@@ -364,7 +377,7 @@ static void flush_iotlb_page(struct omap_iommu *obj, u32 da)
364 int i; 377 int i;
365 struct cr_regs cr; 378 struct cr_regs cr;
366 379
367 clk_enable(obj->clk); 380 pm_runtime_get_sync(obj->dev);
368 381
369 for_each_iotlb_cr(obj, obj->nr_tlb_entries, i, cr) { 382 for_each_iotlb_cr(obj, obj->nr_tlb_entries, i, cr) {
370 u32 start; 383 u32 start;
@@ -383,7 +396,7 @@ static void flush_iotlb_page(struct omap_iommu *obj, u32 da)
383 iommu_write_reg(obj, 1, MMU_FLUSH_ENTRY); 396 iommu_write_reg(obj, 1, MMU_FLUSH_ENTRY);
384 } 397 }
385 } 398 }
386 clk_disable(obj->clk); 399 pm_runtime_put_sync(obj->dev);
387 400
388 if (i == obj->nr_tlb_entries) 401 if (i == obj->nr_tlb_entries)
389 dev_dbg(obj->dev, "%s: no page for %08x\n", __func__, da); 402 dev_dbg(obj->dev, "%s: no page for %08x\n", __func__, da);
@@ -397,7 +410,7 @@ static void flush_iotlb_all(struct omap_iommu *obj)
397{ 410{
398 struct iotlb_lock l; 411 struct iotlb_lock l;
399 412
400 clk_enable(obj->clk); 413 pm_runtime_get_sync(obj->dev);
401 414
402 l.base = 0; 415 l.base = 0;
403 l.vict = 0; 416 l.vict = 0;
@@ -405,7 +418,7 @@ static void flush_iotlb_all(struct omap_iommu *obj)
405 418
406 iommu_write_reg(obj, 1, MMU_GFLUSH); 419 iommu_write_reg(obj, 1, MMU_GFLUSH);
407 420
408 clk_disable(obj->clk); 421 pm_runtime_put_sync(obj->dev);
409} 422}
410 423
411#if defined(CONFIG_OMAP_IOMMU_DEBUG) || defined(CONFIG_OMAP_IOMMU_DEBUG_MODULE) 424#if defined(CONFIG_OMAP_IOMMU_DEBUG) || defined(CONFIG_OMAP_IOMMU_DEBUG_MODULE)
@@ -415,11 +428,11 @@ ssize_t omap_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t bytes)
415 if (!obj || !buf) 428 if (!obj || !buf)
416 return -EINVAL; 429 return -EINVAL;
417 430
418 clk_enable(obj->clk); 431 pm_runtime_get_sync(obj->dev);
419 432
420 bytes = arch_iommu->dump_ctx(obj, buf, bytes); 433 bytes = arch_iommu->dump_ctx(obj, buf, bytes);
421 434
422 clk_disable(obj->clk); 435 pm_runtime_put_sync(obj->dev);
423 436
424 return bytes; 437 return bytes;
425} 438}
@@ -433,7 +446,7 @@ __dump_tlb_entries(struct omap_iommu *obj, struct cr_regs *crs, int num)
433 struct cr_regs tmp; 446 struct cr_regs tmp;
434 struct cr_regs *p = crs; 447 struct cr_regs *p = crs;
435 448
436 clk_enable(obj->clk); 449 pm_runtime_get_sync(obj->dev);
437 iotlb_lock_get(obj, &saved); 450 iotlb_lock_get(obj, &saved);
438 451
439 for_each_iotlb_cr(obj, num, i, tmp) { 452 for_each_iotlb_cr(obj, num, i, tmp) {
@@ -443,7 +456,7 @@ __dump_tlb_entries(struct omap_iommu *obj, struct cr_regs *crs, int num)
443 } 456 }
444 457
445 iotlb_lock_set(obj, &saved); 458 iotlb_lock_set(obj, &saved);
446 clk_disable(obj->clk); 459 pm_runtime_put_sync(obj->dev);
447 460
448 return p - crs; 461 return p - crs;
449} 462}
@@ -807,9 +820,7 @@ static irqreturn_t iommu_fault_handler(int irq, void *data)
807 if (!obj->refcount) 820 if (!obj->refcount)
808 return IRQ_NONE; 821 return IRQ_NONE;
809 822
810 clk_enable(obj->clk);
811 errs = iommu_report_fault(obj, &da); 823 errs = iommu_report_fault(obj, &da);
812 clk_disable(obj->clk);
813 if (errs == 0) 824 if (errs == 0)
814 return IRQ_HANDLED; 825 return IRQ_HANDLED;
815 826
@@ -931,17 +942,10 @@ static int __devinit omap_iommu_probe(struct platform_device *pdev)
931 struct resource *res; 942 struct resource *res;
932 struct iommu_platform_data *pdata = pdev->dev.platform_data; 943 struct iommu_platform_data *pdata = pdev->dev.platform_data;
933 944
934 if (pdev->num_resources != 2)
935 return -EINVAL;
936
937 obj = kzalloc(sizeof(*obj) + MMU_REG_SIZE, GFP_KERNEL); 945 obj = kzalloc(sizeof(*obj) + MMU_REG_SIZE, GFP_KERNEL);
938 if (!obj) 946 if (!obj)
939 return -ENOMEM; 947 return -ENOMEM;
940 948
941 obj->clk = clk_get(&pdev->dev, pdata->clk_name);
942 if (IS_ERR(obj->clk))
943 goto err_clk;
944
945 obj->nr_tlb_entries = pdata->nr_tlb_entries; 949 obj->nr_tlb_entries = pdata->nr_tlb_entries;
946 obj->name = pdata->name; 950 obj->name = pdata->name;
947 obj->dev = &pdev->dev; 951 obj->dev = &pdev->dev;
@@ -984,6 +988,9 @@ static int __devinit omap_iommu_probe(struct platform_device *pdev)
984 goto err_irq; 988 goto err_irq;
985 platform_set_drvdata(pdev, obj); 989 platform_set_drvdata(pdev, obj);
986 990
991 pm_runtime_irq_safe(obj->dev);
992 pm_runtime_enable(obj->dev);
993
987 dev_info(&pdev->dev, "%s registered\n", obj->name); 994 dev_info(&pdev->dev, "%s registered\n", obj->name);
988 return 0; 995 return 0;
989 996
@@ -992,8 +999,6 @@ err_irq:
992err_ioremap: 999err_ioremap:
993 release_mem_region(res->start, resource_size(res)); 1000 release_mem_region(res->start, resource_size(res));
994err_mem: 1001err_mem:
995 clk_put(obj->clk);
996err_clk:
997 kfree(obj); 1002 kfree(obj);
998 return err; 1003 return err;
999} 1004}
@@ -1014,7 +1019,8 @@ static int __devexit omap_iommu_remove(struct platform_device *pdev)
1014 release_mem_region(res->start, resource_size(res)); 1019 release_mem_region(res->start, resource_size(res));
1015 iounmap(obj->regbase); 1020 iounmap(obj->regbase);
1016 1021
1017 clk_put(obj->clk); 1022 pm_runtime_disable(obj->dev);
1023
1018 dev_info(&pdev->dev, "%s removed\n", obj->name); 1024 dev_info(&pdev->dev, "%s removed\n", obj->name);
1019 kfree(obj); 1025 kfree(obj);
1020 return 0; 1026 return 0;
diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h
index 2b5f3c04d167..120084206602 100644
--- a/drivers/iommu/omap-iommu.h
+++ b/drivers/iommu/omap-iommu.h
@@ -29,7 +29,6 @@ struct iotlb_entry {
29struct omap_iommu { 29struct omap_iommu {
30 const char *name; 30 const char *name;
31 struct module *owner; 31 struct module *owner;
32 struct clk *clk;
33 void __iomem *regbase; 32 void __iomem *regbase;
34 struct device *dev; 33 struct device *dev;
35 void *isr_priv; 34 void *isr_priv;
@@ -116,8 +115,6 @@ static inline struct omap_iommu *dev_to_omap_iommu(struct device *dev)
116 * MMU Register offsets 115 * MMU Register offsets
117 */ 116 */
118#define MMU_REVISION 0x00 117#define MMU_REVISION 0x00
119#define MMU_SYSCONFIG 0x10
120#define MMU_SYSSTATUS 0x14
121#define MMU_IRQSTATUS 0x18 118#define MMU_IRQSTATUS 0x18
122#define MMU_IRQENABLE 0x1c 119#define MMU_IRQENABLE 0x1c
123#define MMU_WALKING_ST 0x40 120#define MMU_WALKING_ST 0x40
diff --git a/drivers/iommu/omap-iommu2.c b/drivers/iommu/omap-iommu2.c
index c02020292377..d745094a69dd 100644
--- a/drivers/iommu/omap-iommu2.c
+++ b/drivers/iommu/omap-iommu2.c
@@ -28,19 +28,6 @@
28 */ 28 */
29#define IOMMU_ARCH_VERSION 0x00000011 29#define IOMMU_ARCH_VERSION 0x00000011
30 30
31/* SYSCONF */
32#define MMU_SYS_IDLE_SHIFT 3
33#define MMU_SYS_IDLE_FORCE (0 << MMU_SYS_IDLE_SHIFT)
34#define MMU_SYS_IDLE_NONE (1 << MMU_SYS_IDLE_SHIFT)
35#define MMU_SYS_IDLE_SMART (2 << MMU_SYS_IDLE_SHIFT)
36#define MMU_SYS_IDLE_MASK (3 << MMU_SYS_IDLE_SHIFT)
37
38#define MMU_SYS_SOFTRESET (1 << 1)
39#define MMU_SYS_AUTOIDLE 1
40
41/* SYSSTATUS */
42#define MMU_SYS_RESETDONE 1
43
44/* IRQSTATUS & IRQENABLE */ 31/* IRQSTATUS & IRQENABLE */
45#define MMU_IRQ_MULTIHITFAULT (1 << 4) 32#define MMU_IRQ_MULTIHITFAULT (1 << 4)
46#define MMU_IRQ_TABLEWALKFAULT (1 << 3) 33#define MMU_IRQ_TABLEWALKFAULT (1 << 3)
@@ -97,7 +84,6 @@ static void __iommu_set_twl(struct omap_iommu *obj, bool on)
97static int omap2_iommu_enable(struct omap_iommu *obj) 84static int omap2_iommu_enable(struct omap_iommu *obj)
98{ 85{
99 u32 l, pa; 86 u32 l, pa;
100 unsigned long timeout;
101 87
102 if (!obj->iopgd || !IS_ALIGNED((u32)obj->iopgd, SZ_16K)) 88 if (!obj->iopgd || !IS_ALIGNED((u32)obj->iopgd, SZ_16K))
103 return -EINVAL; 89 return -EINVAL;
@@ -106,29 +92,10 @@ static int omap2_iommu_enable(struct omap_iommu *obj)
106 if (!IS_ALIGNED(pa, SZ_16K)) 92 if (!IS_ALIGNED(pa, SZ_16K))
107 return -EINVAL; 93 return -EINVAL;
108 94
109 iommu_write_reg(obj, MMU_SYS_SOFTRESET, MMU_SYSCONFIG);
110
111 timeout = jiffies + msecs_to_jiffies(20);
112 do {
113 l = iommu_read_reg(obj, MMU_SYSSTATUS);
114 if (l & MMU_SYS_RESETDONE)
115 break;
116 } while (!time_after(jiffies, timeout));
117
118 if (!(l & MMU_SYS_RESETDONE)) {
119 dev_err(obj->dev, "can't take mmu out of reset\n");
120 return -ENODEV;
121 }
122
123 l = iommu_read_reg(obj, MMU_REVISION); 95 l = iommu_read_reg(obj, MMU_REVISION);
124 dev_info(obj->dev, "%s: version %d.%d\n", obj->name, 96 dev_info(obj->dev, "%s: version %d.%d\n", obj->name,
125 (l >> 4) & 0xf, l & 0xf); 97 (l >> 4) & 0xf, l & 0xf);
126 98
127 l = iommu_read_reg(obj, MMU_SYSCONFIG);
128 l &= ~MMU_SYS_IDLE_MASK;
129 l |= (MMU_SYS_IDLE_SMART | MMU_SYS_AUTOIDLE);
130 iommu_write_reg(obj, l, MMU_SYSCONFIG);
131
132 iommu_write_reg(obj, pa, MMU_TTB); 99 iommu_write_reg(obj, pa, MMU_TTB);
133 100
134 __iommu_set_twl(obj, true); 101 __iommu_set_twl(obj, true);
@@ -142,7 +109,6 @@ static void omap2_iommu_disable(struct omap_iommu *obj)
142 109
143 l &= ~MMU_CNTL_MASK; 110 l &= ~MMU_CNTL_MASK;
144 iommu_write_reg(obj, l, MMU_CNTL); 111 iommu_write_reg(obj, l, MMU_CNTL);
145 iommu_write_reg(obj, MMU_SYS_IDLE_FORCE, MMU_SYSCONFIG);
146 112
147 dev_dbg(obj->dev, "%s is shutting down\n", obj->name); 113 dev_dbg(obj->dev, "%s is shutting down\n", obj->name);
148} 114}
@@ -271,8 +237,6 @@ omap2_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t len)
271 char *p = buf; 237 char *p = buf;
272 238
273 pr_reg(REVISION); 239 pr_reg(REVISION);
274 pr_reg(SYSCONFIG);
275 pr_reg(SYSSTATUS);
276 pr_reg(IRQSTATUS); 240 pr_reg(IRQSTATUS);
277 pr_reg(IRQENABLE); 241 pr_reg(IRQENABLE);
278 pr_reg(WALKING_ST); 242 pr_reg(WALKING_ST);
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index c16e8fc8a4bd..4c9db62814ff 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -398,6 +398,7 @@ static int tegra_gart_probe(struct platform_device *pdev)
398 do_gart_setup(gart, NULL); 398 do_gart_setup(gart, NULL);
399 399
400 gart_handle = gart; 400 gart_handle = gart;
401 bus_set_iommu(&platform_bus_type, &gart_iommu_ops);
401 return 0; 402 return 0;
402 403
403fail: 404fail:
@@ -450,7 +451,6 @@ static struct platform_driver tegra_gart_driver = {
450 451
451static int __devinit tegra_gart_init(void) 452static int __devinit tegra_gart_init(void)
452{ 453{
453 bus_set_iommu(&platform_bus_type, &gart_iommu_ops);
454 return platform_driver_register(&tegra_gart_driver); 454 return platform_driver_register(&tegra_gart_driver);
455} 455}
456 456
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 4252d743963d..25c1210c0832 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -694,10 +694,8 @@ static void __smmu_iommu_unmap(struct smmu_as *as, dma_addr_t iova)
694 *pte = _PTE_VACANT(iova); 694 *pte = _PTE_VACANT(iova);
695 FLUSH_CPU_DCACHE(pte, page, sizeof(*pte)); 695 FLUSH_CPU_DCACHE(pte, page, sizeof(*pte));
696 flush_ptc_and_tlb(as->smmu, as, iova, pte, page, 0); 696 flush_ptc_and_tlb(as->smmu, as, iova, pte, page, 0);
697 if (!--(*count)) { 697 if (!--(*count))
698 free_ptbl(as, iova); 698 free_ptbl(as, iova);
699 smmu_flush_regs(as->smmu, 0);
700 }
701} 699}
702 700
703static void __smmu_iommu_map_pfn(struct smmu_as *as, dma_addr_t iova, 701static void __smmu_iommu_map_pfn(struct smmu_as *as, dma_addr_t iova,
@@ -1232,6 +1230,7 @@ static int tegra_smmu_probe(struct platform_device *pdev)
1232 1230
1233 smmu_debugfs_create(smmu); 1231 smmu_debugfs_create(smmu);
1234 smmu_handle = smmu; 1232 smmu_handle = smmu;
1233 bus_set_iommu(&platform_bus_type, &smmu_iommu_ops);
1235 return 0; 1234 return 0;
1236} 1235}
1237 1236
@@ -1276,7 +1275,6 @@ static struct platform_driver tegra_smmu_driver = {
1276 1275
1277static int __devinit tegra_smmu_init(void) 1276static int __devinit tegra_smmu_init(void)
1278{ 1277{
1279 bus_set_iommu(&platform_bus_type, &smmu_iommu_ops);
1280 return platform_driver_register(&tegra_smmu_driver); 1278 return platform_driver_register(&tegra_smmu_driver);
1281} 1279}
1282 1280