aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/block/rbd.c1852
-rw-r--r--fs/ceph/addr.c38
-rw-r--r--fs/ceph/caps.c32
-rw-r--r--fs/ceph/file.c8
-rw-r--r--fs/ceph/ioctl.c6
-rw-r--r--fs/ceph/mds_client.c33
-rw-r--r--fs/ceph/mds_client.h6
-rw-r--r--fs/ceph/mdsmap.c12
-rw-r--r--fs/ceph/strings.c4
-rw-r--r--fs/ceph/super.c7
-rw-r--r--fs/ceph/super.h10
-rw-r--r--fs/ceph/xattr.c214
-rw-r--r--include/linux/ceph/ceph_features.h38
-rw-r--r--include/linux/ceph/ceph_fs.h32
-rw-r--r--include/linux/ceph/decode.h29
-rw-r--r--include/linux/ceph/libceph.h16
-rw-r--r--include/linux/ceph/mdsmap.h4
-rw-r--r--include/linux/ceph/messenger.h2
-rw-r--r--include/linux/ceph/osd_client.h74
-rw-r--r--include/linux/ceph/osdmap.h30
-rw-r--r--include/linux/ceph/rados.h158
-rw-r--r--include/linux/crush/crush.h2
-rw-r--r--net/ceph/ceph_common.c22
-rw-r--r--net/ceph/ceph_strings.c39
-rw-r--r--net/ceph/crush/mapper.c15
-rw-r--r--net/ceph/crypto.c7
-rw-r--r--net/ceph/debugfs.c29
-rw-r--r--net/ceph/messenger.c260
-rw-r--r--net/ceph/mon_client.c2
-rw-r--r--net/ceph/osd_client.c635
-rw-r--r--net/ceph/osdmap.c290
-rw-r--r--net/ceph/pagevec.c24
32 files changed, 2402 insertions, 1528 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 89576a0b3f2e..6c81a4c040b9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -52,9 +52,12 @@
52#define SECTOR_SHIFT 9 52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54 54
55/* It might be useful to have this defined elsewhere too */ 55/* It might be useful to have these defined elsewhere */
56 56
57#define U64_MAX ((u64) (~0ULL)) 57#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
58 61
59#define RBD_DRV_NAME "rbd" 62#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)" 63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
@@ -66,7 +69,6 @@
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67 70
68#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 71#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
69#define RBD_MAX_OPT_LEN 1024
70 72
71#define RBD_SNAP_HEAD_NAME "-" 73#define RBD_SNAP_HEAD_NAME "-"
72 74
@@ -93,8 +95,6 @@
93#define DEV_NAME_LEN 32 95#define DEV_NAME_LEN 32
94#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 96#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
95 97
96#define RBD_READ_ONLY_DEFAULT false
97
98/* 98/*
99 * block device image metadata (in-memory version) 99 * block device image metadata (in-memory version)
100 */ 100 */
@@ -119,16 +119,33 @@ struct rbd_image_header {
119 * An rbd image specification. 119 * An rbd image specification.
120 * 120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
122 * identify an image. 122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
123 */ 142 */
124struct rbd_spec { 143struct rbd_spec {
125 u64 pool_id; 144 u64 pool_id;
126 char *pool_name; 145 char *pool_name;
127 146
128 char *image_id; 147 char *image_id;
129 size_t image_id_len;
130 char *image_name; 148 char *image_name;
131 size_t image_name_len;
132 149
133 u64 snap_id; 150 u64 snap_id;
134 char *snap_name; 151 char *snap_name;
@@ -136,10 +153,6 @@ struct rbd_spec {
136 struct kref kref; 153 struct kref kref;
137}; 154};
138 155
139struct rbd_options {
140 bool read_only;
141};
142
143/* 156/*
144 * an instance of the client. multiple devices may share an rbd client. 157 * an instance of the client. multiple devices may share an rbd client.
145 */ 158 */
@@ -149,37 +162,76 @@ struct rbd_client {
149 struct list_head node; 162 struct list_head node;
150}; 163};
151 164
152/* 165struct rbd_img_request;
153 * a request completion status 166typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
154 */ 167
155struct rbd_req_status { 168#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
156 int done; 169
157 int rc; 170struct rbd_obj_request;
158 u64 bytes; 171typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
172
173enum obj_request_type {
174 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
159}; 175};
160 176
161/* 177struct rbd_obj_request {
162 * a collection of requests 178 const char *object_name;
163 */ 179 u64 offset; /* object start byte */
164struct rbd_req_coll { 180 u64 length; /* bytes from offset */
165 int total; 181
166 int num_done; 182 struct rbd_img_request *img_request;
183 struct list_head links; /* img_request->obj_requests */
184 u32 which; /* posn image request list */
185
186 enum obj_request_type type;
187 union {
188 struct bio *bio_list;
189 struct {
190 struct page **pages;
191 u32 page_count;
192 };
193 };
194
195 struct ceph_osd_request *osd_req;
196
197 u64 xferred; /* bytes transferred */
198 u64 version;
199 int result;
200 atomic_t done;
201
202 rbd_obj_callback_t callback;
203 struct completion completion;
204
167 struct kref kref; 205 struct kref kref;
168 struct rbd_req_status status[0];
169}; 206};
170 207
171/* 208struct rbd_img_request {
172 * a single io request 209 struct request *rq;
173 */ 210 struct rbd_device *rbd_dev;
174struct rbd_request { 211 u64 offset; /* starting image byte offset */
175 struct request *rq; /* blk layer request */ 212 u64 length; /* byte count from offset */
176 struct bio *bio; /* cloned bio */ 213 bool write_request; /* false for read */
177 struct page **pages; /* list of used pages */ 214 union {
178 u64 len; 215 struct ceph_snap_context *snapc; /* for writes */
179 int coll_index; 216 u64 snap_id; /* for reads */
180 struct rbd_req_coll *coll; 217 };
218 spinlock_t completion_lock;/* protects next_completion */
219 u32 next_completion;
220 rbd_img_callback_t callback;
221
222 u32 obj_request_count;
223 struct list_head obj_requests; /* rbd_obj_request structs */
224
225 struct kref kref;
181}; 226};
182 227
228#define for_each_obj_request(ireq, oreq) \
229 list_for_each_entry(oreq, &(ireq)->obj_requests, links)
230#define for_each_obj_request_from(ireq, oreq) \
231 list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
232#define for_each_obj_request_safe(ireq, oreq, n) \
233 list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
234
183struct rbd_snap { 235struct rbd_snap {
184 struct device dev; 236 struct device dev;
185 const char *name; 237 const char *name;
@@ -209,16 +261,18 @@ struct rbd_device {
209 261
210 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 262 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
211 263
212 spinlock_t lock; /* queue lock */ 264 spinlock_t lock; /* queue, flags, open_count */
213 265
214 struct rbd_image_header header; 266 struct rbd_image_header header;
215 bool exists; 267 unsigned long flags; /* possibly lock protected */
216 struct rbd_spec *spec; 268 struct rbd_spec *spec;
217 269
218 char *header_name; 270 char *header_name;
219 271
272 struct ceph_file_layout layout;
273
220 struct ceph_osd_event *watch_event; 274 struct ceph_osd_event *watch_event;
221 struct ceph_osd_request *watch_request; 275 struct rbd_obj_request *watch_request;
222 276
223 struct rbd_spec *parent_spec; 277 struct rbd_spec *parent_spec;
224 u64 parent_overlap; 278 u64 parent_overlap;
@@ -235,7 +289,19 @@ struct rbd_device {
235 289
236 /* sysfs related */ 290 /* sysfs related */
237 struct device dev; 291 struct device dev;
238 unsigned long open_count; 292 unsigned long open_count; /* protected by lock */
293};
294
295/*
296 * Flag bits for rbd_dev->flags. If atomicity is required,
297 * rbd_dev->lock is used to protect access.
298 *
299 * Currently, only the "removing" flag (which is coupled with the
300 * "open_count" field) requires atomic access.
301 */
302enum rbd_dev_flags {
303 RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
304 RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
239}; 305};
240 306
241static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 307static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
@@ -277,6 +343,33 @@ static struct device rbd_root_dev = {
277 .release = rbd_root_dev_release, 343 .release = rbd_root_dev_release,
278}; 344};
279 345
346static __printf(2, 3)
347void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
348{
349 struct va_format vaf;
350 va_list args;
351
352 va_start(args, fmt);
353 vaf.fmt = fmt;
354 vaf.va = &args;
355
356 if (!rbd_dev)
357 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
358 else if (rbd_dev->disk)
359 printk(KERN_WARNING "%s: %s: %pV\n",
360 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
361 else if (rbd_dev->spec && rbd_dev->spec->image_name)
362 printk(KERN_WARNING "%s: image %s: %pV\n",
363 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
364 else if (rbd_dev->spec && rbd_dev->spec->image_id)
365 printk(KERN_WARNING "%s: id %s: %pV\n",
366 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
367 else /* punt */
368 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
369 RBD_DRV_NAME, rbd_dev, &vaf);
370 va_end(args);
371}
372
280#ifdef RBD_DEBUG 373#ifdef RBD_DEBUG
281#define rbd_assert(expr) \ 374#define rbd_assert(expr) \
282 if (unlikely(!(expr))) { \ 375 if (unlikely(!(expr))) { \
@@ -296,14 +389,23 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
296static int rbd_open(struct block_device *bdev, fmode_t mode) 389static int rbd_open(struct block_device *bdev, fmode_t mode)
297{ 390{
298 struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 391 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
392 bool removing = false;
299 393
300 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 394 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
301 return -EROFS; 395 return -EROFS;
302 396
397 spin_lock_irq(&rbd_dev->lock);
398 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
399 removing = true;
400 else
401 rbd_dev->open_count++;
402 spin_unlock_irq(&rbd_dev->lock);
403 if (removing)
404 return -ENOENT;
405
303 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 406 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
304 (void) get_device(&rbd_dev->dev); 407 (void) get_device(&rbd_dev->dev);
305 set_device_ro(bdev, rbd_dev->mapping.read_only); 408 set_device_ro(bdev, rbd_dev->mapping.read_only);
306 rbd_dev->open_count++;
307 mutex_unlock(&ctl_mutex); 409 mutex_unlock(&ctl_mutex);
308 410
309 return 0; 411 return 0;
@@ -312,10 +414,14 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
312static int rbd_release(struct gendisk *disk, fmode_t mode) 414static int rbd_release(struct gendisk *disk, fmode_t mode)
313{ 415{
314 struct rbd_device *rbd_dev = disk->private_data; 416 struct rbd_device *rbd_dev = disk->private_data;
417 unsigned long open_count_before;
418
419 spin_lock_irq(&rbd_dev->lock);
420 open_count_before = rbd_dev->open_count--;
421 spin_unlock_irq(&rbd_dev->lock);
422 rbd_assert(open_count_before > 0);
315 423
316 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 424 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
317 rbd_assert(rbd_dev->open_count > 0);
318 rbd_dev->open_count--;
319 put_device(&rbd_dev->dev); 425 put_device(&rbd_dev->dev);
320 mutex_unlock(&ctl_mutex); 426 mutex_unlock(&ctl_mutex);
321 427
@@ -337,7 +443,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
337 struct rbd_client *rbdc; 443 struct rbd_client *rbdc;
338 int ret = -ENOMEM; 444 int ret = -ENOMEM;
339 445
340 dout("rbd_client_create\n"); 446 dout("%s:\n", __func__);
341 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 447 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
342 if (!rbdc) 448 if (!rbdc)
343 goto out_opt; 449 goto out_opt;
@@ -361,8 +467,8 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
361 spin_unlock(&rbd_client_list_lock); 467 spin_unlock(&rbd_client_list_lock);
362 468
363 mutex_unlock(&ctl_mutex); 469 mutex_unlock(&ctl_mutex);
470 dout("%s: rbdc %p\n", __func__, rbdc);
364 471
365 dout("rbd_client_create created %p\n", rbdc);
366 return rbdc; 472 return rbdc;
367 473
368out_err: 474out_err:
@@ -373,6 +479,8 @@ out_mutex:
373out_opt: 479out_opt:
374 if (ceph_opts) 480 if (ceph_opts)
375 ceph_destroy_options(ceph_opts); 481 ceph_destroy_options(ceph_opts);
482 dout("%s: error %d\n", __func__, ret);
483
376 return ERR_PTR(ret); 484 return ERR_PTR(ret);
377} 485}
378 486
@@ -426,6 +534,12 @@ static match_table_t rbd_opts_tokens = {
426 {-1, NULL} 534 {-1, NULL}
427}; 535};
428 536
537struct rbd_options {
538 bool read_only;
539};
540
541#define RBD_READ_ONLY_DEFAULT false
542
429static int parse_rbd_opts_token(char *c, void *private) 543static int parse_rbd_opts_token(char *c, void *private)
430{ 544{
431 struct rbd_options *rbd_opts = private; 545 struct rbd_options *rbd_opts = private;
@@ -493,7 +607,7 @@ static void rbd_client_release(struct kref *kref)
493{ 607{
494 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 608 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
495 609
496 dout("rbd_release_client %p\n", rbdc); 610 dout("%s: rbdc %p\n", __func__, rbdc);
497 spin_lock(&rbd_client_list_lock); 611 spin_lock(&rbd_client_list_lock);
498 list_del(&rbdc->node); 612 list_del(&rbdc->node);
499 spin_unlock(&rbd_client_list_lock); 613 spin_unlock(&rbd_client_list_lock);
@@ -512,18 +626,6 @@ static void rbd_put_client(struct rbd_client *rbdc)
512 kref_put(&rbdc->kref, rbd_client_release); 626 kref_put(&rbdc->kref, rbd_client_release);
513} 627}
514 628
515/*
516 * Destroy requests collection
517 */
518static void rbd_coll_release(struct kref *kref)
519{
520 struct rbd_req_coll *coll =
521 container_of(kref, struct rbd_req_coll, kref);
522
523 dout("rbd_coll_release %p\n", coll);
524 kfree(coll);
525}
526
527static bool rbd_image_format_valid(u32 image_format) 629static bool rbd_image_format_valid(u32 image_format)
528{ 630{
529 return image_format == 1 || image_format == 2; 631 return image_format == 1 || image_format == 2;
@@ -707,7 +809,8 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
707 goto done; 809 goto done;
708 rbd_dev->mapping.read_only = true; 810 rbd_dev->mapping.read_only = true;
709 } 811 }
710 rbd_dev->exists = true; 812 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
813
711done: 814done:
712 return ret; 815 return ret;
713} 816}
@@ -724,7 +827,7 @@ static void rbd_header_free(struct rbd_image_header *header)
724 header->snapc = NULL; 827 header->snapc = NULL;
725} 828}
726 829
727static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 830static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
728{ 831{
729 char *name; 832 char *name;
730 u64 segment; 833 u64 segment;
@@ -767,23 +870,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev,
767 return length; 870 return length;
768} 871}
769 872
770static int rbd_get_num_segments(struct rbd_image_header *header,
771 u64 ofs, u64 len)
772{
773 u64 start_seg;
774 u64 end_seg;
775
776 if (!len)
777 return 0;
778 if (len - 1 > U64_MAX - ofs)
779 return -ERANGE;
780
781 start_seg = ofs >> header->obj_order;
782 end_seg = (ofs + len - 1) >> header->obj_order;
783
784 return end_seg - start_seg + 1;
785}
786
787/* 873/*
788 * returns the size of an object in the image 874 * returns the size of an object in the image
789 */ 875 */
@@ -949,8 +1035,10 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
949 unsigned int bi_size; 1035 unsigned int bi_size;
950 struct bio *bio; 1036 struct bio *bio;
951 1037
952 if (!bi) 1038 if (!bi) {
1039 rbd_warn(NULL, "bio_chain exhausted with %u left", len);
953 goto out_err; /* EINVAL; ran out of bio's */ 1040 goto out_err; /* EINVAL; ran out of bio's */
1041 }
954 bi_size = min_t(unsigned int, bi->bi_size - off, len); 1042 bi_size = min_t(unsigned int, bi->bi_size - off, len);
955 bio = bio_clone_range(bi, off, bi_size, gfpmask); 1043 bio = bio_clone_range(bi, off, bi_size, gfpmask);
956 if (!bio) 1044 if (!bio)
@@ -976,399 +1064,721 @@ out_err:
976 return NULL; 1064 return NULL;
977} 1065}
978 1066
979/* 1067static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
980 * helpers for osd request op vectors.
981 */
982static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
983 int opcode, u32 payload_len)
984{ 1068{
985 struct ceph_osd_req_op *ops; 1069 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1070 atomic_read(&obj_request->kref.refcount));
1071 kref_get(&obj_request->kref);
1072}
1073
1074static void rbd_obj_request_destroy(struct kref *kref);
1075static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1076{
1077 rbd_assert(obj_request != NULL);
1078 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1079 atomic_read(&obj_request->kref.refcount));
1080 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1081}
1082
1083static void rbd_img_request_get(struct rbd_img_request *img_request)
1084{
1085 dout("%s: img %p (was %d)\n", __func__, img_request,
1086 atomic_read(&img_request->kref.refcount));
1087 kref_get(&img_request->kref);
1088}
1089
1090static void rbd_img_request_destroy(struct kref *kref);
1091static void rbd_img_request_put(struct rbd_img_request *img_request)
1092{
1093 rbd_assert(img_request != NULL);
1094 dout("%s: img %p (was %d)\n", __func__, img_request,
1095 atomic_read(&img_request->kref.refcount));
1096 kref_put(&img_request->kref, rbd_img_request_destroy);
1097}
1098
1099static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1100 struct rbd_obj_request *obj_request)
1101{
1102 rbd_assert(obj_request->img_request == NULL);
1103
1104 rbd_obj_request_get(obj_request);
1105 obj_request->img_request = img_request;
1106 obj_request->which = img_request->obj_request_count;
1107 rbd_assert(obj_request->which != BAD_WHICH);
1108 img_request->obj_request_count++;
1109 list_add_tail(&obj_request->links, &img_request->obj_requests);
1110 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1111 obj_request->which);
1112}
986 1113
987 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); 1114static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
988 if (!ops) 1115 struct rbd_obj_request *obj_request)
1116{
1117 rbd_assert(obj_request->which != BAD_WHICH);
1118
1119 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1120 obj_request->which);
1121 list_del(&obj_request->links);
1122 rbd_assert(img_request->obj_request_count > 0);
1123 img_request->obj_request_count--;
1124 rbd_assert(obj_request->which == img_request->obj_request_count);
1125 obj_request->which = BAD_WHICH;
1126 rbd_assert(obj_request->img_request == img_request);
1127 obj_request->img_request = NULL;
1128 obj_request->callback = NULL;
1129 rbd_obj_request_put(obj_request);
1130}
1131
1132static bool obj_request_type_valid(enum obj_request_type type)
1133{
1134 switch (type) {
1135 case OBJ_REQUEST_NODATA:
1136 case OBJ_REQUEST_BIO:
1137 case OBJ_REQUEST_PAGES:
1138 return true;
1139 default:
1140 return false;
1141 }
1142}
1143
1144static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1145{
1146 struct ceph_osd_req_op *op;
1147 va_list args;
1148 size_t size;
1149
1150 op = kzalloc(sizeof (*op), GFP_NOIO);
1151 if (!op)
989 return NULL; 1152 return NULL;
1153 op->op = opcode;
1154 va_start(args, opcode);
1155 switch (opcode) {
1156 case CEPH_OSD_OP_READ:
1157 case CEPH_OSD_OP_WRITE:
1158 /* rbd_osd_req_op_create(READ, offset, length) */
1159 /* rbd_osd_req_op_create(WRITE, offset, length) */
1160 op->extent.offset = va_arg(args, u64);
1161 op->extent.length = va_arg(args, u64);
1162 if (opcode == CEPH_OSD_OP_WRITE)
1163 op->payload_len = op->extent.length;
1164 break;
1165 case CEPH_OSD_OP_STAT:
1166 break;
1167 case CEPH_OSD_OP_CALL:
1168 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1169 op->cls.class_name = va_arg(args, char *);
1170 size = strlen(op->cls.class_name);
1171 rbd_assert(size <= (size_t) U8_MAX);
1172 op->cls.class_len = size;
1173 op->payload_len = size;
1174
1175 op->cls.method_name = va_arg(args, char *);
1176 size = strlen(op->cls.method_name);
1177 rbd_assert(size <= (size_t) U8_MAX);
1178 op->cls.method_len = size;
1179 op->payload_len += size;
1180
1181 op->cls.argc = 0;
1182 op->cls.indata = va_arg(args, void *);
1183 size = va_arg(args, size_t);
1184 rbd_assert(size <= (size_t) U32_MAX);
1185 op->cls.indata_len = (u32) size;
1186 op->payload_len += size;
1187 break;
1188 case CEPH_OSD_OP_NOTIFY_ACK:
1189 case CEPH_OSD_OP_WATCH:
1190 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1191 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1192 op->watch.cookie = va_arg(args, u64);
1193 op->watch.ver = va_arg(args, u64);
1194 op->watch.ver = cpu_to_le64(op->watch.ver);
1195 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1196 op->watch.flag = (u8) 1;
1197 break;
1198 default:
1199 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1200 kfree(op);
1201 op = NULL;
1202 break;
1203 }
1204 va_end(args);
990 1205
991 ops[0].op = opcode; 1206 return op;
1207}
992 1208
993 /* 1209static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
994 * op extent offset and length will be set later on 1210{
995 * in calc_raw_layout() 1211 kfree(op);
996 */ 1212}
997 ops[0].payload_len = payload_len; 1213
1214static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1215 struct rbd_obj_request *obj_request)
1216{
1217 dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
998 1218
999 return ops; 1219 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1000} 1220}
1001 1221
1002static void rbd_destroy_ops(struct ceph_osd_req_op *ops) 1222static void rbd_img_request_complete(struct rbd_img_request *img_request)
1003{ 1223{
1004 kfree(ops); 1224 dout("%s: img %p\n", __func__, img_request);
1225 if (img_request->callback)
1226 img_request->callback(img_request);
1227 else
1228 rbd_img_request_put(img_request);
1005} 1229}
1006 1230
1007static void rbd_coll_end_req_index(struct request *rq, 1231/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1008 struct rbd_req_coll *coll, 1232
1009 int index, 1233static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1010 int ret, u64 len)
1011{ 1234{
1012 struct request_queue *q; 1235 dout("%s: obj %p\n", __func__, obj_request);
1013 int min, max, i;
1014 1236
1015 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", 1237 return wait_for_completion_interruptible(&obj_request->completion);
1016 coll, index, ret, (unsigned long long) len); 1238}
1017 1239
1018 if (!rq) 1240static void obj_request_done_init(struct rbd_obj_request *obj_request)
1019 return; 1241{
1242 atomic_set(&obj_request->done, 0);
1243 smp_wmb();
1244}
1020 1245
1021 if (!coll) { 1246static void obj_request_done_set(struct rbd_obj_request *obj_request)
1022 blk_end_request(rq, ret, len); 1247{
1023 return; 1248 int done;
1249
1250 done = atomic_inc_return(&obj_request->done);
1251 if (done > 1) {
1252 struct rbd_img_request *img_request = obj_request->img_request;
1253 struct rbd_device *rbd_dev;
1254
1255 rbd_dev = img_request ? img_request->rbd_dev : NULL;
1256 rbd_warn(rbd_dev, "obj_request %p was already done\n",
1257 obj_request);
1024 } 1258 }
1259}
1025 1260
1026 q = rq->q; 1261static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1027 1262{
1028 spin_lock_irq(q->queue_lock); 1263 smp_mb();
1029 coll->status[index].done = 1; 1264 return atomic_read(&obj_request->done) != 0;
1030 coll->status[index].rc = ret; 1265}
1031 coll->status[index].bytes = len; 1266
1032 max = min = coll->num_done; 1267static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1033 while (max < coll->total && coll->status[max].done) 1268{
1034 max++; 1269 dout("%s: obj %p cb %p\n", __func__, obj_request,
1035 1270 obj_request->callback);
1036 for (i = min; i<max; i++) { 1271 if (obj_request->callback)
1037 __blk_end_request(rq, coll->status[i].rc, 1272 obj_request->callback(obj_request);
1038 coll->status[i].bytes); 1273 else
1039 coll->num_done++; 1274 complete_all(&obj_request->completion);
1040 kref_put(&coll->kref, rbd_coll_release); 1275}
1276
1277static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
1278{
1279 dout("%s: obj %p\n", __func__, obj_request);
1280 obj_request_done_set(obj_request);
1281}
1282
1283static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1284{
1285 dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
1286 obj_request->result, obj_request->xferred, obj_request->length);
1287 /*
1288 * ENOENT means a hole in the object. We zero-fill the
1289 * entire length of the request. A short read also implies
1290 * zero-fill to the end of the request. Either way we
1291 * update the xferred count to indicate the whole request
1292 * was satisfied.
1293 */
1294 if (obj_request->result == -ENOENT) {
1295 zero_bio_chain(obj_request->bio_list, 0);
1296 obj_request->result = 0;
1297 obj_request->xferred = obj_request->length;
1298 } else if (obj_request->xferred < obj_request->length &&
1299 !obj_request->result) {
1300 zero_bio_chain(obj_request->bio_list, obj_request->xferred);
1301 obj_request->xferred = obj_request->length;
1041 } 1302 }
1042 spin_unlock_irq(q->queue_lock); 1303 obj_request_done_set(obj_request);
1043} 1304}
1044 1305
1045static void rbd_coll_end_req(struct rbd_request *req, 1306static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1046 int ret, u64 len)
1047{ 1307{
1048 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); 1308 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1309 obj_request->result, obj_request->length);
1310 /*
1311 * There is no such thing as a successful short write.
1312 * Our xferred value is the number of bytes transferred
1313 * back. Set it to our originally-requested length.
1314 */
1315 obj_request->xferred = obj_request->length;
1316 obj_request_done_set(obj_request);
1049} 1317}
1050 1318
1051/* 1319/*
1052 * Send ceph osd request 1320 * For a simple stat call there's nothing to do. We'll do more if
1321 * this is part of a write sequence for a layered image.
1053 */ 1322 */
1054static int rbd_do_request(struct request *rq, 1323static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
1055 struct rbd_device *rbd_dev, 1324{
1056 struct ceph_snap_context *snapc, 1325 dout("%s: obj %p\n", __func__, obj_request);
1057 u64 snapid, 1326 obj_request_done_set(obj_request);
1058 const char *object_name, u64 ofs, u64 len, 1327}
1059 struct bio *bio,
1060 struct page **pages,
1061 int num_pages,
1062 int flags,
1063 struct ceph_osd_req_op *ops,
1064 struct rbd_req_coll *coll,
1065 int coll_index,
1066 void (*rbd_cb)(struct ceph_osd_request *req,
1067 struct ceph_msg *msg),
1068 struct ceph_osd_request **linger_req,
1069 u64 *ver)
1070{
1071 struct ceph_osd_request *req;
1072 struct ceph_file_layout *layout;
1073 int ret;
1074 u64 bno;
1075 struct timespec mtime = CURRENT_TIME;
1076 struct rbd_request *req_data;
1077 struct ceph_osd_request_head *reqhead;
1078 struct ceph_osd_client *osdc;
1079 1328
1080 req_data = kzalloc(sizeof(*req_data), GFP_NOIO); 1329static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1081 if (!req_data) { 1330 struct ceph_msg *msg)
1082 if (coll) 1331{
1083 rbd_coll_end_req_index(rq, coll, coll_index, 1332 struct rbd_obj_request *obj_request = osd_req->r_priv;
1084 -ENOMEM, len); 1333 u16 opcode;
1085 return -ENOMEM; 1334
1335 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1336 rbd_assert(osd_req == obj_request->osd_req);
1337 rbd_assert(!!obj_request->img_request ^
1338 (obj_request->which == BAD_WHICH));
1339
1340 if (osd_req->r_result < 0)
1341 obj_request->result = osd_req->r_result;
1342 obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1343
1344 WARN_ON(osd_req->r_num_ops != 1); /* For now */
1345
1346 /*
1347 * We support a 64-bit length, but ultimately it has to be
1348 * passed to blk_end_request(), which takes an unsigned int.
1349 */
1350 obj_request->xferred = osd_req->r_reply_op_len[0];
1351 rbd_assert(obj_request->xferred < (u64) UINT_MAX);
1352 opcode = osd_req->r_request_ops[0].op;
1353 switch (opcode) {
1354 case CEPH_OSD_OP_READ:
1355 rbd_osd_read_callback(obj_request);
1356 break;
1357 case CEPH_OSD_OP_WRITE:
1358 rbd_osd_write_callback(obj_request);
1359 break;
1360 case CEPH_OSD_OP_STAT:
1361 rbd_osd_stat_callback(obj_request);
1362 break;
1363 case CEPH_OSD_OP_CALL:
1364 case CEPH_OSD_OP_NOTIFY_ACK:
1365 case CEPH_OSD_OP_WATCH:
1366 rbd_osd_trivial_callback(obj_request);
1367 break;
1368 default:
1369 rbd_warn(NULL, "%s: unsupported op %hu\n",
1370 obj_request->object_name, (unsigned short) opcode);
1371 break;
1086 } 1372 }
1087 1373
1088 if (coll) { 1374 if (obj_request_done_test(obj_request))
1089 req_data->coll = coll; 1375 rbd_obj_request_complete(obj_request);
1090 req_data->coll_index = coll_index; 1376}
1377
1378static struct ceph_osd_request *rbd_osd_req_create(
1379 struct rbd_device *rbd_dev,
1380 bool write_request,
1381 struct rbd_obj_request *obj_request,
1382 struct ceph_osd_req_op *op)
1383{
1384 struct rbd_img_request *img_request = obj_request->img_request;
1385 struct ceph_snap_context *snapc = NULL;
1386 struct ceph_osd_client *osdc;
1387 struct ceph_osd_request *osd_req;
1388 struct timespec now;
1389 struct timespec *mtime;
1390 u64 snap_id = CEPH_NOSNAP;
1391 u64 offset = obj_request->offset;
1392 u64 length = obj_request->length;
1393
1394 if (img_request) {
1395 rbd_assert(img_request->write_request == write_request);
1396 if (img_request->write_request)
1397 snapc = img_request->snapc;
1398 else
1399 snap_id = img_request->snap_id;
1091 } 1400 }
1092 1401
1093 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", 1402 /* Allocate and initialize the request, for the single op */
1094 object_name, (unsigned long long) ofs,
1095 (unsigned long long) len, coll, coll_index);
1096 1403
1097 osdc = &rbd_dev->rbd_client->client->osdc; 1404 osdc = &rbd_dev->rbd_client->client->osdc;
1098 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, 1405 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1099 false, GFP_NOIO, pages, bio); 1406 if (!osd_req)
1100 if (!req) { 1407 return NULL; /* ENOMEM */
1101 ret = -ENOMEM; 1408
1102 goto done_pages; 1409 rbd_assert(obj_request_type_valid(obj_request->type));
1410 switch (obj_request->type) {
1411 case OBJ_REQUEST_NODATA:
1412 break; /* Nothing to do */
1413 case OBJ_REQUEST_BIO:
1414 rbd_assert(obj_request->bio_list != NULL);
1415 osd_req->r_bio = obj_request->bio_list;
1416 break;
1417 case OBJ_REQUEST_PAGES:
1418 osd_req->r_pages = obj_request->pages;
1419 osd_req->r_num_pages = obj_request->page_count;
1420 osd_req->r_page_alignment = offset & ~PAGE_MASK;
1421 break;
1103 } 1422 }
1104 1423
1105 req->r_callback = rbd_cb; 1424 if (write_request) {
1425 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1426 now = CURRENT_TIME;
1427 mtime = &now;
1428 } else {
1429 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1430 mtime = NULL; /* not needed for reads */
1431 offset = 0; /* These are not used... */
1432 length = 0; /* ...for osd read requests */
1433 }
1106 1434
1107 req_data->rq = rq; 1435 osd_req->r_callback = rbd_osd_req_callback;
1108 req_data->bio = bio; 1436 osd_req->r_priv = obj_request;
1109 req_data->pages = pages;
1110 req_data->len = len;
1111 1437
1112 req->r_priv = req_data; 1438 osd_req->r_oid_len = strlen(obj_request->object_name);
1439 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1440 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1113 1441
1114 reqhead = req->r_request->front.iov_base; 1442 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1115 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1116 1443
1117 strncpy(req->r_oid, object_name, sizeof(req->r_oid)); 1444 /* osd_req will get its own reference to snapc (if non-null) */
1118 req->r_oid_len = strlen(req->r_oid);
1119 1445
1120 layout = &req->r_file_layout; 1446 ceph_osdc_build_request(osd_req, offset, length, 1, op,
1121 memset(layout, 0, sizeof(*layout)); 1447 snapc, snap_id, mtime);
1122 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1123 layout->fl_stripe_count = cpu_to_le32(1);
1124 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1125 layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
1126 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1127 req, ops);
1128 rbd_assert(ret == 0);
1129 1448
1130 ceph_osdc_build_request(req, ofs, &len, 1449 return osd_req;
1131 ops, 1450}
1132 snapc,
1133 &mtime,
1134 req->r_oid, req->r_oid_len);
1135 1451
1136 if (linger_req) { 1452static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1137 ceph_osdc_set_request_linger(osdc, req); 1453{
1138 *linger_req = req; 1454 ceph_osdc_put_request(osd_req);
1139 } 1455}
1140 1456
1141 ret = ceph_osdc_start_request(osdc, req, false); 1457/* object_name is assumed to be a non-null pointer and NUL-terminated */
1142 if (ret < 0) 1458
1143 goto done_err; 1459static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1144 1460 u64 offset, u64 length,
1145 if (!rbd_cb) { 1461 enum obj_request_type type)
1146 ret = ceph_osdc_wait_request(osdc, req); 1462{
1147 if (ver) 1463 struct rbd_obj_request *obj_request;
1148 *ver = le64_to_cpu(req->r_reassert_version.version); 1464 size_t size;
1149 dout("reassert_ver=%llu\n", 1465 char *name;
1150 (unsigned long long) 1466
1151 le64_to_cpu(req->r_reassert_version.version)); 1467 rbd_assert(obj_request_type_valid(type));
1152 ceph_osdc_put_request(req); 1468
1469 size = strlen(object_name) + 1;
1470 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1471 if (!obj_request)
1472 return NULL;
1473
1474 name = (char *)(obj_request + 1);
1475 obj_request->object_name = memcpy(name, object_name, size);
1476 obj_request->offset = offset;
1477 obj_request->length = length;
1478 obj_request->which = BAD_WHICH;
1479 obj_request->type = type;
1480 INIT_LIST_HEAD(&obj_request->links);
1481 obj_request_done_init(obj_request);
1482 init_completion(&obj_request->completion);
1483 kref_init(&obj_request->kref);
1484
1485 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1486 offset, length, (int)type, obj_request);
1487
1488 return obj_request;
1489}
1490
1491static void rbd_obj_request_destroy(struct kref *kref)
1492{
1493 struct rbd_obj_request *obj_request;
1494
1495 obj_request = container_of(kref, struct rbd_obj_request, kref);
1496
1497 dout("%s: obj %p\n", __func__, obj_request);
1498
1499 rbd_assert(obj_request->img_request == NULL);
1500 rbd_assert(obj_request->which == BAD_WHICH);
1501
1502 if (obj_request->osd_req)
1503 rbd_osd_req_destroy(obj_request->osd_req);
1504
1505 rbd_assert(obj_request_type_valid(obj_request->type));
1506 switch (obj_request->type) {
1507 case OBJ_REQUEST_NODATA:
1508 break; /* Nothing to do */
1509 case OBJ_REQUEST_BIO:
1510 if (obj_request->bio_list)
1511 bio_chain_put(obj_request->bio_list);
1512 break;
1513 case OBJ_REQUEST_PAGES:
1514 if (obj_request->pages)
1515 ceph_release_page_vector(obj_request->pages,
1516 obj_request->page_count);
1517 break;
1153 } 1518 }
1154 return ret;
1155 1519
1156done_err: 1520 kfree(obj_request);
1157 bio_chain_put(req_data->bio);
1158 ceph_osdc_put_request(req);
1159done_pages:
1160 rbd_coll_end_req(req_data, ret, len);
1161 kfree(req_data);
1162 return ret;
1163} 1521}
1164 1522
1165/* 1523/*
1166 * Ceph osd op callback 1524 * Caller is responsible for filling in the list of object requests
1525 * that comprises the image request, and the Linux request pointer
1526 * (if there is one).
1167 */ 1527 */
1168static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 1528static struct rbd_img_request *rbd_img_request_create(
1169{ 1529 struct rbd_device *rbd_dev,
1170 struct rbd_request *req_data = req->r_priv; 1530 u64 offset, u64 length,
1171 struct ceph_osd_reply_head *replyhead; 1531 bool write_request)
1172 struct ceph_osd_op *op; 1532{
1173 __s32 rc; 1533 struct rbd_img_request *img_request;
1174 u64 bytes; 1534 struct ceph_snap_context *snapc = NULL;
1175 int read_op;
1176
1177 /* parse reply */
1178 replyhead = msg->front.iov_base;
1179 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1180 op = (void *)(replyhead + 1);
1181 rc = le32_to_cpu(replyhead->result);
1182 bytes = le64_to_cpu(op->extent.length);
1183 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1184
1185 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1186 (unsigned long long) bytes, read_op, (int) rc);
1187
1188 if (rc == -ENOENT && read_op) {
1189 zero_bio_chain(req_data->bio, 0);
1190 rc = 0;
1191 } else if (rc == 0 && read_op && bytes < req_data->len) {
1192 zero_bio_chain(req_data->bio, bytes);
1193 bytes = req_data->len;
1194 }
1195 1535
1196 rbd_coll_end_req(req_data, rc, bytes); 1536 img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1537 if (!img_request)
1538 return NULL;
1197 1539
1198 if (req_data->bio) 1540 if (write_request) {
1199 bio_chain_put(req_data->bio); 1541 down_read(&rbd_dev->header_rwsem);
1542 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1543 up_read(&rbd_dev->header_rwsem);
1544 if (WARN_ON(!snapc)) {
1545 kfree(img_request);
1546 return NULL; /* Shouldn't happen */
1547 }
1548 }
1200 1549
1201 ceph_osdc_put_request(req); 1550 img_request->rq = NULL;
1202 kfree(req_data); 1551 img_request->rbd_dev = rbd_dev;
1552 img_request->offset = offset;
1553 img_request->length = length;
1554 img_request->write_request = write_request;
1555 if (write_request)
1556 img_request->snapc = snapc;
1557 else
1558 img_request->snap_id = rbd_dev->spec->snap_id;
1559 spin_lock_init(&img_request->completion_lock);
1560 img_request->next_completion = 0;
1561 img_request->callback = NULL;
1562 img_request->obj_request_count = 0;
1563 INIT_LIST_HEAD(&img_request->obj_requests);
1564 kref_init(&img_request->kref);
1565
1566 rbd_img_request_get(img_request); /* Avoid a warning */
1567 rbd_img_request_put(img_request); /* TEMPORARY */
1568
1569 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1570 write_request ? "write" : "read", offset, length,
1571 img_request);
1572
1573 return img_request;
1203} 1574}
1204 1575
1205static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 1576static void rbd_img_request_destroy(struct kref *kref)
1206{ 1577{
1207 ceph_osdc_put_request(req); 1578 struct rbd_img_request *img_request;
1579 struct rbd_obj_request *obj_request;
1580 struct rbd_obj_request *next_obj_request;
1581
1582 img_request = container_of(kref, struct rbd_img_request, kref);
1583
1584 dout("%s: img %p\n", __func__, img_request);
1585
1586 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1587 rbd_img_obj_request_del(img_request, obj_request);
1588 rbd_assert(img_request->obj_request_count == 0);
1589
1590 if (img_request->write_request)
1591 ceph_put_snap_context(img_request->snapc);
1592
1593 kfree(img_request);
1208} 1594}
1209 1595
1210/* 1596static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1211 * Do a synchronous ceph osd operation 1597 struct bio *bio_list)
1212 */
1213static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1214 struct ceph_snap_context *snapc,
1215 u64 snapid,
1216 int flags,
1217 struct ceph_osd_req_op *ops,
1218 const char *object_name,
1219 u64 ofs, u64 inbound_size,
1220 char *inbound,
1221 struct ceph_osd_request **linger_req,
1222 u64 *ver)
1223{ 1598{
1224 int ret; 1599 struct rbd_device *rbd_dev = img_request->rbd_dev;
1225 struct page **pages; 1600 struct rbd_obj_request *obj_request = NULL;
1226 int num_pages; 1601 struct rbd_obj_request *next_obj_request;
1227 1602 unsigned int bio_offset;
1228 rbd_assert(ops != NULL); 1603 u64 image_offset;
1604 u64 resid;
1605 u16 opcode;
1606
1607 dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
1608
1609 opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
1610 : CEPH_OSD_OP_READ;
1611 bio_offset = 0;
1612 image_offset = img_request->offset;
1613 rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1614 resid = img_request->length;
1615 rbd_assert(resid > 0);
1616 while (resid) {
1617 const char *object_name;
1618 unsigned int clone_size;
1619 struct ceph_osd_req_op *op;
1620 u64 offset;
1621 u64 length;
1622
1623 object_name = rbd_segment_name(rbd_dev, image_offset);
1624 if (!object_name)
1625 goto out_unwind;
1626 offset = rbd_segment_offset(rbd_dev, image_offset);
1627 length = rbd_segment_length(rbd_dev, image_offset, resid);
1628 obj_request = rbd_obj_request_create(object_name,
1629 offset, length,
1630 OBJ_REQUEST_BIO);
1631 kfree(object_name); /* object request has its own copy */
1632 if (!obj_request)
1633 goto out_unwind;
1634
1635 rbd_assert(length <= (u64) UINT_MAX);
1636 clone_size = (unsigned int) length;
1637 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1638 &bio_offset, clone_size,
1639 GFP_ATOMIC);
1640 if (!obj_request->bio_list)
1641 goto out_partial;
1229 1642
1230 num_pages = calc_pages_for(ofs, inbound_size); 1643 /*
1231 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1644 * Build up the op to use in building the osd
1232 if (IS_ERR(pages)) 1645 * request. Note that the contents of the op are
1233 return PTR_ERR(pages); 1646 * copied by rbd_osd_req_create().
1647 */
1648 op = rbd_osd_req_op_create(opcode, offset, length);
1649 if (!op)
1650 goto out_partial;
1651 obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1652 img_request->write_request,
1653 obj_request, op);
1654 rbd_osd_req_op_destroy(op);
1655 if (!obj_request->osd_req)
1656 goto out_partial;
1657 /* status and version are initially zero-filled */
1658
1659 rbd_img_obj_request_add(img_request, obj_request);
1660
1661 image_offset += length;
1662 resid -= length;
1663 }
1234 1664
1235 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, 1665 return 0;
1236 object_name, ofs, inbound_size, NULL,
1237 pages, num_pages,
1238 flags,
1239 ops,
1240 NULL, 0,
1241 NULL,
1242 linger_req, ver);
1243 if (ret < 0)
1244 goto done;
1245 1666
1246 if ((flags & CEPH_OSD_FLAG_READ) && inbound) 1667out_partial:
1247 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); 1668 rbd_obj_request_put(obj_request);
1669out_unwind:
1670 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1671 rbd_obj_request_put(obj_request);
1248 1672
1249done: 1673 return -ENOMEM;
1250 ceph_release_page_vector(pages, num_pages);
1251 return ret;
1252} 1674}
1253 1675
1254/* 1676static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1255 * Do an asynchronous ceph osd operation 1677{
1256 */ 1678 struct rbd_img_request *img_request;
1257static int rbd_do_op(struct request *rq, 1679 u32 which = obj_request->which;
1258 struct rbd_device *rbd_dev, 1680 bool more = true;
1259 struct ceph_snap_context *snapc, 1681
1260 u64 ofs, u64 len, 1682 img_request = obj_request->img_request;
1261 struct bio *bio, 1683
1262 struct rbd_req_coll *coll, 1684 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1263 int coll_index) 1685 rbd_assert(img_request != NULL);
1264{ 1686 rbd_assert(img_request->rq != NULL);
1265 char *seg_name; 1687 rbd_assert(img_request->obj_request_count > 0);
1266 u64 seg_ofs; 1688 rbd_assert(which != BAD_WHICH);
1267 u64 seg_len; 1689 rbd_assert(which < img_request->obj_request_count);
1268 int ret; 1690 rbd_assert(which >= img_request->next_completion);
1269 struct ceph_osd_req_op *ops; 1691
1270 u32 payload_len; 1692 spin_lock_irq(&img_request->completion_lock);
1271 int opcode; 1693 if (which != img_request->next_completion)
1272 int flags; 1694 goto out;
1273 u64 snapid; 1695
1274 1696 for_each_obj_request_from(img_request, obj_request) {
1275 seg_name = rbd_segment_name(rbd_dev, ofs); 1697 unsigned int xferred;
1276 if (!seg_name) 1698 int result;
1277 return -ENOMEM; 1699
1278 seg_len = rbd_segment_length(rbd_dev, ofs, len); 1700 rbd_assert(more);
1279 seg_ofs = rbd_segment_offset(rbd_dev, ofs); 1701 rbd_assert(which < img_request->obj_request_count);
1280 1702
1281 if (rq_data_dir(rq) == WRITE) { 1703 if (!obj_request_done_test(obj_request))
1282 opcode = CEPH_OSD_OP_WRITE; 1704 break;
1283 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; 1705
1284 snapid = CEPH_NOSNAP; 1706 rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
1285 payload_len = seg_len; 1707 xferred = (unsigned int) obj_request->xferred;
1286 } else { 1708 result = (int) obj_request->result;
1287 opcode = CEPH_OSD_OP_READ; 1709 if (result)
1288 flags = CEPH_OSD_FLAG_READ; 1710 rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
1289 snapc = NULL; 1711 img_request->write_request ? "write" : "read",
1290 snapid = rbd_dev->spec->snap_id; 1712 result, xferred);
1291 payload_len = 0; 1713
1714 more = blk_end_request(img_request->rq, result, xferred);
1715 which++;
1292 } 1716 }
1293 1717
1294 ret = -ENOMEM; 1718 rbd_assert(more ^ (which == img_request->obj_request_count));
1295 ops = rbd_create_rw_ops(1, opcode, payload_len); 1719 img_request->next_completion = which;
1296 if (!ops) 1720out:
1297 goto done; 1721 spin_unlock_irq(&img_request->completion_lock);
1298 1722
1299 /* we've taken care of segment sizes earlier when we 1723 if (!more)
1300 cloned the bios. We should never have a segment 1724 rbd_img_request_complete(img_request);
1301 truncated at this point */
1302 rbd_assert(seg_len == len);
1303
1304 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1305 seg_name, seg_ofs, seg_len,
1306 bio,
1307 NULL, 0,
1308 flags,
1309 ops,
1310 coll, coll_index,
1311 rbd_req_cb, 0, NULL);
1312
1313 rbd_destroy_ops(ops);
1314done:
1315 kfree(seg_name);
1316 return ret;
1317} 1725}
1318 1726
1319/* 1727static int rbd_img_request_submit(struct rbd_img_request *img_request)
1320 * Request sync osd read 1728{
1321 */ 1729 struct rbd_device *rbd_dev = img_request->rbd_dev;
1322static int rbd_req_sync_read(struct rbd_device *rbd_dev, 1730 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1323 u64 snapid, 1731 struct rbd_obj_request *obj_request;
1324 const char *object_name,
1325 u64 ofs, u64 len,
1326 char *buf,
1327 u64 *ver)
1328{
1329 struct ceph_osd_req_op *ops;
1330 int ret;
1331 1732
1332 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); 1733 dout("%s: img %p\n", __func__, img_request);
1333 if (!ops) 1734 for_each_obj_request(img_request, obj_request) {
1334 return -ENOMEM; 1735 int ret;
1335 1736
1336 ret = rbd_req_sync_op(rbd_dev, NULL, 1737 obj_request->callback = rbd_img_obj_callback;
1337 snapid, 1738 ret = rbd_obj_request_submit(osdc, obj_request);
1338 CEPH_OSD_FLAG_READ, 1739 if (ret)
1339 ops, object_name, ofs, len, buf, NULL, ver); 1740 return ret;
1340 rbd_destroy_ops(ops); 1741 /*
1742 * The image request has its own reference to each
1743 * of its object requests, so we can safely drop the
1744 * initial one here.
1745 */
1746 rbd_obj_request_put(obj_request);
1747 }
1341 1748
1342 return ret; 1749 return 0;
1343} 1750}
1344 1751
1345/* 1752static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1346 * Request sync osd watch 1753 u64 ver, u64 notify_id)
1347 */
1348static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
1349 u64 ver,
1350 u64 notify_id)
1351{ 1754{
1352 struct ceph_osd_req_op *ops; 1755 struct rbd_obj_request *obj_request;
1756 struct ceph_osd_req_op *op;
1757 struct ceph_osd_client *osdc;
1353 int ret; 1758 int ret;
1354 1759
1355 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); 1760 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1356 if (!ops) 1761 OBJ_REQUEST_NODATA);
1762 if (!obj_request)
1357 return -ENOMEM; 1763 return -ENOMEM;
1358 1764
1359 ops[0].watch.ver = cpu_to_le64(ver); 1765 ret = -ENOMEM;
1360 ops[0].watch.cookie = notify_id; 1766 op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1361 ops[0].watch.flag = 0; 1767 if (!op)
1768 goto out;
1769 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1770 obj_request, op);
1771 rbd_osd_req_op_destroy(op);
1772 if (!obj_request->osd_req)
1773 goto out;
1362 1774
1363 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, 1775 osdc = &rbd_dev->rbd_client->client->osdc;
1364 rbd_dev->header_name, 0, 0, NULL, 1776 obj_request->callback = rbd_obj_request_put;
1365 NULL, 0, 1777 ret = rbd_obj_request_submit(osdc, obj_request);
1366 CEPH_OSD_FLAG_READ, 1778out:
1367 ops, 1779 if (ret)
1368 NULL, 0, 1780 rbd_obj_request_put(obj_request);
1369 rbd_simple_req_cb, 0, NULL);
1370 1781
1371 rbd_destroy_ops(ops);
1372 return ret; 1782 return ret;
1373} 1783}
1374 1784
@@ -1381,95 +1791,103 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1381 if (!rbd_dev) 1791 if (!rbd_dev)
1382 return; 1792 return;
1383 1793
1384 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", 1794 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1385 rbd_dev->header_name, (unsigned long long) notify_id, 1795 rbd_dev->header_name, (unsigned long long) notify_id,
1386 (unsigned int) opcode); 1796 (unsigned int) opcode);
1387 rc = rbd_dev_refresh(rbd_dev, &hver); 1797 rc = rbd_dev_refresh(rbd_dev, &hver);
1388 if (rc) 1798 if (rc)
1389 pr_warning(RBD_DRV_NAME "%d got notification but failed to " 1799 rbd_warn(rbd_dev, "got notification but failed to "
1390 " update snaps: %d\n", rbd_dev->major, rc); 1800 " update snaps: %d\n", rc);
1391 1801
1392 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); 1802 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1393} 1803}
1394 1804
1395/* 1805/*
1396 * Request sync osd watch 1806 * Request sync osd watch/unwatch. The value of "start" determines
1807 * whether a watch request is being initiated or torn down.
1397 */ 1808 */
1398static int rbd_req_sync_watch(struct rbd_device *rbd_dev) 1809static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1399{ 1810{
1400 struct ceph_osd_req_op *ops;
1401 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1811 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1812 struct rbd_obj_request *obj_request;
1813 struct ceph_osd_req_op *op;
1402 int ret; 1814 int ret;
1403 1815
1404 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 1816 rbd_assert(start ^ !!rbd_dev->watch_event);
1405 if (!ops) 1817 rbd_assert(start ^ !!rbd_dev->watch_request);
1406 return -ENOMEM;
1407 1818
1408 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, 1819 if (start) {
1409 (void *)rbd_dev, &rbd_dev->watch_event); 1820 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
1410 if (ret < 0) 1821 &rbd_dev->watch_event);
1411 goto fail; 1822 if (ret < 0)
1823 return ret;
1824 rbd_assert(rbd_dev->watch_event != NULL);
1825 }
1412 1826
1413 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); 1827 ret = -ENOMEM;
1414 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 1828 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1415 ops[0].watch.flag = 1; 1829 OBJ_REQUEST_NODATA);
1830 if (!obj_request)
1831 goto out_cancel;
1832
1833 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1834 rbd_dev->watch_event->cookie,
1835 rbd_dev->header.obj_version, start);
1836 if (!op)
1837 goto out_cancel;
1838 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
1839 obj_request, op);
1840 rbd_osd_req_op_destroy(op);
1841 if (!obj_request->osd_req)
1842 goto out_cancel;
1843
1844 if (start)
1845 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
1846 else
1847 ceph_osdc_unregister_linger_request(osdc,
1848 rbd_dev->watch_request->osd_req);
1849 ret = rbd_obj_request_submit(osdc, obj_request);
1850 if (ret)
1851 goto out_cancel;
1852 ret = rbd_obj_request_wait(obj_request);
1853 if (ret)
1854 goto out_cancel;
1855 ret = obj_request->result;
1856 if (ret)
1857 goto out_cancel;
1416 1858
1417 ret = rbd_req_sync_op(rbd_dev, NULL, 1859 /*
1418 CEPH_NOSNAP, 1860 * A watch request is set to linger, so the underlying osd
1419 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 1861 * request won't go away until we unregister it. We retain
1420 ops, 1862 * a pointer to the object request during that time (in
1421 rbd_dev->header_name, 1863 * rbd_dev->watch_request), so we'll keep a reference to
1422 0, 0, NULL, 1864 * it. We'll drop that reference (below) after we've
1423 &rbd_dev->watch_request, NULL); 1865 * unregistered it.
1866 */
1867 if (start) {
1868 rbd_dev->watch_request = obj_request;
1424 1869
1425 if (ret < 0) 1870 return 0;
1426 goto fail_event; 1871 }
1427 1872
1428 rbd_destroy_ops(ops); 1873 /* We have successfully torn down the watch request */
1429 return 0;
1430 1874
1431fail_event: 1875 rbd_obj_request_put(rbd_dev->watch_request);
1876 rbd_dev->watch_request = NULL;
1877out_cancel:
1878 /* Cancel the event if we're tearing down, or on error */
1432 ceph_osdc_cancel_event(rbd_dev->watch_event); 1879 ceph_osdc_cancel_event(rbd_dev->watch_event);
1433 rbd_dev->watch_event = NULL; 1880 rbd_dev->watch_event = NULL;
1434fail: 1881 if (obj_request)
1435 rbd_destroy_ops(ops); 1882 rbd_obj_request_put(obj_request);
1436 return ret;
1437}
1438 1883
1439/*
1440 * Request sync osd unwatch
1441 */
1442static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
1443{
1444 struct ceph_osd_req_op *ops;
1445 int ret;
1446
1447 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1448 if (!ops)
1449 return -ENOMEM;
1450
1451 ops[0].watch.ver = 0;
1452 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
1453 ops[0].watch.flag = 0;
1454
1455 ret = rbd_req_sync_op(rbd_dev, NULL,
1456 CEPH_NOSNAP,
1457 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1458 ops,
1459 rbd_dev->header_name,
1460 0, 0, NULL, NULL, NULL);
1461
1462
1463 rbd_destroy_ops(ops);
1464 ceph_osdc_cancel_event(rbd_dev->watch_event);
1465 rbd_dev->watch_event = NULL;
1466 return ret; 1884 return ret;
1467} 1885}
1468 1886
1469/* 1887/*
1470 * Synchronous osd object method call 1888 * Synchronous osd object method call
1471 */ 1889 */
1472static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1890static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1473 const char *object_name, 1891 const char *object_name,
1474 const char *class_name, 1892 const char *class_name,
1475 const char *method_name, 1893 const char *method_name,
@@ -1477,169 +1895,154 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1477 size_t outbound_size, 1895 size_t outbound_size,
1478 char *inbound, 1896 char *inbound,
1479 size_t inbound_size, 1897 size_t inbound_size,
1480 int flags, 1898 u64 *version)
1481 u64 *ver)
1482{ 1899{
1483 struct ceph_osd_req_op *ops; 1900 struct rbd_obj_request *obj_request;
1484 int class_name_len = strlen(class_name); 1901 struct ceph_osd_client *osdc;
1485 int method_name_len = strlen(method_name); 1902 struct ceph_osd_req_op *op;
1486 int payload_size; 1903 struct page **pages;
1904 u32 page_count;
1487 int ret; 1905 int ret;
1488 1906
1489 /* 1907 /*
1490 * Any input parameters required by the method we're calling 1908 * Method calls are ultimately read operations but they
1491 * will be sent along with the class and method names as 1909 * don't involve object data (so no offset or length).
1492 * part of the message payload. That data and its size are 1910 * The result should placed into the inbound buffer
1493 * supplied via the indata and indata_len fields (named from 1911 * provided. They also supply outbound data--parameters for
1494 * the perspective of the server side) in the OSD request 1912 * the object method. Currently if this is present it will
1495 * operation. 1913 * be a snapshot id.
1496 */ 1914 */
1497 payload_size = class_name_len + method_name_len + outbound_size; 1915 page_count = (u32) calc_pages_for(0, inbound_size);
1498 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); 1916 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1499 if (!ops) 1917 if (IS_ERR(pages))
1500 return -ENOMEM; 1918 return PTR_ERR(pages);
1501 1919
1502 ops[0].cls.class_name = class_name; 1920 ret = -ENOMEM;
1503 ops[0].cls.class_len = (__u8) class_name_len; 1921 obj_request = rbd_obj_request_create(object_name, 0, 0,
1504 ops[0].cls.method_name = method_name; 1922 OBJ_REQUEST_PAGES);
1505 ops[0].cls.method_len = (__u8) method_name_len; 1923 if (!obj_request)
1506 ops[0].cls.argc = 0; 1924 goto out;
1507 ops[0].cls.indata = outbound;
1508 ops[0].cls.indata_len = outbound_size;
1509 1925
1510 ret = rbd_req_sync_op(rbd_dev, NULL, 1926 obj_request->pages = pages;
1511 CEPH_NOSNAP, 1927 obj_request->page_count = page_count;
1512 flags, ops,
1513 object_name, 0, inbound_size, inbound,
1514 NULL, ver);
1515 1928
1516 rbd_destroy_ops(ops); 1929 op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
1930 method_name, outbound, outbound_size);
1931 if (!op)
1932 goto out;
1933 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1934 obj_request, op);
1935 rbd_osd_req_op_destroy(op);
1936 if (!obj_request->osd_req)
1937 goto out;
1517 1938
1518 dout("cls_exec returned %d\n", ret); 1939 osdc = &rbd_dev->rbd_client->client->osdc;
1519 return ret; 1940 ret = rbd_obj_request_submit(osdc, obj_request);
1520} 1941 if (ret)
1942 goto out;
1943 ret = rbd_obj_request_wait(obj_request);
1944 if (ret)
1945 goto out;
1521 1946
1522static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) 1947 ret = obj_request->result;
1523{ 1948 if (ret < 0)
1524 struct rbd_req_coll *coll = 1949 goto out;
1525 kzalloc(sizeof(struct rbd_req_coll) + 1950 ret = 0;
1526 sizeof(struct rbd_req_status) * num_reqs, 1951 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
1527 GFP_ATOMIC); 1952 if (version)
1953 *version = obj_request->version;
1954out:
1955 if (obj_request)
1956 rbd_obj_request_put(obj_request);
1957 else
1958 ceph_release_page_vector(pages, page_count);
1528 1959
1529 if (!coll) 1960 return ret;
1530 return NULL;
1531 coll->total = num_reqs;
1532 kref_init(&coll->kref);
1533 return coll;
1534} 1961}
1535 1962
1536/* 1963static void rbd_request_fn(struct request_queue *q)
1537 * block device queue callback 1964 __releases(q->queue_lock) __acquires(q->queue_lock)
1538 */
1539static void rbd_rq_fn(struct request_queue *q)
1540{ 1965{
1541 struct rbd_device *rbd_dev = q->queuedata; 1966 struct rbd_device *rbd_dev = q->queuedata;
1967 bool read_only = rbd_dev->mapping.read_only;
1542 struct request *rq; 1968 struct request *rq;
1969 int result;
1543 1970
1544 while ((rq = blk_fetch_request(q))) { 1971 while ((rq = blk_fetch_request(q))) {
1545 struct bio *bio; 1972 bool write_request = rq_data_dir(rq) == WRITE;
1546 bool do_write; 1973 struct rbd_img_request *img_request;
1547 unsigned int size; 1974 u64 offset;
1548 u64 ofs; 1975 u64 length;
1549 int num_segs, cur_seg = 0; 1976
1550 struct rbd_req_coll *coll; 1977 /* Ignore any non-FS requests that filter through. */
1551 struct ceph_snap_context *snapc;
1552 unsigned int bio_offset;
1553
1554 dout("fetched request\n");
1555
1556 /* filter out block requests we don't understand */
1557 if ((rq->cmd_type != REQ_TYPE_FS)) {
1558 __blk_end_request_all(rq, 0);
1559 continue;
1560 }
1561 1978
1562 /* deduce our operation (read, write) */ 1979 if (rq->cmd_type != REQ_TYPE_FS) {
1563 do_write = (rq_data_dir(rq) == WRITE); 1980 dout("%s: non-fs request type %d\n", __func__,
1564 if (do_write && rbd_dev->mapping.read_only) { 1981 (int) rq->cmd_type);
1565 __blk_end_request_all(rq, -EROFS); 1982 __blk_end_request_all(rq, 0);
1566 continue; 1983 continue;
1567 } 1984 }
1568 1985
1569 spin_unlock_irq(q->queue_lock); 1986 /* Ignore/skip any zero-length requests */
1570 1987
1571 down_read(&rbd_dev->header_rwsem); 1988 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
1989 length = (u64) blk_rq_bytes(rq);
1572 1990
1573 if (!rbd_dev->exists) { 1991 if (!length) {
1574 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 1992 dout("%s: zero-length request\n", __func__);
1575 up_read(&rbd_dev->header_rwsem); 1993 __blk_end_request_all(rq, 0);
1576 dout("request for non-existent snapshot");
1577 spin_lock_irq(q->queue_lock);
1578 __blk_end_request_all(rq, -ENXIO);
1579 continue; 1994 continue;
1580 } 1995 }
1581 1996
1582 snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1997 spin_unlock_irq(q->queue_lock);
1583
1584 up_read(&rbd_dev->header_rwsem);
1585
1586 size = blk_rq_bytes(rq);
1587 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1588 bio = rq->bio;
1589 1998
1590 dout("%s 0x%x bytes at 0x%llx\n", 1999 /* Disallow writes to a read-only device */
1591 do_write ? "write" : "read",
1592 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1593 2000
1594 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 2001 if (write_request) {
1595 if (num_segs <= 0) { 2002 result = -EROFS;
1596 spin_lock_irq(q->queue_lock); 2003 if (read_only)
1597 __blk_end_request_all(rq, num_segs); 2004 goto end_request;
1598 ceph_put_snap_context(snapc); 2005 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
1599 continue;
1600 } 2006 }
1601 coll = rbd_alloc_coll(num_segs);
1602 if (!coll) {
1603 spin_lock_irq(q->queue_lock);
1604 __blk_end_request_all(rq, -ENOMEM);
1605 ceph_put_snap_context(snapc);
1606 continue;
1607 }
1608
1609 bio_offset = 0;
1610 do {
1611 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1612 unsigned int chain_size;
1613 struct bio *bio_chain;
1614
1615 BUG_ON(limit > (u64) UINT_MAX);
1616 chain_size = (unsigned int) limit;
1617 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
1618 2007
1619 kref_get(&coll->kref); 2008 /*
2009 * Quit early if the mapped snapshot no longer
2010 * exists. It's still possible the snapshot will
2011 * have disappeared by the time our request arrives
2012 * at the osd, but there's no sense in sending it if
2013 * we already know.
2014 */
2015 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2016 dout("request for non-existent snapshot");
2017 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2018 result = -ENXIO;
2019 goto end_request;
2020 }
1620 2021
1621 /* Pass a cloned bio chain via an osd request */ 2022 result = -EINVAL;
2023 if (WARN_ON(offset && length > U64_MAX - offset + 1))
2024 goto end_request; /* Shouldn't happen */
1622 2025
1623 bio_chain = bio_chain_clone_range(&bio, 2026 result = -ENOMEM;
1624 &bio_offset, chain_size, 2027 img_request = rbd_img_request_create(rbd_dev, offset, length,
1625 GFP_ATOMIC); 2028 write_request);
1626 if (bio_chain) 2029 if (!img_request)
1627 (void) rbd_do_op(rq, rbd_dev, snapc, 2030 goto end_request;
1628 ofs, chain_size,
1629 bio_chain, coll, cur_seg);
1630 else
1631 rbd_coll_end_req_index(rq, coll, cur_seg,
1632 -ENOMEM, chain_size);
1633 size -= chain_size;
1634 ofs += chain_size;
1635 2031
1636 cur_seg++; 2032 img_request->rq = rq;
1637 } while (size > 0);
1638 kref_put(&coll->kref, rbd_coll_release);
1639 2033
2034 result = rbd_img_request_fill_bio(img_request, rq->bio);
2035 if (!result)
2036 result = rbd_img_request_submit(img_request);
2037 if (result)
2038 rbd_img_request_put(img_request);
2039end_request:
1640 spin_lock_irq(q->queue_lock); 2040 spin_lock_irq(q->queue_lock);
1641 2041 if (result < 0) {
1642 ceph_put_snap_context(snapc); 2042 rbd_warn(rbd_dev, "obj_request %s result %d\n",
2043 write_request ? "write" : "read", result);
2044 __blk_end_request_all(rq, result);
2045 }
1643 } 2046 }
1644} 2047}
1645 2048
@@ -1703,6 +2106,71 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
1703 put_disk(disk); 2106 put_disk(disk);
1704} 2107}
1705 2108
2109static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2110 const char *object_name,
2111 u64 offset, u64 length,
2112 char *buf, u64 *version)
2113
2114{
2115 struct ceph_osd_req_op *op;
2116 struct rbd_obj_request *obj_request;
2117 struct ceph_osd_client *osdc;
2118 struct page **pages = NULL;
2119 u32 page_count;
2120 size_t size;
2121 int ret;
2122
2123 page_count = (u32) calc_pages_for(offset, length);
2124 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2125 if (IS_ERR(pages))
2126 ret = PTR_ERR(pages);
2127
2128 ret = -ENOMEM;
2129 obj_request = rbd_obj_request_create(object_name, offset, length,
2130 OBJ_REQUEST_PAGES);
2131 if (!obj_request)
2132 goto out;
2133
2134 obj_request->pages = pages;
2135 obj_request->page_count = page_count;
2136
2137 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2138 if (!op)
2139 goto out;
2140 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2141 obj_request, op);
2142 rbd_osd_req_op_destroy(op);
2143 if (!obj_request->osd_req)
2144 goto out;
2145
2146 osdc = &rbd_dev->rbd_client->client->osdc;
2147 ret = rbd_obj_request_submit(osdc, obj_request);
2148 if (ret)
2149 goto out;
2150 ret = rbd_obj_request_wait(obj_request);
2151 if (ret)
2152 goto out;
2153
2154 ret = obj_request->result;
2155 if (ret < 0)
2156 goto out;
2157
2158 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2159 size = (size_t) obj_request->xferred;
2160 ceph_copy_from_page_vector(pages, buf, 0, size);
2161 rbd_assert(size <= (size_t) INT_MAX);
2162 ret = (int) size;
2163 if (version)
2164 *version = obj_request->version;
2165out:
2166 if (obj_request)
2167 rbd_obj_request_put(obj_request);
2168 else
2169 ceph_release_page_vector(pages, page_count);
2170
2171 return ret;
2172}
2173
1706/* 2174/*
1707 * Read the complete header for the given rbd device. 2175 * Read the complete header for the given rbd device.
1708 * 2176 *
@@ -1741,24 +2209,20 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1741 if (!ondisk) 2209 if (!ondisk)
1742 return ERR_PTR(-ENOMEM); 2210 return ERR_PTR(-ENOMEM);
1743 2211
1744 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, 2212 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
1745 rbd_dev->header_name,
1746 0, size, 2213 0, size,
1747 (char *) ondisk, version); 2214 (char *) ondisk, version);
1748
1749 if (ret < 0) 2215 if (ret < 0)
1750 goto out_err; 2216 goto out_err;
1751 if (WARN_ON((size_t) ret < size)) { 2217 if (WARN_ON((size_t) ret < size)) {
1752 ret = -ENXIO; 2218 ret = -ENXIO;
1753 pr_warning("short header read for image %s" 2219 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
1754 " (want %zd got %d)\n", 2220 size, ret);
1755 rbd_dev->spec->image_name, size, ret);
1756 goto out_err; 2221 goto out_err;
1757 } 2222 }
1758 if (!rbd_dev_ondisk_valid(ondisk)) { 2223 if (!rbd_dev_ondisk_valid(ondisk)) {
1759 ret = -ENXIO; 2224 ret = -ENXIO;
1760 pr_warning("invalid header for image %s\n", 2225 rbd_warn(rbd_dev, "invalid header");
1761 rbd_dev->spec->image_name);
1762 goto out_err; 2226 goto out_err;
1763 } 2227 }
1764 2228
@@ -1895,8 +2359,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
1895 disk->fops = &rbd_bd_ops; 2359 disk->fops = &rbd_bd_ops;
1896 disk->private_data = rbd_dev; 2360 disk->private_data = rbd_dev;
1897 2361
1898 /* init rq */ 2362 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
1899 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1900 if (!q) 2363 if (!q)
1901 goto out_disk; 2364 goto out_disk;
1902 2365
@@ -2233,7 +2696,7 @@ static void rbd_spec_free(struct kref *kref)
2233 kfree(spec); 2696 kfree(spec);
2234} 2697}
2235 2698
2236struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 2699static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2237 struct rbd_spec *spec) 2700 struct rbd_spec *spec)
2238{ 2701{
2239 struct rbd_device *rbd_dev; 2702 struct rbd_device *rbd_dev;
@@ -2243,6 +2706,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2243 return NULL; 2706 return NULL;
2244 2707
2245 spin_lock_init(&rbd_dev->lock); 2708 spin_lock_init(&rbd_dev->lock);
2709 rbd_dev->flags = 0;
2246 INIT_LIST_HEAD(&rbd_dev->node); 2710 INIT_LIST_HEAD(&rbd_dev->node);
2247 INIT_LIST_HEAD(&rbd_dev->snaps); 2711 INIT_LIST_HEAD(&rbd_dev->snaps);
2248 init_rwsem(&rbd_dev->header_rwsem); 2712 init_rwsem(&rbd_dev->header_rwsem);
@@ -2250,6 +2714,13 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2250 rbd_dev->spec = spec; 2714 rbd_dev->spec = spec;
2251 rbd_dev->rbd_client = rbdc; 2715 rbd_dev->rbd_client = rbdc;
2252 2716
2717 /* Initialize the layout used for all rbd requests */
2718
2719 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2720 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2721 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2722 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2723
2253 return rbd_dev; 2724 return rbd_dev;
2254} 2725}
2255 2726
@@ -2360,12 +2831,11 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2360 __le64 size; 2831 __le64 size;
2361 } __attribute__ ((packed)) size_buf = { 0 }; 2832 } __attribute__ ((packed)) size_buf = { 0 };
2362 2833
2363 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2834 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2364 "rbd", "get_size", 2835 "rbd", "get_size",
2365 (char *) &snapid, sizeof (snapid), 2836 (char *) &snapid, sizeof (snapid),
2366 (char *) &size_buf, sizeof (size_buf), 2837 (char *) &size_buf, sizeof (size_buf), NULL);
2367 CEPH_OSD_FLAG_READ, NULL); 2838 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2368 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2369 if (ret < 0) 2839 if (ret < 0)
2370 return ret; 2840 return ret;
2371 2841
@@ -2396,15 +2866,13 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2396 if (!reply_buf) 2866 if (!reply_buf)
2397 return -ENOMEM; 2867 return -ENOMEM;
2398 2868
2399 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2869 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2400 "rbd", "get_object_prefix", 2870 "rbd", "get_object_prefix",
2401 NULL, 0, 2871 NULL, 0,
2402 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, 2872 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
2403 CEPH_OSD_FLAG_READ, NULL); 2873 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2404 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2405 if (ret < 0) 2874 if (ret < 0)
2406 goto out; 2875 goto out;
2407 ret = 0; /* rbd_req_sync_exec() can return positive */
2408 2876
2409 p = reply_buf; 2877 p = reply_buf;
2410 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 2878 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
@@ -2435,12 +2903,12 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2435 u64 incompat; 2903 u64 incompat;
2436 int ret; 2904 int ret;
2437 2905
2438 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2906 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2439 "rbd", "get_features", 2907 "rbd", "get_features",
2440 (char *) &snapid, sizeof (snapid), 2908 (char *) &snapid, sizeof (snapid),
2441 (char *) &features_buf, sizeof (features_buf), 2909 (char *) &features_buf, sizeof (features_buf),
2442 CEPH_OSD_FLAG_READ, NULL); 2910 NULL);
2443 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2911 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2444 if (ret < 0) 2912 if (ret < 0)
2445 return ret; 2913 return ret;
2446 2914
@@ -2474,7 +2942,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2474 void *end; 2942 void *end;
2475 char *image_id; 2943 char *image_id;
2476 u64 overlap; 2944 u64 overlap;
2477 size_t len = 0;
2478 int ret; 2945 int ret;
2479 2946
2480 parent_spec = rbd_spec_alloc(); 2947 parent_spec = rbd_spec_alloc();
@@ -2492,12 +2959,11 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2492 } 2959 }
2493 2960
2494 snapid = cpu_to_le64(CEPH_NOSNAP); 2961 snapid = cpu_to_le64(CEPH_NOSNAP);
2495 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2962 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2496 "rbd", "get_parent", 2963 "rbd", "get_parent",
2497 (char *) &snapid, sizeof (snapid), 2964 (char *) &snapid, sizeof (snapid),
2498 (char *) reply_buf, size, 2965 (char *) reply_buf, size, NULL);
2499 CEPH_OSD_FLAG_READ, NULL); 2966 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2500 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2501 if (ret < 0) 2967 if (ret < 0)
2502 goto out_err; 2968 goto out_err;
2503 2969
@@ -2508,13 +2974,18 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2508 if (parent_spec->pool_id == CEPH_NOPOOL) 2974 if (parent_spec->pool_id == CEPH_NOPOOL)
2509 goto out; /* No parent? No problem. */ 2975 goto out; /* No parent? No problem. */
2510 2976
2511 image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 2977 /* The ceph file layout needs to fit pool id in 32 bits */
2978
2979 ret = -EIO;
2980 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2981 goto out;
2982
2983 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2512 if (IS_ERR(image_id)) { 2984 if (IS_ERR(image_id)) {
2513 ret = PTR_ERR(image_id); 2985 ret = PTR_ERR(image_id);
2514 goto out_err; 2986 goto out_err;
2515 } 2987 }
2516 parent_spec->image_id = image_id; 2988 parent_spec->image_id = image_id;
2517 parent_spec->image_id_len = len;
2518 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 2989 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2519 ceph_decode_64_safe(&p, end, overlap, out_err); 2990 ceph_decode_64_safe(&p, end, overlap, out_err);
2520 2991
@@ -2544,26 +3015,25 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2544 3015
2545 rbd_assert(!rbd_dev->spec->image_name); 3016 rbd_assert(!rbd_dev->spec->image_name);
2546 3017
2547 image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len; 3018 len = strlen(rbd_dev->spec->image_id);
3019 image_id_size = sizeof (__le32) + len;
2548 image_id = kmalloc(image_id_size, GFP_KERNEL); 3020 image_id = kmalloc(image_id_size, GFP_KERNEL);
2549 if (!image_id) 3021 if (!image_id)
2550 return NULL; 3022 return NULL;
2551 3023
2552 p = image_id; 3024 p = image_id;
2553 end = (char *) image_id + image_id_size; 3025 end = (char *) image_id + image_id_size;
2554 ceph_encode_string(&p, end, rbd_dev->spec->image_id, 3026 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
2555 (u32) rbd_dev->spec->image_id_len);
2556 3027
2557 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 3028 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2558 reply_buf = kmalloc(size, GFP_KERNEL); 3029 reply_buf = kmalloc(size, GFP_KERNEL);
2559 if (!reply_buf) 3030 if (!reply_buf)
2560 goto out; 3031 goto out;
2561 3032
2562 ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, 3033 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
2563 "rbd", "dir_get_name", 3034 "rbd", "dir_get_name",
2564 image_id, image_id_size, 3035 image_id, image_id_size,
2565 (char *) reply_buf, size, 3036 (char *) reply_buf, size, NULL);
2566 CEPH_OSD_FLAG_READ, NULL);
2567 if (ret < 0) 3037 if (ret < 0)
2568 goto out; 3038 goto out;
2569 p = reply_buf; 3039 p = reply_buf;
@@ -2602,8 +3072,11 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2602 3072
2603 osdc = &rbd_dev->rbd_client->client->osdc; 3073 osdc = &rbd_dev->rbd_client->client->osdc;
2604 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); 3074 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2605 if (!name) 3075 if (!name) {
2606 return -EIO; /* pool id too large (>= 2^31) */ 3076 rbd_warn(rbd_dev, "there is no pool with id %llu",
3077 rbd_dev->spec->pool_id); /* Really a BUG() */
3078 return -EIO;
3079 }
2607 3080
2608 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); 3081 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2609 if (!rbd_dev->spec->pool_name) 3082 if (!rbd_dev->spec->pool_name)
@@ -2612,19 +3085,17 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2612 /* Fetch the image name; tolerate failure here */ 3085 /* Fetch the image name; tolerate failure here */
2613 3086
2614 name = rbd_dev_image_name(rbd_dev); 3087 name = rbd_dev_image_name(rbd_dev);
2615 if (name) { 3088 if (name)
2616 rbd_dev->spec->image_name_len = strlen(name);
2617 rbd_dev->spec->image_name = (char *) name; 3089 rbd_dev->spec->image_name = (char *) name;
2618 } else { 3090 else
2619 pr_warning(RBD_DRV_NAME "%d " 3091 rbd_warn(rbd_dev, "unable to get image name");
2620 "unable to get image name for image id %s\n",
2621 rbd_dev->major, rbd_dev->spec->image_id);
2622 }
2623 3092
2624 /* Look up the snapshot name. */ 3093 /* Look up the snapshot name. */
2625 3094
2626 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); 3095 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2627 if (!name) { 3096 if (!name) {
3097 rbd_warn(rbd_dev, "no snapshot with id %llu",
3098 rbd_dev->spec->snap_id); /* Really a BUG() */
2628 ret = -EIO; 3099 ret = -EIO;
2629 goto out_err; 3100 goto out_err;
2630 } 3101 }
@@ -2665,12 +3136,11 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
2665 if (!reply_buf) 3136 if (!reply_buf)
2666 return -ENOMEM; 3137 return -ENOMEM;
2667 3138
2668 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 3139 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2669 "rbd", "get_snapcontext", 3140 "rbd", "get_snapcontext",
2670 NULL, 0, 3141 NULL, 0,
2671 reply_buf, size, 3142 reply_buf, size, ver);
2672 CEPH_OSD_FLAG_READ, ver); 3143 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2673 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2674 if (ret < 0) 3144 if (ret < 0)
2675 goto out; 3145 goto out;
2676 3146
@@ -2735,12 +3205,11 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2735 return ERR_PTR(-ENOMEM); 3205 return ERR_PTR(-ENOMEM);
2736 3206
2737 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 3207 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2738 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 3208 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2739 "rbd", "get_snapshot_name", 3209 "rbd", "get_snapshot_name",
2740 (char *) &snap_id, sizeof (snap_id), 3210 (char *) &snap_id, sizeof (snap_id),
2741 reply_buf, size, 3211 reply_buf, size, NULL);
2742 CEPH_OSD_FLAG_READ, NULL); 3212 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2743 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2744 if (ret < 0) 3213 if (ret < 0)
2745 goto out; 3214 goto out;
2746 3215
@@ -2766,7 +3235,7 @@ out:
2766static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 3235static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2767 u64 *snap_size, u64 *snap_features) 3236 u64 *snap_size, u64 *snap_features)
2768{ 3237{
2769 __le64 snap_id; 3238 u64 snap_id;
2770 u8 order; 3239 u8 order;
2771 int ret; 3240 int ret;
2772 3241
@@ -2865,10 +3334,17 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2865 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 3334 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2866 struct list_head *next = links->next; 3335 struct list_head *next = links->next;
2867 3336
2868 /* Existing snapshot not in the new snap context */ 3337 /*
2869 3338 * A previously-existing snapshot is not in
3339 * the new snap context.
3340 *
3341 * If the now missing snapshot is the one the
3342 * image is mapped to, clear its exists flag
3343 * so we can avoid sending any more requests
3344 * to it.
3345 */
2870 if (rbd_dev->spec->snap_id == snap->id) 3346 if (rbd_dev->spec->snap_id == snap->id)
2871 rbd_dev->exists = false; 3347 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
2872 rbd_remove_snap_dev(snap); 3348 rbd_remove_snap_dev(snap);
2873 dout("%ssnap id %llu has been removed\n", 3349 dout("%ssnap id %llu has been removed\n",
2874 rbd_dev->spec->snap_id == snap->id ? 3350 rbd_dev->spec->snap_id == snap->id ?
@@ -2942,7 +3418,7 @@ static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2942 struct rbd_snap *snap; 3418 struct rbd_snap *snap;
2943 int ret = 0; 3419 int ret = 0;
2944 3420
2945 dout("%s called\n", __func__); 3421 dout("%s:\n", __func__);
2946 if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 3422 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2947 return -EIO; 3423 return -EIO;
2948 3424
@@ -2983,22 +3459,6 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2983 device_unregister(&rbd_dev->dev); 3459 device_unregister(&rbd_dev->dev);
2984} 3460}
2985 3461
2986static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2987{
2988 int ret, rc;
2989
2990 do {
2991 ret = rbd_req_sync_watch(rbd_dev);
2992 if (ret == -ERANGE) {
2993 rc = rbd_dev_refresh(rbd_dev, NULL);
2994 if (rc < 0)
2995 return rc;
2996 }
2997 } while (ret == -ERANGE);
2998
2999 return ret;
3000}
3001
3002static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 3462static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
3003 3463
3004/* 3464/*
@@ -3138,11 +3598,9 @@ static inline char *dup_token(const char **buf, size_t *lenp)
3138 size_t len; 3598 size_t len;
3139 3599
3140 len = next_token(buf); 3600 len = next_token(buf);
3141 dup = kmalloc(len + 1, GFP_KERNEL); 3601 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3142 if (!dup) 3602 if (!dup)
3143 return NULL; 3603 return NULL;
3144
3145 memcpy(dup, *buf, len);
3146 *(dup + len) = '\0'; 3604 *(dup + len) = '\0';
3147 *buf += len; 3605 *buf += len;
3148 3606
@@ -3210,8 +3668,10 @@ static int rbd_add_parse_args(const char *buf,
3210 /* The first four tokens are required */ 3668 /* The first four tokens are required */
3211 3669
3212 len = next_token(&buf); 3670 len = next_token(&buf);
3213 if (!len) 3671 if (!len) {
3214 return -EINVAL; /* Missing monitor address(es) */ 3672 rbd_warn(NULL, "no monitor address(es) provided");
3673 return -EINVAL;
3674 }
3215 mon_addrs = buf; 3675 mon_addrs = buf;
3216 mon_addrs_size = len + 1; 3676 mon_addrs_size = len + 1;
3217 buf += len; 3677 buf += len;
@@ -3220,8 +3680,10 @@ static int rbd_add_parse_args(const char *buf,
3220 options = dup_token(&buf, NULL); 3680 options = dup_token(&buf, NULL);
3221 if (!options) 3681 if (!options)
3222 return -ENOMEM; 3682 return -ENOMEM;
3223 if (!*options) 3683 if (!*options) {
3224 goto out_err; /* Missing options */ 3684 rbd_warn(NULL, "no options provided");
3685 goto out_err;
3686 }
3225 3687
3226 spec = rbd_spec_alloc(); 3688 spec = rbd_spec_alloc();
3227 if (!spec) 3689 if (!spec)
@@ -3230,14 +3692,18 @@ static int rbd_add_parse_args(const char *buf,
3230 spec->pool_name = dup_token(&buf, NULL); 3692 spec->pool_name = dup_token(&buf, NULL);
3231 if (!spec->pool_name) 3693 if (!spec->pool_name)
3232 goto out_mem; 3694 goto out_mem;
3233 if (!*spec->pool_name) 3695 if (!*spec->pool_name) {
3234 goto out_err; /* Missing pool name */ 3696 rbd_warn(NULL, "no pool name provided");
3697 goto out_err;
3698 }
3235 3699
3236 spec->image_name = dup_token(&buf, &spec->image_name_len); 3700 spec->image_name = dup_token(&buf, NULL);
3237 if (!spec->image_name) 3701 if (!spec->image_name)
3238 goto out_mem; 3702 goto out_mem;
3239 if (!*spec->image_name) 3703 if (!*spec->image_name) {
3240 goto out_err; /* Missing image name */ 3704 rbd_warn(NULL, "no image name provided");
3705 goto out_err;
3706 }
3241 3707
3242 /* 3708 /*
3243 * Snapshot name is optional; default is to use "-" 3709 * Snapshot name is optional; default is to use "-"
@@ -3251,10 +3717,9 @@ static int rbd_add_parse_args(const char *buf,
3251 ret = -ENAMETOOLONG; 3717 ret = -ENAMETOOLONG;
3252 goto out_err; 3718 goto out_err;
3253 } 3719 }
3254 spec->snap_name = kmalloc(len + 1, GFP_KERNEL); 3720 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3255 if (!spec->snap_name) 3721 if (!spec->snap_name)
3256 goto out_mem; 3722 goto out_mem;
3257 memcpy(spec->snap_name, buf, len);
3258 *(spec->snap_name + len) = '\0'; 3723 *(spec->snap_name + len) = '\0';
3259 3724
3260 /* Initialize all rbd options to the defaults */ 3725 /* Initialize all rbd options to the defaults */
@@ -3323,7 +3788,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3323 * First, see if the format 2 image id file exists, and if 3788 * First, see if the format 2 image id file exists, and if
3324 * so, get the image's persistent id from it. 3789 * so, get the image's persistent id from it.
3325 */ 3790 */
3326 size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len; 3791 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3327 object_name = kmalloc(size, GFP_NOIO); 3792 object_name = kmalloc(size, GFP_NOIO);
3328 if (!object_name) 3793 if (!object_name)
3329 return -ENOMEM; 3794 return -ENOMEM;
@@ -3339,21 +3804,18 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3339 goto out; 3804 goto out;
3340 } 3805 }
3341 3806
3342 ret = rbd_req_sync_exec(rbd_dev, object_name, 3807 ret = rbd_obj_method_sync(rbd_dev, object_name,
3343 "rbd", "get_id", 3808 "rbd", "get_id",
3344 NULL, 0, 3809 NULL, 0,
3345 response, RBD_IMAGE_ID_LEN_MAX, 3810 response, RBD_IMAGE_ID_LEN_MAX, NULL);
3346 CEPH_OSD_FLAG_READ, NULL); 3811 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3347 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3348 if (ret < 0) 3812 if (ret < 0)
3349 goto out; 3813 goto out;
3350 ret = 0; /* rbd_req_sync_exec() can return positive */
3351 3814
3352 p = response; 3815 p = response;
3353 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 3816 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3354 p + RBD_IMAGE_ID_LEN_MAX, 3817 p + RBD_IMAGE_ID_LEN_MAX,
3355 &rbd_dev->spec->image_id_len, 3818 NULL, GFP_NOIO);
3356 GFP_NOIO);
3357 if (IS_ERR(rbd_dev->spec->image_id)) { 3819 if (IS_ERR(rbd_dev->spec->image_id)) {
3358 ret = PTR_ERR(rbd_dev->spec->image_id); 3820 ret = PTR_ERR(rbd_dev->spec->image_id);
3359 rbd_dev->spec->image_id = NULL; 3821 rbd_dev->spec->image_id = NULL;
@@ -3377,11 +3839,10 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3377 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); 3839 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3378 if (!rbd_dev->spec->image_id) 3840 if (!rbd_dev->spec->image_id)
3379 return -ENOMEM; 3841 return -ENOMEM;
3380 rbd_dev->spec->image_id_len = 0;
3381 3842
3382 /* Record the header object name for this rbd image. */ 3843 /* Record the header object name for this rbd image. */
3383 3844
3384 size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX); 3845 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3385 rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3846 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3386 if (!rbd_dev->header_name) { 3847 if (!rbd_dev->header_name) {
3387 ret = -ENOMEM; 3848 ret = -ENOMEM;
@@ -3427,7 +3888,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3427 * Image id was filled in by the caller. Record the header 3888 * Image id was filled in by the caller. Record the header
3428 * object name for this rbd image. 3889 * object name for this rbd image.
3429 */ 3890 */
3430 size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len; 3891 size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3431 rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3892 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3432 if (!rbd_dev->header_name) 3893 if (!rbd_dev->header_name)
3433 return -ENOMEM; 3894 return -ENOMEM;
@@ -3542,7 +4003,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3542 if (ret) 4003 if (ret)
3543 goto err_out_bus; 4004 goto err_out_bus;
3544 4005
3545 ret = rbd_init_watch_dev(rbd_dev); 4006 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
3546 if (ret) 4007 if (ret)
3547 goto err_out_bus; 4008 goto err_out_bus;
3548 4009
@@ -3638,6 +4099,13 @@ static ssize_t rbd_add(struct bus_type *bus,
3638 goto err_out_client; 4099 goto err_out_client;
3639 spec->pool_id = (u64) rc; 4100 spec->pool_id = (u64) rc;
3640 4101
4102 /* The ceph file layout needs to fit pool id in 32 bits */
4103
4104 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4105 rc = -EIO;
4106 goto err_out_client;
4107 }
4108
3641 rbd_dev = rbd_dev_create(rbdc, spec); 4109 rbd_dev = rbd_dev_create(rbdc, spec);
3642 if (!rbd_dev) 4110 if (!rbd_dev)
3643 goto err_out_client; 4111 goto err_out_client;
@@ -3691,15 +4159,8 @@ static void rbd_dev_release(struct device *dev)
3691{ 4159{
3692 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4160 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3693 4161
3694 if (rbd_dev->watch_request) {
3695 struct ceph_client *client = rbd_dev->rbd_client->client;
3696
3697 ceph_osdc_unregister_linger_request(&client->osdc,
3698 rbd_dev->watch_request);
3699 }
3700 if (rbd_dev->watch_event) 4162 if (rbd_dev->watch_event)
3701 rbd_req_sync_unwatch(rbd_dev); 4163 rbd_dev_header_watch_sync(rbd_dev, 0);
3702
3703 4164
3704 /* clean up and free blkdev */ 4165 /* clean up and free blkdev */
3705 rbd_free_disk(rbd_dev); 4166 rbd_free_disk(rbd_dev);
@@ -3743,10 +4204,14 @@ static ssize_t rbd_remove(struct bus_type *bus,
3743 goto done; 4204 goto done;
3744 } 4205 }
3745 4206
3746 if (rbd_dev->open_count) { 4207 spin_lock_irq(&rbd_dev->lock);
4208 if (rbd_dev->open_count)
3747 ret = -EBUSY; 4209 ret = -EBUSY;
4210 else
4211 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4212 spin_unlock_irq(&rbd_dev->lock);
4213 if (ret < 0)
3748 goto done; 4214 goto done;
3749 }
3750 4215
3751 rbd_remove_all_snaps(rbd_dev); 4216 rbd_remove_all_snaps(rbd_dev);
3752 rbd_bus_del_dev(rbd_dev); 4217 rbd_bus_del_dev(rbd_dev);
@@ -3782,10 +4247,15 @@ static void rbd_sysfs_cleanup(void)
3782 device_unregister(&rbd_root_dev); 4247 device_unregister(&rbd_root_dev);
3783} 4248}
3784 4249
3785int __init rbd_init(void) 4250static int __init rbd_init(void)
3786{ 4251{
3787 int rc; 4252 int rc;
3788 4253
4254 if (!libceph_compatible(NULL)) {
4255 rbd_warn(NULL, "libceph incompatibility (quitting)");
4256
4257 return -EINVAL;
4258 }
3789 rc = rbd_sysfs_init(); 4259 rc = rbd_sysfs_init();
3790 if (rc) 4260 if (rc)
3791 return rc; 4261 return rc;
@@ -3793,7 +4263,7 @@ int __init rbd_init(void)
3793 return 0; 4263 return 0;
3794} 4264}
3795 4265
3796void __exit rbd_exit(void) 4266static void __exit rbd_exit(void)
3797{ 4267{
3798 rbd_sysfs_cleanup(); 4268 rbd_sysfs_cleanup();
3799} 4269}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d4f81edd9a5d..a60ea977af6f 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page)
236static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) 236static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
237{ 237{
238 struct inode *inode = req->r_inode; 238 struct inode *inode = req->r_inode;
239 struct ceph_osd_reply_head *replyhead; 239 int rc = req->r_result;
240 int rc, bytes; 240 int bytes = le32_to_cpu(msg->hdr.data_len);
241 int i; 241 int i;
242 242
243 /* parse reply */
244 replyhead = msg->front.iov_base;
245 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
246 rc = le32_to_cpu(replyhead->result);
247 bytes = le32_to_cpu(msg->hdr.data_len);
248
249 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); 243 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
250 244
251 /* unlock all pages, zeroing any data we didn't read */ 245 /* unlock all pages, zeroing any data we didn't read */
@@ -315,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
315 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 309 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
316 NULL, 0, 310 NULL, 0,
317 ci->i_truncate_seq, ci->i_truncate_size, 311 ci->i_truncate_seq, ci->i_truncate_size,
318 NULL, false, 1, 0); 312 NULL, false, 0);
319 if (IS_ERR(req)) 313 if (IS_ERR(req))
320 return PTR_ERR(req); 314 return PTR_ERR(req);
321 315
@@ -492,8 +486,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
492 &ci->i_layout, snapc, 486 &ci->i_layout, snapc,
493 page_off, len, 487 page_off, len,
494 ci->i_truncate_seq, ci->i_truncate_size, 488 ci->i_truncate_seq, ci->i_truncate_size,
495 &inode->i_mtime, 489 &inode->i_mtime, &page, 1);
496 &page, 1, 0, 0, true);
497 if (err < 0) { 490 if (err < 0) {
498 dout("writepage setting page/mapping error %d %p\n", err, page); 491 dout("writepage setting page/mapping error %d %p\n", err, page);
499 SetPageError(page); 492 SetPageError(page);
@@ -554,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req,
554 struct ceph_msg *msg) 547 struct ceph_msg *msg)
555{ 548{
556 struct inode *inode = req->r_inode; 549 struct inode *inode = req->r_inode;
557 struct ceph_osd_reply_head *replyhead;
558 struct ceph_osd_op *op;
559 struct ceph_inode_info *ci = ceph_inode(inode); 550 struct ceph_inode_info *ci = ceph_inode(inode);
560 unsigned wrote; 551 unsigned wrote;
561 struct page *page; 552 struct page *page;
562 int i; 553 int i;
563 struct ceph_snap_context *snapc = req->r_snapc; 554 struct ceph_snap_context *snapc = req->r_snapc;
564 struct address_space *mapping = inode->i_mapping; 555 struct address_space *mapping = inode->i_mapping;
565 __s32 rc = -EIO; 556 int rc = req->r_result;
566 u64 bytes = 0; 557 u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);
567 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 558 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
568 long writeback_stat; 559 long writeback_stat;
569 unsigned issued = ceph_caps_issued(ci); 560 unsigned issued = ceph_caps_issued(ci);
570 561
571 /* parse reply */
572 replyhead = msg->front.iov_base;
573 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
574 op = (void *)(replyhead + 1);
575 rc = le32_to_cpu(replyhead->result);
576 bytes = le64_to_cpu(op->extent.length);
577
578 if (rc >= 0) { 562 if (rc >= 0) {
579 /* 563 /*
580 * Assume we wrote the pages we originally sent. The 564 * Assume we wrote the pages we originally sent. The
@@ -741,8 +725,6 @@ retry:
741 struct page *page; 725 struct page *page;
742 int want; 726 int want;
743 u64 offset, len; 727 u64 offset, len;
744 struct ceph_osd_request_head *reqhead;
745 struct ceph_osd_op *op;
746 long writeback_stat; 728 long writeback_stat;
747 729
748 next = 0; 730 next = 0;
@@ -838,7 +820,7 @@ get_more_pages:
838 snapc, do_sync, 820 snapc, do_sync,
839 ci->i_truncate_seq, 821 ci->i_truncate_seq,
840 ci->i_truncate_size, 822 ci->i_truncate_size,
841 &inode->i_mtime, true, 1, 0); 823 &inode->i_mtime, true, 0);
842 824
843 if (IS_ERR(req)) { 825 if (IS_ERR(req)) {
844 rc = PTR_ERR(req); 826 rc = PTR_ERR(req);
@@ -906,10 +888,8 @@ get_more_pages:
906 888
907 /* revise final length, page count */ 889 /* revise final length, page count */
908 req->r_num_pages = locked_pages; 890 req->r_num_pages = locked_pages;
909 reqhead = req->r_request->front.iov_base; 891 req->r_request_ops[0].extent.length = cpu_to_le64(len);
910 op = (void *)(reqhead + 1); 892 req->r_request_ops[0].payload_len = cpu_to_le32(len);
911 op->extent.length = cpu_to_le64(len);
912 op->payload_len = cpu_to_le32(len);
913 req->r_request->hdr.data_len = cpu_to_le32(len); 893 req->r_request->hdr.data_len = cpu_to_le32(len);
914 894
915 rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); 895 rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ae2be696eb5b..78e2f575247d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -611,8 +611,16 @@ retry:
611 611
612 if (flags & CEPH_CAP_FLAG_AUTH) 612 if (flags & CEPH_CAP_FLAG_AUTH)
613 ci->i_auth_cap = cap; 613 ci->i_auth_cap = cap;
614 else if (ci->i_auth_cap == cap) 614 else if (ci->i_auth_cap == cap) {
615 ci->i_auth_cap = NULL; 615 ci->i_auth_cap = NULL;
616 spin_lock(&mdsc->cap_dirty_lock);
617 if (!list_empty(&ci->i_dirty_item)) {
618 dout(" moving %p to cap_dirty_migrating\n", inode);
619 list_move(&ci->i_dirty_item,
620 &mdsc->cap_dirty_migrating);
621 }
622 spin_unlock(&mdsc->cap_dirty_lock);
623 }
616 624
617 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", 625 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
618 inode, ceph_vinop(inode), cap, ceph_cap_string(issued), 626 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
@@ -1460,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1460 struct ceph_mds_client *mdsc = fsc->mdsc; 1468 struct ceph_mds_client *mdsc = fsc->mdsc;
1461 struct inode *inode = &ci->vfs_inode; 1469 struct inode *inode = &ci->vfs_inode;
1462 struct ceph_cap *cap; 1470 struct ceph_cap *cap;
1463 int file_wanted, used; 1471 int file_wanted, used, cap_used;
1464 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ 1472 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1465 int issued, implemented, want, retain, revoking, flushing = 0; 1473 int issued, implemented, want, retain, revoking, flushing = 0;
1466 int mds = -1; /* keep track of how far we've gone through i_caps list 1474 int mds = -1; /* keep track of how far we've gone through i_caps list
@@ -1563,9 +1571,14 @@ retry_locked:
1563 1571
1564 /* NOTE: no side-effects allowed, until we take s_mutex */ 1572 /* NOTE: no side-effects allowed, until we take s_mutex */
1565 1573
1574 cap_used = used;
1575 if (ci->i_auth_cap && cap != ci->i_auth_cap)
1576 cap_used &= ~ci->i_auth_cap->issued;
1577
1566 revoking = cap->implemented & ~cap->issued; 1578 revoking = cap->implemented & ~cap->issued;
1567 dout(" mds%d cap %p issued %s implemented %s revoking %s\n", 1579 dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
1568 cap->mds, cap, ceph_cap_string(cap->issued), 1580 cap->mds, cap, ceph_cap_string(cap->issued),
1581 ceph_cap_string(cap_used),
1569 ceph_cap_string(cap->implemented), 1582 ceph_cap_string(cap->implemented),
1570 ceph_cap_string(revoking)); 1583 ceph_cap_string(revoking));
1571 1584
@@ -1593,7 +1606,7 @@ retry_locked:
1593 } 1606 }
1594 1607
1595 /* completed revocation? going down and there are no caps? */ 1608 /* completed revocation? going down and there are no caps? */
1596 if (revoking && (revoking & used) == 0) { 1609 if (revoking && (revoking & cap_used) == 0) {
1597 dout("completed revocation of %s\n", 1610 dout("completed revocation of %s\n",
1598 ceph_cap_string(cap->implemented & ~cap->issued)); 1611 ceph_cap_string(cap->implemented & ~cap->issued));
1599 goto ack; 1612 goto ack;
@@ -1670,8 +1683,8 @@ ack:
1670 sent++; 1683 sent++;
1671 1684
1672 /* __send_cap drops i_ceph_lock */ 1685 /* __send_cap drops i_ceph_lock */
1673 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, 1686 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
1674 retain, flushing, NULL); 1687 want, retain, flushing, NULL);
1675 goto retry; /* retake i_ceph_lock and restart our cap scan. */ 1688 goto retry; /* retake i_ceph_lock and restart our cap scan. */
1676 } 1689 }
1677 1690
@@ -2417,7 +2430,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2417 dout("mds wanted %s -> %s\n", 2430 dout("mds wanted %s -> %s\n",
2418 ceph_cap_string(le32_to_cpu(grant->wanted)), 2431 ceph_cap_string(le32_to_cpu(grant->wanted)),
2419 ceph_cap_string(wanted)); 2432 ceph_cap_string(wanted));
2420 grant->wanted = cpu_to_le32(wanted); 2433 /* imported cap may not have correct mds_wanted */
2434 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
2435 check_caps = 1;
2421 } 2436 }
2422 2437
2423 cap->seq = seq; 2438 cap->seq = seq;
@@ -2821,6 +2836,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2821 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 2836 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2822 (unsigned)seq); 2837 (unsigned)seq);
2823 2838
2839 if (op == CEPH_CAP_OP_IMPORT)
2840 ceph_add_cap_releases(mdsc, session);
2841
2824 /* lookup ino */ 2842 /* lookup ino */
2825 inode = ceph_find_inode(sb, vino); 2843 inode = ceph_find_inode(sb, vino);
2826 ci = ceph_inode(inode); 2844 ci = ceph_inode(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 11b57c2c8f15..bf338d9b67e3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
243 err = ceph_mdsc_do_request(mdsc, 243 err = ceph_mdsc_do_request(mdsc,
244 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 244 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
245 req); 245 req);
246 if (err)
247 goto out_err;
248
246 err = ceph_handle_snapdir(req, dentry, err); 249 err = ceph_handle_snapdir(req, dentry, err);
247 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 250 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
248 err = ceph_handle_notrace_create(dir, dentry); 251 err = ceph_handle_notrace_create(dir, dentry);
@@ -263,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
263 err = finish_no_open(file, dn); 266 err = finish_no_open(file, dn);
264 } else { 267 } else {
265 dout("atomic_open finish_open on dn %p\n", dn); 268 dout("atomic_open finish_open on dn %p\n", dn);
269 if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
270 *opened |= FILE_CREATED;
271 }
266 err = finish_open(file, dentry, ceph_open, opened); 272 err = finish_open(file, dentry, ceph_open, opened);
267 } 273 }
268 274
@@ -535,7 +541,7 @@ more:
535 ci->i_snap_realm->cached_context, 541 ci->i_snap_realm->cached_context,
536 do_sync, 542 do_sync,
537 ci->i_truncate_seq, ci->i_truncate_size, 543 ci->i_truncate_seq, ci->i_truncate_size,
538 &mtime, false, 2, page_align); 544 &mtime, false, page_align);
539 if (IS_ERR(req)) 545 if (IS_ERR(req))
540 return PTR_ERR(req); 546 return PTR_ERR(req);
541 547
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index f5ed767806df..4a989345b37b 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -185,7 +185,6 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
185 &ceph_sb_to_client(inode->i_sb)->client->osdc; 185 &ceph_sb_to_client(inode->i_sb)->client->osdc;
186 u64 len = 1, olen; 186 u64 len = 1, olen;
187 u64 tmp; 187 u64 tmp;
188 struct ceph_object_layout ol;
189 struct ceph_pg pgid; 188 struct ceph_pg pgid;
190 int r; 189 int r;
191 190
@@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
194 return -EFAULT; 193 return -EFAULT;
195 194
196 down_read(&osdc->map_sem); 195 down_read(&osdc->map_sem);
197 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, 196 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
198 &dl.object_no, &dl.object_offset, 197 &dl.object_no, &dl.object_offset,
199 &olen); 198 &olen);
200 if (r < 0) 199 if (r < 0)
@@ -209,10 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
209 208
210 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", 209 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
211 ceph_ino(inode), dl.object_no); 210 ceph_ino(inode), dl.object_no);
212 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, 211 ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout,
213 osdc->osdmap); 212 osdc->osdmap);
214 213
215 pgid = ol.ol_pgid;
216 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); 214 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
217 if (dl.osd >= 0) { 215 if (dl.osd >= 0) {
218 struct ceph_entity_addr *a = 216 struct ceph_entity_addr *a =
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7a3dfe0a9a80..442880d099c9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -233,6 +233,30 @@ bad:
233} 233}
234 234
235/* 235/*
236 * parse create results
237 */
238static int parse_reply_info_create(void **p, void *end,
239 struct ceph_mds_reply_info_parsed *info,
240 int features)
241{
242 if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
243 if (*p == end) {
244 info->has_create_ino = false;
245 } else {
246 info->has_create_ino = true;
247 info->ino = ceph_decode_64(p);
248 }
249 }
250
251 if (unlikely(*p != end))
252 goto bad;
253 return 0;
254
255bad:
256 return -EIO;
257}
258
259/*
236 * parse extra results 260 * parse extra results
237 */ 261 */
238static int parse_reply_info_extra(void **p, void *end, 262static int parse_reply_info_extra(void **p, void *end,
@@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end,
241{ 265{
242 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 266 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
243 return parse_reply_info_filelock(p, end, info, features); 267 return parse_reply_info_filelock(p, end, info, features);
244 else 268 else if (info->head->op == CEPH_MDS_OP_READDIR)
245 return parse_reply_info_dir(p, end, info, features); 269 return parse_reply_info_dir(p, end, info, features);
270 else if (info->head->op == CEPH_MDS_OP_CREATE)
271 return parse_reply_info_create(p, end, info, features);
272 else
273 return -EIO;
246} 274}
247 275
248/* 276/*
@@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2170 mutex_lock(&req->r_fill_mutex); 2198 mutex_lock(&req->r_fill_mutex);
2171 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2199 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2172 if (err == 0) { 2200 if (err == 0) {
2173 if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && 2201 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
2202 req->r_op == CEPH_MDS_OP_LSSNAP) &&
2174 rinfo->dir_nr) 2203 rinfo->dir_nr)
2175 ceph_readdir_prepopulate(req, req->r_session); 2204 ceph_readdir_prepopulate(req, req->r_session);
2176 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2205 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ff4188bf6199..c2a19fbbe517 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed {
74 struct ceph_mds_reply_info_in *dir_in; 74 struct ceph_mds_reply_info_in *dir_in;
75 u8 dir_complete, dir_end; 75 u8 dir_complete, dir_end;
76 }; 76 };
77
78 /* for create results */
79 struct {
80 bool has_create_ino;
81 u64 ino;
82 };
77 }; 83 };
78 84
79 /* encoded blob describing snapshot contexts for certain 85 /* encoded blob describing snapshot contexts for certain
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 73b7d44e8a35..0d3c9240c61b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
59 return ERR_PTR(-ENOMEM); 59 return ERR_PTR(-ENOMEM);
60 60
61 ceph_decode_16_safe(p, end, version, bad); 61 ceph_decode_16_safe(p, end, version, bad);
62 if (version > 3) {
63 pr_warning("got mdsmap version %d > 3, failing", version);
64 goto bad;
65 }
62 66
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); 67 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p); 68 m->m_epoch = ceph_decode_32(p);
@@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
144 /* pg_pools */ 148 /* pg_pools */
145 ceph_decode_32_safe(p, end, n, bad); 149 ceph_decode_32_safe(p, end, n, bad);
146 m->m_num_data_pg_pools = n; 150 m->m_num_data_pg_pools = n;
147 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); 151 m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
148 if (!m->m_data_pg_pools) 152 if (!m->m_data_pg_pools)
149 goto badmem; 153 goto badmem;
150 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); 154 ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
151 for (i = 0; i < n; i++) 155 for (i = 0; i < n; i++)
152 m->m_data_pg_pools[i] = ceph_decode_32(p); 156 m->m_data_pg_pools[i] = ceph_decode_64(p);
153 m->m_cas_pg_pool = ceph_decode_32(p); 157 m->m_cas_pg_pool = ceph_decode_64(p);
154 158
155 /* ok, we don't care about the rest. */ 159 /* ok, we don't care about the rest. */
156 dout("mdsmap_decode success epoch %u\n", m->m_epoch); 160 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index cd5097d7c804..89fa4a940a0f 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)
15 case CEPH_MDS_STATE_BOOT: return "up:boot"; 15 case CEPH_MDS_STATE_BOOT: return "up:boot";
16 case CEPH_MDS_STATE_STANDBY: return "up:standby"; 16 case CEPH_MDS_STATE_STANDBY: return "up:standby";
17 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; 17 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
18 case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
18 case CEPH_MDS_STATE_CREATING: return "up:creating"; 19 case CEPH_MDS_STATE_CREATING: return "up:creating";
19 case CEPH_MDS_STATE_STARTING: return "up:starting"; 20 case CEPH_MDS_STATE_STARTING: return "up:starting";
20 /* up and in */ 21 /* up and in */
@@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op)
50 case CEPH_MDS_OP_LOOKUP: return "lookup"; 51 case CEPH_MDS_OP_LOOKUP: return "lookup";
51 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; 52 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
52 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; 53 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
54 case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
53 case CEPH_MDS_OP_GETATTR: return "getattr"; 55 case CEPH_MDS_OP_GETATTR: return "getattr";
54 case CEPH_MDS_OP_SETXATTR: return "setxattr"; 56 case CEPH_MDS_OP_SETXATTR: return "setxattr";
55 case CEPH_MDS_OP_SETATTR: return "setattr"; 57 case CEPH_MDS_OP_SETATTR: return "setattr";
56 case CEPH_MDS_OP_RMXATTR: return "rmxattr"; 58 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
59 case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
60 case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
57 case CEPH_MDS_OP_READDIR: return "readdir"; 61 case CEPH_MDS_OP_READDIR: return "readdir";
58 case CEPH_MDS_OP_MKNOD: return "mknod"; 62 case CEPH_MDS_OP_MKNOD: return "mknod";
59 case CEPH_MDS_OP_LINK: return "link"; 63 case CEPH_MDS_OP_LINK: return "link";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e86aa9948124..9fe17c6c2876 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
71 /* 71 /*
72 * express utilization in terms of large blocks to avoid 72 * express utilization in terms of large blocks to avoid
73 * overflow on 32-bit machines. 73 * overflow on 32-bit machines.
74 *
75 * NOTE: for the time being, we make bsize == frsize to humor
76 * not-yet-ancient versions of glibc that are broken.
77 * Someday, we will probably want to report a real block
78 * size... whatever that may mean for a network file system!
74 */ 79 */
75 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; 80 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
81 buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
76 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); 82 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
77 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 83 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
78 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 84 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
@@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
80 buf->f_files = le64_to_cpu(st.num_objects); 86 buf->f_files = le64_to_cpu(st.num_objects);
81 buf->f_ffree = -1; 87 buf->f_ffree = -1;
82 buf->f_namelen = NAME_MAX; 88 buf->f_namelen = NAME_MAX;
83 buf->f_frsize = PAGE_CACHE_SIZE;
84 89
85 /* leave fsid little-endian, regardless of host endianness */ 90 /* leave fsid little-endian, regardless of host endianness */
86 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); 91 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f053bbd1886f..c7b309723dcc 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -21,7 +21,7 @@
21 21
22/* large granularity for statfs utilization stats to facilitate 22/* large granularity for statfs utilization stats to facilitate
23 * large volume sizes on 32-bit machines. */ 23 * large volume sizes on 32-bit machines. */
24#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ 24#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
25#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 25#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
26 26
27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
@@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
798/* file.c */ 798/* file.c */
799extern const struct file_operations ceph_file_fops; 799extern const struct file_operations ceph_file_fops;
800extern const struct address_space_operations ceph_aops; 800extern const struct address_space_operations ceph_aops;
801extern int ceph_copy_to_page_vector(struct page **pages, 801
802 const char *data,
803 loff_t off, size_t len);
804extern int ceph_copy_from_page_vector(struct page **pages,
805 char *data,
806 loff_t off, size_t len);
807extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
808extern int ceph_open(struct inode *inode, struct file *file); 802extern int ceph_open(struct inode *inode, struct file *file);
809extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, 803extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
810 struct file *file, unsigned flags, umode_t mode, 804 struct file *file, unsigned flags, umode_t mode,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2c2ae5be9902..9b6b2b6dd164 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -29,9 +29,94 @@ struct ceph_vxattr {
29 size_t name_size; /* strlen(name) + 1 (for '\0') */ 29 size_t name_size; /* strlen(name) + 1 (for '\0') */
30 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, 30 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
31 size_t size); 31 size_t size);
32 bool readonly; 32 bool readonly, hidden;
33 bool (*exists_cb)(struct ceph_inode_info *ci);
33}; 34};
34 35
36/* layouts */
37
38static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
39{
40 size_t s;
41 char *p = (char *)&ci->i_layout;
42
43 for (s = 0; s < sizeof(ci->i_layout); s++, p++)
44 if (*p)
45 return true;
46 return false;
47}
48
49static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
50 size_t size)
51{
52 int ret;
53 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
54 struct ceph_osd_client *osdc = &fsc->client->osdc;
55 s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
56 const char *pool_name;
57
58 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
59 down_read(&osdc->map_sem);
60 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
61 if (pool_name)
62 ret = snprintf(val, size,
63 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s",
64 (unsigned long long)ceph_file_layout_su(ci->i_layout),
65 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
66 (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
67 pool_name);
68 else
69 ret = snprintf(val, size,
70 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
71 (unsigned long long)ceph_file_layout_su(ci->i_layout),
72 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
73 (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
74 (unsigned long long)pool);
75
76 up_read(&osdc->map_sem);
77 return ret;
78}
79
80static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
81 char *val, size_t size)
82{
83 return snprintf(val, size, "%lld",
84 (unsigned long long)ceph_file_layout_su(ci->i_layout));
85}
86
87static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
88 char *val, size_t size)
89{
90 return snprintf(val, size, "%lld",
91 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
92}
93
94static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
95 char *val, size_t size)
96{
97 return snprintf(val, size, "%lld",
98 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
99}
100
101static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
102 char *val, size_t size)
103{
104 int ret;
105 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
106 struct ceph_osd_client *osdc = &fsc->client->osdc;
107 s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
108 const char *pool_name;
109
110 down_read(&osdc->map_sem);
111 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
112 if (pool_name)
113 ret = snprintf(val, size, "%s", pool_name);
114 else
115 ret = snprintf(val, size, "%lld", (unsigned long long)pool);
116 up_read(&osdc->map_sem);
117 return ret;
118}
119
35/* directories */ 120/* directories */
36 121
37static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, 122static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -83,17 +168,43 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
83 (long)ci->i_rctime.tv_nsec); 168 (long)ci->i_rctime.tv_nsec);
84} 169}
85 170
86#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
87 171
88#define XATTR_NAME_CEPH(_type, _name) \ 172#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
89 { \ 173#define CEPH_XATTR_NAME2(_type, _name, _name2) \
90 .name = CEPH_XATTR_NAME(_type, _name), \ 174 XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
91 .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ 175
92 .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ 176#define XATTR_NAME_CEPH(_type, _name) \
93 .readonly = true, \ 177 { \
94 } 178 .name = CEPH_XATTR_NAME(_type, _name), \
179 .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
180 .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
181 .readonly = true, \
182 .hidden = false, \
183 .exists_cb = NULL, \
184 }
185#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
186 { \
187 .name = CEPH_XATTR_NAME2(_type, _name, _field), \
188 .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
189 .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
190 .readonly = false, \
191 .hidden = true, \
192 .exists_cb = ceph_vxattrcb_layout_exists, \
193 }
95 194
96static struct ceph_vxattr ceph_dir_vxattrs[] = { 195static struct ceph_vxattr ceph_dir_vxattrs[] = {
196 {
197 .name = "ceph.dir.layout",
198 .name_size = sizeof("ceph.dir.layout"),
199 .getxattr_cb = ceph_vxattrcb_layout,
200 .readonly = false,
201 .hidden = false,
202 .exists_cb = ceph_vxattrcb_layout_exists,
203 },
204 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
205 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
206 XATTR_LAYOUT_FIELD(dir, layout, object_size),
207 XATTR_LAYOUT_FIELD(dir, layout, pool),
97 XATTR_NAME_CEPH(dir, entries), 208 XATTR_NAME_CEPH(dir, entries),
98 XATTR_NAME_CEPH(dir, files), 209 XATTR_NAME_CEPH(dir, files),
99 XATTR_NAME_CEPH(dir, subdirs), 210 XATTR_NAME_CEPH(dir, subdirs),
@@ -102,35 +213,26 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
102 XATTR_NAME_CEPH(dir, rsubdirs), 213 XATTR_NAME_CEPH(dir, rsubdirs),
103 XATTR_NAME_CEPH(dir, rbytes), 214 XATTR_NAME_CEPH(dir, rbytes),
104 XATTR_NAME_CEPH(dir, rctime), 215 XATTR_NAME_CEPH(dir, rctime),
105 { 0 } /* Required table terminator */ 216 { .name = NULL, 0 } /* Required table terminator */
106}; 217};
107static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ 218static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
108 219
109/* files */ 220/* files */
110 221
111static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
112 size_t size)
113{
114 int ret;
115
116 ret = snprintf(val, size,
117 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
118 (unsigned long long)ceph_file_layout_su(ci->i_layout),
119 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
120 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
121 return ret;
122}
123
124static struct ceph_vxattr ceph_file_vxattrs[] = { 222static struct ceph_vxattr ceph_file_vxattrs[] = {
125 XATTR_NAME_CEPH(file, layout),
126 /* The following extended attribute name is deprecated */
127 { 223 {
128 .name = XATTR_CEPH_PREFIX "layout", 224 .name = "ceph.file.layout",
129 .name_size = sizeof (XATTR_CEPH_PREFIX "layout"), 225 .name_size = sizeof("ceph.file.layout"),
130 .getxattr_cb = ceph_vxattrcb_file_layout, 226 .getxattr_cb = ceph_vxattrcb_layout,
131 .readonly = true, 227 .readonly = false,
228 .hidden = false,
229 .exists_cb = ceph_vxattrcb_layout_exists,
132 }, 230 },
133 { 0 } /* Required table terminator */ 231 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
232 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
233 XATTR_LAYOUT_FIELD(file, layout, object_size),
234 XATTR_LAYOUT_FIELD(file, layout, pool),
235 { .name = NULL, 0 } /* Required table terminator */
134}; 236};
135static size_t ceph_file_vxattrs_name_size; /* total size of all names */ 237static size_t ceph_file_vxattrs_name_size; /* total size of all names */
136 238
@@ -164,7 +266,8 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
164 size_t size = 0; 266 size_t size = 0;
165 267
166 for (vxattr = vxattrs; vxattr->name; vxattr++) 268 for (vxattr = vxattrs; vxattr->name; vxattr++)
167 size += vxattr->name_size; 269 if (!vxattr->hidden)
270 size += vxattr->name_size;
168 271
169 return size; 272 return size;
170} 273}
@@ -572,13 +675,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
572 if (!ceph_is_valid_xattr(name)) 675 if (!ceph_is_valid_xattr(name))
573 return -ENODATA; 676 return -ENODATA;
574 677
575 /* let's see if a virtual xattr was requested */
576 vxattr = ceph_match_vxattr(inode, name);
577
578 spin_lock(&ci->i_ceph_lock); 678 spin_lock(&ci->i_ceph_lock);
579 dout("getxattr %p ver=%lld index_ver=%lld\n", inode, 679 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
580 ci->i_xattrs.version, ci->i_xattrs.index_version); 680 ci->i_xattrs.version, ci->i_xattrs.index_version);
581 681
682 /* let's see if a virtual xattr was requested */
683 vxattr = ceph_match_vxattr(inode, name);
684 if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
685 err = vxattr->getxattr_cb(ci, value, size);
686 goto out;
687 }
688
582 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 689 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
583 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 690 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
584 goto get_xattr; 691 goto get_xattr;
@@ -592,11 +699,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
592 699
593 spin_lock(&ci->i_ceph_lock); 700 spin_lock(&ci->i_ceph_lock);
594 701
595 if (vxattr && vxattr->readonly) {
596 err = vxattr->getxattr_cb(ci, value, size);
597 goto out;
598 }
599
600 err = __build_xattrs(inode); 702 err = __build_xattrs(inode);
601 if (err < 0) 703 if (err < 0)
602 goto out; 704 goto out;
@@ -604,11 +706,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
604get_xattr: 706get_xattr:
605 err = -ENODATA; /* == ENOATTR */ 707 err = -ENODATA; /* == ENOATTR */
606 xattr = __get_xattr(ci, name); 708 xattr = __get_xattr(ci, name);
607 if (!xattr) { 709 if (!xattr)
608 if (vxattr)
609 err = vxattr->getxattr_cb(ci, value, size);
610 goto out; 710 goto out;
611 }
612 711
613 err = -ERANGE; 712 err = -ERANGE;
614 if (size && size < xattr->val_len) 713 if (size && size < xattr->val_len)
@@ -664,23 +763,30 @@ list_xattr:
664 vir_namelen = ceph_vxattrs_name_size(vxattrs); 763 vir_namelen = ceph_vxattrs_name_size(vxattrs);
665 764
666 /* adding 1 byte per each variable due to the null termination */ 765 /* adding 1 byte per each variable due to the null termination */
667 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; 766 namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
668 err = -ERANGE; 767 err = -ERANGE;
669 if (size && namelen > size) 768 if (size && vir_namelen + namelen > size)
670 goto out; 769 goto out;
671 770
672 err = namelen; 771 err = namelen + vir_namelen;
673 if (size == 0) 772 if (size == 0)
674 goto out; 773 goto out;
675 774
676 names = __copy_xattr_names(ci, names); 775 names = __copy_xattr_names(ci, names);
677 776
678 /* virtual xattr names, too */ 777 /* virtual xattr names, too */
679 if (vxattrs) 778 err = namelen;
779 if (vxattrs) {
680 for (i = 0; vxattrs[i].name; i++) { 780 for (i = 0; vxattrs[i].name; i++) {
681 len = sprintf(names, "%s", vxattrs[i].name); 781 if (!vxattrs[i].hidden &&
682 names += len + 1; 782 !(vxattrs[i].exists_cb &&
783 !vxattrs[i].exists_cb(ci))) {
784 len = sprintf(names, "%s", vxattrs[i].name);
785 names += len + 1;
786 err += len + 1;
787 }
683 } 788 }
789 }
684 790
685out: 791out:
686 spin_unlock(&ci->i_ceph_lock); 792 spin_unlock(&ci->i_ceph_lock);
@@ -782,6 +888,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
782 if (vxattr && vxattr->readonly) 888 if (vxattr && vxattr->readonly)
783 return -EOPNOTSUPP; 889 return -EOPNOTSUPP;
784 890
891 /* pass any unhandled ceph.* xattrs through to the MDS */
892 if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
893 goto do_sync_unlocked;
894
785 /* preallocate memory for xattr name, value, index node */ 895 /* preallocate memory for xattr name, value, index node */
786 err = -ENOMEM; 896 err = -ENOMEM;
787 newname = kmemdup(name, name_len + 1, GFP_NOFS); 897 newname = kmemdup(name, name_len + 1, GFP_NOFS);
@@ -838,6 +948,7 @@ retry:
838 948
839do_sync: 949do_sync:
840 spin_unlock(&ci->i_ceph_lock); 950 spin_unlock(&ci->i_ceph_lock);
951do_sync_unlocked:
841 err = ceph_sync_setxattr(dentry, name, value, size, flags); 952 err = ceph_sync_setxattr(dentry, name, value, size, flags);
842out: 953out:
843 kfree(newname); 954 kfree(newname);
@@ -892,6 +1003,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
892 if (vxattr && vxattr->readonly) 1003 if (vxattr && vxattr->readonly)
893 return -EOPNOTSUPP; 1004 return -EOPNOTSUPP;
894 1005
1006 /* pass any unhandled ceph.* xattrs through to the MDS */
1007 if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
1008 goto do_sync_unlocked;
1009
895 err = -ENOMEM; 1010 err = -ENOMEM;
896 spin_lock(&ci->i_ceph_lock); 1011 spin_lock(&ci->i_ceph_lock);
897retry: 1012retry:
@@ -931,6 +1046,7 @@ retry:
931 return err; 1046 return err;
932do_sync: 1047do_sync:
933 spin_unlock(&ci->i_ceph_lock); 1048 spin_unlock(&ci->i_ceph_lock);
1049do_sync_unlocked:
934 err = ceph_send_removexattr(dentry, name); 1050 err = ceph_send_removexattr(dentry, name);
935out: 1051out:
936 return err; 1052 return err;
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index dad579b0c0e6..76554cecaab2 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -12,16 +12,46 @@
12#define CEPH_FEATURE_MONNAMES (1<<5) 12#define CEPH_FEATURE_MONNAMES (1<<5)
13#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) 13#define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
14#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) 14#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
15/* bits 8-17 defined by user-space; not supported yet here */ 15#define CEPH_FEATURE_OBJECTLOCATOR (1<<8)
16#define CEPH_FEATURE_PGID64 (1<<9)
17#define CEPH_FEATURE_INCSUBOSDMAP (1<<10)
18#define CEPH_FEATURE_PGPOOL3 (1<<11)
19#define CEPH_FEATURE_OSDREPLYMUX (1<<12)
20#define CEPH_FEATURE_OSDENC (1<<13)
21#define CEPH_FEATURE_OMAP (1<<14)
22#define CEPH_FEATURE_MONENC (1<<15)
23#define CEPH_FEATURE_QUERY_T (1<<16)
24#define CEPH_FEATURE_INDEP_PG_MAP (1<<17)
16#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) 25#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
26#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19)
27#define CEPH_FEATURE_MON_NULLROUTE (1<<20)
28#define CEPH_FEATURE_MON_GV (1<<21)
29#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22)
30#define CEPH_FEATURE_MSG_AUTH (1<<23)
31#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24)
32#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25)
33#define CEPH_FEATURE_CREATEPOOLID (1<<26)
34#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
35#define CEPH_FEATURE_OSD_HBMSGS (1<<28)
36#define CEPH_FEATURE_MDSENC (1<<29)
37#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30)
17 38
18/* 39/*
19 * Features supported. 40 * Features supported.
20 */ 41 */
21#define CEPH_FEATURES_SUPPORTED_DEFAULT \ 42#define CEPH_FEATURES_SUPPORTED_DEFAULT \
22 (CEPH_FEATURE_NOSRCADDR | \ 43 (CEPH_FEATURE_NOSRCADDR | \
23 CEPH_FEATURE_CRUSH_TUNABLES) 44 CEPH_FEATURE_PGID64 | \
45 CEPH_FEATURE_PGPOOL3 | \
46 CEPH_FEATURE_OSDENC | \
47 CEPH_FEATURE_CRUSH_TUNABLES | \
48 CEPH_FEATURE_CRUSH_TUNABLES2 | \
49 CEPH_FEATURE_REPLY_CREATE_INODE | \
50 CEPH_FEATURE_OSDHASHPSPOOL)
24 51
25#define CEPH_FEATURES_REQUIRED_DEFAULT \ 52#define CEPH_FEATURES_REQUIRED_DEFAULT \
26 (CEPH_FEATURE_NOSRCADDR) 53 (CEPH_FEATURE_NOSRCADDR | \
54 CEPH_FEATURE_PGID64 | \
55 CEPH_FEATURE_PGPOOL3 | \
56 CEPH_FEATURE_OSDENC)
27#endif 57#endif
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index cf6f4d998a76..2ad7b860f062 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -21,16 +21,14 @@
21 * internal cluster protocols separately from the public, 21 * internal cluster protocols separately from the public,
22 * client-facing protocol. 22 * client-facing protocol.
23 */ 23 */
24#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
25#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
26#define CEPH_MON_PROTOCOL 5 /* cluster internal */
27#define CEPH_OSDC_PROTOCOL 24 /* server/client */ 24#define CEPH_OSDC_PROTOCOL 24 /* server/client */
28#define CEPH_MDSC_PROTOCOL 32 /* server/client */ 25#define CEPH_MDSC_PROTOCOL 32 /* server/client */
29#define CEPH_MONC_PROTOCOL 15 /* server/client */ 26#define CEPH_MONC_PROTOCOL 15 /* server/client */
30 27
31 28
32#define CEPH_INO_ROOT 1 29#define CEPH_INO_ROOT 1
33#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ 30#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
31#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */
34 32
35/* arbitrary limit on max # of monitors (cluster of 3 is typical) */ 33/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
36#define CEPH_MAX_MON 31 34#define CEPH_MAX_MON 31
@@ -51,7 +49,7 @@ struct ceph_file_layout {
51 __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */ 49 __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */
52 50
53 /* object -> pg layout */ 51 /* object -> pg layout */
54 __le32 fl_unused; /* unused; used to be preferred primary (-1) */ 52 __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */
55 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ 53 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
56} __attribute__ ((packed)); 54} __attribute__ ((packed));
57 55
@@ -101,6 +99,8 @@ struct ceph_dir_layout {
101#define CEPH_MSG_MON_SUBSCRIBE_ACK 16 99#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
102#define CEPH_MSG_AUTH 17 100#define CEPH_MSG_AUTH 17
103#define CEPH_MSG_AUTH_REPLY 18 101#define CEPH_MSG_AUTH_REPLY 18
102#define CEPH_MSG_MON_GET_VERSION 19
103#define CEPH_MSG_MON_GET_VERSION_REPLY 20
104 104
105/* client <-> mds */ 105/* client <-> mds */
106#define CEPH_MSG_MDS_MAP 21 106#define CEPH_MSG_MDS_MAP 21
@@ -221,6 +221,11 @@ struct ceph_mon_subscribe_ack {
221} __attribute__ ((packed)); 221} __attribute__ ((packed));
222 222
223/* 223/*
224 * mdsmap flags
225 */
226#define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */
227
228/*
224 * mds states 229 * mds states
225 * > 0 -> in 230 * > 0 -> in
226 * <= 0 -> out 231 * <= 0 -> out
@@ -233,6 +238,7 @@ struct ceph_mon_subscribe_ack {
233#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ 238#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
234#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ 239#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
235#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ 240#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
241#define CEPH_MDS_STATE_REPLAYONCE -9 /* up, replaying an active node's journal */
236 242
237#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ 243#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
238#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed 244#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
@@ -264,6 +270,7 @@ extern const char *ceph_mds_state_name(int s);
264#define CEPH_LOCK_IXATTR 2048 270#define CEPH_LOCK_IXATTR 2048
265#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ 271#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
266#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ 272#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
273#define CEPH_LOCK_IPOLICY 16384 /* policy lock on dirs. MDS internal */
267 274
268/* client_session ops */ 275/* client_session ops */
269enum { 276enum {
@@ -338,6 +345,12 @@ extern const char *ceph_mds_op_name(int op);
338#define CEPH_SETATTR_SIZE 32 345#define CEPH_SETATTR_SIZE 32
339#define CEPH_SETATTR_CTIME 64 346#define CEPH_SETATTR_CTIME 64
340 347
348/*
349 * Ceph setxattr request flags.
350 */
351#define CEPH_XATTR_CREATE 1
352#define CEPH_XATTR_REPLACE 2
353
341union ceph_mds_request_args { 354union ceph_mds_request_args {
342 struct { 355 struct {
343 __le32 mask; /* CEPH_CAP_* */ 356 __le32 mask; /* CEPH_CAP_* */
@@ -522,14 +535,17 @@ int ceph_flags_to_mode(int flags);
522#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ 535#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
523#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ 536#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
524 537
538#define CEPH_CAP_SIMPLE_BITS 2
539#define CEPH_CAP_FILE_BITS 8
540
525/* per-lock shift */ 541/* per-lock shift */
526#define CEPH_CAP_SAUTH 2 542#define CEPH_CAP_SAUTH 2
527#define CEPH_CAP_SLINK 4 543#define CEPH_CAP_SLINK 4
528#define CEPH_CAP_SXATTR 6 544#define CEPH_CAP_SXATTR 6
529#define CEPH_CAP_SFILE 8 545#define CEPH_CAP_SFILE 8
530#define CEPH_CAP_SFLOCK 20 546#define CEPH_CAP_SFLOCK 20
531 547
532#define CEPH_CAP_BITS 22 548#define CEPH_CAP_BITS 22
533 549
534/* composed values */ 550/* composed values */
535#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) 551#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 63d092822bad..360d9d08ca9e 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
52 return end >= *p && n <= end - *p; 52 return end >= *p && n <= end - *p;
53} 53}
54 54
55#define ceph_decode_need(p, end, n, bad) \ 55#define ceph_decode_need(p, end, n, bad) \
56 do { \ 56 do { \
57 if (!likely(ceph_has_room(p, end, n))) \ 57 if (!likely(ceph_has_room(p, end, n))) \
58 goto bad; \ 58 goto bad; \
59 } while (0) 59 } while (0)
60 60
61#define ceph_decode_64_safe(p, end, v, bad) \ 61#define ceph_decode_64_safe(p, end, v, bad) \
@@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
99 * 99 *
100 * There are two possible failures: 100 * There are two possible failures:
101 * - converting the string would require accessing memory at or 101 * - converting the string would require accessing memory at or
102 * beyond the "end" pointer provided (-E 102 * beyond the "end" pointer provided (-ERANGE)
103 * - memory could not be allocated for the result 103 * - memory could not be allocated for the result (-ENOMEM)
104 */ 104 */
105static inline char *ceph_extract_encoded_string(void **p, void *end, 105static inline char *ceph_extract_encoded_string(void **p, void *end,
106 size_t *lenp, gfp_t gfp) 106 size_t *lenp, gfp_t gfp)
@@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p, void *end,
217 *p += len; 217 *p += len;
218} 218}
219 219
220#define ceph_encode_need(p, end, n, bad) \ 220#define ceph_encode_need(p, end, n, bad) \
221 do { \ 221 do { \
222 if (!likely(ceph_has_room(p, end, n))) \ 222 if (!likely(ceph_has_room(p, end, n))) \
223 goto bad; \ 223 goto bad; \
224 } while (0) 224 } while (0)
225 225
226#define ceph_encode_64_safe(p, end, v, bad) \ 226#define ceph_encode_64_safe(p, end, v, bad) \
@@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p, void *end,
231#define ceph_encode_32_safe(p, end, v, bad) \ 231#define ceph_encode_32_safe(p, end, v, bad) \
232 do { \ 232 do { \
233 ceph_encode_need(p, end, sizeof(u32), bad); \ 233 ceph_encode_need(p, end, sizeof(u32), bad); \
234 ceph_encode_32(p, v); \ 234 ceph_encode_32(p, v); \
235 } while (0) 235 } while (0)
236#define ceph_encode_16_safe(p, end, v, bad) \ 236#define ceph_encode_16_safe(p, end, v, bad) \
237 do { \ 237 do { \
238 ceph_encode_need(p, end, sizeof(u16), bad); \ 238 ceph_encode_need(p, end, sizeof(u16), bad); \
239 ceph_encode_16(p, v); \ 239 ceph_encode_16(p, v); \
240 } while (0)
241#define ceph_encode_8_safe(p, end, v, bad) \
242 do { \
243 ceph_encode_need(p, end, sizeof(u8), bad); \
244 ceph_encode_8(p, v); \
240 } while (0) 245 } while (0)
241 246
242#define ceph_encode_copy_safe(p, end, pv, n, bad) \ 247#define ceph_encode_copy_safe(p, end, pv, n, bad) \
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 084d3c622b12..29818fc3fa49 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -193,6 +193,8 @@ static inline int calc_pages_for(u64 off, u64 len)
193} 193}
194 194
195/* ceph_common.c */ 195/* ceph_common.c */
196extern bool libceph_compatible(void *data);
197
196extern const char *ceph_msg_type_name(int type); 198extern const char *ceph_msg_type_name(int type);
197extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); 199extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
198extern struct kmem_cache *ceph_inode_cachep; 200extern struct kmem_cache *ceph_inode_cachep;
@@ -220,7 +222,7 @@ extern int ceph_open_session(struct ceph_client *client);
220/* pagevec.c */ 222/* pagevec.c */
221extern void ceph_release_page_vector(struct page **pages, int num_pages); 223extern void ceph_release_page_vector(struct page **pages, int num_pages);
222 224
223extern struct page **ceph_get_direct_page_vector(const char __user *data, 225extern struct page **ceph_get_direct_page_vector(const void __user *data,
224 int num_pages, 226 int num_pages,
225 bool write_page); 227 bool write_page);
226extern void ceph_put_page_vector(struct page **pages, int num_pages, 228extern void ceph_put_page_vector(struct page **pages, int num_pages,
@@ -228,15 +230,15 @@ extern void ceph_put_page_vector(struct page **pages, int num_pages,
228extern void ceph_release_page_vector(struct page **pages, int num_pages); 230extern void ceph_release_page_vector(struct page **pages, int num_pages);
229extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); 231extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
230extern int ceph_copy_user_to_page_vector(struct page **pages, 232extern int ceph_copy_user_to_page_vector(struct page **pages,
231 const char __user *data, 233 const void __user *data,
232 loff_t off, size_t len); 234 loff_t off, size_t len);
233extern int ceph_copy_to_page_vector(struct page **pages, 235extern void ceph_copy_to_page_vector(struct page **pages,
234 const char *data, 236 const void *data,
235 loff_t off, size_t len); 237 loff_t off, size_t len);
236extern int ceph_copy_from_page_vector(struct page **pages, 238extern void ceph_copy_from_page_vector(struct page **pages,
237 char *data, 239 void *data,
238 loff_t off, size_t len); 240 loff_t off, size_t len);
239extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data, 241extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data,
240 loff_t off, size_t len); 242 loff_t off, size_t len);
241extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); 243extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
242 244
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index cb15b5d867c7..87ed09f54800 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -29,8 +29,8 @@ struct ceph_mdsmap {
29 29
30 /* which object pools file data can be stored in */ 30 /* which object pools file data can be stored in */
31 int m_num_data_pg_pools; 31 int m_num_data_pg_pools;
32 u32 *m_data_pg_pools; 32 u64 *m_data_pg_pools;
33 u32 m_cas_pg_pool; 33 u64 m_cas_pg_pool;
34}; 34};
35 35
36static inline struct ceph_entity_addr * 36static inline struct ceph_entity_addr *
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 14ba5ee738a9..60903e0f665c 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -83,9 +83,11 @@ struct ceph_msg {
83 struct list_head list_head; 83 struct list_head list_head;
84 84
85 struct kref kref; 85 struct kref kref;
86#ifdef CONFIG_BLOCK
86 struct bio *bio; /* instead of pages/pagelist */ 87 struct bio *bio; /* instead of pages/pagelist */
87 struct bio *bio_iter; /* bio iterator */ 88 struct bio *bio_iter; /* bio iterator */
88 int bio_seg; /* current bio segment */ 89 int bio_seg; /* current bio segment */
90#endif /* CONFIG_BLOCK */
89 struct ceph_pagelist *trail; /* the trailing part of the data */ 91 struct ceph_pagelist *trail; /* the trailing part of the data */
90 bool front_is_vmalloc; 92 bool front_is_vmalloc;
91 bool more_to_follow; 93 bool more_to_follow;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index d9b880e977e6..1dd5d466b6f9 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -10,6 +10,7 @@
10#include <linux/ceph/osdmap.h> 10#include <linux/ceph/osdmap.h>
11#include <linux/ceph/messenger.h> 11#include <linux/ceph/messenger.h>
12#include <linux/ceph/auth.h> 12#include <linux/ceph/auth.h>
13#include <linux/ceph/pagelist.h>
13 14
14/* 15/*
15 * Maximum object name size 16 * Maximum object name size
@@ -22,7 +23,6 @@ struct ceph_snap_context;
22struct ceph_osd_request; 23struct ceph_osd_request;
23struct ceph_osd_client; 24struct ceph_osd_client;
24struct ceph_authorizer; 25struct ceph_authorizer;
25struct ceph_pagelist;
26 26
27/* 27/*
28 * completion callback for async writepages 28 * completion callback for async writepages
@@ -47,6 +47,9 @@ struct ceph_osd {
47 struct list_head o_keepalive_item; 47 struct list_head o_keepalive_item;
48}; 48};
49 49
50
51#define CEPH_OSD_MAX_OP 10
52
50/* an in-flight request */ 53/* an in-flight request */
51struct ceph_osd_request { 54struct ceph_osd_request {
52 u64 r_tid; /* unique for this client */ 55 u64 r_tid; /* unique for this client */
@@ -63,9 +66,23 @@ struct ceph_osd_request {
63 struct ceph_connection *r_con_filling_msg; 66 struct ceph_connection *r_con_filling_msg;
64 67
65 struct ceph_msg *r_request, *r_reply; 68 struct ceph_msg *r_request, *r_reply;
66 int r_result;
67 int r_flags; /* any additional flags for the osd */ 69 int r_flags; /* any additional flags for the osd */
68 u32 r_sent; /* >0 if r_request is sending/sent */ 70 u32 r_sent; /* >0 if r_request is sending/sent */
71 int r_num_ops;
72
73 /* encoded message content */
74 struct ceph_osd_op *r_request_ops;
75 /* these are updated on each send */
76 __le32 *r_request_osdmap_epoch;
77 __le32 *r_request_flags;
78 __le64 *r_request_pool;
79 void *r_request_pgid;
80 __le32 *r_request_attempts;
81 struct ceph_eversion *r_request_reassert_version;
82
83 int r_result;
84 int r_reply_op_len[CEPH_OSD_MAX_OP];
85 s32 r_reply_op_result[CEPH_OSD_MAX_OP];
69 int r_got_reply; 86 int r_got_reply;
70 int r_linger; 87 int r_linger;
71 88
@@ -82,6 +99,7 @@ struct ceph_osd_request {
82 99
83 char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ 100 char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */
84 int r_oid_len; 101 int r_oid_len;
102 u64 r_snapid;
85 unsigned long r_stamp; /* send OR check time */ 103 unsigned long r_stamp; /* send OR check time */
86 104
87 struct ceph_file_layout r_file_layout; 105 struct ceph_file_layout r_file_layout;
@@ -95,7 +113,7 @@ struct ceph_osd_request {
95 struct bio *r_bio; /* instead of pages */ 113 struct bio *r_bio; /* instead of pages */
96#endif 114#endif
97 115
98 struct ceph_pagelist *r_trail; /* trailing part of the data */ 116 struct ceph_pagelist r_trail; /* trailing part of the data */
99}; 117};
100 118
101struct ceph_osd_event { 119struct ceph_osd_event {
@@ -107,7 +125,6 @@ struct ceph_osd_event {
107 struct rb_node node; 125 struct rb_node node;
108 struct list_head osd_node; 126 struct list_head osd_node;
109 struct kref kref; 127 struct kref kref;
110 struct completion completion;
111}; 128};
112 129
113struct ceph_osd_event_work { 130struct ceph_osd_event_work {
@@ -157,7 +174,7 @@ struct ceph_osd_client {
157 174
158struct ceph_osd_req_op { 175struct ceph_osd_req_op {
159 u16 op; /* CEPH_OSD_OP_* */ 176 u16 op; /* CEPH_OSD_OP_* */
160 u32 flags; /* CEPH_OSD_FLAG_* */ 177 u32 payload_len;
161 union { 178 union {
162 struct { 179 struct {
163 u64 offset, length; 180 u64 offset, length;
@@ -166,23 +183,24 @@ struct ceph_osd_req_op {
166 } extent; 183 } extent;
167 struct { 184 struct {
168 const char *name; 185 const char *name;
169 u32 name_len;
170 const char *val; 186 const char *val;
187 u32 name_len;
171 u32 value_len; 188 u32 value_len;
172 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ 189 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
173 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ 190 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
174 } xattr; 191 } xattr;
175 struct { 192 struct {
176 const char *class_name; 193 const char *class_name;
177 __u8 class_len;
178 const char *method_name; 194 const char *method_name;
179 __u8 method_len;
180 __u8 argc;
181 const char *indata; 195 const char *indata;
182 u32 indata_len; 196 u32 indata_len;
197 __u8 class_len;
198 __u8 method_len;
199 __u8 argc;
183 } cls; 200 } cls;
184 struct { 201 struct {
185 u64 cookie, count; 202 u64 cookie;
203 u64 count;
186 } pgls; 204 } pgls;
187 struct { 205 struct {
188 u64 snapid; 206 u64 snapid;
@@ -190,12 +208,11 @@ struct ceph_osd_req_op {
190 struct { 208 struct {
191 u64 cookie; 209 u64 cookie;
192 u64 ver; 210 u64 ver;
193 __u8 flag;
194 u32 prot_ver; 211 u32 prot_ver;
195 u32 timeout; 212 u32 timeout;
213 __u8 flag;
196 } watch; 214 } watch;
197 }; 215 };
198 u32 payload_len;
199}; 216};
200 217
201extern int ceph_osdc_init(struct ceph_osd_client *osdc, 218extern int ceph_osdc_init(struct ceph_osd_client *osdc,
@@ -207,29 +224,19 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
207extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 224extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
208 struct ceph_msg *msg); 225 struct ceph_msg *msg);
209 226
210extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
211 struct ceph_file_layout *layout,
212 u64 snapid,
213 u64 off, u64 *plen, u64 *bno,
214 struct ceph_osd_request *req,
215 struct ceph_osd_req_op *op);
216
217extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 227extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
218 int flags,
219 struct ceph_snap_context *snapc, 228 struct ceph_snap_context *snapc,
220 struct ceph_osd_req_op *ops, 229 unsigned int num_op,
221 bool use_mempool, 230 bool use_mempool,
222 gfp_t gfp_flags, 231 gfp_t gfp_flags);
223 struct page **pages,
224 struct bio *bio);
225 232
226extern void ceph_osdc_build_request(struct ceph_osd_request *req, 233extern void ceph_osdc_build_request(struct ceph_osd_request *req,
227 u64 off, u64 *plen, 234 u64 off, u64 len,
235 unsigned int num_op,
228 struct ceph_osd_req_op *src_ops, 236 struct ceph_osd_req_op *src_ops,
229 struct ceph_snap_context *snapc, 237 struct ceph_snap_context *snapc,
230 struct timespec *mtime, 238 u64 snap_id,
231 const char *oid, 239 struct timespec *mtime);
232 int oid_len);
233 240
234extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 241extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
235 struct ceph_file_layout *layout, 242 struct ceph_file_layout *layout,
@@ -239,8 +246,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
239 int do_sync, u32 truncate_seq, 246 int do_sync, u32 truncate_seq,
240 u64 truncate_size, 247 u64 truncate_size,
241 struct timespec *mtime, 248 struct timespec *mtime,
242 bool use_mempool, int num_reply, 249 bool use_mempool, int page_align);
243 int page_align);
244 250
245extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 251extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
246 struct ceph_osd_request *req); 252 struct ceph_osd_request *req);
@@ -279,17 +285,13 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
279 u64 off, u64 len, 285 u64 off, u64 len,
280 u32 truncate_seq, u64 truncate_size, 286 u32 truncate_seq, u64 truncate_size,
281 struct timespec *mtime, 287 struct timespec *mtime,
282 struct page **pages, int nr_pages, 288 struct page **pages, int nr_pages);
283 int flags, int do_sync, bool nofail);
284 289
285/* watch/notify events */ 290/* watch/notify events */
286extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, 291extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
287 void (*event_cb)(u64, u64, u8, void *), 292 void (*event_cb)(u64, u64, u8, void *),
288 int one_shot, void *data, 293 void *data, struct ceph_osd_event **pevent);
289 struct ceph_osd_event **pevent);
290extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); 294extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
291extern int ceph_osdc_wait_event(struct ceph_osd_event *event,
292 unsigned long timeout);
293extern void ceph_osdc_put_event(struct ceph_osd_event *event); 295extern void ceph_osdc_put_event(struct ceph_osd_event *event);
294#endif 296#endif
295 297
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 10a417f9f76f..c819190d1642 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -18,14 +18,31 @@
18 * The map can be updated either via an incremental map (diff) describing 18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map. 19 * the change between two successive epochs, or as a fully encoded map.
20 */ 20 */
21struct ceph_pg {
22 uint64_t pool;
23 uint32_t seed;
24};
25
26#define CEPH_POOL_FLAG_HASHPSPOOL 1
27
21struct ceph_pg_pool_info { 28struct ceph_pg_pool_info {
22 struct rb_node node; 29 struct rb_node node;
23 int id; 30 s64 id;
24 struct ceph_pg_pool v; 31 u8 type;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; 32 u8 size;
33 u8 crush_ruleset;
34 u8 object_hash;
35 u32 pg_num, pgp_num;
36 int pg_num_mask, pgp_num_mask;
37 u64 flags;
26 char *name; 38 char *name;
27}; 39};
28 40
41struct ceph_object_locator {
42 uint64_t pool;
43 char *key;
44};
45
29struct ceph_pg_mapping { 46struct ceph_pg_mapping {
30 struct rb_node node; 47 struct rb_node node;
31 struct ceph_pg pgid; 48 struct ceph_pg pgid;
@@ -110,15 +127,16 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
110 127
111/* calculate mapping of a file extent to an object */ 128/* calculate mapping of a file extent to an object */
112extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 129extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
113 u64 off, u64 *plen, 130 u64 off, u64 len,
114 u64 *bno, u64 *oxoff, u64 *oxlen); 131 u64 *bno, u64 *oxoff, u64 *oxlen);
115 132
116/* calculate mapping of object to a placement group */ 133/* calculate mapping of object to a placement group */
117extern int ceph_calc_object_layout(struct ceph_object_layout *ol, 134extern int ceph_calc_object_layout(struct ceph_pg *pg,
118 const char *oid, 135 const char *oid,
119 struct ceph_file_layout *fl, 136 struct ceph_file_layout *fl,
120 struct ceph_osdmap *osdmap); 137 struct ceph_osdmap *osdmap);
121extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 138extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
139 struct ceph_pg pgid,
122 int *acting); 140 int *acting);
123extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 141extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
124 struct ceph_pg pgid); 142 struct ceph_pg pgid);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 2c04afeead1c..68c96a508ac2 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -9,14 +9,6 @@
9#include <linux/ceph/msgr.h> 9#include <linux/ceph/msgr.h>
10 10
11/* 11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 6
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 6
18
19/*
20 * fs id 12 * fs id
21 */ 13 */
22struct ceph_fsid { 14struct ceph_fsid {
@@ -64,7 +56,7 @@ struct ceph_timespec {
64 * placement group. 56 * placement group.
65 * we encode this into one __le64. 57 * we encode this into one __le64.
66 */ 58 */
67struct ceph_pg { 59struct ceph_pg_v1 {
68 __le16 preferred; /* preferred primary osd */ 60 __le16 preferred; /* preferred primary osd */
69 __le16 ps; /* placement seed */ 61 __le16 ps; /* placement seed */
70 __le32 pool; /* object pool */ 62 __le32 pool; /* object pool */
@@ -91,21 +83,6 @@ struct ceph_pg {
91 83
92#define CEPH_PG_TYPE_REP 1 84#define CEPH_PG_TYPE_REP 1
93#define CEPH_PG_TYPE_RAID4 2 85#define CEPH_PG_TYPE_RAID4 2
94#define CEPH_PG_POOL_VERSION 2
95struct ceph_pg_pool {
96 __u8 type; /* CEPH_PG_TYPE_* */
97 __u8 size; /* number of osds in each pg */
98 __u8 crush_ruleset; /* crush placement rule */
99 __u8 object_hash; /* hash mapping object name to ps */
100 __le32 pg_num, pgp_num; /* number of pg's */
101 __le32 lpg_num, lpgp_num; /* number of localized pg's */
102 __le32 last_change; /* most recent epoch changed */
103 __le64 snap_seq; /* seq for per-pool snapshot */
104 __le32 snap_epoch; /* epoch of last snap */
105 __le32 num_snaps;
106 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
107 __le64 auid; /* who owns the pg */
108} __attribute__ ((packed));
109 86
110/* 87/*
111 * stable_mod func is used to control number of placement groups. 88 * stable_mod func is used to control number of placement groups.
@@ -128,7 +105,7 @@ static inline int ceph_stable_mod(int x, int b, int bmask)
128 * object layout - how a given object should be stored. 105 * object layout - how a given object should be stored.
129 */ 106 */
130struct ceph_object_layout { 107struct ceph_object_layout {
131 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ 108 struct ceph_pg_v1 ol_pgid; /* raw pg, with _full_ ps precision. */
132 __le32 ol_stripe_unit; /* for per-object parity, if any */ 109 __le32 ol_stripe_unit; /* for per-object parity, if any */
133} __attribute__ ((packed)); 110} __attribute__ ((packed));
134 111
@@ -145,8 +122,12 @@ struct ceph_eversion {
145 */ 122 */
146 123
147/* status bits */ 124/* status bits */
148#define CEPH_OSD_EXISTS 1 125#define CEPH_OSD_EXISTS (1<<0)
149#define CEPH_OSD_UP 2 126#define CEPH_OSD_UP (1<<1)
127#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
128#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
129
130extern const char *ceph_osd_state_name(int s);
150 131
151/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ 132/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
152#define CEPH_OSD_IN 0x10000 133#define CEPH_OSD_IN 0x10000
@@ -161,9 +142,25 @@ struct ceph_eversion {
161#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ 142#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
162#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ 143#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
163#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ 144#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
145#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
146#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
147#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
148#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
149#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
150#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
151
152/*
153 * The error code to return when an OSD can't handle a write
154 * because it is too large.
155 */
156#define OSD_WRITETOOBIG EMSGSIZE
164 157
165/* 158/*
166 * osd ops 159 * osd ops
160 *
161 * WARNING: do not use these op codes directly. Use the helpers
162 * defined below instead. In certain cases, op code behavior was
163 * redefined, resulting in special-cases in the helpers.
167 */ 164 */
168#define CEPH_OSD_OP_MODE 0xf000 165#define CEPH_OSD_OP_MODE 0xf000
169#define CEPH_OSD_OP_MODE_RD 0x1000 166#define CEPH_OSD_OP_MODE_RD 0x1000
@@ -177,6 +174,7 @@ struct ceph_eversion {
177#define CEPH_OSD_OP_TYPE_ATTR 0x0300 174#define CEPH_OSD_OP_TYPE_ATTR 0x0300
178#define CEPH_OSD_OP_TYPE_EXEC 0x0400 175#define CEPH_OSD_OP_TYPE_EXEC 0x0400
179#define CEPH_OSD_OP_TYPE_PG 0x0500 176#define CEPH_OSD_OP_TYPE_PG 0x0500
177#define CEPH_OSD_OP_TYPE_MULTI 0x0600 /* multiobject */
180 178
181enum { 179enum {
182 /** data **/ 180 /** data **/
@@ -217,6 +215,23 @@ enum {
217 215
218 CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15, 216 CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15,
219 217
218 /* omap */
219 CEPH_OSD_OP_OMAPGETKEYS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 17,
220 CEPH_OSD_OP_OMAPGETVALS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 18,
221 CEPH_OSD_OP_OMAPGETHEADER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 19,
222 CEPH_OSD_OP_OMAPGETVALSBYKEYS =
223 CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 20,
224 CEPH_OSD_OP_OMAPSETVALS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 21,
225 CEPH_OSD_OP_OMAPSETHEADER = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 22,
226 CEPH_OSD_OP_OMAPCLEAR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 23,
227 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
228 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
229
230 /** multi **/
231 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
232 CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
233 CEPH_OSD_OP_SRC_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 3,
234
220 /** attrs **/ 235 /** attrs **/
221 /* read */ 236 /* read */
222 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 237 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -238,6 +253,7 @@ enum {
238 CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6, 253 CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6,
239 CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7, 254 CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7,
240 CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8, 255 CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8,
256 CEPH_OSD_OP_SCRUB_MAP = CEPH_OSD_OP_MODE_SUB | 9,
241 257
242 /** lock **/ 258 /** lock **/
243 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, 259 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
@@ -248,10 +264,12 @@ enum {
248 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, 264 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
249 265
250 /** exec **/ 266 /** exec **/
267 /* note: the RD bit here is wrong; see special-case below in helper */
251 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, 268 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
252 269
253 /** pg **/ 270 /** pg **/
254 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, 271 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
272 CEPH_OSD_OP_PGLS_FILTER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 2,
255}; 273};
256 274
257static inline int ceph_osd_op_type_lock(int op) 275static inline int ceph_osd_op_type_lock(int op)
@@ -274,6 +292,10 @@ static inline int ceph_osd_op_type_pg(int op)
274{ 292{
275 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; 293 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
276} 294}
295static inline int ceph_osd_op_type_multi(int op)
296{
297 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_MULTI;
298}
277 299
278static inline int ceph_osd_op_mode_subop(int op) 300static inline int ceph_osd_op_mode_subop(int op)
279{ 301{
@@ -281,11 +303,12 @@ static inline int ceph_osd_op_mode_subop(int op)
281} 303}
282static inline int ceph_osd_op_mode_read(int op) 304static inline int ceph_osd_op_mode_read(int op)
283{ 305{
284 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; 306 return (op & CEPH_OSD_OP_MODE_RD) &&
307 op != CEPH_OSD_OP_CALL;
285} 308}
286static inline int ceph_osd_op_mode_modify(int op) 309static inline int ceph_osd_op_mode_modify(int op)
287{ 310{
288 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; 311 return op & CEPH_OSD_OP_MODE_WR;
289} 312}
290 313
291/* 314/*
@@ -294,34 +317,38 @@ static inline int ceph_osd_op_mode_modify(int op)
294 */ 317 */
295#define CEPH_OSD_TMAP_HDR 'h' 318#define CEPH_OSD_TMAP_HDR 'h'
296#define CEPH_OSD_TMAP_SET 's' 319#define CEPH_OSD_TMAP_SET 's'
320#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
297#define CEPH_OSD_TMAP_RM 'r' 321#define CEPH_OSD_TMAP_RM 'r'
322#define CEPH_OSD_TMAP_RMSLOPPY 'R'
298 323
299extern const char *ceph_osd_op_name(int op); 324extern const char *ceph_osd_op_name(int op);
300 325
301
302/* 326/*
303 * osd op flags 327 * osd op flags
304 * 328 *
305 * An op may be READ, WRITE, or READ|WRITE. 329 * An op may be READ, WRITE, or READ|WRITE.
306 */ 330 */
307enum { 331enum {
308 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ 332 CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */
309 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ 333 CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */
310 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ 334 CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */
311 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ 335 CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */
312 CEPH_OSD_FLAG_READ = 16, /* op may read */ 336 CEPH_OSD_FLAG_READ = 0x0010, /* op may read */
313 CEPH_OSD_FLAG_WRITE = 32, /* op may write */ 337 CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */
314 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ 338 CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */
315 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ 339 CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */
316 CEPH_OSD_FLAG_BALANCE_READS = 256, 340 CEPH_OSD_FLAG_BALANCE_READS = 0x0100,
317 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ 341 CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */
318 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ 342 CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */
319 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ 343 CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */
320 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ 344 CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
345 CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
346 CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
321}; 347};
322 348
323enum { 349enum {
324 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ 350 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
351 CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */
325}; 352};
326 353
327#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 354#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
@@ -381,48 +408,13 @@ struct ceph_osd_op {
381 __le64 ver; 408 __le64 ver;
382 __u8 flag; /* 0 = unwatch, 1 = watch */ 409 __u8 flag; /* 0 = unwatch, 1 = watch */
383 } __attribute__ ((packed)) watch; 410 } __attribute__ ((packed)) watch;
384}; 411 struct {
412 __le64 offset, length;
413 __le64 src_offset;
414 } __attribute__ ((packed)) clonerange;
415 };
385 __le32 payload_len; 416 __le32 payload_len;
386} __attribute__ ((packed)); 417} __attribute__ ((packed));
387 418
388/*
389 * osd request message header. each request may include multiple
390 * ceph_osd_op object operations.
391 */
392struct ceph_osd_request_head {
393 __le32 client_inc; /* client incarnation */
394 struct ceph_object_layout layout; /* pgid */
395 __le32 osdmap_epoch; /* client's osdmap epoch */
396
397 __le32 flags;
398
399 struct ceph_timespec mtime; /* for mutations only */
400 struct ceph_eversion reassert_version; /* if we are replaying op */
401
402 __le32 object_len; /* length of object name */
403
404 __le64 snapid; /* snapid to read */
405 __le64 snap_seq; /* writer's snap context */
406 __le32 num_snaps;
407
408 __le16 num_ops;
409 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
410} __attribute__ ((packed));
411
412struct ceph_osd_reply_head {
413 __le32 client_inc; /* client incarnation */
414 __le32 flags;
415 struct ceph_object_layout layout;
416 __le32 osdmap_epoch;
417 struct ceph_eversion reassert_version; /* for replaying uncommitted */
418
419 __le32 result; /* result code */
420
421 __le32 object_len; /* length of object name */
422 __le32 num_ops;
423 struct ceph_osd_op ops[0]; /* ops[], object */
424} __attribute__ ((packed));
425
426
427 419
428#endif 420#endif
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 25baa287cff7..6a1101f24cfb 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -162,6 +162,8 @@ struct crush_map {
162 __u32 choose_local_fallback_tries; 162 __u32 choose_local_fallback_tries;
163 /* choose attempts before giving up */ 163 /* choose attempts before giving up */
164 __u32 choose_total_tries; 164 __u32 choose_total_tries;
165 /* attempt chooseleaf inner descent once; on failure retry outer descent */
166 __u32 chooseleaf_descend_once;
165}; 167};
166 168
167 169
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 1deb29af82fd..e65e6e4be38b 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -28,6 +28,22 @@
28#include "crypto.h" 28#include "crypto.h"
29 29
30 30
31/*
32 * Module compatibility interface. For now it doesn't do anything,
33 * but its existence signals a certain level of functionality.
34 *
35 * The data buffer is used to pass information both to and from
36 * libceph. The return value indicates whether libceph determines
37 * it is compatible with the caller (from another kernel module),
38 * given the provided data.
39 *
40 * The data pointer can be null.
41 */
42bool libceph_compatible(void *data)
43{
44 return true;
45}
46EXPORT_SYMBOL(libceph_compatible);
31 47
32/* 48/*
33 * find filename portion of a path (/foo/bar/baz -> baz) 49 * find filename portion of a path (/foo/bar/baz -> baz)
@@ -590,10 +606,8 @@ static int __init init_ceph_lib(void)
590 if (ret < 0) 606 if (ret < 0)
591 goto out_crypto; 607 goto out_crypto;
592 608
593 pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", 609 pr_info("loaded (mon/osd proto %d/%d)\n",
594 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, 610 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
595 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
596 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
597 611
598 return 0; 612 return 0;
599 613
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 3fbda04de29c..1348df96fe15 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -21,9 +21,15 @@ const char *ceph_osd_op_name(int op)
21 switch (op) { 21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read"; 22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat"; 23 case CEPH_OSD_OP_STAT: return "stat";
24 case CEPH_OSD_OP_MAPEXT: return "mapext";
25 case CEPH_OSD_OP_SPARSE_READ: return "sparse-read";
26 case CEPH_OSD_OP_NOTIFY: return "notify";
27 case CEPH_OSD_OP_NOTIFY_ACK: return "notify-ack";
28 case CEPH_OSD_OP_ASSERT_VER: return "assert-version";
24 29
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; 30 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26 31
32 case CEPH_OSD_OP_CREATE: return "create";
27 case CEPH_OSD_OP_WRITE: return "write"; 33 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete"; 34 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate"; 35 case CEPH_OSD_OP_TRUNCATE: return "truncate";
@@ -39,6 +45,11 @@ const char *ceph_osd_op_name(int op)
39 case CEPH_OSD_OP_TMAPUP: return "tmapup"; 45 case CEPH_OSD_OP_TMAPUP: return "tmapup";
40 case CEPH_OSD_OP_TMAPGET: return "tmapget"; 46 case CEPH_OSD_OP_TMAPGET: return "tmapget";
41 case CEPH_OSD_OP_TMAPPUT: return "tmapput"; 47 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
48 case CEPH_OSD_OP_WATCH: return "watch";
49
50 case CEPH_OSD_OP_CLONERANGE: return "clonerange";
51 case CEPH_OSD_OP_ASSERT_SRC_VERSION: return "assert-src-version";
52 case CEPH_OSD_OP_SRC_CMPXATTR: return "src-cmpxattr";
42 53
43 case CEPH_OSD_OP_GETXATTR: return "getxattr"; 54 case CEPH_OSD_OP_GETXATTR: return "getxattr";
44 case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; 55 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
@@ -53,6 +64,10 @@ const char *ceph_osd_op_name(int op)
53 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; 64 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
54 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; 65 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
55 case CEPH_OSD_OP_SCRUB: return "scrub"; 66 case CEPH_OSD_OP_SCRUB: return "scrub";
67 case CEPH_OSD_OP_SCRUB_RESERVE: return "scrub-reserve";
68 case CEPH_OSD_OP_SCRUB_UNRESERVE: return "scrub-unreserve";
69 case CEPH_OSD_OP_SCRUB_STOP: return "scrub-stop";
70 case CEPH_OSD_OP_SCRUB_MAP: return "scrub-map";
56 71
57 case CEPH_OSD_OP_WRLOCK: return "wrlock"; 72 case CEPH_OSD_OP_WRLOCK: return "wrlock";
58 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; 73 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
@@ -64,10 +79,34 @@ const char *ceph_osd_op_name(int op)
64 case CEPH_OSD_OP_CALL: return "call"; 79 case CEPH_OSD_OP_CALL: return "call";
65 80
66 case CEPH_OSD_OP_PGLS: return "pgls"; 81 case CEPH_OSD_OP_PGLS: return "pgls";
82 case CEPH_OSD_OP_PGLS_FILTER: return "pgls-filter";
83 case CEPH_OSD_OP_OMAPGETKEYS: return "omap-get-keys";
84 case CEPH_OSD_OP_OMAPGETVALS: return "omap-get-vals";
85 case CEPH_OSD_OP_OMAPGETHEADER: return "omap-get-header";
86 case CEPH_OSD_OP_OMAPGETVALSBYKEYS: return "omap-get-vals-by-keys";
87 case CEPH_OSD_OP_OMAPSETVALS: return "omap-set-vals";
88 case CEPH_OSD_OP_OMAPSETHEADER: return "omap-set-header";
89 case CEPH_OSD_OP_OMAPCLEAR: return "omap-clear";
90 case CEPH_OSD_OP_OMAPRMKEYS: return "omap-rm-keys";
67 } 91 }
68 return "???"; 92 return "???";
69} 93}
70 94
95const char *ceph_osd_state_name(int s)
96{
97 switch (s) {
98 case CEPH_OSD_EXISTS:
99 return "exists";
100 case CEPH_OSD_UP:
101 return "up";
102 case CEPH_OSD_AUTOOUT:
103 return "autoout";
104 case CEPH_OSD_NEW:
105 return "new";
106 default:
107 return "???";
108 }
109}
71 110
72const char *ceph_pool_op_name(int op) 111const char *ceph_pool_op_name(int op)
73{ 112{
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 35fce755ce10..cbd06a91941c 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -287,6 +287,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
287 * @outpos: our position in that vector 287 * @outpos: our position in that vector
288 * @firstn: true if choosing "first n" items, false if choosing "indep" 288 * @firstn: true if choosing "first n" items, false if choosing "indep"
289 * @recurse_to_leaf: true if we want one device under each item of given type 289 * @recurse_to_leaf: true if we want one device under each item of given type
290 * @descend_once: true if we should only try one descent before giving up
290 * @out2: second output vector for leaf items (if @recurse_to_leaf) 291 * @out2: second output vector for leaf items (if @recurse_to_leaf)
291 */ 292 */
292static int crush_choose(const struct crush_map *map, 293static int crush_choose(const struct crush_map *map,
@@ -295,7 +296,7 @@ static int crush_choose(const struct crush_map *map,
295 int x, int numrep, int type, 296 int x, int numrep, int type,
296 int *out, int outpos, 297 int *out, int outpos,
297 int firstn, int recurse_to_leaf, 298 int firstn, int recurse_to_leaf,
298 int *out2) 299 int descend_once, int *out2)
299{ 300{
300 int rep; 301 int rep;
301 unsigned int ftotal, flocal; 302 unsigned int ftotal, flocal;
@@ -391,7 +392,7 @@ static int crush_choose(const struct crush_map *map,
391 } 392 }
392 393
393 reject = 0; 394 reject = 0;
394 if (recurse_to_leaf) { 395 if (!collide && recurse_to_leaf) {
395 if (item < 0) { 396 if (item < 0) {
396 if (crush_choose(map, 397 if (crush_choose(map,
397 map->buckets[-1-item], 398 map->buckets[-1-item],
@@ -399,6 +400,7 @@ static int crush_choose(const struct crush_map *map,
399 x, outpos+1, 0, 400 x, outpos+1, 0,
400 out2, outpos, 401 out2, outpos,
401 firstn, 0, 402 firstn, 0,
403 map->chooseleaf_descend_once,
402 NULL) <= outpos) 404 NULL) <= outpos)
403 /* didn't get leaf */ 405 /* didn't get leaf */
404 reject = 1; 406 reject = 1;
@@ -422,7 +424,10 @@ reject:
422 ftotal++; 424 ftotal++;
423 flocal++; 425 flocal++;
424 426
425 if (collide && flocal <= map->choose_local_tries) 427 if (reject && descend_once)
428 /* let outer call try again */
429 skip_rep = 1;
430 else if (collide && flocal <= map->choose_local_tries)
426 /* retry locally a few times */ 431 /* retry locally a few times */
427 retry_bucket = 1; 432 retry_bucket = 1;
428 else if (map->choose_local_fallback_tries > 0 && 433 else if (map->choose_local_fallback_tries > 0 &&
@@ -485,6 +490,7 @@ int crush_do_rule(const struct crush_map *map,
485 int i, j; 490 int i, j;
486 int numrep; 491 int numrep;
487 int firstn; 492 int firstn;
493 const int descend_once = 0;
488 494
489 if ((__u32)ruleno >= map->max_rules) { 495 if ((__u32)ruleno >= map->max_rules) {
490 dprintk(" bad ruleno %d\n", ruleno); 496 dprintk(" bad ruleno %d\n", ruleno);
@@ -544,7 +550,8 @@ int crush_do_rule(const struct crush_map *map,
544 curstep->arg2, 550 curstep->arg2,
545 o+osize, j, 551 o+osize, j,
546 firstn, 552 firstn,
547 recurse_to_leaf, c+osize); 553 recurse_to_leaf,
554 descend_once, c+osize);
548 } 555 }
549 556
550 if (recurse_to_leaf) 557 if (recurse_to_leaf)
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index af14cb425164..6e7a236525b6 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -423,7 +423,8 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
423 } 423 }
424} 424}
425 425
426int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep) 426static int ceph_key_instantiate(struct key *key,
427 struct key_preparsed_payload *prep)
427{ 428{
428 struct ceph_crypto_key *ckey; 429 struct ceph_crypto_key *ckey;
429 size_t datalen = prep->datalen; 430 size_t datalen = prep->datalen;
@@ -458,12 +459,12 @@ err:
458 return ret; 459 return ret;
459} 460}
460 461
461int ceph_key_match(const struct key *key, const void *description) 462static int ceph_key_match(const struct key *key, const void *description)
462{ 463{
463 return strcmp(key->description, description) == 0; 464 return strcmp(key->description, description) == 0;
464} 465}
465 466
466void ceph_key_destroy(struct key *key) { 467static void ceph_key_destroy(struct key *key) {
467 struct ceph_crypto_key *ckey = key->payload.data; 468 struct ceph_crypto_key *ckey = key->payload.data;
468 469
469 ceph_crypto_key_destroy(ckey); 470 ceph_crypto_key_destroy(ckey);
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 38b5dc1823d4..00d051f4894e 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -66,9 +66,9 @@ static int osdmap_show(struct seq_file *s, void *p)
66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { 66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
67 struct ceph_pg_pool_info *pool = 67 struct ceph_pg_pool_info *pool =
68 rb_entry(n, struct ceph_pg_pool_info, node); 68 rb_entry(n, struct ceph_pg_pool_info, node);
69 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", 69 seq_printf(s, "pg_pool %llu pg_num %d / %d\n",
70 pool->id, pool->v.pg_num, pool->pg_num_mask, 70 (unsigned long long)pool->id, pool->pg_num,
71 pool->v.lpg_num, pool->lpg_num_mask); 71 pool->pg_num_mask);
72 } 72 }
73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) { 73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
74 struct ceph_entity_addr *addr = 74 struct ceph_entity_addr *addr =
@@ -123,26 +123,16 @@ static int osdc_show(struct seq_file *s, void *pp)
123 mutex_lock(&osdc->request_mutex); 123 mutex_lock(&osdc->request_mutex);
124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { 124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
125 struct ceph_osd_request *req; 125 struct ceph_osd_request *req;
126 struct ceph_osd_request_head *head; 126 int opcode;
127 struct ceph_osd_op *op;
128 int num_ops;
129 int opcode, olen;
130 int i; 127 int i;
131 128
132 req = rb_entry(p, struct ceph_osd_request, r_node); 129 req = rb_entry(p, struct ceph_osd_request, r_node);
133 130
134 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, 131 seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
135 req->r_osd ? req->r_osd->o_osd : -1, 132 req->r_osd ? req->r_osd->o_osd : -1,
136 le32_to_cpu(req->r_pgid.pool), 133 req->r_pgid.pool, req->r_pgid.seed);
137 le16_to_cpu(req->r_pgid.ps));
138 134
139 head = req->r_request->front.iov_base; 135 seq_printf(s, "%.*s", req->r_oid_len, req->r_oid);
140 op = (void *)(head + 1);
141
142 num_ops = le16_to_cpu(head->num_ops);
143 olen = le32_to_cpu(head->object_len);
144 seq_printf(s, "%.*s", olen,
145 (const char *)(head->ops + num_ops));
146 136
147 if (req->r_reassert_version.epoch) 137 if (req->r_reassert_version.epoch)
148 seq_printf(s, "\t%u'%llu", 138 seq_printf(s, "\t%u'%llu",
@@ -151,10 +141,9 @@ static int osdc_show(struct seq_file *s, void *pp)
151 else 141 else
152 seq_printf(s, "\t"); 142 seq_printf(s, "\t");
153 143
154 for (i = 0; i < num_ops; i++) { 144 for (i = 0; i < req->r_num_ops; i++) {
155 opcode = le16_to_cpu(op->op); 145 opcode = le16_to_cpu(req->r_request_ops[i].op);
156 seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); 146 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
157 op++;
158 } 147 }
159 148
160 seq_printf(s, "\n"); 149 seq_printf(s, "\n");
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 5ccf87ed8d68..2c0669fb54e3 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -9,8 +9,9 @@
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/socket.h> 10#include <linux/socket.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#ifdef CONFIG_BLOCK
12#include <linux/bio.h> 13#include <linux/bio.h>
13#include <linux/blkdev.h> 14#endif /* CONFIG_BLOCK */
14#include <linux/dns_resolver.h> 15#include <linux/dns_resolver.h>
15#include <net/tcp.h> 16#include <net/tcp.h>
16 17
@@ -97,6 +98,57 @@
97#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ 98#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */
98#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ 99#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */
99 100
101static bool con_flag_valid(unsigned long con_flag)
102{
103 switch (con_flag) {
104 case CON_FLAG_LOSSYTX:
105 case CON_FLAG_KEEPALIVE_PENDING:
106 case CON_FLAG_WRITE_PENDING:
107 case CON_FLAG_SOCK_CLOSED:
108 case CON_FLAG_BACKOFF:
109 return true;
110 default:
111 return false;
112 }
113}
114
115static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
116{
117 BUG_ON(!con_flag_valid(con_flag));
118
119 clear_bit(con_flag, &con->flags);
120}
121
122static void con_flag_set(struct ceph_connection *con, unsigned long con_flag)
123{
124 BUG_ON(!con_flag_valid(con_flag));
125
126 set_bit(con_flag, &con->flags);
127}
128
129static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag)
130{
131 BUG_ON(!con_flag_valid(con_flag));
132
133 return test_bit(con_flag, &con->flags);
134}
135
136static bool con_flag_test_and_clear(struct ceph_connection *con,
137 unsigned long con_flag)
138{
139 BUG_ON(!con_flag_valid(con_flag));
140
141 return test_and_clear_bit(con_flag, &con->flags);
142}
143
144static bool con_flag_test_and_set(struct ceph_connection *con,
145 unsigned long con_flag)
146{
147 BUG_ON(!con_flag_valid(con_flag));
148
149 return test_and_set_bit(con_flag, &con->flags);
150}
151
100/* static tag bytes (protocol control messages) */ 152/* static tag bytes (protocol control messages) */
101static char tag_msg = CEPH_MSGR_TAG_MSG; 153static char tag_msg = CEPH_MSGR_TAG_MSG;
102static char tag_ack = CEPH_MSGR_TAG_ACK; 154static char tag_ack = CEPH_MSGR_TAG_ACK;
@@ -114,7 +166,7 @@ static struct lock_class_key socket_class;
114 166
115static void queue_con(struct ceph_connection *con); 167static void queue_con(struct ceph_connection *con);
116static void con_work(struct work_struct *); 168static void con_work(struct work_struct *);
117static void ceph_fault(struct ceph_connection *con); 169static void con_fault(struct ceph_connection *con);
118 170
119/* 171/*
120 * Nicely render a sockaddr as a string. An array of formatted 172 * Nicely render a sockaddr as a string. An array of formatted
@@ -171,7 +223,7 @@ static void encode_my_addr(struct ceph_messenger *msgr)
171 */ 223 */
172static struct workqueue_struct *ceph_msgr_wq; 224static struct workqueue_struct *ceph_msgr_wq;
173 225
174void _ceph_msgr_exit(void) 226static void _ceph_msgr_exit(void)
175{ 227{
176 if (ceph_msgr_wq) { 228 if (ceph_msgr_wq) {
177 destroy_workqueue(ceph_msgr_wq); 229 destroy_workqueue(ceph_msgr_wq);
@@ -308,7 +360,7 @@ static void ceph_sock_write_space(struct sock *sk)
308 * buffer. See net/ipv4/tcp_input.c:tcp_check_space() 360 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
309 * and net/core/stream.c:sk_stream_write_space(). 361 * and net/core/stream.c:sk_stream_write_space().
310 */ 362 */
311 if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) { 363 if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) {
312 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { 364 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
313 dout("%s %p queueing write work\n", __func__, con); 365 dout("%s %p queueing write work\n", __func__, con);
314 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 366 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -333,7 +385,7 @@ static void ceph_sock_state_change(struct sock *sk)
333 case TCP_CLOSE_WAIT: 385 case TCP_CLOSE_WAIT:
334 dout("%s TCP_CLOSE_WAIT\n", __func__); 386 dout("%s TCP_CLOSE_WAIT\n", __func__);
335 con_sock_state_closing(con); 387 con_sock_state_closing(con);
336 set_bit(CON_FLAG_SOCK_CLOSED, &con->flags); 388 con_flag_set(con, CON_FLAG_SOCK_CLOSED);
337 queue_con(con); 389 queue_con(con);
338 break; 390 break;
339 case TCP_ESTABLISHED: 391 case TCP_ESTABLISHED:
@@ -474,7 +526,7 @@ static int con_close_socket(struct ceph_connection *con)
474 * received a socket close event before we had the chance to 526 * received a socket close event before we had the chance to
475 * shut the socket down. 527 * shut the socket down.
476 */ 528 */
477 clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags); 529 con_flag_clear(con, CON_FLAG_SOCK_CLOSED);
478 530
479 con_sock_state_closed(con); 531 con_sock_state_closed(con);
480 return rc; 532 return rc;
@@ -538,11 +590,10 @@ void ceph_con_close(struct ceph_connection *con)
538 ceph_pr_addr(&con->peer_addr.in_addr)); 590 ceph_pr_addr(&con->peer_addr.in_addr));
539 con->state = CON_STATE_CLOSED; 591 con->state = CON_STATE_CLOSED;
540 592
541 clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */ 593 con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */
542 clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); 594 con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING);
543 clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); 595 con_flag_clear(con, CON_FLAG_WRITE_PENDING);
544 clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); 596 con_flag_clear(con, CON_FLAG_BACKOFF);
545 clear_bit(CON_FLAG_BACKOFF, &con->flags);
546 597
547 reset_connection(con); 598 reset_connection(con);
548 con->peer_global_seq = 0; 599 con->peer_global_seq = 0;
@@ -798,7 +849,7 @@ static void prepare_write_message(struct ceph_connection *con)
798 /* no, queue up footer too and be done */ 849 /* no, queue up footer too and be done */
799 prepare_write_message_footer(con); 850 prepare_write_message_footer(con);
800 851
801 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 852 con_flag_set(con, CON_FLAG_WRITE_PENDING);
802} 853}
803 854
804/* 855/*
@@ -819,7 +870,7 @@ static void prepare_write_ack(struct ceph_connection *con)
819 &con->out_temp_ack); 870 &con->out_temp_ack);
820 871
821 con->out_more = 1; /* more will follow.. eventually.. */ 872 con->out_more = 1; /* more will follow.. eventually.. */
822 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 873 con_flag_set(con, CON_FLAG_WRITE_PENDING);
823} 874}
824 875
825/* 876/*
@@ -830,7 +881,7 @@ static void prepare_write_keepalive(struct ceph_connection *con)
830 dout("prepare_write_keepalive %p\n", con); 881 dout("prepare_write_keepalive %p\n", con);
831 con_out_kvec_reset(con); 882 con_out_kvec_reset(con);
832 con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); 883 con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
833 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 884 con_flag_set(con, CON_FLAG_WRITE_PENDING);
834} 885}
835 886
836/* 887/*
@@ -873,7 +924,7 @@ static void prepare_write_banner(struct ceph_connection *con)
873 &con->msgr->my_enc_addr); 924 &con->msgr->my_enc_addr);
874 925
875 con->out_more = 0; 926 con->out_more = 0;
876 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 927 con_flag_set(con, CON_FLAG_WRITE_PENDING);
877} 928}
878 929
879static int prepare_write_connect(struct ceph_connection *con) 930static int prepare_write_connect(struct ceph_connection *con)
@@ -923,7 +974,7 @@ static int prepare_write_connect(struct ceph_connection *con)
923 auth->authorizer_buf); 974 auth->authorizer_buf);
924 975
925 con->out_more = 0; 976 con->out_more = 0;
926 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 977 con_flag_set(con, CON_FLAG_WRITE_PENDING);
927 978
928 return 0; 979 return 0;
929} 980}
@@ -1643,7 +1694,7 @@ static int process_connect(struct ceph_connection *con)
1643 le32_to_cpu(con->in_reply.connect_seq)); 1694 le32_to_cpu(con->in_reply.connect_seq));
1644 1695
1645 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) 1696 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1646 set_bit(CON_FLAG_LOSSYTX, &con->flags); 1697 con_flag_set(con, CON_FLAG_LOSSYTX);
1647 1698
1648 con->delay = 0; /* reset backoff memory */ 1699 con->delay = 0; /* reset backoff memory */
1649 1700
@@ -2080,15 +2131,14 @@ do_next:
2080 prepare_write_ack(con); 2131 prepare_write_ack(con);
2081 goto more; 2132 goto more;
2082 } 2133 }
2083 if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING, 2134 if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
2084 &con->flags)) {
2085 prepare_write_keepalive(con); 2135 prepare_write_keepalive(con);
2086 goto more; 2136 goto more;
2087 } 2137 }
2088 } 2138 }
2089 2139
2090 /* Nothing to do! */ 2140 /* Nothing to do! */
2091 clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); 2141 con_flag_clear(con, CON_FLAG_WRITE_PENDING);
2092 dout("try_write nothing else to write.\n"); 2142 dout("try_write nothing else to write.\n");
2093 ret = 0; 2143 ret = 0;
2094out: 2144out:
@@ -2268,7 +2318,7 @@ static void queue_con(struct ceph_connection *con)
2268 2318
2269static bool con_sock_closed(struct ceph_connection *con) 2319static bool con_sock_closed(struct ceph_connection *con)
2270{ 2320{
2271 if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) 2321 if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
2272 return false; 2322 return false;
2273 2323
2274#define CASE(x) \ 2324#define CASE(x) \
@@ -2295,6 +2345,41 @@ static bool con_sock_closed(struct ceph_connection *con)
2295 return true; 2345 return true;
2296} 2346}
2297 2347
2348static bool con_backoff(struct ceph_connection *con)
2349{
2350 int ret;
2351
2352 if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
2353 return false;
2354
2355 ret = queue_con_delay(con, round_jiffies_relative(con->delay));
2356 if (ret) {
2357 dout("%s: con %p FAILED to back off %lu\n", __func__,
2358 con, con->delay);
2359 BUG_ON(ret == -ENOENT);
2360 con_flag_set(con, CON_FLAG_BACKOFF);
2361 }
2362
2363 return true;
2364}
2365
2366/* Finish fault handling; con->mutex must *not* be held here */
2367
2368static void con_fault_finish(struct ceph_connection *con)
2369{
2370 /*
2371 * in case we faulted due to authentication, invalidate our
2372 * current tickets so that we can get new ones.
2373 */
2374 if (con->auth_retry && con->ops->invalidate_authorizer) {
2375 dout("calling invalidate_authorizer()\n");
2376 con->ops->invalidate_authorizer(con);
2377 }
2378
2379 if (con->ops->fault)
2380 con->ops->fault(con);
2381}
2382
2298/* 2383/*
2299 * Do some work on a connection. Drop a connection ref when we're done. 2384 * Do some work on a connection. Drop a connection ref when we're done.
2300 */ 2385 */
@@ -2302,73 +2387,68 @@ static void con_work(struct work_struct *work)
2302{ 2387{
2303 struct ceph_connection *con = container_of(work, struct ceph_connection, 2388 struct ceph_connection *con = container_of(work, struct ceph_connection,
2304 work.work); 2389 work.work);
2305 int ret; 2390 bool fault;
2306 2391
2307 mutex_lock(&con->mutex); 2392 mutex_lock(&con->mutex);
2308restart: 2393 while (true) {
2309 if (con_sock_closed(con)) 2394 int ret;
2310 goto fault;
2311 2395
2312 if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { 2396 if ((fault = con_sock_closed(con))) {
2313 dout("con_work %p backing off\n", con); 2397 dout("%s: con %p SOCK_CLOSED\n", __func__, con);
2314 ret = queue_con_delay(con, round_jiffies_relative(con->delay)); 2398 break;
2315 if (ret) { 2399 }
2316 dout("con_work %p FAILED to back off %lu\n", con, 2400 if (con_backoff(con)) {
2317 con->delay); 2401 dout("%s: con %p BACKOFF\n", __func__, con);
2318 BUG_ON(ret == -ENOENT); 2402 break;
2319 set_bit(CON_FLAG_BACKOFF, &con->flags); 2403 }
2404 if (con->state == CON_STATE_STANDBY) {
2405 dout("%s: con %p STANDBY\n", __func__, con);
2406 break;
2407 }
2408 if (con->state == CON_STATE_CLOSED) {
2409 dout("%s: con %p CLOSED\n", __func__, con);
2410 BUG_ON(con->sock);
2411 break;
2412 }
2413 if (con->state == CON_STATE_PREOPEN) {
2414 dout("%s: con %p PREOPEN\n", __func__, con);
2415 BUG_ON(con->sock);
2320 } 2416 }
2321 goto done;
2322 }
2323 2417
2324 if (con->state == CON_STATE_STANDBY) { 2418 ret = try_read(con);
2325 dout("con_work %p STANDBY\n", con); 2419 if (ret < 0) {
2326 goto done; 2420 if (ret == -EAGAIN)
2327 } 2421 continue;
2328 if (con->state == CON_STATE_CLOSED) { 2422 con->error_msg = "socket error on read";
2329 dout("con_work %p CLOSED\n", con); 2423 fault = true;
2330 BUG_ON(con->sock); 2424 break;
2331 goto done; 2425 }
2332 }
2333 if (con->state == CON_STATE_PREOPEN) {
2334 dout("con_work OPENING\n");
2335 BUG_ON(con->sock);
2336 }
2337 2426
2338 ret = try_read(con); 2427 ret = try_write(con);
2339 if (ret == -EAGAIN) 2428 if (ret < 0) {
2340 goto restart; 2429 if (ret == -EAGAIN)
2341 if (ret < 0) { 2430 continue;
2342 con->error_msg = "socket error on read"; 2431 con->error_msg = "socket error on write";
2343 goto fault; 2432 fault = true;
2344 } 2433 }
2345 2434
2346 ret = try_write(con); 2435 break; /* If we make it to here, we're done */
2347 if (ret == -EAGAIN)
2348 goto restart;
2349 if (ret < 0) {
2350 con->error_msg = "socket error on write";
2351 goto fault;
2352 } 2436 }
2353 2437 if (fault)
2354done: 2438 con_fault(con);
2355 mutex_unlock(&con->mutex); 2439 mutex_unlock(&con->mutex);
2356done_unlocked:
2357 con->ops->put(con);
2358 return;
2359 2440
2360fault: 2441 if (fault)
2361 ceph_fault(con); /* error/fault path */ 2442 con_fault_finish(con);
2362 goto done_unlocked;
2363}
2364 2443
2444 con->ops->put(con);
2445}
2365 2446
2366/* 2447/*
2367 * Generic error/fault handler. A retry mechanism is used with 2448 * Generic error/fault handler. A retry mechanism is used with
2368 * exponential backoff 2449 * exponential backoff
2369 */ 2450 */
2370static void ceph_fault(struct ceph_connection *con) 2451static void con_fault(struct ceph_connection *con)
2371 __releases(con->mutex)
2372{ 2452{
2373 pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), 2453 pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
2374 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); 2454 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
@@ -2381,10 +2461,10 @@ static void ceph_fault(struct ceph_connection *con)
2381 2461
2382 con_close_socket(con); 2462 con_close_socket(con);
2383 2463
2384 if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) { 2464 if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
2385 dout("fault on LOSSYTX channel, marking CLOSED\n"); 2465 dout("fault on LOSSYTX channel, marking CLOSED\n");
2386 con->state = CON_STATE_CLOSED; 2466 con->state = CON_STATE_CLOSED;
2387 goto out_unlock; 2467 return;
2388 } 2468 }
2389 2469
2390 if (con->in_msg) { 2470 if (con->in_msg) {
@@ -2401,9 +2481,9 @@ static void ceph_fault(struct ceph_connection *con)
2401 /* If there are no messages queued or keepalive pending, place 2481 /* If there are no messages queued or keepalive pending, place
2402 * the connection in a STANDBY state */ 2482 * the connection in a STANDBY state */
2403 if (list_empty(&con->out_queue) && 2483 if (list_empty(&con->out_queue) &&
2404 !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) { 2484 !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
2405 dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); 2485 dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
2406 clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); 2486 con_flag_clear(con, CON_FLAG_WRITE_PENDING);
2407 con->state = CON_STATE_STANDBY; 2487 con->state = CON_STATE_STANDBY;
2408 } else { 2488 } else {
2409 /* retry after a delay. */ 2489 /* retry after a delay. */
@@ -2412,23 +2492,9 @@ static void ceph_fault(struct ceph_connection *con)
2412 con->delay = BASE_DELAY_INTERVAL; 2492 con->delay = BASE_DELAY_INTERVAL;
2413 else if (con->delay < MAX_DELAY_INTERVAL) 2493 else if (con->delay < MAX_DELAY_INTERVAL)
2414 con->delay *= 2; 2494 con->delay *= 2;
2415 set_bit(CON_FLAG_BACKOFF, &con->flags); 2495 con_flag_set(con, CON_FLAG_BACKOFF);
2416 queue_con(con); 2496 queue_con(con);
2417 } 2497 }
2418
2419out_unlock:
2420 mutex_unlock(&con->mutex);
2421 /*
2422 * in case we faulted due to authentication, invalidate our
2423 * current tickets so that we can get new ones.
2424 */
2425 if (con->auth_retry && con->ops->invalidate_authorizer) {
2426 dout("calling invalidate_authorizer()\n");
2427 con->ops->invalidate_authorizer(con);
2428 }
2429
2430 if (con->ops->fault)
2431 con->ops->fault(con);
2432} 2498}
2433 2499
2434 2500
@@ -2469,8 +2535,8 @@ static void clear_standby(struct ceph_connection *con)
2469 dout("clear_standby %p and ++connect_seq\n", con); 2535 dout("clear_standby %p and ++connect_seq\n", con);
2470 con->state = CON_STATE_PREOPEN; 2536 con->state = CON_STATE_PREOPEN;
2471 con->connect_seq++; 2537 con->connect_seq++;
2472 WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags)); 2538 WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
2473 WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)); 2539 WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
2474 } 2540 }
2475} 2541}
2476 2542
@@ -2511,7 +2577,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
2511 2577
2512 /* if there wasn't anything waiting to send before, queue 2578 /* if there wasn't anything waiting to send before, queue
2513 * new work */ 2579 * new work */
2514 if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) 2580 if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
2515 queue_con(con); 2581 queue_con(con);
2516} 2582}
2517EXPORT_SYMBOL(ceph_con_send); 2583EXPORT_SYMBOL(ceph_con_send);
@@ -2600,8 +2666,8 @@ void ceph_con_keepalive(struct ceph_connection *con)
2600 mutex_lock(&con->mutex); 2666 mutex_lock(&con->mutex);
2601 clear_standby(con); 2667 clear_standby(con);
2602 mutex_unlock(&con->mutex); 2668 mutex_unlock(&con->mutex);
2603 if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 && 2669 if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 &&
2604 test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) 2670 con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
2605 queue_con(con); 2671 queue_con(con);
2606} 2672}
2607EXPORT_SYMBOL(ceph_con_keepalive); 2673EXPORT_SYMBOL(ceph_con_keepalive);
@@ -2651,9 +2717,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
2651 m->page_alignment = 0; 2717 m->page_alignment = 0;
2652 m->pages = NULL; 2718 m->pages = NULL;
2653 m->pagelist = NULL; 2719 m->pagelist = NULL;
2720#ifdef CONFIG_BLOCK
2654 m->bio = NULL; 2721 m->bio = NULL;
2655 m->bio_iter = NULL; 2722 m->bio_iter = NULL;
2656 m->bio_seg = 0; 2723 m->bio_seg = 0;
2724#endif /* CONFIG_BLOCK */
2657 m->trail = NULL; 2725 m->trail = NULL;
2658 2726
2659 /* front */ 2727 /* front */
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 812eb3b46c1f..aef5b1062bee 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -697,7 +697,7 @@ int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
697 u32 pool, u64 snapid) 697 u32 pool, u64 snapid)
698{ 698{
699 return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 699 return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
700 pool, snapid, 0, 0); 700 pool, snapid, NULL, 0);
701 701
702} 702}
703 703
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index eb9a44478764..d730dd4d8eb2 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -23,7 +23,7 @@
23 23
24static const struct ceph_connection_operations osd_con_ops; 24static const struct ceph_connection_operations osd_con_ops;
25 25
26static void send_queued(struct ceph_osd_client *osdc); 26static void __send_queued(struct ceph_osd_client *osdc);
27static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); 27static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
28static void __register_request(struct ceph_osd_client *osdc, 28static void __register_request(struct ceph_osd_client *osdc,
29 struct ceph_osd_request *req); 29 struct ceph_osd_request *req);
@@ -32,64 +32,12 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
32static void __send_request(struct ceph_osd_client *osdc, 32static void __send_request(struct ceph_osd_client *osdc,
33 struct ceph_osd_request *req); 33 struct ceph_osd_request *req);
34 34
35static int op_needs_trail(int op)
36{
37 switch (op) {
38 case CEPH_OSD_OP_GETXATTR:
39 case CEPH_OSD_OP_SETXATTR:
40 case CEPH_OSD_OP_CMPXATTR:
41 case CEPH_OSD_OP_CALL:
42 case CEPH_OSD_OP_NOTIFY:
43 return 1;
44 default:
45 return 0;
46 }
47}
48
49static int op_has_extent(int op) 35static int op_has_extent(int op)
50{ 36{
51 return (op == CEPH_OSD_OP_READ || 37 return (op == CEPH_OSD_OP_READ ||
52 op == CEPH_OSD_OP_WRITE); 38 op == CEPH_OSD_OP_WRITE);
53} 39}
54 40
55int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
56 struct ceph_file_layout *layout,
57 u64 snapid,
58 u64 off, u64 *plen, u64 *bno,
59 struct ceph_osd_request *req,
60 struct ceph_osd_req_op *op)
61{
62 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
63 u64 orig_len = *plen;
64 u64 objoff, objlen; /* extent in object */
65 int r;
66
67 reqhead->snapid = cpu_to_le64(snapid);
68
69 /* object extent? */
70 r = ceph_calc_file_object_mapping(layout, off, plen, bno,
71 &objoff, &objlen);
72 if (r < 0)
73 return r;
74 if (*plen < orig_len)
75 dout(" skipping last %llu, final file extent %llu~%llu\n",
76 orig_len - *plen, off, *plen);
77
78 if (op_has_extent(op->op)) {
79 op->extent.offset = objoff;
80 op->extent.length = objlen;
81 }
82 req->r_num_pages = calc_pages_for(off, *plen);
83 req->r_page_alignment = off & ~PAGE_MASK;
84 if (op->op == CEPH_OSD_OP_WRITE)
85 op->payload_len = *plen;
86
87 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
88 *bno, objoff, objlen, req->r_num_pages);
89 return 0;
90}
91EXPORT_SYMBOL(ceph_calc_raw_layout);
92
93/* 41/*
94 * Implement client access to distributed object storage cluster. 42 * Implement client access to distributed object storage cluster.
95 * 43 *
@@ -115,20 +63,48 @@ EXPORT_SYMBOL(ceph_calc_raw_layout);
115 * 63 *
116 * fill osd op in request message. 64 * fill osd op in request message.
117 */ 65 */
118static int calc_layout(struct ceph_osd_client *osdc, 66static int calc_layout(struct ceph_vino vino,
119 struct ceph_vino vino,
120 struct ceph_file_layout *layout, 67 struct ceph_file_layout *layout,
121 u64 off, u64 *plen, 68 u64 off, u64 *plen,
122 struct ceph_osd_request *req, 69 struct ceph_osd_request *req,
123 struct ceph_osd_req_op *op) 70 struct ceph_osd_req_op *op)
124{ 71{
125 u64 bno; 72 u64 orig_len = *plen;
73 u64 bno = 0;
74 u64 objoff = 0;
75 u64 objlen = 0;
126 int r; 76 int r;
127 77
128 r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, 78 /* object extent? */
129 plen, &bno, req, op); 79 r = ceph_calc_file_object_mapping(layout, off, orig_len, &bno,
80 &objoff, &objlen);
130 if (r < 0) 81 if (r < 0)
131 return r; 82 return r;
83 if (objlen < orig_len) {
84 *plen = objlen;
85 dout(" skipping last %llu, final file extent %llu~%llu\n",
86 orig_len - *plen, off, *plen);
87 }
88
89 if (op_has_extent(op->op)) {
90 u32 osize = le32_to_cpu(layout->fl_object_size);
91 op->extent.offset = objoff;
92 op->extent.length = objlen;
93 if (op->extent.truncate_size <= off - objoff) {
94 op->extent.truncate_size = 0;
95 } else {
96 op->extent.truncate_size -= off - objoff;
97 if (op->extent.truncate_size > osize)
98 op->extent.truncate_size = osize;
99 }
100 }
101 req->r_num_pages = calc_pages_for(off, *plen);
102 req->r_page_alignment = off & ~PAGE_MASK;
103 if (op->op == CEPH_OSD_OP_WRITE)
104 op->payload_len = *plen;
105
106 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
107 bno, objoff, objlen, req->r_num_pages);
132 108
133 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); 109 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno);
134 req->r_oid_len = strlen(req->r_oid); 110 req->r_oid_len = strlen(req->r_oid);
@@ -148,25 +124,19 @@ void ceph_osdc_release_request(struct kref *kref)
148 if (req->r_request) 124 if (req->r_request)
149 ceph_msg_put(req->r_request); 125 ceph_msg_put(req->r_request);
150 if (req->r_con_filling_msg) { 126 if (req->r_con_filling_msg) {
151 dout("%s revoking pages %p from con %p\n", __func__, 127 dout("%s revoking msg %p from con %p\n", __func__,
152 req->r_pages, req->r_con_filling_msg); 128 req->r_reply, req->r_con_filling_msg);
153 ceph_msg_revoke_incoming(req->r_reply); 129 ceph_msg_revoke_incoming(req->r_reply);
154 req->r_con_filling_msg->ops->put(req->r_con_filling_msg); 130 req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
131 req->r_con_filling_msg = NULL;
155 } 132 }
156 if (req->r_reply) 133 if (req->r_reply)
157 ceph_msg_put(req->r_reply); 134 ceph_msg_put(req->r_reply);
158 if (req->r_own_pages) 135 if (req->r_own_pages)
159 ceph_release_page_vector(req->r_pages, 136 ceph_release_page_vector(req->r_pages,
160 req->r_num_pages); 137 req->r_num_pages);
161#ifdef CONFIG_BLOCK
162 if (req->r_bio)
163 bio_put(req->r_bio);
164#endif
165 ceph_put_snap_context(req->r_snapc); 138 ceph_put_snap_context(req->r_snapc);
166 if (req->r_trail) { 139 ceph_pagelist_release(&req->r_trail);
167 ceph_pagelist_release(req->r_trail);
168 kfree(req->r_trail);
169 }
170 if (req->r_mempool) 140 if (req->r_mempool)
171 mempool_free(req, req->r_osdc->req_mempool); 141 mempool_free(req, req->r_osdc->req_mempool);
172 else 142 else
@@ -174,37 +144,25 @@ void ceph_osdc_release_request(struct kref *kref)
174} 144}
175EXPORT_SYMBOL(ceph_osdc_release_request); 145EXPORT_SYMBOL(ceph_osdc_release_request);
176 146
177static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
178{
179 int i = 0;
180
181 if (needs_trail)
182 *needs_trail = 0;
183 while (ops[i].op) {
184 if (needs_trail && op_needs_trail(ops[i].op))
185 *needs_trail = 1;
186 i++;
187 }
188
189 return i;
190}
191
192struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 147struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
193 int flags,
194 struct ceph_snap_context *snapc, 148 struct ceph_snap_context *snapc,
195 struct ceph_osd_req_op *ops, 149 unsigned int num_ops,
196 bool use_mempool, 150 bool use_mempool,
197 gfp_t gfp_flags, 151 gfp_t gfp_flags)
198 struct page **pages,
199 struct bio *bio)
200{ 152{
201 struct ceph_osd_request *req; 153 struct ceph_osd_request *req;
202 struct ceph_msg *msg; 154 struct ceph_msg *msg;
203 int needs_trail; 155 size_t msg_size;
204 int num_op = get_num_ops(ops, &needs_trail); 156
205 size_t msg_size = sizeof(struct ceph_osd_request_head); 157 msg_size = 4 + 4 + 8 + 8 + 4+8;
206 158 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
207 msg_size += num_op*sizeof(struct ceph_osd_op); 159 msg_size += 1 + 8 + 4 + 4; /* pg_t */
160 msg_size += 4 + MAX_OBJ_NAME_SIZE;
161 msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
162 msg_size += 8; /* snapid */
163 msg_size += 8; /* snap_seq */
164 msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
165 msg_size += 4;
208 166
209 if (use_mempool) { 167 if (use_mempool) {
210 req = mempool_alloc(osdc->req_mempool, gfp_flags); 168 req = mempool_alloc(osdc->req_mempool, gfp_flags);
@@ -228,10 +186,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
228 INIT_LIST_HEAD(&req->r_req_lru_item); 186 INIT_LIST_HEAD(&req->r_req_lru_item);
229 INIT_LIST_HEAD(&req->r_osd_item); 187 INIT_LIST_HEAD(&req->r_osd_item);
230 188
231 req->r_flags = flags;
232
233 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
234
235 /* create reply message */ 189 /* create reply message */
236 if (use_mempool) 190 if (use_mempool)
237 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 191 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
@@ -244,20 +198,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
244 } 198 }
245 req->r_reply = msg; 199 req->r_reply = msg;
246 200
247 /* allocate space for the trailing data */ 201 ceph_pagelist_init(&req->r_trail);
248 if (needs_trail) {
249 req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
250 if (!req->r_trail) {
251 ceph_osdc_put_request(req);
252 return NULL;
253 }
254 ceph_pagelist_init(req->r_trail);
255 }
256 202
257 /* create request message; allow space for oid */ 203 /* create request message; allow space for oid */
258 msg_size += MAX_OBJ_NAME_SIZE;
259 if (snapc)
260 msg_size += sizeof(u64) * snapc->num_snaps;
261 if (use_mempool) 204 if (use_mempool)
262 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 205 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
263 else 206 else
@@ -270,13 +213,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
270 memset(msg->front.iov_base, 0, msg->front.iov_len); 213 memset(msg->front.iov_base, 0, msg->front.iov_len);
271 214
272 req->r_request = msg; 215 req->r_request = msg;
273 req->r_pages = pages;
274#ifdef CONFIG_BLOCK
275 if (bio) {
276 req->r_bio = bio;
277 bio_get(req->r_bio);
278 }
279#endif
280 216
281 return req; 217 return req;
282} 218}
@@ -289,6 +225,8 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
289 dst->op = cpu_to_le16(src->op); 225 dst->op = cpu_to_le16(src->op);
290 226
291 switch (src->op) { 227 switch (src->op) {
228 case CEPH_OSD_OP_STAT:
229 break;
292 case CEPH_OSD_OP_READ: 230 case CEPH_OSD_OP_READ:
293 case CEPH_OSD_OP_WRITE: 231 case CEPH_OSD_OP_WRITE:
294 dst->extent.offset = 232 dst->extent.offset =
@@ -300,52 +238,20 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
300 dst->extent.truncate_seq = 238 dst->extent.truncate_seq =
301 cpu_to_le32(src->extent.truncate_seq); 239 cpu_to_le32(src->extent.truncate_seq);
302 break; 240 break;
303
304 case CEPH_OSD_OP_GETXATTR:
305 case CEPH_OSD_OP_SETXATTR:
306 case CEPH_OSD_OP_CMPXATTR:
307 BUG_ON(!req->r_trail);
308
309 dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
310 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
311 dst->xattr.cmp_op = src->xattr.cmp_op;
312 dst->xattr.cmp_mode = src->xattr.cmp_mode;
313 ceph_pagelist_append(req->r_trail, src->xattr.name,
314 src->xattr.name_len);
315 ceph_pagelist_append(req->r_trail, src->xattr.val,
316 src->xattr.value_len);
317 break;
318 case CEPH_OSD_OP_CALL: 241 case CEPH_OSD_OP_CALL:
319 BUG_ON(!req->r_trail);
320
321 dst->cls.class_len = src->cls.class_len; 242 dst->cls.class_len = src->cls.class_len;
322 dst->cls.method_len = src->cls.method_len; 243 dst->cls.method_len = src->cls.method_len;
323 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); 244 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
324 245
325 ceph_pagelist_append(req->r_trail, src->cls.class_name, 246 ceph_pagelist_append(&req->r_trail, src->cls.class_name,
326 src->cls.class_len); 247 src->cls.class_len);
327 ceph_pagelist_append(req->r_trail, src->cls.method_name, 248 ceph_pagelist_append(&req->r_trail, src->cls.method_name,
328 src->cls.method_len); 249 src->cls.method_len);
329 ceph_pagelist_append(req->r_trail, src->cls.indata, 250 ceph_pagelist_append(&req->r_trail, src->cls.indata,
330 src->cls.indata_len); 251 src->cls.indata_len);
331 break; 252 break;
332 case CEPH_OSD_OP_ROLLBACK:
333 dst->snap.snapid = cpu_to_le64(src->snap.snapid);
334 break;
335 case CEPH_OSD_OP_STARTSYNC: 253 case CEPH_OSD_OP_STARTSYNC:
336 break; 254 break;
337 case CEPH_OSD_OP_NOTIFY:
338 {
339 __le32 prot_ver = cpu_to_le32(src->watch.prot_ver);
340 __le32 timeout = cpu_to_le32(src->watch.timeout);
341
342 BUG_ON(!req->r_trail);
343
344 ceph_pagelist_append(req->r_trail,
345 &prot_ver, sizeof(prot_ver));
346 ceph_pagelist_append(req->r_trail,
347 &timeout, sizeof(timeout));
348 }
349 case CEPH_OSD_OP_NOTIFY_ACK: 255 case CEPH_OSD_OP_NOTIFY_ACK:
350 case CEPH_OSD_OP_WATCH: 256 case CEPH_OSD_OP_WATCH:
351 dst->watch.cookie = cpu_to_le64(src->watch.cookie); 257 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
@@ -356,6 +262,64 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
356 pr_err("unrecognized osd opcode %d\n", dst->op); 262 pr_err("unrecognized osd opcode %d\n", dst->op);
357 WARN_ON(1); 263 WARN_ON(1);
358 break; 264 break;
265 case CEPH_OSD_OP_MAPEXT:
266 case CEPH_OSD_OP_MASKTRUNC:
267 case CEPH_OSD_OP_SPARSE_READ:
268 case CEPH_OSD_OP_NOTIFY:
269 case CEPH_OSD_OP_ASSERT_VER:
270 case CEPH_OSD_OP_WRITEFULL:
271 case CEPH_OSD_OP_TRUNCATE:
272 case CEPH_OSD_OP_ZERO:
273 case CEPH_OSD_OP_DELETE:
274 case CEPH_OSD_OP_APPEND:
275 case CEPH_OSD_OP_SETTRUNC:
276 case CEPH_OSD_OP_TRIMTRUNC:
277 case CEPH_OSD_OP_TMAPUP:
278 case CEPH_OSD_OP_TMAPPUT:
279 case CEPH_OSD_OP_TMAPGET:
280 case CEPH_OSD_OP_CREATE:
281 case CEPH_OSD_OP_ROLLBACK:
282 case CEPH_OSD_OP_OMAPGETKEYS:
283 case CEPH_OSD_OP_OMAPGETVALS:
284 case CEPH_OSD_OP_OMAPGETHEADER:
285 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
286 case CEPH_OSD_OP_MODE_RD:
287 case CEPH_OSD_OP_OMAPSETVALS:
288 case CEPH_OSD_OP_OMAPSETHEADER:
289 case CEPH_OSD_OP_OMAPCLEAR:
290 case CEPH_OSD_OP_OMAPRMKEYS:
291 case CEPH_OSD_OP_OMAP_CMP:
292 case CEPH_OSD_OP_CLONERANGE:
293 case CEPH_OSD_OP_ASSERT_SRC_VERSION:
294 case CEPH_OSD_OP_SRC_CMPXATTR:
295 case CEPH_OSD_OP_GETXATTR:
296 case CEPH_OSD_OP_GETXATTRS:
297 case CEPH_OSD_OP_CMPXATTR:
298 case CEPH_OSD_OP_SETXATTR:
299 case CEPH_OSD_OP_SETXATTRS:
300 case CEPH_OSD_OP_RESETXATTRS:
301 case CEPH_OSD_OP_RMXATTR:
302 case CEPH_OSD_OP_PULL:
303 case CEPH_OSD_OP_PUSH:
304 case CEPH_OSD_OP_BALANCEREADS:
305 case CEPH_OSD_OP_UNBALANCEREADS:
306 case CEPH_OSD_OP_SCRUB:
307 case CEPH_OSD_OP_SCRUB_RESERVE:
308 case CEPH_OSD_OP_SCRUB_UNRESERVE:
309 case CEPH_OSD_OP_SCRUB_STOP:
310 case CEPH_OSD_OP_SCRUB_MAP:
311 case CEPH_OSD_OP_WRLOCK:
312 case CEPH_OSD_OP_WRUNLOCK:
313 case CEPH_OSD_OP_RDLOCK:
314 case CEPH_OSD_OP_RDUNLOCK:
315 case CEPH_OSD_OP_UPLOCK:
316 case CEPH_OSD_OP_DNLOCK:
317 case CEPH_OSD_OP_PGLS:
318 case CEPH_OSD_OP_PGLS_FILTER:
319 pr_err("unsupported osd opcode %s\n",
320 ceph_osd_op_name(dst->op));
321 WARN_ON(1);
322 break;
359 } 323 }
360 dst->payload_len = cpu_to_le32(src->payload_len); 324 dst->payload_len = cpu_to_le32(src->payload_len);
361} 325}
@@ -365,75 +329,95 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
365 * 329 *
366 */ 330 */
367void ceph_osdc_build_request(struct ceph_osd_request *req, 331void ceph_osdc_build_request(struct ceph_osd_request *req,
368 u64 off, u64 *plen, 332 u64 off, u64 len, unsigned int num_ops,
369 struct ceph_osd_req_op *src_ops, 333 struct ceph_osd_req_op *src_ops,
370 struct ceph_snap_context *snapc, 334 struct ceph_snap_context *snapc, u64 snap_id,
371 struct timespec *mtime, 335 struct timespec *mtime)
372 const char *oid,
373 int oid_len)
374{ 336{
375 struct ceph_msg *msg = req->r_request; 337 struct ceph_msg *msg = req->r_request;
376 struct ceph_osd_request_head *head;
377 struct ceph_osd_req_op *src_op; 338 struct ceph_osd_req_op *src_op;
378 struct ceph_osd_op *op;
379 void *p; 339 void *p;
380 int num_op = get_num_ops(src_ops, NULL); 340 size_t msg_size;
381 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
382 int flags = req->r_flags; 341 int flags = req->r_flags;
383 u64 data_len = 0; 342 u64 data_len;
384 int i; 343 int i;
385 344
386 head = msg->front.iov_base; 345 req->r_num_ops = num_ops;
387 op = (void *)(head + 1); 346 req->r_snapid = snap_id;
388 p = (void *)(op + num_op);
389
390 req->r_snapc = ceph_get_snap_context(snapc); 347 req->r_snapc = ceph_get_snap_context(snapc);
391 348
392 head->client_inc = cpu_to_le32(1); /* always, for now. */ 349 /* encode request */
393 head->flags = cpu_to_le32(flags); 350 msg->hdr.version = cpu_to_le16(4);
394 if (flags & CEPH_OSD_FLAG_WRITE)
395 ceph_encode_timespec(&head->mtime, mtime);
396 head->num_ops = cpu_to_le16(num_op);
397
398
399 /* fill in oid */
400 head->object_len = cpu_to_le32(oid_len);
401 memcpy(p, oid, oid_len);
402 p += oid_len;
403 351
352 p = msg->front.iov_base;
353 ceph_encode_32(&p, 1); /* client_inc is always 1 */
354 req->r_request_osdmap_epoch = p;
355 p += 4;
356 req->r_request_flags = p;
357 p += 4;
358 if (req->r_flags & CEPH_OSD_FLAG_WRITE)
359 ceph_encode_timespec(p, mtime);
360 p += sizeof(struct ceph_timespec);
361 req->r_request_reassert_version = p;
362 p += sizeof(struct ceph_eversion); /* will get filled in */
363
364 /* oloc */
365 ceph_encode_8(&p, 4);
366 ceph_encode_8(&p, 4);
367 ceph_encode_32(&p, 8 + 4 + 4);
368 req->r_request_pool = p;
369 p += 8;
370 ceph_encode_32(&p, -1); /* preferred */
371 ceph_encode_32(&p, 0); /* key len */
372
373 ceph_encode_8(&p, 1);
374 req->r_request_pgid = p;
375 p += 8 + 4;
376 ceph_encode_32(&p, -1); /* preferred */
377
378 /* oid */
379 ceph_encode_32(&p, req->r_oid_len);
380 memcpy(p, req->r_oid, req->r_oid_len);
381 dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
382 p += req->r_oid_len;
383
384 /* ops */
385 ceph_encode_16(&p, num_ops);
404 src_op = src_ops; 386 src_op = src_ops;
405 while (src_op->op) { 387 req->r_request_ops = p;
406 osd_req_encode_op(req, op, src_op); 388 for (i = 0; i < num_ops; i++, src_op++) {
407 src_op++; 389 osd_req_encode_op(req, p, src_op);
408 op++; 390 p += sizeof(struct ceph_osd_op);
409 } 391 }
410 392
411 if (req->r_trail) 393 /* snaps */
412 data_len += req->r_trail->length; 394 ceph_encode_64(&p, req->r_snapid);
413 395 ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
414 if (snapc) { 396 ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
415 head->snap_seq = cpu_to_le64(snapc->seq); 397 if (req->r_snapc) {
416 head->num_snaps = cpu_to_le32(snapc->num_snaps);
417 for (i = 0; i < snapc->num_snaps; i++) { 398 for (i = 0; i < snapc->num_snaps; i++) {
418 put_unaligned_le64(snapc->snaps[i], p); 399 ceph_encode_64(&p, req->r_snapc->snaps[i]);
419 p += sizeof(u64);
420 } 400 }
421 } 401 }
422 402
403 req->r_request_attempts = p;
404 p += 4;
405
406 data_len = req->r_trail.length;
423 if (flags & CEPH_OSD_FLAG_WRITE) { 407 if (flags & CEPH_OSD_FLAG_WRITE) {
424 req->r_request->hdr.data_off = cpu_to_le16(off); 408 req->r_request->hdr.data_off = cpu_to_le16(off);
425 req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); 409 data_len += len;
426 } else if (data_len) {
427 req->r_request->hdr.data_off = 0;
428 req->r_request->hdr.data_len = cpu_to_le32(data_len);
429 } 410 }
430 411 req->r_request->hdr.data_len = cpu_to_le32(data_len);
431 req->r_request->page_alignment = req->r_page_alignment; 412 req->r_request->page_alignment = req->r_page_alignment;
432 413
433 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 414 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
434 msg_size = p - msg->front.iov_base; 415 msg_size = p - msg->front.iov_base;
435 msg->front.iov_len = msg_size; 416 msg->front.iov_len = msg_size;
436 msg->hdr.front_len = cpu_to_le32(msg_size); 417 msg->hdr.front_len = cpu_to_le32(msg_size);
418
419 dout("build_request msg_size was %d num_ops %d\n", (int)msg_size,
420 num_ops);
437 return; 421 return;
438} 422}
439EXPORT_SYMBOL(ceph_osdc_build_request); 423EXPORT_SYMBOL(ceph_osdc_build_request);
@@ -459,34 +443,33 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
459 u32 truncate_seq, 443 u32 truncate_seq,
460 u64 truncate_size, 444 u64 truncate_size,
461 struct timespec *mtime, 445 struct timespec *mtime,
462 bool use_mempool, int num_reply, 446 bool use_mempool,
463 int page_align) 447 int page_align)
464{ 448{
465 struct ceph_osd_req_op ops[3]; 449 struct ceph_osd_req_op ops[2];
466 struct ceph_osd_request *req; 450 struct ceph_osd_request *req;
451 unsigned int num_op = 1;
467 int r; 452 int r;
468 453
454 memset(&ops, 0, sizeof ops);
455
469 ops[0].op = opcode; 456 ops[0].op = opcode;
470 ops[0].extent.truncate_seq = truncate_seq; 457 ops[0].extent.truncate_seq = truncate_seq;
471 ops[0].extent.truncate_size = truncate_size; 458 ops[0].extent.truncate_size = truncate_size;
472 ops[0].payload_len = 0;
473 459
474 if (do_sync) { 460 if (do_sync) {
475 ops[1].op = CEPH_OSD_OP_STARTSYNC; 461 ops[1].op = CEPH_OSD_OP_STARTSYNC;
476 ops[1].payload_len = 0; 462 num_op++;
477 ops[2].op = 0; 463 }
478 } else 464
479 ops[1].op = 0; 465 req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool,
480 466 GFP_NOFS);
481 req = ceph_osdc_alloc_request(osdc, flags,
482 snapc, ops,
483 use_mempool,
484 GFP_NOFS, NULL, NULL);
485 if (!req) 467 if (!req)
486 return ERR_PTR(-ENOMEM); 468 return ERR_PTR(-ENOMEM);
469 req->r_flags = flags;
487 470
488 /* calculate max write size */ 471 /* calculate max write size */
489 r = calc_layout(osdc, vino, layout, off, plen, req, ops); 472 r = calc_layout(vino, layout, off, plen, req, ops);
490 if (r < 0) 473 if (r < 0)
491 return ERR_PTR(r); 474 return ERR_PTR(r);
492 req->r_file_layout = *layout; /* keep a copy */ 475 req->r_file_layout = *layout; /* keep a copy */
@@ -496,10 +479,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
496 req->r_num_pages = calc_pages_for(page_align, *plen); 479 req->r_num_pages = calc_pages_for(page_align, *plen);
497 req->r_page_alignment = page_align; 480 req->r_page_alignment = page_align;
498 481
499 ceph_osdc_build_request(req, off, plen, ops, 482 ceph_osdc_build_request(req, off, *plen, num_op, ops,
500 snapc, 483 snapc, vino.snap, mtime);
501 mtime,
502 req->r_oid, req->r_oid_len);
503 484
504 return req; 485 return req;
505} 486}
@@ -623,8 +604,8 @@ static void osd_reset(struct ceph_connection *con)
623 down_read(&osdc->map_sem); 604 down_read(&osdc->map_sem);
624 mutex_lock(&osdc->request_mutex); 605 mutex_lock(&osdc->request_mutex);
625 __kick_osd_requests(osdc, osd); 606 __kick_osd_requests(osdc, osd);
607 __send_queued(osdc);
626 mutex_unlock(&osdc->request_mutex); 608 mutex_unlock(&osdc->request_mutex);
627 send_queued(osdc);
628 up_read(&osdc->map_sem); 609 up_read(&osdc->map_sem);
629} 610}
630 611
@@ -739,31 +720,35 @@ static void remove_old_osds(struct ceph_osd_client *osdc)
739 */ 720 */
740static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 721static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
741{ 722{
742 struct ceph_osd_request *req; 723 struct ceph_entity_addr *peer_addr;
743 int ret = 0;
744 724
745 dout("__reset_osd %p osd%d\n", osd, osd->o_osd); 725 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
746 if (list_empty(&osd->o_requests) && 726 if (list_empty(&osd->o_requests) &&
747 list_empty(&osd->o_linger_requests)) { 727 list_empty(&osd->o_linger_requests)) {
748 __remove_osd(osdc, osd); 728 __remove_osd(osdc, osd);
749 ret = -ENODEV; 729
750 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], 730 return -ENODEV;
751 &osd->o_con.peer_addr, 731 }
752 sizeof(osd->o_con.peer_addr)) == 0 && 732
753 !ceph_con_opened(&osd->o_con)) { 733 peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
734 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
735 !ceph_con_opened(&osd->o_con)) {
736 struct ceph_osd_request *req;
737
754 dout(" osd addr hasn't changed and connection never opened," 738 dout(" osd addr hasn't changed and connection never opened,"
755 " letting msgr retry"); 739 " letting msgr retry");
756 /* touch each r_stamp for handle_timeout()'s benfit */ 740 /* touch each r_stamp for handle_timeout()'s benfit */
757 list_for_each_entry(req, &osd->o_requests, r_osd_item) 741 list_for_each_entry(req, &osd->o_requests, r_osd_item)
758 req->r_stamp = jiffies; 742 req->r_stamp = jiffies;
759 ret = -EAGAIN; 743
760 } else { 744 return -EAGAIN;
761 ceph_con_close(&osd->o_con);
762 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
763 &osdc->osdmap->osd_addr[osd->o_osd]);
764 osd->o_incarnation++;
765 } 745 }
766 return ret; 746
747 ceph_con_close(&osd->o_con);
748 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
749 osd->o_incarnation++;
750
751 return 0;
767} 752}
768 753
769static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) 754static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
@@ -961,20 +946,18 @@ EXPORT_SYMBOL(ceph_osdc_set_request_linger);
961static int __map_request(struct ceph_osd_client *osdc, 946static int __map_request(struct ceph_osd_client *osdc,
962 struct ceph_osd_request *req, int force_resend) 947 struct ceph_osd_request *req, int force_resend)
963{ 948{
964 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
965 struct ceph_pg pgid; 949 struct ceph_pg pgid;
966 int acting[CEPH_PG_MAX_SIZE]; 950 int acting[CEPH_PG_MAX_SIZE];
967 int o = -1, num = 0; 951 int o = -1, num = 0;
968 int err; 952 int err;
969 953
970 dout("map_request %p tid %lld\n", req, req->r_tid); 954 dout("map_request %p tid %lld\n", req, req->r_tid);
971 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, 955 err = ceph_calc_object_layout(&pgid, req->r_oid,
972 &req->r_file_layout, osdc->osdmap); 956 &req->r_file_layout, osdc->osdmap);
973 if (err) { 957 if (err) {
974 list_move(&req->r_req_lru_item, &osdc->req_notarget); 958 list_move(&req->r_req_lru_item, &osdc->req_notarget);
975 return err; 959 return err;
976 } 960 }
977 pgid = reqhead->layout.ol_pgid;
978 req->r_pgid = pgid; 961 req->r_pgid = pgid;
979 962
980 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); 963 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
@@ -991,8 +974,8 @@ static int __map_request(struct ceph_osd_client *osdc,
991 (req->r_osd == NULL && o == -1)) 974 (req->r_osd == NULL && o == -1))
992 return 0; /* no change */ 975 return 0; /* no change */
993 976
994 dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n", 977 dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
995 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, 978 req->r_tid, pgid.pool, pgid.seed, o,
996 req->r_osd ? req->r_osd->o_osd : -1); 979 req->r_osd ? req->r_osd->o_osd : -1);
997 980
998 /* record full pg acting set */ 981 /* record full pg acting set */
@@ -1041,15 +1024,22 @@ out:
1041static void __send_request(struct ceph_osd_client *osdc, 1024static void __send_request(struct ceph_osd_client *osdc,
1042 struct ceph_osd_request *req) 1025 struct ceph_osd_request *req)
1043{ 1026{
1044 struct ceph_osd_request_head *reqhead; 1027 void *p;
1045
1046 dout("send_request %p tid %llu to osd%d flags %d\n",
1047 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
1048 1028
1049 reqhead = req->r_request->front.iov_base; 1029 dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
1050 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); 1030 req, req->r_tid, req->r_osd->o_osd, req->r_flags,
1051 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ 1031 (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
1052 reqhead->reassert_version = req->r_reassert_version; 1032
1033 /* fill in message content that changes each time we send it */
1034 put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
1035 put_unaligned_le32(req->r_flags, req->r_request_flags);
1036 put_unaligned_le64(req->r_pgid.pool, req->r_request_pool);
1037 p = req->r_request_pgid;
1038 ceph_encode_64(&p, req->r_pgid.pool);
1039 ceph_encode_32(&p, req->r_pgid.seed);
1040 put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
1041 memcpy(req->r_request_reassert_version, &req->r_reassert_version,
1042 sizeof(req->r_reassert_version));
1053 1043
1054 req->r_stamp = jiffies; 1044 req->r_stamp = jiffies;
1055 list_move_tail(&req->r_req_lru_item, &osdc->req_lru); 1045 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
@@ -1062,16 +1052,13 @@ static void __send_request(struct ceph_osd_client *osdc,
1062/* 1052/*
1063 * Send any requests in the queue (req_unsent). 1053 * Send any requests in the queue (req_unsent).
1064 */ 1054 */
1065static void send_queued(struct ceph_osd_client *osdc) 1055static void __send_queued(struct ceph_osd_client *osdc)
1066{ 1056{
1067 struct ceph_osd_request *req, *tmp; 1057 struct ceph_osd_request *req, *tmp;
1068 1058
1069 dout("send_queued\n"); 1059 dout("__send_queued\n");
1070 mutex_lock(&osdc->request_mutex); 1060 list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
1071 list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) {
1072 __send_request(osdc, req); 1061 __send_request(osdc, req);
1073 }
1074 mutex_unlock(&osdc->request_mutex);
1075} 1062}
1076 1063
1077/* 1064/*
@@ -1123,8 +1110,8 @@ static void handle_timeout(struct work_struct *work)
1123 } 1110 }
1124 1111
1125 __schedule_osd_timeout(osdc); 1112 __schedule_osd_timeout(osdc);
1113 __send_queued(osdc);
1126 mutex_unlock(&osdc->request_mutex); 1114 mutex_unlock(&osdc->request_mutex);
1127 send_queued(osdc);
1128 up_read(&osdc->map_sem); 1115 up_read(&osdc->map_sem);
1129} 1116}
1130 1117
@@ -1152,6 +1139,26 @@ static void complete_request(struct ceph_osd_request *req)
1152 complete_all(&req->r_safe_completion); /* fsync waiter */ 1139 complete_all(&req->r_safe_completion); /* fsync waiter */
1153} 1140}
1154 1141
1142static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid)
1143{
1144 __u8 v;
1145
1146 ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad);
1147 v = ceph_decode_8(p);
1148 if (v > 1) {
1149 pr_warning("do not understand pg encoding %d > 1", v);
1150 return -EINVAL;
1151 }
1152 pgid->pool = ceph_decode_64(p);
1153 pgid->seed = ceph_decode_32(p);
1154 *p += 4;
1155 return 0;
1156
1157bad:
1158 pr_warning("incomplete pg encoding");
1159 return -EINVAL;
1160}
1161
1155/* 1162/*
1156 * handle osd op reply. either call the callback if it is specified, 1163 * handle osd op reply. either call the callback if it is specified,
1157 * or do the completion to wake up the waiting thread. 1164 * or do the completion to wake up the waiting thread.
@@ -1159,22 +1166,42 @@ static void complete_request(struct ceph_osd_request *req)
1159static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, 1166static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1160 struct ceph_connection *con) 1167 struct ceph_connection *con)
1161{ 1168{
1162 struct ceph_osd_reply_head *rhead = msg->front.iov_base; 1169 void *p, *end;
1163 struct ceph_osd_request *req; 1170 struct ceph_osd_request *req;
1164 u64 tid; 1171 u64 tid;
1165 int numops, object_len, flags; 1172 int object_len;
1173 int numops, payload_len, flags;
1166 s32 result; 1174 s32 result;
1175 s32 retry_attempt;
1176 struct ceph_pg pg;
1177 int err;
1178 u32 reassert_epoch;
1179 u64 reassert_version;
1180 u32 osdmap_epoch;
1181 int i;
1167 1182
1168 tid = le64_to_cpu(msg->hdr.tid); 1183 tid = le64_to_cpu(msg->hdr.tid);
1169 if (msg->front.iov_len < sizeof(*rhead)) 1184 dout("handle_reply %p tid %llu\n", msg, tid);
1170 goto bad; 1185
1171 numops = le32_to_cpu(rhead->num_ops); 1186 p = msg->front.iov_base;
1172 object_len = le32_to_cpu(rhead->object_len); 1187 end = p + msg->front.iov_len;
1173 result = le32_to_cpu(rhead->result); 1188
1174 if (msg->front.iov_len != sizeof(*rhead) + object_len + 1189 ceph_decode_need(&p, end, 4, bad);
1175 numops * sizeof(struct ceph_osd_op)) 1190 object_len = ceph_decode_32(&p);
1191 ceph_decode_need(&p, end, object_len, bad);
1192 p += object_len;
1193
1194 err = __decode_pgid(&p, end, &pg);
1195 if (err)
1176 goto bad; 1196 goto bad;
1177 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); 1197
1198 ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
1199 flags = ceph_decode_64(&p);
1200 result = ceph_decode_32(&p);
1201 reassert_epoch = ceph_decode_32(&p);
1202 reassert_version = ceph_decode_64(&p);
1203 osdmap_epoch = ceph_decode_32(&p);
1204
1178 /* lookup */ 1205 /* lookup */
1179 mutex_lock(&osdc->request_mutex); 1206 mutex_lock(&osdc->request_mutex);
1180 req = __lookup_request(osdc, tid); 1207 req = __lookup_request(osdc, tid);
@@ -1184,7 +1211,38 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1184 return; 1211 return;
1185 } 1212 }
1186 ceph_osdc_get_request(req); 1213 ceph_osdc_get_request(req);
1187 flags = le32_to_cpu(rhead->flags); 1214
1215 dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
1216 req, result);
1217
1218 ceph_decode_need(&p, end, 4, bad);
1219 numops = ceph_decode_32(&p);
1220 if (numops > CEPH_OSD_MAX_OP)
1221 goto bad_put;
1222 if (numops != req->r_num_ops)
1223 goto bad_put;
1224 payload_len = 0;
1225 ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad);
1226 for (i = 0; i < numops; i++) {
1227 struct ceph_osd_op *op = p;
1228 int len;
1229
1230 len = le32_to_cpu(op->payload_len);
1231 req->r_reply_op_len[i] = len;
1232 dout(" op %d has %d bytes\n", i, len);
1233 payload_len += len;
1234 p += sizeof(*op);
1235 }
1236 if (payload_len != le32_to_cpu(msg->hdr.data_len)) {
1237 pr_warning("sum of op payload lens %d != data_len %d",
1238 payload_len, le32_to_cpu(msg->hdr.data_len));
1239 goto bad_put;
1240 }
1241
1242 ceph_decode_need(&p, end, 4 + numops * 4, bad);
1243 retry_attempt = ceph_decode_32(&p);
1244 for (i = 0; i < numops; i++)
1245 req->r_reply_op_result[i] = ceph_decode_32(&p);
1188 1246
1189 /* 1247 /*
1190 * if this connection filled our message, drop our reference now, to 1248 * if this connection filled our message, drop our reference now, to
@@ -1199,7 +1257,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1199 if (!req->r_got_reply) { 1257 if (!req->r_got_reply) {
1200 unsigned int bytes; 1258 unsigned int bytes;
1201 1259
1202 req->r_result = le32_to_cpu(rhead->result); 1260 req->r_result = result;
1203 bytes = le32_to_cpu(msg->hdr.data_len); 1261 bytes = le32_to_cpu(msg->hdr.data_len);
1204 dout("handle_reply result %d bytes %d\n", req->r_result, 1262 dout("handle_reply result %d bytes %d\n", req->r_result,
1205 bytes); 1263 bytes);
@@ -1207,7 +1265,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1207 req->r_result = bytes; 1265 req->r_result = bytes;
1208 1266
1209 /* in case this is a write and we need to replay, */ 1267 /* in case this is a write and we need to replay, */
1210 req->r_reassert_version = rhead->reassert_version; 1268 req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
1269 req->r_reassert_version.version = cpu_to_le64(reassert_version);
1211 1270
1212 req->r_got_reply = 1; 1271 req->r_got_reply = 1;
1213 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { 1272 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
@@ -1242,10 +1301,11 @@ done:
1242 ceph_osdc_put_request(req); 1301 ceph_osdc_put_request(req);
1243 return; 1302 return;
1244 1303
1304bad_put:
1305 ceph_osdc_put_request(req);
1245bad: 1306bad:
1246 pr_err("corrupt osd_op_reply got %d %d expected %d\n", 1307 pr_err("corrupt osd_op_reply got %d %d\n",
1247 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), 1308 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
1248 (int)sizeof(*rhead));
1249 ceph_msg_dump(msg); 1309 ceph_msg_dump(msg);
1250} 1310}
1251 1311
@@ -1462,7 +1522,9 @@ done:
1462 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) 1522 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
1463 ceph_monc_request_next_osdmap(&osdc->client->monc); 1523 ceph_monc_request_next_osdmap(&osdc->client->monc);
1464 1524
1465 send_queued(osdc); 1525 mutex_lock(&osdc->request_mutex);
1526 __send_queued(osdc);
1527 mutex_unlock(&osdc->request_mutex);
1466 up_read(&osdc->map_sem); 1528 up_read(&osdc->map_sem);
1467 wake_up_all(&osdc->client->auth_wq); 1529 wake_up_all(&osdc->client->auth_wq);
1468 return; 1530 return;
@@ -1556,8 +1618,7 @@ static void __remove_event(struct ceph_osd_event *event)
1556 1618
1557int ceph_osdc_create_event(struct ceph_osd_client *osdc, 1619int ceph_osdc_create_event(struct ceph_osd_client *osdc,
1558 void (*event_cb)(u64, u64, u8, void *), 1620 void (*event_cb)(u64, u64, u8, void *),
1559 int one_shot, void *data, 1621 void *data, struct ceph_osd_event **pevent)
1560 struct ceph_osd_event **pevent)
1561{ 1622{
1562 struct ceph_osd_event *event; 1623 struct ceph_osd_event *event;
1563 1624
@@ -1567,14 +1628,13 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc,
1567 1628
1568 dout("create_event %p\n", event); 1629 dout("create_event %p\n", event);
1569 event->cb = event_cb; 1630 event->cb = event_cb;
1570 event->one_shot = one_shot; 1631 event->one_shot = 0;
1571 event->data = data; 1632 event->data = data;
1572 event->osdc = osdc; 1633 event->osdc = osdc;
1573 INIT_LIST_HEAD(&event->osd_node); 1634 INIT_LIST_HEAD(&event->osd_node);
1574 RB_CLEAR_NODE(&event->node); 1635 RB_CLEAR_NODE(&event->node);
1575 kref_init(&event->kref); /* one ref for us */ 1636 kref_init(&event->kref); /* one ref for us */
1576 kref_get(&event->kref); /* one ref for the caller */ 1637 kref_get(&event->kref); /* one ref for the caller */
1577 init_completion(&event->completion);
1578 1638
1579 spin_lock(&osdc->event_lock); 1639 spin_lock(&osdc->event_lock);
1580 event->cookie = ++osdc->event_count; 1640 event->cookie = ++osdc->event_count;
@@ -1610,7 +1670,6 @@ static void do_event_work(struct work_struct *work)
1610 1670
1611 dout("do_event_work completing %p\n", event); 1671 dout("do_event_work completing %p\n", event);
1612 event->cb(ver, notify_id, opcode, event->data); 1672 event->cb(ver, notify_id, opcode, event->data);
1613 complete(&event->completion);
1614 dout("do_event_work completed %p\n", event); 1673 dout("do_event_work completed %p\n", event);
1615 ceph_osdc_put_event(event); 1674 ceph_osdc_put_event(event);
1616 kfree(event_work); 1675 kfree(event_work);
@@ -1620,7 +1679,8 @@ static void do_event_work(struct work_struct *work)
1620/* 1679/*
1621 * Process osd watch notifications 1680 * Process osd watch notifications
1622 */ 1681 */
1623void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) 1682static void handle_watch_notify(struct ceph_osd_client *osdc,
1683 struct ceph_msg *msg)
1624{ 1684{
1625 void *p, *end; 1685 void *p, *end;
1626 u8 proto_ver; 1686 u8 proto_ver;
@@ -1641,9 +1701,8 @@ void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1641 spin_lock(&osdc->event_lock); 1701 spin_lock(&osdc->event_lock);
1642 event = __find_event(osdc, cookie); 1702 event = __find_event(osdc, cookie);
1643 if (event) { 1703 if (event) {
1704 BUG_ON(event->one_shot);
1644 get_event(event); 1705 get_event(event);
1645 if (event->one_shot)
1646 __remove_event(event);
1647 } 1706 }
1648 spin_unlock(&osdc->event_lock); 1707 spin_unlock(&osdc->event_lock);
1649 dout("handle_watch_notify cookie %lld ver %lld event %p\n", 1708 dout("handle_watch_notify cookie %lld ver %lld event %p\n",
@@ -1668,7 +1727,6 @@ void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1668 return; 1727 return;
1669 1728
1670done_err: 1729done_err:
1671 complete(&event->completion);
1672 ceph_osdc_put_event(event); 1730 ceph_osdc_put_event(event);
1673 return; 1731 return;
1674 1732
@@ -1677,21 +1735,6 @@ bad:
1677 return; 1735 return;
1678} 1736}
1679 1737
1680int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout)
1681{
1682 int err;
1683
1684 dout("wait_event %p\n", event);
1685 err = wait_for_completion_interruptible_timeout(&event->completion,
1686 timeout * HZ);
1687 ceph_osdc_put_event(event);
1688 if (err > 0)
1689 err = 0;
1690 dout("wait_event %p returns %d\n", event, err);
1691 return err;
1692}
1693EXPORT_SYMBOL(ceph_osdc_wait_event);
1694
1695/* 1738/*
1696 * Register request, send initial attempt. 1739 * Register request, send initial attempt.
1697 */ 1740 */
@@ -1706,7 +1749,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1706#ifdef CONFIG_BLOCK 1749#ifdef CONFIG_BLOCK
1707 req->r_request->bio = req->r_bio; 1750 req->r_request->bio = req->r_bio;
1708#endif 1751#endif
1709 req->r_request->trail = req->r_trail; 1752 req->r_request->trail = &req->r_trail;
1710 1753
1711 register_request(osdc, req); 1754 register_request(osdc, req);
1712 1755
@@ -1865,7 +1908,6 @@ out_mempool:
1865out: 1908out:
1866 return err; 1909 return err;
1867} 1910}
1868EXPORT_SYMBOL(ceph_osdc_init);
1869 1911
1870void ceph_osdc_stop(struct ceph_osd_client *osdc) 1912void ceph_osdc_stop(struct ceph_osd_client *osdc)
1871{ 1913{
@@ -1882,7 +1924,6 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
1882 ceph_msgpool_destroy(&osdc->msgpool_op); 1924 ceph_msgpool_destroy(&osdc->msgpool_op);
1883 ceph_msgpool_destroy(&osdc->msgpool_op_reply); 1925 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1884} 1926}
1885EXPORT_SYMBOL(ceph_osdc_stop);
1886 1927
1887/* 1928/*
1888 * Read some contiguous pages. If we cross a stripe boundary, shorten 1929 * Read some contiguous pages. If we cross a stripe boundary, shorten
@@ -1902,7 +1943,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1902 req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1943 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1903 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1944 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1904 NULL, 0, truncate_seq, truncate_size, NULL, 1945 NULL, 0, truncate_seq, truncate_size, NULL,
1905 false, 1, page_align); 1946 false, page_align);
1906 if (IS_ERR(req)) 1947 if (IS_ERR(req))
1907 return PTR_ERR(req); 1948 return PTR_ERR(req);
1908 1949
@@ -1931,8 +1972,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1931 u64 off, u64 len, 1972 u64 off, u64 len,
1932 u32 truncate_seq, u64 truncate_size, 1973 u32 truncate_seq, u64 truncate_size,
1933 struct timespec *mtime, 1974 struct timespec *mtime,
1934 struct page **pages, int num_pages, 1975 struct page **pages, int num_pages)
1935 int flags, int do_sync, bool nofail)
1936{ 1976{
1937 struct ceph_osd_request *req; 1977 struct ceph_osd_request *req;
1938 int rc = 0; 1978 int rc = 0;
@@ -1941,11 +1981,10 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1941 BUG_ON(vino.snap != CEPH_NOSNAP); 1981 BUG_ON(vino.snap != CEPH_NOSNAP);
1942 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1982 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1943 CEPH_OSD_OP_WRITE, 1983 CEPH_OSD_OP_WRITE,
1944 flags | CEPH_OSD_FLAG_ONDISK | 1984 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
1945 CEPH_OSD_FLAG_WRITE, 1985 snapc, 0,
1946 snapc, do_sync,
1947 truncate_seq, truncate_size, mtime, 1986 truncate_seq, truncate_size, mtime,
1948 nofail, 1, page_align); 1987 true, page_align);
1949 if (IS_ERR(req)) 1988 if (IS_ERR(req))
1950 return PTR_ERR(req); 1989 return PTR_ERR(req);
1951 1990
@@ -1954,7 +1993,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1954 dout("writepages %llu~%llu (%d pages)\n", off, len, 1993 dout("writepages %llu~%llu (%d pages)\n", off, len,
1955 req->r_num_pages); 1994 req->r_num_pages);
1956 1995
1957 rc = ceph_osdc_start_request(osdc, req, nofail); 1996 rc = ceph_osdc_start_request(osdc, req, true);
1958 if (!rc) 1997 if (!rc)
1959 rc = ceph_osdc_wait_request(osdc, req); 1998 rc = ceph_osdc_wait_request(osdc, req);
1960 1999
@@ -2047,7 +2086,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2047 if (data_len > 0) { 2086 if (data_len > 0) {
2048 int want = calc_pages_for(req->r_page_alignment, data_len); 2087 int want = calc_pages_for(req->r_page_alignment, data_len);
2049 2088
2050 if (unlikely(req->r_num_pages < want)) { 2089 if (req->r_pages && unlikely(req->r_num_pages < want)) {
2051 pr_warning("tid %lld reply has %d bytes %d pages, we" 2090 pr_warning("tid %lld reply has %d bytes %d pages, we"
2052 " had only %d pages ready\n", tid, data_len, 2091 " had only %d pages ready\n", tid, data_len,
2053 want, req->r_num_pages); 2092 want, req->r_num_pages);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index de73214b5d26..69bc4bf89e3e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -13,26 +13,18 @@
13 13
14char *ceph_osdmap_state_str(char *str, int len, int state) 14char *ceph_osdmap_state_str(char *str, int len, int state)
15{ 15{
16 int flag = 0;
17
18 if (!len) 16 if (!len)
19 goto done; 17 return str;
20 18
21 *str = '\0'; 19 if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
22 if (state) { 20 snprintf(str, len, "exists, up");
23 if (state & CEPH_OSD_EXISTS) { 21 else if (state & CEPH_OSD_EXISTS)
24 snprintf(str, len, "exists"); 22 snprintf(str, len, "exists");
25 flag = 1; 23 else if (state & CEPH_OSD_UP)
26 } 24 snprintf(str, len, "up");
27 if (state & CEPH_OSD_UP) { 25 else
28 snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
29 "up");
30 flag = 1;
31 }
32 } else {
33 snprintf(str, len, "doesn't exist"); 26 snprintf(str, len, "doesn't exist");
34 } 27
35done:
36 return str; 28 return str;
37} 29}
38 30
@@ -53,13 +45,8 @@ static int calc_bits_of(unsigned int t)
53 */ 45 */
54static void calc_pg_masks(struct ceph_pg_pool_info *pi) 46static void calc_pg_masks(struct ceph_pg_pool_info *pi)
55{ 47{
56 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; 48 pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
57 pi->pgp_num_mask = 49 pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
58 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
59 pi->lpg_num_mask =
60 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
61 pi->lpgp_num_mask =
62 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
63} 50}
64 51
65/* 52/*
@@ -170,6 +157,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
170 c->choose_local_tries = 2; 157 c->choose_local_tries = 2;
171 c->choose_local_fallback_tries = 5; 158 c->choose_local_fallback_tries = 5;
172 c->choose_total_tries = 19; 159 c->choose_total_tries = 19;
160 c->chooseleaf_descend_once = 0;
173 161
174 ceph_decode_need(p, end, 4*sizeof(u32), bad); 162 ceph_decode_need(p, end, 4*sizeof(u32), bad);
175 magic = ceph_decode_32(p); 163 magic = ceph_decode_32(p);
@@ -336,6 +324,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
336 dout("crush decode tunable choose_total_tries = %d", 324 dout("crush decode tunable choose_total_tries = %d",
337 c->choose_total_tries); 325 c->choose_total_tries);
338 326
327 ceph_decode_need(p, end, sizeof(u32), done);
328 c->chooseleaf_descend_once = ceph_decode_32(p);
329 dout("crush decode tunable chooseleaf_descend_once = %d",
330 c->chooseleaf_descend_once);
331
339done: 332done:
340 dout("crush_decode success\n"); 333 dout("crush_decode success\n");
341 return c; 334 return c;
@@ -354,12 +347,13 @@ bad:
354 */ 347 */
355static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) 348static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
356{ 349{
357 u64 a = *(u64 *)&l; 350 if (l.pool < r.pool)
358 u64 b = *(u64 *)&r; 351 return -1;
359 352 if (l.pool > r.pool)
360 if (a < b) 353 return 1;
354 if (l.seed < r.seed)
361 return -1; 355 return -1;
362 if (a > b) 356 if (l.seed > r.seed)
363 return 1; 357 return 1;
364 return 0; 358 return 0;
365} 359}
@@ -405,8 +399,8 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
405 } else if (c > 0) { 399 } else if (c > 0) {
406 n = n->rb_right; 400 n = n->rb_right;
407 } else { 401 } else {
408 dout("__lookup_pg_mapping %llx got %p\n", 402 dout("__lookup_pg_mapping %lld.%x got %p\n",
409 *(u64 *)&pgid, pg); 403 pgid.pool, pgid.seed, pg);
410 return pg; 404 return pg;
411 } 405 }
412 } 406 }
@@ -418,12 +412,13 @@ static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
418 struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); 412 struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
419 413
420 if (pg) { 414 if (pg) {
421 dout("__remove_pg_mapping %llx %p\n", *(u64 *)&pgid, pg); 415 dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
416 pg);
422 rb_erase(&pg->node, root); 417 rb_erase(&pg->node, root);
423 kfree(pg); 418 kfree(pg);
424 return 0; 419 return 0;
425 } 420 }
426 dout("__remove_pg_mapping %llx dne\n", *(u64 *)&pgid); 421 dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
427 return -ENOENT; 422 return -ENOENT;
428} 423}
429 424
@@ -452,7 +447,7 @@ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
452 return 0; 447 return 0;
453} 448}
454 449
455static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) 450static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
456{ 451{
457 struct ceph_pg_pool_info *pi; 452 struct ceph_pg_pool_info *pi;
458 struct rb_node *n = root->rb_node; 453 struct rb_node *n = root->rb_node;
@@ -508,24 +503,57 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
508 503
509static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) 504static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
510{ 505{
511 unsigned int n, m; 506 u8 ev, cv;
507 unsigned len, num;
508 void *pool_end;
509
510 ceph_decode_need(p, end, 2 + 4, bad);
511 ev = ceph_decode_8(p); /* encoding version */
512 cv = ceph_decode_8(p); /* compat version */
513 if (ev < 5) {
514 pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
515 return -EINVAL;
516 }
517 if (cv > 7) {
518 pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv);
519 return -EINVAL;
520 }
521 len = ceph_decode_32(p);
522 ceph_decode_need(p, end, len, bad);
523 pool_end = *p + len;
512 524
513 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 525 pi->type = ceph_decode_8(p);
514 calc_pg_masks(pi); 526 pi->size = ceph_decode_8(p);
527 pi->crush_ruleset = ceph_decode_8(p);
528 pi->object_hash = ceph_decode_8(p);
529
530 pi->pg_num = ceph_decode_32(p);
531 pi->pgp_num = ceph_decode_32(p);
532
533 *p += 4 + 4; /* skip lpg* */
534 *p += 4; /* skip last_change */
535 *p += 8 + 4; /* skip snap_seq, snap_epoch */
515 536
516 /* num_snaps * snap_info_t */ 537 /* skip snaps */
517 n = le32_to_cpu(pi->v.num_snaps); 538 num = ceph_decode_32(p);
518 while (n--) { 539 while (num--) {
519 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + 540 *p += 8; /* snapid key */
520 sizeof(struct ceph_timespec), bad); 541 *p += 1 + 1; /* versions */
521 *p += sizeof(u64) + /* key */ 542 len = ceph_decode_32(p);
522 1 + sizeof(u64) + /* u8, snapid */ 543 *p += len;
523 sizeof(struct ceph_timespec);
524 m = ceph_decode_32(p); /* snap name */
525 *p += m;
526 } 544 }
527 545
528 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; 546 /* skip removed snaps */
547 num = ceph_decode_32(p);
548 *p += num * (8 + 8);
549
550 *p += 8; /* skip auid */
551 pi->flags = ceph_decode_64(p);
552
553 /* ignore the rest */
554
555 *p = pool_end;
556 calc_pg_masks(pi);
529 return 0; 557 return 0;
530 558
531bad: 559bad:
@@ -535,14 +563,15 @@ bad:
535static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 563static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
536{ 564{
537 struct ceph_pg_pool_info *pi; 565 struct ceph_pg_pool_info *pi;
538 u32 num, len, pool; 566 u32 num, len;
567 u64 pool;
539 568
540 ceph_decode_32_safe(p, end, num, bad); 569 ceph_decode_32_safe(p, end, num, bad);
541 dout(" %d pool names\n", num); 570 dout(" %d pool names\n", num);
542 while (num--) { 571 while (num--) {
543 ceph_decode_32_safe(p, end, pool, bad); 572 ceph_decode_64_safe(p, end, pool, bad);
544 ceph_decode_32_safe(p, end, len, bad); 573 ceph_decode_32_safe(p, end, len, bad);
545 dout(" pool %d len %d\n", pool, len); 574 dout(" pool %llu len %d\n", pool, len);
546 ceph_decode_need(p, end, len, bad); 575 ceph_decode_need(p, end, len, bad);
547 pi = __lookup_pg_pool(&map->pg_pools, pool); 576 pi = __lookup_pg_pool(&map->pg_pools, pool);
548 if (pi) { 577 if (pi) {
@@ -633,7 +662,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
633 struct ceph_osdmap *map; 662 struct ceph_osdmap *map;
634 u16 version; 663 u16 version;
635 u32 len, max, i; 664 u32 len, max, i;
636 u8 ev;
637 int err = -EINVAL; 665 int err = -EINVAL;
638 void *start = *p; 666 void *start = *p;
639 struct ceph_pg_pool_info *pi; 667 struct ceph_pg_pool_info *pi;
@@ -646,9 +674,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
646 map->pg_temp = RB_ROOT; 674 map->pg_temp = RB_ROOT;
647 675
648 ceph_decode_16_safe(p, end, version, bad); 676 ceph_decode_16_safe(p, end, version, bad);
649 if (version > CEPH_OSDMAP_VERSION) { 677 if (version > 6) {
650 pr_warning("got unknown v %d > %d of osdmap\n", version, 678 pr_warning("got unknown v %d > 6 of osdmap\n", version);
651 CEPH_OSDMAP_VERSION); 679 goto bad;
680 }
681 if (version < 6) {
682 pr_warning("got old v %d < 6 of osdmap\n", version);
652 goto bad; 683 goto bad;
653 } 684 }
654 685
@@ -660,20 +691,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
660 691
661 ceph_decode_32_safe(p, end, max, bad); 692 ceph_decode_32_safe(p, end, max, bad);
662 while (max--) { 693 while (max--) {
663 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); 694 ceph_decode_need(p, end, 8 + 2, bad);
664 err = -ENOMEM; 695 err = -ENOMEM;
665 pi = kzalloc(sizeof(*pi), GFP_NOFS); 696 pi = kzalloc(sizeof(*pi), GFP_NOFS);
666 if (!pi) 697 if (!pi)
667 goto bad; 698 goto bad;
668 pi->id = ceph_decode_32(p); 699 pi->id = ceph_decode_64(p);
669 err = -EINVAL;
670 ev = ceph_decode_8(p); /* encoding version */
671 if (ev > CEPH_PG_POOL_VERSION) {
672 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
673 ev, CEPH_PG_POOL_VERSION);
674 kfree(pi);
675 goto bad;
676 }
677 err = __decode_pool(p, end, pi); 700 err = __decode_pool(p, end, pi);
678 if (err < 0) { 701 if (err < 0) {
679 kfree(pi); 702 kfree(pi);
@@ -682,12 +705,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
682 __insert_pg_pool(&map->pg_pools, pi); 705 __insert_pg_pool(&map->pg_pools, pi);
683 } 706 }
684 707
685 if (version >= 5) { 708 err = __decode_pool_names(p, end, map);
686 err = __decode_pool_names(p, end, map); 709 if (err < 0) {
687 if (err < 0) { 710 dout("fail to decode pool names");
688 dout("fail to decode pool names"); 711 goto bad;
689 goto bad;
690 }
691 } 712 }
692 713
693 ceph_decode_32_safe(p, end, map->pool_max, bad); 714 ceph_decode_32_safe(p, end, map->pool_max, bad);
@@ -724,10 +745,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
724 for (i = 0; i < len; i++) { 745 for (i = 0; i < len; i++) {
725 int n, j; 746 int n, j;
726 struct ceph_pg pgid; 747 struct ceph_pg pgid;
748 struct ceph_pg_v1 pgid_v1;
727 struct ceph_pg_mapping *pg; 749 struct ceph_pg_mapping *pg;
728 750
729 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); 751 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
730 ceph_decode_copy(p, &pgid, sizeof(pgid)); 752 ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1));
753 pgid.pool = le32_to_cpu(pgid_v1.pool);
754 pgid.seed = le16_to_cpu(pgid_v1.ps);
731 n = ceph_decode_32(p); 755 n = ceph_decode_32(p);
732 err = -EINVAL; 756 err = -EINVAL;
733 if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) 757 if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
@@ -745,7 +769,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
745 err = __insert_pg_mapping(pg, &map->pg_temp); 769 err = __insert_pg_mapping(pg, &map->pg_temp);
746 if (err) 770 if (err)
747 goto bad; 771 goto bad;
748 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); 772 dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed,
773 len);
749 } 774 }
750 775
751 /* crush */ 776 /* crush */
@@ -784,16 +809,17 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
784 struct ceph_fsid fsid; 809 struct ceph_fsid fsid;
785 u32 epoch = 0; 810 u32 epoch = 0;
786 struct ceph_timespec modified; 811 struct ceph_timespec modified;
787 u32 len, pool; 812 s32 len;
788 __s32 new_pool_max, new_flags, max; 813 u64 pool;
814 __s64 new_pool_max;
815 __s32 new_flags, max;
789 void *start = *p; 816 void *start = *p;
790 int err = -EINVAL; 817 int err = -EINVAL;
791 u16 version; 818 u16 version;
792 819
793 ceph_decode_16_safe(p, end, version, bad); 820 ceph_decode_16_safe(p, end, version, bad);
794 if (version > CEPH_OSDMAP_INC_VERSION) { 821 if (version > 6) {
795 pr_warning("got unknown v %d > %d of inc osdmap\n", version, 822 pr_warning("got unknown v %d > %d of inc osdmap\n", version, 6);
796 CEPH_OSDMAP_INC_VERSION);
797 goto bad; 823 goto bad;
798 } 824 }
799 825
@@ -803,7 +829,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
803 epoch = ceph_decode_32(p); 829 epoch = ceph_decode_32(p);
804 BUG_ON(epoch != map->epoch+1); 830 BUG_ON(epoch != map->epoch+1);
805 ceph_decode_copy(p, &modified, sizeof(modified)); 831 ceph_decode_copy(p, &modified, sizeof(modified));
806 new_pool_max = ceph_decode_32(p); 832 new_pool_max = ceph_decode_64(p);
807 new_flags = ceph_decode_32(p); 833 new_flags = ceph_decode_32(p);
808 834
809 /* full map? */ 835 /* full map? */
@@ -853,18 +879,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
853 /* new_pool */ 879 /* new_pool */
854 ceph_decode_32_safe(p, end, len, bad); 880 ceph_decode_32_safe(p, end, len, bad);
855 while (len--) { 881 while (len--) {
856 __u8 ev;
857 struct ceph_pg_pool_info *pi; 882 struct ceph_pg_pool_info *pi;
858 883
859 ceph_decode_32_safe(p, end, pool, bad); 884 ceph_decode_64_safe(p, end, pool, bad);
860 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
861 ev = ceph_decode_8(p); /* encoding version */
862 if (ev > CEPH_PG_POOL_VERSION) {
863 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
864 ev, CEPH_PG_POOL_VERSION);
865 err = -EINVAL;
866 goto bad;
867 }
868 pi = __lookup_pg_pool(&map->pg_pools, pool); 885 pi = __lookup_pg_pool(&map->pg_pools, pool);
869 if (!pi) { 886 if (!pi) {
870 pi = kzalloc(sizeof(*pi), GFP_NOFS); 887 pi = kzalloc(sizeof(*pi), GFP_NOFS);
@@ -890,7 +907,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
890 while (len--) { 907 while (len--) {
891 struct ceph_pg_pool_info *pi; 908 struct ceph_pg_pool_info *pi;
892 909
893 ceph_decode_32_safe(p, end, pool, bad); 910 ceph_decode_64_safe(p, end, pool, bad);
894 pi = __lookup_pg_pool(&map->pg_pools, pool); 911 pi = __lookup_pg_pool(&map->pg_pools, pool);
895 if (pi) 912 if (pi)
896 __remove_pg_pool(&map->pg_pools, pi); 913 __remove_pg_pool(&map->pg_pools, pi);
@@ -946,10 +963,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
946 while (len--) { 963 while (len--) {
947 struct ceph_pg_mapping *pg; 964 struct ceph_pg_mapping *pg;
948 int j; 965 int j;
966 struct ceph_pg_v1 pgid_v1;
949 struct ceph_pg pgid; 967 struct ceph_pg pgid;
950 u32 pglen; 968 u32 pglen;
951 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); 969 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
952 ceph_decode_copy(p, &pgid, sizeof(pgid)); 970 ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1));
971 pgid.pool = le32_to_cpu(pgid_v1.pool);
972 pgid.seed = le16_to_cpu(pgid_v1.ps);
953 pglen = ceph_decode_32(p); 973 pglen = ceph_decode_32(p);
954 974
955 if (pglen) { 975 if (pglen) {
@@ -975,8 +995,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
975 kfree(pg); 995 kfree(pg);
976 goto bad; 996 goto bad;
977 } 997 }
978 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, 998 dout(" added pg_temp %lld.%x len %d\n", pgid.pool,
979 pglen); 999 pgid.seed, pglen);
980 } else { 1000 } else {
981 /* remove */ 1001 /* remove */
982 __remove_pg_mapping(&map->pg_temp, pgid); 1002 __remove_pg_mapping(&map->pg_temp, pgid);
@@ -1010,7 +1030,7 @@ bad:
1010 * pass a stride back to the caller. 1030 * pass a stride back to the caller.
1011 */ 1031 */
1012int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 1032int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
1013 u64 off, u64 *plen, 1033 u64 off, u64 len,
1014 u64 *ono, 1034 u64 *ono,
1015 u64 *oxoff, u64 *oxlen) 1035 u64 *oxoff, u64 *oxlen)
1016{ 1036{
@@ -1021,7 +1041,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
1021 u32 su_per_object; 1041 u32 su_per_object;
1022 u64 t, su_offset; 1042 u64 t, su_offset;
1023 1043
1024 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, 1044 dout("mapping %llu~%llu osize %u fl_su %u\n", off, len,
1025 osize, su); 1045 osize, su);
1026 if (su == 0 || sc == 0) 1046 if (su == 0 || sc == 0)
1027 goto invalid; 1047 goto invalid;
@@ -1054,11 +1074,10 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
1054 1074
1055 /* 1075 /*
1056 * Calculate the length of the extent being written to the selected 1076 * Calculate the length of the extent being written to the selected
1057 * object. This is the minimum of the full length requested (plen) or 1077 * object. This is the minimum of the full length requested (len) or
1058 * the remainder of the current stripe being written to. 1078 * the remainder of the current stripe being written to.
1059 */ 1079 */
1060 *oxlen = min_t(u64, *plen, su - su_offset); 1080 *oxlen = min_t(u64, len, su - su_offset);
1061 *plen = *oxlen;
1062 1081
1063 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); 1082 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
1064 return 0; 1083 return 0;
@@ -1076,33 +1095,24 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping);
1076 * calculate an object layout (i.e. pgid) from an oid, 1095 * calculate an object layout (i.e. pgid) from an oid,
1077 * file_layout, and osdmap 1096 * file_layout, and osdmap
1078 */ 1097 */
1079int ceph_calc_object_layout(struct ceph_object_layout *ol, 1098int ceph_calc_object_layout(struct ceph_pg *pg,
1080 const char *oid, 1099 const char *oid,
1081 struct ceph_file_layout *fl, 1100 struct ceph_file_layout *fl,
1082 struct ceph_osdmap *osdmap) 1101 struct ceph_osdmap *osdmap)
1083{ 1102{
1084 unsigned int num, num_mask; 1103 unsigned int num, num_mask;
1085 struct ceph_pg pgid;
1086 int poolid = le32_to_cpu(fl->fl_pg_pool);
1087 struct ceph_pg_pool_info *pool; 1104 struct ceph_pg_pool_info *pool;
1088 unsigned int ps;
1089 1105
1090 BUG_ON(!osdmap); 1106 BUG_ON(!osdmap);
1091 1107 pg->pool = le32_to_cpu(fl->fl_pg_pool);
1092 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); 1108 pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool);
1093 if (!pool) 1109 if (!pool)
1094 return -EIO; 1110 return -EIO;
1095 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); 1111 pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid));
1096 num = le32_to_cpu(pool->v.pg_num); 1112 num = pool->pg_num;
1097 num_mask = pool->pg_num_mask; 1113 num_mask = pool->pg_num_mask;
1098 1114
1099 pgid.ps = cpu_to_le16(ps); 1115 dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed);
1100 pgid.preferred = cpu_to_le16(-1);
1101 pgid.pool = fl->fl_pg_pool;
1102 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
1103
1104 ol->ol_pgid = pgid;
1105 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
1106 return 0; 1116 return 0;
1107} 1117}
1108EXPORT_SYMBOL(ceph_calc_object_layout); 1118EXPORT_SYMBOL(ceph_calc_object_layout);
@@ -1117,19 +1127,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1117 struct ceph_pg_mapping *pg; 1127 struct ceph_pg_mapping *pg;
1118 struct ceph_pg_pool_info *pool; 1128 struct ceph_pg_pool_info *pool;
1119 int ruleno; 1129 int ruleno;
1120 unsigned int poolid, ps, pps, t, r; 1130 int r;
1121 1131 u32 pps;
1122 poolid = le32_to_cpu(pgid.pool);
1123 ps = le16_to_cpu(pgid.ps);
1124 1132
1125 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); 1133 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
1126 if (!pool) 1134 if (!pool)
1127 return NULL; 1135 return NULL;
1128 1136
1129 /* pg_temp? */ 1137 /* pg_temp? */
1130 t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), 1138 pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
1131 pool->pgp_num_mask); 1139 pool->pgp_num_mask);
1132 pgid.ps = cpu_to_le16(t);
1133 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1140 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1134 if (pg) { 1141 if (pg) {
1135 *num = pg->len; 1142 *num = pg->len;
@@ -1137,26 +1144,39 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1137 } 1144 }
1138 1145
1139 /* crush */ 1146 /* crush */
1140 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, 1147 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
1141 pool->v.type, pool->v.size); 1148 pool->type, pool->size);
1142 if (ruleno < 0) { 1149 if (ruleno < 0) {
1143 pr_err("no crush rule pool %d ruleset %d type %d size %d\n", 1150 pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
1144 poolid, pool->v.crush_ruleset, pool->v.type, 1151 pgid.pool, pool->crush_ruleset, pool->type,
1145 pool->v.size); 1152 pool->size);
1146 return NULL; 1153 return NULL;
1147 } 1154 }
1148 1155
1149 pps = ceph_stable_mod(ps, 1156 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
1150 le32_to_cpu(pool->v.pgp_num), 1157 /* hash pool id and seed sothat pool PGs do not overlap */
1151 pool->pgp_num_mask); 1158 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
1152 pps += poolid; 1159 ceph_stable_mod(pgid.seed, pool->pgp_num,
1160 pool->pgp_num_mask),
1161 pgid.pool);
1162 } else {
1163 /*
1164 * legacy behavior: add ps and pool together. this is
1165 * not a great approach because the PGs from each pool
1166 * will overlap on top of each other: 0.5 == 1.4 ==
1167 * 2.3 == ...
1168 */
1169 pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
1170 pool->pgp_num_mask) +
1171 (unsigned)pgid.pool;
1172 }
1153 r = crush_do_rule(osdmap->crush, ruleno, pps, osds, 1173 r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1154 min_t(int, pool->v.size, *num), 1174 min_t(int, pool->size, *num),
1155 osdmap->osd_weight); 1175 osdmap->osd_weight);
1156 if (r < 0) { 1176 if (r < 0) {
1157 pr_err("error %d from crush rule: pool %d ruleset %d type %d" 1177 pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
1158 " size %d\n", r, poolid, pool->v.crush_ruleset, 1178 " size %d\n", r, pgid.pool, pool->crush_ruleset,
1159 pool->v.type, pool->v.size); 1179 pool->type, pool->size);
1160 return NULL; 1180 return NULL;
1161 } 1181 }
1162 *num = r; 1182 *num = r;
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index cd9c21df87d1..815a2249cfa9 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -12,7 +12,7 @@
12/* 12/*
13 * build a vector of user pages 13 * build a vector of user pages
14 */ 14 */
15struct page **ceph_get_direct_page_vector(const char __user *data, 15struct page **ceph_get_direct_page_vector(const void __user *data,
16 int num_pages, bool write_page) 16 int num_pages, bool write_page)
17{ 17{
18 struct page **pages; 18 struct page **pages;
@@ -93,7 +93,7 @@ EXPORT_SYMBOL(ceph_alloc_page_vector);
93 * copy user data into a page vector 93 * copy user data into a page vector
94 */ 94 */
95int ceph_copy_user_to_page_vector(struct page **pages, 95int ceph_copy_user_to_page_vector(struct page **pages,
96 const char __user *data, 96 const void __user *data,
97 loff_t off, size_t len) 97 loff_t off, size_t len)
98{ 98{
99 int i = 0; 99 int i = 0;
@@ -118,17 +118,17 @@ int ceph_copy_user_to_page_vector(struct page **pages,
118} 118}
119EXPORT_SYMBOL(ceph_copy_user_to_page_vector); 119EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
120 120
121int ceph_copy_to_page_vector(struct page **pages, 121void ceph_copy_to_page_vector(struct page **pages,
122 const char *data, 122 const void *data,
123 loff_t off, size_t len) 123 loff_t off, size_t len)
124{ 124{
125 int i = 0; 125 int i = 0;
126 size_t po = off & ~PAGE_CACHE_MASK; 126 size_t po = off & ~PAGE_CACHE_MASK;
127 size_t left = len; 127 size_t left = len;
128 size_t l;
129 128
130 while (left > 0) { 129 while (left > 0) {
131 l = min_t(size_t, PAGE_CACHE_SIZE-po, left); 130 size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
131
132 memcpy(page_address(pages[i]) + po, data, l); 132 memcpy(page_address(pages[i]) + po, data, l);
133 data += l; 133 data += l;
134 left -= l; 134 left -= l;
@@ -138,21 +138,20 @@ int ceph_copy_to_page_vector(struct page **pages,
138 i++; 138 i++;
139 } 139 }
140 } 140 }
141 return len;
142} 141}
143EXPORT_SYMBOL(ceph_copy_to_page_vector); 142EXPORT_SYMBOL(ceph_copy_to_page_vector);
144 143
145int ceph_copy_from_page_vector(struct page **pages, 144void ceph_copy_from_page_vector(struct page **pages,
146 char *data, 145 void *data,
147 loff_t off, size_t len) 146 loff_t off, size_t len)
148{ 147{
149 int i = 0; 148 int i = 0;
150 size_t po = off & ~PAGE_CACHE_MASK; 149 size_t po = off & ~PAGE_CACHE_MASK;
151 size_t left = len; 150 size_t left = len;
152 size_t l;
153 151
154 while (left > 0) { 152 while (left > 0) {
155 l = min_t(size_t, PAGE_CACHE_SIZE-po, left); 153 size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
154
156 memcpy(data, page_address(pages[i]) + po, l); 155 memcpy(data, page_address(pages[i]) + po, l);
157 data += l; 156 data += l;
158 left -= l; 157 left -= l;
@@ -162,7 +161,6 @@ int ceph_copy_from_page_vector(struct page **pages,
162 i++; 161 i++;
163 } 162 }
164 } 163 }
165 return len;
166} 164}
167EXPORT_SYMBOL(ceph_copy_from_page_vector); 165EXPORT_SYMBOL(ceph_copy_from_page_vector);
168 166
@@ -170,7 +168,7 @@ EXPORT_SYMBOL(ceph_copy_from_page_vector);
170 * copy user data from a page vector into a user pointer 168 * copy user data from a page vector into a user pointer
171 */ 169 */
172int ceph_copy_page_vector_to_user(struct page **pages, 170int ceph_copy_page_vector_to_user(struct page **pages,
173 char __user *data, 171 void __user *data,
174 loff_t off, size_t len) 172 loff_t off, size_t len)
175{ 173{
176 int i = 0; 174 int i = 0;