aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-02-28 20:43:09 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2013-02-28 20:43:09 -0500
commit1cf0209c431fa7790253c532039d53b0773193aa (patch)
tree24310eaaf4c9583988d9098f6c85a4a34970b5b9
parentde1a2262b006220dae2561a299a6ea128c46f4fe (diff)
parent83ca14fdd35821554058e5fd4fa7b118ee504a33 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil: "A few groups of patches here. Alex has been hard at work improving the RBD code, layout groundwork for understanding the new formats and doing layering. Most of the infrastructure is now in place for the final bits that will come with the next window. There are a few changes to the data layout. Jim Schutt's patch fixes some non-ideal CRUSH behavior, and a set of patches from me updates the client to speak a newer version of the protocol and implement an improved hashing strategy across storage nodes (when the server side supports it too). A pair of patches from Sam Lang fix the atomicity of open+create operations. Several patches from Yan, Zheng fix various mds/client issues that turned up during multi-mds torture tests. A final set of patches expose file layouts via virtual xattrs, and allow the policies to be set on directories via xattrs as well (avoiding the awkward ioctl interface and providing a consistent interface for both kernel mount and ceph-fuse users)." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (143 commits) libceph: add support for HASHPSPOOL pool flag libceph: update osd request/reply encoding libceph: calculate placement based on the internal data types ceph: update support for PGID64, PGPOOL3, OSDENC protocol features ceph: update "ceph_features.h" libceph: decode into cpu-native ceph_pg type libceph: rename ceph_pg -> ceph_pg_v1 rbd: pass length, not op for osd completions rbd: move rbd_osd_trivial_callback() libceph: use a do..while loop in con_work() libceph: use a flag to indicate a fault has occurred libceph: separate non-locked fault handling libceph: encapsulate connection backoff libceph: eliminate sparse warnings ceph: eliminate sparse warnings in fs code rbd: eliminate sparse warnings libceph: define connection flag helpers rbd: normalize dout() calls rbd: barriers are hard rbd: ignore zero-length requests ...
-rw-r--r--drivers/block/rbd.c1852
-rw-r--r--fs/ceph/addr.c38
-rw-r--r--fs/ceph/caps.c32
-rw-r--r--fs/ceph/file.c8
-rw-r--r--fs/ceph/ioctl.c6
-rw-r--r--fs/ceph/mds_client.c33
-rw-r--r--fs/ceph/mds_client.h6
-rw-r--r--fs/ceph/mdsmap.c12
-rw-r--r--fs/ceph/strings.c4
-rw-r--r--fs/ceph/super.c7
-rw-r--r--fs/ceph/super.h10
-rw-r--r--fs/ceph/xattr.c214
-rw-r--r--include/linux/ceph/ceph_features.h38
-rw-r--r--include/linux/ceph/ceph_fs.h32
-rw-r--r--include/linux/ceph/decode.h29
-rw-r--r--include/linux/ceph/libceph.h16
-rw-r--r--include/linux/ceph/mdsmap.h4
-rw-r--r--include/linux/ceph/messenger.h2
-rw-r--r--include/linux/ceph/osd_client.h74
-rw-r--r--include/linux/ceph/osdmap.h30
-rw-r--r--include/linux/ceph/rados.h158
-rw-r--r--include/linux/crush/crush.h2
-rw-r--r--net/ceph/ceph_common.c22
-rw-r--r--net/ceph/ceph_strings.c39
-rw-r--r--net/ceph/crush/mapper.c15
-rw-r--r--net/ceph/crypto.c7
-rw-r--r--net/ceph/debugfs.c29
-rw-r--r--net/ceph/messenger.c260
-rw-r--r--net/ceph/mon_client.c2
-rw-r--r--net/ceph/osd_client.c635
-rw-r--r--net/ceph/osdmap.c290
-rw-r--r--net/ceph/pagevec.c24
32 files changed, 2402 insertions, 1528 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 89576a0b3f2e..6c81a4c040b9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -52,9 +52,12 @@
52#define SECTOR_SHIFT 9 52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54 54
55/* It might be useful to have this defined elsewhere too */ 55/* It might be useful to have these defined elsewhere */
56 56
57#define U64_MAX ((u64) (~0ULL)) 57#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
58 61
59#define RBD_DRV_NAME "rbd" 62#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)" 63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
@@ -66,7 +69,6 @@
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67 70
68#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 71#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
69#define RBD_MAX_OPT_LEN 1024
70 72
71#define RBD_SNAP_HEAD_NAME "-" 73#define RBD_SNAP_HEAD_NAME "-"
72 74
@@ -93,8 +95,6 @@
93#define DEV_NAME_LEN 32 95#define DEV_NAME_LEN 32
94#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 96#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
95 97
96#define RBD_READ_ONLY_DEFAULT false
97
98/* 98/*
99 * block device image metadata (in-memory version) 99 * block device image metadata (in-memory version)
100 */ 100 */
@@ -119,16 +119,33 @@ struct rbd_image_header {
119 * An rbd image specification. 119 * An rbd image specification.
120 * 120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
122 * identify an image. 122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
123 */ 142 */
124struct rbd_spec { 143struct rbd_spec {
125 u64 pool_id; 144 u64 pool_id;
126 char *pool_name; 145 char *pool_name;
127 146
128 char *image_id; 147 char *image_id;
129 size_t image_id_len;
130 char *image_name; 148 char *image_name;
131 size_t image_name_len;
132 149
133 u64 snap_id; 150 u64 snap_id;
134 char *snap_name; 151 char *snap_name;
@@ -136,10 +153,6 @@ struct rbd_spec {
136 struct kref kref; 153 struct kref kref;
137}; 154};
138 155
139struct rbd_options {
140 bool read_only;
141};
142
143/* 156/*
144 * an instance of the client. multiple devices may share an rbd client. 157 * an instance of the client. multiple devices may share an rbd client.
145 */ 158 */
@@ -149,37 +162,76 @@ struct rbd_client {
149 struct list_head node; 162 struct list_head node;
150}; 163};
151 164
152/* 165struct rbd_img_request;
153 * a request completion status 166typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
154 */ 167
155struct rbd_req_status { 168#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
156 int done; 169
157 int rc; 170struct rbd_obj_request;
158 u64 bytes; 171typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
172
173enum obj_request_type {
174 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
159}; 175};
160 176
161/* 177struct rbd_obj_request {
162 * a collection of requests 178 const char *object_name;
163 */ 179 u64 offset; /* object start byte */
164struct rbd_req_coll { 180 u64 length; /* bytes from offset */
165 int total; 181
166 int num_done; 182 struct rbd_img_request *img_request;
183 struct list_head links; /* img_request->obj_requests */
184 u32 which; /* posn image request list */
185
186 enum obj_request_type type;
187 union {
188 struct bio *bio_list;
189 struct {
190 struct page **pages;
191 u32 page_count;
192 };
193 };
194
195 struct ceph_osd_request *osd_req;
196
197 u64 xferred; /* bytes transferred */
198 u64 version;
199 int result;
200 atomic_t done;
201
202 rbd_obj_callback_t callback;
203 struct completion completion;
204
167 struct kref kref; 205 struct kref kref;
168 struct rbd_req_status status[0];
169}; 206};
170 207
171/* 208struct rbd_img_request {
172 * a single io request 209 struct request *rq;
173 */ 210 struct rbd_device *rbd_dev;
174struct rbd_request { 211 u64 offset; /* starting image byte offset */
175 struct request *rq; /* blk layer request */ 212 u64 length; /* byte count from offset */
176 struct bio *bio; /* cloned bio */ 213 bool write_request; /* false for read */
177 struct page **pages; /* list of used pages */ 214 union {
178 u64 len; 215 struct ceph_snap_context *snapc; /* for writes */
179 int coll_index; 216 u64 snap_id; /* for reads */
180 struct rbd_req_coll *coll; 217 };
218 spinlock_t completion_lock;/* protects next_completion */
219 u32 next_completion;
220 rbd_img_callback_t callback;
221
222 u32 obj_request_count;
223 struct list_head obj_requests; /* rbd_obj_request structs */
224
225 struct kref kref;
181}; 226};
182 227
228#define for_each_obj_request(ireq, oreq) \
229 list_for_each_entry(oreq, &(ireq)->obj_requests, links)
230#define for_each_obj_request_from(ireq, oreq) \
231 list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
232#define for_each_obj_request_safe(ireq, oreq, n) \
233 list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
234
183struct rbd_snap { 235struct rbd_snap {
184 struct device dev; 236 struct device dev;
185 const char *name; 237 const char *name;
@@ -209,16 +261,18 @@ struct rbd_device {
209 261
210 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 262 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
211 263
212 spinlock_t lock; /* queue lock */ 264 spinlock_t lock; /* queue, flags, open_count */
213 265
214 struct rbd_image_header header; 266 struct rbd_image_header header;
215 bool exists; 267 unsigned long flags; /* possibly lock protected */
216 struct rbd_spec *spec; 268 struct rbd_spec *spec;
217 269
218 char *header_name; 270 char *header_name;
219 271
272 struct ceph_file_layout layout;
273
220 struct ceph_osd_event *watch_event; 274 struct ceph_osd_event *watch_event;
221 struct ceph_osd_request *watch_request; 275 struct rbd_obj_request *watch_request;
222 276
223 struct rbd_spec *parent_spec; 277 struct rbd_spec *parent_spec;
224 u64 parent_overlap; 278 u64 parent_overlap;
@@ -235,7 +289,19 @@ struct rbd_device {
235 289
236 /* sysfs related */ 290 /* sysfs related */
237 struct device dev; 291 struct device dev;
238 unsigned long open_count; 292 unsigned long open_count; /* protected by lock */
293};
294
295/*
296 * Flag bits for rbd_dev->flags. If atomicity is required,
297 * rbd_dev->lock is used to protect access.
298 *
299 * Currently, only the "removing" flag (which is coupled with the
300 * "open_count" field) requires atomic access.
301 */
302enum rbd_dev_flags {
303 RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
304 RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
239}; 305};
240 306
241static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 307static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
@@ -277,6 +343,33 @@ static struct device rbd_root_dev = {
277 .release = rbd_root_dev_release, 343 .release = rbd_root_dev_release,
278}; 344};
279 345
346static __printf(2, 3)
347void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
348{
349 struct va_format vaf;
350 va_list args;
351
352 va_start(args, fmt);
353 vaf.fmt = fmt;
354 vaf.va = &args;
355
356 if (!rbd_dev)
357 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
358 else if (rbd_dev->disk)
359 printk(KERN_WARNING "%s: %s: %pV\n",
360 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
361 else if (rbd_dev->spec && rbd_dev->spec->image_name)
362 printk(KERN_WARNING "%s: image %s: %pV\n",
363 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
364 else if (rbd_dev->spec && rbd_dev->spec->image_id)
365 printk(KERN_WARNING "%s: id %s: %pV\n",
366 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
367 else /* punt */
368 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
369 RBD_DRV_NAME, rbd_dev, &vaf);
370 va_end(args);
371}
372
280#ifdef RBD_DEBUG 373#ifdef RBD_DEBUG
281#define rbd_assert(expr) \ 374#define rbd_assert(expr) \
282 if (unlikely(!(expr))) { \ 375 if (unlikely(!(expr))) { \
@@ -296,14 +389,23 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
296static int rbd_open(struct block_device *bdev, fmode_t mode) 389static int rbd_open(struct block_device *bdev, fmode_t mode)
297{ 390{
298 struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 391 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
392 bool removing = false;
299 393
300 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 394 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
301 return -EROFS; 395 return -EROFS;
302 396
397 spin_lock_irq(&rbd_dev->lock);
398 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
399 removing = true;
400 else
401 rbd_dev->open_count++;
402 spin_unlock_irq(&rbd_dev->lock);
403 if (removing)
404 return -ENOENT;
405
303 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 406 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
304 (void) get_device(&rbd_dev->dev); 407 (void) get_device(&rbd_dev->dev);
305 set_device_ro(bdev, rbd_dev->mapping.read_only); 408 set_device_ro(bdev, rbd_dev->mapping.read_only);
306 rbd_dev->open_count++;
307 mutex_unlock(&ctl_mutex); 409 mutex_unlock(&ctl_mutex);
308 410
309 return 0; 411 return 0;
@@ -312,10 +414,14 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
312static int rbd_release(struct gendisk *disk, fmode_t mode) 414static int rbd_release(struct gendisk *disk, fmode_t mode)
313{ 415{
314 struct rbd_device *rbd_dev = disk->private_data; 416 struct rbd_device *rbd_dev = disk->private_data;
417 unsigned long open_count_before;
418
419 spin_lock_irq(&rbd_dev->lock);
420 open_count_before = rbd_dev->open_count--;
421 spin_unlock_irq(&rbd_dev->lock);
422 rbd_assert(open_count_before > 0);
315 423
316 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 424 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
317 rbd_assert(rbd_dev->open_count > 0);
318 rbd_dev->open_count--;
319 put_device(&rbd_dev->dev); 425 put_device(&rbd_dev->dev);
320 mutex_unlock(&ctl_mutex); 426 mutex_unlock(&ctl_mutex);
321 427
@@ -337,7 +443,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
337 struct rbd_client *rbdc; 443 struct rbd_client *rbdc;
338 int ret = -ENOMEM; 444 int ret = -ENOMEM;
339 445
340 dout("rbd_client_create\n"); 446 dout("%s:\n", __func__);
341 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 447 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
342 if (!rbdc) 448 if (!rbdc)
343 goto out_opt; 449 goto out_opt;
@@ -361,8 +467,8 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
361 spin_unlock(&rbd_client_list_lock); 467 spin_unlock(&rbd_client_list_lock);
362 468
363 mutex_unlock(&ctl_mutex); 469 mutex_unlock(&ctl_mutex);
470 dout("%s: rbdc %p\n", __func__, rbdc);
364 471
365 dout("rbd_client_create created %p\n", rbdc);
366 return rbdc; 472 return rbdc;
367 473
368out_err: 474out_err:
@@ -373,6 +479,8 @@ out_mutex:
373out_opt: 479out_opt:
374 if (ceph_opts) 480 if (ceph_opts)
375 ceph_destroy_options(ceph_opts); 481 ceph_destroy_options(ceph_opts);
482 dout("%s: error %d\n", __func__, ret);
483
376 return ERR_PTR(ret); 484 return ERR_PTR(ret);
377} 485}
378 486
@@ -426,6 +534,12 @@ static match_table_t rbd_opts_tokens = {
426 {-1, NULL} 534 {-1, NULL}
427}; 535};
428 536
537struct rbd_options {
538 bool read_only;
539};
540
541#define RBD_READ_ONLY_DEFAULT false
542
429static int parse_rbd_opts_token(char *c, void *private) 543static int parse_rbd_opts_token(char *c, void *private)
430{ 544{
431 struct rbd_options *rbd_opts = private; 545 struct rbd_options *rbd_opts = private;
@@ -493,7 +607,7 @@ static void rbd_client_release(struct kref *kref)
493{ 607{
494 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 608 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
495 609
496 dout("rbd_release_client %p\n", rbdc); 610 dout("%s: rbdc %p\n", __func__, rbdc);
497 spin_lock(&rbd_client_list_lock); 611 spin_lock(&rbd_client_list_lock);
498 list_del(&rbdc->node); 612 list_del(&rbdc->node);
499 spin_unlock(&rbd_client_list_lock); 613 spin_unlock(&rbd_client_list_lock);
@@ -512,18 +626,6 @@ static void rbd_put_client(struct rbd_client *rbdc)
512 kref_put(&rbdc->kref, rbd_client_release); 626 kref_put(&rbdc->kref, rbd_client_release);
513} 627}
514 628
515/*
516 * Destroy requests collection
517 */
518static void rbd_coll_release(struct kref *kref)
519{
520 struct rbd_req_coll *coll =
521 container_of(kref, struct rbd_req_coll, kref);
522
523 dout("rbd_coll_release %p\n", coll);
524 kfree(coll);
525}
526
527static bool rbd_image_format_valid(u32 image_format) 629static bool rbd_image_format_valid(u32 image_format)
528{ 630{
529 return image_format == 1 || image_format == 2; 631 return image_format == 1 || image_format == 2;
@@ -707,7 +809,8 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
707 goto done; 809 goto done;
708 rbd_dev->mapping.read_only = true; 810 rbd_dev->mapping.read_only = true;
709 } 811 }
710 rbd_dev->exists = true; 812 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
813
711done: 814done:
712 return ret; 815 return ret;
713} 816}
@@ -724,7 +827,7 @@ static void rbd_header_free(struct rbd_image_header *header)
724 header->snapc = NULL; 827 header->snapc = NULL;
725} 828}
726 829
727static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 830static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
728{ 831{
729 char *name; 832 char *name;
730 u64 segment; 833 u64 segment;
@@ -767,23 +870,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev,
767 return length; 870 return length;
768} 871}
769 872
770static int rbd_get_num_segments(struct rbd_image_header *header,
771 u64 ofs, u64 len)
772{
773 u64 start_seg;
774 u64 end_seg;
775
776 if (!len)
777 return 0;
778 if (len - 1 > U64_MAX - ofs)
779 return -ERANGE;
780
781 start_seg = ofs >> header->obj_order;
782 end_seg = (ofs + len - 1) >> header->obj_order;
783
784 return end_seg - start_seg + 1;
785}
786
787/* 873/*
788 * returns the size of an object in the image 874 * returns the size of an object in the image
789 */ 875 */
@@ -949,8 +1035,10 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
949 unsigned int bi_size; 1035 unsigned int bi_size;
950 struct bio *bio; 1036 struct bio *bio;
951 1037
952 if (!bi) 1038 if (!bi) {
1039 rbd_warn(NULL, "bio_chain exhausted with %u left", len);
953 goto out_err; /* EINVAL; ran out of bio's */ 1040 goto out_err; /* EINVAL; ran out of bio's */
1041 }
954 bi_size = min_t(unsigned int, bi->bi_size - off, len); 1042 bi_size = min_t(unsigned int, bi->bi_size - off, len);
955 bio = bio_clone_range(bi, off, bi_size, gfpmask); 1043 bio = bio_clone_range(bi, off, bi_size, gfpmask);
956 if (!bio) 1044 if (!bio)
@@ -976,399 +1064,721 @@ out_err:
976 return NULL; 1064 return NULL;
977} 1065}
978 1066
979/* 1067static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
980 * helpers for osd request op vectors.
981 */
982static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
983 int opcode, u32 payload_len)
984{ 1068{
985 struct ceph_osd_req_op *ops; 1069 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1070 atomic_read(&obj_request->kref.refcount));
1071 kref_get(&obj_request->kref);
1072}
1073
1074static void rbd_obj_request_destroy(struct kref *kref);
1075static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1076{
1077 rbd_assert(obj_request != NULL);
1078 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1079 atomic_read(&obj_request->kref.refcount));
1080 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1081}
1082
1083static void rbd_img_request_get(struct rbd_img_request *img_request)
1084{
1085 dout("%s: img %p (was %d)\n", __func__, img_request,
1086 atomic_read(&img_request->kref.refcount));
1087 kref_get(&img_request->kref);
1088}
1089
1090static void rbd_img_request_destroy(struct kref *kref);
1091static void rbd_img_request_put(struct rbd_img_request *img_request)
1092{
1093 rbd_assert(img_request != NULL);
1094 dout("%s: img %p (was %d)\n", __func__, img_request,
1095 atomic_read(&img_request->kref.refcount));
1096 kref_put(&img_request->kref, rbd_img_request_destroy);
1097}
1098
1099static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1100 struct rbd_obj_request *obj_request)
1101{
1102 rbd_assert(obj_request->img_request == NULL);
1103
1104 rbd_obj_request_get(obj_request);
1105 obj_request->img_request = img_request;
1106 obj_request->which = img_request->obj_request_count;
1107 rbd_assert(obj_request->which != BAD_WHICH);
1108 img_request->obj_request_count++;
1109 list_add_tail(&obj_request->links, &img_request->obj_requests);
1110 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1111 obj_request->which);
1112}
986 1113
987 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); 1114static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
988 if (!ops) 1115 struct rbd_obj_request *obj_request)
1116{
1117 rbd_assert(obj_request->which != BAD_WHICH);
1118
1119 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1120 obj_request->which);
1121 list_del(&obj_request->links);
1122 rbd_assert(img_request->obj_request_count > 0);
1123 img_request->obj_request_count--;
1124 rbd_assert(obj_request->which == img_request->obj_request_count);
1125 obj_request->which = BAD_WHICH;
1126 rbd_assert(obj_request->img_request == img_request);
1127 obj_request->img_request = NULL;
1128 obj_request->callback = NULL;
1129 rbd_obj_request_put(obj_request);
1130}
1131
1132static bool obj_request_type_valid(enum obj_request_type type)
1133{
1134 switch (type) {
1135 case OBJ_REQUEST_NODATA:
1136 case OBJ_REQUEST_BIO:
1137 case OBJ_REQUEST_PAGES:
1138 return true;
1139 default:
1140 return false;
1141 }
1142}
1143
1144static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1145{
1146 struct ceph_osd_req_op *op;
1147 va_list args;
1148 size_t size;
1149
1150 op = kzalloc(sizeof (*op), GFP_NOIO);
1151 if (!op)
989 return NULL; 1152 return NULL;
1153 op->op = opcode;
1154 va_start(args, opcode);
1155 switch (opcode) {
1156 case CEPH_OSD_OP_READ:
1157 case CEPH_OSD_OP_WRITE:
1158 /* rbd_osd_req_op_create(READ, offset, length) */
1159 /* rbd_osd_req_op_create(WRITE, offset, length) */
1160 op->extent.offset = va_arg(args, u64);
1161 op->extent.length = va_arg(args, u64);
1162 if (opcode == CEPH_OSD_OP_WRITE)
1163 op->payload_len = op->extent.length;
1164 break;
1165 case CEPH_OSD_OP_STAT:
1166 break;
1167 case CEPH_OSD_OP_CALL:
1168 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1169 op->cls.class_name = va_arg(args, char *);
1170 size = strlen(op->cls.class_name);
1171 rbd_assert(size <= (size_t) U8_MAX);
1172 op->cls.class_len = size;
1173 op->payload_len = size;
1174
1175 op->cls.method_name = va_arg(args, char *);
1176 size = strlen(op->cls.method_name);
1177 rbd_assert(size <= (size_t) U8_MAX);
1178 op->cls.method_len = size;
1179 op->payload_len += size;
1180
1181 op->cls.argc = 0;
1182 op->cls.indata = va_arg(args, void *);
1183 size = va_arg(args, size_t);
1184 rbd_assert(size <= (size_t) U32_MAX);
1185 op->cls.indata_len = (u32) size;
1186 op->payload_len += size;
1187 break;
1188 case CEPH_OSD_OP_NOTIFY_ACK:
1189 case CEPH_OSD_OP_WATCH:
1190 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1191 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1192 op->watch.cookie = va_arg(args, u64);
1193 op->watch.ver = va_arg(args, u64);
1194 op->watch.ver = cpu_to_le64(op->watch.ver);
1195 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1196 op->watch.flag = (u8) 1;
1197 break;
1198 default:
1199 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1200 kfree(op);
1201 op = NULL;
1202 break;
1203 }
1204 va_end(args);
990 1205
991 ops[0].op = opcode; 1206 return op;
1207}
992 1208
993 /* 1209static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
994 * op extent offset and length will be set later on 1210{
995 * in calc_raw_layout() 1211 kfree(op);
996 */ 1212}
997 ops[0].payload_len = payload_len; 1213
1214static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1215 struct rbd_obj_request *obj_request)
1216{
1217 dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
998 1218
999 return ops; 1219 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1000} 1220}
1001 1221
1002static void rbd_destroy_ops(struct ceph_osd_req_op *ops) 1222static void rbd_img_request_complete(struct rbd_img_request *img_request)
1003{ 1223{
1004 kfree(ops); 1224 dout("%s: img %p\n", __func__, img_request);
1225 if (img_request->callback)
1226 img_request->callback(img_request);
1227 else
1228 rbd_img_request_put(img_request);
1005} 1229}
1006 1230
1007static void rbd_coll_end_req_index(struct request *rq, 1231/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1008 struct rbd_req_coll *coll, 1232
1009 int index, 1233static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1010 int ret, u64 len)
1011{ 1234{
1012 struct request_queue *q; 1235 dout("%s: obj %p\n", __func__, obj_request);
1013 int min, max, i;
1014 1236
1015 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", 1237 return wait_for_completion_interruptible(&obj_request->completion);
1016 coll, index, ret, (unsigned long long) len); 1238}
1017 1239
1018 if (!rq) 1240static void obj_request_done_init(struct rbd_obj_request *obj_request)
1019 return; 1241{
1242 atomic_set(&obj_request->done, 0);
1243 smp_wmb();
1244}
1020 1245
1021 if (!coll) { 1246static void obj_request_done_set(struct rbd_obj_request *obj_request)
1022 blk_end_request(rq, ret, len); 1247{
1023 return; 1248 int done;
1249
1250 done = atomic_inc_return(&obj_request->done);
1251 if (done > 1) {
1252 struct rbd_img_request *img_request = obj_request->img_request;
1253 struct rbd_device *rbd_dev;
1254
1255 rbd_dev = img_request ? img_request->rbd_dev : NULL;
1256 rbd_warn(rbd_dev, "obj_request %p was already done\n",
1257 obj_request);
1024 } 1258 }
1259}
1025 1260
1026 q = rq->q; 1261static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1027 1262{
1028 spin_lock_irq(q->queue_lock); 1263 smp_mb();
1029 coll->status[index].done = 1; 1264 return atomic_read(&obj_request->done) != 0;
1030 coll->status[index].rc = ret; 1265}
1031 coll->status[index].bytes = len; 1266
1032 max = min = coll->num_done; 1267static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1033 while (max < coll->total && coll->status[max].done) 1268{
1034 max++; 1269 dout("%s: obj %p cb %p\n", __func__, obj_request,
1035 1270 obj_request->callback);
1036 for (i = min; i<max; i++) { 1271 if (obj_request->callback)
1037 __blk_end_request(rq, coll->status[i].rc, 1272 obj_request->callback(obj_request);
1038 coll->status[i].bytes); 1273 else
1039 coll->num_done++; 1274 complete_all(&obj_request->completion);
1040 kref_put(&coll->kref, rbd_coll_release); 1275}
1276
1277static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
1278{
1279 dout("%s: obj %p\n", __func__, obj_request);
1280 obj_request_done_set(obj_request);
1281}
1282
1283static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1284{
1285 dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
1286 obj_request->result, obj_request->xferred, obj_request->length);
1287 /*
1288 * ENOENT means a hole in the object. We zero-fill the
1289 * entire length of the request. A short read also implies
1290 * zero-fill to the end of the request. Either way we
1291 * update the xferred count to indicate the whole request
1292 * was satisfied.
1293 */
1294 if (obj_request->result == -ENOENT) {
1295 zero_bio_chain(obj_request->bio_list, 0);
1296 obj_request->result = 0;
1297 obj_request->xferred = obj_request->length;
1298 } else if (obj_request->xferred < obj_request->length &&
1299 !obj_request->result) {
1300 zero_bio_chain(obj_request->bio_list, obj_request->xferred);
1301 obj_request->xferred = obj_request->length;
1041 } 1302 }
1042 spin_unlock_irq(q->queue_lock); 1303 obj_request_done_set(obj_request);
1043} 1304}
1044 1305
1045static void rbd_coll_end_req(struct rbd_request *req, 1306static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1046 int ret, u64 len)
1047{ 1307{
1048 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); 1308 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1309 obj_request->result, obj_request->length);
1310 /*
1311 * There is no such thing as a successful short write.
1312 * Our xferred value is the number of bytes transferred
1313 * back. Set it to our originally-requested length.
1314 */
1315 obj_request->xferred = obj_request->length;
1316 obj_request_done_set(obj_request);
1049} 1317}
1050 1318
1051/* 1319/*
1052 * Send ceph osd request 1320 * For a simple stat call there's nothing to do. We'll do more if
1321 * this is part of a write sequence for a layered image.
1053 */ 1322 */
1054static int rbd_do_request(struct request *rq, 1323static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
1055 struct rbd_device *rbd_dev, 1324{
1056 struct ceph_snap_context *snapc, 1325 dout("%s: obj %p\n", __func__, obj_request);
1057 u64 snapid, 1326 obj_request_done_set(obj_request);
1058 const char *object_name, u64 ofs, u64 len, 1327}
1059 struct bio *bio,
1060 struct page **pages,
1061 int num_pages,
1062 int flags,
1063 struct ceph_osd_req_op *ops,
1064 struct rbd_req_coll *coll,
1065 int coll_index,
1066 void (*rbd_cb)(struct ceph_osd_request *req,
1067 struct ceph_msg *msg),
1068 struct ceph_osd_request **linger_req,
1069 u64 *ver)
1070{
1071 struct ceph_osd_request *req;
1072 struct ceph_file_layout *layout;
1073 int ret;
1074 u64 bno;
1075 struct timespec mtime = CURRENT_TIME;
1076 struct rbd_request *req_data;
1077 struct ceph_osd_request_head *reqhead;
1078 struct ceph_osd_client *osdc;
1079 1328
1080 req_data = kzalloc(sizeof(*req_data), GFP_NOIO); 1329static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1081 if (!req_data) { 1330 struct ceph_msg *msg)
1082 if (coll) 1331{
1083 rbd_coll_end_req_index(rq, coll, coll_index, 1332 struct rbd_obj_request *obj_request = osd_req->r_priv;
1084 -ENOMEM, len); 1333 u16 opcode;
1085 return -ENOMEM; 1334
1335 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1336 rbd_assert(osd_req == obj_request->osd_req);
1337 rbd_assert(!!obj_request->img_request ^
1338 (obj_request->which == BAD_WHICH));
1339
1340 if (osd_req->r_result < 0)
1341 obj_request->result = osd_req->r_result;
1342 obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1343
1344 WARN_ON(osd_req->r_num_ops != 1); /* For now */
1345
1346 /*
1347 * We support a 64-bit length, but ultimately it has to be
1348 * passed to blk_end_request(), which takes an unsigned int.
1349 */
1350 obj_request->xferred = osd_req->r_reply_op_len[0];
1351 rbd_assert(obj_request->xferred < (u64) UINT_MAX);
1352 opcode = osd_req->r_request_ops[0].op;
1353 switch (opcode) {
1354 case CEPH_OSD_OP_READ:
1355 rbd_osd_read_callback(obj_request);
1356 break;
1357 case CEPH_OSD_OP_WRITE:
1358 rbd_osd_write_callback(obj_request);
1359 break;
1360 case CEPH_OSD_OP_STAT:
1361 rbd_osd_stat_callback(obj_request);
1362 break;
1363 case CEPH_OSD_OP_CALL:
1364 case CEPH_OSD_OP_NOTIFY_ACK:
1365 case CEPH_OSD_OP_WATCH:
1366 rbd_osd_trivial_callback(obj_request);
1367 break;
1368 default:
1369 rbd_warn(NULL, "%s: unsupported op %hu\n",
1370 obj_request->object_name, (unsigned short) opcode);
1371 break;
1086 } 1372 }
1087 1373
1088 if (coll) { 1374 if (obj_request_done_test(obj_request))
1089 req_data->coll = coll; 1375 rbd_obj_request_complete(obj_request);
1090 req_data->coll_index = coll_index; 1376}
1377
1378static struct ceph_osd_request *rbd_osd_req_create(
1379 struct rbd_device *rbd_dev,
1380 bool write_request,
1381 struct rbd_obj_request *obj_request,
1382 struct ceph_osd_req_op *op)
1383{
1384 struct rbd_img_request *img_request = obj_request->img_request;
1385 struct ceph_snap_context *snapc = NULL;
1386 struct ceph_osd_client *osdc;
1387 struct ceph_osd_request *osd_req;
1388 struct timespec now;
1389 struct timespec *mtime;
1390 u64 snap_id = CEPH_NOSNAP;
1391 u64 offset = obj_request->offset;
1392 u64 length = obj_request->length;
1393
1394 if (img_request) {
1395 rbd_assert(img_request->write_request == write_request);
1396 if (img_request->write_request)
1397 snapc = img_request->snapc;
1398 else
1399 snap_id = img_request->snap_id;
1091 } 1400 }
1092 1401
1093 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", 1402 /* Allocate and initialize the request, for the single op */
1094 object_name, (unsigned long long) ofs,
1095 (unsigned long long) len, coll, coll_index);
1096 1403
1097 osdc = &rbd_dev->rbd_client->client->osdc; 1404 osdc = &rbd_dev->rbd_client->client->osdc;
1098 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, 1405 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1099 false, GFP_NOIO, pages, bio); 1406 if (!osd_req)
1100 if (!req) { 1407 return NULL; /* ENOMEM */
1101 ret = -ENOMEM; 1408
1102 goto done_pages; 1409 rbd_assert(obj_request_type_valid(obj_request->type));
1410 switch (obj_request->type) {
1411 case OBJ_REQUEST_NODATA:
1412 break; /* Nothing to do */
1413 case OBJ_REQUEST_BIO:
1414 rbd_assert(obj_request->bio_list != NULL);
1415 osd_req->r_bio = obj_request->bio_list;
1416 break;
1417 case OBJ_REQUEST_PAGES:
1418 osd_req->r_pages = obj_request->pages;
1419 osd_req->r_num_pages = obj_request->page_count;
1420 osd_req->r_page_alignment = offset & ~PAGE_MASK;
1421 break;
1103 } 1422 }
1104 1423
1105 req->r_callback = rbd_cb; 1424 if (write_request) {
1425 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1426 now = CURRENT_TIME;
1427 mtime = &now;
1428 } else {
1429 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1430 mtime = NULL; /* not needed for reads */
1431 offset = 0; /* These are not used... */
1432 length = 0; /* ...for osd read requests */
1433 }
1106 1434
1107 req_data->rq = rq; 1435 osd_req->r_callback = rbd_osd_req_callback;
1108 req_data->bio = bio; 1436 osd_req->r_priv = obj_request;
1109 req_data->pages = pages;
1110 req_data->len = len;
1111 1437
1112 req->r_priv = req_data; 1438 osd_req->r_oid_len = strlen(obj_request->object_name);
1439 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1440 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1113 1441
1114 reqhead = req->r_request->front.iov_base; 1442 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1115 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1116 1443
1117 strncpy(req->r_oid, object_name, sizeof(req->r_oid)); 1444 /* osd_req will get its own reference to snapc (if non-null) */
1118 req->r_oid_len = strlen(req->r_oid);
1119 1445
1120 layout = &req->r_file_layout; 1446 ceph_osdc_build_request(osd_req, offset, length, 1, op,
1121 memset(layout, 0, sizeof(*layout)); 1447 snapc, snap_id, mtime);
1122 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1123 layout->fl_stripe_count = cpu_to_le32(1);
1124 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1125 layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
1126 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1127 req, ops);
1128 rbd_assert(ret == 0);
1129 1448
1130 ceph_osdc_build_request(req, ofs, &len, 1449 return osd_req;
1131 ops, 1450}
1132 snapc,
1133 &mtime,
1134 req->r_oid, req->r_oid_len);
1135 1451
1136 if (linger_req) { 1452static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1137 ceph_osdc_set_request_linger(osdc, req); 1453{
1138 *linger_req = req; 1454 ceph_osdc_put_request(osd_req);
1139 } 1455}
1140 1456
1141 ret = ceph_osdc_start_request(osdc, req, false); 1457/* object_name is assumed to be a non-null pointer and NUL-terminated */
1142 if (ret < 0) 1458
1143 goto done_err; 1459static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1144 1460 u64 offset, u64 length,
1145 if (!rbd_cb) { 1461 enum obj_request_type type)
1146 ret = ceph_osdc_wait_request(osdc, req); 1462{
1147 if (ver) 1463 struct rbd_obj_request *obj_request;
1148 *ver = le64_to_cpu(req->r_reassert_version.version); 1464 size_t size;
1149 dout("reassert_ver=%llu\n", 1465 char *name;
1150 (unsigned long long) 1466
1151 le64_to_cpu(req->r_reassert_version.version)); 1467 rbd_assert(obj_request_type_valid(type));
1152 ceph_osdc_put_request(req); 1468
1469 size = strlen(object_name) + 1;
1470 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1471 if (!obj_request)
1472 return NULL;
1473
1474 name = (char *)(obj_request + 1);
1475 obj_request->object_name = memcpy(name, object_name, size);
1476 obj_request->offset = offset;
1477 obj_request->length = length;
1478 obj_request->which = BAD_WHICH;
1479 obj_request->type = type;
1480 INIT_LIST_HEAD(&obj_request->links);
1481 obj_request_done_init(obj_request);
1482 init_completion(&obj_request->completion);
1483 kref_init(&obj_request->kref);
1484
1485 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1486 offset, length, (int)type, obj_request);
1487
1488 return obj_request;
1489}
1490
1491static void rbd_obj_request_destroy(struct kref *kref)
1492{
1493 struct rbd_obj_request *obj_request;
1494
1495 obj_request = container_of(kref, struct rbd_obj_request, kref);
1496
1497 dout("%s: obj %p\n", __func__, obj_request);
1498
1499 rbd_assert(obj_request->img_request == NULL);
1500 rbd_assert(obj_request->which == BAD_WHICH);
1501
1502 if (obj_request->osd_req)
1503 rbd_osd_req_destroy(obj_request->osd_req);
1504
1505 rbd_assert(obj_request_type_valid(obj_request->type));
1506 switch (obj_request->type) {
1507 case OBJ_REQUEST_NODATA:
1508 break; /* Nothing to do */
1509 case OBJ_REQUEST_BIO:
1510 if (obj_request->bio_list)
1511 bio_chain_put(obj_request->bio_list);
1512 break;
1513 case OBJ_REQUEST_PAGES:
1514 if (obj_request->pages)
1515 ceph_release_page_vector(obj_request->pages,
1516 obj_request->page_count);
1517 break;
1153 } 1518 }
1154 return ret;
1155 1519
1156done_err: 1520 kfree(obj_request);
1157 bio_chain_put(req_data->bio);
1158 ceph_osdc_put_request(req);
1159done_pages:
1160 rbd_coll_end_req(req_data, ret, len);
1161 kfree(req_data);
1162 return ret;
1163} 1521}
1164 1522
1165/* 1523/*
1166 * Ceph osd op callback 1524 * Caller is responsible for filling in the list of object requests
1525 * that comprises the image request, and the Linux request pointer
1526 * (if there is one).
1167 */ 1527 */
1168static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 1528static struct rbd_img_request *rbd_img_request_create(
1169{ 1529 struct rbd_device *rbd_dev,
1170 struct rbd_request *req_data = req->r_priv; 1530 u64 offset, u64 length,
1171 struct ceph_osd_reply_head *replyhead; 1531 bool write_request)
1172 struct ceph_osd_op *op; 1532{
1173 __s32 rc; 1533 struct rbd_img_request *img_request;
1174 u64 bytes; 1534 struct ceph_snap_context *snapc = NULL;
1175 int read_op;
1176
1177 /* parse reply */
1178 replyhead = msg->front.iov_base;
1179 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1180 op = (void *)(replyhead + 1);
1181 rc = le32_to_cpu(replyhead->result);
1182 bytes = le64_to_cpu(op->extent.length);
1183 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1184
1185 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1186 (unsigned long long) bytes, read_op, (int) rc);
1187
1188 if (rc == -ENOENT && read_op) {
1189 zero_bio_chain(req_data->bio, 0);
1190 rc = 0;
1191 } else if (rc == 0 && read_op && bytes < req_data->len) {
1192 zero_bio_chain(req_data->bio, bytes);
1193 bytes = req_data->len;
1194 }
1195 1535
1196 rbd_coll_end_req(req_data, rc, bytes); 1536 img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1537 if (!img_request)
1538 return NULL;
1197 1539
1198 if (req_data->bio) 1540 if (write_request) {
1199 bio_chain_put(req_data->bio); 1541 down_read(&rbd_dev->header_rwsem);
1542 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1543 up_read(&rbd_dev->header_rwsem);
1544 if (WARN_ON(!snapc)) {
1545 kfree(img_request);
1546 return NULL; /* Shouldn't happen */
1547 }
1548 }
1200 1549
1201 ceph_osdc_put_request(req); 1550 img_request->rq = NULL;
1202 kfree(req_data); 1551 img_request->rbd_dev = rbd_dev;
1552 img_request->offset = offset;
1553 img_request->length = length;
1554 img_request->write_request = write_request;
1555 if (write_request)
1556 img_request->snapc = snapc;
1557 else
1558 img_request->snap_id = rbd_dev->spec->snap_id;
1559 spin_lock_init(&img_request->completion_lock);
1560 img_request->next_completion = 0;
1561 img_request->callback = NULL;
1562 img_request->obj_request_count = 0;
1563 INIT_LIST_HEAD(&img_request->obj_requests);
1564 kref_init(&img_request->kref);
1565
1566 rbd_img_request_get(img_request); /* Avoid a warning */
1567 rbd_img_request_put(img_request); /* TEMPORARY */
1568
1569 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1570 write_request ? "write" : "read", offset, length,
1571 img_request);
1572
1573 return img_request;
1203} 1574}
1204 1575
1205static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 1576static void rbd_img_request_destroy(struct kref *kref)
1206{ 1577{
1207 ceph_osdc_put_request(req); 1578 struct rbd_img_request *img_request;
1579 struct rbd_obj_request *obj_request;
1580 struct rbd_obj_request *next_obj_request;
1581
1582 img_request = container_of(kref, struct rbd_img_request, kref);
1583
1584 dout("%s: img %p\n", __func__, img_request);
1585
1586 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1587 rbd_img_obj_request_del(img_request, obj_request);
1588 rbd_assert(img_request->obj_request_count == 0);
1589
1590 if (img_request->write_request)
1591 ceph_put_snap_context(img_request->snapc);
1592
1593 kfree(img_request);
1208} 1594}
1209 1595
1210/* 1596static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1211 * Do a synchronous ceph osd operation 1597 struct bio *bio_list)
1212 */
1213static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1214 struct ceph_snap_context *snapc,
1215 u64 snapid,
1216 int flags,
1217 struct ceph_osd_req_op *ops,
1218 const char *object_name,
1219 u64 ofs, u64 inbound_size,
1220 char *inbound,
1221 struct ceph_osd_request **linger_req,
1222 u64 *ver)
1223{ 1598{
1224 int ret; 1599 struct rbd_device *rbd_dev = img_request->rbd_dev;
1225 struct page **pages; 1600 struct rbd_obj_request *obj_request = NULL;
1226 int num_pages; 1601 struct rbd_obj_request *next_obj_request;
1227 1602 unsigned int bio_offset;
1228 rbd_assert(ops != NULL); 1603 u64 image_offset;
1604 u64 resid;
1605 u16 opcode;
1606
1607 dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
1608
1609 opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
1610 : CEPH_OSD_OP_READ;
1611 bio_offset = 0;
1612 image_offset = img_request->offset;
1613 rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1614 resid = img_request->length;
1615 rbd_assert(resid > 0);
1616 while (resid) {
1617 const char *object_name;
1618 unsigned int clone_size;
1619 struct ceph_osd_req_op *op;
1620 u64 offset;
1621 u64 length;
1622
1623 object_name = rbd_segment_name(rbd_dev, image_offset);
1624 if (!object_name)
1625 goto out_unwind;
1626 offset = rbd_segment_offset(rbd_dev, image_offset);
1627 length = rbd_segment_length(rbd_dev, image_offset, resid);
1628 obj_request = rbd_obj_request_create(object_name,
1629 offset, length,
1630 OBJ_REQUEST_BIO);
1631 kfree(object_name); /* object request has its own copy */
1632 if (!obj_request)
1633 goto out_unwind;
1634
1635 rbd_assert(length <= (u64) UINT_MAX);
1636 clone_size = (unsigned int) length;
1637 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1638 &bio_offset, clone_size,
1639 GFP_ATOMIC);
1640 if (!obj_request->bio_list)
1641 goto out_partial;
1229 1642
1230 num_pages = calc_pages_for(ofs, inbound_size); 1643 /*
1231 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1644 * Build up the op to use in building the osd
1232 if (IS_ERR(pages)) 1645 * request. Note that the contents of the op are
1233 return PTR_ERR(pages); 1646 * copied by rbd_osd_req_create().
1647 */
1648 op = rbd_osd_req_op_create(opcode, offset, length);
1649 if (!op)
1650 goto out_partial;
1651 obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1652 img_request->write_request,
1653 obj_request, op);
1654 rbd_osd_req_op_destroy(op);
1655 if (!obj_request->osd_req)
1656 goto out_partial;
1657 /* status and version are initially zero-filled */
1658
1659 rbd_img_obj_request_add(img_request, obj_request);
1660
1661 image_offset += length;
1662 resid -= length;
1663 }
1234 1664
1235 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, 1665 return 0;
1236 object_name, ofs, inbound_size, NULL,
1237 pages, num_pages,
1238 flags,
1239 ops,
1240 NULL, 0,
1241 NULL,
1242 linger_req, ver);
1243 if (ret < 0)
1244 goto done;
1245 1666
1246 if ((flags & CEPH_OSD_FLAG_READ) && inbound) 1667out_partial:
1247 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); 1668 rbd_obj_request_put(obj_request);
1669out_unwind:
1670 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1671 rbd_obj_request_put(obj_request);
1248 1672
1249done: 1673 return -ENOMEM;
1250 ceph_release_page_vector(pages, num_pages);
1251 return ret;
1252} 1674}
1253 1675
1254/* 1676static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1255 * Do an asynchronous ceph osd operation 1677{
1256 */ 1678 struct rbd_img_request *img_request;
1257static int rbd_do_op(struct request *rq, 1679 u32 which = obj_request->which;
1258 struct rbd_device *rbd_dev, 1680 bool more = true;
1259 struct ceph_snap_context *snapc, 1681
1260 u64 ofs, u64 len, 1682 img_request = obj_request->img_request;
1261 struct bio *bio, 1683
1262 struct rbd_req_coll *coll, 1684 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1263 int coll_index) 1685 rbd_assert(img_request != NULL);
1264{ 1686 rbd_assert(img_request->rq != NULL);
1265 char *seg_name; 1687 rbd_assert(img_request->obj_request_count > 0);
1266 u64 seg_ofs; 1688 rbd_assert(which != BAD_WHICH);
1267 u64 seg_len; 1689 rbd_assert(which < img_request->obj_request_count);
1268 int ret; 1690 rbd_assert(which >= img_request->next_completion);
1269 struct ceph_osd_req_op *ops; 1691
1270 u32 payload_len; 1692 spin_lock_irq(&img_request->completion_lock);
1271 int opcode; 1693 if (which != img_request->next_completion)
1272 int flags; 1694 goto out;
1273 u64 snapid; 1695
1274 1696 for_each_obj_request_from(img_request, obj_request) {
1275 seg_name = rbd_segment_name(rbd_dev, ofs); 1697 unsigned int xferred;
1276 if (!seg_name) 1698 int result;
1277 return -ENOMEM; 1699
1278 seg_len = rbd_segment_length(rbd_dev, ofs, len); 1700 rbd_assert(more);
1279 seg_ofs = rbd_segment_offset(rbd_dev, ofs); 1701 rbd_assert(which < img_request->obj_request_count);
1280 1702
1281 if (rq_data_dir(rq) == WRITE) { 1703 if (!obj_request_done_test(obj_request))
1282 opcode = CEPH_OSD_OP_WRITE; 1704 break;
1283 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; 1705
1284 snapid = CEPH_NOSNAP; 1706 rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
1285 payload_len = seg_len; 1707 xferred = (unsigned int) obj_request->xferred;
1286 } else { 1708 result = (int) obj_request->result;
1287 opcode = CEPH_OSD_OP_READ; 1709 if (result)
1288 flags = CEPH_OSD_FLAG_READ; 1710 rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
1289 snapc = NULL; 1711 img_request->write_request ? "write" : "read",
1290 snapid = rbd_dev->spec->snap_id; 1712 result, xferred);
1291 payload_len = 0; 1713
1714 more = blk_end_request(img_request->rq, result, xferred);
1715 which++;
1292 } 1716 }
1293 1717
1294 ret = -ENOMEM; 1718 rbd_assert(more ^ (which == img_request->obj_request_count));
1295 ops = rbd_create_rw_ops(1, opcode, payload_len); 1719 img_request->next_completion = which;
1296 if (!ops) 1720out:
1297 goto done; 1721 spin_unlock_irq(&img_request->completion_lock);
1298 1722
1299 /* we've taken care of segment sizes earlier when we 1723 if (!more)
1300 cloned the bios. We should never have a segment 1724 rbd_img_request_complete(img_request);
1301 truncated at this point */
1302 rbd_assert(seg_len == len);
1303
1304 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1305 seg_name, seg_ofs, seg_len,
1306 bio,
1307 NULL, 0,
1308 flags,
1309 ops,
1310 coll, coll_index,
1311 rbd_req_cb, 0, NULL);
1312
1313 rbd_destroy_ops(ops);
1314done:
1315 kfree(seg_name);
1316 return ret;
1317} 1725}
1318 1726
1319/* 1727static int rbd_img_request_submit(struct rbd_img_request *img_request)
1320 * Request sync osd read 1728{
1321 */ 1729 struct rbd_device *rbd_dev = img_request->rbd_dev;
1322static int rbd_req_sync_read(struct rbd_device *rbd_dev, 1730 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1323 u64 snapid, 1731 struct rbd_obj_request *obj_request;
1324 const char *object_name,
1325 u64 ofs, u64 len,
1326 char *buf,
1327 u64 *ver)
1328{
1329 struct ceph_osd_req_op *ops;
1330 int ret;
1331 1732
1332 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); 1733 dout("%s: img %p\n", __func__, img_request);
1333 if (!ops) 1734 for_each_obj_request(img_request, obj_request) {
1334 return -ENOMEM; 1735 int ret;
1335 1736
1336 ret = rbd_req_sync_op(rbd_dev, NULL, 1737 obj_request->callback = rbd_img_obj_callback;
1337 snapid, 1738 ret = rbd_obj_request_submit(osdc, obj_request);
1338 CEPH_OSD_FLAG_READ, 1739 if (ret)
1339 ops, object_name, ofs, len, buf, NULL, ver); 1740 return ret;
1340 rbd_destroy_ops(ops); 1741 /*
1742 * The image request has its own reference to each
1743 * of its object requests, so we can safely drop the
1744 * initial one here.
1745 */
1746 rbd_obj_request_put(obj_request);
1747 }
1341 1748
1342 return ret; 1749 return 0;
1343} 1750}
1344 1751
1345/* 1752static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1346 * Request sync osd watch 1753 u64 ver, u64 notify_id)
1347 */
1348static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
1349 u64 ver,
1350 u64 notify_id)
1351{ 1754{
1352 struct ceph_osd_req_op *ops; 1755 struct rbd_obj_request *obj_request;
1756 struct ceph_osd_req_op *op;
1757 struct ceph_osd_client *osdc;
1353 int ret; 1758 int ret;
1354 1759
1355 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); 1760 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1356 if (!ops) 1761 OBJ_REQUEST_NODATA);
1762 if (!obj_request)
1357 return -ENOMEM; 1763 return -ENOMEM;
1358 1764
1359 ops[0].watch.ver = cpu_to_le64(ver); 1765 ret = -ENOMEM;
1360 ops[0].watch.cookie = notify_id; 1766 op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1361 ops[0].watch.flag = 0; 1767 if (!op)
1768 goto out;
1769 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1770 obj_request, op);
1771 rbd_osd_req_op_destroy(op);
1772 if (!obj_request->osd_req)
1773 goto out;
1362 1774
1363 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, 1775 osdc = &rbd_dev->rbd_client->client->osdc;
1364 rbd_dev->header_name, 0, 0, NULL, 1776 obj_request->callback = rbd_obj_request_put;
1365 NULL, 0, 1777 ret = rbd_obj_request_submit(osdc, obj_request);
1366 CEPH_OSD_FLAG_READ, 1778out:
1367 ops, 1779 if (ret)
1368 NULL, 0, 1780 rbd_obj_request_put(obj_request);
1369 rbd_simple_req_cb, 0, NULL);
1370 1781
1371 rbd_destroy_ops(ops);
1372 return ret; 1782 return ret;
1373} 1783}
1374 1784
@@ -1381,95 +1791,103 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1381 if (!rbd_dev) 1791 if (!rbd_dev)
1382 return; 1792 return;
1383 1793
1384 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", 1794 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1385 rbd_dev->header_name, (unsigned long long) notify_id, 1795 rbd_dev->header_name, (unsigned long long) notify_id,
1386 (unsigned int) opcode); 1796 (unsigned int) opcode);
1387 rc = rbd_dev_refresh(rbd_dev, &hver); 1797 rc = rbd_dev_refresh(rbd_dev, &hver);
1388 if (rc) 1798 if (rc)
1389 pr_warning(RBD_DRV_NAME "%d got notification but failed to " 1799 rbd_warn(rbd_dev, "got notification but failed to "
1390 " update snaps: %d\n", rbd_dev->major, rc); 1800 " update snaps: %d\n", rc);
1391 1801
1392 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); 1802 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1393} 1803}
1394 1804
1395/* 1805/*
1396 * Request sync osd watch 1806 * Request sync osd watch/unwatch. The value of "start" determines
1807 * whether a watch request is being initiated or torn down.
1397 */ 1808 */
1398static int rbd_req_sync_watch(struct rbd_device *rbd_dev) 1809static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1399{ 1810{
1400 struct ceph_osd_req_op *ops;
1401 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1811 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1812 struct rbd_obj_request *obj_request;
1813 struct ceph_osd_req_op *op;
1402 int ret; 1814 int ret;
1403 1815
1404 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 1816 rbd_assert(start ^ !!rbd_dev->watch_event);
1405 if (!ops) 1817 rbd_assert(start ^ !!rbd_dev->watch_request);
1406 return -ENOMEM;
1407 1818
1408 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, 1819 if (start) {
1409 (void *)rbd_dev, &rbd_dev->watch_event); 1820 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
1410 if (ret < 0) 1821 &rbd_dev->watch_event);
1411 goto fail; 1822 if (ret < 0)
1823 return ret;
1824 rbd_assert(rbd_dev->watch_event != NULL);
1825 }
1412 1826
1413 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); 1827 ret = -ENOMEM;
1414 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 1828 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1415 ops[0].watch.flag = 1; 1829 OBJ_REQUEST_NODATA);
1830 if (!obj_request)
1831 goto out_cancel;
1832
1833 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1834 rbd_dev->watch_event->cookie,
1835 rbd_dev->header.obj_version, start);
1836 if (!op)
1837 goto out_cancel;
1838 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
1839 obj_request, op);
1840 rbd_osd_req_op_destroy(op);
1841 if (!obj_request->osd_req)
1842 goto out_cancel;
1843
1844 if (start)
1845 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
1846 else
1847 ceph_osdc_unregister_linger_request(osdc,
1848 rbd_dev->watch_request->osd_req);
1849 ret = rbd_obj_request_submit(osdc, obj_request);
1850 if (ret)
1851 goto out_cancel;
1852 ret = rbd_obj_request_wait(obj_request);
1853 if (ret)
1854 goto out_cancel;
1855 ret = obj_request->result;
1856 if (ret)
1857 goto out_cancel;
1416 1858
1417 ret = rbd_req_sync_op(rbd_dev, NULL, 1859 /*
1418 CEPH_NOSNAP, 1860 * A watch request is set to linger, so the underlying osd
1419 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 1861 * request won't go away until we unregister it. We retain
1420 ops, 1862 * a pointer to the object request during that time (in
1421 rbd_dev->header_name, 1863 * rbd_dev->watch_request), so we'll keep a reference to
1422 0, 0, NULL, 1864 * it. We'll drop that reference (below) after we've
1423 &rbd_dev->watch_request, NULL); 1865 * unregistered it.
1866 */
1867 if (start) {
1868 rbd_dev->watch_request = obj_request;
1424 1869
1425 if (ret < 0) 1870 return 0;
1426 goto fail_event; 1871 }
1427 1872
1428 rbd_destroy_ops(ops); 1873 /* We have successfully torn down the watch request */
1429 return 0;
1430 1874
1431fail_event: 1875 rbd_obj_request_put(rbd_dev->watch_request);
1876 rbd_dev->watch_request = NULL;
1877out_cancel:
1878 /* Cancel the event if we're tearing down, or on error */
1432 ceph_osdc_cancel_event(rbd_dev->watch_event); 1879 ceph_osdc_cancel_event(rbd_dev->watch_event);
1433 rbd_dev->watch_event = NULL; 1880 rbd_dev->watch_event = NULL;
1434fail: 1881 if (obj_request)
1435 rbd_destroy_ops(ops); 1882 rbd_obj_request_put(obj_request);
1436 return ret;
1437}
1438 1883
1439/*
1440 * Request sync osd unwatch
1441 */
1442static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
1443{
1444 struct ceph_osd_req_op *ops;
1445 int ret;
1446
1447 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1448 if (!ops)
1449 return -ENOMEM;
1450
1451 ops[0].watch.ver = 0;
1452 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
1453 ops[0].watch.flag = 0;
1454
1455 ret = rbd_req_sync_op(rbd_dev, NULL,
1456 CEPH_NOSNAP,
1457 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1458 ops,
1459 rbd_dev->header_name,
1460 0, 0, NULL, NULL, NULL);
1461
1462
1463 rbd_destroy_ops(ops);
1464 ceph_osdc_cancel_event(rbd_dev->watch_event);
1465 rbd_dev->watch_event = NULL;
1466 return ret; 1884 return ret;
1467} 1885}
1468 1886
1469/* 1887/*
1470 * Synchronous osd object method call 1888 * Synchronous osd object method call
1471 */ 1889 */
1472static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1890static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1473 const char *object_name, 1891 const char *object_name,
1474 const char *class_name, 1892 const char *class_name,
1475 const char *method_name, 1893 const char *method_name,
@@ -1477,169 +1895,154 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1477 size_t outbound_size, 1895 size_t outbound_size,
1478 char *inbound, 1896 char *inbound,
1479 size_t inbound_size, 1897 size_t inbound_size,
1480 int flags, 1898 u64 *version)
1481 u64 *ver)
1482{ 1899{
1483 struct ceph_osd_req_op *ops; 1900 struct rbd_obj_request *obj_request;
1484 int class_name_len = strlen(class_name); 1901 struct ceph_osd_client *osdc;
1485 int method_name_len = strlen(method_name); 1902 struct ceph_osd_req_op *op;
1486 int payload_size; 1903 struct page **pages;
1904 u32 page_count;
1487 int ret; 1905 int ret;
1488 1906
1489 /* 1907 /*
1490 * Any input parameters required by the method we're calling 1908 * Method calls are ultimately read operations but they
1491 * will be sent along with the class and method names as 1909 * don't involve object data (so no offset or length).
1492 * part of the message payload. That data and its size are 1910 * The result should placed into the inbound buffer
1493 * supplied via the indata and indata_len fields (named from 1911 * provided. They also supply outbound data--parameters for
1494 * the perspective of the server side) in the OSD request 1912 * the object method. Currently if this is present it will
1495 * operation. 1913 * be a snapshot id.
1496 */ 1914 */
1497 payload_size = class_name_len + method_name_len + outbound_size; 1915 page_count = (u32) calc_pages_for(0, inbound_size);
1498 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); 1916 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1499 if (!ops) 1917 if (IS_ERR(pages))
1500 return -ENOMEM; 1918 return PTR_ERR(pages);
1501 1919
1502 ops[0].cls.class_name = class_name; 1920 ret = -ENOMEM;
1503 ops[0].cls.class_len = (__u8) class_name_len; 1921 obj_request = rbd_obj_request_create(object_name, 0, 0,
1504 ops[0].cls.method_name = method_name; 1922 OBJ_REQUEST_PAGES);
1505 ops[0].cls.method_len = (__u8) method_name_len; 1923 if (!obj_request)
1506 ops[0].cls.argc = 0; 1924 goto out;
1507 ops[0].cls.indata = outbound;
1508 ops[0].cls.indata_len = outbound_size;
1509 1925
1510 ret = rbd_req_sync_op(rbd_dev, NULL, 1926 obj_request->pages = pages;
1511 CEPH_NOSNAP, 1927 obj_request->page_count = page_count;
1512 flags, ops,
1513 object_name, 0, inbound_size, inbound,
1514 NULL, ver);
1515 1928
1516 rbd_destroy_ops(ops); 1929 op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
1930 method_name, outbound, outbound_size);
1931 if (!op)
1932 goto out;
1933 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1934 obj_request, op);
1935 rbd_osd_req_op_destroy(op);
1936 if (!obj_request->osd_req)
1937 goto out;
1517 1938
1518 dout("cls_exec returned %d\n", ret); 1939 osdc = &rbd_dev->rbd_client->client->osdc;
1519 return ret; 1940 ret = rbd_obj_request_submit(osdc, obj_request);
1520} 1941 if (ret)
1942 goto out;
1943 ret = rbd_obj_request_wait(obj_request);
1944 if (ret)
1945 goto out;
1521 1946
1522static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) 1947 ret = obj_request->result;
1523{ 1948 if (ret < 0)
1524 struct rbd_req_coll *coll = 1949 goto out;
1525 kzalloc(sizeof(struct rbd_req_coll) + 1950 ret = 0;
1526 sizeof(struct rbd_req_status) * num_reqs, 1951 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
1527 GFP_ATOMIC); 1952 if (version)
1953 *version = obj_request->version;
1954out:
1955 if (obj_request)
1956 rbd_obj_request_put(obj_request);
1957 else
1958 ceph_release_page_vector(pages, page_count);
1528 1959
1529 if (!coll) 1960 return ret;
1530 return NULL;
1531 coll->total = num_reqs;
1532 kref_init(&coll->kref);
1533 return coll;
1534} 1961}
1535 1962
1536/* 1963static void rbd_request_fn(struct request_queue *q)
1537 * block device queue callback 1964 __releases(q->queue_lock) __acquires(q->queue_lock)
1538 */
1539static void rbd_rq_fn(struct request_queue *q)
1540{ 1965{
1541 struct rbd_device *rbd_dev = q->queuedata; 1966 struct rbd_device *rbd_dev = q->queuedata;
1967 bool read_only = rbd_dev->mapping.read_only;
1542 struct request *rq; 1968 struct request *rq;
1969 int result;
1543 1970
1544 while ((rq = blk_fetch_request(q))) { 1971 while ((rq = blk_fetch_request(q))) {
1545 struct bio *bio; 1972 bool write_request = rq_data_dir(rq) == WRITE;
1546 bool do_write; 1973 struct rbd_img_request *img_request;
1547 unsigned int size; 1974 u64 offset;
1548 u64 ofs; 1975 u64 length;
1549 int num_segs, cur_seg = 0; 1976
1550 struct rbd_req_coll *coll; 1977 /* Ignore any non-FS requests that filter through. */
1551 struct ceph_snap_context *snapc;
1552 unsigned int bio_offset;
1553
1554 dout("fetched request\n");
1555
1556 /* filter out block requests we don't understand */
1557 if ((rq->cmd_type != REQ_TYPE_FS)) {
1558 __blk_end_request_all(rq, 0);
1559 continue;
1560 }
1561 1978
1562 /* deduce our operation (read, write) */ 1979 if (rq->cmd_type != REQ_TYPE_FS) {
1563 do_write = (rq_data_dir(rq) == WRITE); 1980 dout("%s: non-fs request type %d\n", __func__,
1564 if (do_write && rbd_dev->mapping.read_only) { 1981 (int) rq->cmd_type);
1565 __blk_end_request_all(rq, -EROFS); 1982 __blk_end_request_all(rq, 0);
1566 continue; 1983 continue;
1567 } 1984 }
1568 1985
1569 spin_unlock_irq(q->queue_lock); 1986 /* Ignore/skip any zero-length requests */
1570 1987
1571 down_read(&rbd_dev->header_rwsem); 1988 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
1989 length = (u64) blk_rq_bytes(rq);
1572 1990
1573 if (!rbd_dev->exists) { 1991 if (!length) {
1574 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 1992 dout("%s: zero-length request\n", __func__);
1575 up_read(&rbd_dev->header_rwsem); 1993 __blk_end_request_all(rq, 0);
1576 dout("request for non-existent snapshot");
1577 spin_lock_irq(q->queue_lock);
1578 __blk_end_request_all(rq, -ENXIO);
1579 continue; 1994 continue;
1580 } 1995 }
1581 1996
1582 snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1997 spin_unlock_irq(q->queue_lock);
1583
1584 up_read(&rbd_dev->header_rwsem);
1585
1586 size = blk_rq_bytes(rq);
1587 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1588 bio = rq->bio;
1589 1998
1590 dout("%s 0x%x bytes at 0x%llx\n", 1999 /* Disallow writes to a read-only device */
1591 do_write ? "write" : "read",
1592 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1593 2000
1594 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 2001 if (write_request) {
1595 if (num_segs <= 0) { 2002 result = -EROFS;
1596 spin_lock_irq(q->queue_lock); 2003 if (read_only)
1597 __blk_end_request_all(rq, num_segs); 2004 goto end_request;
1598 ceph_put_snap_context(snapc); 2005 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
1599 continue;
1600 } 2006 }
1601 coll = rbd_alloc_coll(num_segs);
1602 if (!coll) {
1603 spin_lock_irq(q->queue_lock);
1604 __blk_end_request_all(rq, -ENOMEM);
1605 ceph_put_snap_context(snapc);
1606 continue;
1607 }
1608
1609 bio_offset = 0;
1610 do {
1611 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1612 unsigned int chain_size;
1613 struct bio *bio_chain;
1614
1615 BUG_ON(limit > (u64) UINT_MAX);
1616 chain_size = (unsigned int) limit;
1617 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
1618 2007
1619 kref_get(&coll->kref); 2008 /*
2009 * Quit early if the mapped snapshot no longer
2010 * exists. It's still possible the snapshot will
2011 * have disappeared by the time our request arrives
2012 * at the osd, but there's no sense in sending it if
2013 * we already know.
2014 */
2015 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2016 dout("request for non-existent snapshot");
2017 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2018 result = -ENXIO;
2019 goto end_request;
2020 }
1620 2021
1621 /* Pass a cloned bio chain via an osd request */ 2022 result = -EINVAL;
2023 if (WARN_ON(offset && length > U64_MAX - offset + 1))
2024 goto end_request; /* Shouldn't happen */
1622 2025
1623 bio_chain = bio_chain_clone_range(&bio, 2026 result = -ENOMEM;
1624 &bio_offset, chain_size, 2027 img_request = rbd_img_request_create(rbd_dev, offset, length,
1625 GFP_ATOMIC); 2028 write_request);
1626 if (bio_chain) 2029 if (!img_request)
1627 (void) rbd_do_op(rq, rbd_dev, snapc, 2030 goto end_request;
1628 ofs, chain_size,
1629 bio_chain, coll, cur_seg);
1630 else
1631 rbd_coll_end_req_index(rq, coll, cur_seg,
1632 -ENOMEM, chain_size);
1633 size -= chain_size;
1634 ofs += chain_size;
1635 2031
1636 cur_seg++; 2032 img_request->rq = rq;
1637 } while (size > 0);
1638 kref_put(&coll->kref, rbd_coll_release);
1639 2033
2034 result = rbd_img_request_fill_bio(img_request, rq->bio);
2035 if (!result)
2036 result = rbd_img_request_submit(img_request);
2037 if (result)
2038 rbd_img_request_put(img_request);
2039end_request:
1640 spin_lock_irq(q->queue_lock); 2040 spin_lock_irq(q->queue_lock);
1641 2041 if (result < 0) {
1642 ceph_put_snap_context(snapc); 2042 rbd_warn(rbd_dev, "obj_request %s result %d\n",
2043 write_request ? "write" : "read", result);
2044 __blk_end_request_all(rq, result);
2045 }
1643 } 2046 }
1644} 2047}
1645 2048
@@ -1703,6 +2106,71 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
1703 put_disk(disk); 2106 put_disk(disk);
1704} 2107}
1705 2108
2109static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2110 const char *object_name,
2111 u64 offset, u64 length,
2112 char *buf, u64 *version)
2113
2114{
2115 struct ceph_osd_req_op *op;
2116 struct rbd_obj_request *obj_request;
2117 struct ceph_osd_client *osdc;
2118 struct page **pages = NULL;
2119 u32 page_count;
2120 size_t size;
2121 int ret;
2122
2123 page_count = (u32) calc_pages_for(offset, length);
2124 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2125 if (IS_ERR(pages))
2126 ret = PTR_ERR(pages);
2127
2128 ret = -ENOMEM;
2129 obj_request = rbd_obj_request_create(object_name, offset, length,
2130 OBJ_REQUEST_PAGES);
2131 if (!obj_request)
2132 goto out;
2133
2134 obj_request->pages = pages;
2135 obj_request->page_count = page_count;
2136
2137 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2138 if (!op)
2139 goto out;
2140 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2141 obj_request, op);
2142 rbd_osd_req_op_destroy(op);
2143 if (!obj_request->osd_req)
2144 goto out;
2145
2146 osdc = &rbd_dev->rbd_client->client->osdc;
2147 ret = rbd_obj_request_submit(osdc, obj_request);
2148 if (ret)
2149 goto out;
2150 ret = rbd_obj_request_wait(obj_request);
2151 if (ret)
2152 goto out;
2153
2154 ret = obj_request->result;
2155 if (ret < 0)
2156 goto out;
2157
2158 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2159 size = (size_t) obj_request->xferred;
2160 ceph_copy_from_page_vector(pages, buf, 0, size);
2161 rbd_assert(size <= (size_t) INT_MAX);
2162 ret = (int) size;
2163 if (version)
2164 *version = obj_request->version;
2165out:
2166 if (obj_request)
2167 rbd_obj_request_put(obj_request);
2168 else
2169 ceph_release_page_vector(pages, page_count);
2170
2171 return ret;
2172}
2173
1706/* 2174/*
1707 * Read the complete header for the given rbd device. 2175 * Read the complete header for the given rbd device.
1708 * 2176 *
@@ -1741,24 +2209,20 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1741 if (!ondisk) 2209 if (!ondisk)
1742 return ERR_PTR(-ENOMEM); 2210 return ERR_PTR(-ENOMEM);
1743 2211
1744 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, 2212 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
1745 rbd_dev->header_name,
1746 0, size, 2213 0, size,
1747 (char *) ondisk, version); 2214 (char *) ondisk, version);
1748
1749 if (ret < 0) 2215 if (ret < 0)
1750 goto out_err; 2216 goto out_err;
1751 if (WARN_ON((size_t) ret < size)) { 2217 if (WARN_ON((size_t) ret < size)) {
1752 ret = -ENXIO; 2218 ret = -ENXIO;
1753 pr_warning("short header read for image %s" 2219 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
1754 " (want %zd got %d)\n", 2220 size, ret);
1755 rbd_dev->spec->image_name, size, ret);
1756 goto out_err; 2221 goto out_err;
1757 } 2222 }
1758 if (!rbd_dev_ondisk_valid(ondisk)) { 2223 if (!rbd_dev_ondisk_valid(ondisk)) {
1759 ret = -ENXIO; 2224 ret = -ENXIO;
1760 pr_warning("invalid header for image %s\n", 2225 rbd_warn(rbd_dev, "invalid header");
1761 rbd_dev->spec->image_name);
1762 goto out_err; 2226 goto out_err;
1763 } 2227 }
1764 2228
@@ -1895,8 +2359,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
1895 disk->fops = &rbd_bd_ops; 2359 disk->fops = &rbd_bd_ops;
1896 disk->private_data = rbd_dev; 2360 disk->private_data = rbd_dev;
1897 2361
1898 /* init rq */ 2362 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
1899 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1900 if (!q) 2363 if (!q)
1901 goto out_disk; 2364 goto out_disk;
1902 2365
@@ -2233,7 +2696,7 @@ static void rbd_spec_free(struct kref *kref)
2233 kfree(spec); 2696 kfree(spec);
2234} 2697}
2235 2698
2236struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 2699static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2237 struct rbd_spec *spec) 2700 struct rbd_spec *spec)
2238{ 2701{
2239 struct rbd_device *rbd_dev; 2702 struct rbd_device *rbd_dev;
@@ -2243,6 +2706,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2243 return NULL; 2706 return NULL;
2244 2707
2245 spin_lock_init(&rbd_dev->lock); 2708 spin_lock_init(&rbd_dev->lock);
2709 rbd_dev->flags = 0;
2246 INIT_LIST_HEAD(&rbd_dev->node); 2710 INIT_LIST_HEAD(&rbd_dev->node);
2247 INIT_LIST_HEAD(&rbd_dev->snaps); 2711 INIT_LIST_HEAD(&rbd_dev->snaps);
2248 init_rwsem(&rbd_dev->header_rwsem); 2712 init_rwsem(&rbd_dev->header_rwsem);
@@ -2250,6 +2714,13 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2250 rbd_dev->spec = spec; 2714 rbd_dev->spec = spec;
2251 rbd_dev->rbd_client = rbdc; 2715 rbd_dev->rbd_client = rbdc;
2252 2716
2717 /* Initialize the layout used for all rbd requests */
2718
2719 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2720 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2721 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2722 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2723
2253 return rbd_dev; 2724 return rbd_dev;
2254} 2725}
2255 2726
@@ -2360,12 +2831,11 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2360 __le64 size; 2831 __le64 size;
2361 } __attribute__ ((packed)) size_buf = { 0 }; 2832 } __attribute__ ((packed)) size_buf = { 0 };
2362 2833
2363 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2834 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2364 "rbd", "get_size", 2835 "rbd", "get_size",
2365 (char *) &snapid, sizeof (snapid), 2836 (char *) &snapid, sizeof (snapid),
2366 (char *) &size_buf, sizeof (size_buf), 2837 (char *) &size_buf, sizeof (size_buf), NULL);
2367 CEPH_OSD_FLAG_READ, NULL); 2838 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2368 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2369 if (ret < 0) 2839 if (ret < 0)
2370 return ret; 2840 return ret;
2371 2841
@@ -2396,15 +2866,13 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2396 if (!reply_buf) 2866 if (!reply_buf)
2397 return -ENOMEM; 2867 return -ENOMEM;
2398 2868
2399 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2869 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2400 "rbd", "get_object_prefix", 2870 "rbd", "get_object_prefix",
2401 NULL, 0, 2871 NULL, 0,
2402 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, 2872 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
2403 CEPH_OSD_FLAG_READ, NULL); 2873 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2404 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2405 if (ret < 0) 2874 if (ret < 0)
2406 goto out; 2875 goto out;
2407 ret = 0; /* rbd_req_sync_exec() can return positive */
2408 2876
2409 p = reply_buf; 2877 p = reply_buf;
2410 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 2878 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
@@ -2435,12 +2903,12 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2435 u64 incompat; 2903 u64 incompat;
2436 int ret; 2904 int ret;
2437 2905
2438 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2906 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2439 "rbd", "get_features", 2907 "rbd", "get_features",
2440 (char *) &snapid, sizeof (snapid), 2908 (char *) &snapid, sizeof (snapid),
2441 (char *) &features_buf, sizeof (features_buf), 2909 (char *) &features_buf, sizeof (features_buf),
2442 CEPH_OSD_FLAG_READ, NULL); 2910 NULL);
2443 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2911 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2444 if (ret < 0) 2912 if (ret < 0)
2445 return ret; 2913 return ret;
2446 2914
@@ -2474,7 +2942,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2474 void *end; 2942 void *end;
2475 char *image_id; 2943 char *image_id;
2476 u64 overlap; 2944 u64 overlap;
2477 size_t len = 0;
2478 int ret; 2945 int ret;
2479 2946
2480 parent_spec = rbd_spec_alloc(); 2947 parent_spec = rbd_spec_alloc();
@@ -2492,12 +2959,11 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2492 } 2959 }
2493 2960
2494 snapid = cpu_to_le64(CEPH_NOSNAP); 2961 snapid = cpu_to_le64(CEPH_NOSNAP);
2495 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2962 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2496 "rbd", "get_parent", 2963 "rbd", "get_parent",
2497 (char *) &snapid, sizeof (snapid), 2964 (char *) &snapid, sizeof (snapid),
2498 (char *) reply_buf, size, 2965 (char *) reply_buf, size, NULL);
2499 CEPH_OSD_FLAG_READ, NULL); 2966 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2500 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2501 if (ret < 0) 2967 if (ret < 0)
2502 goto out_err; 2968 goto out_err;
2503 2969
@@ -2508,13 +2974,18 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2508 if (parent_spec->pool_id == CEPH_NOPOOL) 2974 if (parent_spec->pool_id == CEPH_NOPOOL)
2509 goto out; /* No parent? No problem. */ 2975 goto out; /* No parent? No problem. */
2510 2976
2511 image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 2977 /* The ceph file layout needs to fit pool id in 32 bits */
2978
2979 ret = -EIO;
2980 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2981 goto out;
2982
2983 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2512 if (IS_ERR(image_id)) { 2984 if (IS_ERR(image_id)) {
2513 ret = PTR_ERR(image_id); 2985 ret = PTR_ERR(image_id);
2514 goto out_err; 2986 goto out_err;
2515 } 2987 }
2516 parent_spec->image_id = image_id; 2988 parent_spec->image_id = image_id;
2517 parent_spec->image_id_len = len;
2518 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 2989 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2519 ceph_decode_64_safe(&p, end, overlap, out_err); 2990 ceph_decode_64_safe(&p, end, overlap, out_err);
2520 2991
@@ -2544,26 +3015,25 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2544 3015
2545 rbd_assert(!rbd_dev->spec->image_name); 3016 rbd_assert(!rbd_dev->spec->image_name);
2546 3017
2547 image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len; 3018 len = strlen(rbd_dev->spec->image_id);
3019 image_id_size = sizeof (__le32) + len;
2548 image_id = kmalloc(image_id_size, GFP_KERNEL); 3020 image_id = kmalloc(image_id_size, GFP_KERNEL);
2549 if (!image_id) 3021 if (!image_id)
2550 return NULL; 3022 return NULL;
2551 3023
2552 p = image_id; 3024 p = image_id;
2553 end = (char *) image_id + image_id_size; 3025 end = (char *) image_id + image_id_size;
2554 ceph_encode_string(&p, end, rbd_dev->spec->image_id, 3026 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
2555 (u32) rbd_dev->spec->image_id_len);
2556 3027
2557 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 3028 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2558 reply_buf = kmalloc(size, GFP_KERNEL); 3029 reply_buf = kmalloc(size, GFP_KERNEL);
2559 if (!reply_buf) 3030 if (!reply_buf)
2560 goto out; 3031 goto out;
2561 3032
2562 ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, 3033 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
2563 "rbd", "dir_get_name", 3034 "rbd", "dir_get_name",
2564 image_id, image_id_size, 3035 image_id, image_id_size,
2565 (char *) reply_buf, size, 3036 (char *) reply_buf, size, NULL);
2566 CEPH_OSD_FLAG_READ, NULL);
2567 if (ret < 0) 3037 if (ret < 0)
2568 goto out; 3038 goto out;
2569 p = reply_buf; 3039 p = reply_buf;
@@ -2602,8 +3072,11 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2602 3072
2603 osdc = &rbd_dev->rbd_client->client->osdc; 3073 osdc = &rbd_dev->rbd_client->client->osdc;
2604 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); 3074 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2605 if (!name) 3075 if (!name) {
2606 return -EIO; /* pool id too large (>= 2^31) */ 3076 rbd_warn(rbd_dev, "there is no pool with id %llu",
3077 rbd_dev->spec->pool_id); /* Really a BUG() */
3078 return -EIO;
3079 }
2607 3080
2608 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); 3081 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2609 if (!rbd_dev->spec->pool_name) 3082 if (!rbd_dev->spec->pool_name)
@@ -2612,19 +3085,17 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2612 /* Fetch the image name; tolerate failure here */ 3085 /* Fetch the image name; tolerate failure here */
2613 3086
2614 name = rbd_dev_image_name(rbd_dev); 3087 name = rbd_dev_image_name(rbd_dev);
2615 if (name) { 3088 if (name)
2616 rbd_dev->spec->image_name_len = strlen(name);
2617 rbd_dev->spec->image_name = (char *) name; 3089 rbd_dev->spec->image_name = (char *) name;
2618 } else { 3090 else
2619 pr_warning(RBD_DRV_NAME "%d " 3091 rbd_warn(rbd_dev, "unable to get image name");
2620 "unable to get image name for image id %s\n",
2621 rbd_dev->major, rbd_dev->spec->image_id);
2622 }
2623 3092
2624 /* Look up the snapshot name. */ 3093 /* Look up the snapshot name. */
2625 3094
2626 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); 3095 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2627 if (!name) { 3096 if (!name) {
3097 rbd_warn(rbd_dev, "no snapshot with id %llu",
3098 rbd_dev->spec->snap_id); /* Really a BUG() */
2628 ret = -EIO; 3099 ret = -EIO;
2629 goto out_err; 3100 goto out_err;
2630 } 3101 }
@@ -2665,12 +3136,11 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
2665 if (!reply_buf) 3136 if (!reply_buf)
2666 return -ENOMEM; 3137 return -ENOMEM;
2667 3138
2668 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 3139 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2669 "rbd", "get_snapcontext", 3140 "rbd", "get_snapcontext",
2670 NULL, 0, 3141 NULL, 0,
2671 reply_buf, size, 3142 reply_buf, size, ver);
2672 CEPH_OSD_FLAG_READ, ver); 3143 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2673 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2674 if (ret < 0) 3144 if (ret < 0)
2675 goto out; 3145 goto out;
2676 3146
@@ -2735,12 +3205,11 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2735 return ERR_PTR(-ENOMEM); 3205 return ERR_PTR(-ENOMEM);
2736 3206
2737 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 3207 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2738 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 3208 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2739 "rbd", "get_snapshot_name", 3209 "rbd", "get_snapshot_name",
2740 (char *) &snap_id, sizeof (snap_id), 3210 (char *) &snap_id, sizeof (snap_id),
2741 reply_buf, size, 3211 reply_buf, size, NULL);
2742 CEPH_OSD_FLAG_READ, NULL); 3212 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2743 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2744 if (ret < 0) 3213 if (ret < 0)
2745 goto out; 3214 goto out;
2746 3215
@@ -2766,7 +3235,7 @@ out:
2766static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 3235static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2767 u64 *snap_size, u64 *snap_features) 3236 u64 *snap_size, u64 *snap_features)
2768{ 3237{
2769 __le64 snap_id; 3238 u64 snap_id;
2770 u8 order; 3239 u8 order;
2771 int ret; 3240 int ret;
2772 3241
@@ -2865,10 +3334,17 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2865 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 3334 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2866 struct list_head *next = links->next; 3335 struct list_head *next = links->next;
2867 3336
2868 /* Existing snapshot not in the new snap context */ 3337 /*
2869 3338 * A previously-existing snapshot is not in
3339 * the new snap context.
3340 *
3341 * If the now missing snapshot is the one the
3342 * image is mapped to, clear its exists flag
3343 * so we can avoid sending any more requests
3344 * to it.
3345 */
2870 if (rbd_dev->spec->snap_id == snap->id) 3346 if (rbd_dev->spec->snap_id == snap->id)
2871 rbd_dev->exists = false; 3347 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
2872 rbd_remove_snap_dev(snap); 3348 rbd_remove_snap_dev(snap);
2873 dout("%ssnap id %llu has been removed\n", 3349 dout("%ssnap id %llu has been removed\n",
2874 rbd_dev->spec->snap_id == snap->id ? 3350 rbd_dev->spec->snap_id == snap->id ?
@@ -2942,7 +3418,7 @@ static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2942 struct rbd_snap *snap; 3418 struct rbd_snap *snap;
2943 int ret = 0; 3419 int ret = 0;
2944 3420
2945 dout("%s called\n", __func__); 3421 dout("%s:\n", __func__);
2946 if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 3422 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2947 return -EIO; 3423 return -EIO;
2948 3424
@@ -2983,22 +3459,6 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2983 device_unregister(&rbd_dev->dev); 3459 device_unregister(&rbd_dev->dev);
2984} 3460}
2985 3461
2986static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2987{
2988 int ret, rc;
2989
2990 do {
2991 ret = rbd_req_sync_watch(rbd_dev);
2992 if (ret == -ERANGE) {
2993 rc = rbd_dev_refresh(rbd_dev, NULL);
2994 if (rc < 0)
2995 return rc;
2996 }
2997 } while (ret == -ERANGE);
2998
2999 return ret;
3000}
3001
3002static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 3462static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
3003 3463
3004/* 3464/*
@@ -3138,11 +3598,9 @@ static inline char *dup_token(const char **buf, size_t *lenp)
3138 size_t len; 3598 size_t len;
3139 3599
3140 len = next_token(buf); 3600 len = next_token(buf);
3141 dup = kmalloc(len + 1, GFP_KERNEL); 3601 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3142 if (!dup) 3602 if (!dup)
3143 return NULL; 3603 return NULL;
3144
3145 memcpy(dup, *buf, len);
3146 *(dup + len) = '\0'; 3604 *(dup + len) = '\0';
3147 *buf += len; 3605 *buf += len;
3148 3606
@@ -3210,8 +3668,10 @@ static int rbd_add_parse_args(const char *buf,
3210 /* The first four tokens are required */ 3668 /* The first four tokens are required */
3211 3669
3212 len = next_token(&buf); 3670 len = next_token(&buf);
3213 if (!len) 3671 if (!len) {
3214 return -EINVAL; /* Missing monitor address(es) */ 3672 rbd_warn(NULL, "no monitor address(es) provided");
3673 return -EINVAL;
3674 }
3215 mon_addrs = buf; 3675 mon_addrs = buf;
3216 mon_addrs_size = len + 1; 3676 mon_addrs_size = len + 1;
3217 buf += len; 3677 buf += len;
@@ -3220,8 +3680,10 @@ static int rbd_add_parse_args(const char *buf,
3220 options = dup_token(&buf, NULL); 3680 options = dup_token(&buf, NULL);
3221 if (!options) 3681 if (!options)
3222 return -ENOMEM; 3682 return -ENOMEM;
3223 if (!*options) 3683 if (!*options) {
3224 goto out_err; /* Missing options */ 3684 rbd_warn(NULL, "no options provided");
3685 goto out_err;
3686 }
3225 3687
3226 spec = rbd_spec_alloc(); 3688 spec = rbd_spec_alloc();
3227 if (!spec) 3689 if (!spec)
@@ -3230,14 +3692,18 @@ static int rbd_add_parse_args(const char *buf,
3230 spec->pool_name = dup_token(&buf, NULL); 3692 spec->pool_name = dup_token(&buf, NULL);
3231 if (!spec->pool_name) 3693 if (!spec->pool_name)
3232 goto out_mem; 3694 goto out_mem;
3233 if (!*spec->pool_name) 3695 if (!*spec->pool_name) {
3234 goto out_err; /* Missing pool name */ 3696 rbd_warn(NULL, "no pool name provided");
3697 goto out_err;
3698 }
3235 3699
3236 spec->image_name = dup_token(&buf, &spec->image_name_len); 3700 spec->image_name = dup_token(&buf, NULL);
3237 if (!spec->image_name) 3701 if (!spec->image_name)
3238 goto out_mem; 3702 goto out_mem;
3239 if (!*spec->image_name) 3703 if (!*spec->image_name) {
3240 goto out_err; /* Missing image name */ 3704 rbd_warn(NULL, "no image name provided");
3705 goto out_err;
3706 }
3241 3707
3242 /* 3708 /*
3243 * Snapshot name is optional; default is to use "-" 3709 * Snapshot name is optional; default is to use "-"
@@ -3251,10 +3717,9 @@ static int rbd_add_parse_args(const char *buf,
3251 ret = -ENAMETOOLONG; 3717 ret = -ENAMETOOLONG;
3252 goto out_err; 3718 goto out_err;
3253 } 3719 }
3254 spec->snap_name = kmalloc(len + 1, GFP_KERNEL); 3720 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3255 if (!spec->snap_name) 3721 if (!spec->snap_name)
3256 goto out_mem; 3722 goto out_mem;
3257 memcpy(spec->snap_name, buf, len);
3258 *(spec->snap_name + len) = '\0'; 3723 *(spec->snap_name + len) = '\0';
3259 3724
3260 /* Initialize all rbd options to the defaults */ 3725 /* Initialize all rbd options to the defaults */
@@ -3323,7 +3788,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3323 * First, see if the format 2 image id file exists, and if 3788 * First, see if the format 2 image id file exists, and if
3324 * so, get the image's persistent id from it. 3789 * so, get the image's persistent id from it.
3325 */ 3790 */
3326 size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len; 3791 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3327 object_name = kmalloc(size, GFP_NOIO); 3792 object_name = kmalloc(size, GFP_NOIO);
3328 if (!object_name) 3793 if (!object_name)
3329 return -ENOMEM; 3794 return -ENOMEM;
@@ -3339,21 +3804,18 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3339 goto out; 3804 goto out;
3340 } 3805 }
3341 3806
3342 ret = rbd_req_sync_exec(rbd_dev, object_name, 3807 ret = rbd_obj_method_sync(rbd_dev, object_name,
3343 "rbd", "get_id", 3808 "rbd", "get_id",
3344 NULL, 0, 3809 NULL, 0,
3345 response, RBD_IMAGE_ID_LEN_MAX, 3810 response, RBD_IMAGE_ID_LEN_MAX, NULL);
3346 CEPH_OSD_FLAG_READ, NULL); 3811 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3347 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3348 if (ret < 0) 3812 if (ret < 0)
3349 goto out; 3813 goto out;
3350 ret = 0; /* rbd_req_sync_exec() can return positive */
3351 3814
3352 p = response; 3815 p = response;
3353 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 3816 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3354 p + RBD_IMAGE_ID_LEN_MAX, 3817 p + RBD_IMAGE_ID_LEN_MAX,
3355 &rbd_dev->spec->image_id_len, 3818 NULL, GFP_NOIO);
3356 GFP_NOIO);
3357 if (IS_ERR(rbd_dev->spec->image_id)) { 3819 if (IS_ERR(rbd_dev->spec->image_id)) {
3358 ret = PTR_ERR(rbd_dev->spec->image_id); 3820 ret = PTR_ERR(rbd_dev->spec->image_id);
3359 rbd_dev->spec->image_id = NULL; 3821 rbd_dev->spec->image_id = NULL;
@@ -3377,11 +3839,10 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3377 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); 3839 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3378 if (!rbd_dev->spec->image_id) 3840 if (!rbd_dev->spec->image_id)
3379 return -ENOMEM; 3841 return -ENOMEM;
3380 rbd_dev->spec->image_id_len = 0;
3381 3842
3382 /* Record the header object name for this rbd image. */ 3843 /* Record the header object name for this rbd image. */
3383 3844
3384 size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX); 3845 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3385 rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3846 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3386 if (!rbd_dev->header_name) { 3847 if (!rbd_dev->header_name) {
3387 ret = -ENOMEM; 3848 ret = -ENOMEM;
@@ -3427,7 +3888,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3427 * Image id was filled in by the caller. Record the header 3888 * Image id was filled in by the caller. Record the header
3428 * object name for this rbd image. 3889 * object name for this rbd image.
3429 */ 3890 */
3430 size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len; 3891 size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3431 rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3892 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3432 if (!rbd_dev->header_name) 3893 if (!rbd_dev->header_name)
3433 return -ENOMEM; 3894 return -ENOMEM;
@@ -3542,7 +4003,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3542 if (ret) 4003 if (ret)
3543 goto err_out_bus; 4004 goto err_out_bus;
3544 4005
3545 ret = rbd_init_watch_dev(rbd_dev); 4006 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
3546 if (ret) 4007 if (ret)
3547 goto err_out_bus; 4008 goto err_out_bus;
3548 4009
@@ -3638,6 +4099,13 @@ static ssize_t rbd_add(struct bus_type *bus,
3638 goto err_out_client; 4099 goto err_out_client;
3639 spec->pool_id = (u64) rc; 4100 spec->pool_id = (u64) rc;
3640 4101
4102 /* The ceph file layout needs to fit pool id in 32 bits */
4103
4104 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4105 rc = -EIO;
4106 goto err_out_client;
4107 }
4108
3641 rbd_dev = rbd_dev_create(rbdc, spec); 4109 rbd_dev = rbd_dev_create(rbdc, spec);
3642 if (!rbd_dev) 4110 if (!rbd_dev)
3643 goto err_out_client; 4111 goto err_out_client;
@@ -3691,15 +4159,8 @@ static void rbd_dev_release(struct device *dev)
3691{ 4159{
3692 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4160 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3693 4161
3694 if (rbd_dev->watch_request) {
3695 struct ceph_client *client = rbd_dev->rbd_client->client;
3696
3697 ceph_osdc_unregister_linger_request(&client->osdc,
3698 rbd_dev->watch_request);
3699 }
3700 if (rbd_dev->watch_event) 4162 if (rbd_dev->watch_event)
3701 rbd_req_sync_unwatch(rbd_dev); 4163 rbd_dev_header_watch_sync(rbd_dev, 0);
3702
3703 4164
3704 /* clean up and free blkdev */ 4165 /* clean up and free blkdev */
3705 rbd_free_disk(rbd_dev); 4166 rbd_free_disk(rbd_dev);
@@ -3743,10 +4204,14 @@ static ssize_t rbd_remove(struct bus_type *bus,
3743 goto done; 4204 goto done;
3744 } 4205 }
3745 4206
3746 if (rbd_dev->open_count) { 4207 spin_lock_irq(&rbd_dev->lock);
4208 if (rbd_dev->open_count)
3747 ret = -EBUSY; 4209 ret = -EBUSY;
4210 else
4211 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4212 spin_unlock_irq(&rbd_dev->lock);
4213 if (ret < 0)
3748 goto done; 4214 goto done;
3749 }
3750 4215
3751 rbd_remove_all_snaps(rbd_dev); 4216 rbd_remove_all_snaps(rbd_dev);
3752 rbd_bus_del_dev(rbd_dev); 4217 rbd_bus_del_dev(rbd_dev);
@@ -3782,10 +4247,15 @@ static void rbd_sysfs_cleanup(void)
3782 device_unregister(&rbd_root_dev); 4247 device_unregister(&rbd_root_dev);
3783} 4248}
3784 4249
3785int __init rbd_init(void) 4250static int __init rbd_init(void)
3786{ 4251{
3787 int rc; 4252 int rc;
3788 4253
4254 if (!libceph_compatible(NULL)) {
4255 rbd_warn(NULL, "libceph incompatibility (quitting)");
4256
4257 return -EINVAL;
4258 }
3789 rc = rbd_sysfs_init(); 4259 rc = rbd_sysfs_init();
3790 if (rc) 4260 if (rc)
3791 return rc; 4261 return rc;
@@ -3793,7 +4263,7 @@ int __init rbd_init(void)
3793 return 0; 4263 return 0;
3794} 4264}
3795 4265
3796void __exit rbd_exit(void) 4266static void __exit rbd_exit(void)
3797{ 4267{
3798 rbd_sysfs_cleanup(); 4268 rbd_sysfs_cleanup();
3799} 4269}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d4f81edd9a5d..a60ea977af6f 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page)
236static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) 236static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
237{ 237{
238 struct inode *inode = req->r_inode; 238 struct inode *inode = req->r_inode;
239 struct ceph_osd_reply_head *replyhead; 239 int rc = req->r_result;
240 int rc, bytes; 240 int bytes = le32_to_cpu(msg->hdr.data_len);
241 int i; 241 int i;
242 242
243 /* parse reply */
244 replyhead = msg->front.iov_base;
245 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
246 rc = le32_to_cpu(replyhead->result);
247 bytes = le32_to_cpu(msg->hdr.data_len);
248
249 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); 243 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
250 244
251 /* unlock all pages, zeroing any data we didn't read */ 245 /* unlock all pages, zeroing any data we didn't read */
@@ -315,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
315 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 309 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
316 NULL, 0, 310 NULL, 0,
317 ci->i_truncate_seq, ci->i_truncate_size, 311 ci->i_truncate_seq, ci->i_truncate_size,
318 NULL, false, 1, 0); 312 NULL, false, 0);
319 if (IS_ERR(req)) 313 if (IS_ERR(req))
320 return PTR_ERR(req); 314 return PTR_ERR(req);
321 315
@@ -492,8 +486,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
492 &ci->i_layout, snapc, 486 &ci->i_layout, snapc,
493 page_off, len, 487 page_off, len,
494 ci->i_truncate_seq, ci->i_truncate_size, 488 ci->i_truncate_seq, ci->i_truncate_size,
495 &inode->i_mtime, 489 &inode->i_mtime, &page, 1);
496 &page, 1, 0, 0, true);
497 if (err < 0) { 490 if (err < 0) {
498 dout("writepage setting page/mapping error %d %p\n", err, page); 491 dout("writepage setting page/mapping error %d %p\n", err, page);
499 SetPageError(page); 492 SetPageError(page);
@@ -554,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req,
554 struct ceph_msg *msg) 547 struct ceph_msg *msg)
555{ 548{
556 struct inode *inode = req->r_inode; 549 struct inode *inode = req->r_inode;
557 struct ceph_osd_reply_head *replyhead;
558 struct ceph_osd_op *op;
559 struct ceph_inode_info *ci = ceph_inode(inode); 550 struct ceph_inode_info *ci = ceph_inode(inode);
560 unsigned wrote; 551 unsigned wrote;
561 struct page *page; 552 struct page *page;
562 int i; 553 int i;
563 struct ceph_snap_context *snapc = req->r_snapc; 554 struct ceph_snap_context *snapc = req->r_snapc;
564 struct address_space *mapping = inode->i_mapping; 555 struct address_space *mapping = inode->i_mapping;
565 __s32 rc = -EIO; 556 int rc = req->r_result;
566 u64 bytes = 0; 557 u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);
567 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 558 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
568 long writeback_stat; 559 long writeback_stat;
569 unsigned issued = ceph_caps_issued(ci); 560 unsigned issued = ceph_caps_issued(ci);
570 561
571 /* parse reply */
572 replyhead = msg->front.iov_base;
573 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
574 op = (void *)(replyhead + 1);
575 rc = le32_to_cpu(replyhead->result);
576 bytes = le64_to_cpu(op->extent.length);
577
578 if (rc >= 0) { 562 if (rc >= 0) {
579 /* 563 /*
580 * Assume we wrote the pages we originally sent. The 564 * Assume we wrote the pages we originally sent. The
@@ -741,8 +725,6 @@ retry:
741 struct page *page; 725 struct page *page;
742 int want; 726 int want;
743 u64 offset, len; 727 u64 offset, len;
744 struct ceph_osd_request_head *reqhead;
745 struct ceph_osd_op *op;
746 long writeback_stat; 728 long writeback_stat;
747 729
748 next = 0; 730 next = 0;
@@ -838,7 +820,7 @@ get_more_pages:
838 snapc, do_sync, 820 snapc, do_sync,
839 ci->i_truncate_seq, 821 ci->i_truncate_seq,
840 ci->i_truncate_size, 822 ci->i_truncate_size,
841 &inode->i_mtime, true, 1, 0); 823 &inode->i_mtime, true, 0);
842 824
843 if (IS_ERR(req)) { 825 if (IS_ERR(req)) {
844 rc = PTR_ERR(req); 826 rc = PTR_ERR(req);
@@ -906,10 +888,8 @@ get_more_pages:
906 888
907 /* revise final length, page count */ 889 /* revise final length, page count */
908 req->r_num_pages = locked_pages; 890 req->r_num_pages = locked_pages;
909 reqhead = req->r_request->front.iov_base; 891 req->r_request_ops[0].extent.length = cpu_to_le64(len);
910 op = (void *)(reqhead + 1); 892 req->r_request_ops[0].payload_len = cpu_to_le32(len);
911 op->extent.length = cpu_to_le64(len);
912 op->payload_len = cpu_to_le32(len);
913 req->r_request->hdr.data_len = cpu_to_le32(len); 893 req->r_request->hdr.data_len = cpu_to_le32(len);
914 894
915 rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); 895 rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ae2be696eb5b..78e2f575247d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -611,8 +611,16 @@ retry:
611 611
612 if (flags & CEPH_CAP_FLAG_AUTH) 612 if (flags & CEPH_CAP_FLAG_AUTH)
613 ci->i_auth_cap = cap; 613 ci->i_auth_cap = cap;
614 else if (ci->i_auth_cap == cap) 614 else if (ci->i_auth_cap == cap) {
615 ci->i_auth_cap = NULL; 615 ci->i_auth_cap = NULL;
616 spin_lock(&mdsc->cap_dirty_lock);
617 if (!list_empty(&ci->i_dirty_item)) {
618 dout(" moving %p to cap_dirty_migrating\n", inode);
619 list_move(&ci->i_dirty_item,
620 &mdsc->cap_dirty_migrating);
621 }
622 spin_unlock(&mdsc->cap_dirty_lock);
623 }
616 624
617 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", 625 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
618 inode, ceph_vinop(inode), cap, ceph_cap_string(issued), 626 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
@@ -1460,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1460 struct ceph_mds_client *mdsc = fsc->mdsc; 1468 struct ceph_mds_client *mdsc = fsc->mdsc;
1461 struct inode *inode = &ci->vfs_inode; 1469 struct inode *inode = &ci->vfs_inode;
1462 struct ceph_cap *cap; 1470 struct ceph_cap *cap;
1463 int file_wanted, used; 1471 int file_wanted, used, cap_used;
1464 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ 1472 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1465 int issued, implemented, want, retain, revoking, flushing = 0; 1473 int issued, implemented, want, retain, revoking, flushing = 0;
1466 int mds = -1; /* keep track of how far we've gone through i_caps list 1474 int mds = -1; /* keep track of how far we've gone through i_caps list
@@ -1563,9 +1571,14 @@ retry_locked:
1563 1571
1564 /* NOTE: no side-effects allowed, until we take s_mutex */ 1572 /* NOTE: no side-effects allowed, until we take s_mutex */
1565 1573
1574 cap_used = used;
1575 if (ci->i_auth_cap && cap != ci->i_auth_cap)
1576 cap_used &= ~ci->i_auth_cap->issued;
1577
1566 revoking = cap->implemented & ~cap->issued; 1578 revoking = cap->implemented & ~cap->issued;
1567 dout(" mds%d cap %p issued %s implemented %s revoking %s\n", 1579 dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
1568 cap->mds, cap, ceph_cap_string(cap->issued), 1580 cap->mds, cap, ceph_cap_string(cap->issued),
1581 ceph_cap_string(cap_used),
1569 ceph_cap_string(cap->implemented), 1582 ceph_cap_string(cap->implemented),
1570 ceph_cap_string(revoking)); 1583 ceph_cap_string(revoking));
1571 1584
@@ -1593,7 +1606,7 @@ retry_locked:
1593 } 1606 }
1594 1607
1595 /* completed revocation? going down and there are no caps? */ 1608 /* completed revocation? going down and there are no caps? */
1596 if (revoking && (revoking & used) == 0) { 1609 if (revoking && (revoking & cap_used) == 0) {
1597 dout("completed revocation of %s\n", 1610 dout("completed revocation of %s\n",
1598 ceph_cap_string(cap->implemented & ~cap->issued)); 1611 ceph_cap_string(cap->implemented & ~cap->issued));
1599 goto ack; 1612 goto ack;
@@ -1670,8 +1683,8 @@ ack:
1670 sent++; 1683 sent++;
1671 1684
1672 /* __send_cap drops i_ceph_lock */ 1685 /* __send_cap drops i_ceph_lock */
1673 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, 1686 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
1674 retain, flushing, NULL); 1687 want, retain, flushing, NULL);
1675 goto retry; /* retake i_ceph_lock and restart our cap scan. */ 1688 goto retry; /* retake i_ceph_lock and restart our cap scan. */
1676 } 1689 }
1677 1690
@@ -2417,7 +2430,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2417 dout("mds wanted %s -> %s\n", 2430 dout("mds wanted %s -> %s\n",
2418 ceph_cap_string(le32_to_cpu(grant->wanted)), 2431 ceph_cap_string(le32_to_cpu(grant->wanted)),
2419 ceph_cap_string(wanted)); 2432 ceph_cap_string(wanted));
2420 grant->wanted = cpu_to_le32(wanted); 2433 /* imported cap may not have correct mds_wanted */
2434 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
2435 check_caps = 1;
2421 } 2436 }
2422 2437
2423 cap->seq = seq; 2438 cap->seq = seq;
@@ -2821,6 +2836,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2821 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 2836 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2822 (unsigned)seq); 2837 (unsigned)seq);
2823 2838
2839 if (op == CEPH_CAP_OP_IMPORT)
2840 ceph_add_cap_releases(mdsc, session);
2841
2824 /* lookup ino */ 2842 /* lookup ino */
2825 inode = ceph_find_inode(sb, vino); 2843 inode = ceph_find_inode(sb, vino);
2826 ci = ceph_inode(inode); 2844 ci = ceph_inode(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 11b57c2c8f15..bf338d9b67e3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
243 err = ceph_mdsc_do_request(mdsc, 243 err = ceph_mdsc_do_request(mdsc,
244 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 244 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
245 req); 245 req);
246 if (err)
247 goto out_err;
248
246 err = ceph_handle_snapdir(req, dentry, err); 249 err = ceph_handle_snapdir(req, dentry, err);
247 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 250 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
248 err = ceph_handle_notrace_create(dir, dentry); 251 err = ceph_handle_notrace_create(dir, dentry);
@@ -263,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
263 err = finish_no_open(file, dn); 266 err = finish_no_open(file, dn);
264 } else { 267 } else {
265 dout("atomic_open finish_open on dn %p\n", dn); 268 dout("atomic_open finish_open on dn %p\n", dn);
269 if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
270 *opened |= FILE_CREATED;
271 }
266 err = finish_open(file, dentry, ceph_open, opened); 272 err = finish_open(file, dentry, ceph_open, opened);
267 } 273 }
268 274
@@ -535,7 +541,7 @@ more:
535 ci->i_snap_realm->cached_context, 541 ci->i_snap_realm->cached_context,
536 do_sync, 542 do_sync,
537 ci->i_truncate_seq, ci->i_truncate_size, 543 ci->i_truncate_seq, ci->i_truncate_size,
538 &mtime, false, 2, page_align); 544 &mtime, false, page_align);
539 if (IS_ERR(req)) 545 if (IS_ERR(req))
540 return PTR_ERR(req); 546 return PTR_ERR(req);
541 547
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index f5ed767806df..4a989345b37b 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -185,7 +185,6 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
185 &ceph_sb_to_client(inode->i_sb)->client->osdc; 185 &ceph_sb_to_client(inode->i_sb)->client->osdc;
186 u64 len = 1, olen; 186 u64 len = 1, olen;
187 u64 tmp; 187 u64 tmp;
188 struct ceph_object_layout ol;
189 struct ceph_pg pgid; 188 struct ceph_pg pgid;
190 int r; 189 int r;
191 190
@@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
194 return -EFAULT; 193 return -EFAULT;
195 194
196 down_read(&osdc->map_sem); 195 down_read(&osdc->map_sem);
197 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, 196 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
198 &dl.object_no, &dl.object_offset, 197 &dl.object_no, &dl.object_offset,
199 &olen); 198 &olen);
200 if (r < 0) 199 if (r < 0)
@@ -209,10 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
209 208
210 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", 209 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
211 ceph_ino(inode), dl.object_no); 210 ceph_ino(inode), dl.object_no);
212 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, 211 ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout,
213 osdc->osdmap); 212 osdc->osdmap);
214 213
215 pgid = ol.ol_pgid;
216 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); 214 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
217 if (dl.osd >= 0) { 215 if (dl.osd >= 0) {
218 struct ceph_entity_addr *a = 216 struct ceph_entity_addr *a =
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7a3dfe0a9a80..442880d099c9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -233,6 +233,30 @@ bad:
233} 233}
234 234
235/* 235/*
236 * parse create results
237 */
238static int parse_reply_info_create(void **p, void *end,
239 struct ceph_mds_reply_info_parsed *info,
240 int features)
241{
242 if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
243 if (*p == end) {
244 info->has_create_ino = false;
245 } else {
246 info->has_create_ino = true;
247 info->ino = ceph_decode_64(p);
248 }
249 }
250
251 if (unlikely(*p != end))
252 goto bad;
253 return 0;
254
255bad:
256 return -EIO;
257}
258
259/*
236 * parse extra results 260 * parse extra results
237 */ 261 */
238static int parse_reply_info_extra(void **p, void *end, 262static int parse_reply_info_extra(void **p, void *end,
@@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end,
241{ 265{
242 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 266 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
243 return parse_reply_info_filelock(p, end, info, features); 267 return parse_reply_info_filelock(p, end, info, features);
244 else 268 else if (info->head->op == CEPH_MDS_OP_READDIR)
245 return parse_reply_info_dir(p, end, info, features); 269 return parse_reply_info_dir(p, end, info, features);
270 else if (info->head->op == CEPH_MDS_OP_CREATE)
271 return parse_reply_info_create(p, end, info, features);
272 else
273 return -EIO;
246} 274}
247 275
248/* 276/*
@@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2170 mutex_lock(&req->r_fill_mutex); 2198 mutex_lock(&req->r_fill_mutex);
2171 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2199 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2172 if (err == 0) { 2200 if (err == 0) {
2173 if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && 2201 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
2202 req->r_op == CEPH_MDS_OP_LSSNAP) &&
2174 rinfo->dir_nr) 2203 rinfo->dir_nr)
2175 ceph_readdir_prepopulate(req, req->r_session); 2204 ceph_readdir_prepopulate(req, req->r_session);
2176 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2205 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ff4188bf6199..c2a19fbbe517 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed {
74 struct ceph_mds_reply_info_in *dir_in; 74 struct ceph_mds_reply_info_in *dir_in;
75 u8 dir_complete, dir_end; 75 u8 dir_complete, dir_end;
76 }; 76 };
77
78 /* for create results */
79 struct {
80 bool has_create_ino;
81 u64 ino;
82 };
77 }; 83 };
78 84
79 /* encoded blob describing snapshot contexts for certain 85 /* encoded blob describing snapshot contexts for certain
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 73b7d44e8a35..0d3c9240c61b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
59 return ERR_PTR(-ENOMEM); 59 return ERR_PTR(-ENOMEM);
60 60
61 ceph_decode_16_safe(p, end, version, bad); 61 ceph_decode_16_safe(p, end, version, bad);
62 if (version > 3) {
63 pr_warning("got mdsmap version %d > 3, failing", version);
64 goto bad;
65 }
62 66
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); 67 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p); 68 m->m_epoch = ceph_decode_32(p);
@@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
144 /* pg_pools */ 148 /* pg_pools */
145 ceph_decode_32_safe(p, end, n, bad); 149 ceph_decode_32_safe(p, end, n, bad);
146 m->m_num_data_pg_pools = n; 150 m->m_num_data_pg_pools = n;
147 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); 151 m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
148 if (!m->m_data_pg_pools) 152 if (!m->m_data_pg_pools)
149 goto badmem; 153 goto badmem;
150 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); 154 ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
151 for (i = 0; i < n; i++) 155 for (i = 0; i < n; i++)
152 m->m_data_pg_pools[i] = ceph_decode_32(p); 156 m->m_data_pg_pools[i] = ceph_decode_64(p);
153 m->m_cas_pg_pool = ceph_decode_32(p); 157 m->m_cas_pg_pool = ceph_decode_64(p);
154 158
155 /* ok, we don't care about the rest. */ 159 /* ok, we don't care about the rest. */
156 dout("mdsmap_decode success epoch %u\n", m->m_epoch); 160 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index cd5097d7c804..89fa4a940a0f 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)
15 case CEPH_MDS_STATE_BOOT: return "up:boot"; 15 case CEPH_MDS_STATE_BOOT: return "up:boot";
16 case CEPH_MDS_STATE_STANDBY: return "up:standby"; 16 case CEPH_MDS_STATE_STANDBY: return "up:standby";
17 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; 17 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
18 case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
18 case CEPH_MDS_STATE_CREATING: return "up:creating"; 19 case CEPH_MDS_STATE_CREATING: return "up:creating";
19 case CEPH_MDS_STATE_STARTING: return "up:starting"; 20 case CEPH_MDS_STATE_STARTING: return "up:starting";
20 /* up and in */ 21 /* up and in */
@@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op)
50 case CEPH_MDS_OP_LOOKUP: return "lookup"; 51 case CEPH_MDS_OP_LOOKUP: return "lookup";
51 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; 52 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
52 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; 53 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
54 case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
53 case CEPH_MDS_OP_GETATTR: return "getattr"; 55 case CEPH_MDS_OP_GETATTR: return "getattr";
54 case CEPH_MDS_OP_SETXATTR: return "setxattr"; 56 case CEPH_MDS_OP_SETXATTR: return "setxattr";
55 case CEPH_MDS_OP_SETATTR: return "setattr"; 57 case CEPH_MDS_OP_SETATTR: return "setattr";
56 case CEPH_MDS_OP_RMXATTR: return "rmxattr"; 58 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
59 case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
60 case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
57 case CEPH_MDS_OP_READDIR: return "readdir"; 61 case CEPH_MDS_OP_READDIR: return "readdir";
58 case CEPH_MDS_OP_MKNOD: return "mknod"; 62 case CEPH_MDS_OP_MKNOD: return "mknod";
59 case CEPH_MDS_OP_LINK: return "link"; 63 case CEPH_MDS_OP_LINK: return "link";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e86aa9948124..9fe17c6c2876 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
71 /* 71 /*
72 * express utilization in terms of large blocks to avoid 72 * express utilization in terms of large blocks to avoid
73 * overflow on 32-bit machines. 73 * overflow on 32-bit machines.
74 *
75 * NOTE: for the time being, we make bsize == frsize to humor
76 * not-yet-ancient versions of glibc that are broken.
77 * Someday, we will probably want to report a real block
78 * size... whatever that may mean for a network file system!
74 */ 79 */
75 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; 80 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
81 buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
76 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); 82 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
77 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 83 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
78 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 84 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
@@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
80 buf->f_files = le64_to_cpu(st.num_objects); 86 buf->f_files = le64_to_cpu(st.num_objects);
81 buf->f_ffree = -1; 87 buf->f_ffree = -1;
82 buf->f_namelen = NAME_MAX; 88 buf->f_namelen = NAME_MAX;
83 buf->f_frsize = PAGE_CACHE_SIZE;
84 89
85 /* leave fsid little-endian, regardless of host endianness */ 90 /* leave fsid little-endian, regardless of host endianness */
86 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); 91 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f053bbd1886f..c7b309723dcc 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -21,7 +21,7 @@
21 21
22/* large granularity for statfs utilization stats to facilitate 22/* large granularity for statfs utilization stats to facilitate
23 * large volume sizes on 32-bit machines. */ 23 * large volume sizes on 32-bit machines. */
24#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ 24#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
25#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 25#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
26 26
27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
@@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
798/* file.c */ 798/* file.c */
799extern const struct file_operations ceph_file_fops; 799extern const struct file_operations ceph_file_fops;
800extern const struct address_space_operations ceph_aops; 800extern const struct address_space_operations ceph_aops;
801extern int ceph_copy_to_page_vector(struct page **pages, 801
802 const char *data,
803 loff_t off, size_t len);
804extern int ceph_copy_from_page_vector(struct page **pages,
805 char *data,
806 loff_t off, size_t len);
807extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
808extern int ceph_open(struct inode *inode, struct file *file); 802extern int ceph_open(struct inode *inode, struct file *file);
809extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, 803extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
810 struct file *file, unsigned flags, umode_t mode, 804 struct file *file, unsigned flags, umode_t mode,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2c2ae5be9902..9b6b2b6dd164 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -29,9 +29,94 @@ struct ceph_vxattr {
29 size_t name_size; /* strlen(name) + 1 (for '\0') */ 29 size_t name_size; /* strlen(name) + 1 (for '\0') */
30 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, 30 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
31 size_t size); 31 size_t size);
32 bool readonly; 32 bool readonly, hidden;
33 bool (*exists_cb)(struct ceph_inode_info *ci);
33}; 34};
34 35
36/* layouts */
37
38static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
39{
40 size_t s;
41 char *p = (char *)&ci->i_layout;
42
43 for (s = 0; s < sizeof(ci->i_layout); s++, p++)
44 if (*p)
45 return true;
46 return false;
47}
48
49static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
50 size_t size)
51{
52 int ret;
53 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
54 struct ceph_osd_client *osdc = &fsc->client->osdc;
55 s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
56 const char *pool_name;
57
58 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
59 down_read(&osdc->map_sem);
60 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
61 if (pool_name)
62 ret = snprintf(val, size,
63 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s",
64 (unsigned long long)ceph_file_layout_su(ci->i_layout),
65 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
66 (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
67 pool_name);
68 else
69 ret = snprintf(val, size,
70 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
71 (unsigned long long)ceph_file_layout_su(ci->i_layout),
72 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
73 (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
74 (unsigned long long)pool);
75
76 up_read(&osdc->map_sem);
77 return ret;
78}
79
80static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
81 char *val, size_t size)
82{
83 return snprintf(val, size, "%lld",
84 (unsigned long long)ceph_file_layout_su(ci->i_layout));
85}
86
87static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
88 char *val, size_t size)
89{
90 return snprintf(val, size, "%lld",
91 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
92}
93
94static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
95 char *val, size_t size)
96{
97 return snprintf(val, size, "%lld",
98 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
99}
100
101static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
102 char *val, size_t size)
103{
104 int ret;
105 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
106 struct ceph_osd_client *osdc = &fsc->client->osdc;
107 s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
108 const char *pool_name;
109
110 down_read(&osdc->map_sem);
111 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
112 if (pool_name)
113 ret = snprintf(val, size, "%s", pool_name);
114 else
115 ret = snprintf(val, size, "%lld", (unsigned long long)pool);
116 up_read(&osdc->map_sem);
117 return ret;
118}
119
35/* directories */ 120/* directories */
36 121
37static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, 122static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -83,17 +168,43 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
83 (long)ci->i_rctime.tv_nsec); 168 (long)ci->i_rctime.tv_nsec);
84} 169}
85 170
86#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
87 171
88#define XATTR_NAME_CEPH(_type, _name) \ 172#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
89 { \ 173#define CEPH_XATTR_NAME2(_type, _name, _name2) \
90 .name = CEPH_XATTR_NAME(_type, _name), \ 174 XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
91 .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ 175
92 .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ 176#define XATTR_NAME_CEPH(_type, _name) \
93 .readonly = true, \ 177 { \
94 } 178 .name = CEPH_XATTR_NAME(_type, _name), \
179 .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
180 .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
181 .readonly = true, \
182 .hidden = false, \
183 .exists_cb = NULL, \
184 }
185#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
186 { \
187 .name = CEPH_XATTR_NAME2(_type, _name, _field), \
188 .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
189 .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
190 .readonly = false, \
191 .hidden = true, \
192 .exists_cb = ceph_vxattrcb_layout_exists, \
193 }
95 194
96static struct ceph_vxattr ceph_dir_vxattrs[] = { 195static struct ceph_vxattr ceph_dir_vxattrs[] = {
196 {
197 .name = "ceph.dir.layout",
198 .name_size = sizeof("ceph.dir.layout"),
199 .getxattr_cb = ceph_vxattrcb_layout,
200 .readonly = false,
201 .hidden = false,
202 .exists_cb = ceph_vxattrcb_layout_exists,
203 },
204 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
205 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
206 XATTR_LAYOUT_FIELD(dir, layout, object_size),
207 XATTR_LAYOUT_FIELD(dir, layout, pool),
97 XATTR_NAME_CEPH(dir, entries), 208 XATTR_NAME_CEPH(dir, entries),
98 XATTR_NAME_CEPH(dir, files), 209 XATTR_NAME_CEPH(dir, files),
99 XATTR_NAME_CEPH(dir, subdirs), 210 XATTR_NAME_CEPH(dir, subdirs),
@@ -102,35 +213,26 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
102 XATTR_NAME_CEPH(dir, rsubdirs), 213 XATTR_NAME_CEPH(dir, rsubdirs),
103 XATTR_NAME_CEPH(dir, rbytes), 214 XATTR_NAME_CEPH(dir, rbytes),
104 XATTR_NAME_CEPH(dir, rctime), 215 XATTR_NAME_CEPH(dir, rctime),
105 { 0 } /* Required table terminator */ 216 { .name = NULL, 0 } /* Required table terminator */
106}; 217};
107static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ 218static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
108 219
109/* files */ 220/* files */
110 221
111static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
112 size_t size)
113{
114 int ret;
115
116 ret = snprintf(val, size,
117 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
118 (unsigned long long)ceph_file_layout_su(ci->i_layout),
119 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
120 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
121 return ret;
122}
123
124static struct ceph_vxattr ceph_file_vxattrs[] = { 222static struct ceph_vxattr ceph_file_vxattrs[] = {
125 XATTR_NAME_CEPH(file, layout),
126 /* The following extended attribute name is deprecated */
127 { 223 {
128 .name = XATTR_CEPH_PREFIX "layout", 224 .name = "ceph.file.layout",
129 .name_size = sizeof (XATTR_CEPH_PREFIX "layout"), 225 .name_size = sizeof("ceph.file.layout"),
130 .getxattr_cb = ceph_vxattrcb_file_layout, 226 .getxattr_cb = ceph_vxattrcb_layout,
131 .readonly = true, 227 .readonly = false,
228 .hidden = false,
229 .exists_cb = ceph_vxattrcb_layout_exists,
132 }, 230 },
133 { 0 } /* Required table terminator */ 231 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
232 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
233 XATTR_LAYOUT_FIELD(file, layout, object_size),
234 XATTR_LAYOUT_FIELD(file, layout, pool),
235 { .name = NULL, 0 } /* Required table terminator */
134}; 236};
135static size_t ceph_file_vxattrs_name_size; /* total size of all names */ 237static size_t ceph_file_vxattrs_name_size; /* total size of all names */
136 238
@@ -164,7 +266,8 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
164 size_t size = 0; 266 size_t size = 0;
165 267
166 for (vxattr = vxattrs; vxattr->name; vxattr++) 268 for (vxattr = vxattrs; vxattr->name; vxattr++)
167 size += vxattr->name_size; 269 if (!vxattr->hidden)
270 size += vxattr->name_size;
168 271
169 return size; 272 return size;
170} 273}
@@ -572,13 +675,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
572 if (!ceph_is_valid_xattr(name)) 675 if (!ceph_is_valid_xattr(name))
573 return -ENODATA; 676 return -ENODATA;
574 677
575 /* let's see if a virtual xattr was requested */
576 vxattr = ceph_match_vxattr(inode, name);
577
578 spin_lock(&ci->i_ceph_lock); 678 spin_lock(&ci->i_ceph_lock);
579 dout("getxattr %p ver=%lld index_ver=%lld\n", inode, 679 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
580 ci->i_xattrs.version, ci->i_xattrs.index_version); 680 ci->i_xattrs.version, ci->i_xattrs.index_version);
581 681
682 /* let's see if a virtual xattr was requested */
683 vxattr = ceph_match_vxattr(inode, name);
684 if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
685 err = vxattr->getxattr_cb(ci, value, size);
686 goto out;
687 }
688
582 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 689 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
583 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 690 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
584 goto get_xattr; 691 goto get_xattr;
@@ -592,11 +699,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
592 699
593 spin_lock(&ci->i_ceph_lock); 700 spin_lock(&ci->i_ceph_lock);
594 701
595 if (vxattr && vxattr->readonly) {
596 err = vxattr->getxattr_cb(ci, value, size);
597 goto out;
598 }
599
600 err = __build_xattrs(inode); 702 err = __build_xattrs(inode);
601 if (err < 0) 703 if (err < 0)
602 goto out; 704 goto out;
@@ -604,11 +706,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
604get_xattr: 706get_xattr:
605 err = -ENODATA; /* == ENOATTR */ 707 err = -ENODATA; /* == ENOATTR */
606 xattr = __get_xattr(ci, name); 708 xattr = __get_xattr(ci, name);
607 if (!xattr) { 709 if (!xattr)
608 if (vxattr)
609 err = vxattr->getxattr_cb(ci, value, size);
610 goto out; 710 goto out;
611 }
612 711
613 err = -ERANGE; 712 err = -ERANGE;
614 if (size && size < xattr->val_len) 713 if (size && size < xattr->val_len)
@@ -664,23 +763,30 @@ list_xattr:
664 vir_namelen = ceph_vxattrs_name_size(vxattrs); 763 vir_namelen = ceph_vxattrs_name_size(vxattrs);
665 764
666 /* adding 1 byte per each variable due to the null termination */ 765 /* adding 1 byte per each variable due to the null termination */
667 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; 766 namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
668 err = -ERANGE; 767 err = -ERANGE;
669 if (size && namelen > size) 768 if (size && vir_namelen + namelen > size)
670 goto out; 769 goto out;
671 770
672 err = namelen; 771 err = namelen + vir_namelen;
673 if (size == 0) 772 if (size == 0)
674 goto out; 773 goto out;
675 774
676 names = __copy_xattr_names(ci, names); 775 names = __copy_xattr_names(ci, names);
677 776
678 /* virtual xattr names, too */ 777 /* virtual xattr names, too */
679 if (vxattrs) 778 err = namelen;
779 if (vxattrs) {
680 for (i = 0; vxattrs[i].name; i++) { 780 for (i = 0; vxattrs[i].name; i++) {
681 len = sprintf(names, "%s", vxattrs[i].name); 781 if (!vxattrs[i].hidden &&
682 names += len + 1; 782 !(vxattrs[i].exists_cb &&
783 !vxattrs[i].exists_cb(ci))) {
784 len = sprintf(names, "%s", vxattrs[i].name);
785 names += len + 1;
786 err += len + 1;
787 }
683 } 788 }
789 }
684 790
685out: 791out:
686 spin_unlock(&ci->i_ceph_lock); 792 spin_unlock(&ci->i_ceph_lock);
@@ -782,6 +888,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
782 if (vxattr && vxattr->readonly) 888 if (vxattr && vxattr->readonly)
783 return -EOPNOTSUPP; 889 return -EOPNOTSUPP;
784 890
891 /* pass any unhandled ceph.* xattrs through to the MDS */
892 if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
893 goto do_sync_unlocked;
894
785 /* preallocate memory for xattr name, value, index node */ 895 /* preallocate memory for xattr name, value, index node */
786 err = -ENOMEM; 896 err = -ENOMEM;
787 newname = kmemdup(name, name_len + 1, GFP_NOFS); 897 newname = kmemdup(name, name_len + 1, GFP_NOFS);
@@ -838,6 +948,7 @@ retry:
838 948
839do_sync: 949do_sync:
840 spin_unlock(&ci->i_ceph_lock); 950 spin_unlock(&ci->i_ceph_lock);
951do_sync_unlocked:
841 err = ceph_sync_setxattr(dentry, name, value, size, flags); 952 err = ceph_sync_setxattr(dentry, name, value, size, flags);
842out: 953out:
843 kfree(newname); 954 kfree(newname);
@@ -892,6 +1003,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
892 if (vxattr && vxattr->readonly) 1003 if (vxattr && vxattr->readonly)
893 return -EOPNOTSUPP; 1004 return -EOPNOTSUPP;
894 1005
1006 /* pass any unhandled ceph.* xattrs through to the MDS */
1007 if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
1008 goto do_sync_unlocked;
1009
895 err = -ENOMEM; 1010 err = -ENOMEM;
896 spin_lock(&ci->i_ceph_lock); 1011 spin_lock(&ci->i_ceph_lock);
897retry: 1012retry:
@@ -931,6 +1046,7 @@ retry:
931 return err; 1046 return err;
932do_sync: 1047do_sync:
933 spin_unlock(&ci->i_ceph_lock); 1048 spin_unlock(&ci->i_ceph_lock);
1049do_sync_unlocked:
934 err = ceph_send_removexattr(dentry, name); 1050 err = ceph_send_removexattr(dentry, name);
935out: 1051out:
936 return err; 1052 return err;
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index dad579b0c0e6..76554cecaab2 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -12,16 +12,46 @@
12#define CEPH_FEATURE_MONNAMES (1<<5) 12#define CEPH_FEATURE_MONNAMES (1<<5)
13#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) 13#define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
14#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) 14#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
15/* bits 8-17 defined by user-space; not supported yet here */ 15#define CEPH_FEATURE_OBJECTLOCATOR (1<<8)
16#define CEPH_FEATURE_PGID64 (1<<9)
17#define CEPH_FEATURE_INCSUBOSDMAP (1<<10)
18#define CEPH_FEATURE_PGPOOL3 (1<<11)
19#define CEPH_FEATURE_OSDREPLYMUX (1<<12)
20#define CEPH_FEATURE_OSDENC (1<<13)
21#define CEPH_FEATURE_OMAP (1<<14)
22#define CEPH_FEATURE_MONENC (1<<15)
23#define CEPH_FEATURE_QUERY_T (1<<16)
24#define CEPH_FEATURE_INDEP_PG_MAP (1<<17)
16#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) 25#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
26#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19)
27#define CEPH_FEATURE_MON_NULLROUTE (1<<20)
28#define CEPH_FEATURE_MON_GV (1<<21)
29#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22)
30#define CEPH_FEATURE_MSG_AUTH (1<<23)
31#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24)
32#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25)
33#define CEPH_FEATURE_CREATEPOOLID (1<<26)
34#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
35#define CEPH_FEATURE_OSD_HBMSGS (1<<28)
36#define CEPH_FEATURE_MDSENC (1<<29)
37#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30)
17 38
18/* 39/*
19 * Features supported. 40 * Features supported.
20 */ 41 */
21#define CEPH_FEATURES_SUPPORTED_DEFAULT \ 42#define CEPH_FEATURES_SUPPORTED_DEFAULT \
22 (CEPH_FEATURE_NOSRCADDR | \ 43 (CEPH_FEATURE_NOSRCADDR | \
23 CEPH_FEATURE_CRUSH_TUNABLES) 44 CEPH_FEATURE_PGID64 | \
45 CEPH_FEATURE_PGPOOL3 | \
46 CEPH_FEATURE_OSDENC | \
47 CEPH_FEATURE_CRUSH_TUNABLES | \
48 CEPH_FEATURE_CRUSH_TUNABLES2 | \
49 CEPH_FEATURE_REPLY_CREATE_INODE | \
50 CEPH_FEATURE_OSDHASHPSPOOL)
24 51
25#define CEPH_FEATURES_REQUIRED_DEFAULT \ 52#define CEPH_FEATURES_REQUIRED_DEFAULT \
26 (CEPH_FEATURE_NOSRCADDR) 53 (CEPH_FEATURE_NOSRCADDR | \
54 CEPH_FEATURE_PGID64 | \
55 CEPH_FEATURE_PGPOOL3 | \
56 CEPH_FEATURE_OSDENC)
27#endif 57#endif
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index cf6f4d998a76..2ad7b860f062 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -21,16 +21,14 @@
21 * internal cluster protocols separately from the public, 21 * internal cluster protocols separately from the public,
22 * client-facing protocol. 22 * client-facing protocol.
23 */ 23 */
24#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
25#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
26#define CEPH_MON_PROTOCOL 5 /* cluster internal */
27#define CEPH_OSDC_PROTOCOL 24 /* server/client */ 24#define CEPH_OSDC_PROTOCOL 24 /* server/client */
28#define CEPH_MDSC_PROTOCOL 32 /* server/client */ 25#define CEPH_MDSC_PROTOCOL 32 /* server/client */
29#define CEPH_MONC_PROTOCOL 15 /* server/client */ 26#define CEPH_MONC_PROTOCOL 15 /* server/client */
30 27
31 28
32#define CEPH_INO_ROOT 1 29#define CEPH_INO_ROOT 1
33#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ 30#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
31#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */
34 32
35/* arbitrary limit on max # of monitors (cluster of 3 is typical) */ 33/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
36#define CEPH_MAX_MON 31 34#define CEPH_MAX_MON 31
@@ -51,7 +49,7 @@ struct ceph_file_layout {
51 __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */ 49 __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */
52 50
53 /* object -> pg layout */ 51 /* object -> pg layout */
54 __le32 fl_unused; /* unused; used to be preferred primary (-1) */ 52 __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */
55 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ 53 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
56} __attribute__ ((packed)); 54} __attribute__ ((packed));
57 55
@@ -101,6 +99,8 @@ struct ceph_dir_layout {
101#define CEPH_MSG_MON_SUBSCRIBE_ACK 16 99#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
102#define CEPH_MSG_AUTH 17 100#define CEPH_MSG_AUTH 17
103#define CEPH_MSG_AUTH_REPLY 18 101#define CEPH_MSG_AUTH_REPLY 18
102#define CEPH_MSG_MON_GET_VERSION 19
103#define CEPH_MSG_MON_GET_VERSION_REPLY 20
104 104
105/* client <-> mds */ 105/* client <-> mds */
106#define CEPH_MSG_MDS_MAP 21 106#define CEPH_MSG_MDS_MAP 21
@@ -221,6 +221,11 @@ struct ceph_mon_subscribe_ack {
221} __attribute__ ((packed)); 221} __attribute__ ((packed));
222 222
223/* 223/*
224 * mdsmap flags
225 */
226#define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */
227
228/*
224 * mds states 229 * mds states
225 * > 0 -> in 230 * > 0 -> in
226 * <= 0 -> out 231 * <= 0 -> out
@@ -233,6 +238,7 @@ struct ceph_mon_subscribe_ack {
233#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ 238#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
234#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ 239#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
235#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ 240#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
241#define CEPH_MDS_STATE_REPLAYONCE -9 /* up, replaying an active node's journal */
236 242
237#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ 243#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
238#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed 244#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
@@ -264,6 +270,7 @@ extern const char *ceph_mds_state_name(int s);
264#define CEPH_LOCK_IXATTR 2048 270#define CEPH_LOCK_IXATTR 2048
265#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ 271#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
266#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ 272#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
273#define CEPH_LOCK_IPOLICY 16384 /* policy lock on dirs. MDS internal */
267 274
268/* client_session ops */ 275/* client_session ops */
269enum { 276enum {
@@ -338,6 +345,12 @@ extern const char *ceph_mds_op_name(int op);
338#define CEPH_SETATTR_SIZE 32 345#define CEPH_SETATTR_SIZE 32
339#define CEPH_SETATTR_CTIME 64 346#define CEPH_SETATTR_CTIME 64
340 347
348/*
349 * Ceph setxattr request flags.
350 */
351#define CEPH_XATTR_CREATE 1
352#define CEPH_XATTR_REPLACE 2
353
341union ceph_mds_request_args { 354union ceph_mds_request_args {
342 struct { 355 struct {
343 __le32 mask; /* CEPH_CAP_* */ 356 __le32 mask; /* CEPH_CAP_* */
@@ -522,14 +535,17 @@ int ceph_flags_to_mode(int flags);
522#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ 535#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
523#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ 536#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
524 537
538#define CEPH_CAP_SIMPLE_BITS 2
539#define CEPH_CAP_FILE_BITS 8
540
525/* per-lock shift */ 541/* per-lock shift */
526#define CEPH_CAP_SAUTH 2 542#define CEPH_CAP_SAUTH 2
527#define CEPH_CAP_SLINK 4 543#define CEPH_CAP_SLINK 4
528#define CEPH_CAP_SXATTR 6 544#define CEPH_CAP_SXATTR 6
529#define CEPH_CAP_SFILE 8 545#define CEPH_CAP_SFILE 8
530#define CEPH_CAP_SFLOCK 20 546#define CEPH_CAP_SFLOCK 20
531 547
532#define CEPH_CAP_BITS 22 548#define CEPH_CAP_BITS 22
533 549
534/* composed values */ 550/* composed values */
535#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) 551#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 63d092822bad..360d9d08ca9e 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
52 return end >= *p && n <= end - *p; 52 return end >= *p && n <= end - *p;
53} 53}
54 54
55#define ceph_decode_need(p, end, n, bad) \ 55#define ceph_decode_need(p, end, n, bad) \
56 do { \ 56 do { \
57 if (!likely(ceph_has_room(p, end, n))) \ 57 if (!likely(ceph_has_room(p, end, n))) \
58 goto bad; \ 58 goto bad; \
59 } while (0) 59 } while (0)
60 60
61#define ceph_decode_64_safe(p, end, v, bad) \ 61#define ceph_decode_64_safe(p, end, v, bad) \
@@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
99 * 99 *
100 * There are two possible failures: 100 * There are two possible failures:
101 * - converting the string would require accessing memory at or 101 * - converting the string would require accessing memory at or
102 * beyond the "end" pointer provided (-E 102 * beyond the "end" pointer provided (-ERANGE)
103 * - memory could not be allocated for the result 103 * - memory could not be allocated for the result (-ENOMEM)
104 */ 104 */
105static inline char *ceph_extract_encoded_string(void **p, void *end, 105static inline char *ceph_extract_encoded_string(void **p, void *end,
106 size_t *lenp, gfp_t gfp) 106 size_t *lenp, gfp_t gfp)
@@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p, void *end,
217 *p += len; 217 *p += len;
218} 218}
219 219
220#define ceph_encode_need(p, end, n, bad) \ 220#define ceph_encode_need(p, end, n, bad) \
221 do { \ 221 do { \
222 if (!likely(ceph_has_room(p, end, n))) \ 222 if (!likely(ceph_has_room(p, end, n))) \
223 goto bad; \ 223 goto bad; \
224 } while (0) 224 } while (0)
225 225
226#define ceph_encode_64_safe(p, end, v, bad) \ 226#define ceph_encode_64_safe(p, end, v, bad) \
@@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p, void *end,
231#define ceph_encode_32_safe(p, end, v, bad) \ 231#define ceph_encode_32_safe(p, end, v, bad) \
232 do { \ 232 do { \
233 ceph_encode_need(p, end, sizeof(u32), bad); \ 233 ceph_encode_need(p, end, sizeof(u32), bad); \
234 ceph_encode_32(p, v); \ 234 ceph_encode_32(p, v); \
235 } while (0) 235 } while (0)
236#define ceph_encode_16_safe(p, end, v, bad) \ 236#define ceph_encode_16_safe(p, end, v, bad) \
237 do { \ 237 do { \
238 ceph_encode_need(p, end, sizeof(u16), bad); \ 238 ceph_encode_need(p, end, sizeof(u16), bad); \
239 ceph_encode_16(p, v); \ 239 ceph_encode_16(p, v); \
240 } while (0)
241#define ceph_encode_8_safe(p, end, v, bad) \
242 do { \
243 ceph_encode_need(p, end, sizeof(u8), bad); \
244 ceph_encode_8(p, v); \
240 } while (0) 245 } while (0)
241 246
242#define ceph_encode_copy_safe(p, end, pv, n, bad) \ 247#define ceph_encode_copy_safe(p, end, pv, n, bad) \
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 084d3c622b12..29818fc3fa49 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -193,6 +193,8 @@ static inline int calc_pages_for(u64 off, u64 len)
193} 193}
194 194
195/* ceph_common.c */ 195/* ceph_common.c */
196extern bool libceph_compatible(void *data);
197
196extern const char *ceph_msg_type_name(int type); 198extern const char *ceph_msg_type_name(int type);
197extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); 199extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
198extern struct kmem_cache *ceph_inode_cachep; 200extern struct kmem_cache *ceph_inode_cachep;
@@ -220,7 +222,7 @@ extern int ceph_open_session(struct ceph_client *client);
220/* pagevec.c */ 222/* pagevec.c */
221extern void ceph_release_page_vector(struct page **pages, int num_pages); 223extern void ceph_release_page_vector(struct page **pages, int num_pages);
222 224
223extern struct page **ceph_get_direct_page_vector(const char __user *data, 225extern struct page **ceph_get_direct_page_vector(const void __user *data,
224 int num_pages, 226 int num_pages,
225 bool write_page); 227 bool write_page);
226extern void ceph_put_page_vector(struct page **pages, int num_pages, 228extern void ceph_put_page_vector(struct page **pages, int num_pages,
@@ -228,15 +230,15 @@ extern void ceph_put_page_vector(struct page **pages, int num_pages,
228extern void ceph_release_page_vector(struct page **pages, int num_pages); 230extern void ceph_release_page_vector(struct page **pages, int num_pages);
229extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); 231extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
230extern int ceph_copy_user_to_page_vector(struct page **pages, 232extern int ceph_copy_user_to_page_vector(struct page **pages,
231 const char __user *data, 233 const void __user *data,
232 loff_t off, size_t len); 234 loff_t off, size_t len);
233extern int ceph_copy_to_page_vector(struct page **pages, 235extern void ceph_copy_to_page_vector(struct page **pages,
234 const char *data, 236 const void *data,
235 loff_t off, size_t len); 237 loff_t off, size_t len);
236extern int ceph_copy_from_page_vector(struct page **pages, 238extern void ceph_copy_from_page_vector(struct page **pages,
237 char *data, 239 void *data,
238 loff_t off, size_t len); 240 loff_t off, size_t len);
239extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data, 241extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data,
240 loff_t off, size_t len); 242 loff_t off, size_t len);
241extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); 243extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
242 244
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index cb15b5d867c7..87ed09f54800 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -29,8 +29,8 @@ struct ceph_mdsmap {
29 29
30 /* which object pools file data can be stored in */ 30 /* which object pools file data can be stored in */
31 int m_num_data_pg_pools; 31 int m_num_data_pg_pools;
32 u32 *m_data_pg_pools; 32 u64 *m_data_pg_pools;
33 u32 m_cas_pg_pool; 33 u64 m_cas_pg_pool;
34}; 34};
35 35
36static inline struct ceph_entity_addr * 36static inline struct ceph_entity_addr *
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 14ba5ee738a9..60903e0f665c 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -83,9 +83,11 @@ struct ceph_msg {
83 struct list_head list_head; 83 struct list_head list_head;
84 84
85 struct kref kref; 85 struct kref kref;
86#ifdef CONFIG_BLOCK
86 struct bio *bio; /* instead of pages/pagelist */ 87 struct bio *bio; /* instead of pages/pagelist */
87 struct bio *bio_iter; /* bio iterator */ 88 struct bio *bio_iter; /* bio iterator */
88 int bio_seg; /* current bio segment */ 89 int bio_seg; /* current bio segment */
90#endif /* CONFIG_BLOCK */
89 struct ceph_pagelist *trail; /* the trailing part of the data */ 91 struct ceph_pagelist *trail; /* the trailing part of the data */
90 bool front_is_vmalloc; 92 bool front_is_vmalloc;
91 bool more_to_follow; 93 bool more_to_follow;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index d9b880e977e6..1dd5d466b6f9 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -10,6 +10,7 @@
10#include <linux/ceph/osdmap.h> 10#include <linux/ceph/osdmap.h>
11#include <linux/ceph/messenger.h> 11#include <linux/ceph/messenger.h>
12#include <linux/ceph/auth.h> 12#include <linux/ceph/auth.h>
13#include <linux/ceph/pagelist.h>
13 14
14/* 15/*
15 * Maximum object name size 16 * Maximum object name size
@@ -22,7 +23,6 @@ struct ceph_snap_context;
22struct ceph_osd_request; 23struct ceph_osd_request;
23struct ceph_osd_client; 24struct ceph_osd_client;
24struct ceph_authorizer; 25struct ceph_authorizer;
25struct ceph_pagelist;
26 26
27/* 27/*
28 * completion callback for async writepages 28 * completion callback for async writepages
@@ -47,6 +47,9 @@ struct ceph_osd {
47 struct list_head o_keepalive_item; 47 struct list_head o_keepalive_item;
48}; 48};
49 49
50
51#define CEPH_OSD_MAX_OP 10
52
50/* an in-flight request */ 53/* an in-flight request */
51struct ceph_osd_request { 54struct ceph_osd_request {
52 u64 r_tid; /* unique for this client */ 55 u64 r_tid; /* unique for this client */
@@ -63,9 +66,23 @@ struct ceph_osd_request {
63 struct ceph_connection *r_con_filling_msg; 66 struct ceph_connection *r_con_filling_msg;
64 67
65 struct ceph_msg *r_request, *r_reply; 68 struct ceph_msg *r_request, *r_reply;
66 int r_result;
67 int r_flags; /* any additional flags for the osd */ 69 int r_flags; /* any additional flags for the osd */
68 u32 r_sent; /* >0 if r_request is sending/sent */ 70 u32 r_sent; /* >0 if r_request is sending/sent */
71 int r_num_ops;
72
73 /* encoded message content */
74 struct ceph_osd_op *r_request_ops;
75 /* these are updated on each send */
76 __le32 *r_request_osdmap_epoch;
77 __le32 *r_request_flags;
78 __le64 *r_request_pool;
79 void *r_request_pgid;
80 __le32 *r_request_attempts;
81 struct ceph_eversion *r_request_reassert_version;
82
83 int r_result;
84 int r_reply_op_len[CEPH_OSD_MAX_OP];
85 s32 r_reply_op_result[CEPH_OSD_MAX_OP];
69 int r_got_reply; 86 int r_got_reply;
70 int r_linger; 87 int r_linger;
71 88
@@ -82,6 +99,7 @@ struct ceph_osd_request {
82 99
83 char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ 100 char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */
84 int r_oid_len; 101 int r_oid_len;
102 u64 r_snapid;
85 unsigned long r_stamp; /* send OR check time */ 103 unsigned long r_stamp; /* send OR check time */
86 104
87 struct ceph_file_layout r_file_layout; 105 struct ceph_file_layout r_file_layout;
@@ -95,7 +113,7 @@ struct ceph_osd_request {
95 struct bio *r_bio; /* instead of pages */ 113 struct bio *r_bio; /* instead of pages */
96#endif 114#endif
97 115
98 struct ceph_pagelist *r_trail; /* trailing part of the data */ 116 struct ceph_pagelist r_trail; /* trailing part of the data */
99}; 117};
100 118
101struct ceph_osd_event { 119struct ceph_osd_event {
@@ -107,7 +125,6 @@ struct ceph_osd_event {
107 struct rb_node node; 125 struct rb_node node;
108 struct list_head osd_node; 126 struct list_head osd_node;
109 struct kref kref; 127 struct kref kref;
110 struct completion completion;
111}; 128};
112 129
113struct ceph_osd_event_work { 130struct ceph_osd_event_work {
@@ -157,7 +174,7 @@ struct ceph_osd_client {
157 174
158struct ceph_osd_req_op { 175struct ceph_osd_req_op {
159 u16 op; /* CEPH_OSD_OP_* */ 176 u16 op; /* CEPH_OSD_OP_* */
160 u32 flags; /* CEPH_OSD_FLAG_* */ 177 u32 payload_len;
161 union { 178 union {
162 struct { 179 struct {
163 u64 offset, length; 180 u64 offset, length;
@@ -166,23 +183,24 @@ struct ceph_osd_req_op {
166 } extent; 183 } extent;
167 struct { 184 struct {
168 const char *name; 185 const char *name;
169 u32 name_len;
170 const char *val; 186 const char *val;
187 u32 name_len;
171 u32 value_len; 188 u32 value_len;
172 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ 189 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
173 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ 190 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
174 } xattr; 191 } xattr;
175 struct { 192 struct {
176 const char *class_name; 193 const char *class_name;
177 __u8 class_len;
178 const char *method_name; 194 const char *method_name;
179 __u8 method_len;
180 __u8 argc;
181 const char *indata; 195 const char *indata;
182 u32 indata_len; 196 u32 indata_len;
197 __u8 class_len;
198 __u8 method_len;
199 __u8 argc;
183 } cls; 200 } cls;
184 struct { 201 struct {
185 u64 cookie, count; 202 u64 cookie;
203 u64 count;
186 } pgls; 204 } pgls;
187 struct { 205 struct {
188 u64 snapid; 206 u64 snapid;
@@ -190,12 +208,11 @@ struct ceph_osd_req_op {
190 struct { 208 struct {
191 u64 cookie; 209 u64 cookie;
192 u64 ver; 210 u64 ver;
193 __u8 flag;
194 u32 prot_ver; 211 u32 prot_ver;
195 u32 timeout; 212 u32 timeout;
213 __u8 flag;
196 } watch; 214 } watch;
197 }; 215 };
198 u32 payload_len;
199}; 216};
200 217
201extern int ceph_osdc_init(struct ceph_osd_client *osdc, 218extern int ceph_osdc_init(struct ceph_osd_client *osdc,
@@ -207,29 +224,19 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
207extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 224extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
208 struct ceph_msg *msg); 225 struct ceph_msg *msg);
209 226
210extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
211 struct ceph_file_layout *layout,
212 u64 snapid,
213 u64 off, u64 *plen, u64 *bno,
214 struct ceph_osd_request *req,
215 struct ceph_osd_req_op *op);
216
217extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 227extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
218 int flags,
219 struct ceph_snap_context *snapc, 228 struct ceph_snap_context *snapc,
220 struct ceph_osd_req_op *ops, 229 unsigned int num_op,
221 bool use_mempool, 230 bool use_mempool,
222 gfp_t gfp_flags, 231 gfp_t gfp_flags);
223 struct page **pages,
224 struct bio *bio);
225 232
226extern void ceph_osdc_build_request(struct ceph_osd_request *req, 233extern void ceph_osdc_build_request(struct ceph_osd_request *req,
227 u64 off, u64 *plen, 234 u64 off, u64 len,
235 unsigned int num_op,
228 struct ceph_osd_req_op *src_ops, 236 struct ceph_osd_req_op *src_ops,
229 struct ceph_snap_context *snapc, 237 struct ceph_snap_context *snapc,
230 struct timespec *mtime, 238 u64 snap_id,
231 const char *oid, 239 struct timespec *mtime);
232 int oid_len);
233 240
234extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 241extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
235 struct ceph_file_layout *layout, 242 struct ceph_file_layout *layout,
@@ -239,8 +246,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
239 int do_sync, u32 truncate_seq, 246 int do_sync, u32 truncate_seq,
240 u64 truncate_size, 247 u64 truncate_size,
241 struct timespec *mtime, 248 struct timespec *mtime,
242 bool use_mempool, int num_reply, 249 bool use_mempool, int page_align);
243 int page_align);
244 250
245extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 251extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
246 struct ceph_osd_request *req); 252 struct ceph_osd_request *req);
@@ -279,17 +285,13 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
279 u64 off, u64 len, 285 u64 off, u64 len,
280 u32 truncate_seq, u64 truncate_size, 286 u32 truncate_seq, u64 truncate_size,
281 struct timespec *mtime, 287 struct timespec *mtime,
282 struct page **pages, int nr_pages, 288 struct page **pages, int nr_pages);
283 int flags, int do_sync, bool nofail);
284 289
285/* watch/notify events */ 290/* watch/notify events */
286extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, 291extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
287 void (*event_cb)(u64, u64, u8, void *), 292 void (*event_cb)(u64, u64, u8, void *),
288 int one_shot, void *data, 293 void *data, struct ceph_osd_event **pevent);
289 struct ceph_osd_event **pevent);
290extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); 294extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
291extern int ceph_osdc_wait_event(struct ceph_osd_event *event,
292 unsigned long timeout);
293extern void ceph_osdc_put_event(struct ceph_osd_event *event); 295extern void ceph_osdc_put_event(struct ceph_osd_event *event);
294#endif 296#endif
295 297
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 10a417f9f76f..c819190d1642 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -18,14 +18,31 @@
18 * The map can be updated either via an incremental map (diff) describing 18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map. 19 * the change between two successive epochs, or as a fully encoded map.
20 */ 20 */
21struct ceph_pg {
22 uint64_t pool;
23 uint32_t seed;
24};
25
26#define CEPH_POOL_FLAG_HASHPSPOOL 1
27
21struct ceph_pg_pool_info { 28struct ceph_pg_pool_info {
22 struct rb_node node; 29 struct rb_node node;
23 int id; 30 s64 id;
24 struct ceph_pg_pool v; 31 u8 type;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; 32 u8 size;
33 u8 crush_ruleset;
34 u8 object_hash;
35 u32 pg_num, pgp_num;
36 int pg_num_mask, pgp_num_mask;
37 u64 flags;
26 char *name; 38 char *name;
27}; 39};
28 40
41struct ceph_object_locator {
42 uint64_t pool;
43 char *key;
44};
45
29struct ceph_pg_mapping { 46struct ceph_pg_mapping {
30 struct rb_node node; 47 struct rb_node node;
31 struct ceph_pg pgid; 48 struct ceph_pg pgid;
@@ -110,15 +127,16 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
110 127
111/* calculate mapping of a file extent to an object */ 128/* calculate mapping of a file extent to an object */
112extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 129extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
113 u64 off, u64 *plen, 130 u64 off, u64 len,
114 u64 *bno, u64 *oxoff, u64 *oxlen); 131 u64 *bno, u64 *oxoff, u64 *oxlen);
115 132
116/* calculate mapping of object to a placement group */ 133/* calculate mapping of object to a placement group */
117extern int ceph_calc_object_layout(struct ceph_object_layout *ol, 134extern int ceph_calc_object_layout(struct ceph_pg *pg,
118 const char *oid, 135 const char *oid,
119 struct ceph_file_layout *fl, 136 struct ceph_file_layout *fl,
120 struct ceph_osdmap *osdmap); 137 struct ceph_osdmap *osdmap);
121extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 138extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
139 struct ceph_pg pgid,
122 int *acting); 140 int *acting);
123extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 141extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
124 struct ceph_pg pgid); 142 struct ceph_pg pgid);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 2c04afeead1c..68c96a508ac2 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -9,14 +9,6 @@
9#include <linux/ceph/msgr.h> 9#include <linux/ceph/msgr.h>
10 10
11/* 11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 6
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 6
18
19/*
20 * fs id 12 * fs id
21 */ 13 */
22struct ceph_fsid { 14struct ceph_fsid {
@@ -64,7 +56,7 @@ struct ceph_timespec {
64 * placement group. 56 * placement group.
65 * we encode this into one __le64. 57 * we encode this into one __le64.
66 */ 58 */
67struct ceph_pg { 59struct ceph_pg_v1 {
68 __le16 preferred; /* preferred primary osd */ 60 __le16 preferred; /* preferred primary osd */
69 __le16 ps; /* placement seed */ 61 __le16 ps; /* placement seed */
70 __le32 pool; /* object pool */ 62 __le32 pool; /* object pool */
@@ -91,21 +83,6 @@ struct ceph_pg {
91 83
92#define CEPH_PG_TYPE_REP 1 84#define CEPH_PG_TYPE_REP 1
93#define CEPH_PG_TYPE_RAID4 2 85#define CEPH_PG_TYPE_RAID4 2
94#define CEPH_PG_POOL_VERSION 2
95struct ceph_pg_pool {
96 __u8 type; /* CEPH_PG_TYPE_* */
97 __u8 size; /* number of osds in each pg */
98 __u8 crush_ruleset; /* crush placement rule */
99 __u8 object_hash; /* hash mapping object name to ps */
100 __le32 pg_num, pgp_num; /* number of pg's */
101 __le32 lpg_num, lpgp_num; /* number of localized pg's */
102 __le32 last_change; /* most recent epoch changed */
103 __le64 snap_seq; /* seq for per-pool snapshot */
104 __le32 snap_epoch; /* epoch of last snap */
105 __le32 num_snaps;
106 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
107 __le64 auid; /* who owns the pg */
108} __attribute__ ((packed));
109 86
110/* 87/*
111 * stable_mod func is used to control number of placement groups. 88 * stable_mod func is used to control number of placement groups.
@@ -128,7 +105,7 @@ static inline int ceph_stable_mod(int x, int b, int bmask)
128 * object layout - how a given object should be stored. 105 * object layout - how a given object should be stored.
129 */ 106 */
130struct ceph_object_layout { 107struct ceph_object_layout {
131 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ 108 struct ceph_pg_v1 ol_pgid; /* raw pg, with _full_ ps precision. */
132 __le32 ol_stripe_unit; /* for per-object parity, if any */ 109 __le32 ol_stripe_unit; /* for per-object parity, if any */
133} __attribute__ ((packed)); 110} __attribute__ ((packed));
134 111
@@ -145,8 +122,12 @@ struct ceph_eversion {
145 */ 122 */
146 123
147/* status bits */ 124/* status bits */
148#define CEPH_OSD_EXISTS 1 125#define CEPH_OSD_EXISTS (1<<0)
149#define CEPH_OSD_UP 2 126#define CEPH_OSD_UP (1<<1)
127#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
128#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
129
130extern const char *ceph_osd_state_name(int s);
150 131
151/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ 132/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
152#define CEPH_OSD_IN 0x10000 133#define CEPH_OSD_IN 0x10000
@@ -161,9 +142,25 @@ struct ceph_eversion {
161#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ 142#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
162#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ 143#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
163#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ 144#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
145#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
146#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
147#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
148#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
149#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
150#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
151
152/*
153 * The error code to return when an OSD can't handle a write
154 * because it is too large.
155 */
156#define OSD_WRITETOOBIG EMSGSIZE
164 157
165/* 158/*
166 * osd ops 159 * osd ops
160 *
161 * WARNING: do not use these op codes directly. Use the helpers
162 * defined below instead. In certain cases, op code behavior was
163 * redefined, resulting in special-cases in the helpers.
167 */ 164 */
168#define CEPH_OSD_OP_MODE 0xf000 165#define CEPH_OSD_OP_MODE 0xf000
169#define CEPH_OSD_OP_MODE_RD 0x1000 166#define CEPH_OSD_OP_MODE_RD 0x1000
@@ -177,6 +174,7 @@ struct ceph_eversion {
177#define CEPH_OSD_OP_TYPE_ATTR 0x0300 174#define CEPH_OSD_OP_TYPE_ATTR 0x0300
178#define CEPH_OSD_OP_TYPE_EXEC 0x0400 175#define CEPH_OSD_OP_TYPE_EXEC 0x0400
179#define CEPH_OSD_OP_TYPE_PG 0x0500 176#define CEPH_OSD_OP_TYPE_PG 0x0500
177#define CEPH_OSD_OP_TYPE_MULTI 0x0600 /* multiobject */
180 178
181enum { 179enum {
182 /** data **/ 180 /** data **/
@@ -217,6 +215,23 @@ enum {
217 215
218 CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15, 216 CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15,
219 217
218 /* omap */
219 CEPH_OSD_OP_OMAPGETKEYS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 17,
220 CEPH_OSD_OP_OMAPGETVALS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 18,
221 CEPH_OSD_OP_OMAPGETHEADER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 19,
222 CEPH_OSD_OP_OMAPGETVALSBYKEYS =
223 CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 20,
224 CEPH_OSD_OP_OMAPSETVALS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 21,
225 CEPH_OSD_OP_OMAPSETHEADER = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 22,
226 CEPH_OSD_OP_OMAPCLEAR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 23,
227 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
228 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
229
230 /** multi **/
231 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
232 CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
233 CEPH_OSD_OP_SRC_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 3,
234
220 /** attrs **/ 235 /** attrs **/
221 /* read */ 236 /* read */
222 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 237 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -238,6 +253,7 @@ enum {
238 CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6, 253 CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6,
239 CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7, 254 CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7,
240 CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8, 255 CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8,
256 CEPH_OSD_OP_SCRUB_MAP = CEPH_OSD_OP_MODE_SUB | 9,
241 257
242 /** lock **/ 258 /** lock **/
243 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, 259 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
@@ -248,10 +264,12 @@ enum {
248 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, 264 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
249 265
250 /** exec **/ 266 /** exec **/
267 /* note: the RD bit here is wrong; see special-case below in helper */
251 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, 268 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
252 269
253 /** pg **/ 270 /** pg **/
254 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, 271 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
272 CEPH_OSD_OP_PGLS_FILTER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 2,
255}; 273};
256 274
257static inline int ceph_osd_op_type_lock(int op) 275static inline int ceph_osd_op_type_lock(int op)
@@ -274,6 +292,10 @@ static inline int ceph_osd_op_type_pg(int op)
274{ 292{
275 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; 293 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
276} 294}
295static inline int ceph_osd_op_type_multi(int op)
296{
297 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_MULTI;
298}
277 299
278static inline int ceph_osd_op_mode_subop(int op) 300static inline int ceph_osd_op_mode_subop(int op)
279{ 301{
@@ -281,11 +303,12 @@ static inline int ceph_osd_op_mode_subop(int op)
281} 303}
282static inline int ceph_osd_op_mode_read(int op) 304static inline int ceph_osd_op_mode_read(int op)
283{ 305{
284 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; 306 return (op & CEPH_OSD_OP_MODE_RD) &&
307 op != CEPH_OSD_OP_CALL;
285} 308}
286static inline int ceph_osd_op_mode_modify(int op) 309static inline int ceph_osd_op_mode_modify(int op)
287{ 310{
288 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; 311 return op & CEPH_OSD_OP_MODE_WR;
289} 312}
290 313
291/* 314/*
@@ -294,34 +317,38 @@ static inline int ceph_osd_op_mode_modify(int op)
294 */ 317 */
295#define CEPH_OSD_TMAP_HDR 'h' 318#define CEPH_OSD_TMAP_HDR 'h'
296#define CEPH_OSD_TMAP_SET 's' 319#define CEPH_OSD_TMAP_SET 's'
320#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
297#define CEPH_OSD_TMAP_RM 'r' 321#define CEPH_OSD_TMAP_RM 'r'
322#define CEPH_OSD_TMAP_RMSLOPPY 'R'
298 323
299extern const char *ceph_osd_op_name(int op); 324extern const char *ceph_osd_op_name(int op);
300 325
301
302/* 326/*
303 * osd op flags 327 * osd op flags
304 * 328 *
305 * An op may be READ, WRITE, or READ|WRITE. 329 * An op may be READ, WRITE, or READ|WRITE.
306 */ 330 */
307enum { 331enum {
308 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ 332 CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */
309 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ 333 CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */
310 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ 334 CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */
311 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ 335 CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */
312 CEPH_OSD_FLAG_READ = 16, /* op may read */ 336 CEPH_OSD_FLAG_READ = 0x0010, /* op may read */
313 CEPH_OSD_FLAG_WRITE = 32, /* op may write */ 337 CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */
314 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ 338 CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */
315 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ 339 CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */
316 CEPH_OSD_FLAG_BALANCE_READS = 256, 340 CEPH_OSD_FLAG_BALANCE_READS = 0x0100,
317 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ 341 CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */
318 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ 342 CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */
319 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ 343 CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */
320 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ 344 CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
345 CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
346 CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
321}; 347};
322 348
323enum { 349enum {
324 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ 350 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
351 CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */
325}; 352};
326 353
327#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 354#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
@@ -381,48 +408,13 @@ struct ceph_osd_op {
381 __le64 ver; 408 __le64 ver;
382 __u8 flag; /* 0 = unwatch, 1 = watch */ 409 __u8 flag; /* 0 = unwatch, 1 = watch */
383 } __attribute__ ((packed)) watch; 410 } __attribute__ ((packed)) watch;
384}; 411 struct {
412 __le64 offset, length;
413 __le64 src_offset;
414 } __attribute__ ((packed)) clonerange;
415 };
385 __le32 payload_len; 416 __le32 payload_len;
386} __attribute__ ((packed)); 417} __attribute__ ((packed));
387 418
388/*
389 * osd request message header. each request may include multiple
390 * ceph_osd_op object operations.
391 */
392struct ceph_osd_request_head {
393 __le32 client_inc; /* client incarnation */
394 struct ceph_object_layout layout; /* pgid */
395 __le32 osdmap_epoch; /* client's osdmap epoch */
396
397 __le32 flags;
398
399 struct ceph_timespec mtime; /* for mutations only */
400 struct ceph_eversion reassert_version; /* if we are replaying op */
401
402 __le32 object_len; /* length of object name */
403
404 __le64 snapid; /* snapid to read */
405 __le64 snap_seq; /* writer's snap context */
406 __le32 num_snaps;
407
408 __le16 num_ops;
409 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
410} __attribute__ ((packed));
411
412struct ceph_osd_reply_head {
413 __le32 client_inc; /* client incarnation */
414 __le32 flags;
415 struct ceph_object_layout layout;
416 __le32 osdmap_epoch;
417 struct ceph_eversion reassert_version; /* for replaying uncommitted */
418
419 __le32 result; /* result code */
420
421 __le32 object_len; /* length of object name */
422 __le32 num_ops;
423 struct ceph_osd_op ops[0]; /* ops[], object */
424} __attribute__ ((packed));
425
426
427 419
428#endif 420#endif
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 25baa287cff7..6a1101f24cfb 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -162,6 +162,8 @@ struct crush_map {
162 __u32 choose_local_fallback_tries; 162 __u32 choose_local_fallback_tries;
163 /* choose attempts before giving up */ 163 /* choose attempts before giving up */
164 __u32 choose_total_tries; 164 __u32 choose_total_tries;
165 /* attempt chooseleaf inner descent once; on failure retry outer descent */
166 __u32 chooseleaf_descend_once;
165}; 167};
166 168
167 169
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 1deb29af82fd..e65e6e4be38b 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -28,6 +28,22 @@
28#include "crypto.h" 28#include "crypto.h"
29 29
30 30
31/*
32 * Module compatibility interface. For now it doesn't do anything,
33 * but its existence signals a certain level of functionality.
34 *
35 * The data buffer is used to pass information both to and from
36 * libceph. The return value indicates whether libceph determines
37 * it is compatible with the caller (from another kernel module),
38 * given the provided data.
39 *
40 * The data pointer can be null.
41 */
42bool libceph_compatible(void *data)
43{
44 return true;
45}
46EXPORT_SYMBOL(libceph_compatible);
31 47
32/* 48/*
33 * find filename portion of a path (/foo/bar/baz -> baz) 49 * find filename portion of a path (/foo/bar/baz -> baz)
@@ -590,10 +606,8 @@ static int __init init_ceph_lib(void)
590 if (ret < 0) 606 if (ret < 0)
591 goto out_crypto; 607 goto out_crypto;
592 608
593 pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", 609 pr_info("loaded (mon/osd proto %d/%d)\n",
594 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, 610 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
595 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
596 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
597 611
598 return 0; 612 return 0;
599 613
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 3fbda04de29c..1348df96fe15 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -21,9 +21,15 @@ const char *ceph_osd_op_name(int op)
21 switch (op) { 21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read"; 22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat"; 23 case CEPH_OSD_OP_STAT: return "stat";
24 case CEPH_OSD_OP_MAPEXT: return "mapext";
25 case CEPH_OSD_OP_SPARSE_READ: return "sparse-read";
26 case CEPH_OSD_OP_NOTIFY: return "notify";
27 case CEPH_OSD_OP_NOTIFY_ACK: return "notify-ack";
28 case CEPH_OSD_OP_ASSERT_VER: return "assert-version";
24 29
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; 30 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26 31
32 case CEPH_OSD_OP_CREATE: return "create";
27 case CEPH_OSD_OP_WRITE: return "write"; 33 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete"; 34 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate"; 35 case CEPH_OSD_OP_TRUNCATE: return "truncate";
@@ -39,6 +45,11 @@ const char *ceph_osd_op_name(int op)
39 case CEPH_OSD_OP_TMAPUP: return "tmapup"; 45 case CEPH_OSD_OP_TMAPUP: return "tmapup";
40 case CEPH_OSD_OP_TMAPGET: return "tmapget"; 46 case CEPH_OSD_OP_TMAPGET: return "tmapget";
41 case CEPH_OSD_OP_TMAPPUT: return "tmapput"; 47 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
48 case CEPH_OSD_OP_WATCH: return "watch";
49
50 case CEPH_OSD_OP_CLONERANGE: return "clonerange";
51 case CEPH_OSD_OP_ASSERT_SRC_VERSION: return "assert-src-version";
52 case CEPH_OSD_OP_SRC_CMPXATTR: return "src-cmpxattr";
42 53
43 case CEPH_OSD_OP_GETXATTR: return "getxattr"; 54 case CEPH_OSD_OP_GETXATTR: return "getxattr";
44 case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; 55 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
@@ -53,6 +64,10 @@ const char *ceph_osd_op_name(int op)
53 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; 64 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
54 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; 65 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
55 case CEPH_OSD_OP_SCRUB: return "scrub"; 66 case CEPH_OSD_OP_SCRUB: return "scrub";
67 case CEPH_OSD_OP_SCRUB_RESERVE: return "scrub-reserve";
68 case CEPH_OSD_OP_SCRUB_UNRESERVE: return "scrub-unreserve";
69 case CEPH_OSD_OP_SCRUB_STOP: return "scrub-stop";
70 case CEPH_OSD_OP_SCRUB_MAP: return "scrub-map";
56 71
57 case CEPH_OSD_OP_WRLOCK: return "wrlock"; 72 case CEPH_OSD_OP_WRLOCK: return "wrlock";
58 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; 73 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
@@ -64,10 +79,34 @@ const char *ceph_osd_op_name(int op)
64 case CEPH_OSD_OP_CALL: return "call"; 79 case CEPH_OSD_OP_CALL: return "call";
65 80
66 case CEPH_OSD_OP_PGLS: return "pgls"; 81 case CEPH_OSD_OP_PGLS: return "pgls";
82 case CEPH_OSD_OP_PGLS_FILTER: return "pgls-filter";
83 case CEPH_OSD_OP_OMAPGETKEYS: return "omap-get-keys";
84 case CEPH_OSD_OP_OMAPGETVALS: return "omap-get-vals";
85 case CEPH_OSD_OP_OMAPGETHEADER: return "omap-get-header";
86 case CEPH_OSD_OP_OMAPGETVALSBYKEYS: return "omap-get-vals-by-keys";
87 case CEPH_OSD_OP_OMAPSETVALS: return "omap-set-vals";
88 case CEPH_OSD_OP_OMAPSETHEADER: return "omap-set-header";
89 case CEPH_OSD_OP_OMAPCLEAR: return "omap-clear";
90 case CEPH_OSD_OP_OMAPRMKEYS: return "omap-rm-keys";
67 } 91 }
68 return "???"; 92 return "???";
69} 93}
70 94
95const char *ceph_osd_state_name(int s)
96{
97 switch (s) {
98 case CEPH_OSD_EXISTS:
99 return "exists";
100 case CEPH_OSD_UP:
101 return "up";
102 case CEPH_OSD_AUTOOUT:
103 return "autoout";
104 case CEPH_OSD_NEW:
105 return "new";
106 default:
107 return "???";
108 }
109}
71 110
72const char *ceph_pool_op_name(int op) 111const char *ceph_pool_op_name(int op)
73{ 112{
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 35fce755ce10..cbd06a91941c 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -287,6 +287,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
287 * @outpos: our position in that vector 287 * @outpos: our position in that vector
288 * @firstn: true if choosing "first n" items, false if choosing "indep" 288 * @firstn: true if choosing "first n" items, false if choosing "indep"
289 * @recurse_to_leaf: true if we want one device under each item of given type 289 * @recurse_to_leaf: true if we want one device under each item of given type
290 * @descend_once: true if we should only try one descent before giving up
290 * @out2: second output vector for leaf items (if @recurse_to_leaf) 291 * @out2: second output vector for leaf items (if @recurse_to_leaf)
291 */ 292 */
292static int crush_choose(const struct crush_map *map, 293static int crush_choose(const struct crush_map *map,
@@ -295,7 +296,7 @@ static int crush_choose(const struct crush_map *map,
295 int x, int numrep, int type, 296 int x, int numrep, int type,
296 int *out, int outpos, 297 int *out, int outpos,
297 int firstn, int recurse_to_leaf, 298 int firstn, int recurse_to_leaf,
298 int *out2) 299 int descend_once, int *out2)
299{ 300{
300 int rep; 301 int rep;
301 unsigned int ftotal, flocal; 302 unsigned int ftotal, flocal;
@@ -391,7 +392,7 @@ static int crush_choose(const struct crush_map *map,
391 } 392 }
392 393
393 reject = 0; 394 reject = 0;
394 if (recurse_to_leaf) { 395 if (!collide && recurse_to_leaf) {
395 if (item < 0) { 396 if (item < 0) {
396 if (crush_choose(map, 397 if (crush_choose(map,
397 map->buckets[-1-item], 398 map->buckets[-1-item],
@@ -399,6 +400,7 @@ static int crush_choose(const struct crush_map *map,
399 x, outpos+1, 0, 400 x, outpos+1, 0,
400 out2, outpos, 401 out2, outpos,
401 firstn, 0, 402 firstn, 0,
403 map->chooseleaf_descend_once,
402 NULL) <= outpos) 404 NULL) <= outpos)
403 /* didn't get leaf */ 405 /* didn't get leaf */
404 reject = 1; 406 reject = 1;
@@ -422,7 +424,10 @@ reject:
422 ftotal++; 424 ftotal++;
423 flocal++; 425 flocal++;
424 426
425 if (collide && flocal <= map->choose_local_tries) 427 if (reject && descend_once)
428 /* let outer call try again */
429 skip_rep = 1;
430 else if (collide && flocal <= map->choose_local_tries)
426 /* retry locally a few times */ 431 /* retry locally a few times */
427 retry_bucket = 1; 432 retry_bucket = 1;
428 else if (map->choose_local_fallback_tries > 0 && 433 else if (map->choose_local_fallback_tries > 0 &&
@@ -485,6 +490,7 @@ int crush_do_rule(const struct crush_map *map,
485 int i, j; 490 int i, j;
486 int numrep; 491 int numrep;
487 int firstn; 492 int firstn;
493 const int descend_once = 0;
488 494
489 if ((__u32)ruleno >= map->max_rules) { 495 if ((__u32)ruleno >= map->max_rules) {
490 dprintk(" bad ruleno %d\n", ruleno); 496 dprintk(" bad ruleno %d\n", ruleno);
@@ -544,7 +550,8 @@ int crush_do_rule(const struct crush_map *map,
544 curstep->arg2, 550 curstep->arg2,
545 o+osize, j, 551 o+osize, j,
546 firstn, 552 firstn,
547 recurse_to_leaf, c+osize); 553 recurse_to_leaf,
554 descend_once, c+osize);
548 } 555 }
549 556
550 if (recurse_to_leaf) 557 if (recurse_to_leaf)
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index af14cb425164..6e7a236525b6 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -423,7 +423,8 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
423 } 423 }
424} 424}
425 425
426int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep) 426static int ceph_key_instantiate(struct key *key,
427 struct key_preparsed_payload *prep)
427{ 428{
428 struct ceph_crypto_key *ckey; 429 struct ceph_crypto_key *ckey;
429 size_t datalen = prep->datalen; 430 size_t datalen = prep->datalen;
@@ -458,12 +459,12 @@ err:
458 return ret; 459 return ret;
459} 460}
460 461
461int ceph_key_match(const struct key *key, const void *description) 462static int ceph_key_match(const struct key *key, const void *description)
462{ 463{
463 return strcmp(key->description, description) == 0; 464 return strcmp(key->description, description) == 0;
464} 465}
465 466
466void ceph_key_destroy(struct key *key) { 467static void ceph_key_destroy(struct key *key) {
467 struct ceph_crypto_key *ckey = key->payload.data; 468 struct ceph_crypto_key *ckey = key->payload.data;
468 469
469 ceph_crypto_key_destroy(ckey); 470 ceph_crypto_key_destroy(ckey);
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 38b5dc1823d4..00d051f4894e 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -66,9 +66,9 @@ static int osdmap_show(struct seq_file *s, void *p)
66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { 66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
67 struct ceph_pg_pool_info *pool = 67 struct ceph_pg_pool_info *pool =
68 rb_entry(n, struct ceph_pg_pool_info, node); 68 rb_entry(n, struct ceph_pg_pool_info, node);
69 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", 69 seq_printf(s, "pg_pool %llu pg_num %d / %d\n",
70 pool->id, pool->v.pg_num, pool->pg_num_mask, 70 (unsigned long long)pool->id, pool->pg_num,
71 pool->v.lpg_num, pool->lpg_num_mask); 71 pool->pg_num_mask);
72 } 72 }
73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) { 73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
74 struct ceph_entity_addr *addr = 74 struct ceph_entity_addr *addr =
@@ -123,26 +123,16 @@ static int osdc_show(struct seq_file *s, void *pp)
123 mutex_lock(&osdc->request_mutex); 123 mutex_lock(&osdc->request_mutex);
124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { 124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
125 struct ceph_osd_request *req; 125 struct ceph_osd_request *req;
126 struct ceph_osd_request_head *head; 126 int opcode;
127 struct ceph_osd_op *op;
128 int num_ops;
129 int opcode, olen;
130 int i; 127 int i;
131 128
132 req = rb_entry(p, struct ceph_osd_request, r_node); 129 req = rb_entry(p, struct ceph_osd_request, r_node);
133 130
134 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, 131 seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
135 req->r_osd ? req->r_osd->o_osd : -1, 132 req->r_osd ? req->r_osd->o_osd : -1,
136 le32_to_cpu(req->r_pgid.pool), 133 req->r_pgid.pool, req->r_pgid.seed);
137 le16_to_cpu(req->r_pgid.ps));
138 134
139 head = req->r_request->front.iov_base; 135 seq_printf(s, "%.*s", req->r_oid_len, req->r_oid);
140 op = (void *)(head + 1);
141
142 num_ops = le16_to_cpu(head->num_ops);
143 olen = le32_to_cpu(head->object_len);
144 seq_printf(s, "%.*s", olen,
145 (const char *)(head->ops + num_ops));
146 136
147 if (req->r_reassert_version.epoch) 137 if (req->r_reassert_version.epoch)
148 seq_printf(s, "\t%u'%llu", 138 seq_printf(s, "\t%u'%llu",
@@ -151,10 +141,9 @@ static int osdc_show(struct seq_file *s, void *pp)
151 else 141 else
152 seq_printf(s, "\t"); 142 seq_printf(s, "\t");
153 143
154 for (i = 0; i < num_ops; i++) { 144 for (i = 0; i < req->r_num_ops; i++) {
155 opcode = le16_to_cpu(op->op); 145 opcode = le16_to_cpu(req->r_request_ops[i].op);
156 seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); 146 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
157 op++;
158 } 147 }
159 148
160 seq_printf(s, "\n"); 149 seq_printf(s, "\n");
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 5ccf87ed8d68..2c0669fb54e3 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -9,8 +9,9 @@
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/socket.h> 10#include <linux/socket.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#ifdef CONFIG_BLOCK
12#include <linux/bio.h> 13#include <linux/bio.h>
13#include <linux/blkdev.h> 14#endif /* CONFIG_BLOCK */
14#include <linux/dns_resolver.h> 15#include <linux/dns_resolver.h>
15#include <net/tcp.h> 16#include <net/tcp.h>
16 17
@@ -97,6 +98,57 @@
97#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ 98#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */
98#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ 99#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */
99 100
101static bool con_flag_valid(unsigned long con_flag)
102{
103 switch (con_flag) {
104 case CON_FLAG_LOSSYTX:
105 case CON_FLAG_KEEPALIVE_PENDING:
106 case CON_FLAG_WRITE_PENDING:
107 case CON_FLAG_SOCK_CLOSED:
108 case CON_FLAG_BACKOFF:
109 return true;
110 default:
111 return false;
112 }
113}
114
115static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
116{
117 BUG_ON(!con_flag_valid(con_flag));
118
119 clear_bit(con_flag, &con->flags);
120}
121
122static void con_flag_set(struct ceph_connection *con, unsigned long con_flag)
123{
124 BUG_ON(!con_flag_valid(con_flag));
125
126 set_bit(con_flag, &con->flags);
127}
128
129static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag)
130{
131 BUG_ON(!con_flag_valid(con_flag));
132
133 return test_bit(con_flag, &con->flags);
134}
135
136static bool con_flag_test_and_clear(struct ceph_connection *con,
137 unsigned long con_flag)
138{
139 BUG_ON(!con_flag_valid(con_flag));
140
141 return test_and_clear_bit(con_flag, &con->flags);
142}
143
144static bool con_flag_test_and_set(struct ceph_connection *con,
145 unsigned long con_flag)
146{
147 BUG_ON(!con_flag_valid(con_flag));
148
149 return test_and_set_bit(con_flag, &con->flags);
150}
151
100/* static tag bytes (protocol control messages) */ 152/* static tag bytes (protocol control messages) */
101static char tag_msg = CEPH_MSGR_TAG_MSG; 153static char tag_msg = CEPH_MSGR_TAG_MSG;
102static char tag_ack = CEPH_MSGR_TAG_ACK; 154static char tag_ack = CEPH_MSGR_TAG_ACK;
@@ -114,7 +166,7 @@ static struct lock_class_key socket_class;
114 166
115static void queue_con(struct ceph_connection *con); 167static void queue_con(struct ceph_connection *con);
116static void con_work(struct work_struct *); 168static void con_work(struct work_struct *);
117static void ceph_fault(struct ceph_connection *con); 169static void con_fault(struct ceph_connection *con);
118 170
119/* 171/*
120 * Nicely render a sockaddr as a string. An array of formatted 172 * Nicely render a sockaddr as a string. An array of formatted
@@ -171,7 +223,7 @@ static void encode_my_addr(struct ceph_messenger *msgr)
171 */ 223 */
172static struct workqueue_struct *ceph_msgr_wq; 224static struct workqueue_struct *ceph_msgr_wq;
173 225
174void _ceph_msgr_exit(void) 226static void _ceph_msgr_exit(void)
175{ 227{
176 if (ceph_msgr_wq) { 228 if (ceph_msgr_wq) {
177 destroy_workqueue(ceph_msgr_wq); 229 destroy_workqueue(ceph_msgr_wq);
@@ -308,7 +360,7 @@ static void ceph_sock_write_space(struct sock *sk)
308 * buffer. See net/ipv4/tcp_input.c:tcp_check_space() 360 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
309 * and net/core/stream.c:sk_stream_write_space(). 361 * and net/core/stream.c:sk_stream_write_space().
310 */ 362 */
311 if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) { 363 if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) {
312 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { 364 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
313 dout("%s %p queueing write work\n", __func__, con); 365 dout("%s %p queueing write work\n", __func__, con);
314 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 366 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -333,7 +385,7 @@ static void ceph_sock_state_change(struct sock *sk)
333 case TCP_CLOSE_WAIT: 385 case TCP_CLOSE_WAIT:
334 dout("%s TCP_CLOSE_WAIT\n", __func__); 386 dout("%s TCP_CLOSE_WAIT\n", __func__);
335 con_sock_state_closing(con); 387 con_sock_state_closing(con);
336 set_bit(CON_FLAG_SOCK_CLOSED, &con->flags); 388 con_flag_set(con, CON_FLAG_SOCK_CLOSED);
337 queue_con(con); 389 queue_con(con);
338 break; 390 break;
339 case TCP_ESTABLISHED: 391 case TCP_ESTABLISHED:
@@ -474,7 +526,7 @@ static int con_close_socket(struct ceph_connection *con)
474 * received a socket close event before we had the chance to 526 * received a socket close event before we had the chance to
475 * shut the socket down. 527 * shut the socket down.
476 */ 528 */
477 clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags); 529 con_flag_clear(con, CON_FLAG_SOCK_CLOSED);
478 530
479 con_sock_state_closed(con); 531 con_sock_state_closed(con);
480 return rc; 532 return rc;
@@ -538,11 +590,10 @@ void ceph_con_close(struct ceph_connection *con)
538 ceph_pr_addr(&con->peer_addr.in_addr)); 590 ceph_pr_addr(&con->peer_addr.in_addr));
539 con->state = CON_STATE_CLOSED; 591 con->state = CON_STATE_CLOSED;
540 592
541 clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */ 593 con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */
542 clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); 594 con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING);
543 clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); 595 con_flag_clear(con, CON_FLAG_WRITE_PENDING);
544 clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); 596 con_flag_clear(con, CON_FLAG_BACKOFF);
545 clear_bit(CON_FLAG_BACKOFF, &con->flags);
546 597
547 reset_connection(con); 598 reset_connection(con);
548 con->peer_global_seq = 0; 599 con->peer_global_seq = 0;
@@ -798,7 +849,7 @@ static void prepare_write_message(struct ceph_connection *con)
798 /* no, queue up footer too and be done */ 849 /* no, queue up footer too and be done */
799 prepare_write_message_footer(con); 850 prepare_write_message_footer(con);
800 851
801 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 852 con_flag_set(con, CON_FLAG_WRITE_PENDING);
802} 853}
803 854
804/* 855/*
@@ -819,7 +870,7 @@ static void prepare_write_ack(struct ceph_connection *con)
819 &con->out_temp_ack); 870 &con->out_temp_ack);
820 871
821 con->out_more = 1; /* more will follow.. eventually.. */ 872 con->out_more = 1; /* more will follow.. eventually.. */
822 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 873 con_flag_set(con, CON_FLAG_WRITE_PENDING);
823} 874}
824 875
825/* 876/*
@@ -830,7 +881,7 @@ static void prepare_write_keepalive(struct ceph_connection *con)
830 dout("prepare_write_keepalive %p\n", con); 881 dout("prepare_write_keepalive %p\n", con);
831 con_out_kvec_reset(con); 882 con_out_kvec_reset(con);
832 con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); 883 con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
833 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 884 con_flag_set(con, CON_FLAG_WRITE_PENDING);
834} 885}
835 886
836/* 887/*
@@ -873,7 +924,7 @@ static void prepare_write_banner(struct ceph_connection *con)
873 &con->msgr->my_enc_addr); 924 &con->msgr->my_enc_addr);
874 925
875 con->out_more = 0; 926 con->out_more = 0;
876 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 927 con_flag_set(con, CON_FLAG_WRITE_PENDING);
877} 928}
878 929
879static int prepare_write_connect(struct ceph_connection *con) 930static int prepare_write_connect(struct ceph_connection *con)
@@ -923,7 +974,7 @@ static int prepare_write_connect(struct ceph_connection *con)
923 auth->authorizer_buf); 974 auth->authorizer_buf);
924 975
925 con->out_more = 0; 976 con->out_more = 0;
926 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 977 con_flag_set(con, CON_FLAG_WRITE_PENDING);
927 978
928 return 0; 979 return 0;
929} 980}
@@ -1643,7 +1694,7 @@ static int process_connect(struct ceph_connection *con)
1643 le32_to_cpu(con->in_reply.connect_seq)); 1694 le32_to_cpu(con->in_reply.connect_seq));
1644 1695
1645 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) 1696 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1646 set_bit(CON_FLAG_LOSSYTX, &con->flags); 1697 con_flag_set(con, CON_FLAG_LOSSYTX);
1647 1698
1648 con->delay = 0; /* reset backoff memory */ 1699 con->delay = 0; /* reset backoff memory */
1649 1700
@@ -2080,15 +2131,14 @@ do_next:
2080 prepare_write_ack(con); 2131 prepare_write_ack(con);
2081 goto more; 2132 goto more;
2082 } 2133 }
2083 if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING, 2134 if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
2084 &con->flags)) {
2085 prepare_write_keepalive(con); 2135 prepare_write_keepalive(con);
2086 goto more; 2136 goto more;
2087 } 2137 }
2088 } 2138 }
2089 2139
2090 /* Nothing to do! */ 2140 /* Nothing to do! */
2091 clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); 2141 con_flag_clear(con, CON_FLAG_WRITE_PENDING);
2092 dout("try_write nothing else to write.\n"); 2142 dout("try_write nothing else to write.\n");
2093 ret = 0; 2143 ret = 0;
2094out: 2144out:
@@ -2268,7 +2318,7 @@ static void queue_con(struct ceph_connection *con)
2268 2318
2269static bool con_sock_closed(struct ceph_connection *con) 2319static bool con_sock_closed(struct ceph_connection *con)
2270{ 2320{
2271 if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) 2321 if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
2272 return false; 2322 return false;
2273 2323
2274#define CASE(x) \ 2324#define CASE(x) \
@@ -2295,6 +2345,41 @@ static bool con_sock_closed(struct ceph_connection *con)
2295 return true; 2345 return true;
2296} 2346}
2297 2347
2348static bool con_backoff(struct ceph_connection *con)
2349{
2350 int ret;
2351
2352 if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
2353 return false;
2354
2355 ret = queue_con_delay(con, round_jiffies_relative(con->delay));
2356 if (ret) {
2357 dout("%s: con %p FAILED to back off %lu\n", __func__,
2358 con, con->delay);
2359 BUG_ON(ret == -ENOENT);
2360 con_flag_set(con, CON_FLAG_BACKOFF);
2361 }
2362
2363 return true;
2364}
2365
2366/* Finish fault handling; con->mutex must *not* be held here */
2367
2368static void con_fault_finish(struct ceph_connection *con)
2369{
2370 /*
2371 * in case we faulted due to authentication, invalidate our
2372 * current tickets so that we can get new ones.
2373 */
2374 if (con->auth_retry && con->ops->invalidate_authorizer) {
2375 dout("calling invalidate_authorizer()\n");
2376 con->ops->invalidate_authorizer(con);
2377 }
2378
2379 if (con->ops->fault)
2380 con->ops->fault(con);
2381}
2382
2298/* 2383/*
2299 * Do some work on a connection. Drop a connection ref when we're done. 2384 * Do some work on a connection. Drop a connection ref when we're done.
2300 */ 2385 */
@@ -2302,73 +2387,68 @@ static void con_work(struct work_struct *work)
2302{ 2387{
2303 struct ceph_connection *con = container_of(work, struct ceph_connection, 2388 struct ceph_connection *con = container_of(work, struct ceph_connection,
2304 work.work); 2389 work.work);
2305 int ret; 2390 bool fault;
2306 2391
2307 mutex_lock(&con->mutex); 2392 mutex_lock(&con->mutex);
2308restart: 2393 while (true) {
2309 if (con_sock_closed(con)) 2394 int ret;
2310 goto fault;
2311 2395
2312 if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { 2396 if ((fault = con_sock_closed(con))) {
2313 dout("con_work %p backing off\n", con); 2397 dout("%s: con %p SOCK_CLOSED\n", __func__, con);
2314 ret = queue_con_delay(con, round_jiffies_relative(con->delay)); 2398 break;
2315 if (ret) { 2399 }
2316 dout("con_work %p FAILED to back off %lu\n", con, 2400 if (con_backoff(con)) {
2317 con->delay); 2401 dout("%s: con %p BACKOFF\n", __func__, con);
2318 BUG_ON(ret == -ENOENT); 2402 break;
2319 set_bit(CON_FLAG_BACKOFF, &con->flags); 2403 }
2404 if (con->state == CON_STATE_STANDBY) {
2405 dout("%s: con %p STANDBY\n", __func__, con);
2406 break;
2407 }
2408 if (con->state == CON_STATE_CLOSED) {
2409 dout("%s: con %p CLOSED\n", __func__, con);
2410 BUG_ON(con->sock);
2411 break;
2412 }
2413 if (con->state == CON_STATE_PREOPEN) {
2414 dout("%s: con %p PREOPEN\n", __func__, con);
2415 BUG_ON(con->sock);
2320 } 2416 }
2321 goto done;
2322 }
2323 2417
2324 if (con->state == CON_STATE_STANDBY) { 2418 ret = try_read(con);
2325 dout("con_work %p STANDBY\n", con); 2419 if (ret < 0) {
2326 goto done; 2420 if (ret == -EAGAIN)
2327 } 2421 continue;
2328 if (con->state == CON_STATE_CLOSED) { 2422 con->error_msg = "socket error on read";
2329 dout("con_work %p CLOSED\n", con); 2423 fault = true;
2330 BUG_ON(con->sock); 2424 break;
2331 goto done; 2425 }
2332 }
2333 if (con->state == CON_STATE_PREOPEN) {
2334 dout("con_work OPENING\n");
2335 BUG_ON(con->sock);
2336 }
2337 2426
2338 ret = try_read(con); 2427 ret = try_write(con);
2339 if (ret == -EAGAIN) 2428 if (ret < 0) {
2340 goto restart; 2429 if (ret == -EAGAIN)
2341 if (ret < 0) { 2430 continue;
2342 con->error_msg = "socket error on read"; 2431 con->error_msg = "socket error on write";
2343 goto fault; 2432 fault = true;
2344 } 2433 }
2345 2434
2346 ret = try_write(con); 2435 break; /* If we make it to here, we're done */
2347 if (ret == -EAGAIN)
2348 goto restart;
2349 if (ret < 0) {
2350 con->error_msg = "socket error on write";
2351 goto fault;
2352 } 2436 }
2353 2437 if (fault)
2354done: 2438 con_fault(con);
2355 mutex_unlock(&con->mutex); 2439 mutex_unlock(&con->mutex);
2356done_unlocked:
2357 con->ops->put(con);
2358 return;
2359 2440
2360fault: 2441 if (fault)
2361 ceph_fault(con); /* error/fault path */ 2442 con_fault_finish(con);
2362 goto done_unlocked;
2363}
2364 2443
2444 con->ops->put(con);
2445}
2365 2446
2366/* 2447/*
2367 * Generic error/fault handler. A retry mechanism is used with 2448 * Generic error/fault handler. A retry mechanism is used with
2368 * exponential backoff 2449 * exponential backoff
2369 */ 2450 */
2370static void ceph_fault(struct ceph_connection *con) 2451static void con_fault(struct ceph_connection *con)
2371 __releases(con->mutex)
2372{ 2452{
2373 pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), 2453 pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
2374 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); 2454 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
@@ -2381,10 +2461,10 @@ static void ceph_fault(struct ceph_connection *con)
2381 2461
2382 con_close_socket(con); 2462 con_close_socket(con);
2383 2463
2384 if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) { 2464 if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
2385 dout("fault on LOSSYTX channel, marking CLOSED\n"); 2465 dout("fault on LOSSYTX channel, marking CLOSED\n");
2386 con->state = CON_STATE_CLOSED; 2466 con->state = CON_STATE_CLOSED;
2387 goto out_unlock; 2467 return;
2388 } 2468 }
2389 2469
2390 if (con->in_msg) { 2470 if (con->in_msg) {
@@ -2401,9 +2481,9 @@ static void ceph_fault(struct ceph_connection *con)
2401 /* If there are no messages queued or keepalive pending, place 2481 /* If there are no messages queued or keepalive pending, place
2402 * the connection in a STANDBY state */ 2482 * the connection in a STANDBY state */
2403 if (list_empty(&con->out_queue) && 2483 if (list_empty(&con->out_queue) &&
2404 !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) { 2484 !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
2405 dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); 2485 dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
2406 clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); 2486 con_flag_clear(con, CON_FLAG_WRITE_PENDING);
2407 con->state = CON_STATE_STANDBY; 2487 con->state = CON_STATE_STANDBY;
2408 } else { 2488 } else {
2409 /* retry after a delay. */ 2489 /* retry after a delay. */
@@ -2412,23 +2492,9 @@ static void ceph_fault(struct ceph_connection *con)
2412 con->delay = BASE_DELAY_INTERVAL; 2492 con->delay = BASE_DELAY_INTERVAL;
2413 else if (con->delay < MAX_DELAY_INTERVAL) 2493 else if (con->delay < MAX_DELAY_INTERVAL)
2414 con->delay *= 2; 2494 con->delay *= 2;
2415 set_bit(CON_FLAG_BACKOFF, &con->flags); 2495 con_flag_set(con, CON_FLAG_BACKOFF);
2416 queue_con(con); 2496 queue_con(con);
2417 } 2497 }
2418
2419out_unlock:
2420 mutex_unlock(&con->mutex);
2421 /*
2422 * in case we faulted due to authentication, invalidate our
2423 * current tickets so that we can get new ones.
2424 */
2425 if (con->auth_retry && con->ops->invalidate_authorizer) {
2426 dout("calling invalidate_authorizer()\n");
2427 con->ops->invalidate_authorizer(con);
2428 }
2429
2430 if (con->ops->fault)
2431 con->ops->fault(con);
2432} 2498}
2433 2499
2434 2500
@@ -2469,8 +2535,8 @@ static void clear_standby(struct ceph_connection *con)
2469 dout("clear_standby %p and ++connect_seq\n", con); 2535 dout("clear_standby %p and ++connect_seq\n", con);
2470 con->state = CON_STATE_PREOPEN; 2536 con->state = CON_STATE_PREOPEN;
2471 con->connect_seq++; 2537 con->connect_seq++;
2472 WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags)); 2538 WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
2473 WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)); 2539 WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
2474 } 2540 }
2475} 2541}
2476 2542
@@ -2511,7 +2577,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
2511 2577
2512 /* if there wasn't anything waiting to send before, queue 2578 /* if there wasn't anything waiting to send before, queue
2513 * new work */ 2579 * new work */
2514 if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) 2580 if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
2515 queue_con(con); 2581 queue_con(con);
2516} 2582}
2517EXPORT_SYMBOL(ceph_con_send); 2583EXPORT_SYMBOL(ceph_con_send);
@@ -2600,8 +2666,8 @@ void ceph_con_keepalive(struct ceph_connection *con)
2600 mutex_lock(&con->mutex); 2666 mutex_lock(&con->mutex);
2601 clear_standby(con); 2667 clear_standby(con);
2602 mutex_unlock(&con->mutex); 2668 mutex_unlock(&con->mutex);
2603 if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 && 2669 if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 &&
2604 test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) 2670 con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
2605 queue_con(con); 2671 queue_con(con);
2606} 2672}
2607EXPORT_SYMBOL(ceph_con_keepalive); 2673EXPORT_SYMBOL(ceph_con_keepalive);
@@ -2651,9 +2717,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
2651 m->page_alignment = 0; 2717 m->page_alignment = 0;
2652 m->pages = NULL; 2718 m->pages = NULL;
2653 m->pagelist = NULL; 2719 m->pagelist = NULL;
2720#ifdef CONFIG_BLOCK
2654 m->bio = NULL; 2721 m->bio = NULL;
2655 m->bio_iter = NULL; 2722 m->bio_iter = NULL;
2656 m->bio_seg = 0; 2723 m->bio_seg = 0;
2724#endif /* CONFIG_BLOCK */
2657 m->trail = NULL; 2725 m->trail = NULL;
2658 2726
2659 /* front */ 2727 /* front */
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 812eb3b46c1f..aef5b1062bee 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -697,7 +697,7 @@ int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
697 u32 pool, u64 snapid) 697 u32 pool, u64 snapid)
698{ 698{
699 return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 699 return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
700 pool, snapid, 0, 0); 700 pool, snapid, NULL, 0);
701 701
702} 702}
703 703
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index eb9a44478764..d730dd4d8eb2 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -23,7 +23,7 @@
23 23
24static const struct ceph_connection_operations osd_con_ops; 24static const struct ceph_connection_operations osd_con_ops;
25 25
26static void send_queued(struct ceph_osd_client *osdc); 26static void __send_queued(struct ceph_osd_client *osdc);
27static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); 27static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
28static void __register_request(struct ceph_osd_client *osdc, 28static void __register_request(struct ceph_osd_client *osdc,
29 struct ceph_osd_request *req); 29 struct ceph_osd_request *req);
@@ -32,64 +32,12 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
32static void __send_request(struct ceph_osd_client *osdc, 32static void __send_request(struct ceph_osd_client *osdc,
33 struct ceph_osd_request *req); 33 struct ceph_osd_request *req);
34 34
35static int op_needs_trail(int op)
36{
37 switch (op) {
38 case CEPH_OSD_OP_GETXATTR:
39 case CEPH_OSD_OP_SETXATTR:
40 case CEPH_OSD_OP_CMPXATTR:
41 case CEPH_OSD_OP_CALL:
42 case CEPH_OSD_OP_NOTIFY:
43 return 1;
44 default:
45 return 0;
46 }
47}
48
49static int op_has_extent(int op) 35static int op_has_extent(int op)
50{ 36{
51 return (op == CEPH_OSD_OP_READ || 37 return (op == CEPH_OSD_OP_READ ||
52 op == CEPH_OSD_OP_WRITE); 38 op == CEPH_OSD_OP_WRITE);
53} 39}
54 40
55int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
56 struct ceph_file_layout *layout,
57 u64 snapid,
58 u64 off, u64 *plen, u64 *bno,
59 struct ceph_osd_request *req,
60 struct ceph_osd_req_op *op)
61{
62 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
63 u64 orig_len = *plen;
64 u64 objoff, objlen; /* extent in object */
65 int r;
66
67 reqhead->snapid = cpu_to_le64(snapid);
68
69 /* object extent? */
70 r = ceph_calc_file_object_mapping(layout, off, plen, bno,
71 &objoff, &objlen);
72 if (r < 0)
73 return r;
74 if (*plen < orig_len)
75 dout(" skipping last %llu, final file extent %llu~%llu\n",
76 orig_len - *plen, off, *plen);
77
78 if (op_has_extent(op->op)) {
79 op->extent.offset = objoff;
80 op->extent.length = objlen;
81 }
82 req->r_num_pages = calc_pages_for(off, *plen);
83 req->r_page_alignment = off & ~PAGE_MASK;
84 if (op->op == CEPH_OSD_OP_WRITE)
85 op->payload_len = *plen;
86
87 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
88 *bno, objoff, objlen, req->r_num_pages);
89 return 0;
90}
91EXPORT_SYMBOL(ceph_calc_raw_layout);
92
93/* 41/*
94 * Implement client access to distributed object storage cluster. 42 * Implement client access to distributed object storage cluster.
95 * 43 *
@@ -115,20 +63,48 @@ EXPORT_SYMBOL(ceph_calc_raw_layout);
115 * 63 *
116 * fill osd op in request message. 64 * fill osd op in request message.
117 */ 65 */
118static int calc_layout(struct ceph_osd_client *osdc, 66static int calc_layout(struct ceph_vino vino,
119 struct ceph_vino vino,
120 struct ceph_file_layout *layout, 67 struct ceph_file_layout *layout,
121 u64 off, u64 *plen, 68 u64 off, u64 *plen,
122 struct ceph_osd_request *req, 69 struct ceph_osd_request *req,
123 struct ceph_osd_req_op *op) 70 struct ceph_osd_req_op *op)
124{ 71{
125 u64 bno; 72 u64 orig_len = *plen;
73 u64 bno = 0;
74 u64 objoff = 0;
75 u64 objlen = 0;
126 int r; 76 int r;
127 77
128 r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, 78 /* object extent? */
129 plen, &bno, req, op); 79 r = ceph_calc_file_object_mapping(layout, off, orig_len, &bno,
80 &objoff, &objlen);
130 if (r < 0) 81 if (r < 0)
131 return r; 82 return r;
83 if (objlen < orig_len) {
84 *plen = objlen;
85 dout(" skipping last %llu, final file extent %llu~%llu\n",
86 orig_len - *plen, off, *plen);
87 }
88
89 if (op_has_extent(op->op)) {
90 u32 osize = le32_to_cpu(layout->fl_object_size);
91 op->extent.offset = objoff;
92 op->extent.length = objlen;
93 if (op->extent.truncate_size <= off - objoff) {
94 op->extent.truncate_size = 0;
95 } else {
96 op->extent.truncate_size -= off - objoff;
97 if (op->extent.truncate_size > osize)
98 op->extent.truncate_size = osize;
99 }
100 }
101 req->r_num_pages = calc_pages_for(off, *plen);
102 req->r_page_alignment = off & ~PAGE_MASK;
103 if (op->op == CEPH_OSD_OP_WRITE)
104 op->payload_len = *plen;
105
106 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
107 bno, objoff, objlen, req->r_num_pages);
132 108
133 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); 109 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno);
134 req->r_oid_len = strlen(req->r_oid); 110 req->r_oid_len = strlen(req->r_oid);
@@ -148,25 +124,19 @@ void ceph_osdc_release_request(struct kref *kref)
148 if (req->r_request) 124 if (req->r_request)
149 ceph_msg_put(req->r_request); 125 ceph_msg_put(req->r_request);
150 if (req->r_con_filling_msg) { 126 if (req->r_con_filling_msg) {
151 dout("%s revoking pages %p from con %p\n", __func__, 127 dout("%s revoking msg %p from con %p\n", __func__,
152 req->r_pages, req->r_con_filling_msg); 128 req->r_reply, req->r_con_filling_msg);
153 ceph_msg_revoke_incoming(req->r_reply); 129 ceph_msg_revoke_incoming(req->r_reply);
154 req->r_con_filling_msg->ops->put(req->r_con_filling_msg); 130 req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
131 req->r_con_filling_msg = NULL;
155 } 132 }
156 if (req->r_reply) 133 if (req->r_reply)
157 ceph_msg_put(req->r_reply); 134 ceph_msg_put(req->r_reply);
158 if (req->r_own_pages) 135 if (req->r_own_pages)
159 ceph_release_page_vector(req->r_pages, 136 ceph_release_page_vector(req->r_pages,
160 req->r_num_pages); 137 req->r_num_pages);
161#ifdef CONFIG_BLOCK
162 if (req->r_bio)
163 bio_put(req->r_bio);
164#endif
165 ceph_put_snap_context(req->r_snapc); 138 ceph_put_snap_context(req->r_snapc);
166 if (req->r_trail) { 139 ceph_pagelist_release(&req->r_trail);
167 ceph_pagelist_release(req->r_trail);
168 kfree(req->r_trail);
169 }
170 if (req->r_mempool) 140 if (req->r_mempool)
171 mempool_free(req, req->r_osdc->req_mempool); 141 mempool_free(req, req->r_osdc->req_mempool);
172 else 142 else
@@ -174,37 +144,25 @@ void ceph_osdc_release_request(struct kref *kref)
174} 144}
175EXPORT_SYMBOL(ceph_osdc_release_request); 145EXPORT_SYMBOL(ceph_osdc_release_request);
176 146
177static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
178{
179 int i = 0;
180
181 if (needs_trail)
182 *needs_trail = 0;
183 while (ops[i].op) {
184 if (needs_trail && op_needs_trail(ops[i].op))
185 *needs_trail = 1;
186 i++;
187 }
188
189 return i;
190}
191
192struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 147struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
193 int flags,
194 struct ceph_snap_context *snapc, 148 struct ceph_snap_context *snapc,
195 struct ceph_osd_req_op *ops, 149 unsigned int num_ops,
196 bool use_mempool, 150 bool use_mempool,
197 gfp_t gfp_flags, 151 gfp_t gfp_flags)
198 struct page **pages,
199 struct bio *bio)
200{ 152{
201 struct ceph_osd_request *req; 153 struct ceph_osd_request *req;
202 struct ceph_msg *msg; 154 struct ceph_msg *msg;
203 int needs_trail; 155 size_t msg_size;
204 int num_op = get_num_ops(ops, &needs_trail); 156
205 size_t msg_size = sizeof(struct ceph_osd_request_head); 157 msg_size = 4 + 4 + 8 + 8 + 4+8;
206 158 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
207 msg_size += num_op*sizeof(struct ceph_osd_op); 159 msg_size += 1 + 8 + 4 + 4; /* pg_t */
160 msg_size += 4 + MAX_OBJ_NAME_SIZE;
161 msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
162 msg_size += 8; /* snapid */
163 msg_size += 8; /* snap_seq */
164 msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
165 msg_size += 4;
208 166
209 if (use_mempool) { 167 if (use_mempool) {
210 req = mempool_alloc(osdc->req_mempool, gfp_flags); 168 req = mempool_alloc(osdc->req_mempool, gfp_flags);
@@ -228,10 +186,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
228 INIT_LIST_HEAD(&req->r_req_lru_item); 186 INIT_LIST_HEAD(&req->r_req_lru_item);
229 INIT_LIST_HEAD(&req->r_osd_item); 187 INIT_LIST_HEAD(&req->r_osd_item);
230 188
231 req->r_flags = flags;
232
233 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
234
235 /* create reply message */ 189 /* create reply message */
236 if (use_mempool) 190 if (use_mempool)
237 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 191 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
@@ -244,20 +198,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
244 } 198 }
245 req->r_reply = msg; 199 req->r_reply = msg;
246 200
247 /* allocate space for the trailing data */ 201 ceph_pagelist_init(&req->r_trail);
248 if (needs_trail) {
249 req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
250 if (!req->r_trail) {
251 ceph_osdc_put_request(req);
252 return NULL;
253 }
254 ceph_pagelist_init(req->r_trail);
255 }
256 202
257 /* create request message; allow space for oid */ 203 /* create request message; allow space for oid */
258 msg_size += MAX_OBJ_NAME_SIZE;
259 if (snapc)
260 msg_size += sizeof(u64) * snapc->num_snaps;
261 if (use_mempool) 204 if (use_mempool)
262 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 205 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
263 else 206 else
@@ -270,13 +213,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
270 memset(msg->front.iov_base, 0, msg->front.iov_len); 213 memset(msg->front.iov_base, 0, msg->front.iov_len);
271 214
272 req->r_request = msg; 215 req->r_request = msg;
273 req->r_pages = pages;
274#ifdef CONFIG_BLOCK
275 if (bio) {
276 req->r_bio = bio;
277 bio_get(req->r_bio);
278 }
279#endif
280 216
281 return req; 217 return req;
282} 218}
@@ -289,6 +225,8 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
289 dst->op = cpu_to_le16(src->op); 225 dst->op = cpu_to_le16(src->op);
290 226
291 switch (src->op) { 227 switch (src->op) {
228 case CEPH_OSD_OP_STAT:
229 break;
292 case CEPH_OSD_OP_READ: 230 case CEPH_OSD_OP_READ:
293 case CEPH_OSD_OP_WRITE: 231 case CEPH_OSD_OP_WRITE:
294 dst->extent.offset = 232 dst->extent.offset =
@@ -300,52 +238,20 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
300 dst->extent.truncate_seq = 238 dst->extent.truncate_seq =
301 cpu_to_le32(src->extent.truncate_seq); 239 cpu_to_le32(src->extent.truncate_seq);
302 break; 240 break;
303
304 case CEPH_OSD_OP_GETXATTR:
305 case CEPH_OSD_OP_SETXATTR:
306 case CEPH_OSD_OP_CMPXATTR:
307 BUG_ON(!req->r_trail);
308
309 dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
310 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
311 dst->xattr.cmp_op = src->xattr.cmp_op;
312 dst->xattr.cmp_mode = src->xattr.cmp_mode;
313 ceph_pagelist_append(req->r_trail, src->xattr.name,
314 src->xattr.name_len);
315 ceph_pagelist_append(req->r_trail, src->xattr.val,
316 src->xattr.value_len);
317 break;
318 case CEPH_OSD_OP_CALL: 241 case CEPH_OSD_OP_CALL:
319 BUG_ON(!req->r_trail);
320
321 dst->cls.class_len = src->cls.class_len; 242 dst->cls.class_len = src->cls.class_len;
322 dst->cls.method_len = src->cls.method_len; 243 dst->cls.method_len = src->cls.method_len;
323 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); 244 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
324 245
325 ceph_pagelist_append(req->r_trail, src->cls.class_name, 246 ceph_pagelist_append(&req->r_trail, src->cls.class_name,
326 src->cls.class_len); 247 src->cls.class_len);
327 ceph_pagelist_append(req->r_trail, src->cls.method_name, 248 ceph_pagelist_append(&req->r_trail, src->cls.method_name,
328 src->cls.method_len); 249 src->cls.method_len);
329 ceph_pagelist_append(req->r_trail, src->cls.indata, 250 ceph_pagelist_append(&req->r_trail, src->cls.indata,
330 src->cls.indata_len); 251 src->cls.indata_len);
331 break; 252 break;
332 case CEPH_OSD_OP_ROLLBACK:
333 dst->snap.snapid = cpu_to_le64(src->snap.snapid);
334 break;
335 case CEPH_OSD_OP_STARTSYNC: 253 case CEPH_OSD_OP_STARTSYNC:
336 break; 254 break;
337 case CEPH_OSD_OP_NOTIFY:
338 {
339 __le32 prot_ver = cpu_to_le32(src->watch.prot_ver);
340 __le32 timeout = cpu_to_le32(src->watch.timeout);
341
342 BUG_ON(!req->r_trail);
343
344 ceph_pagelist_append(req->r_trail,
345 &prot_ver, sizeof(prot_ver));
346 ceph_pagelist_append(req->r_trail,
347 &timeout, sizeof(timeout));
348 }
349 case CEPH_OSD_OP_NOTIFY_ACK: 255 case CEPH_OSD_OP_NOTIFY_ACK:
350 case CEPH_OSD_OP_WATCH: 256 case CEPH_OSD_OP_WATCH:
351 dst->watch.cookie = cpu_to_le64(src->watch.cookie); 257 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
@@ -356,6 +262,64 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
356 pr_err("unrecognized osd opcode %d\n", dst->op); 262 pr_err("unrecognized osd opcode %d\n", dst->op);
357 WARN_ON(1); 263 WARN_ON(1);
358 break; 264 break;
265 case CEPH_OSD_OP_MAPEXT:
266 case CEPH_OSD_OP_MASKTRUNC:
267 case CEPH_OSD_OP_SPARSE_READ:
268 case CEPH_OSD_OP_NOTIFY:
269 case CEPH_OSD_OP_ASSERT_VER:
270 case CEPH_OSD_OP_WRITEFULL:
271 case CEPH_OSD_OP_TRUNCATE:
272 case CEPH_OSD_OP_ZERO:
273 case CEPH_OSD_OP_DELETE:
274 case CEPH_OSD_OP_APPEND:
275 case CEPH_OSD_OP_SETTRUNC:
276 case CEPH_OSD_OP_TRIMTRUNC:
277 case CEPH_OSD_OP_TMAPUP:
278 case CEPH_OSD_OP_TMAPPUT:
279 case CEPH_OSD_OP_TMAPGET:
280 case CEPH_OSD_OP_CREATE:
281 case CEPH_OSD_OP_ROLLBACK:
282 case CEPH_OSD_OP_OMAPGETKEYS:
283 case CEPH_OSD_OP_OMAPGETVALS:
284 case CEPH_OSD_OP_OMAPGETHEADER:
285 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
286 case CEPH_OSD_OP_MODE_RD:
287 case CEPH_OSD_OP_OMAPSETVALS:
288 case CEPH_OSD_OP_OMAPSETHEADER:
289 case CEPH_OSD_OP_OMAPCLEAR:
290 case CEPH_OSD_OP_OMAPRMKEYS:
291 case CEPH_OSD_OP_OMAP_CMP:
292 case CEPH_OSD_OP_CLONERANGE:
293 case CEPH_OSD_OP_ASSERT_SRC_VERSION:
294 case CEPH_OSD_OP_SRC_CMPXATTR:
295 case CEPH_OSD_OP_GETXATTR:
296 case CEPH_OSD_OP_GETXATTRS:
297 case CEPH_OSD_OP_CMPXATTR:
298 case CEPH_OSD_OP_SETXATTR:
299 case CEPH_OSD_OP_SETXATTRS:
300 case CEPH_OSD_OP_RESETXATTRS:
301 case CEPH_OSD_OP_RMXATTR:
302 case CEPH_OSD_OP_PULL:
303 case CEPH_OSD_OP_PUSH:
304 case CEPH_OSD_OP_BALANCEREADS:
305 case CEPH_OSD_OP_UNBALANCEREADS:
306 case CEPH_OSD_OP_SCRUB:
307 case CEPH_OSD_OP_SCRUB_RESERVE:
308 case CEPH_OSD_OP_SCRUB_UNRESERVE:
309 case CEPH_OSD_OP_SCRUB_STOP:
310 case CEPH_OSD_OP_SCRUB_MAP:
311 case CEPH_OSD_OP_WRLOCK:
312 case CEPH_OSD_OP_WRUNLOCK:
313 case CEPH_OSD_OP_RDLOCK:
314 case CEPH_OSD_OP_RDUNLOCK:
315 case CEPH_OSD_OP_UPLOCK:
316 case CEPH_OSD_OP_DNLOCK:
317 case CEPH_OSD_OP_PGLS:
318 case CEPH_OSD_OP_PGLS_FILTER:
319 pr_err("unsupported osd opcode %s\n",
320 ceph_osd_op_name(dst->op));
321 WARN_ON(1);
322 break;
359 } 323 }
360 dst->payload_len = cpu_to_le32(src->payload_len); 324 dst->payload_len = cpu_to_le32(src->payload_len);
361} 325}
@@ -365,75 +329,95 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
365 * 329 *
366 */ 330 */
367void ceph_osdc_build_request(struct ceph_osd_request *req, 331void ceph_osdc_build_request(struct ceph_osd_request *req,
368 u64 off, u64 *plen, 332 u64 off, u64 len, unsigned int num_ops,
369 struct ceph_osd_req_op *src_ops, 333 struct ceph_osd_req_op *src_ops,
370 struct ceph_snap_context *snapc, 334 struct ceph_snap_context *snapc, u64 snap_id,
371 struct timespec *mtime, 335 struct timespec *mtime)
372 const char *oid,
373 int oid_len)
374{ 336{
375 struct ceph_msg *msg = req->r_request; 337 struct ceph_msg *msg = req->r_request;
376 struct ceph_osd_request_head *head;
377 struct ceph_osd_req_op *src_op; 338 struct ceph_osd_req_op *src_op;
378 struct ceph_osd_op *op;
379 void *p; 339 void *p;
380 int num_op = get_num_ops(src_ops, NULL); 340 size_t msg_size;
381 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
382 int flags = req->r_flags; 341 int flags = req->r_flags;
383 u64 data_len = 0; 342 u64 data_len;
384 int i; 343 int i;
385 344
386 head = msg->front.iov_base; 345 req->r_num_ops = num_ops;
387 op = (void *)(head + 1); 346 req->r_snapid = snap_id;
388 p = (void *)(op + num_op);
389
390 req->r_snapc = ceph_get_snap_context(snapc); 347 req->r_snapc = ceph_get_snap_context(snapc);
391 348
392 head->client_inc = cpu_to_le32(1); /* always, for now. */ 349 /* encode request */
393 head->flags = cpu_to_le32(flags); 350 msg->hdr.version = cpu_to_le16(4);
394 if (flags & CEPH_OSD_FLAG_WRITE)
395 ceph_encode_timespec(&head->mtime, mtime);
396 head->num_ops = cpu_to_le16(num_op);
397
398
399 /* fill in oid */
400 head->object_len = cpu_to_le32(oid_len);
401 memcpy(p, oid, oid_len);
402 p += oid_len;
403 351
352 p = msg->front.iov_base;
353 ceph_encode_32(&p, 1); /* client_inc is always 1 */
354 req->r_request_osdmap_epoch = p;
355 p += 4;
356 req->r_request_flags = p;
357 p += 4;
358 if (req->r_flags & CEPH_OSD_FLAG_WRITE)
359 ceph_encode_timespec(p, mtime);
360 p += sizeof(struct ceph_timespec);
361 req->r_request_reassert_version = p;
362 p += sizeof(struct ceph_eversion); /* will get filled in */
363
364 /* oloc */
365 ceph_encode_8(&p, 4);
366 ceph_encode_8(&p, 4);
367 ceph_encode_32(&p, 8 + 4 + 4);
368 req->r_request_pool = p;
369 p += 8;
370 ceph_encode_32(&p, -1); /* preferred */
371 ceph_encode_32(&p, 0); /* key len */
372
373 ceph_encode_8(&p, 1);
374 req->r_request_pgid = p;
375 p += 8 + 4;
376 ceph_encode_32(&p, -1); /* preferred */
377
378 /* oid */
379 ceph_encode_32(&p, req->r_oid_len);
380 memcpy(p, req->r_oid, req->r_oid_len);
381 dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
382 p += req->r_oid_len;
383
384 /* ops */
385 ceph_encode_16(&p, num_ops);
404 src_op = src_ops; 386 src_op = src_ops;
405 while (src_op->op) { 387 req->r_request_ops = p;
406 osd_req_encode_op(req, op, src_op); 388 for (i = 0; i < num_ops; i++, src_op++) {
407 src_op++; 389 osd_req_encode_op(req, p, src_op);
408 op++; 390 p += sizeof(struct ceph_osd_op);
409 } 391 }
410 392
411 if (req->r_trail) 393 /* snaps */
412 data_len += req->r_trail->length; 394 ceph_encode_64(&p, req->r_snapid);
413 395 ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
414 if (snapc) { 396 ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
415 head->snap_seq = cpu_to_le64(snapc->seq); 397 if (req->r_snapc) {
416 head->num_snaps = cpu_to_le32(snapc->num_snaps);
417 for (i = 0; i < snapc->num_snaps; i++) { 398 for (i = 0; i < snapc->num_snaps; i++) {
418 put_unaligned_le64(snapc->snaps[i], p); 399 ceph_encode_64(&p, req->r_snapc->snaps[i]);
419 p += sizeof(u64);
420 } 400 }
421 } 401 }
422 402
403 req->r_request_attempts = p;
404 p += 4;
405
406 data_len = req->r_trail.length;
423 if (flags & CEPH_OSD_FLAG_WRITE) { 407 if (flags & CEPH_OSD_FLAG_WRITE) {
424 req->r_request->hdr.data_off = cpu_to_le16(off); 408 req->r_request->hdr.data_off = cpu_to_le16(off);
425 req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); 409 data_len += len;
426 } else if (data_len) {
427 req->r_request->hdr.data_off = 0;
428 req->r_request->hdr.data_len = cpu_to_le32(data_len);
429 } 410 }
430 411 req->r_request->hdr.data_len = cpu_to_le32(data_len);
431 req->r_request->page_alignment = req->r_page_alignment; 412 req->r_request->page_alignment = req->r_page_alignment;
432 413
433 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 414 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
434 msg_size = p - msg->front.iov_base; 415 msg_size = p - msg->front.iov_base;
435 msg->front.iov_len = msg_size; 416 msg->front.iov_len = msg_size;
436 msg->hdr.front_len = cpu_to_le32(msg_size); 417 msg->hdr.front_len = cpu_to_le32(msg_size);
418
419 dout("build_request msg_size was %d num_ops %d\n", (int)msg_size,
420 num_ops);
437 return; 421 return;
438} 422}
439EXPORT_SYMBOL(ceph_osdc_build_request); 423EXPORT_SYMBOL(ceph_osdc_build_request);
@@ -459,34 +443,33 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
459 u32 truncate_seq, 443 u32 truncate_seq,
460 u64 truncate_size, 444 u64 truncate_size,
461 struct timespec *mtime, 445 struct timespec *mtime,
462 bool use_mempool, int num_reply, 446 bool use_mempool,
463 int page_align) 447 int page_align)
464{ 448{
465 struct ceph_osd_req_op ops[3]; 449 struct ceph_osd_req_op ops[2];
466 struct ceph_osd_request *req; 450 struct ceph_osd_request *req;
451 unsigned int num_op = 1;
467 int r; 452 int r;
468 453
454 memset(&ops, 0, sizeof ops);
455
469 ops[0].op = opcode; 456 ops[0].op = opcode;
470 ops[0].extent.truncate_seq = truncate_seq; 457 ops[0].extent.truncate_seq = truncate_seq;
471 ops[0].extent.truncate_size = truncate_size; 458 ops[0].extent.truncate_size = truncate_size;
472 ops[0].payload_len = 0;
473 459
474 if (do_sync) { 460 if (do_sync) {
475 ops[1].op = CEPH_OSD_OP_STARTSYNC; 461 ops[1].op = CEPH_OSD_OP_STARTSYNC;
476 ops[1].payload_len = 0; 462 num_op++;
477 ops[2].op = 0; 463 }
478 } else 464
479 ops[1].op = 0; 465 req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool,
480 466 GFP_NOFS);
481 req = ceph_osdc_alloc_request(osdc, flags,
482 snapc, ops,
483 use_mempool,
484 GFP_NOFS, NULL, NULL);
485 if (!req) 467 if (!req)
486 return ERR_PTR(-ENOMEM); 468 return ERR_PTR(-ENOMEM);
469 req->r_flags = flags;
487 470
488 /* calculate max write size */ 471 /* calculate max write size */
489 r = calc_layout(osdc, vino, layout, off, plen, req, ops); 472 r = calc_layout(vino, layout, off, plen, req, ops);
490 if (r < 0) 473 if (r < 0)
491 return ERR_PTR(r); 474 return ERR_PTR(r);
492 req->r_file_layout = *layout; /* keep a copy */ 475 req->r_file_layout = *layout; /* keep a copy */
@@ -496,10 +479,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
496 req->r_num_pages = calc_pages_for(page_align, *plen); 479 req->r_num_pages = calc_pages_for(page_align, *plen);
497 req->r_page_alignment = page_align; 480 req->r_page_alignment = page_align;
498 481
499 ceph_osdc_build_request(req, off, plen, ops, 482 ceph_osdc_build_request(req, off, *plen, num_op, ops,
500 snapc, 483 snapc, vino.snap, mtime);
501 mtime,
502 req->r_oid, req->r_oid_len);
503 484
504 return req; 485 return req;
505} 486}
@@ -623,8 +604,8 @@ static void osd_reset(struct ceph_connection *con)
623 down_read(&osdc->map_sem); 604 down_read(&osdc->map_sem);
624 mutex_lock(&osdc->request_mutex); 605 mutex_lock(&osdc->request_mutex);
625 __kick_osd_requests(osdc, osd); 606 __kick_osd_requests(osdc, osd);
607 __send_queued(osdc);
626 mutex_unlock(&osdc->request_mutex); 608 mutex_unlock(&osdc->request_mutex);
627 send_queued(osdc);
628 up_read(&osdc->map_sem); 609 up_read(&osdc->map_sem);
629} 610}
630 611
@@ -739,31 +720,35 @@ static void remove_old_osds(struct ceph_osd_client *osdc)
739 */ 720 */
740static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 721static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
741{ 722{
742 struct ceph_osd_request *req; 723 struct ceph_entity_addr *peer_addr;
743 int ret = 0;
744 724
745 dout("__reset_osd %p osd%d\n", osd, osd->o_osd); 725 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
746 if (list_empty(&osd->o_requests) && 726 if (list_empty(&osd->o_requests) &&
747 list_empty(&osd->o_linger_requests)) { 727 list_empty(&osd->o_linger_requests)) {
748 __remove_osd(osdc, osd); 728 __remove_osd(osdc, osd);
749 ret = -ENODEV; 729
750 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], 730 return -ENODEV;
751 &osd->o_con.peer_addr, 731 }
752 sizeof(osd->o_con.peer_addr)) == 0 && 732
753 !ceph_con_opened(&osd->o_con)) { 733 peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
734 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
735 !ceph_con_opened(&osd->o_con)) {
736 struct ceph_osd_request *req;
737
754 dout(" osd addr hasn't changed and connection never opened," 738 dout(" osd addr hasn't changed and connection never opened,"
755 " letting msgr retry"); 739 " letting msgr retry");
756 /* touch each r_stamp for handle_timeout()'s benfit */ 740 /* touch each r_stamp for handle_timeout()'s benfit */
757 list_for_each_entry(req, &osd->o_requests, r_osd_item) 741 list_for_each_entry(req, &osd->o_requests, r_osd_item)
758 req->r_stamp = jiffies; 742 req->r_stamp = jiffies;
759 ret = -EAGAIN; 743
760 } else { 744 return -EAGAIN;
761 ceph_con_close(&osd->o_con);
762 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
763 &osdc->osdmap->osd_addr[osd->o_osd]);
764 osd->o_incarnation++;
765 } 745 }
766 return ret; 746
747 ceph_con_close(&osd->o_con);
748 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
749 osd->o_incarnation++;
750
751 return 0;
767} 752}
768 753
769static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) 754static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
@@ -961,20 +946,18 @@ EXPORT_SYMBOL(ceph_osdc_set_request_linger);
961static int __map_request(struct ceph_osd_client *osdc, 946static int __map_request(struct ceph_osd_client *osdc,
962 struct ceph_osd_request *req, int force_resend) 947 struct ceph_osd_request *req, int force_resend)
963{ 948{
964 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
965 struct ceph_pg pgid; 949 struct ceph_pg pgid;
966 int acting[CEPH_PG_MAX_SIZE]; 950 int acting[CEPH_PG_MAX_SIZE];
967 int o = -1, num = 0; 951 int o = -1, num = 0;
968 int err; 952 int err;
969 953
970 dout("map_request %p tid %lld\n", req, req->r_tid); 954 dout("map_request %p tid %lld\n", req, req->r_tid);
971 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, 955 err = ceph_calc_object_layout(&pgid, req->r_oid,
972 &req->r_file_layout, osdc->osdmap); 956 &req->r_file_layout, osdc->osdmap);
973 if (err) { 957 if (err) {
974 list_move(&req->r_req_lru_item, &osdc->req_notarget); 958 list_move(&req->r_req_lru_item, &osdc->req_notarget);
975 return err; 959 return err;
976 } 960 }
977 pgid = reqhead->layout.ol_pgid;
978 req->r_pgid = pgid; 961 req->r_pgid = pgid;
979 962
980 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); 963 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
@@ -991,8 +974,8 @@ static int __map_request(struct ceph_osd_client *osdc,
991 (req->r_osd == NULL && o == -1)) 974 (req->r_osd == NULL && o == -1))
992 return 0; /* no change */ 975 return 0; /* no change */
993 976
994 dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n", 977 dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
995 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, 978 req->r_tid, pgid.pool, pgid.seed, o,
996 req->r_osd ? req->r_osd->o_osd : -1); 979 req->r_osd ? req->r_osd->o_osd : -1);
997 980
998 /* record full pg acting set */ 981 /* record full pg acting set */
@@ -1041,15 +1024,22 @@ out:
1041static void __send_request(struct ceph_osd_client *osdc, 1024static void __send_request(struct ceph_osd_client *osdc,
1042 struct ceph_osd_request *req) 1025 struct ceph_osd_request *req)
1043{ 1026{
1044 struct ceph_osd_request_head *reqhead; 1027 void *p;
1045
1046 dout("send_request %p tid %llu to osd%d flags %d\n",
1047 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
1048 1028
1049 reqhead = req->r_request->front.iov_base; 1029 dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
1050 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); 1030 req, req->r_tid, req->r_osd->o_osd, req->r_flags,
1051 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ 1031 (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
1052 reqhead->reassert_version = req->r_reassert_version; 1032
1033 /* fill in message content that changes each time we send it */
1034 put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
1035 put_unaligned_le32(req->r_flags, req->r_request_flags);
1036 put_unaligned_le64(req->r_pgid.pool, req->r_request_pool);
1037 p = req->r_request_pgid;
1038 ceph_encode_64(&p, req->r_pgid.pool);
1039 ceph_encode_32(&p, req->r_pgid.seed);
1040 put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
1041 memcpy(req->r_request_reassert_version, &req->r_reassert_version,
1042 sizeof(req->r_reassert_version));
1053 1043
1054 req->r_stamp = jiffies; 1044 req->r_stamp = jiffies;
1055 list_move_tail(&req->r_req_lru_item, &osdc->req_lru); 1045 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
@@ -1062,16 +1052,13 @@ static void __send_request(struct ceph_osd_client *osdc,
1062/* 1052/*
1063 * Send any requests in the queue (req_unsent). 1053 * Send any requests in the queue (req_unsent).
1064 */ 1054 */
1065static void send_queued(struct ceph_osd_client *osdc) 1055static void __send_queued(struct ceph_osd_client *osdc)
1066{ 1056{
1067 struct ceph_osd_request *req, *tmp; 1057 struct ceph_osd_request *req, *tmp;
1068 1058
1069 dout("send_queued\n"); 1059 dout("__send_queued\n");
1070 mutex_lock(&osdc->request_mutex); 1060 list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
1071 list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) {
1072 __send_request(osdc, req); 1061 __send_request(osdc, req);
1073 }
1074 mutex_unlock(&osdc->request_mutex);
1075} 1062}
1076 1063
1077/* 1064/*
@@ -1123,8 +1110,8 @@ static void handle_timeout(struct work_struct *work)
1123 } 1110 }
1124 1111
1125 __schedule_osd_timeout(osdc); 1112 __schedule_osd_timeout(osdc);
1113 __send_queued(osdc);
1126 mutex_unlock(&osdc->request_mutex); 1114 mutex_unlock(&osdc->request_mutex);
1127 send_queued(osdc);
1128 up_read(&osdc->map_sem); 1115 up_read(&osdc->map_sem);
1129} 1116}
1130 1117
@@ -1152,6 +1139,26 @@ static void complete_request(struct ceph_osd_request *req)
1152 complete_all(&req->r_safe_completion); /* fsync waiter */ 1139 complete_all(&req->r_safe_completion); /* fsync waiter */
1153} 1140}
1154 1141
1142static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid)
1143{
1144 __u8 v;
1145
1146 ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad);
1147 v = ceph_decode_8(p);
1148 if (v > 1) {
1149 pr_warning("do not understand pg encoding %d > 1", v);
1150 return -EINVAL;
1151 }
1152 pgid->pool = ceph_decode_64(p);
1153 pgid->seed = ceph_decode_32(p);
1154 *p += 4;
1155 return 0;
1156
1157bad:
1158 pr_warning("incomplete pg encoding");
1159 return -EINVAL;
1160}
1161
1155/* 1162/*
1156 * handle osd op reply. either call the callback if it is specified, 1163 * handle osd op reply. either call the callback if it is specified,
1157 * or do the completion to wake up the waiting thread. 1164 * or do the completion to wake up the waiting thread.
@@ -1159,22 +1166,42 @@ static void complete_request(struct ceph_osd_request *req)
1159static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, 1166static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1160 struct ceph_connection *con) 1167 struct ceph_connection *con)
1161{ 1168{
1162 struct ceph_osd_reply_head *rhead = msg->front.iov_base; 1169 void *p, *end;
1163 struct ceph_osd_request *req; 1170 struct ceph_osd_request *req;
1164 u64 tid; 1171 u64 tid;
1165 int numops, object_len, flags; 1172 int object_len;
1173 int numops, payload_len, flags;
1166 s32 result; 1174 s32 result;
1175 s32 retry_attempt;
1176 struct ceph_pg pg;
1177 int err;
1178 u32 reassert_epoch;
1179 u64 reassert_version;
1180 u32 osdmap_epoch;
1181 int i;
1167 1182
1168 tid = le64_to_cpu(msg->hdr.tid); 1183 tid = le64_to_cpu(msg->hdr.tid);
1169 if (msg->front.iov_len < sizeof(*rhead)) 1184 dout("handle_reply %p tid %llu\n", msg, tid);
1170 goto bad; 1185
1171 numops = le32_to_cpu(rhead->num_ops); 1186 p = msg->front.iov_base;
1172 object_len = le32_to_cpu(rhead->object_len); 1187 end = p + msg->front.iov_len;
1173 result = le32_to_cpu(rhead->result); 1188
1174 if (msg->front.iov_len != sizeof(*rhead) + object_len + 1189 ceph_decode_need(&p, end, 4, bad);
1175 numops * sizeof(struct ceph_osd_op)) 1190 object_len = ceph_decode_32(&p);
1191 ceph_decode_need(&p, end, object_len, bad);
1192 p += object_len;
1193
1194 err = __decode_pgid(&p, end, &pg);
1195 if (err)
1176 goto bad; 1196 goto bad;
1177 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); 1197
1198 ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
1199 flags = ceph_decode_64(&p);
1200 result = ceph_decode_32(&p);
1201 reassert_epoch = ceph_decode_32(&p);
1202 reassert_version = ceph_decode_64(&p);
1203 osdmap_epoch = ceph_decode_32(&p);
1204
1178 /* lookup */ 1205 /* lookup */
1179 mutex_lock(&osdc->request_mutex); 1206 mutex_lock(&osdc->request_mutex);
1180 req = __lookup_request(osdc, tid); 1207 req = __lookup_request(osdc, tid);
@@ -1184,7 +1211,38 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1184 return; 1211 return;
1185 } 1212 }
1186 ceph_osdc_get_request(req); 1213 ceph_osdc_get_request(req);
1187 flags = le32_to_cpu(rhead->flags); 1214
1215 dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
1216 req, result);
1217
1218 ceph_decode_need(&p, end, 4, bad);
1219 numops = ceph_decode_32(&p);
1220 if (numops > CEPH_OSD_MAX_OP)
1221 goto bad_put;
1222 if (numops != req->r_num_ops)
1223 goto bad_put;
1224 payload_len = 0;
1225 ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad);
1226 for (i = 0; i < numops; i++) {
1227 struct ceph_osd_op *op = p;
1228 int len;
1229
1230 len = le32_to_cpu(op->payload_len);
1231 req->r_reply_op_len[i] = len;
1232 dout(" op %d has %d bytes\n", i, len);
1233 payload_len += len;
1234 p += sizeof(*op);
1235 }
1236 if (payload_len != le32_to_cpu(msg->hdr.data_len)) {
1237 pr_warning("sum of op payload lens %d != data_len %d",
1238 payload_len, le32_to_cpu(msg->hdr.data_len));
1239 goto bad_put;
1240 }
1241
1242 ceph_decode_need(&p, end, 4 + numops * 4, bad);
1243 retry_attempt = ceph_decode_32(&p);
1244 for (i = 0; i < numops; i++)
1245 req->r_reply_op_result[i] = ceph_decode_32(&p);
1188 1246
1189 /* 1247 /*
1190 * if this connection filled our message, drop our reference now, to 1248 * if this connection filled our message, drop our reference now, to
@@ -1199,7 +1257,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1199 if (!req->r_got_reply) { 1257 if (!req->r_got_reply) {
1200 unsigned int bytes; 1258 unsigned int bytes;
1201 1259
1202 req->r_result = le32_to_cpu(rhead->result); 1260 req->r_result = result;
1203 bytes = le32_to_cpu(msg->hdr.data_len); 1261 bytes = le32_to_cpu(msg->hdr.data_len);
1204 dout("handle_reply result %d bytes %d\n", req->r_result, 1262 dout("handle_reply result %d bytes %d\n", req->r_result,
1205 bytes); 1263 bytes);
@@ -1207,7 +1265,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1207 req->r_result = bytes; 1265 req->r_result = bytes;
1208 1266
1209 /* in case this is a write and we need to replay, */ 1267 /* in case this is a write and we need to replay, */
1210 req->r_reassert_version = rhead->reassert_version; 1268 req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
1269 req->r_reassert_version.version = cpu_to_le64(reassert_version);
1211 1270
1212 req->r_got_reply = 1; 1271 req->r_got_reply = 1;
1213 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { 1272 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
@@ -1242,10 +1301,11 @@ done:
1242 ceph_osdc_put_request(req); 1301 ceph_osdc_put_request(req);
1243 return; 1302 return;
1244 1303
1304bad_put:
1305 ceph_osdc_put_request(req);
1245bad: 1306bad:
1246 pr_err("corrupt osd_op_reply got %d %d expected %d\n", 1307 pr_err("corrupt osd_op_reply got %d %d\n",
1247 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), 1308 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
1248 (int)sizeof(*rhead));
1249 ceph_msg_dump(msg); 1309 ceph_msg_dump(msg);
1250} 1310}
1251 1311
@@ -1462,7 +1522,9 @@ done:
1462 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) 1522 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
1463 ceph_monc_request_next_osdmap(&osdc->client->monc); 1523 ceph_monc_request_next_osdmap(&osdc->client->monc);
1464 1524
1465 send_queued(osdc); 1525 mutex_lock(&osdc->request_mutex);
1526 __send_queued(osdc);
1527 mutex_unlock(&osdc->request_mutex);
1466 up_read(&osdc->map_sem); 1528 up_read(&osdc->map_sem);
1467 wake_up_all(&osdc->client->auth_wq); 1529 wake_up_all(&osdc->client->auth_wq);
1468 return; 1530 return;
@@ -1556,8 +1618,7 @@ static void __remove_event(struct ceph_osd_event *event)
1556 1618
1557int ceph_osdc_create_event(struct ceph_osd_client *osdc, 1619int ceph_osdc_create_event(struct ceph_osd_client *osdc,
1558 void (*event_cb)(u64, u64, u8, void *), 1620 void (*event_cb)(u64, u64, u8, void *),
1559 int one_shot, void *data, 1621 void *data, struct ceph_osd_event **pevent)
1560 struct ceph_osd_event **pevent)
1561{ 1622{
1562 struct ceph_osd_event *event; 1623 struct ceph_osd_event *event;
1563 1624
@@ -1567,14 +1628,13 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc,
1567 1628
1568 dout("create_event %p\n", event); 1629 dout("create_event %p\n", event);
1569 event->cb = event_cb; 1630 event->cb = event_cb;
1570 event->one_shot = one_shot; 1631 event->one_shot = 0;
1571 event->data = data; 1632 event->data = data;
1572 event->osdc = osdc; 1633 event->osdc = osdc;
1573 INIT_LIST_HEAD(&event->osd_node); 1634 INIT_LIST_HEAD(&event->osd_node);
1574 RB_CLEAR_NODE(&event->node); 1635 RB_CLEAR_NODE(&event->node);
1575 kref_init(&event->kref); /* one ref for us */ 1636 kref_init(&event->kref); /* one ref for us */
1576 kref_get(&event->kref); /* one ref for the caller */ 1637 kref_get(&event->kref); /* one ref for the caller */
1577 init_completion(&event->completion);
1578 1638
1579 spin_lock(&osdc->event_lock); 1639 spin_lock(&osdc->event_lock);
1580 event->cookie = ++osdc->event_count; 1640 event->cookie = ++osdc->event_count;
@@ -1610,7 +1670,6 @@ static void do_event_work(struct work_struct *work)
1610 1670
1611 dout("do_event_work completing %p\n", event); 1671 dout("do_event_work completing %p\n", event);
1612 event->cb(ver, notify_id, opcode, event->data); 1672 event->cb(ver, notify_id, opcode, event->data);
1613 complete(&event->completion);
1614 dout("do_event_work completed %p\n", event); 1673 dout("do_event_work completed %p\n", event);
1615 ceph_osdc_put_event(event); 1674 ceph_osdc_put_event(event);
1616 kfree(event_work); 1675 kfree(event_work);
@@ -1620,7 +1679,8 @@ static void do_event_work(struct work_struct *work)
1620/* 1679/*
1621 * Process osd watch notifications 1680 * Process osd watch notifications
1622 */ 1681 */
1623void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) 1682static void handle_watch_notify(struct ceph_osd_client *osdc,
1683 struct ceph_msg *msg)
1624{ 1684{
1625 void *p, *end; 1685 void *p, *end;
1626 u8 proto_ver; 1686 u8 proto_ver;
@@ -1641,9 +1701,8 @@ void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1641 spin_lock(&osdc->event_lock); 1701 spin_lock(&osdc->event_lock);
1642 event = __find_event(osdc, cookie); 1702 event = __find_event(osdc, cookie);
1643 if (event) { 1703 if (event) {
1704 BUG_ON(event->one_shot);
1644 get_event(event); 1705 get_event(event);
1645 if (event->one_shot)
1646 __remove_event(event);
1647 } 1706 }
1648 spin_unlock(&osdc->event_lock); 1707 spin_unlock(&osdc->event_lock);
1649 dout("handle_watch_notify cookie %lld ver %lld event %p\n", 1708 dout("handle_watch_notify cookie %lld ver %lld event %p\n",
@@ -1668,7 +1727,6 @@ void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1668 return; 1727 return;
1669 1728
1670done_err: 1729done_err:
1671 complete(&event->completion);
1672 ceph_osdc_put_event(event); 1730 ceph_osdc_put_event(event);
1673 return; 1731 return;
1674 1732
@@ -1677,21 +1735,6 @@ bad:
1677 return; 1735 return;
1678} 1736}
1679 1737
1680int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout)
1681{
1682 int err;
1683
1684 dout("wait_event %p\n", event);
1685 err = wait_for_completion_interruptible_timeout(&event->completion,
1686 timeout * HZ);
1687 ceph_osdc_put_event(event);
1688 if (err > 0)
1689 err = 0;
1690 dout("wait_event %p returns %d\n", event, err);
1691 return err;
1692}
1693EXPORT_SYMBOL(ceph_osdc_wait_event);
1694
1695/* 1738/*
1696 * Register request, send initial attempt. 1739 * Register request, send initial attempt.
1697 */ 1740 */
@@ -1706,7 +1749,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1706#ifdef CONFIG_BLOCK 1749#ifdef CONFIG_BLOCK
1707 req->r_request->bio = req->r_bio; 1750 req->r_request->bio = req->r_bio;
1708#endif 1751#endif
1709 req->r_request->trail = req->r_trail; 1752 req->r_request->trail = &req->r_trail;
1710 1753
1711 register_request(osdc, req); 1754 register_request(osdc, req);
1712 1755
@@ -1865,7 +1908,6 @@ out_mempool:
1865out: 1908out:
1866 return err; 1909 return err;
1867} 1910}
1868EXPORT_SYMBOL(ceph_osdc_init);
1869 1911
1870void ceph_osdc_stop(struct ceph_osd_client *osdc) 1912void ceph_osdc_stop(struct ceph_osd_client *osdc)
1871{ 1913{
@@ -1882,7 +1924,6 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
1882 ceph_msgpool_destroy(&osdc->msgpool_op); 1924 ceph_msgpool_destroy(&osdc->msgpool_op);
1883 ceph_msgpool_destroy(&osdc->msgpool_op_reply); 1925 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1884} 1926}
1885EXPORT_SYMBOL(ceph_osdc_stop);
1886 1927
1887/* 1928/*
1888 * Read some contiguous pages. If we cross a stripe boundary, shorten 1929 * Read some contiguous pages. If we cross a stripe boundary, shorten
@@ -1902,7 +1943,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1902 req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1943 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1903 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1944 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1904 NULL, 0, truncate_seq, truncate_size, NULL, 1945 NULL, 0, truncate_seq, truncate_size, NULL,
1905 false, 1, page_align); 1946 false, page_align);
1906 if (IS_ERR(req)) 1947 if (IS_ERR(req))
1907 return PTR_ERR(req); 1948 return PTR_ERR(req);
1908 1949
@@ -1931,8 +1972,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1931 u64 off, u64 len, 1972 u64 off, u64 len,
1932 u32 truncate_seq, u64 truncate_size, 1973 u32 truncate_seq, u64 truncate_size,
1933 struct timespec *mtime, 1974 struct timespec *mtime,
1934 struct page **pages, int num_pages, 1975 struct page **pages, int num_pages)
1935 int flags, int do_sync, bool nofail)
1936{ 1976{
1937 struct ceph_osd_request *req; 1977 struct ceph_osd_request *req;
1938 int rc = 0; 1978 int rc = 0;
@@ -1941,11 +1981,10 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1941 BUG_ON(vino.snap != CEPH_NOSNAP); 1981 BUG_ON(vino.snap != CEPH_NOSNAP);
1942 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1982 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1943 CEPH_OSD_OP_WRITE, 1983 CEPH_OSD_OP_WRITE,
1944 flags | CEPH_OSD_FLAG_ONDISK | 1984 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
1945 CEPH_OSD_FLAG_WRITE, 1985 snapc, 0,
1946 snapc, do_sync,
1947 truncate_seq, truncate_size, mtime, 1986 truncate_seq, truncate_size, mtime,
1948 nofail, 1, page_align); 1987 true, page_align);
1949 if (IS_ERR(req)) 1988 if (IS_ERR(req))
1950 return PTR_ERR(req); 1989 return PTR_ERR(req);
1951 1990
@@ -1954,7 +1993,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1954 dout("writepages %llu~%llu (%d pages)\n", off, len, 1993 dout("writepages %llu~%llu (%d pages)\n", off, len,
1955 req->r_num_pages); 1994 req->r_num_pages);
1956 1995
1957 rc = ceph_osdc_start_request(osdc, req, nofail); 1996 rc = ceph_osdc_start_request(osdc, req, true);
1958 if (!rc) 1997 if (!rc)
1959 rc = ceph_osdc_wait_request(osdc, req); 1998 rc = ceph_osdc_wait_request(osdc, req);
1960 1999
@@ -2047,7 +2086,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2047 if (data_len > 0) { 2086 if (data_len > 0) {
2048 int want = calc_pages_for(req->r_page_alignment, data_len); 2087 int want = calc_pages_for(req->r_page_alignment, data_len);
2049 2088
2050 if (unlikely(req->r_num_pages < want)) { 2089 if (req->r_pages && unlikely(req->r_num_pages < want)) {
2051 pr_warning("tid %lld reply has %d bytes %d pages, we" 2090 pr_warning("tid %lld reply has %d bytes %d pages, we"
2052 " had only %d pages ready\n", tid, data_len, 2091 " had only %d pages ready\n", tid, data_len,
2053 want, req->r_num_pages); 2092 want, req->r_num_pages);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index de73214b5d26..69bc4bf89e3e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -13,26 +13,18 @@
13 13
14char *ceph_osdmap_state_str(char *str, int len, int state) 14char *ceph_osdmap_state_str(char *str, int len, int state)
15{ 15{
16 int flag = 0;
17
18 if (!len) 16 if (!len)
19 goto done; 17 return str;
20 18
21 *str = '\0'; 19 if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
22 if (state) { 20 snprintf(str, len, "exists, up");
23 if (state & CEPH_OSD_EXISTS) { 21 else if (state & CEPH_OSD_EXISTS)
24 snprintf(str, len, "exists"); 22 snprintf(str, len, "exists");
25 flag = 1; 23 else if (state & CEPH_OSD_UP)
26 } 24 snprintf(str, len, "up");
27 if (state & CEPH_OSD_UP) { 25 else
28 snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
29 "up");
30 flag = 1;
31 }
32 } else {
33 snprintf(str, len, "doesn't exist"); 26 snprintf(str, len, "doesn't exist");
34 } 27
35done:
36 return str; 28 return str;
37} 29}
38 30
@@ -53,13 +45,8 @@ static int calc_bits_of(unsigned int t)
53 */ 45 */
54static void calc_pg_masks(struct ceph_pg_pool_info *pi) 46static void calc_pg_masks(struct ceph_pg_pool_info *pi)
55{ 47{
56 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; 48 pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
57 pi->pgp_num_mask = 49 pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
58 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
59 pi->lpg_num_mask =
60 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
61 pi->lpgp_num_mask =
62 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
63} 50}
64 51
65/* 52/*
@@ -170,6 +157,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
170 c->choose_local_tries = 2; 157 c->choose_local_tries = 2;
171 c->choose_local_fallback_tries = 5; 158 c->choose_local_fallback_tries = 5;
172 c->choose_total_tries = 19; 159 c->choose_total_tries = 19;
160 c->chooseleaf_descend_once = 0;
173 161
174 ceph_decode_need(p, end, 4*sizeof(u32), bad); 162 ceph_decode_need(p, end, 4*sizeof(u32), bad);
175 magic = ceph_decode_32(p); 163 magic = ceph_decode_32(p);
@@ -336,6 +324,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
336 dout("crush decode tunable choose_total_tries = %d", 324 dout("crush decode tunable choose_total_tries = %d",
337 c->choose_total_tries); 325 c->choose_total_tries);
338 326
327 ceph_decode_need(p, end, sizeof(u32), done);
328 c->chooseleaf_descend_once = ceph_decode_32(p);
329 dout("crush decode tunable chooseleaf_descend_once = %d",
330 c->chooseleaf_descend_once);
331
339done: 332done:
340 dout("crush_decode success\n"); 333 dout("crush_decode success\n");
341 return c; 334 return c;
@@ -354,12 +347,13 @@ bad:
354 */ 347 */
355static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) 348static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
356{ 349{
357 u64 a = *(u64 *)&l; 350 if (l.pool < r.pool)
358 u64 b = *(u64 *)&r; 351 return -1;
359 352 if (l.pool > r.pool)
360 if (a < b) 353 return 1;
354 if (l.seed < r.seed)
361 return -1; 355 return -1;
362 if (a > b) 356 if (l.seed > r.seed)
363 return 1; 357 return 1;
364 return 0; 358 return 0;
365} 359}
@@ -405,8 +399,8 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
405 } else if (c > 0) { 399 } else if (c > 0) {
406 n = n->rb_right; 400 n = n->rb_right;
407 } else { 401 } else {
408 dout("__lookup_pg_mapping %llx got %p\n", 402 dout("__lookup_pg_mapping %lld.%x got %p\n",
409 *(u64 *)&pgid, pg); 403 pgid.pool, pgid.seed, pg);
410 return pg; 404 return pg;
411 } 405 }
412 } 406 }
@@ -418,12 +412,13 @@ static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
418 struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); 412 struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
419 413
420 if (pg) { 414 if (pg) {
421 dout("__remove_pg_mapping %llx %p\n", *(u64 *)&pgid, pg); 415 dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
416 pg);
422 rb_erase(&pg->node, root); 417 rb_erase(&pg->node, root);
423 kfree(pg); 418 kfree(pg);
424 return 0; 419 return 0;
425 } 420 }
426 dout("__remove_pg_mapping %llx dne\n", *(u64 *)&pgid); 421 dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
427 return -ENOENT; 422 return -ENOENT;
428} 423}
429 424
@@ -452,7 +447,7 @@ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
452 return 0; 447 return 0;
453} 448}
454 449
455static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) 450static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
456{ 451{
457 struct ceph_pg_pool_info *pi; 452 struct ceph_pg_pool_info *pi;
458 struct rb_node *n = root->rb_node; 453 struct rb_node *n = root->rb_node;
@@ -508,24 +503,57 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
508 503
509static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) 504static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
510{ 505{
511 unsigned int n, m; 506 u8 ev, cv;
507 unsigned len, num;
508 void *pool_end;
509
510 ceph_decode_need(p, end, 2 + 4, bad);
511 ev = ceph_decode_8(p); /* encoding version */
512 cv = ceph_decode_8(p); /* compat version */
513 if (ev < 5) {
514 pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
515 return -EINVAL;
516 }
517 if (cv > 7) {
518 pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv);
519 return -EINVAL;
520 }
521 len = ceph_decode_32(p);
522 ceph_decode_need(p, end, len, bad);
523 pool_end = *p + len;
512 524
513 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 525 pi->type = ceph_decode_8(p);
514 calc_pg_masks(pi); 526 pi->size = ceph_decode_8(p);
527 pi->crush_ruleset = ceph_decode_8(p);
528 pi->object_hash = ceph_decode_8(p);
529
530 pi->pg_num = ceph_decode_32(p);
531 pi->pgp_num = ceph_decode_32(p);
532
533 *p += 4 + 4; /* skip lpg* */
534 *p += 4; /* skip last_change */
535 *p += 8 + 4; /* skip snap_seq, snap_epoch */
515 536
516 /* num_snaps * snap_info_t */ 537 /* skip snaps */
517 n = le32_to_cpu(pi->v.num_snaps); 538 num = ceph_decode_32(p);
518 while (n--) { 539 while (num--) {
519 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + 540 *p += 8; /* snapid key */
520 sizeof(struct ceph_timespec), bad); 541 *p += 1 + 1; /* versions */
521 *p += sizeof(u64) + /* key */ 542 len = ceph_decode_32(p);
522 1 + sizeof(u64) + /* u8, snapid */ 543 *p += len;
523 sizeof(struct ceph_timespec);
524 m = ceph_decode_32(p); /* snap name */
525 *p += m;
526 } 544 }
527 545
528 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; 546 /* skip removed snaps */
547 num = ceph_decode_32(p);
548 *p += num * (8 + 8);
549
550 *p += 8; /* skip auid */
551 pi->flags = ceph_decode_64(p);
552
553 /* ignore the rest */
554
555 *p = pool_end;
556 calc_pg_masks(pi);
529 return 0; 557 return 0;
530 558
531bad: 559bad:
@@ -535,14 +563,15 @@ bad:
535static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 563static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
536{ 564{
537 struct ceph_pg_pool_info *pi; 565 struct ceph_pg_pool_info *pi;
538 u32 num, len, pool; 566 u32 num, len;
567 u64 pool;
539 568
540 ceph_decode_32_safe(p, end, num, bad); 569 ceph_decode_32_safe(p, end, num, bad);
541 dout(" %d pool names\n", num); 570 dout(" %d pool names\n", num);
542 while (num--) { 571 while (num--) {
543 ceph_decode_32_safe(p, end, pool, bad); 572 ceph_decode_64_safe(p, end, pool, bad);
544 ceph_decode_32_safe(p, end, len, bad); 573 ceph_decode_32_safe(p, end, len, bad);
545 dout(" pool %d len %d\n", pool, len); 574 dout(" pool %llu len %d\n", pool, len);
546 ceph_decode_need(p, end, len, bad); 575 ceph_decode_need(p, end, len, bad);
547 pi = __lookup_pg_pool(&map->pg_pools, pool); 576 pi = __lookup_pg_pool(&map->pg_pools, pool);
548 if (pi) { 577 if (pi) {
@@ -633,7 +662,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
633 struct ceph_osdmap *map; 662 struct ceph_osdmap *map;
634 u16 version; 663 u16 version;
635 u32 len, max, i; 664 u32 len, max, i;
636 u8 ev;
637 int err = -EINVAL; 665 int err = -EINVAL;
638 void *start = *p; 666 void *start = *p;
639 struct ceph_pg_pool_info *pi; 667 struct ceph_pg_pool_info *pi;
@@ -646,9 +674,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
646 map->pg_temp = RB_ROOT; 674 map->pg_temp = RB_ROOT;
647 675
648 ceph_decode_16_safe(p, end, version, bad); 676 ceph_decode_16_safe(p, end, version, bad);
649 if (version > CEPH_OSDMAP_VERSION) { 677 if (version > 6) {
650 pr_warning("got unknown v %d > %d of osdmap\n", version, 678 pr_warning("got unknown v %d > 6 of osdmap\n", version);
651 CEPH_OSDMAP_VERSION); 679 goto bad;
680 }
681 if (version < 6) {
682 pr_warning("got old v %d < 6 of osdmap\n", version);
652 goto bad; 683 goto bad;
653 } 684 }
654 685
@@ -660,20 +691,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
660 691
661 ceph_decode_32_safe(p, end, max, bad); 692 ceph_decode_32_safe(p, end, max, bad);
662 while (max--) { 693 while (max--) {
663 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); 694 ceph_decode_need(p, end, 8 + 2, bad);
664 err = -ENOMEM; 695 err = -ENOMEM;
665 pi = kzalloc(sizeof(*pi), GFP_NOFS); 696 pi = kzalloc(sizeof(*pi), GFP_NOFS);
666 if (!pi) 697 if (!pi)
667 goto bad; 698 goto bad;
668 pi->id = ceph_decode_32(p); 699 pi->id = ceph_decode_64(p);
669 err = -EINVAL;
670 ev = ceph_decode_8(p); /* encoding version */
671 if (ev > CEPH_PG_POOL_VERSION) {
672 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
673 ev, CEPH_PG_POOL_VERSION);
674 kfree(pi);
675 goto bad;
676 }
677 err = __decode_pool(p, end, pi); 700 err = __decode_pool(p, end, pi);
678 if (err < 0) { 701 if (err < 0) {
679 kfree(pi); 702 kfree(pi);
@@ -682,12 +705,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
682 __insert_pg_pool(&map->pg_pools, pi); 705 __insert_pg_pool(&map->pg_pools, pi);
683 } 706 }
684 707
685 if (version >= 5) { 708 err = __decode_pool_names(p, end, map);
686 err = __decode_pool_names(p, end, map); 709 if (err < 0) {
687 if (err < 0) { 710 dout("fail to decode pool names");
688 dout("fail to decode pool names"); 711 goto bad;
689 goto bad;
690 }
691 } 712 }
692 713
693 ceph_decode_32_safe(p, end, map->pool_max, bad); 714 ceph_decode_32_safe(p, end, map->pool_max, bad);
@@ -724,10 +745,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
724 for (i = 0; i < len; i++) { 745 for (i = 0; i < len; i++) {
725 int n, j; 746 int n, j;
726 struct ceph_pg pgid; 747 struct ceph_pg pgid;
748 struct ceph_pg_v1 pgid_v1;
727 struct ceph_pg_mapping *pg; 749 struct ceph_pg_mapping *pg;
728 750
729 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); 751 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
730 ceph_decode_copy(p, &pgid, sizeof(pgid)); 752 ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1));
753 pgid.pool = le32_to_cpu(pgid_v1.pool);
754 pgid.seed = le16_to_cpu(pgid_v1.ps);
731 n = ceph_decode_32(p); 755 n = ceph_decode_32(p);
732 err = -EINVAL; 756 err = -EINVAL;
733 if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) 757 if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
@@ -745,7 +769,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
745 err = __insert_pg_mapping(pg, &map->pg_temp); 769 err = __insert_pg_mapping(pg, &map->pg_temp);
746 if (err) 770 if (err)
747 goto bad; 771 goto bad;
748 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); 772 dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed,
773 len);
749 } 774 }
750 775
751 /* crush */ 776 /* crush */
@@ -784,16 +809,17 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
784 struct ceph_fsid fsid; 809 struct ceph_fsid fsid;
785 u32 epoch = 0; 810 u32 epoch = 0;
786 struct ceph_timespec modified; 811 struct ceph_timespec modified;
787 u32 len, pool; 812 s32 len;
788 __s32 new_pool_max, new_flags, max; 813 u64 pool;
814 __s64 new_pool_max;
815 __s32 new_flags, max;
789 void *start = *p; 816 void *start = *p;
790 int err = -EINVAL; 817 int err = -EINVAL;
791 u16 version; 818 u16 version;
792 819
793 ceph_decode_16_safe(p, end, version, bad); 820 ceph_decode_16_safe(p, end, version, bad);
794 if (version > CEPH_OSDMAP_INC_VERSION) { 821 if (version > 6) {
795 pr_warning("got unknown v %d > %d of inc osdmap\n", version, 822 pr_warning("got unknown v %d > %d of inc osdmap\n", version, 6);
796 CEPH_OSDMAP_INC_VERSION);
797 goto bad; 823 goto bad;
798 } 824 }
799 825
@@ -803,7 +829,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
803 epoch = ceph_decode_32(p); 829 epoch = ceph_decode_32(p);
804 BUG_ON(epoch != map->epoch+1); 830 BUG_ON(epoch != map->epoch+1);
805 ceph_decode_copy(p, &modified, sizeof(modified)); 831 ceph_decode_copy(p, &modified, sizeof(modified));
806 new_pool_max = ceph_decode_32(p); 832 new_pool_max = ceph_decode_64(p);
807 new_flags = ceph_decode_32(p); 833 new_flags = ceph_decode_32(p);
808 834
809 /* full map? */ 835 /* full map? */
@@ -853,18 +879,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
853 /* new_pool */ 879 /* new_pool */
854 ceph_decode_32_safe(p, end, len, bad); 880 ceph_decode_32_safe(p, end, len, bad);
855 while (len--) { 881 while (len--) {
856 __u8 ev;
857 struct ceph_pg_pool_info *pi; 882 struct ceph_pg_pool_info *pi;
858 883
859 ceph_decode_32_safe(p, end, pool, bad); 884 ceph_decode_64_safe(p, end, pool, bad);
860 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
861 ev = ceph_decode_8(p); /* encoding version */
862 if (ev > CEPH_PG_POOL_VERSION) {
863 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
864 ev, CEPH_PG_POOL_VERSION);
865 err = -EINVAL;
866 goto bad;
867 }
868 pi = __lookup_pg_pool(&map->pg_pools, pool); 885 pi = __lookup_pg_pool(&map->pg_pools, pool);
869 if (!pi) { 886 if (!pi) {
870 pi = kzalloc(sizeof(*pi), GFP_NOFS); 887 pi = kzalloc(sizeof(*pi), GFP_NOFS);
@@ -890,7 +907,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
890 while (len--) { 907 while (len--) {
891 struct ceph_pg_pool_info *pi; 908 struct ceph_pg_pool_info *pi;
892 909
893 ceph_decode_32_safe(p, end, pool, bad); 910 ceph_decode_64_safe(p, end, pool, bad);
894 pi = __lookup_pg_pool(&map->pg_pools, pool); 911 pi = __lookup_pg_pool(&map->pg_pools, pool);
895 if (pi) 912 if (pi)
896 __remove_pg_pool(&map->pg_pools, pi); 913 __remove_pg_pool(&map->pg_pools, pi);
@@ -946,10 +963,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
946 while (len--) { 963 while (len--) {
947 struct ceph_pg_mapping *pg; 964 struct ceph_pg_mapping *pg;
948 int j; 965 int j;
966 struct ceph_pg_v1 pgid_v1;
949 struct ceph_pg pgid; 967 struct ceph_pg pgid;
950 u32 pglen; 968 u32 pglen;
951 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); 969 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
952 ceph_decode_copy(p, &pgid, sizeof(pgid)); 970 ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1));
971 pgid.pool = le32_to_cpu(pgid_v1.pool);
972 pgid.seed = le16_to_cpu(pgid_v1.ps);
953 pglen = ceph_decode_32(p); 973 pglen = ceph_decode_32(p);
954 974
955 if (pglen) { 975 if (pglen) {
@@ -975,8 +995,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
975 kfree(pg); 995 kfree(pg);
976 goto bad; 996 goto bad;
977 } 997 }
978 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, 998 dout(" added pg_temp %lld.%x len %d\n", pgid.pool,
979 pglen); 999 pgid.seed, pglen);
980 } else { 1000 } else {
981 /* remove */ 1001 /* remove */
982 __remove_pg_mapping(&map->pg_temp, pgid); 1002 __remove_pg_mapping(&map->pg_temp, pgid);
@@ -1010,7 +1030,7 @@ bad:
1010 * pass a stride back to the caller. 1030 * pass a stride back to the caller.
1011 */ 1031 */
1012int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 1032int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
1013 u64 off, u64 *plen, 1033 u64 off, u64 len,
1014 u64 *ono, 1034 u64 *ono,
1015 u64 *oxoff, u64 *oxlen) 1035 u64 *oxoff, u64 *oxlen)
1016{ 1036{
@@ -1021,7 +1041,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
1021 u32 su_per_object; 1041 u32 su_per_object;
1022 u64 t, su_offset; 1042 u64 t, su_offset;
1023 1043
1024 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, 1044 dout("mapping %llu~%llu osize %u fl_su %u\n", off, len,
1025 osize, su); 1045 osize, su);
1026 if (su == 0 || sc == 0) 1046 if (su == 0 || sc == 0)
1027 goto invalid; 1047 goto invalid;
@@ -1054,11 +1074,10 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
1054 1074
1055 /* 1075 /*
1056 * Calculate the length of the extent being written to the selected 1076 * Calculate the length of the extent being written to the selected
1057 * object. This is the minimum of the full length requested (plen) or 1077 * object. This is the minimum of the full length requested (len) or
1058 * the remainder of the current stripe being written to. 1078 * the remainder of the current stripe being written to.
1059 */ 1079 */
1060 *oxlen = min_t(u64, *plen, su - su_offset); 1080 *oxlen = min_t(u64, len, su - su_offset);
1061 *plen = *oxlen;
1062 1081
1063 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); 1082 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
1064 return 0; 1083 return 0;
@@ -1076,33 +1095,24 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping);
1076 * calculate an object layout (i.e. pgid) from an oid, 1095 * calculate an object layout (i.e. pgid) from an oid,
1077 * file_layout, and osdmap 1096 * file_layout, and osdmap
1078 */ 1097 */
1079int ceph_calc_object_layout(struct ceph_object_layout *ol, 1098int ceph_calc_object_layout(struct ceph_pg *pg,
1080 const char *oid, 1099 const char *oid,
1081 struct ceph_file_layout *fl, 1100 struct ceph_file_layout *fl,
1082 struct ceph_osdmap *osdmap) 1101 struct ceph_osdmap *osdmap)
1083{ 1102{
1084 unsigned int num, num_mask; 1103 unsigned int num, num_mask;
1085 struct ceph_pg pgid;
1086 int poolid = le32_to_cpu(fl->fl_pg_pool);
1087 struct ceph_pg_pool_info *pool; 1104 struct ceph_pg_pool_info *pool;
1088 unsigned int ps;
1089 1105
1090 BUG_ON(!osdmap); 1106 BUG_ON(!osdmap);
1091 1107 pg->pool = le32_to_cpu(fl->fl_pg_pool);
1092 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); 1108 pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool);
1093 if (!pool) 1109 if (!pool)
1094 return -EIO; 1110 return -EIO;
1095 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); 1111 pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid));
1096 num = le32_to_cpu(pool->v.pg_num); 1112 num = pool->pg_num;
1097 num_mask = pool->pg_num_mask; 1113 num_mask = pool->pg_num_mask;
1098 1114
1099 pgid.ps = cpu_to_le16(ps); 1115 dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed);
1100 pgid.preferred = cpu_to_le16(-1);
1101 pgid.pool = fl->fl_pg_pool;
1102 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
1103
1104 ol->ol_pgid = pgid;
1105 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
1106 return 0; 1116 return 0;
1107} 1117}
1108EXPORT_SYMBOL(ceph_calc_object_layout); 1118EXPORT_SYMBOL(ceph_calc_object_layout);
@@ -1117,19 +1127,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1117 struct ceph_pg_mapping *pg; 1127 struct ceph_pg_mapping *pg;
1118 struct ceph_pg_pool_info *pool; 1128 struct ceph_pg_pool_info *pool;
1119 int ruleno; 1129 int ruleno;
1120 unsigned int poolid, ps, pps, t, r; 1130 int r;
1121 1131 u32 pps;
1122 poolid = le32_to_cpu(pgid.pool);
1123 ps = le16_to_cpu(pgid.ps);
1124 1132
1125 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); 1133 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
1126 if (!pool) 1134 if (!pool)
1127 return NULL; 1135 return NULL;
1128 1136
1129 /* pg_temp? */ 1137 /* pg_temp? */
1130 t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), 1138 pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
1131 pool->pgp_num_mask); 1139 pool->pgp_num_mask);
1132 pgid.ps = cpu_to_le16(t);
1133 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1140 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1134 if (pg) { 1141 if (pg) {
1135 *num = pg->len; 1142 *num = pg->len;
@@ -1137,26 +1144,39 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1137 } 1144 }
1138 1145
1139 /* crush */ 1146 /* crush */
1140 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, 1147 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
1141 pool->v.type, pool->v.size); 1148 pool->type, pool->size);
1142 if (ruleno < 0) { 1149 if (ruleno < 0) {
1143 pr_err("no crush rule pool %d ruleset %d type %d size %d\n", 1150 pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
1144 poolid, pool->v.crush_ruleset, pool->v.type, 1151 pgid.pool, pool->crush_ruleset, pool->type,
1145 pool->v.size); 1152 pool->size);
1146 return NULL; 1153 return NULL;
1147 } 1154 }
1148 1155
1149 pps = ceph_stable_mod(ps, 1156 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
 1150 le32_to_cpu(pool->v.pgp_num), 1157 /* hash pool id and seed so that pool PGs do not overlap */
1151 pool->pgp_num_mask); 1158 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
1152 pps += poolid; 1159 ceph_stable_mod(pgid.seed, pool->pgp_num,
1160 pool->pgp_num_mask),
1161 pgid.pool);
1162 } else {
1163 /*
1164 * legacy ehavior: add ps and pool together. this is
1165 * not a great approach because the PGs from each pool
1166 * will overlap on top of each other: 0.5 == 1.4 ==
1167 * 2.3 == ...
1168 */
1169 pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
1170 pool->pgp_num_mask) +
1171 (unsigned)pgid.pool;
1172 }
1153 r = crush_do_rule(osdmap->crush, ruleno, pps, osds, 1173 r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1154 min_t(int, pool->v.size, *num), 1174 min_t(int, pool->size, *num),
1155 osdmap->osd_weight); 1175 osdmap->osd_weight);
1156 if (r < 0) { 1176 if (r < 0) {
1157 pr_err("error %d from crush rule: pool %d ruleset %d type %d" 1177 pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
1158 " size %d\n", r, poolid, pool->v.crush_ruleset, 1178 " size %d\n", r, pgid.pool, pool->crush_ruleset,
1159 pool->v.type, pool->v.size); 1179 pool->type, pool->size);
1160 return NULL; 1180 return NULL;
1161 } 1181 }
1162 *num = r; 1182 *num = r;
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index cd9c21df87d1..815a2249cfa9 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -12,7 +12,7 @@
12/* 12/*
13 * build a vector of user pages 13 * build a vector of user pages
14 */ 14 */
15struct page **ceph_get_direct_page_vector(const char __user *data, 15struct page **ceph_get_direct_page_vector(const void __user *data,
16 int num_pages, bool write_page) 16 int num_pages, bool write_page)
17{ 17{
18 struct page **pages; 18 struct page **pages;
@@ -93,7 +93,7 @@ EXPORT_SYMBOL(ceph_alloc_page_vector);
93 * copy user data into a page vector 93 * copy user data into a page vector
94 */ 94 */
95int ceph_copy_user_to_page_vector(struct page **pages, 95int ceph_copy_user_to_page_vector(struct page **pages,
96 const char __user *data, 96 const void __user *data,
97 loff_t off, size_t len) 97 loff_t off, size_t len)
98{ 98{
99 int i = 0; 99 int i = 0;
@@ -118,17 +118,17 @@ int ceph_copy_user_to_page_vector(struct page **pages,
118} 118}
119EXPORT_SYMBOL(ceph_copy_user_to_page_vector); 119EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
120 120
121int ceph_copy_to_page_vector(struct page **pages, 121void ceph_copy_to_page_vector(struct page **pages,
122 const char *data, 122 const void *data,
123 loff_t off, size_t len) 123 loff_t off, size_t len)
124{ 124{
125 int i = 0; 125 int i = 0;
126 size_t po = off & ~PAGE_CACHE_MASK; 126 size_t po = off & ~PAGE_CACHE_MASK;
127 size_t left = len; 127 size_t left = len;
128 size_t l;
129 128
130 while (left > 0) { 129 while (left > 0) {
131 l = min_t(size_t, PAGE_CACHE_SIZE-po, left); 130 size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
131
132 memcpy(page_address(pages[i]) + po, data, l); 132 memcpy(page_address(pages[i]) + po, data, l);
133 data += l; 133 data += l;
134 left -= l; 134 left -= l;
@@ -138,21 +138,20 @@ int ceph_copy_to_page_vector(struct page **pages,
138 i++; 138 i++;
139 } 139 }
140 } 140 }
141 return len;
142} 141}
143EXPORT_SYMBOL(ceph_copy_to_page_vector); 142EXPORT_SYMBOL(ceph_copy_to_page_vector);
144 143
145int ceph_copy_from_page_vector(struct page **pages, 144void ceph_copy_from_page_vector(struct page **pages,
146 char *data, 145 void *data,
147 loff_t off, size_t len) 146 loff_t off, size_t len)
148{ 147{
149 int i = 0; 148 int i = 0;
150 size_t po = off & ~PAGE_CACHE_MASK; 149 size_t po = off & ~PAGE_CACHE_MASK;
151 size_t left = len; 150 size_t left = len;
152 size_t l;
153 151
154 while (left > 0) { 152 while (left > 0) {
155 l = min_t(size_t, PAGE_CACHE_SIZE-po, left); 153 size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
154
156 memcpy(data, page_address(pages[i]) + po, l); 155 memcpy(data, page_address(pages[i]) + po, l);
157 data += l; 156 data += l;
158 left -= l; 157 left -= l;
@@ -162,7 +161,6 @@ int ceph_copy_from_page_vector(struct page **pages,
162 i++; 161 i++;
163 } 162 }
164 } 163 }
165 return len;
166} 164}
167EXPORT_SYMBOL(ceph_copy_from_page_vector); 165EXPORT_SYMBOL(ceph_copy_from_page_vector);
168 166
@@ -170,7 +168,7 @@ EXPORT_SYMBOL(ceph_copy_from_page_vector);
170 * copy user data from a page vector into a user pointer 168 * copy user data from a page vector into a user pointer
171 */ 169 */
172int ceph_copy_page_vector_to_user(struct page **pages, 170int ceph_copy_page_vector_to_user(struct page **pages,
173 char __user *data, 171 void __user *data,
174 loff_t off, size_t len) 172 loff_t off, size_t len)
175{ 173{
176 int i = 0; 174 int i = 0;