diff options
32 files changed, 2402 insertions, 1528 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 89576a0b3f2e..6c81a4c040b9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c | |||
@@ -52,9 +52,12 @@ | |||
52 | #define SECTOR_SHIFT 9 | 52 | #define SECTOR_SHIFT 9 |
53 | #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) | 53 | #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) |
54 | 54 | ||
55 | /* It might be useful to have this defined elsewhere too */ | 55 | /* It might be useful to have these defined elsewhere */ |
56 | 56 | ||
57 | #define U64_MAX ((u64) (~0ULL)) | 57 | #define U8_MAX ((u8) (~0U)) |
58 | #define U16_MAX ((u16) (~0U)) | ||
59 | #define U32_MAX ((u32) (~0U)) | ||
60 | #define U64_MAX ((u64) (~0ULL)) | ||
58 | 61 | ||
59 | #define RBD_DRV_NAME "rbd" | 62 | #define RBD_DRV_NAME "rbd" |
60 | #define RBD_DRV_NAME_LONG "rbd (rados block device)" | 63 | #define RBD_DRV_NAME_LONG "rbd (rados block device)" |
@@ -66,7 +69,6 @@ | |||
66 | (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) | 69 | (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) |
67 | 70 | ||
68 | #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ | 71 | #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ |
69 | #define RBD_MAX_OPT_LEN 1024 | ||
70 | 72 | ||
71 | #define RBD_SNAP_HEAD_NAME "-" | 73 | #define RBD_SNAP_HEAD_NAME "-" |
72 | 74 | ||
@@ -93,8 +95,6 @@ | |||
93 | #define DEV_NAME_LEN 32 | 95 | #define DEV_NAME_LEN 32 |
94 | #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) | 96 | #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) |
95 | 97 | ||
96 | #define RBD_READ_ONLY_DEFAULT false | ||
97 | |||
98 | /* | 98 | /* |
99 | * block device image metadata (in-memory version) | 99 | * block device image metadata (in-memory version) |
100 | */ | 100 | */ |
@@ -119,16 +119,33 @@ struct rbd_image_header { | |||
119 | * An rbd image specification. | 119 | * An rbd image specification. |
120 | * | 120 | * |
121 | * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely | 121 | * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely |
122 | * identify an image. | 122 | * identify an image. Each rbd_dev structure includes a pointer to |
123 | * an rbd_spec structure that encapsulates this identity. | ||
124 | * | ||
125 | * Each of the id's in an rbd_spec has an associated name. For a | ||
126 | * user-mapped image, the names are supplied and the id's associated | ||
127 | * with them are looked up. For a layered image, a parent image is | ||
128 | * defined by the tuple, and the names are looked up. | ||
129 | * | ||
130 | * An rbd_dev structure contains a parent_spec pointer which is | ||
131 | * non-null if the image it represents is a child in a layered | ||
132 | * image. This pointer will refer to the rbd_spec structure used | ||
133 | * by the parent rbd_dev for its own identity (i.e., the structure | ||
134 | * is shared between the parent and child). | ||
135 | * | ||
136 | * Since these structures are populated once, during the discovery | ||
137 | * phase of image construction, they are effectively immutable so | ||
138 | * we make no effort to synchronize access to them. | ||
139 | * | ||
140 | * Note that code herein does not assume the image name is known (it | ||
141 | * could be a null pointer). | ||
123 | */ | 142 | */ |
124 | struct rbd_spec { | 143 | struct rbd_spec { |
125 | u64 pool_id; | 144 | u64 pool_id; |
126 | char *pool_name; | 145 | char *pool_name; |
127 | 146 | ||
128 | char *image_id; | 147 | char *image_id; |
129 | size_t image_id_len; | ||
130 | char *image_name; | 148 | char *image_name; |
131 | size_t image_name_len; | ||
132 | 149 | ||
133 | u64 snap_id; | 150 | u64 snap_id; |
134 | char *snap_name; | 151 | char *snap_name; |
@@ -136,10 +153,6 @@ struct rbd_spec { | |||
136 | struct kref kref; | 153 | struct kref kref; |
137 | }; | 154 | }; |
138 | 155 | ||
139 | struct rbd_options { | ||
140 | bool read_only; | ||
141 | }; | ||
142 | |||
143 | /* | 156 | /* |
144 | * an instance of the client. multiple devices may share an rbd client. | 157 | * an instance of the client. multiple devices may share an rbd client. |
145 | */ | 158 | */ |
@@ -149,37 +162,76 @@ struct rbd_client { | |||
149 | struct list_head node; | 162 | struct list_head node; |
150 | }; | 163 | }; |
151 | 164 | ||
152 | /* | 165 | struct rbd_img_request; |
153 | * a request completion status | 166 | typedef void (*rbd_img_callback_t)(struct rbd_img_request *); |
154 | */ | 167 | |
155 | struct rbd_req_status { | 168 | #define BAD_WHICH U32_MAX /* "which" value of an object request that is not part of an image request */ |
156 | int done; | 169 | |
157 | int rc; | 170 | struct rbd_obj_request; |
158 | u64 bytes; | 171 | typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); |
172 | |||
173 | enum obj_request_type { | ||
174 | OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES | ||
159 | }; | 175 | }; |
160 | 176 | ||
161 | /* | 177 | struct rbd_obj_request { |
162 | * a collection of requests | 178 | const char *object_name; |
163 | */ | 179 | u64 offset; /* object start byte */ |
164 | struct rbd_req_coll { | 180 | u64 length; /* bytes from offset */ |
165 | int total; | 181 | |
166 | int num_done; | 182 | struct rbd_img_request *img_request; |
183 | struct list_head links; /* img_request->obj_requests */ | ||
184 | u32 which; /* position in image request list */ | ||
185 | |||
186 | enum obj_request_type type; | ||
187 | union { | ||
188 | struct bio *bio_list; | ||
189 | struct { | ||
190 | struct page **pages; | ||
191 | u32 page_count; | ||
192 | }; | ||
193 | }; | ||
194 | |||
195 | struct ceph_osd_request *osd_req; | ||
196 | |||
197 | u64 xferred; /* bytes transferred */ | ||
198 | u64 version; | ||
199 | int result; | ||
200 | atomic_t done; | ||
201 | |||
202 | rbd_obj_callback_t callback; | ||
203 | struct completion completion; | ||
204 | |||
167 | struct kref kref; | 205 | struct kref kref; |
168 | struct rbd_req_status status[0]; | ||
169 | }; | 206 | }; |
170 | 207 | ||
171 | /* | 208 | struct rbd_img_request { |
172 | * a single io request | 209 | struct request *rq; |
173 | */ | 210 | struct rbd_device *rbd_dev; |
174 | struct rbd_request { | 211 | u64 offset; /* starting image byte offset */ |
175 | struct request *rq; /* blk layer request */ | 212 | u64 length; /* byte count from offset */ |
176 | struct bio *bio; /* cloned bio */ | 213 | bool write_request; /* false for read */ |
177 | struct page **pages; /* list of used pages */ | 214 | union { |
178 | u64 len; | 215 | struct ceph_snap_context *snapc; /* for writes */ |
179 | int coll_index; | 216 | u64 snap_id; /* for reads */ |
180 | struct rbd_req_coll *coll; | 217 | }; |
218 | spinlock_t completion_lock;/* protects next_completion */ | ||
219 | u32 next_completion; | ||
220 | rbd_img_callback_t callback; | ||
221 | |||
222 | u32 obj_request_count; | ||
223 | struct list_head obj_requests; /* rbd_obj_request structs */ | ||
224 | |||
225 | struct kref kref; | ||
181 | }; | 226 | }; |
182 | 227 | ||
228 | #define for_each_obj_request(ireq, oreq) \ | ||
229 | list_for_each_entry(oreq, &(ireq)->obj_requests, links) | ||
230 | #define for_each_obj_request_from(ireq, oreq) \ | ||
231 | list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) | ||
232 | #define for_each_obj_request_safe(ireq, oreq, n) \ | ||
233 | list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) | ||
234 | |||
183 | struct rbd_snap { | 235 | struct rbd_snap { |
184 | struct device dev; | 236 | struct device dev; |
185 | const char *name; | 237 | const char *name; |
@@ -209,16 +261,18 @@ struct rbd_device { | |||
209 | 261 | ||
210 | char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ | 262 | char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ |
211 | 263 | ||
212 | spinlock_t lock; /* queue lock */ | 264 | spinlock_t lock; /* queue, flags, open_count */ |
213 | 265 | ||
214 | struct rbd_image_header header; | 266 | struct rbd_image_header header; |
215 | bool exists; | 267 | unsigned long flags; /* possibly lock protected */ |
216 | struct rbd_spec *spec; | 268 | struct rbd_spec *spec; |
217 | 269 | ||
218 | char *header_name; | 270 | char *header_name; |
219 | 271 | ||
272 | struct ceph_file_layout layout; | ||
273 | |||
220 | struct ceph_osd_event *watch_event; | 274 | struct ceph_osd_event *watch_event; |
221 | struct ceph_osd_request *watch_request; | 275 | struct rbd_obj_request *watch_request; |
222 | 276 | ||
223 | struct rbd_spec *parent_spec; | 277 | struct rbd_spec *parent_spec; |
224 | u64 parent_overlap; | 278 | u64 parent_overlap; |
@@ -235,7 +289,19 @@ struct rbd_device { | |||
235 | 289 | ||
236 | /* sysfs related */ | 290 | /* sysfs related */ |
237 | struct device dev; | 291 | struct device dev; |
238 | unsigned long open_count; | 292 | unsigned long open_count; /* protected by lock */ |
293 | }; | ||
294 | |||
295 | /* | ||
296 | * Flag bits for rbd_dev->flags. If atomicity is required, | ||
297 | * rbd_dev->lock is used to protect access. | ||
298 | * | ||
299 | * Currently, only the "removing" flag (which is coupled with the | ||
300 | * "open_count" field) requires atomic access. | ||
301 | */ | ||
302 | enum rbd_dev_flags { | ||
303 | RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ | ||
304 | RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ | ||
239 | }; | 305 | }; |
240 | 306 | ||
241 | static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ | 307 | static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ |
@@ -277,6 +343,33 @@ static struct device rbd_root_dev = { | |||
277 | .release = rbd_root_dev_release, | 343 | .release = rbd_root_dev_release, |
278 | }; | 344 | }; |
279 | 345 | ||
346 | static __printf(2, 3) | ||
347 | void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) | ||
348 | { | ||
349 | struct va_format vaf; | ||
350 | va_list args; | ||
351 | |||
352 | va_start(args, fmt); | ||
353 | vaf.fmt = fmt; | ||
354 | vaf.va = &args; | ||
355 | |||
356 | if (!rbd_dev) | ||
357 | printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); | ||
358 | else if (rbd_dev->disk) | ||
359 | printk(KERN_WARNING "%s: %s: %pV\n", | ||
360 | RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); | ||
361 | else if (rbd_dev->spec && rbd_dev->spec->image_name) | ||
362 | printk(KERN_WARNING "%s: image %s: %pV\n", | ||
363 | RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); | ||
364 | else if (rbd_dev->spec && rbd_dev->spec->image_id) | ||
365 | printk(KERN_WARNING "%s: id %s: %pV\n", | ||
366 | RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); | ||
367 | else /* punt */ | ||
368 | printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", | ||
369 | RBD_DRV_NAME, rbd_dev, &vaf); | ||
370 | va_end(args); | ||
371 | } | ||
372 | |||
280 | #ifdef RBD_DEBUG | 373 | #ifdef RBD_DEBUG |
281 | #define rbd_assert(expr) \ | 374 | #define rbd_assert(expr) \ |
282 | if (unlikely(!(expr))) { \ | 375 | if (unlikely(!(expr))) { \ |
@@ -296,14 +389,23 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); | |||
296 | static int rbd_open(struct block_device *bdev, fmode_t mode) | 389 | static int rbd_open(struct block_device *bdev, fmode_t mode) |
297 | { | 390 | { |
298 | struct rbd_device *rbd_dev = bdev->bd_disk->private_data; | 391 | struct rbd_device *rbd_dev = bdev->bd_disk->private_data; |
392 | bool removing = false; | ||
299 | 393 | ||
300 | if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) | 394 | if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) |
301 | return -EROFS; | 395 | return -EROFS; |
302 | 396 | ||
397 | spin_lock_irq(&rbd_dev->lock); | ||
398 | if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) | ||
399 | removing = true; | ||
400 | else | ||
401 | rbd_dev->open_count++; | ||
402 | spin_unlock_irq(&rbd_dev->lock); | ||
403 | if (removing) | ||
404 | return -ENOENT; | ||
405 | |||
303 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 406 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
304 | (void) get_device(&rbd_dev->dev); | 407 | (void) get_device(&rbd_dev->dev); |
305 | set_device_ro(bdev, rbd_dev->mapping.read_only); | 408 | set_device_ro(bdev, rbd_dev->mapping.read_only); |
306 | rbd_dev->open_count++; | ||
307 | mutex_unlock(&ctl_mutex); | 409 | mutex_unlock(&ctl_mutex); |
308 | 410 | ||
309 | return 0; | 411 | return 0; |
@@ -312,10 +414,14 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) | |||
312 | static int rbd_release(struct gendisk *disk, fmode_t mode) | 414 | static int rbd_release(struct gendisk *disk, fmode_t mode) |
313 | { | 415 | { |
314 | struct rbd_device *rbd_dev = disk->private_data; | 416 | struct rbd_device *rbd_dev = disk->private_data; |
417 | unsigned long open_count_before; | ||
418 | |||
419 | spin_lock_irq(&rbd_dev->lock); | ||
420 | open_count_before = rbd_dev->open_count--; | ||
421 | spin_unlock_irq(&rbd_dev->lock); | ||
422 | rbd_assert(open_count_before > 0); | ||
315 | 423 | ||
316 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | 424 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); |
317 | rbd_assert(rbd_dev->open_count > 0); | ||
318 | rbd_dev->open_count--; | ||
319 | put_device(&rbd_dev->dev); | 425 | put_device(&rbd_dev->dev); |
320 | mutex_unlock(&ctl_mutex); | 426 | mutex_unlock(&ctl_mutex); |
321 | 427 | ||
@@ -337,7 +443,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) | |||
337 | struct rbd_client *rbdc; | 443 | struct rbd_client *rbdc; |
338 | int ret = -ENOMEM; | 444 | int ret = -ENOMEM; |
339 | 445 | ||
340 | dout("rbd_client_create\n"); | 446 | dout("%s:\n", __func__); |
341 | rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); | 447 | rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); |
342 | if (!rbdc) | 448 | if (!rbdc) |
343 | goto out_opt; | 449 | goto out_opt; |
@@ -361,8 +467,8 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) | |||
361 | spin_unlock(&rbd_client_list_lock); | 467 | spin_unlock(&rbd_client_list_lock); |
362 | 468 | ||
363 | mutex_unlock(&ctl_mutex); | 469 | mutex_unlock(&ctl_mutex); |
470 | dout("%s: rbdc %p\n", __func__, rbdc); | ||
364 | 471 | ||
365 | dout("rbd_client_create created %p\n", rbdc); | ||
366 | return rbdc; | 472 | return rbdc; |
367 | 473 | ||
368 | out_err: | 474 | out_err: |
@@ -373,6 +479,8 @@ out_mutex: | |||
373 | out_opt: | 479 | out_opt: |
374 | if (ceph_opts) | 480 | if (ceph_opts) |
375 | ceph_destroy_options(ceph_opts); | 481 | ceph_destroy_options(ceph_opts); |
482 | dout("%s: error %d\n", __func__, ret); | ||
483 | |||
376 | return ERR_PTR(ret); | 484 | return ERR_PTR(ret); |
377 | } | 485 | } |
378 | 486 | ||
@@ -426,6 +534,12 @@ static match_table_t rbd_opts_tokens = { | |||
426 | {-1, NULL} | 534 | {-1, NULL} |
427 | }; | 535 | }; |
428 | 536 | ||
537 | struct rbd_options { | ||
538 | bool read_only; | ||
539 | }; | ||
540 | |||
541 | #define RBD_READ_ONLY_DEFAULT false | ||
542 | |||
429 | static int parse_rbd_opts_token(char *c, void *private) | 543 | static int parse_rbd_opts_token(char *c, void *private) |
430 | { | 544 | { |
431 | struct rbd_options *rbd_opts = private; | 545 | struct rbd_options *rbd_opts = private; |
@@ -493,7 +607,7 @@ static void rbd_client_release(struct kref *kref) | |||
493 | { | 607 | { |
494 | struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); | 608 | struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); |
495 | 609 | ||
496 | dout("rbd_release_client %p\n", rbdc); | 610 | dout("%s: rbdc %p\n", __func__, rbdc); |
497 | spin_lock(&rbd_client_list_lock); | 611 | spin_lock(&rbd_client_list_lock); |
498 | list_del(&rbdc->node); | 612 | list_del(&rbdc->node); |
499 | spin_unlock(&rbd_client_list_lock); | 613 | spin_unlock(&rbd_client_list_lock); |
@@ -512,18 +626,6 @@ static void rbd_put_client(struct rbd_client *rbdc) | |||
512 | kref_put(&rbdc->kref, rbd_client_release); | 626 | kref_put(&rbdc->kref, rbd_client_release); |
513 | } | 627 | } |
514 | 628 | ||
515 | /* | ||
516 | * Destroy requests collection | ||
517 | */ | ||
518 | static void rbd_coll_release(struct kref *kref) | ||
519 | { | ||
520 | struct rbd_req_coll *coll = | ||
521 | container_of(kref, struct rbd_req_coll, kref); | ||
522 | |||
523 | dout("rbd_coll_release %p\n", coll); | ||
524 | kfree(coll); | ||
525 | } | ||
526 | |||
527 | static bool rbd_image_format_valid(u32 image_format) | 629 | static bool rbd_image_format_valid(u32 image_format) |
528 | { | 630 | { |
529 | return image_format == 1 || image_format == 2; | 631 | return image_format == 1 || image_format == 2; |
@@ -707,7 +809,8 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) | |||
707 | goto done; | 809 | goto done; |
708 | rbd_dev->mapping.read_only = true; | 810 | rbd_dev->mapping.read_only = true; |
709 | } | 811 | } |
710 | rbd_dev->exists = true; | 812 | set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); |
813 | |||
711 | done: | 814 | done: |
712 | return ret; | 815 | return ret; |
713 | } | 816 | } |
@@ -724,7 +827,7 @@ static void rbd_header_free(struct rbd_image_header *header) | |||
724 | header->snapc = NULL; | 827 | header->snapc = NULL; |
725 | } | 828 | } |
726 | 829 | ||
727 | static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) | 830 | static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) |
728 | { | 831 | { |
729 | char *name; | 832 | char *name; |
730 | u64 segment; | 833 | u64 segment; |
@@ -767,23 +870,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev, | |||
767 | return length; | 870 | return length; |
768 | } | 871 | } |
769 | 872 | ||
770 | static int rbd_get_num_segments(struct rbd_image_header *header, | ||
771 | u64 ofs, u64 len) | ||
772 | { | ||
773 | u64 start_seg; | ||
774 | u64 end_seg; | ||
775 | |||
776 | if (!len) | ||
777 | return 0; | ||
778 | if (len - 1 > U64_MAX - ofs) | ||
779 | return -ERANGE; | ||
780 | |||
781 | start_seg = ofs >> header->obj_order; | ||
782 | end_seg = (ofs + len - 1) >> header->obj_order; | ||
783 | |||
784 | return end_seg - start_seg + 1; | ||
785 | } | ||
786 | |||
787 | /* | 873 | /* |
788 | * returns the size of an object in the image | 874 | * returns the size of an object in the image |
789 | */ | 875 | */ |
@@ -949,8 +1035,10 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src, | |||
949 | unsigned int bi_size; | 1035 | unsigned int bi_size; |
950 | struct bio *bio; | 1036 | struct bio *bio; |
951 | 1037 | ||
952 | if (!bi) | 1038 | if (!bi) { |
1039 | rbd_warn(NULL, "bio_chain exhausted with %u left", len); | ||
953 | goto out_err; /* EINVAL; ran out of bio's */ | 1040 | goto out_err; /* EINVAL; ran out of bio's */ |
1041 | } | ||
954 | bi_size = min_t(unsigned int, bi->bi_size - off, len); | 1042 | bi_size = min_t(unsigned int, bi->bi_size - off, len); |
955 | bio = bio_clone_range(bi, off, bi_size, gfpmask); | 1043 | bio = bio_clone_range(bi, off, bi_size, gfpmask); |
956 | if (!bio) | 1044 | if (!bio) |
@@ -976,399 +1064,721 @@ out_err: | |||
976 | return NULL; | 1064 | return NULL; |
977 | } | 1065 | } |
978 | 1066 | ||
979 | /* | 1067 | static void rbd_obj_request_get(struct rbd_obj_request *obj_request) |
980 | * helpers for osd request op vectors. | ||
981 | */ | ||
982 | static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, | ||
983 | int opcode, u32 payload_len) | ||
984 | { | 1068 | { |
985 | struct ceph_osd_req_op *ops; | 1069 | dout("%s: obj %p (was %d)\n", __func__, obj_request, |
1070 | atomic_read(&obj_request->kref.refcount)); | ||
1071 | kref_get(&obj_request->kref); | ||
1072 | } | ||
1073 | |||
1074 | static void rbd_obj_request_destroy(struct kref *kref); | ||
1075 | static void rbd_obj_request_put(struct rbd_obj_request *obj_request) | ||
1076 | { | ||
1077 | rbd_assert(obj_request != NULL); | ||
1078 | dout("%s: obj %p (was %d)\n", __func__, obj_request, | ||
1079 | atomic_read(&obj_request->kref.refcount)); | ||
1080 | kref_put(&obj_request->kref, rbd_obj_request_destroy); | ||
1081 | } | ||
1082 | |||
1083 | static void rbd_img_request_get(struct rbd_img_request *img_request) | ||
1084 | { | ||
1085 | dout("%s: img %p (was %d)\n", __func__, img_request, | ||
1086 | atomic_read(&img_request->kref.refcount)); | ||
1087 | kref_get(&img_request->kref); | ||
1088 | } | ||
1089 | |||
1090 | static void rbd_img_request_destroy(struct kref *kref); | ||
1091 | static void rbd_img_request_put(struct rbd_img_request *img_request) | ||
1092 | { | ||
1093 | rbd_assert(img_request != NULL); | ||
1094 | dout("%s: img %p (was %d)\n", __func__, img_request, | ||
1095 | atomic_read(&img_request->kref.refcount)); | ||
1096 | kref_put(&img_request->kref, rbd_img_request_destroy); | ||
1097 | } | ||
1098 | |||
1099 | static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, | ||
1100 | struct rbd_obj_request *obj_request) | ||
1101 | { | ||
1102 | rbd_assert(obj_request->img_request == NULL); | ||
1103 | |||
1104 | rbd_obj_request_get(obj_request); | ||
1105 | obj_request->img_request = img_request; | ||
1106 | obj_request->which = img_request->obj_request_count; | ||
1107 | rbd_assert(obj_request->which != BAD_WHICH); | ||
1108 | img_request->obj_request_count++; | ||
1109 | list_add_tail(&obj_request->links, &img_request->obj_requests); | ||
1110 | dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, | ||
1111 | obj_request->which); | ||
1112 | } | ||
986 | 1113 | ||
987 | ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); | 1114 | static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, |
988 | if (!ops) | 1115 | struct rbd_obj_request *obj_request) |
1116 | { | ||
1117 | rbd_assert(obj_request->which != BAD_WHICH); | ||
1118 | |||
1119 | dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, | ||
1120 | obj_request->which); | ||
1121 | list_del(&obj_request->links); | ||
1122 | rbd_assert(img_request->obj_request_count > 0); | ||
1123 | img_request->obj_request_count--; | ||
1124 | rbd_assert(obj_request->which == img_request->obj_request_count); | ||
1125 | obj_request->which = BAD_WHICH; | ||
1126 | rbd_assert(obj_request->img_request == img_request); | ||
1127 | obj_request->img_request = NULL; | ||
1128 | obj_request->callback = NULL; | ||
1129 | rbd_obj_request_put(obj_request); | ||
1130 | } | ||
1131 | |||
1132 | static bool obj_request_type_valid(enum obj_request_type type) | ||
1133 | { | ||
1134 | switch (type) { | ||
1135 | case OBJ_REQUEST_NODATA: | ||
1136 | case OBJ_REQUEST_BIO: | ||
1137 | case OBJ_REQUEST_PAGES: | ||
1138 | return true; | ||
1139 | default: | ||
1140 | return false; | ||
1141 | } | ||
1142 | } | ||
1143 | |||
1144 | static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) | ||
1145 | { | ||
1146 | struct ceph_osd_req_op *op; | ||
1147 | va_list args; | ||
1148 | size_t size; | ||
1149 | |||
1150 | op = kzalloc(sizeof (*op), GFP_NOIO); | ||
1151 | if (!op) | ||
989 | return NULL; | 1152 | return NULL; |
1153 | op->op = opcode; | ||
1154 | va_start(args, opcode); | ||
1155 | switch (opcode) { | ||
1156 | case CEPH_OSD_OP_READ: | ||
1157 | case CEPH_OSD_OP_WRITE: | ||
1158 | /* rbd_osd_req_op_create(READ, offset, length) */ | ||
1159 | /* rbd_osd_req_op_create(WRITE, offset, length) */ | ||
1160 | op->extent.offset = va_arg(args, u64); | ||
1161 | op->extent.length = va_arg(args, u64); | ||
1162 | if (opcode == CEPH_OSD_OP_WRITE) | ||
1163 | op->payload_len = op->extent.length; | ||
1164 | break; | ||
1165 | case CEPH_OSD_OP_STAT: | ||
1166 | break; | ||
1167 | case CEPH_OSD_OP_CALL: | ||
1168 | /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ | ||
1169 | op->cls.class_name = va_arg(args, char *); | ||
1170 | size = strlen(op->cls.class_name); | ||
1171 | rbd_assert(size <= (size_t) U8_MAX); | ||
1172 | op->cls.class_len = size; | ||
1173 | op->payload_len = size; | ||
1174 | |||
1175 | op->cls.method_name = va_arg(args, char *); | ||
1176 | size = strlen(op->cls.method_name); | ||
1177 | rbd_assert(size <= (size_t) U8_MAX); | ||
1178 | op->cls.method_len = size; | ||
1179 | op->payload_len += size; | ||
1180 | |||
1181 | op->cls.argc = 0; | ||
1182 | op->cls.indata = va_arg(args, void *); | ||
1183 | size = va_arg(args, size_t); | ||
1184 | rbd_assert(size <= (size_t) U32_MAX); | ||
1185 | op->cls.indata_len = (u32) size; | ||
1186 | op->payload_len += size; | ||
1187 | break; | ||
1188 | case CEPH_OSD_OP_NOTIFY_ACK: | ||
1189 | case CEPH_OSD_OP_WATCH: | ||
1190 | /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */ | ||
1191 | /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */ | ||
1192 | op->watch.cookie = va_arg(args, u64); | ||
1193 | op->watch.ver = va_arg(args, u64); | ||
1194 | op->watch.ver = cpu_to_le64(op->watch.ver); | ||
1195 | if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int)) | ||
1196 | op->watch.flag = (u8) 1; | ||
1197 | break; | ||
1198 | default: | ||
1199 | rbd_warn(NULL, "unsupported opcode %hu\n", opcode); | ||
1200 | kfree(op); | ||
1201 | op = NULL; | ||
1202 | break; | ||
1203 | } | ||
1204 | va_end(args); | ||
990 | 1205 | ||
991 | ops[0].op = opcode; | 1206 | return op; |
1207 | } | ||
992 | 1208 | ||
993 | /* | 1209 | static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) |
994 | * op extent offset and length will be set later on | 1210 | { |
995 | * in calc_raw_layout() | 1211 | kfree(op); |
996 | */ | 1212 | } |
997 | ops[0].payload_len = payload_len; | 1213 | |
1214 | static int rbd_obj_request_submit(struct ceph_osd_client *osdc, | ||
1215 | struct rbd_obj_request *obj_request) | ||
1216 | { | ||
1217 | dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); | ||
998 | 1218 | ||
999 | return ops; | 1219 | return ceph_osdc_start_request(osdc, obj_request->osd_req, false); |
1000 | } | 1220 | } |
1001 | 1221 | ||
1002 | static void rbd_destroy_ops(struct ceph_osd_req_op *ops) | 1222 | static void rbd_img_request_complete(struct rbd_img_request *img_request) |
1003 | { | 1223 | { |
1004 | kfree(ops); | 1224 | dout("%s: img %p\n", __func__, img_request); |
1225 | if (img_request->callback) | ||
1226 | img_request->callback(img_request); | ||
1227 | else | ||
1228 | rbd_img_request_put(img_request); | ||
1005 | } | 1229 | } |
1006 | 1230 | ||
1007 | static void rbd_coll_end_req_index(struct request *rq, | 1231 | /* Caller is responsible for dropping its reference with rbd_obj_request_put(obj_request) */ |
1008 | struct rbd_req_coll *coll, | 1232 | |
1009 | int index, | 1233 | static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) |
1010 | int ret, u64 len) | ||
1011 | { | 1234 | { |
1012 | struct request_queue *q; | 1235 | dout("%s: obj %p\n", __func__, obj_request); |
1013 | int min, max, i; | ||
1014 | 1236 | ||
1015 | dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", | 1237 | return wait_for_completion_interruptible(&obj_request->completion); |
1016 | coll, index, ret, (unsigned long long) len); | 1238 | } |
1017 | 1239 | ||
1018 | if (!rq) | 1240 | static void obj_request_done_init(struct rbd_obj_request *obj_request) |
1019 | return; | 1241 | { |
1242 | atomic_set(&obj_request->done, 0); | ||
1243 | smp_wmb(); | ||
1244 | } | ||
1020 | 1245 | ||
1021 | if (!coll) { | 1246 | static void obj_request_done_set(struct rbd_obj_request *obj_request) |
1022 | blk_end_request(rq, ret, len); | 1247 | { |
1023 | return; | 1248 | int done; |
1249 | |||
1250 | done = atomic_inc_return(&obj_request->done); | ||
1251 | if (done > 1) { | ||
1252 | struct rbd_img_request *img_request = obj_request->img_request; | ||
1253 | struct rbd_device *rbd_dev; | ||
1254 | |||
1255 | rbd_dev = img_request ? img_request->rbd_dev : NULL; | ||
1256 | rbd_warn(rbd_dev, "obj_request %p was already done\n", | ||
1257 | obj_request); | ||
1024 | } | 1258 | } |
1259 | } | ||
1025 | 1260 | ||
1026 | q = rq->q; | 1261 | static bool obj_request_done_test(struct rbd_obj_request *obj_request) |
1027 | 1262 | { | |
1028 | spin_lock_irq(q->queue_lock); | 1263 | smp_mb(); |
1029 | coll->status[index].done = 1; | 1264 | return atomic_read(&obj_request->done) != 0; |
1030 | coll->status[index].rc = ret; | 1265 | } |
1031 | coll->status[index].bytes = len; | 1266 | |
1032 | max = min = coll->num_done; | 1267 | static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) |
1033 | while (max < coll->total && coll->status[max].done) | 1268 | { |
1034 | max++; | 1269 | dout("%s: obj %p cb %p\n", __func__, obj_request, |
1035 | 1270 | obj_request->callback); | |
1036 | for (i = min; i<max; i++) { | 1271 | if (obj_request->callback) |
1037 | __blk_end_request(rq, coll->status[i].rc, | 1272 | obj_request->callback(obj_request); |
1038 | coll->status[i].bytes); | 1273 | else |
1039 | coll->num_done++; | 1274 | complete_all(&obj_request->completion); |
1040 | kref_put(&coll->kref, rbd_coll_release); | 1275 | } |
1276 | |||
1277 | static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) | ||
1278 | { | ||
1279 | dout("%s: obj %p\n", __func__, obj_request); | ||
1280 | obj_request_done_set(obj_request); | ||
1281 | } | ||
1282 | |||
1283 | static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) | ||
1284 | { | ||
1285 | dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, | ||
1286 | obj_request->result, obj_request->xferred, obj_request->length); | ||
1287 | /* | ||
1288 | * ENOENT means a hole in the object. We zero-fill the | ||
1289 | * entire length of the request. A short read also implies | ||
1290 | * zero-fill to the end of the request. Either way we | ||
1291 | * update the xferred count to indicate the whole request | ||
1292 | * was satisfied. | ||
1293 | */ | ||
1294 | if (obj_request->result == -ENOENT) { | ||
1295 | zero_bio_chain(obj_request->bio_list, 0); | ||
1296 | obj_request->result = 0; | ||
1297 | obj_request->xferred = obj_request->length; | ||
1298 | } else if (obj_request->xferred < obj_request->length && | ||
1299 | !obj_request->result) { | ||
1300 | zero_bio_chain(obj_request->bio_list, obj_request->xferred); | ||
1301 | obj_request->xferred = obj_request->length; | ||
1041 | } | 1302 | } |
1042 | spin_unlock_irq(q->queue_lock); | 1303 | obj_request_done_set(obj_request); |
1043 | } | 1304 | } |
1044 | 1305 | ||
1045 | static void rbd_coll_end_req(struct rbd_request *req, | 1306 | static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) |
1046 | int ret, u64 len) | ||
1047 | { | 1307 | { |
1048 | rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); | 1308 | dout("%s: obj %p result %d %llu\n", __func__, obj_request, |
1309 | obj_request->result, obj_request->length); | ||
1310 | /* | ||
1311 | * There is no such thing as a successful short write. | ||
1312 | * Our xferred value is the number of bytes transferred | ||
1313 | * back. Set it to our originally-requested length. | ||
1314 | */ | ||
1315 | obj_request->xferred = obj_request->length; | ||
1316 | obj_request_done_set(obj_request); | ||
1049 | } | 1317 | } |
1050 | 1318 | ||
1051 | /* | 1319 | /* |
1052 | * Send ceph osd request | 1320 | * For a simple stat call there's nothing to do. We'll do more if |
1321 | * this is part of a write sequence for a layered image. | ||
1053 | */ | 1322 | */ |
1054 | static int rbd_do_request(struct request *rq, | 1323 | static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) |
1055 | struct rbd_device *rbd_dev, | 1324 | { |
1056 | struct ceph_snap_context *snapc, | 1325 | dout("%s: obj %p\n", __func__, obj_request); |
1057 | u64 snapid, | 1326 | obj_request_done_set(obj_request); |
1058 | const char *object_name, u64 ofs, u64 len, | 1327 | } |
1059 | struct bio *bio, | ||
1060 | struct page **pages, | ||
1061 | int num_pages, | ||
1062 | int flags, | ||
1063 | struct ceph_osd_req_op *ops, | ||
1064 | struct rbd_req_coll *coll, | ||
1065 | int coll_index, | ||
1066 | void (*rbd_cb)(struct ceph_osd_request *req, | ||
1067 | struct ceph_msg *msg), | ||
1068 | struct ceph_osd_request **linger_req, | ||
1069 | u64 *ver) | ||
1070 | { | ||
1071 | struct ceph_osd_request *req; | ||
1072 | struct ceph_file_layout *layout; | ||
1073 | int ret; | ||
1074 | u64 bno; | ||
1075 | struct timespec mtime = CURRENT_TIME; | ||
1076 | struct rbd_request *req_data; | ||
1077 | struct ceph_osd_request_head *reqhead; | ||
1078 | struct ceph_osd_client *osdc; | ||
1079 | 1328 | ||
1080 | req_data = kzalloc(sizeof(*req_data), GFP_NOIO); | 1329 | static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, |
1081 | if (!req_data) { | 1330 | struct ceph_msg *msg) |
1082 | if (coll) | 1331 | { |
1083 | rbd_coll_end_req_index(rq, coll, coll_index, | 1332 | struct rbd_obj_request *obj_request = osd_req->r_priv; |
1084 | -ENOMEM, len); | 1333 | u16 opcode; |
1085 | return -ENOMEM; | 1334 | |
1335 | dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); | ||
1336 | rbd_assert(osd_req == obj_request->osd_req); | ||
1337 | rbd_assert(!!obj_request->img_request ^ | ||
1338 | (obj_request->which == BAD_WHICH)); | ||
1339 | |||
1340 | if (osd_req->r_result < 0) | ||
1341 | obj_request->result = osd_req->r_result; | ||
1342 | obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); | ||
1343 | |||
1344 | WARN_ON(osd_req->r_num_ops != 1); /* For now */ | ||
1345 | |||
1346 | /* | ||
1347 | * We support a 64-bit length, but ultimately it has to be | ||
1348 | * passed to blk_end_request(), which takes an unsigned int. | ||
1349 | */ | ||
1350 | obj_request->xferred = osd_req->r_reply_op_len[0]; | ||
1351 | rbd_assert(obj_request->xferred < (u64) UINT_MAX); | ||
1352 | opcode = osd_req->r_request_ops[0].op; | ||
1353 | switch (opcode) { | ||
1354 | case CEPH_OSD_OP_READ: | ||
1355 | rbd_osd_read_callback(obj_request); | ||
1356 | break; | ||
1357 | case CEPH_OSD_OP_WRITE: | ||
1358 | rbd_osd_write_callback(obj_request); | ||
1359 | break; | ||
1360 | case CEPH_OSD_OP_STAT: | ||
1361 | rbd_osd_stat_callback(obj_request); | ||
1362 | break; | ||
1363 | case CEPH_OSD_OP_CALL: | ||
1364 | case CEPH_OSD_OP_NOTIFY_ACK: | ||
1365 | case CEPH_OSD_OP_WATCH: | ||
1366 | rbd_osd_trivial_callback(obj_request); | ||
1367 | break; | ||
1368 | default: | ||
1369 | rbd_warn(NULL, "%s: unsupported op %hu\n", | ||
1370 | obj_request->object_name, (unsigned short) opcode); | ||
1371 | break; | ||
1086 | } | 1372 | } |
1087 | 1373 | ||
1088 | if (coll) { | 1374 | if (obj_request_done_test(obj_request)) |
1089 | req_data->coll = coll; | 1375 | rbd_obj_request_complete(obj_request); |
1090 | req_data->coll_index = coll_index; | 1376 | } |
1377 | |||
1378 | static struct ceph_osd_request *rbd_osd_req_create( | ||
1379 | struct rbd_device *rbd_dev, | ||
1380 | bool write_request, | ||
1381 | struct rbd_obj_request *obj_request, | ||
1382 | struct ceph_osd_req_op *op) | ||
1383 | { | ||
1384 | struct rbd_img_request *img_request = obj_request->img_request; | ||
1385 | struct ceph_snap_context *snapc = NULL; | ||
1386 | struct ceph_osd_client *osdc; | ||
1387 | struct ceph_osd_request *osd_req; | ||
1388 | struct timespec now; | ||
1389 | struct timespec *mtime; | ||
1390 | u64 snap_id = CEPH_NOSNAP; | ||
1391 | u64 offset = obj_request->offset; | ||
1392 | u64 length = obj_request->length; | ||
1393 | |||
1394 | if (img_request) { | ||
1395 | rbd_assert(img_request->write_request == write_request); | ||
1396 | if (img_request->write_request) | ||
1397 | snapc = img_request->snapc; | ||
1398 | else | ||
1399 | snap_id = img_request->snap_id; | ||
1091 | } | 1400 | } |
1092 | 1401 | ||
1093 | dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", | 1402 | /* Allocate and initialize the request, for the single op */ |
1094 | object_name, (unsigned long long) ofs, | ||
1095 | (unsigned long long) len, coll, coll_index); | ||
1096 | 1403 | ||
1097 | osdc = &rbd_dev->rbd_client->client->osdc; | 1404 | osdc = &rbd_dev->rbd_client->client->osdc; |
1098 | req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, | 1405 | osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); |
1099 | false, GFP_NOIO, pages, bio); | 1406 | if (!osd_req) |
1100 | if (!req) { | 1407 | return NULL; /* ENOMEM */ |
1101 | ret = -ENOMEM; | 1408 | |
1102 | goto done_pages; | 1409 | rbd_assert(obj_request_type_valid(obj_request->type)); |
1410 | switch (obj_request->type) { | ||
1411 | case OBJ_REQUEST_NODATA: | ||
1412 | break; /* Nothing to do */ | ||
1413 | case OBJ_REQUEST_BIO: | ||
1414 | rbd_assert(obj_request->bio_list != NULL); | ||
1415 | osd_req->r_bio = obj_request->bio_list; | ||
1416 | break; | ||
1417 | case OBJ_REQUEST_PAGES: | ||
1418 | osd_req->r_pages = obj_request->pages; | ||
1419 | osd_req->r_num_pages = obj_request->page_count; | ||
1420 | osd_req->r_page_alignment = offset & ~PAGE_MASK; | ||
1421 | break; | ||
1103 | } | 1422 | } |
1104 | 1423 | ||
1105 | req->r_callback = rbd_cb; | 1424 | if (write_request) { |
1425 | osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; | ||
1426 | now = CURRENT_TIME; | ||
1427 | mtime = &now; | ||
1428 | } else { | ||
1429 | osd_req->r_flags = CEPH_OSD_FLAG_READ; | ||
1430 | mtime = NULL; /* not needed for reads */ | ||
1431 | offset = 0; /* These are not used... */ | ||
1432 | length = 0; /* ...for osd read requests */ | ||
1433 | } | ||
1106 | 1434 | ||
1107 | req_data->rq = rq; | 1435 | osd_req->r_callback = rbd_osd_req_callback; |
1108 | req_data->bio = bio; | 1436 | osd_req->r_priv = obj_request; |
1109 | req_data->pages = pages; | ||
1110 | req_data->len = len; | ||
1111 | 1437 | ||
1112 | req->r_priv = req_data; | 1438 | osd_req->r_oid_len = strlen(obj_request->object_name); |
1439 | rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); | ||
1440 | memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); | ||
1113 | 1441 | ||
1114 | reqhead = req->r_request->front.iov_base; | 1442 | osd_req->r_file_layout = rbd_dev->layout; /* struct */ |
1115 | reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); | ||
1116 | 1443 | ||
1117 | strncpy(req->r_oid, object_name, sizeof(req->r_oid)); | 1444 | /* osd_req will get its own reference to snapc (if non-null) */ |
1118 | req->r_oid_len = strlen(req->r_oid); | ||
1119 | 1445 | ||
1120 | layout = &req->r_file_layout; | 1446 | ceph_osdc_build_request(osd_req, offset, length, 1, op, |
1121 | memset(layout, 0, sizeof(*layout)); | 1447 | snapc, snap_id, mtime); |
1122 | layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); | ||
1123 | layout->fl_stripe_count = cpu_to_le32(1); | ||
1124 | layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); | ||
1125 | layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); | ||
1126 | ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, | ||
1127 | req, ops); | ||
1128 | rbd_assert(ret == 0); | ||
1129 | 1448 | ||
1130 | ceph_osdc_build_request(req, ofs, &len, | 1449 | return osd_req; |
1131 | ops, | 1450 | } |
1132 | snapc, | ||
1133 | &mtime, | ||
1134 | req->r_oid, req->r_oid_len); | ||
1135 | 1451 | ||
1136 | if (linger_req) { | 1452 | static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) |
1137 | ceph_osdc_set_request_linger(osdc, req); | 1453 | { |
1138 | *linger_req = req; | 1454 | ceph_osdc_put_request(osd_req); |
1139 | } | 1455 | } |
1140 | 1456 | ||
1141 | ret = ceph_osdc_start_request(osdc, req, false); | 1457 | /* object_name is assumed to be a non-null pointer and NUL-terminated */ |
1142 | if (ret < 0) | 1458 | |
1143 | goto done_err; | 1459 | static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, |
1144 | 1460 | u64 offset, u64 length, | |
1145 | if (!rbd_cb) { | 1461 | enum obj_request_type type) |
1146 | ret = ceph_osdc_wait_request(osdc, req); | 1462 | { |
1147 | if (ver) | 1463 | struct rbd_obj_request *obj_request; |
1148 | *ver = le64_to_cpu(req->r_reassert_version.version); | 1464 | size_t size; |
1149 | dout("reassert_ver=%llu\n", | 1465 | char *name; |
1150 | (unsigned long long) | 1466 | |
1151 | le64_to_cpu(req->r_reassert_version.version)); | 1467 | rbd_assert(obj_request_type_valid(type)); |
1152 | ceph_osdc_put_request(req); | 1468 | |
1469 | size = strlen(object_name) + 1; | ||
1470 | obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); | ||
1471 | if (!obj_request) | ||
1472 | return NULL; | ||
1473 | |||
1474 | name = (char *)(obj_request + 1); | ||
1475 | obj_request->object_name = memcpy(name, object_name, size); | ||
1476 | obj_request->offset = offset; | ||
1477 | obj_request->length = length; | ||
1478 | obj_request->which = BAD_WHICH; | ||
1479 | obj_request->type = type; | ||
1480 | INIT_LIST_HEAD(&obj_request->links); | ||
1481 | obj_request_done_init(obj_request); | ||
1482 | init_completion(&obj_request->completion); | ||
1483 | kref_init(&obj_request->kref); | ||
1484 | |||
1485 | dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, | ||
1486 | offset, length, (int)type, obj_request); | ||
1487 | |||
1488 | return obj_request; | ||
1489 | } | ||
1490 | |||
1491 | static void rbd_obj_request_destroy(struct kref *kref) | ||
1492 | { | ||
1493 | struct rbd_obj_request *obj_request; | ||
1494 | |||
1495 | obj_request = container_of(kref, struct rbd_obj_request, kref); | ||
1496 | |||
1497 | dout("%s: obj %p\n", __func__, obj_request); | ||
1498 | |||
1499 | rbd_assert(obj_request->img_request == NULL); | ||
1500 | rbd_assert(obj_request->which == BAD_WHICH); | ||
1501 | |||
1502 | if (obj_request->osd_req) | ||
1503 | rbd_osd_req_destroy(obj_request->osd_req); | ||
1504 | |||
1505 | rbd_assert(obj_request_type_valid(obj_request->type)); | ||
1506 | switch (obj_request->type) { | ||
1507 | case OBJ_REQUEST_NODATA: | ||
1508 | break; /* Nothing to do */ | ||
1509 | case OBJ_REQUEST_BIO: | ||
1510 | if (obj_request->bio_list) | ||
1511 | bio_chain_put(obj_request->bio_list); | ||
1512 | break; | ||
1513 | case OBJ_REQUEST_PAGES: | ||
1514 | if (obj_request->pages) | ||
1515 | ceph_release_page_vector(obj_request->pages, | ||
1516 | obj_request->page_count); | ||
1517 | break; | ||
1153 | } | 1518 | } |
1154 | return ret; | ||
1155 | 1519 | ||
1156 | done_err: | 1520 | kfree(obj_request); |
1157 | bio_chain_put(req_data->bio); | ||
1158 | ceph_osdc_put_request(req); | ||
1159 | done_pages: | ||
1160 | rbd_coll_end_req(req_data, ret, len); | ||
1161 | kfree(req_data); | ||
1162 | return ret; | ||
1163 | } | 1521 | } |
1164 | 1522 | ||
1165 | /* | 1523 | /* |
1166 | * Ceph osd op callback | 1524 | * Caller is responsible for filling in the list of object requests |
1525 | * that comprises the image request, and the Linux request pointer | ||
1526 | * (if there is one). | ||
1167 | */ | 1527 | */ |
1168 | static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) | 1528 | static struct rbd_img_request *rbd_img_request_create( |
1169 | { | 1529 | struct rbd_device *rbd_dev, |
1170 | struct rbd_request *req_data = req->r_priv; | 1530 | u64 offset, u64 length, |
1171 | struct ceph_osd_reply_head *replyhead; | 1531 | bool write_request) |
1172 | struct ceph_osd_op *op; | 1532 | { |
1173 | __s32 rc; | 1533 | struct rbd_img_request *img_request; |
1174 | u64 bytes; | 1534 | struct ceph_snap_context *snapc = NULL; |
1175 | int read_op; | ||
1176 | |||
1177 | /* parse reply */ | ||
1178 | replyhead = msg->front.iov_base; | ||
1179 | WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); | ||
1180 | op = (void *)(replyhead + 1); | ||
1181 | rc = le32_to_cpu(replyhead->result); | ||
1182 | bytes = le64_to_cpu(op->extent.length); | ||
1183 | read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); | ||
1184 | |||
1185 | dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", | ||
1186 | (unsigned long long) bytes, read_op, (int) rc); | ||
1187 | |||
1188 | if (rc == -ENOENT && read_op) { | ||
1189 | zero_bio_chain(req_data->bio, 0); | ||
1190 | rc = 0; | ||
1191 | } else if (rc == 0 && read_op && bytes < req_data->len) { | ||
1192 | zero_bio_chain(req_data->bio, bytes); | ||
1193 | bytes = req_data->len; | ||
1194 | } | ||
1195 | 1535 | ||
1196 | rbd_coll_end_req(req_data, rc, bytes); | 1536 | img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); |
1537 | if (!img_request) | ||
1538 | return NULL; | ||
1197 | 1539 | ||
1198 | if (req_data->bio) | 1540 | if (write_request) { |
1199 | bio_chain_put(req_data->bio); | 1541 | down_read(&rbd_dev->header_rwsem); |
1542 | snapc = ceph_get_snap_context(rbd_dev->header.snapc); | ||
1543 | up_read(&rbd_dev->header_rwsem); | ||
1544 | if (WARN_ON(!snapc)) { | ||
1545 | kfree(img_request); | ||
1546 | return NULL; /* Shouldn't happen */ | ||
1547 | } | ||
1548 | } | ||
1200 | 1549 | ||
1201 | ceph_osdc_put_request(req); | 1550 | img_request->rq = NULL; |
1202 | kfree(req_data); | 1551 | img_request->rbd_dev = rbd_dev; |
1552 | img_request->offset = offset; | ||
1553 | img_request->length = length; | ||
1554 | img_request->write_request = write_request; | ||
1555 | if (write_request) | ||
1556 | img_request->snapc = snapc; | ||
1557 | else | ||
1558 | img_request->snap_id = rbd_dev->spec->snap_id; | ||
1559 | spin_lock_init(&img_request->completion_lock); | ||
1560 | img_request->next_completion = 0; | ||
1561 | img_request->callback = NULL; | ||
1562 | img_request->obj_request_count = 0; | ||
1563 | INIT_LIST_HEAD(&img_request->obj_requests); | ||
1564 | kref_init(&img_request->kref); | ||
1565 | |||
1566 | rbd_img_request_get(img_request); /* Avoid a warning */ | ||
1567 | rbd_img_request_put(img_request); /* TEMPORARY */ | ||
1568 | |||
1569 | dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, | ||
1570 | write_request ? "write" : "read", offset, length, | ||
1571 | img_request); | ||
1572 | |||
1573 | return img_request; | ||
1203 | } | 1574 | } |
1204 | 1575 | ||
1205 | static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) | 1576 | static void rbd_img_request_destroy(struct kref *kref) |
1206 | { | 1577 | { |
1207 | ceph_osdc_put_request(req); | 1578 | struct rbd_img_request *img_request; |
1579 | struct rbd_obj_request *obj_request; | ||
1580 | struct rbd_obj_request *next_obj_request; | ||
1581 | |||
1582 | img_request = container_of(kref, struct rbd_img_request, kref); | ||
1583 | |||
1584 | dout("%s: img %p\n", __func__, img_request); | ||
1585 | |||
1586 | for_each_obj_request_safe(img_request, obj_request, next_obj_request) | ||
1587 | rbd_img_obj_request_del(img_request, obj_request); | ||
1588 | rbd_assert(img_request->obj_request_count == 0); | ||
1589 | |||
1590 | if (img_request->write_request) | ||
1591 | ceph_put_snap_context(img_request->snapc); | ||
1592 | |||
1593 | kfree(img_request); | ||
1208 | } | 1594 | } |
1209 | 1595 | ||
1210 | /* | 1596 | static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, |
1211 | * Do a synchronous ceph osd operation | 1597 | struct bio *bio_list) |
1212 | */ | ||
1213 | static int rbd_req_sync_op(struct rbd_device *rbd_dev, | ||
1214 | struct ceph_snap_context *snapc, | ||
1215 | u64 snapid, | ||
1216 | int flags, | ||
1217 | struct ceph_osd_req_op *ops, | ||
1218 | const char *object_name, | ||
1219 | u64 ofs, u64 inbound_size, | ||
1220 | char *inbound, | ||
1221 | struct ceph_osd_request **linger_req, | ||
1222 | u64 *ver) | ||
1223 | { | 1598 | { |
1224 | int ret; | 1599 | struct rbd_device *rbd_dev = img_request->rbd_dev; |
1225 | struct page **pages; | 1600 | struct rbd_obj_request *obj_request = NULL; |
1226 | int num_pages; | 1601 | struct rbd_obj_request *next_obj_request; |
1227 | 1602 | unsigned int bio_offset; | |
1228 | rbd_assert(ops != NULL); | 1603 | u64 image_offset; |
1604 | u64 resid; | ||
1605 | u16 opcode; | ||
1606 | |||
1607 | dout("%s: img %p bio %p\n", __func__, img_request, bio_list); | ||
1608 | |||
1609 | opcode = img_request->write_request ? CEPH_OSD_OP_WRITE | ||
1610 | : CEPH_OSD_OP_READ; | ||
1611 | bio_offset = 0; | ||
1612 | image_offset = img_request->offset; | ||
1613 | rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT); | ||
1614 | resid = img_request->length; | ||
1615 | rbd_assert(resid > 0); | ||
1616 | while (resid) { | ||
1617 | const char *object_name; | ||
1618 | unsigned int clone_size; | ||
1619 | struct ceph_osd_req_op *op; | ||
1620 | u64 offset; | ||
1621 | u64 length; | ||
1622 | |||
1623 | object_name = rbd_segment_name(rbd_dev, image_offset); | ||
1624 | if (!object_name) | ||
1625 | goto out_unwind; | ||
1626 | offset = rbd_segment_offset(rbd_dev, image_offset); | ||
1627 | length = rbd_segment_length(rbd_dev, image_offset, resid); | ||
1628 | obj_request = rbd_obj_request_create(object_name, | ||
1629 | offset, length, | ||
1630 | OBJ_REQUEST_BIO); | ||
1631 | kfree(object_name); /* object request has its own copy */ | ||
1632 | if (!obj_request) | ||
1633 | goto out_unwind; | ||
1634 | |||
1635 | rbd_assert(length <= (u64) UINT_MAX); | ||
1636 | clone_size = (unsigned int) length; | ||
1637 | obj_request->bio_list = bio_chain_clone_range(&bio_list, | ||
1638 | &bio_offset, clone_size, | ||
1639 | GFP_ATOMIC); | ||
1640 | if (!obj_request->bio_list) | ||
1641 | goto out_partial; | ||
1229 | 1642 | ||
1230 | num_pages = calc_pages_for(ofs, inbound_size); | 1643 | /* |
1231 | pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); | 1644 | * Build up the op to use in building the osd |
1232 | if (IS_ERR(pages)) | 1645 | * request. Note that the contents of the op are |
1233 | return PTR_ERR(pages); | 1646 | * copied by rbd_osd_req_create(). |
1647 | */ | ||
1648 | op = rbd_osd_req_op_create(opcode, offset, length); | ||
1649 | if (!op) | ||
1650 | goto out_partial; | ||
1651 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, | ||
1652 | img_request->write_request, | ||
1653 | obj_request, op); | ||
1654 | rbd_osd_req_op_destroy(op); | ||
1655 | if (!obj_request->osd_req) | ||
1656 | goto out_partial; | ||
1657 | /* status and version are initially zero-filled */ | ||
1658 | |||
1659 | rbd_img_obj_request_add(img_request, obj_request); | ||
1660 | |||
1661 | image_offset += length; | ||
1662 | resid -= length; | ||
1663 | } | ||
1234 | 1664 | ||
1235 | ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, | 1665 | return 0; |
1236 | object_name, ofs, inbound_size, NULL, | ||
1237 | pages, num_pages, | ||
1238 | flags, | ||
1239 | ops, | ||
1240 | NULL, 0, | ||
1241 | NULL, | ||
1242 | linger_req, ver); | ||
1243 | if (ret < 0) | ||
1244 | goto done; | ||
1245 | 1666 | ||
1246 | if ((flags & CEPH_OSD_FLAG_READ) && inbound) | 1667 | out_partial: |
1247 | ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); | 1668 | rbd_obj_request_put(obj_request); |
1669 | out_unwind: | ||
1670 | for_each_obj_request_safe(img_request, obj_request, next_obj_request) | ||
1671 | rbd_obj_request_put(obj_request); | ||
1248 | 1672 | ||
1249 | done: | 1673 | return -ENOMEM; |
1250 | ceph_release_page_vector(pages, num_pages); | ||
1251 | return ret; | ||
1252 | } | 1674 | } |
1253 | 1675 | ||
1254 | /* | 1676 | static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) |
1255 | * Do an asynchronous ceph osd operation | 1677 | { |
1256 | */ | 1678 | struct rbd_img_request *img_request; |
1257 | static int rbd_do_op(struct request *rq, | 1679 | u32 which = obj_request->which; |
1258 | struct rbd_device *rbd_dev, | 1680 | bool more = true; |
1259 | struct ceph_snap_context *snapc, | 1681 | |
1260 | u64 ofs, u64 len, | 1682 | img_request = obj_request->img_request; |
1261 | struct bio *bio, | 1683 | |
1262 | struct rbd_req_coll *coll, | 1684 | dout("%s: img %p obj %p\n", __func__, img_request, obj_request); |
1263 | int coll_index) | 1685 | rbd_assert(img_request != NULL); |
1264 | { | 1686 | rbd_assert(img_request->rq != NULL); |
1265 | char *seg_name; | 1687 | rbd_assert(img_request->obj_request_count > 0); |
1266 | u64 seg_ofs; | 1688 | rbd_assert(which != BAD_WHICH); |
1267 | u64 seg_len; | 1689 | rbd_assert(which < img_request->obj_request_count); |
1268 | int ret; | 1690 | rbd_assert(which >= img_request->next_completion); |
1269 | struct ceph_osd_req_op *ops; | 1691 | |
1270 | u32 payload_len; | 1692 | spin_lock_irq(&img_request->completion_lock); |
1271 | int opcode; | 1693 | if (which != img_request->next_completion) |
1272 | int flags; | 1694 | goto out; |
1273 | u64 snapid; | 1695 | |
1274 | 1696 | for_each_obj_request_from(img_request, obj_request) { | |
1275 | seg_name = rbd_segment_name(rbd_dev, ofs); | 1697 | unsigned int xferred; |
1276 | if (!seg_name) | 1698 | int result; |
1277 | return -ENOMEM; | 1699 | |
1278 | seg_len = rbd_segment_length(rbd_dev, ofs, len); | 1700 | rbd_assert(more); |
1279 | seg_ofs = rbd_segment_offset(rbd_dev, ofs); | 1701 | rbd_assert(which < img_request->obj_request_count); |
1280 | 1702 | ||
1281 | if (rq_data_dir(rq) == WRITE) { | 1703 | if (!obj_request_done_test(obj_request)) |
1282 | opcode = CEPH_OSD_OP_WRITE; | 1704 | break; |
1283 | flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; | 1705 | |
1284 | snapid = CEPH_NOSNAP; | 1706 | rbd_assert(obj_request->xferred <= (u64) UINT_MAX); |
1285 | payload_len = seg_len; | 1707 | xferred = (unsigned int) obj_request->xferred; |
1286 | } else { | 1708 | result = (int) obj_request->result; |
1287 | opcode = CEPH_OSD_OP_READ; | 1709 | if (result) |
1288 | flags = CEPH_OSD_FLAG_READ; | 1710 | rbd_warn(NULL, "obj_request %s result %d xferred %u\n", |
1289 | snapc = NULL; | 1711 | img_request->write_request ? "write" : "read", |
1290 | snapid = rbd_dev->spec->snap_id; | 1712 | result, xferred); |
1291 | payload_len = 0; | 1713 | |
1714 | more = blk_end_request(img_request->rq, result, xferred); | ||
1715 | which++; | ||
1292 | } | 1716 | } |
1293 | 1717 | ||
1294 | ret = -ENOMEM; | 1718 | rbd_assert(more ^ (which == img_request->obj_request_count)); |
1295 | ops = rbd_create_rw_ops(1, opcode, payload_len); | 1719 | img_request->next_completion = which; |
1296 | if (!ops) | 1720 | out: |
1297 | goto done; | 1721 | spin_unlock_irq(&img_request->completion_lock); |
1298 | 1722 | ||
1299 | /* we've taken care of segment sizes earlier when we | 1723 | if (!more) |
1300 | cloned the bios. We should never have a segment | 1724 | rbd_img_request_complete(img_request); |
1301 | truncated at this point */ | ||
1302 | rbd_assert(seg_len == len); | ||
1303 | |||
1304 | ret = rbd_do_request(rq, rbd_dev, snapc, snapid, | ||
1305 | seg_name, seg_ofs, seg_len, | ||
1306 | bio, | ||
1307 | NULL, 0, | ||
1308 | flags, | ||
1309 | ops, | ||
1310 | coll, coll_index, | ||
1311 | rbd_req_cb, 0, NULL); | ||
1312 | |||
1313 | rbd_destroy_ops(ops); | ||
1314 | done: | ||
1315 | kfree(seg_name); | ||
1316 | return ret; | ||
1317 | } | 1725 | } |
1318 | 1726 | ||
1319 | /* | 1727 | static int rbd_img_request_submit(struct rbd_img_request *img_request) |
1320 | * Request sync osd read | 1728 | { |
1321 | */ | 1729 | struct rbd_device *rbd_dev = img_request->rbd_dev; |
1322 | static int rbd_req_sync_read(struct rbd_device *rbd_dev, | 1730 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
1323 | u64 snapid, | 1731 | struct rbd_obj_request *obj_request; |
1324 | const char *object_name, | ||
1325 | u64 ofs, u64 len, | ||
1326 | char *buf, | ||
1327 | u64 *ver) | ||
1328 | { | ||
1329 | struct ceph_osd_req_op *ops; | ||
1330 | int ret; | ||
1331 | 1732 | ||
1332 | ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); | 1733 | dout("%s: img %p\n", __func__, img_request); |
1333 | if (!ops) | 1734 | for_each_obj_request(img_request, obj_request) { |
1334 | return -ENOMEM; | 1735 | int ret; |
1335 | 1736 | ||
1336 | ret = rbd_req_sync_op(rbd_dev, NULL, | 1737 | obj_request->callback = rbd_img_obj_callback; |
1337 | snapid, | 1738 | ret = rbd_obj_request_submit(osdc, obj_request); |
1338 | CEPH_OSD_FLAG_READ, | 1739 | if (ret) |
1339 | ops, object_name, ofs, len, buf, NULL, ver); | 1740 | return ret; |
1340 | rbd_destroy_ops(ops); | 1741 | /* |
1742 | * The image request has its own reference to each | ||
1743 | * of its object requests, so we can safely drop the | ||
1744 | * initial one here. | ||
1745 | */ | ||
1746 | rbd_obj_request_put(obj_request); | ||
1747 | } | ||
1341 | 1748 | ||
1342 | return ret; | 1749 | return 0; |
1343 | } | 1750 | } |
1344 | 1751 | ||
1345 | /* | 1752 | static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, |
1346 | * Request sync osd watch | 1753 | u64 ver, u64 notify_id) |
1347 | */ | ||
1348 | static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, | ||
1349 | u64 ver, | ||
1350 | u64 notify_id) | ||
1351 | { | 1754 | { |
1352 | struct ceph_osd_req_op *ops; | 1755 | struct rbd_obj_request *obj_request; |
1756 | struct ceph_osd_req_op *op; | ||
1757 | struct ceph_osd_client *osdc; | ||
1353 | int ret; | 1758 | int ret; |
1354 | 1759 | ||
1355 | ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); | 1760 | obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, |
1356 | if (!ops) | 1761 | OBJ_REQUEST_NODATA); |
1762 | if (!obj_request) | ||
1357 | return -ENOMEM; | 1763 | return -ENOMEM; |
1358 | 1764 | ||
1359 | ops[0].watch.ver = cpu_to_le64(ver); | 1765 | ret = -ENOMEM; |
1360 | ops[0].watch.cookie = notify_id; | 1766 | op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); |
1361 | ops[0].watch.flag = 0; | 1767 | if (!op) |
1768 | goto out; | ||
1769 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, | ||
1770 | obj_request, op); | ||
1771 | rbd_osd_req_op_destroy(op); | ||
1772 | if (!obj_request->osd_req) | ||
1773 | goto out; | ||
1362 | 1774 | ||
1363 | ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, | 1775 | osdc = &rbd_dev->rbd_client->client->osdc; |
1364 | rbd_dev->header_name, 0, 0, NULL, | 1776 | obj_request->callback = rbd_obj_request_put; |
1365 | NULL, 0, | 1777 | ret = rbd_obj_request_submit(osdc, obj_request); |
1366 | CEPH_OSD_FLAG_READ, | 1778 | out: |
1367 | ops, | 1779 | if (ret) |
1368 | NULL, 0, | 1780 | rbd_obj_request_put(obj_request); |
1369 | rbd_simple_req_cb, 0, NULL); | ||
1370 | 1781 | ||
1371 | rbd_destroy_ops(ops); | ||
1372 | return ret; | 1782 | return ret; |
1373 | } | 1783 | } |
1374 | 1784 | ||
@@ -1381,95 +1791,103 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) | |||
1381 | if (!rbd_dev) | 1791 | if (!rbd_dev) |
1382 | return; | 1792 | return; |
1383 | 1793 | ||
1384 | dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", | 1794 | dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, |
1385 | rbd_dev->header_name, (unsigned long long) notify_id, | 1795 | rbd_dev->header_name, (unsigned long long) notify_id, |
1386 | (unsigned int) opcode); | 1796 | (unsigned int) opcode); |
1387 | rc = rbd_dev_refresh(rbd_dev, &hver); | 1797 | rc = rbd_dev_refresh(rbd_dev, &hver); |
1388 | if (rc) | 1798 | if (rc) |
1389 | pr_warning(RBD_DRV_NAME "%d got notification but failed to " | 1799 | rbd_warn(rbd_dev, "got notification but failed to " |
1390 | " update snaps: %d\n", rbd_dev->major, rc); | 1800 | " update snaps: %d\n", rc); |
1391 | 1801 | ||
1392 | rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); | 1802 | rbd_obj_notify_ack(rbd_dev, hver, notify_id); |
1393 | } | 1803 | } |
1394 | 1804 | ||
1395 | /* | 1805 | /* |
1396 | * Request sync osd watch | 1806 | * Request sync osd watch/unwatch. The value of "start" determines |
1807 | * whether a watch request is being initiated or torn down. | ||
1397 | */ | 1808 | */ |
1398 | static int rbd_req_sync_watch(struct rbd_device *rbd_dev) | 1809 | static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) |
1399 | { | 1810 | { |
1400 | struct ceph_osd_req_op *ops; | ||
1401 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; | 1811 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
1812 | struct rbd_obj_request *obj_request; | ||
1813 | struct ceph_osd_req_op *op; | ||
1402 | int ret; | 1814 | int ret; |
1403 | 1815 | ||
1404 | ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); | 1816 | rbd_assert(start ^ !!rbd_dev->watch_event); |
1405 | if (!ops) | 1817 | rbd_assert(start ^ !!rbd_dev->watch_request); |
1406 | return -ENOMEM; | ||
1407 | 1818 | ||
1408 | ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, | 1819 | if (start) { |
1409 | (void *)rbd_dev, &rbd_dev->watch_event); | 1820 | ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, |
1410 | if (ret < 0) | 1821 | &rbd_dev->watch_event); |
1411 | goto fail; | 1822 | if (ret < 0) |
1823 | return ret; | ||
1824 | rbd_assert(rbd_dev->watch_event != NULL); | ||
1825 | } | ||
1412 | 1826 | ||
1413 | ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); | 1827 | ret = -ENOMEM; |
1414 | ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); | 1828 | obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, |
1415 | ops[0].watch.flag = 1; | 1829 | OBJ_REQUEST_NODATA); |
1830 | if (!obj_request) | ||
1831 | goto out_cancel; | ||
1832 | |||
1833 | op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, | ||
1834 | rbd_dev->watch_event->cookie, | ||
1835 | rbd_dev->header.obj_version, start); | ||
1836 | if (!op) | ||
1837 | goto out_cancel; | ||
1838 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, | ||
1839 | obj_request, op); | ||
1840 | rbd_osd_req_op_destroy(op); | ||
1841 | if (!obj_request->osd_req) | ||
1842 | goto out_cancel; | ||
1843 | |||
1844 | if (start) | ||
1845 | ceph_osdc_set_request_linger(osdc, obj_request->osd_req); | ||
1846 | else | ||
1847 | ceph_osdc_unregister_linger_request(osdc, | ||
1848 | rbd_dev->watch_request->osd_req); | ||
1849 | ret = rbd_obj_request_submit(osdc, obj_request); | ||
1850 | if (ret) | ||
1851 | goto out_cancel; | ||
1852 | ret = rbd_obj_request_wait(obj_request); | ||
1853 | if (ret) | ||
1854 | goto out_cancel; | ||
1855 | ret = obj_request->result; | ||
1856 | if (ret) | ||
1857 | goto out_cancel; | ||
1416 | 1858 | ||
1417 | ret = rbd_req_sync_op(rbd_dev, NULL, | 1859 | /* |
1418 | CEPH_NOSNAP, | 1860 | * A watch request is set to linger, so the underlying osd |
1419 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | 1861 | * request won't go away until we unregister it. We retain |
1420 | ops, | 1862 | * a pointer to the object request during that time (in |
1421 | rbd_dev->header_name, | 1863 | * rbd_dev->watch_request), so we'll keep a reference to |
1422 | 0, 0, NULL, | 1864 | * it. We'll drop that reference (below) after we've |
1423 | &rbd_dev->watch_request, NULL); | 1865 | * unregistered it. |
1866 | */ | ||
1867 | if (start) { | ||
1868 | rbd_dev->watch_request = obj_request; | ||
1424 | 1869 | ||
1425 | if (ret < 0) | 1870 | return 0; |
1426 | goto fail_event; | 1871 | } |
1427 | 1872 | ||
1428 | rbd_destroy_ops(ops); | 1873 | /* We have successfully torn down the watch request */ |
1429 | return 0; | ||
1430 | 1874 | ||
1431 | fail_event: | 1875 | rbd_obj_request_put(rbd_dev->watch_request); |
1876 | rbd_dev->watch_request = NULL; | ||
1877 | out_cancel: | ||
1878 | /* Cancel the event if we're tearing down, or on error */ | ||
1432 | ceph_osdc_cancel_event(rbd_dev->watch_event); | 1879 | ceph_osdc_cancel_event(rbd_dev->watch_event); |
1433 | rbd_dev->watch_event = NULL; | 1880 | rbd_dev->watch_event = NULL; |
1434 | fail: | 1881 | if (obj_request) |
1435 | rbd_destroy_ops(ops); | 1882 | rbd_obj_request_put(obj_request); |
1436 | return ret; | ||
1437 | } | ||
1438 | 1883 | ||
1439 | /* | ||
1440 | * Request sync osd unwatch | ||
1441 | */ | ||
1442 | static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) | ||
1443 | { | ||
1444 | struct ceph_osd_req_op *ops; | ||
1445 | int ret; | ||
1446 | |||
1447 | ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); | ||
1448 | if (!ops) | ||
1449 | return -ENOMEM; | ||
1450 | |||
1451 | ops[0].watch.ver = 0; | ||
1452 | ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); | ||
1453 | ops[0].watch.flag = 0; | ||
1454 | |||
1455 | ret = rbd_req_sync_op(rbd_dev, NULL, | ||
1456 | CEPH_NOSNAP, | ||
1457 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | ||
1458 | ops, | ||
1459 | rbd_dev->header_name, | ||
1460 | 0, 0, NULL, NULL, NULL); | ||
1461 | |||
1462 | |||
1463 | rbd_destroy_ops(ops); | ||
1464 | ceph_osdc_cancel_event(rbd_dev->watch_event); | ||
1465 | rbd_dev->watch_event = NULL; | ||
1466 | return ret; | 1884 | return ret; |
1467 | } | 1885 | } |
1468 | 1886 | ||
1469 | /* | 1887 | /* |
1470 | * Synchronous osd object method call | 1888 | * Synchronous osd object method call |
1471 | */ | 1889 | */ |
1472 | static int rbd_req_sync_exec(struct rbd_device *rbd_dev, | 1890 | static int rbd_obj_method_sync(struct rbd_device *rbd_dev, |
1473 | const char *object_name, | 1891 | const char *object_name, |
1474 | const char *class_name, | 1892 | const char *class_name, |
1475 | const char *method_name, | 1893 | const char *method_name, |
@@ -1477,169 +1895,154 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, | |||
1477 | size_t outbound_size, | 1895 | size_t outbound_size, |
1478 | char *inbound, | 1896 | char *inbound, |
1479 | size_t inbound_size, | 1897 | size_t inbound_size, |
1480 | int flags, | 1898 | u64 *version) |
1481 | u64 *ver) | ||
1482 | { | 1899 | { |
1483 | struct ceph_osd_req_op *ops; | 1900 | struct rbd_obj_request *obj_request; |
1484 | int class_name_len = strlen(class_name); | 1901 | struct ceph_osd_client *osdc; |
1485 | int method_name_len = strlen(method_name); | 1902 | struct ceph_osd_req_op *op; |
1486 | int payload_size; | 1903 | struct page **pages; |
1904 | u32 page_count; | ||
1487 | int ret; | 1905 | int ret; |
1488 | 1906 | ||
1489 | /* | 1907 | /* |
1490 | * Any input parameters required by the method we're calling | 1908 | * Method calls are ultimately read operations but they |
1491 | * will be sent along with the class and method names as | 1909 | * don't involve object data (so no offset or length). |
1492 | * part of the message payload. That data and its size are | 1910 | * The result should be placed into the inbound buffer |
1493 | * supplied via the indata and indata_len fields (named from | 1911 | * provided. They also supply outbound data--parameters for |
1494 | * the perspective of the server side) in the OSD request | 1912 | * the object method. Currently if this is present it will |
1495 | * operation. | 1913 | * be a snapshot id. |
1496 | */ | 1914 | */ |
1497 | payload_size = class_name_len + method_name_len + outbound_size; | 1915 | page_count = (u32) calc_pages_for(0, inbound_size); |
1498 | ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); | 1916 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); |
1499 | if (!ops) | 1917 | if (IS_ERR(pages)) |
1500 | return -ENOMEM; | 1918 | return PTR_ERR(pages); |
1501 | 1919 | ||
1502 | ops[0].cls.class_name = class_name; | 1920 | ret = -ENOMEM; |
1503 | ops[0].cls.class_len = (__u8) class_name_len; | 1921 | obj_request = rbd_obj_request_create(object_name, 0, 0, |
1504 | ops[0].cls.method_name = method_name; | 1922 | OBJ_REQUEST_PAGES); |
1505 | ops[0].cls.method_len = (__u8) method_name_len; | 1923 | if (!obj_request) |
1506 | ops[0].cls.argc = 0; | 1924 | goto out; |
1507 | ops[0].cls.indata = outbound; | ||
1508 | ops[0].cls.indata_len = outbound_size; | ||
1509 | 1925 | ||
1510 | ret = rbd_req_sync_op(rbd_dev, NULL, | 1926 | obj_request->pages = pages; |
1511 | CEPH_NOSNAP, | 1927 | obj_request->page_count = page_count; |
1512 | flags, ops, | ||
1513 | object_name, 0, inbound_size, inbound, | ||
1514 | NULL, ver); | ||
1515 | 1928 | ||
1516 | rbd_destroy_ops(ops); | 1929 | op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, |
1930 | method_name, outbound, outbound_size); | ||
1931 | if (!op) | ||
1932 | goto out; | ||
1933 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, | ||
1934 | obj_request, op); | ||
1935 | rbd_osd_req_op_destroy(op); | ||
1936 | if (!obj_request->osd_req) | ||
1937 | goto out; | ||
1517 | 1938 | ||
1518 | dout("cls_exec returned %d\n", ret); | 1939 | osdc = &rbd_dev->rbd_client->client->osdc; |
1519 | return ret; | 1940 | ret = rbd_obj_request_submit(osdc, obj_request); |
1520 | } | 1941 | if (ret) |
1942 | goto out; | ||
1943 | ret = rbd_obj_request_wait(obj_request); | ||
1944 | if (ret) | ||
1945 | goto out; | ||
1521 | 1946 | ||
1522 | static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) | 1947 | ret = obj_request->result; |
1523 | { | 1948 | if (ret < 0) |
1524 | struct rbd_req_coll *coll = | 1949 | goto out; |
1525 | kzalloc(sizeof(struct rbd_req_coll) + | 1950 | ret = 0; |
1526 | sizeof(struct rbd_req_status) * num_reqs, | 1951 | ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); |
1527 | GFP_ATOMIC); | 1952 | if (version) |
1953 | *version = obj_request->version; | ||
1954 | out: | ||
1955 | if (obj_request) | ||
1956 | rbd_obj_request_put(obj_request); | ||
1957 | else | ||
1958 | ceph_release_page_vector(pages, page_count); | ||
1528 | 1959 | ||
1529 | if (!coll) | 1960 | return ret; |
1530 | return NULL; | ||
1531 | coll->total = num_reqs; | ||
1532 | kref_init(&coll->kref); | ||
1533 | return coll; | ||
1534 | } | 1961 | } |
1535 | 1962 | ||
1536 | /* | 1963 | static void rbd_request_fn(struct request_queue *q) |
1537 | * block device queue callback | 1964 | __releases(q->queue_lock) __acquires(q->queue_lock) |
1538 | */ | ||
1539 | static void rbd_rq_fn(struct request_queue *q) | ||
1540 | { | 1965 | { |
1541 | struct rbd_device *rbd_dev = q->queuedata; | 1966 | struct rbd_device *rbd_dev = q->queuedata; |
1967 | bool read_only = rbd_dev->mapping.read_only; | ||
1542 | struct request *rq; | 1968 | struct request *rq; |
1969 | int result; | ||
1543 | 1970 | ||
1544 | while ((rq = blk_fetch_request(q))) { | 1971 | while ((rq = blk_fetch_request(q))) { |
1545 | struct bio *bio; | 1972 | bool write_request = rq_data_dir(rq) == WRITE; |
1546 | bool do_write; | 1973 | struct rbd_img_request *img_request; |
1547 | unsigned int size; | 1974 | u64 offset; |
1548 | u64 ofs; | 1975 | u64 length; |
1549 | int num_segs, cur_seg = 0; | 1976 | |
1550 | struct rbd_req_coll *coll; | 1977 | /* Ignore any non-FS requests that filter through. */ |
1551 | struct ceph_snap_context *snapc; | ||
1552 | unsigned int bio_offset; | ||
1553 | |||
1554 | dout("fetched request\n"); | ||
1555 | |||
1556 | /* filter out block requests we don't understand */ | ||
1557 | if ((rq->cmd_type != REQ_TYPE_FS)) { | ||
1558 | __blk_end_request_all(rq, 0); | ||
1559 | continue; | ||
1560 | } | ||
1561 | 1978 | ||
1562 | /* deduce our operation (read, write) */ | 1979 | if (rq->cmd_type != REQ_TYPE_FS) { |
1563 | do_write = (rq_data_dir(rq) == WRITE); | 1980 | dout("%s: non-fs request type %d\n", __func__, |
1564 | if (do_write && rbd_dev->mapping.read_only) { | 1981 | (int) rq->cmd_type); |
1565 | __blk_end_request_all(rq, -EROFS); | 1982 | __blk_end_request_all(rq, 0); |
1566 | continue; | 1983 | continue; |
1567 | } | 1984 | } |
1568 | 1985 | ||
1569 | spin_unlock_irq(q->queue_lock); | 1986 | /* Ignore/skip any zero-length requests */ |
1570 | 1987 | ||
1571 | down_read(&rbd_dev->header_rwsem); | 1988 | offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; |
1989 | length = (u64) blk_rq_bytes(rq); | ||
1572 | 1990 | ||
1573 | if (!rbd_dev->exists) { | 1991 | if (!length) { |
1574 | rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); | 1992 | dout("%s: zero-length request\n", __func__); |
1575 | up_read(&rbd_dev->header_rwsem); | 1993 | __blk_end_request_all(rq, 0); |
1576 | dout("request for non-existent snapshot"); | ||
1577 | spin_lock_irq(q->queue_lock); | ||
1578 | __blk_end_request_all(rq, -ENXIO); | ||
1579 | continue; | 1994 | continue; |
1580 | } | 1995 | } |
1581 | 1996 | ||
1582 | snapc = ceph_get_snap_context(rbd_dev->header.snapc); | 1997 | spin_unlock_irq(q->queue_lock); |
1583 | |||
1584 | up_read(&rbd_dev->header_rwsem); | ||
1585 | |||
1586 | size = blk_rq_bytes(rq); | ||
1587 | ofs = blk_rq_pos(rq) * SECTOR_SIZE; | ||
1588 | bio = rq->bio; | ||
1589 | 1998 | ||
1590 | dout("%s 0x%x bytes at 0x%llx\n", | 1999 | /* Disallow writes to a read-only device */ |
1591 | do_write ? "write" : "read", | ||
1592 | size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); | ||
1593 | 2000 | ||
1594 | num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); | 2001 | if (write_request) { |
1595 | if (num_segs <= 0) { | 2002 | result = -EROFS; |
1596 | spin_lock_irq(q->queue_lock); | 2003 | if (read_only) |
1597 | __blk_end_request_all(rq, num_segs); | 2004 | goto end_request; |
1598 | ceph_put_snap_context(snapc); | 2005 | rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); |
1599 | continue; | ||
1600 | } | 2006 | } |
1601 | coll = rbd_alloc_coll(num_segs); | ||
1602 | if (!coll) { | ||
1603 | spin_lock_irq(q->queue_lock); | ||
1604 | __blk_end_request_all(rq, -ENOMEM); | ||
1605 | ceph_put_snap_context(snapc); | ||
1606 | continue; | ||
1607 | } | ||
1608 | |||
1609 | bio_offset = 0; | ||
1610 | do { | ||
1611 | u64 limit = rbd_segment_length(rbd_dev, ofs, size); | ||
1612 | unsigned int chain_size; | ||
1613 | struct bio *bio_chain; | ||
1614 | |||
1615 | BUG_ON(limit > (u64) UINT_MAX); | ||
1616 | chain_size = (unsigned int) limit; | ||
1617 | dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); | ||
1618 | 2007 | ||
1619 | kref_get(&coll->kref); | 2008 | /* |
2009 | * Quit early if the mapped snapshot no longer | ||
2010 | * exists. It's still possible the snapshot will | ||
2011 | * have disappeared by the time our request arrives | ||
2012 | * at the osd, but there's no sense in sending it if | ||
2013 | * we already know. | ||
2014 | */ | ||
2015 | if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { | ||
2016 | dout("request for non-existent snapshot"); | ||
2017 | rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); | ||
2018 | result = -ENXIO; | ||
2019 | goto end_request; | ||
2020 | } | ||
1620 | 2021 | ||
1621 | /* Pass a cloned bio chain via an osd request */ | 2022 | result = -EINVAL; |
2023 | if (WARN_ON(offset && length > U64_MAX - offset + 1)) | ||
2024 | goto end_request; /* Shouldn't happen */ | ||
1622 | 2025 | ||
1623 | bio_chain = bio_chain_clone_range(&bio, | 2026 | result = -ENOMEM; |
1624 | &bio_offset, chain_size, | 2027 | img_request = rbd_img_request_create(rbd_dev, offset, length, |
1625 | GFP_ATOMIC); | 2028 | write_request); |
1626 | if (bio_chain) | 2029 | if (!img_request) |
1627 | (void) rbd_do_op(rq, rbd_dev, snapc, | 2030 | goto end_request; |
1628 | ofs, chain_size, | ||
1629 | bio_chain, coll, cur_seg); | ||
1630 | else | ||
1631 | rbd_coll_end_req_index(rq, coll, cur_seg, | ||
1632 | -ENOMEM, chain_size); | ||
1633 | size -= chain_size; | ||
1634 | ofs += chain_size; | ||
1635 | 2031 | ||
1636 | cur_seg++; | 2032 | img_request->rq = rq; |
1637 | } while (size > 0); | ||
1638 | kref_put(&coll->kref, rbd_coll_release); | ||
1639 | 2033 | ||
2034 | result = rbd_img_request_fill_bio(img_request, rq->bio); | ||
2035 | if (!result) | ||
2036 | result = rbd_img_request_submit(img_request); | ||
2037 | if (result) | ||
2038 | rbd_img_request_put(img_request); | ||
2039 | end_request: | ||
1640 | spin_lock_irq(q->queue_lock); | 2040 | spin_lock_irq(q->queue_lock); |
1641 | 2041 | if (result < 0) { | |
1642 | ceph_put_snap_context(snapc); | 2042 | rbd_warn(rbd_dev, "obj_request %s result %d\n", |
2043 | write_request ? "write" : "read", result); | ||
2044 | __blk_end_request_all(rq, result); | ||
2045 | } | ||
1643 | } | 2046 | } |
1644 | } | 2047 | } |
1645 | 2048 | ||
@@ -1703,6 +2106,71 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) | |||
1703 | put_disk(disk); | 2106 | put_disk(disk); |
1704 | } | 2107 | } |
1705 | 2108 | ||
2109 | /* Read @length bytes at @offset of @object_name into @buf, synchronously. */ | ||
2110 | /* On success returns the bytes copied and sets *@version when non-NULL. */ | ||
2111 | static int rbd_obj_read_sync(struct rbd_device *rbd_dev, | ||
2112 | const char *object_name, u64 offset, u64 length, | ||
2113 | char *buf, u64 *version) | ||
2114 | { | ||
2115 | struct ceph_osd_req_op *op; | ||
2116 | struct rbd_obj_request *obj_request; | ||
2117 | struct ceph_osd_client *osdc; | ||
2118 | struct page **pages = NULL; | ||
2119 | u32 page_count; | ||
2120 | size_t size; | ||
2121 | int ret; | ||
2122 | |||
2123 | page_count = (u32) calc_pages_for(offset, length); | ||
2124 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); | ||
2125 | if (IS_ERR(pages)) | ||
2126 | return PTR_ERR(pages); /* nothing to undo; never pass an ERR_PTR on */ | ||
2127 | /* Preset for the create steps below, which fail without an error code. */ | ||
2128 | ret = -ENOMEM; | ||
2129 | obj_request = rbd_obj_request_create(object_name, offset, length, | ||
2130 | OBJ_REQUEST_PAGES); | ||
2131 | if (!obj_request) | ||
2132 | goto out; | ||
2133 | /* Attach the page vector to the request before any later step can fail. */ | ||
2134 | obj_request->pages = pages; | ||
2135 | obj_request->page_count = page_count; | ||
2136 | |||
2137 | op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length); | ||
2138 | if (!op) | ||
2139 | goto out; | ||
2140 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, | ||
2141 | obj_request, op); | ||
2142 | rbd_osd_req_op_destroy(op); /* done whether or not the create succeeded */ | ||
2143 | if (!obj_request->osd_req) | ||
2144 | goto out; | ||
2145 | |||
2146 | osdc = &rbd_dev->rbd_client->client->osdc; | ||
2147 | ret = rbd_obj_request_submit(osdc, obj_request); | ||
2148 | if (ret) | ||
2149 | goto out; | ||
2150 | ret = rbd_obj_request_wait(obj_request); | ||
2151 | if (ret) | ||
2152 | goto out; | ||
2153 | |||
2154 | ret = obj_request->result; | ||
2155 | if (ret < 0) | ||
2156 | goto out; | ||
2157 | /* Copy out what was actually transferred and report it as the byte count. */ | ||
2158 | rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); | ||
2159 | size = (size_t) obj_request->xferred; | ||
2160 | ceph_copy_from_page_vector(pages, buf, 0, size); | ||
2161 | rbd_assert(size <= (size_t) INT_MAX); | ||
2162 | ret = (int) size; | ||
2163 | if (version) | ||
2164 | *version = obj_request->version; | ||
2165 | out: | ||
2166 | if (obj_request) | ||
2167 | rbd_obj_request_put(obj_request); /* presumably frees its pages too -- TODO confirm */ | ||
2168 | else | ||
2169 | ceph_release_page_vector(pages, page_count); /* request never took the pages */ | ||
2170 | |||
2171 | return ret; | ||
2172 | } | ||
2173 | |||
1706 | /* | 2174 | /* |
1707 | * Read the complete header for the given rbd device. | 2175 | * Read the complete header for the given rbd device. |
1708 | * | 2176 | * |
@@ -1741,24 +2209,20 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) | |||
1741 | if (!ondisk) | 2209 | if (!ondisk) |
1742 | return ERR_PTR(-ENOMEM); | 2210 | return ERR_PTR(-ENOMEM); |
1743 | 2211 | ||
1744 | ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, | 2212 | ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, |
1745 | rbd_dev->header_name, | ||
1746 | 0, size, | 2213 | 0, size, |
1747 | (char *) ondisk, version); | 2214 | (char *) ondisk, version); |
1748 | |||
1749 | if (ret < 0) | 2215 | if (ret < 0) |
1750 | goto out_err; | 2216 | goto out_err; |
1751 | if (WARN_ON((size_t) ret < size)) { | 2217 | if (WARN_ON((size_t) ret < size)) { |
1752 | ret = -ENXIO; | 2218 | ret = -ENXIO; |
1753 | pr_warning("short header read for image %s" | 2219 | rbd_warn(rbd_dev, "short header read (want %zd got %d)", |
1754 | " (want %zd got %d)\n", | 2220 | size, ret); |
1755 | rbd_dev->spec->image_name, size, ret); | ||
1756 | goto out_err; | 2221 | goto out_err; |
1757 | } | 2222 | } |
1758 | if (!rbd_dev_ondisk_valid(ondisk)) { | 2223 | if (!rbd_dev_ondisk_valid(ondisk)) { |
1759 | ret = -ENXIO; | 2224 | ret = -ENXIO; |
1760 | pr_warning("invalid header for image %s\n", | 2225 | rbd_warn(rbd_dev, "invalid header"); |
1761 | rbd_dev->spec->image_name); | ||
1762 | goto out_err; | 2226 | goto out_err; |
1763 | } | 2227 | } |
1764 | 2228 | ||
@@ -1895,8 +2359,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) | |||
1895 | disk->fops = &rbd_bd_ops; | 2359 | disk->fops = &rbd_bd_ops; |
1896 | disk->private_data = rbd_dev; | 2360 | disk->private_data = rbd_dev; |
1897 | 2361 | ||
1898 | /* init rq */ | 2362 | q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); |
1899 | q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); | ||
1900 | if (!q) | 2363 | if (!q) |
1901 | goto out_disk; | 2364 | goto out_disk; |
1902 | 2365 | ||
@@ -2233,7 +2696,7 @@ static void rbd_spec_free(struct kref *kref) | |||
2233 | kfree(spec); | 2696 | kfree(spec); |
2234 | } | 2697 | } |
2235 | 2698 | ||
2236 | struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, | 2699 | static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, |
2237 | struct rbd_spec *spec) | 2700 | struct rbd_spec *spec) |
2238 | { | 2701 | { |
2239 | struct rbd_device *rbd_dev; | 2702 | struct rbd_device *rbd_dev; |
@@ -2243,6 +2706,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, | |||
2243 | return NULL; | 2706 | return NULL; |
2244 | 2707 | ||
2245 | spin_lock_init(&rbd_dev->lock); | 2708 | spin_lock_init(&rbd_dev->lock); |
2709 | rbd_dev->flags = 0; | ||
2246 | INIT_LIST_HEAD(&rbd_dev->node); | 2710 | INIT_LIST_HEAD(&rbd_dev->node); |
2247 | INIT_LIST_HEAD(&rbd_dev->snaps); | 2711 | INIT_LIST_HEAD(&rbd_dev->snaps); |
2248 | init_rwsem(&rbd_dev->header_rwsem); | 2712 | init_rwsem(&rbd_dev->header_rwsem); |
@@ -2250,6 +2714,13 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, | |||
2250 | rbd_dev->spec = spec; | 2714 | rbd_dev->spec = spec; |
2251 | rbd_dev->rbd_client = rbdc; | 2715 | rbd_dev->rbd_client = rbdc; |
2252 | 2716 | ||
2717 | /* Initialize the layout used for all rbd requests */ | ||
2718 | |||
2719 | rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); | ||
2720 | rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); | ||
2721 | rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); | ||
2722 | rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); | ||
2723 | |||
2253 | return rbd_dev; | 2724 | return rbd_dev; |
2254 | } | 2725 | } |
2255 | 2726 | ||
@@ -2360,12 +2831,11 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, | |||
2360 | __le64 size; | 2831 | __le64 size; |
2361 | } __attribute__ ((packed)) size_buf = { 0 }; | 2832 | } __attribute__ ((packed)) size_buf = { 0 }; |
2362 | 2833 | ||
2363 | ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, | 2834 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
2364 | "rbd", "get_size", | 2835 | "rbd", "get_size", |
2365 | (char *) &snapid, sizeof (snapid), | 2836 | (char *) &snapid, sizeof (snapid), |
2366 | (char *) &size_buf, sizeof (size_buf), | 2837 | (char *) &size_buf, sizeof (size_buf), NULL); |
2367 | CEPH_OSD_FLAG_READ, NULL); | 2838 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
2368 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); | ||
2369 | if (ret < 0) | 2839 | if (ret < 0) |
2370 | return ret; | 2840 | return ret; |
2371 | 2841 | ||
@@ -2396,15 +2866,13 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) | |||
2396 | if (!reply_buf) | 2866 | if (!reply_buf) |
2397 | return -ENOMEM; | 2867 | return -ENOMEM; |
2398 | 2868 | ||
2399 | ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, | 2869 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
2400 | "rbd", "get_object_prefix", | 2870 | "rbd", "get_object_prefix", |
2401 | NULL, 0, | 2871 | NULL, 0, |
2402 | reply_buf, RBD_OBJ_PREFIX_LEN_MAX, | 2872 | reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); |
2403 | CEPH_OSD_FLAG_READ, NULL); | 2873 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
2404 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); | ||
2405 | if (ret < 0) | 2874 | if (ret < 0) |
2406 | goto out; | 2875 | goto out; |
2407 | ret = 0; /* rbd_req_sync_exec() can return positive */ | ||
2408 | 2876 | ||
2409 | p = reply_buf; | 2877 | p = reply_buf; |
2410 | rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, | 2878 | rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, |
@@ -2435,12 +2903,12 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, | |||
2435 | u64 incompat; | 2903 | u64 incompat; |
2436 | int ret; | 2904 | int ret; |
2437 | 2905 | ||
2438 | ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, | 2906 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
2439 | "rbd", "get_features", | 2907 | "rbd", "get_features", |
2440 | (char *) &snapid, sizeof (snapid), | 2908 | (char *) &snapid, sizeof (snapid), |
2441 | (char *) &features_buf, sizeof (features_buf), | 2909 | (char *) &features_buf, sizeof (features_buf), |
2442 | CEPH_OSD_FLAG_READ, NULL); | 2910 | NULL); |
2443 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); | 2911 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
2444 | if (ret < 0) | 2912 | if (ret < 0) |
2445 | return ret; | 2913 | return ret; |
2446 | 2914 | ||
@@ -2474,7 +2942,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) | |||
2474 | void *end; | 2942 | void *end; |
2475 | char *image_id; | 2943 | char *image_id; |
2476 | u64 overlap; | 2944 | u64 overlap; |
2477 | size_t len = 0; | ||
2478 | int ret; | 2945 | int ret; |
2479 | 2946 | ||
2480 | parent_spec = rbd_spec_alloc(); | 2947 | parent_spec = rbd_spec_alloc(); |
@@ -2492,12 +2959,11 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) | |||
2492 | } | 2959 | } |
2493 | 2960 | ||
2494 | snapid = cpu_to_le64(CEPH_NOSNAP); | 2961 | snapid = cpu_to_le64(CEPH_NOSNAP); |
2495 | ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, | 2962 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
2496 | "rbd", "get_parent", | 2963 | "rbd", "get_parent", |
2497 | (char *) &snapid, sizeof (snapid), | 2964 | (char *) &snapid, sizeof (snapid), |
2498 | (char *) reply_buf, size, | 2965 | (char *) reply_buf, size, NULL); |
2499 | CEPH_OSD_FLAG_READ, NULL); | 2966 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
2500 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); | ||
2501 | if (ret < 0) | 2967 | if (ret < 0) |
2502 | goto out_err; | 2968 | goto out_err; |
2503 | 2969 | ||
@@ -2508,13 +2974,18 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) | |||
2508 | if (parent_spec->pool_id == CEPH_NOPOOL) | 2974 | if (parent_spec->pool_id == CEPH_NOPOOL) |
2509 | goto out; /* No parent? No problem. */ | 2975 | goto out; /* No parent? No problem. */ |
2510 | 2976 | ||
2511 | image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); | 2977 | /* The ceph file layout needs to fit pool id in 32 bits */ |
2978 | |||
2979 | ret = -EIO; | ||
2980 | if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) | ||
2981 | goto out; | ||
2982 | |||
2983 | image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); | ||
2512 | if (IS_ERR(image_id)) { | 2984 | if (IS_ERR(image_id)) { |
2513 | ret = PTR_ERR(image_id); | 2985 | ret = PTR_ERR(image_id); |
2514 | goto out_err; | 2986 | goto out_err; |
2515 | } | 2987 | } |
2516 | parent_spec->image_id = image_id; | 2988 | parent_spec->image_id = image_id; |
2517 | parent_spec->image_id_len = len; | ||
2518 | ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); | 2989 | ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); |
2519 | ceph_decode_64_safe(&p, end, overlap, out_err); | 2990 | ceph_decode_64_safe(&p, end, overlap, out_err); |
2520 | 2991 | ||
@@ -2544,26 +3015,25 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) | |||
2544 | 3015 | ||
2545 | rbd_assert(!rbd_dev->spec->image_name); | 3016 | rbd_assert(!rbd_dev->spec->image_name); |
2546 | 3017 | ||
2547 | image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len; | 3018 | len = strlen(rbd_dev->spec->image_id); |
3019 | image_id_size = sizeof (__le32) + len; | ||
2548 | image_id = kmalloc(image_id_size, GFP_KERNEL); | 3020 | image_id = kmalloc(image_id_size, GFP_KERNEL); |
2549 | if (!image_id) | 3021 | if (!image_id) |
2550 | return NULL; | 3022 | return NULL; |
2551 | 3023 | ||
2552 | p = image_id; | 3024 | p = image_id; |
2553 | end = (char *) image_id + image_id_size; | 3025 | end = (char *) image_id + image_id_size; |
2554 | ceph_encode_string(&p, end, rbd_dev->spec->image_id, | 3026 | ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); |
2555 | (u32) rbd_dev->spec->image_id_len); | ||
2556 | 3027 | ||
2557 | size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; | 3028 | size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; |
2558 | reply_buf = kmalloc(size, GFP_KERNEL); | 3029 | reply_buf = kmalloc(size, GFP_KERNEL); |
2559 | if (!reply_buf) | 3030 | if (!reply_buf) |
2560 | goto out; | 3031 | goto out; |
2561 | 3032 | ||
2562 | ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, | 3033 | ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, |
2563 | "rbd", "dir_get_name", | 3034 | "rbd", "dir_get_name", |
2564 | image_id, image_id_size, | 3035 | image_id, image_id_size, |
2565 | (char *) reply_buf, size, | 3036 | (char *) reply_buf, size, NULL); |
2566 | CEPH_OSD_FLAG_READ, NULL); | ||
2567 | if (ret < 0) | 3037 | if (ret < 0) |
2568 | goto out; | 3038 | goto out; |
2569 | p = reply_buf; | 3039 | p = reply_buf; |
@@ -2602,8 +3072,11 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) | |||
2602 | 3072 | ||
2603 | osdc = &rbd_dev->rbd_client->client->osdc; | 3073 | osdc = &rbd_dev->rbd_client->client->osdc; |
2604 | name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); | 3074 | name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); |
2605 | if (!name) | 3075 | if (!name) { |
2606 | return -EIO; /* pool id too large (>= 2^31) */ | 3076 | rbd_warn(rbd_dev, "there is no pool with id %llu", |
3077 | rbd_dev->spec->pool_id); /* Really a BUG() */ | ||
3078 | return -EIO; | ||
3079 | } | ||
2607 | 3080 | ||
2608 | rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); | 3081 | rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); |
2609 | if (!rbd_dev->spec->pool_name) | 3082 | if (!rbd_dev->spec->pool_name) |
@@ -2612,19 +3085,17 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) | |||
2612 | /* Fetch the image name; tolerate failure here */ | 3085 | /* Fetch the image name; tolerate failure here */ |
2613 | 3086 | ||
2614 | name = rbd_dev_image_name(rbd_dev); | 3087 | name = rbd_dev_image_name(rbd_dev); |
2615 | if (name) { | 3088 | if (name) |
2616 | rbd_dev->spec->image_name_len = strlen(name); | ||
2617 | rbd_dev->spec->image_name = (char *) name; | 3089 | rbd_dev->spec->image_name = (char *) name; |
2618 | } else { | 3090 | else |
2619 | pr_warning(RBD_DRV_NAME "%d " | 3091 | rbd_warn(rbd_dev, "unable to get image name"); |
2620 | "unable to get image name for image id %s\n", | ||
2621 | rbd_dev->major, rbd_dev->spec->image_id); | ||
2622 | } | ||
2623 | 3092 | ||
2624 | /* Look up the snapshot name. */ | 3093 | /* Look up the snapshot name. */ |
2625 | 3094 | ||
2626 | name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); | 3095 | name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); |
2627 | if (!name) { | 3096 | if (!name) { |
3097 | rbd_warn(rbd_dev, "no snapshot with id %llu", | ||
3098 | rbd_dev->spec->snap_id); /* Really a BUG() */ | ||
2628 | ret = -EIO; | 3099 | ret = -EIO; |
2629 | goto out_err; | 3100 | goto out_err; |
2630 | } | 3101 | } |
@@ -2665,12 +3136,11 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) | |||
2665 | if (!reply_buf) | 3136 | if (!reply_buf) |
2666 | return -ENOMEM; | 3137 | return -ENOMEM; |
2667 | 3138 | ||
2668 | ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, | 3139 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
2669 | "rbd", "get_snapcontext", | 3140 | "rbd", "get_snapcontext", |
2670 | NULL, 0, | 3141 | NULL, 0, |
2671 | reply_buf, size, | 3142 | reply_buf, size, ver); |
2672 | CEPH_OSD_FLAG_READ, ver); | 3143 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
2673 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); | ||
2674 | if (ret < 0) | 3144 | if (ret < 0) |
2675 | goto out; | 3145 | goto out; |
2676 | 3146 | ||
@@ -2735,12 +3205,11 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) | |||
2735 | return ERR_PTR(-ENOMEM); | 3205 | return ERR_PTR(-ENOMEM); |
2736 | 3206 | ||
2737 | snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); | 3207 | snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); |
2738 | ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, | 3208 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, |
2739 | "rbd", "get_snapshot_name", | 3209 | "rbd", "get_snapshot_name", |
2740 | (char *) &snap_id, sizeof (snap_id), | 3210 | (char *) &snap_id, sizeof (snap_id), |
2741 | reply_buf, size, | 3211 | reply_buf, size, NULL); |
2742 | CEPH_OSD_FLAG_READ, NULL); | 3212 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
2743 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); | ||
2744 | if (ret < 0) | 3213 | if (ret < 0) |
2745 | goto out; | 3214 | goto out; |
2746 | 3215 | ||
@@ -2766,7 +3235,7 @@ out: | |||
2766 | static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, | 3235 | static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, |
2767 | u64 *snap_size, u64 *snap_features) | 3236 | u64 *snap_size, u64 *snap_features) |
2768 | { | 3237 | { |
2769 | __le64 snap_id; | 3238 | u64 snap_id; |
2770 | u8 order; | 3239 | u8 order; |
2771 | int ret; | 3240 | int ret; |
2772 | 3241 | ||
@@ -2865,10 +3334,17 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) | |||
2865 | if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { | 3334 | if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { |
2866 | struct list_head *next = links->next; | 3335 | struct list_head *next = links->next; |
2867 | 3336 | ||
2868 | /* Existing snapshot not in the new snap context */ | 3337 | /* |
2869 | 3338 | * A previously-existing snapshot is not in | |
3339 | * the new snap context. | ||
3340 | * | ||
3341 | * If the now missing snapshot is the one the | ||
3342 | * image is mapped to, clear its exists flag | ||
3343 | * so we can avoid sending any more requests | ||
3344 | * to it. | ||
3345 | */ | ||
2870 | if (rbd_dev->spec->snap_id == snap->id) | 3346 | if (rbd_dev->spec->snap_id == snap->id) |
2871 | rbd_dev->exists = false; | 3347 | clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); |
2872 | rbd_remove_snap_dev(snap); | 3348 | rbd_remove_snap_dev(snap); |
2873 | dout("%ssnap id %llu has been removed\n", | 3349 | dout("%ssnap id %llu has been removed\n", |
2874 | rbd_dev->spec->snap_id == snap->id ? | 3350 | rbd_dev->spec->snap_id == snap->id ? |
@@ -2942,7 +3418,7 @@ static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) | |||
2942 | struct rbd_snap *snap; | 3418 | struct rbd_snap *snap; |
2943 | int ret = 0; | 3419 | int ret = 0; |
2944 | 3420 | ||
2945 | dout("%s called\n", __func__); | 3421 | dout("%s:\n", __func__); |
2946 | if (WARN_ON(!device_is_registered(&rbd_dev->dev))) | 3422 | if (WARN_ON(!device_is_registered(&rbd_dev->dev))) |
2947 | return -EIO; | 3423 | return -EIO; |
2948 | 3424 | ||
@@ -2983,22 +3459,6 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev) | |||
2983 | device_unregister(&rbd_dev->dev); | 3459 | device_unregister(&rbd_dev->dev); |
2984 | } | 3460 | } |
2985 | 3461 | ||
2986 | static int rbd_init_watch_dev(struct rbd_device *rbd_dev) | ||
2987 | { | ||
2988 | int ret, rc; | ||
2989 | |||
2990 | do { | ||
2991 | ret = rbd_req_sync_watch(rbd_dev); | ||
2992 | if (ret == -ERANGE) { | ||
2993 | rc = rbd_dev_refresh(rbd_dev, NULL); | ||
2994 | if (rc < 0) | ||
2995 | return rc; | ||
2996 | } | ||
2997 | } while (ret == -ERANGE); | ||
2998 | |||
2999 | return ret; | ||
3000 | } | ||
3001 | |||
3002 | static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); | 3462 | static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); |
3003 | 3463 | ||
3004 | /* | 3464 | /* |
@@ -3138,11 +3598,9 @@ static inline char *dup_token(const char **buf, size_t *lenp) | |||
3138 | size_t len; | 3598 | size_t len; |
3139 | 3599 | ||
3140 | len = next_token(buf); | 3600 | len = next_token(buf); |
3141 | dup = kmalloc(len + 1, GFP_KERNEL); | 3601 | dup = kmemdup(*buf, len + 1, GFP_KERNEL); |
3142 | if (!dup) | 3602 | if (!dup) |
3143 | return NULL; | 3603 | return NULL; |
3144 | |||
3145 | memcpy(dup, *buf, len); | ||
3146 | *(dup + len) = '\0'; | 3604 | *(dup + len) = '\0'; |
3147 | *buf += len; | 3605 | *buf += len; |
3148 | 3606 | ||
@@ -3210,8 +3668,10 @@ static int rbd_add_parse_args(const char *buf, | |||
3210 | /* The first four tokens are required */ | 3668 | /* The first four tokens are required */ |
3211 | 3669 | ||
3212 | len = next_token(&buf); | 3670 | len = next_token(&buf); |
3213 | if (!len) | 3671 | if (!len) { |
3214 | return -EINVAL; /* Missing monitor address(es) */ | 3672 | rbd_warn(NULL, "no monitor address(es) provided"); |
3673 | return -EINVAL; | ||
3674 | } | ||
3215 | mon_addrs = buf; | 3675 | mon_addrs = buf; |
3216 | mon_addrs_size = len + 1; | 3676 | mon_addrs_size = len + 1; |
3217 | buf += len; | 3677 | buf += len; |
@@ -3220,8 +3680,10 @@ static int rbd_add_parse_args(const char *buf, | |||
3220 | options = dup_token(&buf, NULL); | 3680 | options = dup_token(&buf, NULL); |
3221 | if (!options) | 3681 | if (!options) |
3222 | return -ENOMEM; | 3682 | return -ENOMEM; |
3223 | if (!*options) | 3683 | if (!*options) { |
3224 | goto out_err; /* Missing options */ | 3684 | rbd_warn(NULL, "no options provided"); |
3685 | goto out_err; | ||
3686 | } | ||
3225 | 3687 | ||
3226 | spec = rbd_spec_alloc(); | 3688 | spec = rbd_spec_alloc(); |
3227 | if (!spec) | 3689 | if (!spec) |
@@ -3230,14 +3692,18 @@ static int rbd_add_parse_args(const char *buf, | |||
3230 | spec->pool_name = dup_token(&buf, NULL); | 3692 | spec->pool_name = dup_token(&buf, NULL); |
3231 | if (!spec->pool_name) | 3693 | if (!spec->pool_name) |
3232 | goto out_mem; | 3694 | goto out_mem; |
3233 | if (!*spec->pool_name) | 3695 | if (!*spec->pool_name) { |
3234 | goto out_err; /* Missing pool name */ | 3696 | rbd_warn(NULL, "no pool name provided"); |
3697 | goto out_err; | ||
3698 | } | ||
3235 | 3699 | ||
3236 | spec->image_name = dup_token(&buf, &spec->image_name_len); | 3700 | spec->image_name = dup_token(&buf, NULL); |
3237 | if (!spec->image_name) | 3701 | if (!spec->image_name) |
3238 | goto out_mem; | 3702 | goto out_mem; |
3239 | if (!*spec->image_name) | 3703 | if (!*spec->image_name) { |
3240 | goto out_err; /* Missing image name */ | 3704 | rbd_warn(NULL, "no image name provided"); |
3705 | goto out_err; | ||
3706 | } | ||
3241 | 3707 | ||
3242 | /* | 3708 | /* |
3243 | * Snapshot name is optional; default is to use "-" | 3709 | * Snapshot name is optional; default is to use "-" |
@@ -3251,10 +3717,9 @@ static int rbd_add_parse_args(const char *buf, | |||
3251 | ret = -ENAMETOOLONG; | 3717 | ret = -ENAMETOOLONG; |
3252 | goto out_err; | 3718 | goto out_err; |
3253 | } | 3719 | } |
3254 | spec->snap_name = kmalloc(len + 1, GFP_KERNEL); | 3720 | spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); |
3255 | if (!spec->snap_name) | 3721 | if (!spec->snap_name) |
3256 | goto out_mem; | 3722 | goto out_mem; |
3257 | memcpy(spec->snap_name, buf, len); | ||
3258 | *(spec->snap_name + len) = '\0'; | 3723 | *(spec->snap_name + len) = '\0'; |
3259 | 3724 | ||
3260 | /* Initialize all rbd options to the defaults */ | 3725 | /* Initialize all rbd options to the defaults */ |
@@ -3323,7 +3788,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) | |||
3323 | * First, see if the format 2 image id file exists, and if | 3788 | * First, see if the format 2 image id file exists, and if |
3324 | * so, get the image's persistent id from it. | 3789 | * so, get the image's persistent id from it. |
3325 | */ | 3790 | */ |
3326 | size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len; | 3791 | size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); |
3327 | object_name = kmalloc(size, GFP_NOIO); | 3792 | object_name = kmalloc(size, GFP_NOIO); |
3328 | if (!object_name) | 3793 | if (!object_name) |
3329 | return -ENOMEM; | 3794 | return -ENOMEM; |
@@ -3339,21 +3804,18 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) | |||
3339 | goto out; | 3804 | goto out; |
3340 | } | 3805 | } |
3341 | 3806 | ||
3342 | ret = rbd_req_sync_exec(rbd_dev, object_name, | 3807 | ret = rbd_obj_method_sync(rbd_dev, object_name, |
3343 | "rbd", "get_id", | 3808 | "rbd", "get_id", |
3344 | NULL, 0, | 3809 | NULL, 0, |
3345 | response, RBD_IMAGE_ID_LEN_MAX, | 3810 | response, RBD_IMAGE_ID_LEN_MAX, NULL); |
3346 | CEPH_OSD_FLAG_READ, NULL); | 3811 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
3347 | dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); | ||
3348 | if (ret < 0) | 3812 | if (ret < 0) |
3349 | goto out; | 3813 | goto out; |
3350 | ret = 0; /* rbd_req_sync_exec() can return positive */ | ||
3351 | 3814 | ||
3352 | p = response; | 3815 | p = response; |
3353 | rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, | 3816 | rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, |
3354 | p + RBD_IMAGE_ID_LEN_MAX, | 3817 | p + RBD_IMAGE_ID_LEN_MAX, |
3355 | &rbd_dev->spec->image_id_len, | 3818 | NULL, GFP_NOIO); |
3356 | GFP_NOIO); | ||
3357 | if (IS_ERR(rbd_dev->spec->image_id)) { | 3819 | if (IS_ERR(rbd_dev->spec->image_id)) { |
3358 | ret = PTR_ERR(rbd_dev->spec->image_id); | 3820 | ret = PTR_ERR(rbd_dev->spec->image_id); |
3359 | rbd_dev->spec->image_id = NULL; | 3821 | rbd_dev->spec->image_id = NULL; |
@@ -3377,11 +3839,10 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) | |||
3377 | rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); | 3839 | rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); |
3378 | if (!rbd_dev->spec->image_id) | 3840 | if (!rbd_dev->spec->image_id) |
3379 | return -ENOMEM; | 3841 | return -ENOMEM; |
3380 | rbd_dev->spec->image_id_len = 0; | ||
3381 | 3842 | ||
3382 | /* Record the header object name for this rbd image. */ | 3843 | /* Record the header object name for this rbd image. */ |
3383 | 3844 | ||
3384 | size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX); | 3845 | size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); |
3385 | rbd_dev->header_name = kmalloc(size, GFP_KERNEL); | 3846 | rbd_dev->header_name = kmalloc(size, GFP_KERNEL); |
3386 | if (!rbd_dev->header_name) { | 3847 | if (!rbd_dev->header_name) { |
3387 | ret = -ENOMEM; | 3848 | ret = -ENOMEM; |
@@ -3427,7 +3888,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) | |||
3427 | * Image id was filled in by the caller. Record the header | 3888 | * Image id was filled in by the caller. Record the header |
3428 | * object name for this rbd image. | 3889 | * object name for this rbd image. |
3429 | */ | 3890 | */ |
3430 | size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len; | 3891 | size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); |
3431 | rbd_dev->header_name = kmalloc(size, GFP_KERNEL); | 3892 | rbd_dev->header_name = kmalloc(size, GFP_KERNEL); |
3432 | if (!rbd_dev->header_name) | 3893 | if (!rbd_dev->header_name) |
3433 | return -ENOMEM; | 3894 | return -ENOMEM; |
@@ -3542,7 +4003,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) | |||
3542 | if (ret) | 4003 | if (ret) |
3543 | goto err_out_bus; | 4004 | goto err_out_bus; |
3544 | 4005 | ||
3545 | ret = rbd_init_watch_dev(rbd_dev); | 4006 | ret = rbd_dev_header_watch_sync(rbd_dev, 1); |
3546 | if (ret) | 4007 | if (ret) |
3547 | goto err_out_bus; | 4008 | goto err_out_bus; |
3548 | 4009 | ||
@@ -3638,6 +4099,13 @@ static ssize_t rbd_add(struct bus_type *bus, | |||
3638 | goto err_out_client; | 4099 | goto err_out_client; |
3639 | spec->pool_id = (u64) rc; | 4100 | spec->pool_id = (u64) rc; |
3640 | 4101 | ||
4102 | /* The ceph file layout needs to fit pool id in 32 bits */ | ||
4103 | |||
4104 | if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { | ||
4105 | rc = -EIO; | ||
4106 | goto err_out_client; | ||
4107 | } | ||
4108 | |||
3641 | rbd_dev = rbd_dev_create(rbdc, spec); | 4109 | rbd_dev = rbd_dev_create(rbdc, spec); |
3642 | if (!rbd_dev) | 4110 | if (!rbd_dev) |
3643 | goto err_out_client; | 4111 | goto err_out_client; |
@@ -3691,15 +4159,8 @@ static void rbd_dev_release(struct device *dev) | |||
3691 | { | 4159 | { |
3692 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 4160 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
3693 | 4161 | ||
3694 | if (rbd_dev->watch_request) { | ||
3695 | struct ceph_client *client = rbd_dev->rbd_client->client; | ||
3696 | |||
3697 | ceph_osdc_unregister_linger_request(&client->osdc, | ||
3698 | rbd_dev->watch_request); | ||
3699 | } | ||
3700 | if (rbd_dev->watch_event) | 4162 | if (rbd_dev->watch_event) |
3701 | rbd_req_sync_unwatch(rbd_dev); | 4163 | rbd_dev_header_watch_sync(rbd_dev, 0); |
3702 | |||
3703 | 4164 | ||
3704 | /* clean up and free blkdev */ | 4165 | /* clean up and free blkdev */ |
3705 | rbd_free_disk(rbd_dev); | 4166 | rbd_free_disk(rbd_dev); |
@@ -3743,10 +4204,14 @@ static ssize_t rbd_remove(struct bus_type *bus, | |||
3743 | goto done; | 4204 | goto done; |
3744 | } | 4205 | } |
3745 | 4206 | ||
3746 | if (rbd_dev->open_count) { | 4207 | spin_lock_irq(&rbd_dev->lock); |
4208 | if (rbd_dev->open_count) | ||
3747 | ret = -EBUSY; | 4209 | ret = -EBUSY; |
4210 | else | ||
4211 | set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); | ||
4212 | spin_unlock_irq(&rbd_dev->lock); | ||
4213 | if (ret < 0) | ||
3748 | goto done; | 4214 | goto done; |
3749 | } | ||
3750 | 4215 | ||
3751 | rbd_remove_all_snaps(rbd_dev); | 4216 | rbd_remove_all_snaps(rbd_dev); |
3752 | rbd_bus_del_dev(rbd_dev); | 4217 | rbd_bus_del_dev(rbd_dev); |
@@ -3782,10 +4247,15 @@ static void rbd_sysfs_cleanup(void) | |||
3782 | device_unregister(&rbd_root_dev); | 4247 | device_unregister(&rbd_root_dev); |
3783 | } | 4248 | } |
3784 | 4249 | ||
3785 | int __init rbd_init(void) | 4250 | static int __init rbd_init(void) |
3786 | { | 4251 | { |
3787 | int rc; | 4252 | int rc; |
3788 | 4253 | ||
4254 | if (!libceph_compatible(NULL)) { | ||
4255 | rbd_warn(NULL, "libceph incompatibility (quitting)"); | ||
4256 | |||
4257 | return -EINVAL; | ||
4258 | } | ||
3789 | rc = rbd_sysfs_init(); | 4259 | rc = rbd_sysfs_init(); |
3790 | if (rc) | 4260 | if (rc) |
3791 | return rc; | 4261 | return rc; |
@@ -3793,7 +4263,7 @@ int __init rbd_init(void) | |||
3793 | return 0; | 4263 | return 0; |
3794 | } | 4264 | } |
3795 | 4265 | ||
3796 | void __exit rbd_exit(void) | 4266 | static void __exit rbd_exit(void) |
3797 | { | 4267 | { |
3798 | rbd_sysfs_cleanup(); | 4268 | rbd_sysfs_cleanup(); |
3799 | } | 4269 | } |
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d4f81edd9a5d..a60ea977af6f 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page) | |||
236 | static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) | 236 | static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) |
237 | { | 237 | { |
238 | struct inode *inode = req->r_inode; | 238 | struct inode *inode = req->r_inode; |
239 | struct ceph_osd_reply_head *replyhead; | 239 | int rc = req->r_result; |
240 | int rc, bytes; | 240 | int bytes = le32_to_cpu(msg->hdr.data_len); |
241 | int i; | 241 | int i; |
242 | 242 | ||
243 | /* parse reply */ | ||
244 | replyhead = msg->front.iov_base; | ||
245 | WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); | ||
246 | rc = le32_to_cpu(replyhead->result); | ||
247 | bytes = le32_to_cpu(msg->hdr.data_len); | ||
248 | |||
249 | dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); | 243 | dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); |
250 | 244 | ||
251 | /* unlock all pages, zeroing any data we didn't read */ | 245 | /* unlock all pages, zeroing any data we didn't read */ |
@@ -315,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) | |||
315 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, | 309 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, |
316 | NULL, 0, | 310 | NULL, 0, |
317 | ci->i_truncate_seq, ci->i_truncate_size, | 311 | ci->i_truncate_seq, ci->i_truncate_size, |
318 | NULL, false, 1, 0); | 312 | NULL, false, 0); |
319 | if (IS_ERR(req)) | 313 | if (IS_ERR(req)) |
320 | return PTR_ERR(req); | 314 | return PTR_ERR(req); |
321 | 315 | ||
@@ -492,8 +486,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
492 | &ci->i_layout, snapc, | 486 | &ci->i_layout, snapc, |
493 | page_off, len, | 487 | page_off, len, |
494 | ci->i_truncate_seq, ci->i_truncate_size, | 488 | ci->i_truncate_seq, ci->i_truncate_size, |
495 | &inode->i_mtime, | 489 | &inode->i_mtime, &page, 1); |
496 | &page, 1, 0, 0, true); | ||
497 | if (err < 0) { | 490 | if (err < 0) { |
498 | dout("writepage setting page/mapping error %d %p\n", err, page); | 491 | dout("writepage setting page/mapping error %d %p\n", err, page); |
499 | SetPageError(page); | 492 | SetPageError(page); |
@@ -554,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req, | |||
554 | struct ceph_msg *msg) | 547 | struct ceph_msg *msg) |
555 | { | 548 | { |
556 | struct inode *inode = req->r_inode; | 549 | struct inode *inode = req->r_inode; |
557 | struct ceph_osd_reply_head *replyhead; | ||
558 | struct ceph_osd_op *op; | ||
559 | struct ceph_inode_info *ci = ceph_inode(inode); | 550 | struct ceph_inode_info *ci = ceph_inode(inode); |
560 | unsigned wrote; | 551 | unsigned wrote; |
561 | struct page *page; | 552 | struct page *page; |
562 | int i; | 553 | int i; |
563 | struct ceph_snap_context *snapc = req->r_snapc; | 554 | struct ceph_snap_context *snapc = req->r_snapc; |
564 | struct address_space *mapping = inode->i_mapping; | 555 | struct address_space *mapping = inode->i_mapping; |
565 | __s32 rc = -EIO; | 556 | int rc = req->r_result; |
566 | u64 bytes = 0; | 557 | u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); |
567 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 558 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
568 | long writeback_stat; | 559 | long writeback_stat; |
569 | unsigned issued = ceph_caps_issued(ci); | 560 | unsigned issued = ceph_caps_issued(ci); |
570 | 561 | ||
571 | /* parse reply */ | ||
572 | replyhead = msg->front.iov_base; | ||
573 | WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); | ||
574 | op = (void *)(replyhead + 1); | ||
575 | rc = le32_to_cpu(replyhead->result); | ||
576 | bytes = le64_to_cpu(op->extent.length); | ||
577 | |||
578 | if (rc >= 0) { | 562 | if (rc >= 0) { |
579 | /* | 563 | /* |
580 | * Assume we wrote the pages we originally sent. The | 564 | * Assume we wrote the pages we originally sent. The |
@@ -741,8 +725,6 @@ retry: | |||
741 | struct page *page; | 725 | struct page *page; |
742 | int want; | 726 | int want; |
743 | u64 offset, len; | 727 | u64 offset, len; |
744 | struct ceph_osd_request_head *reqhead; | ||
745 | struct ceph_osd_op *op; | ||
746 | long writeback_stat; | 728 | long writeback_stat; |
747 | 729 | ||
748 | next = 0; | 730 | next = 0; |
@@ -838,7 +820,7 @@ get_more_pages: | |||
838 | snapc, do_sync, | 820 | snapc, do_sync, |
839 | ci->i_truncate_seq, | 821 | ci->i_truncate_seq, |
840 | ci->i_truncate_size, | 822 | ci->i_truncate_size, |
841 | &inode->i_mtime, true, 1, 0); | 823 | &inode->i_mtime, true, 0); |
842 | 824 | ||
843 | if (IS_ERR(req)) { | 825 | if (IS_ERR(req)) { |
844 | rc = PTR_ERR(req); | 826 | rc = PTR_ERR(req); |
@@ -906,10 +888,8 @@ get_more_pages: | |||
906 | 888 | ||
907 | /* revise final length, page count */ | 889 | /* revise final length, page count */ |
908 | req->r_num_pages = locked_pages; | 890 | req->r_num_pages = locked_pages; |
909 | reqhead = req->r_request->front.iov_base; | 891 | req->r_request_ops[0].extent.length = cpu_to_le64(len); |
910 | op = (void *)(reqhead + 1); | 892 | req->r_request_ops[0].payload_len = cpu_to_le32(len); |
911 | op->extent.length = cpu_to_le64(len); | ||
912 | op->payload_len = cpu_to_le32(len); | ||
913 | req->r_request->hdr.data_len = cpu_to_le32(len); | 893 | req->r_request->hdr.data_len = cpu_to_le32(len); |
914 | 894 | ||
915 | rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); | 895 | rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); |
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ae2be696eb5b..78e2f575247d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
@@ -611,8 +611,16 @@ retry: | |||
611 | 611 | ||
612 | if (flags & CEPH_CAP_FLAG_AUTH) | 612 | if (flags & CEPH_CAP_FLAG_AUTH) |
613 | ci->i_auth_cap = cap; | 613 | ci->i_auth_cap = cap; |
614 | else if (ci->i_auth_cap == cap) | 614 | else if (ci->i_auth_cap == cap) { |
615 | ci->i_auth_cap = NULL; | 615 | ci->i_auth_cap = NULL; |
616 | spin_lock(&mdsc->cap_dirty_lock); | ||
617 | if (!list_empty(&ci->i_dirty_item)) { | ||
618 | dout(" moving %p to cap_dirty_migrating\n", inode); | ||
619 | list_move(&ci->i_dirty_item, | ||
620 | &mdsc->cap_dirty_migrating); | ||
621 | } | ||
622 | spin_unlock(&mdsc->cap_dirty_lock); | ||
623 | } | ||
616 | 624 | ||
617 | dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", | 625 | dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", |
618 | inode, ceph_vinop(inode), cap, ceph_cap_string(issued), | 626 | inode, ceph_vinop(inode), cap, ceph_cap_string(issued), |
@@ -1460,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, | |||
1460 | struct ceph_mds_client *mdsc = fsc->mdsc; | 1468 | struct ceph_mds_client *mdsc = fsc->mdsc; |
1461 | struct inode *inode = &ci->vfs_inode; | 1469 | struct inode *inode = &ci->vfs_inode; |
1462 | struct ceph_cap *cap; | 1470 | struct ceph_cap *cap; |
1463 | int file_wanted, used; | 1471 | int file_wanted, used, cap_used; |
1464 | int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ | 1472 | int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ |
1465 | int issued, implemented, want, retain, revoking, flushing = 0; | 1473 | int issued, implemented, want, retain, revoking, flushing = 0; |
1466 | int mds = -1; /* keep track of how far we've gone through i_caps list | 1474 | int mds = -1; /* keep track of how far we've gone through i_caps list |
@@ -1563,9 +1571,14 @@ retry_locked: | |||
1563 | 1571 | ||
1564 | /* NOTE: no side-effects allowed, until we take s_mutex */ | 1572 | /* NOTE: no side-effects allowed, until we take s_mutex */ |
1565 | 1573 | ||
1574 | cap_used = used; | ||
1575 | if (ci->i_auth_cap && cap != ci->i_auth_cap) | ||
1576 | cap_used &= ~ci->i_auth_cap->issued; | ||
1577 | |||
1566 | revoking = cap->implemented & ~cap->issued; | 1578 | revoking = cap->implemented & ~cap->issued; |
1567 | dout(" mds%d cap %p issued %s implemented %s revoking %s\n", | 1579 | dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", |
1568 | cap->mds, cap, ceph_cap_string(cap->issued), | 1580 | cap->mds, cap, ceph_cap_string(cap->issued), |
1581 | ceph_cap_string(cap_used), | ||
1569 | ceph_cap_string(cap->implemented), | 1582 | ceph_cap_string(cap->implemented), |
1570 | ceph_cap_string(revoking)); | 1583 | ceph_cap_string(revoking)); |
1571 | 1584 | ||
@@ -1593,7 +1606,7 @@ retry_locked: | |||
1593 | } | 1606 | } |
1594 | 1607 | ||
1595 | /* completed revocation? going down and there are no caps? */ | 1608 | /* completed revocation? going down and there are no caps? */ |
1596 | if (revoking && (revoking & used) == 0) { | 1609 | if (revoking && (revoking & cap_used) == 0) { |
1597 | dout("completed revocation of %s\n", | 1610 | dout("completed revocation of %s\n", |
1598 | ceph_cap_string(cap->implemented & ~cap->issued)); | 1611 | ceph_cap_string(cap->implemented & ~cap->issued)); |
1599 | goto ack; | 1612 | goto ack; |
@@ -1670,8 +1683,8 @@ ack: | |||
1670 | sent++; | 1683 | sent++; |
1671 | 1684 | ||
1672 | /* __send_cap drops i_ceph_lock */ | 1685 | /* __send_cap drops i_ceph_lock */ |
1673 | delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, | 1686 | delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, |
1674 | retain, flushing, NULL); | 1687 | want, retain, flushing, NULL); |
1675 | goto retry; /* retake i_ceph_lock and restart our cap scan. */ | 1688 | goto retry; /* retake i_ceph_lock and restart our cap scan. */ |
1676 | } | 1689 | } |
1677 | 1690 | ||
@@ -2417,7 +2430,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
2417 | dout("mds wanted %s -> %s\n", | 2430 | dout("mds wanted %s -> %s\n", |
2418 | ceph_cap_string(le32_to_cpu(grant->wanted)), | 2431 | ceph_cap_string(le32_to_cpu(grant->wanted)), |
2419 | ceph_cap_string(wanted)); | 2432 | ceph_cap_string(wanted)); |
2420 | grant->wanted = cpu_to_le32(wanted); | 2433 | /* imported cap may not have correct mds_wanted */ |
2434 | if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) | ||
2435 | check_caps = 1; | ||
2421 | } | 2436 | } |
2422 | 2437 | ||
2423 | cap->seq = seq; | 2438 | cap->seq = seq; |
@@ -2821,6 +2836,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
2821 | dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, | 2836 | dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, |
2822 | (unsigned)seq); | 2837 | (unsigned)seq); |
2823 | 2838 | ||
2839 | if (op == CEPH_CAP_OP_IMPORT) | ||
2840 | ceph_add_cap_releases(mdsc, session); | ||
2841 | |||
2824 | /* lookup ino */ | 2842 | /* lookup ino */ |
2825 | inode = ceph_find_inode(sb, vino); | 2843 | inode = ceph_find_inode(sb, vino); |
2826 | ci = ceph_inode(inode); | 2844 | ci = ceph_inode(inode); |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 11b57c2c8f15..bf338d9b67e3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, | |||
243 | err = ceph_mdsc_do_request(mdsc, | 243 | err = ceph_mdsc_do_request(mdsc, |
244 | (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, | 244 | (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, |
245 | req); | 245 | req); |
246 | if (err) | ||
247 | goto out_err; | ||
248 | |||
246 | err = ceph_handle_snapdir(req, dentry, err); | 249 | err = ceph_handle_snapdir(req, dentry, err); |
247 | if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) | 250 | if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) |
248 | err = ceph_handle_notrace_create(dir, dentry); | 251 | err = ceph_handle_notrace_create(dir, dentry); |
@@ -263,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, | |||
263 | err = finish_no_open(file, dn); | 266 | err = finish_no_open(file, dn); |
264 | } else { | 267 | } else { |
265 | dout("atomic_open finish_open on dn %p\n", dn); | 268 | dout("atomic_open finish_open on dn %p\n", dn); |
269 | if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { | ||
270 | *opened |= FILE_CREATED; | ||
271 | } | ||
266 | err = finish_open(file, dentry, ceph_open, opened); | 272 | err = finish_open(file, dentry, ceph_open, opened); |
267 | } | 273 | } |
268 | 274 | ||
@@ -535,7 +541,7 @@ more: | |||
535 | ci->i_snap_realm->cached_context, | 541 | ci->i_snap_realm->cached_context, |
536 | do_sync, | 542 | do_sync, |
537 | ci->i_truncate_seq, ci->i_truncate_size, | 543 | ci->i_truncate_seq, ci->i_truncate_size, |
538 | &mtime, false, 2, page_align); | 544 | &mtime, false, page_align); |
539 | if (IS_ERR(req)) | 545 | if (IS_ERR(req)) |
540 | return PTR_ERR(req); | 546 | return PTR_ERR(req); |
541 | 547 | ||
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index f5ed767806df..4a989345b37b 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c | |||
@@ -185,7 +185,6 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) | |||
185 | &ceph_sb_to_client(inode->i_sb)->client->osdc; | 185 | &ceph_sb_to_client(inode->i_sb)->client->osdc; |
186 | u64 len = 1, olen; | 186 | u64 len = 1, olen; |
187 | u64 tmp; | 187 | u64 tmp; |
188 | struct ceph_object_layout ol; | ||
189 | struct ceph_pg pgid; | 188 | struct ceph_pg pgid; |
190 | int r; | 189 | int r; |
191 | 190 | ||
@@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) | |||
194 | return -EFAULT; | 193 | return -EFAULT; |
195 | 194 | ||
196 | down_read(&osdc->map_sem); | 195 | down_read(&osdc->map_sem); |
197 | r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, | 196 | r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, |
198 | &dl.object_no, &dl.object_offset, | 197 | &dl.object_no, &dl.object_offset, |
199 | &olen); | 198 | &olen); |
200 | if (r < 0) | 199 | if (r < 0) |
@@ -209,10 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) | |||
209 | 208 | ||
210 | snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", | 209 | snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", |
211 | ceph_ino(inode), dl.object_no); | 210 | ceph_ino(inode), dl.object_no); |
212 | ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, | 211 | ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout, |
213 | osdc->osdmap); | 212 | osdc->osdmap); |
214 | 213 | ||
215 | pgid = ol.ol_pgid; | ||
216 | dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); | 214 | dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); |
217 | if (dl.osd >= 0) { | 215 | if (dl.osd >= 0) { |
218 | struct ceph_entity_addr *a = | 216 | struct ceph_entity_addr *a = |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7a3dfe0a9a80..442880d099c9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -233,6 +233,30 @@ bad: | |||
233 | } | 233 | } |
234 | 234 | ||
235 | /* | 235 | /* |
236 | * parse create results | ||
237 | */ | ||
238 | static int parse_reply_info_create(void **p, void *end, | ||
239 | struct ceph_mds_reply_info_parsed *info, | ||
240 | int features) | ||
241 | { | ||
242 | if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { | ||
243 | if (*p == end) { | ||
244 | info->has_create_ino = false; | ||
245 | } else { | ||
246 | info->has_create_ino = true; | ||
247 | info->ino = ceph_decode_64(p); | ||
248 | } | ||
249 | } | ||
250 | |||
251 | if (unlikely(*p != end)) | ||
252 | goto bad; | ||
253 | return 0; | ||
254 | |||
255 | bad: | ||
256 | return -EIO; | ||
257 | } | ||
258 | |||
259 | /* | ||
236 | * parse extra results | 260 | * parse extra results |
237 | */ | 261 | */ |
238 | static int parse_reply_info_extra(void **p, void *end, | 262 | static int parse_reply_info_extra(void **p, void *end, |
@@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end, | |||
241 | { | 265 | { |
242 | if (info->head->op == CEPH_MDS_OP_GETFILELOCK) | 266 | if (info->head->op == CEPH_MDS_OP_GETFILELOCK) |
243 | return parse_reply_info_filelock(p, end, info, features); | 267 | return parse_reply_info_filelock(p, end, info, features); |
244 | else | 268 | else if (info->head->op == CEPH_MDS_OP_READDIR) |
245 | return parse_reply_info_dir(p, end, info, features); | 269 | return parse_reply_info_dir(p, end, info, features); |
270 | else if (info->head->op == CEPH_MDS_OP_CREATE) | ||
271 | return parse_reply_info_create(p, end, info, features); | ||
272 | else | ||
273 | return -EIO; | ||
246 | } | 274 | } |
247 | 275 | ||
248 | /* | 276 | /* |
@@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) | |||
2170 | mutex_lock(&req->r_fill_mutex); | 2198 | mutex_lock(&req->r_fill_mutex); |
2171 | err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); | 2199 | err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); |
2172 | if (err == 0) { | 2200 | if (err == 0) { |
2173 | if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && | 2201 | if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || |
2202 | req->r_op == CEPH_MDS_OP_LSSNAP) && | ||
2174 | rinfo->dir_nr) | 2203 | rinfo->dir_nr) |
2175 | ceph_readdir_prepopulate(req, req->r_session); | 2204 | ceph_readdir_prepopulate(req, req->r_session); |
2176 | ceph_unreserve_caps(mdsc, &req->r_caps_reservation); | 2205 | ceph_unreserve_caps(mdsc, &req->r_caps_reservation); |
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ff4188bf6199..c2a19fbbe517 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h | |||
@@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed { | |||
74 | struct ceph_mds_reply_info_in *dir_in; | 74 | struct ceph_mds_reply_info_in *dir_in; |
75 | u8 dir_complete, dir_end; | 75 | u8 dir_complete, dir_end; |
76 | }; | 76 | }; |
77 | |||
78 | /* for create results */ | ||
79 | struct { | ||
80 | bool has_create_ino; | ||
81 | u64 ino; | ||
82 | }; | ||
77 | }; | 83 | }; |
78 | 84 | ||
79 | /* encoded blob describing snapshot contexts for certain | 85 | /* encoded blob describing snapshot contexts for certain |
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 73b7d44e8a35..0d3c9240c61b 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c | |||
@@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) | |||
59 | return ERR_PTR(-ENOMEM); | 59 | return ERR_PTR(-ENOMEM); |
60 | 60 | ||
61 | ceph_decode_16_safe(p, end, version, bad); | 61 | ceph_decode_16_safe(p, end, version, bad); |
62 | if (version > 3) { | ||
63 | pr_warning("got mdsmap version %d > 3, failing", version); | ||
64 | goto bad; | ||
65 | } | ||
62 | 66 | ||
63 | ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); | 67 | ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); |
64 | m->m_epoch = ceph_decode_32(p); | 68 | m->m_epoch = ceph_decode_32(p); |
@@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) | |||
144 | /* pg_pools */ | 148 | /* pg_pools */ |
145 | ceph_decode_32_safe(p, end, n, bad); | 149 | ceph_decode_32_safe(p, end, n, bad); |
146 | m->m_num_data_pg_pools = n; | 150 | m->m_num_data_pg_pools = n; |
147 | m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); | 151 | m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); |
148 | if (!m->m_data_pg_pools) | 152 | if (!m->m_data_pg_pools) |
149 | goto badmem; | 153 | goto badmem; |
150 | ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); | 154 | ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); |
151 | for (i = 0; i < n; i++) | 155 | for (i = 0; i < n; i++) |
152 | m->m_data_pg_pools[i] = ceph_decode_32(p); | 156 | m->m_data_pg_pools[i] = ceph_decode_64(p); |
153 | m->m_cas_pg_pool = ceph_decode_32(p); | 157 | m->m_cas_pg_pool = ceph_decode_64(p); |
154 | 158 | ||
155 | /* ok, we don't care about the rest. */ | 159 | /* ok, we don't care about the rest. */ |
156 | dout("mdsmap_decode success epoch %u\n", m->m_epoch); | 160 | dout("mdsmap_decode success epoch %u\n", m->m_epoch); |
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index cd5097d7c804..89fa4a940a0f 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c | |||
@@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s) | |||
15 | case CEPH_MDS_STATE_BOOT: return "up:boot"; | 15 | case CEPH_MDS_STATE_BOOT: return "up:boot"; |
16 | case CEPH_MDS_STATE_STANDBY: return "up:standby"; | 16 | case CEPH_MDS_STATE_STANDBY: return "up:standby"; |
17 | case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; | 17 | case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; |
18 | case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay"; | ||
18 | case CEPH_MDS_STATE_CREATING: return "up:creating"; | 19 | case CEPH_MDS_STATE_CREATING: return "up:creating"; |
19 | case CEPH_MDS_STATE_STARTING: return "up:starting"; | 20 | case CEPH_MDS_STATE_STARTING: return "up:starting"; |
20 | /* up and in */ | 21 | /* up and in */ |
@@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op) | |||
50 | case CEPH_MDS_OP_LOOKUP: return "lookup"; | 51 | case CEPH_MDS_OP_LOOKUP: return "lookup"; |
51 | case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; | 52 | case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; |
52 | case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; | 53 | case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; |
54 | case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; | ||
53 | case CEPH_MDS_OP_GETATTR: return "getattr"; | 55 | case CEPH_MDS_OP_GETATTR: return "getattr"; |
54 | case CEPH_MDS_OP_SETXATTR: return "setxattr"; | 56 | case CEPH_MDS_OP_SETXATTR: return "setxattr"; |
55 | case CEPH_MDS_OP_SETATTR: return "setattr"; | 57 | case CEPH_MDS_OP_SETATTR: return "setattr"; |
56 | case CEPH_MDS_OP_RMXATTR: return "rmxattr"; | 58 | case CEPH_MDS_OP_RMXATTR: return "rmxattr"; |
59 | case CEPH_MDS_OP_SETLAYOUT: return "setlayou"; | ||
60 | case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout"; | ||
57 | case CEPH_MDS_OP_READDIR: return "readdir"; | 61 | case CEPH_MDS_OP_READDIR: return "readdir"; |
58 | case CEPH_MDS_OP_MKNOD: return "mknod"; | 62 | case CEPH_MDS_OP_MKNOD: return "mknod"; |
59 | case CEPH_MDS_OP_LINK: return "link"; | 63 | case CEPH_MDS_OP_LINK: return "link"; |
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e86aa9948124..9fe17c6c2876 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
@@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
71 | /* | 71 | /* |
72 | * express utilization in terms of large blocks to avoid | 72 | * express utilization in terms of large blocks to avoid |
73 | * overflow on 32-bit machines. | 73 | * overflow on 32-bit machines. |
74 | * | ||
75 | * NOTE: for the time being, we make bsize == frsize to humor | ||
76 | * not-yet-ancient versions of glibc that are broken. | ||
77 | * Someday, we will probably want to report a real block | ||
78 | * size... whatever that may mean for a network file system! | ||
74 | */ | 79 | */ |
75 | buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; | 80 | buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; |
81 | buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; | ||
76 | buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); | 82 | buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); |
77 | buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); | 83 | buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); |
78 | buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); | 84 | buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); |
@@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
80 | buf->f_files = le64_to_cpu(st.num_objects); | 86 | buf->f_files = le64_to_cpu(st.num_objects); |
81 | buf->f_ffree = -1; | 87 | buf->f_ffree = -1; |
82 | buf->f_namelen = NAME_MAX; | 88 | buf->f_namelen = NAME_MAX; |
83 | buf->f_frsize = PAGE_CACHE_SIZE; | ||
84 | 89 | ||
85 | /* leave fsid little-endian, regardless of host endianness */ | 90 | /* leave fsid little-endian, regardless of host endianness */ |
86 | fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); | 91 | fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f053bbd1886f..c7b309723dcc 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -21,7 +21,7 @@ | |||
21 | 21 | ||
22 | /* large granularity for statfs utilization stats to facilitate | 22 | /* large granularity for statfs utilization stats to facilitate |
23 | * large volume sizes on 32-bit machines. */ | 23 | * large volume sizes on 32-bit machines. */ |
24 | #define CEPH_BLOCK_SHIFT 20 /* 1 MB */ | 24 | #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ |
25 | #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) | 25 | #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) |
26 | 26 | ||
27 | #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ | 27 | #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ |
@@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); | |||
798 | /* file.c */ | 798 | /* file.c */ |
799 | extern const struct file_operations ceph_file_fops; | 799 | extern const struct file_operations ceph_file_fops; |
800 | extern const struct address_space_operations ceph_aops; | 800 | extern const struct address_space_operations ceph_aops; |
801 | extern int ceph_copy_to_page_vector(struct page **pages, | 801 | |
802 | const char *data, | ||
803 | loff_t off, size_t len); | ||
804 | extern int ceph_copy_from_page_vector(struct page **pages, | ||
805 | char *data, | ||
806 | loff_t off, size_t len); | ||
807 | extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); | ||
808 | extern int ceph_open(struct inode *inode, struct file *file); | 802 | extern int ceph_open(struct inode *inode, struct file *file); |
809 | extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, | 803 | extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, |
810 | struct file *file, unsigned flags, umode_t mode, | 804 | struct file *file, unsigned flags, umode_t mode, |
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2c2ae5be9902..9b6b2b6dd164 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c | |||
@@ -29,9 +29,94 @@ struct ceph_vxattr { | |||
29 | size_t name_size; /* strlen(name) + 1 (for '\0') */ | 29 | size_t name_size; /* strlen(name) + 1 (for '\0') */ |
30 | size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, | 30 | size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, |
31 | size_t size); | 31 | size_t size); |
32 | bool readonly; | 32 | bool readonly, hidden; |
33 | bool (*exists_cb)(struct ceph_inode_info *ci); | ||
33 | }; | 34 | }; |
34 | 35 | ||
36 | /* layouts */ | ||
37 | |||
38 | static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) | ||
39 | { | ||
40 | size_t s; | ||
41 | char *p = (char *)&ci->i_layout; | ||
42 | |||
43 | for (s = 0; s < sizeof(ci->i_layout); s++, p++) | ||
44 | if (*p) | ||
45 | return true; | ||
46 | return false; | ||
47 | } | ||
48 | |||
49 | static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, | ||
50 | size_t size) | ||
51 | { | ||
52 | int ret; | ||
53 | struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); | ||
54 | struct ceph_osd_client *osdc = &fsc->client->osdc; | ||
55 | s64 pool = ceph_file_layout_pg_pool(ci->i_layout); | ||
56 | const char *pool_name; | ||
57 | |||
58 | dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); | ||
59 | down_read(&osdc->map_sem); | ||
60 | pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); | ||
61 | if (pool_name) | ||
62 | ret = snprintf(val, size, | ||
63 | "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", | ||
64 | (unsigned long long)ceph_file_layout_su(ci->i_layout), | ||
65 | (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), | ||
66 | (unsigned long long)ceph_file_layout_object_size(ci->i_layout), | ||
67 | pool_name); | ||
68 | else | ||
69 | ret = snprintf(val, size, | ||
70 | "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", | ||
71 | (unsigned long long)ceph_file_layout_su(ci->i_layout), | ||
72 | (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), | ||
73 | (unsigned long long)ceph_file_layout_object_size(ci->i_layout), | ||
74 | (unsigned long long)pool); | ||
75 | |||
76 | up_read(&osdc->map_sem); | ||
77 | return ret; | ||
78 | } | ||
79 | |||
80 | static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, | ||
81 | char *val, size_t size) | ||
82 | { | ||
83 | return snprintf(val, size, "%lld", | ||
84 | (unsigned long long)ceph_file_layout_su(ci->i_layout)); | ||
85 | } | ||
86 | |||
87 | static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, | ||
88 | char *val, size_t size) | ||
89 | { | ||
90 | return snprintf(val, size, "%lld", | ||
91 | (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); | ||
92 | } | ||
93 | |||
94 | static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, | ||
95 | char *val, size_t size) | ||
96 | { | ||
97 | return snprintf(val, size, "%lld", | ||
98 | (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); | ||
99 | } | ||
100 | |||
101 | static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, | ||
102 | char *val, size_t size) | ||
103 | { | ||
104 | int ret; | ||
105 | struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); | ||
106 | struct ceph_osd_client *osdc = &fsc->client->osdc; | ||
107 | s64 pool = ceph_file_layout_pg_pool(ci->i_layout); | ||
108 | const char *pool_name; | ||
109 | |||
110 | down_read(&osdc->map_sem); | ||
111 | pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); | ||
112 | if (pool_name) | ||
113 | ret = snprintf(val, size, "%s", pool_name); | ||
114 | else | ||
115 | ret = snprintf(val, size, "%lld", (unsigned long long)pool); | ||
116 | up_read(&osdc->map_sem); | ||
117 | return ret; | ||
118 | } | ||
119 | |||
35 | /* directories */ | 120 | /* directories */ |
36 | 121 | ||
37 | static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, | 122 | static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, |
@@ -83,17 +168,43 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, | |||
83 | (long)ci->i_rctime.tv_nsec); | 168 | (long)ci->i_rctime.tv_nsec); |
84 | } | 169 | } |
85 | 170 | ||
86 | #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name | ||
87 | 171 | ||
88 | #define XATTR_NAME_CEPH(_type, _name) \ | 172 | #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name |
89 | { \ | 173 | #define CEPH_XATTR_NAME2(_type, _name, _name2) \ |
90 | .name = CEPH_XATTR_NAME(_type, _name), \ | 174 | XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 |
91 | .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ | 175 | |
92 | .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ | 176 | #define XATTR_NAME_CEPH(_type, _name) \ |
93 | .readonly = true, \ | 177 | { \ |
94 | } | 178 | .name = CEPH_XATTR_NAME(_type, _name), \ |
179 | .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ | ||
180 | .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ | ||
181 | .readonly = true, \ | ||
182 | .hidden = false, \ | ||
183 | .exists_cb = NULL, \ | ||
184 | } | ||
185 | #define XATTR_LAYOUT_FIELD(_type, _name, _field) \ | ||
186 | { \ | ||
187 | .name = CEPH_XATTR_NAME2(_type, _name, _field), \ | ||
188 | .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ | ||
189 | .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ | ||
190 | .readonly = false, \ | ||
191 | .hidden = true, \ | ||
192 | .exists_cb = ceph_vxattrcb_layout_exists, \ | ||
193 | } | ||
95 | 194 | ||
96 | static struct ceph_vxattr ceph_dir_vxattrs[] = { | 195 | static struct ceph_vxattr ceph_dir_vxattrs[] = { |
196 | { | ||
197 | .name = "ceph.dir.layout", | ||
198 | .name_size = sizeof("ceph.dir.layout"), | ||
199 | .getxattr_cb = ceph_vxattrcb_layout, | ||
200 | .readonly = false, | ||
201 | .hidden = false, | ||
202 | .exists_cb = ceph_vxattrcb_layout_exists, | ||
203 | }, | ||
204 | XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), | ||
205 | XATTR_LAYOUT_FIELD(dir, layout, stripe_count), | ||
206 | XATTR_LAYOUT_FIELD(dir, layout, object_size), | ||
207 | XATTR_LAYOUT_FIELD(dir, layout, pool), | ||
97 | XATTR_NAME_CEPH(dir, entries), | 208 | XATTR_NAME_CEPH(dir, entries), |
98 | XATTR_NAME_CEPH(dir, files), | 209 | XATTR_NAME_CEPH(dir, files), |
99 | XATTR_NAME_CEPH(dir, subdirs), | 210 | XATTR_NAME_CEPH(dir, subdirs), |
@@ -102,35 +213,26 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { | |||
102 | XATTR_NAME_CEPH(dir, rsubdirs), | 213 | XATTR_NAME_CEPH(dir, rsubdirs), |
103 | XATTR_NAME_CEPH(dir, rbytes), | 214 | XATTR_NAME_CEPH(dir, rbytes), |
104 | XATTR_NAME_CEPH(dir, rctime), | 215 | XATTR_NAME_CEPH(dir, rctime), |
105 | { 0 } /* Required table terminator */ | 216 | { .name = NULL, 0 } /* Required table terminator */ |
106 | }; | 217 | }; |
107 | static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ | 218 | static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ |
108 | 219 | ||
109 | /* files */ | 220 | /* files */ |
110 | 221 | ||
111 | static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val, | ||
112 | size_t size) | ||
113 | { | ||
114 | int ret; | ||
115 | |||
116 | ret = snprintf(val, size, | ||
117 | "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", | ||
118 | (unsigned long long)ceph_file_layout_su(ci->i_layout), | ||
119 | (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), | ||
120 | (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); | ||
121 | return ret; | ||
122 | } | ||
123 | |||
124 | static struct ceph_vxattr ceph_file_vxattrs[] = { | 222 | static struct ceph_vxattr ceph_file_vxattrs[] = { |
125 | XATTR_NAME_CEPH(file, layout), | ||
126 | /* The following extended attribute name is deprecated */ | ||
127 | { | 223 | { |
128 | .name = XATTR_CEPH_PREFIX "layout", | 224 | .name = "ceph.file.layout", |
129 | .name_size = sizeof (XATTR_CEPH_PREFIX "layout"), | 225 | .name_size = sizeof("ceph.file.layout"), |
130 | .getxattr_cb = ceph_vxattrcb_file_layout, | 226 | .getxattr_cb = ceph_vxattrcb_layout, |
131 | .readonly = true, | 227 | .readonly = false, |
228 | .hidden = false, | ||
229 | .exists_cb = ceph_vxattrcb_layout_exists, | ||
132 | }, | 230 | }, |
133 | { 0 } /* Required table terminator */ | 231 | XATTR_LAYOUT_FIELD(file, layout, stripe_unit), |
232 | XATTR_LAYOUT_FIELD(file, layout, stripe_count), | ||
233 | XATTR_LAYOUT_FIELD(file, layout, object_size), | ||
234 | XATTR_LAYOUT_FIELD(file, layout, pool), | ||
235 | { .name = NULL, 0 } /* Required table terminator */ | ||
134 | }; | 236 | }; |
135 | static size_t ceph_file_vxattrs_name_size; /* total size of all names */ | 237 | static size_t ceph_file_vxattrs_name_size; /* total size of all names */ |
136 | 238 | ||
@@ -164,7 +266,8 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) | |||
164 | size_t size = 0; | 266 | size_t size = 0; |
165 | 267 | ||
166 | for (vxattr = vxattrs; vxattr->name; vxattr++) | 268 | for (vxattr = vxattrs; vxattr->name; vxattr++) |
167 | size += vxattr->name_size; | 269 | if (!vxattr->hidden) |
270 | size += vxattr->name_size; | ||
168 | 271 | ||
169 | return size; | 272 | return size; |
170 | } | 273 | } |
@@ -572,13 +675,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, | |||
572 | if (!ceph_is_valid_xattr(name)) | 675 | if (!ceph_is_valid_xattr(name)) |
573 | return -ENODATA; | 676 | return -ENODATA; |
574 | 677 | ||
575 | /* let's see if a virtual xattr was requested */ | ||
576 | vxattr = ceph_match_vxattr(inode, name); | ||
577 | |||
578 | spin_lock(&ci->i_ceph_lock); | 678 | spin_lock(&ci->i_ceph_lock); |
579 | dout("getxattr %p ver=%lld index_ver=%lld\n", inode, | 679 | dout("getxattr %p ver=%lld index_ver=%lld\n", inode, |
580 | ci->i_xattrs.version, ci->i_xattrs.index_version); | 680 | ci->i_xattrs.version, ci->i_xattrs.index_version); |
581 | 681 | ||
682 | /* let's see if a virtual xattr was requested */ | ||
683 | vxattr = ceph_match_vxattr(inode, name); | ||
684 | if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { | ||
685 | err = vxattr->getxattr_cb(ci, value, size); | ||
686 | goto out; | ||
687 | } | ||
688 | |||
582 | if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && | 689 | if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && |
583 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { | 690 | (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { |
584 | goto get_xattr; | 691 | goto get_xattr; |
@@ -592,11 +699,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, | |||
592 | 699 | ||
593 | spin_lock(&ci->i_ceph_lock); | 700 | spin_lock(&ci->i_ceph_lock); |
594 | 701 | ||
595 | if (vxattr && vxattr->readonly) { | ||
596 | err = vxattr->getxattr_cb(ci, value, size); | ||
597 | goto out; | ||
598 | } | ||
599 | |||
600 | err = __build_xattrs(inode); | 702 | err = __build_xattrs(inode); |
601 | if (err < 0) | 703 | if (err < 0) |
602 | goto out; | 704 | goto out; |
@@ -604,11 +706,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, | |||
604 | get_xattr: | 706 | get_xattr: |
605 | err = -ENODATA; /* == ENOATTR */ | 707 | err = -ENODATA; /* == ENOATTR */ |
606 | xattr = __get_xattr(ci, name); | 708 | xattr = __get_xattr(ci, name); |
607 | if (!xattr) { | 709 | if (!xattr) |
608 | if (vxattr) | ||
609 | err = vxattr->getxattr_cb(ci, value, size); | ||
610 | goto out; | 710 | goto out; |
611 | } | ||
612 | 711 | ||
613 | err = -ERANGE; | 712 | err = -ERANGE; |
614 | if (size && size < xattr->val_len) | 713 | if (size && size < xattr->val_len) |
@@ -664,23 +763,30 @@ list_xattr: | |||
664 | vir_namelen = ceph_vxattrs_name_size(vxattrs); | 763 | vir_namelen = ceph_vxattrs_name_size(vxattrs); |
665 | 764 | ||
666 | /* add 1 byte per xattr name for its null terminator */ | 765 | /* add 1 byte per xattr name for its null terminator */ |
667 | namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; | 766 | namelen = ci->i_xattrs.names_size + ci->i_xattrs.count; |
668 | err = -ERANGE; | 767 | err = -ERANGE; |
669 | if (size && namelen > size) | 768 | if (size && vir_namelen + namelen > size) |
670 | goto out; | 769 | goto out; |
671 | 770 | ||
672 | err = namelen; | 771 | err = namelen + vir_namelen; |
673 | if (size == 0) | 772 | if (size == 0) |
674 | goto out; | 773 | goto out; |
675 | 774 | ||
676 | names = __copy_xattr_names(ci, names); | 775 | names = __copy_xattr_names(ci, names); |
677 | 776 | ||
678 | /* virtual xattr names, too */ | 777 | /* virtual xattr names, too */ |
679 | if (vxattrs) | 778 | err = namelen; |
779 | if (vxattrs) { | ||
680 | for (i = 0; vxattrs[i].name; i++) { | 780 | for (i = 0; vxattrs[i].name; i++) { |
681 | len = sprintf(names, "%s", vxattrs[i].name); | 781 | if (!vxattrs[i].hidden && |
682 | names += len + 1; | 782 | !(vxattrs[i].exists_cb && |
783 | !vxattrs[i].exists_cb(ci))) { | ||
784 | len = sprintf(names, "%s", vxattrs[i].name); | ||
785 | names += len + 1; | ||
786 | err += len + 1; | ||
787 | } | ||
683 | } | 788 | } |
789 | } | ||
684 | 790 | ||
685 | out: | 791 | out: |
686 | spin_unlock(&ci->i_ceph_lock); | 792 | spin_unlock(&ci->i_ceph_lock); |
@@ -782,6 +888,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name, | |||
782 | if (vxattr && vxattr->readonly) | 888 | if (vxattr && vxattr->readonly) |
783 | return -EOPNOTSUPP; | 889 | return -EOPNOTSUPP; |
784 | 890 | ||
891 | /* pass any unhandled ceph.* xattrs through to the MDS */ | ||
892 | if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) | ||
893 | goto do_sync_unlocked; | ||
894 | |||
785 | /* preallocate memory for xattr name, value, index node */ | 895 | /* preallocate memory for xattr name, value, index node */ |
786 | err = -ENOMEM; | 896 | err = -ENOMEM; |
787 | newname = kmemdup(name, name_len + 1, GFP_NOFS); | 897 | newname = kmemdup(name, name_len + 1, GFP_NOFS); |
@@ -838,6 +948,7 @@ retry: | |||
838 | 948 | ||
839 | do_sync: | 949 | do_sync: |
840 | spin_unlock(&ci->i_ceph_lock); | 950 | spin_unlock(&ci->i_ceph_lock); |
951 | do_sync_unlocked: | ||
841 | err = ceph_sync_setxattr(dentry, name, value, size, flags); | 952 | err = ceph_sync_setxattr(dentry, name, value, size, flags); |
842 | out: | 953 | out: |
843 | kfree(newname); | 954 | kfree(newname); |
@@ -892,6 +1003,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name) | |||
892 | if (vxattr && vxattr->readonly) | 1003 | if (vxattr && vxattr->readonly) |
893 | return -EOPNOTSUPP; | 1004 | return -EOPNOTSUPP; |
894 | 1005 | ||
1006 | /* pass any unhandled ceph.* xattrs through to the MDS */ | ||
1007 | if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) | ||
1008 | goto do_sync_unlocked; | ||
1009 | |||
895 | err = -ENOMEM; | 1010 | err = -ENOMEM; |
896 | spin_lock(&ci->i_ceph_lock); | 1011 | spin_lock(&ci->i_ceph_lock); |
897 | retry: | 1012 | retry: |
@@ -931,6 +1046,7 @@ retry: | |||
931 | return err; | 1046 | return err; |
932 | do_sync: | 1047 | do_sync: |
933 | spin_unlock(&ci->i_ceph_lock); | 1048 | spin_unlock(&ci->i_ceph_lock); |
1049 | do_sync_unlocked: | ||
934 | err = ceph_send_removexattr(dentry, name); | 1050 | err = ceph_send_removexattr(dentry, name); |
935 | out: | 1051 | out: |
936 | return err; | 1052 | return err; |
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index dad579b0c0e6..76554cecaab2 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h | |||
@@ -12,16 +12,46 @@ | |||
12 | #define CEPH_FEATURE_MONNAMES (1<<5) | 12 | #define CEPH_FEATURE_MONNAMES (1<<5) |
13 | #define CEPH_FEATURE_RECONNECT_SEQ (1<<6) | 13 | #define CEPH_FEATURE_RECONNECT_SEQ (1<<6) |
14 | #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) | 14 | #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) |
15 | /* bits 8-17 defined by user-space; not supported yet here */ | 15 | #define CEPH_FEATURE_OBJECTLOCATOR (1<<8) |
16 | #define CEPH_FEATURE_PGID64 (1<<9) | ||
17 | #define CEPH_FEATURE_INCSUBOSDMAP (1<<10) | ||
18 | #define CEPH_FEATURE_PGPOOL3 (1<<11) | ||
19 | #define CEPH_FEATURE_OSDREPLYMUX (1<<12) | ||
20 | #define CEPH_FEATURE_OSDENC (1<<13) | ||
21 | #define CEPH_FEATURE_OMAP (1<<14) | ||
22 | #define CEPH_FEATURE_MONENC (1<<15) | ||
23 | #define CEPH_FEATURE_QUERY_T (1<<16) | ||
24 | #define CEPH_FEATURE_INDEP_PG_MAP (1<<17) | ||
16 | #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) | 25 | #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) |
26 | #define CEPH_FEATURE_CHUNKY_SCRUB (1<<19) | ||
27 | #define CEPH_FEATURE_MON_NULLROUTE (1<<20) | ||
28 | #define CEPH_FEATURE_MON_GV (1<<21) | ||
29 | #define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22) | ||
30 | #define CEPH_FEATURE_MSG_AUTH (1<<23) | ||
31 | #define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24) | ||
32 | #define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) | ||
33 | #define CEPH_FEATURE_CREATEPOOLID (1<<26) | ||
34 | #define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) | ||
35 | #define CEPH_FEATURE_OSD_HBMSGS (1<<28) | ||
36 | #define CEPH_FEATURE_MDSENC (1<<29) | ||
37 | #define CEPH_FEATURE_OSDHASHPSPOOL (1<<30) | ||
17 | 38 | ||
18 | /* | 39 | /* |
19 | * Features supported. | 40 | * Features supported. |
20 | */ | 41 | */ |
21 | #define CEPH_FEATURES_SUPPORTED_DEFAULT \ | 42 | #define CEPH_FEATURES_SUPPORTED_DEFAULT \ |
22 | (CEPH_FEATURE_NOSRCADDR | \ | 43 | (CEPH_FEATURE_NOSRCADDR | \ |
23 | CEPH_FEATURE_CRUSH_TUNABLES) | 44 | CEPH_FEATURE_PGID64 | \ |
45 | CEPH_FEATURE_PGPOOL3 | \ | ||
46 | CEPH_FEATURE_OSDENC | \ | ||
47 | CEPH_FEATURE_CRUSH_TUNABLES | \ | ||
48 | CEPH_FEATURE_CRUSH_TUNABLES2 | \ | ||
49 | CEPH_FEATURE_REPLY_CREATE_INODE | \ | ||
50 | CEPH_FEATURE_OSDHASHPSPOOL) | ||
24 | 51 | ||
25 | #define CEPH_FEATURES_REQUIRED_DEFAULT \ | 52 | #define CEPH_FEATURES_REQUIRED_DEFAULT \ |
26 | (CEPH_FEATURE_NOSRCADDR) | 53 | (CEPH_FEATURE_NOSRCADDR | \ |
54 | CEPH_FEATURE_PGID64 | \ | ||
55 | CEPH_FEATURE_PGPOOL3 | \ | ||
56 | CEPH_FEATURE_OSDENC) | ||
27 | #endif | 57 | #endif |
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index cf6f4d998a76..2ad7b860f062 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h | |||
@@ -21,16 +21,14 @@ | |||
21 | * internal cluster protocols separately from the public, | 21 | * internal cluster protocols separately from the public, |
22 | * client-facing protocol. | 22 | * client-facing protocol. |
23 | */ | 23 | */ |
24 | #define CEPH_OSD_PROTOCOL 8 /* cluster internal */ | ||
25 | #define CEPH_MDS_PROTOCOL 12 /* cluster internal */ | ||
26 | #define CEPH_MON_PROTOCOL 5 /* cluster internal */ | ||
27 | #define CEPH_OSDC_PROTOCOL 24 /* server/client */ | 24 | #define CEPH_OSDC_PROTOCOL 24 /* server/client */ |
28 | #define CEPH_MDSC_PROTOCOL 32 /* server/client */ | 25 | #define CEPH_MDSC_PROTOCOL 32 /* server/client */ |
29 | #define CEPH_MONC_PROTOCOL 15 /* server/client */ | 26 | #define CEPH_MONC_PROTOCOL 15 /* server/client */ |
30 | 27 | ||
31 | 28 | ||
32 | #define CEPH_INO_ROOT 1 | 29 | #define CEPH_INO_ROOT 1 |
33 | #define CEPH_INO_CEPH 2 /* hidden .ceph dir */ | 30 | #define CEPH_INO_CEPH 2 /* hidden .ceph dir */ |
31 | #define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */ | ||
34 | 32 | ||
35 | /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ | 33 | /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ |
36 | #define CEPH_MAX_MON 31 | 34 | #define CEPH_MAX_MON 31 |
@@ -51,7 +49,7 @@ struct ceph_file_layout { | |||
51 | __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */ | 49 | __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */ |
52 | 50 | ||
53 | /* object -> pg layout */ | 51 | /* object -> pg layout */ |
54 | __le32 fl_unused; /* unused; used to be preferred primary (-1) */ | 52 | __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */ |
55 | __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ | 53 | __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ |
56 | } __attribute__ ((packed)); | 54 | } __attribute__ ((packed)); |
57 | 55 | ||
@@ -101,6 +99,8 @@ struct ceph_dir_layout { | |||
101 | #define CEPH_MSG_MON_SUBSCRIBE_ACK 16 | 99 | #define CEPH_MSG_MON_SUBSCRIBE_ACK 16 |
102 | #define CEPH_MSG_AUTH 17 | 100 | #define CEPH_MSG_AUTH 17 |
103 | #define CEPH_MSG_AUTH_REPLY 18 | 101 | #define CEPH_MSG_AUTH_REPLY 18 |
102 | #define CEPH_MSG_MON_GET_VERSION 19 | ||
103 | #define CEPH_MSG_MON_GET_VERSION_REPLY 20 | ||
104 | 104 | ||
105 | /* client <-> mds */ | 105 | /* client <-> mds */ |
106 | #define CEPH_MSG_MDS_MAP 21 | 106 | #define CEPH_MSG_MDS_MAP 21 |
@@ -221,6 +221,11 @@ struct ceph_mon_subscribe_ack { | |||
221 | } __attribute__ ((packed)); | 221 | } __attribute__ ((packed)); |
222 | 222 | ||
223 | /* | 223 | /* |
224 | * mdsmap flags | ||
225 | */ | ||
226 | #define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */ | ||
227 | |||
228 | /* | ||
224 | * mds states | 229 | * mds states |
225 | * > 0 -> in | 230 | * > 0 -> in |
226 | * <= 0 -> out | 231 | * <= 0 -> out |
@@ -233,6 +238,7 @@ struct ceph_mon_subscribe_ack { | |||
233 | #define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ | 238 | #define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ |
234 | #define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ | 239 | #define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ |
235 | #define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ | 240 | #define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ |
241 | #define CEPH_MDS_STATE_REPLAYONCE -9 /* up, replaying an active node's journal */ | ||
236 | 242 | ||
237 | #define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ | 243 | #define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ |
238 | #define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed | 244 | #define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed |
@@ -264,6 +270,7 @@ extern const char *ceph_mds_state_name(int s); | |||
264 | #define CEPH_LOCK_IXATTR 2048 | 270 | #define CEPH_LOCK_IXATTR 2048 |
265 | #define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ | 271 | #define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ |
266 | #define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ | 272 | #define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ |
273 | #define CEPH_LOCK_IPOLICY 16384 /* policy lock on dirs. MDS internal */ | ||
267 | 274 | ||
268 | /* client_session ops */ | 275 | /* client_session ops */ |
269 | enum { | 276 | enum { |
@@ -338,6 +345,12 @@ extern const char *ceph_mds_op_name(int op); | |||
338 | #define CEPH_SETATTR_SIZE 32 | 345 | #define CEPH_SETATTR_SIZE 32 |
339 | #define CEPH_SETATTR_CTIME 64 | 346 | #define CEPH_SETATTR_CTIME 64 |
340 | 347 | ||
348 | /* | ||
349 | * Ceph setxattr request flags. | ||
350 | */ | ||
351 | #define CEPH_XATTR_CREATE 1 | ||
352 | #define CEPH_XATTR_REPLACE 2 | ||
353 | |||
341 | union ceph_mds_request_args { | 354 | union ceph_mds_request_args { |
342 | struct { | 355 | struct { |
343 | __le32 mask; /* CEPH_CAP_* */ | 356 | __le32 mask; /* CEPH_CAP_* */ |
@@ -522,14 +535,17 @@ int ceph_flags_to_mode(int flags); | |||
522 | #define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ | 535 | #define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ |
523 | #define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ | 536 | #define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ |
524 | 537 | ||
538 | #define CEPH_CAP_SIMPLE_BITS 2 | ||
539 | #define CEPH_CAP_FILE_BITS 8 | ||
540 | |||
525 | /* per-lock shift */ | 541 | /* per-lock shift */ |
526 | #define CEPH_CAP_SAUTH 2 | 542 | #define CEPH_CAP_SAUTH 2 |
527 | #define CEPH_CAP_SLINK 4 | 543 | #define CEPH_CAP_SLINK 4 |
528 | #define CEPH_CAP_SXATTR 6 | 544 | #define CEPH_CAP_SXATTR 6 |
529 | #define CEPH_CAP_SFILE 8 | 545 | #define CEPH_CAP_SFILE 8 |
530 | #define CEPH_CAP_SFLOCK 20 | 546 | #define CEPH_CAP_SFLOCK 20 |
531 | 547 | ||
532 | #define CEPH_CAP_BITS 22 | 548 | #define CEPH_CAP_BITS 22 |
533 | 549 | ||
534 | /* composed values */ | 550 | /* composed values */ |
535 | #define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) | 551 | #define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) |
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 63d092822bad..360d9d08ca9e 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h | |||
@@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end, size_t n) | |||
52 | return end >= *p && n <= end - *p; | 52 | return end >= *p && n <= end - *p; |
53 | } | 53 | } |
54 | 54 | ||
55 | #define ceph_decode_need(p, end, n, bad) \ | 55 | #define ceph_decode_need(p, end, n, bad) \ |
56 | do { \ | 56 | do { \ |
57 | if (!likely(ceph_has_room(p, end, n))) \ | 57 | if (!likely(ceph_has_room(p, end, n))) \ |
58 | goto bad; \ | 58 | goto bad; \ |
59 | } while (0) | 59 | } while (0) |
60 | 60 | ||
61 | #define ceph_decode_64_safe(p, end, v, bad) \ | 61 | #define ceph_decode_64_safe(p, end, v, bad) \ |
@@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n) | |||
99 | * | 99 | * |
100 | * There are two possible failures: | 100 | * There are two possible failures: |
101 | * - converting the string would require accessing memory at or | 101 | * - converting the string would require accessing memory at or |
102 | * beyond the "end" pointer provided (-E | 102 | * beyond the "end" pointer provided (-ERANGE) |
103 | * - memory could not be allocated for the result | 103 | * - memory could not be allocated for the result (-ENOMEM) |
104 | */ | 104 | */ |
105 | static inline char *ceph_extract_encoded_string(void **p, void *end, | 105 | static inline char *ceph_extract_encoded_string(void **p, void *end, |
106 | size_t *lenp, gfp_t gfp) | 106 | size_t *lenp, gfp_t gfp) |
@@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p, void *end, | |||
217 | *p += len; | 217 | *p += len; |
218 | } | 218 | } |
219 | 219 | ||
220 | #define ceph_encode_need(p, end, n, bad) \ | 220 | #define ceph_encode_need(p, end, n, bad) \ |
221 | do { \ | 221 | do { \ |
222 | if (!likely(ceph_has_room(p, end, n))) \ | 222 | if (!likely(ceph_has_room(p, end, n))) \ |
223 | goto bad; \ | 223 | goto bad; \ |
224 | } while (0) | 224 | } while (0) |
225 | 225 | ||
226 | #define ceph_encode_64_safe(p, end, v, bad) \ | 226 | #define ceph_encode_64_safe(p, end, v, bad) \ |
@@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p, void *end, | |||
231 | #define ceph_encode_32_safe(p, end, v, bad) \ | 231 | #define ceph_encode_32_safe(p, end, v, bad) \ |
232 | do { \ | 232 | do { \ |
233 | ceph_encode_need(p, end, sizeof(u32), bad); \ | 233 | ceph_encode_need(p, end, sizeof(u32), bad); \ |
234 | ceph_encode_32(p, v); \ | 234 | ceph_encode_32(p, v); \ |
235 | } while (0) | 235 | } while (0) |
236 | #define ceph_encode_16_safe(p, end, v, bad) \ | 236 | #define ceph_encode_16_safe(p, end, v, bad) \ |
237 | do { \ | 237 | do { \ |
238 | ceph_encode_need(p, end, sizeof(u16), bad); \ | 238 | ceph_encode_need(p, end, sizeof(u16), bad); \ |
239 | ceph_encode_16(p, v); \ | 239 | ceph_encode_16(p, v); \ |
240 | } while (0) | ||
241 | #define ceph_encode_8_safe(p, end, v, bad) \ | ||
242 | do { \ | ||
243 | ceph_encode_need(p, end, sizeof(u8), bad); \ | ||
244 | ceph_encode_8(p, v); \ | ||
240 | } while (0) | 245 | } while (0) |
241 | 246 | ||
242 | #define ceph_encode_copy_safe(p, end, pv, n, bad) \ | 247 | #define ceph_encode_copy_safe(p, end, pv, n, bad) \ |
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 084d3c622b12..29818fc3fa49 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h | |||
@@ -193,6 +193,8 @@ static inline int calc_pages_for(u64 off, u64 len) | |||
193 | } | 193 | } |
194 | 194 | ||
195 | /* ceph_common.c */ | 195 | /* ceph_common.c */ |
196 | extern bool libceph_compatible(void *data); | ||
197 | |||
196 | extern const char *ceph_msg_type_name(int type); | 198 | extern const char *ceph_msg_type_name(int type); |
197 | extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); | 199 | extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); |
198 | extern struct kmem_cache *ceph_inode_cachep; | 200 | extern struct kmem_cache *ceph_inode_cachep; |
@@ -220,7 +222,7 @@ extern int ceph_open_session(struct ceph_client *client); | |||
220 | /* pagevec.c */ | 222 | /* pagevec.c */ |
221 | extern void ceph_release_page_vector(struct page **pages, int num_pages); | 223 | extern void ceph_release_page_vector(struct page **pages, int num_pages); |
222 | 224 | ||
223 | extern struct page **ceph_get_direct_page_vector(const char __user *data, | 225 | extern struct page **ceph_get_direct_page_vector(const void __user *data, |
224 | int num_pages, | 226 | int num_pages, |
225 | bool write_page); | 227 | bool write_page); |
226 | extern void ceph_put_page_vector(struct page **pages, int num_pages, | 228 | extern void ceph_put_page_vector(struct page **pages, int num_pages, |
@@ -228,15 +230,15 @@ extern void ceph_put_page_vector(struct page **pages, int num_pages, | |||
228 | extern void ceph_release_page_vector(struct page **pages, int num_pages); | 230 | extern void ceph_release_page_vector(struct page **pages, int num_pages); |
229 | extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); | 231 | extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); |
230 | extern int ceph_copy_user_to_page_vector(struct page **pages, | 232 | extern int ceph_copy_user_to_page_vector(struct page **pages, |
231 | const char __user *data, | 233 | const void __user *data, |
232 | loff_t off, size_t len); | 234 | loff_t off, size_t len); |
233 | extern int ceph_copy_to_page_vector(struct page **pages, | 235 | extern void ceph_copy_to_page_vector(struct page **pages, |
234 | const char *data, | 236 | const void *data, |
235 | loff_t off, size_t len); | 237 | loff_t off, size_t len); |
236 | extern int ceph_copy_from_page_vector(struct page **pages, | 238 | extern void ceph_copy_from_page_vector(struct page **pages, |
237 | char *data, | 239 | void *data, |
238 | loff_t off, size_t len); | 240 | loff_t off, size_t len); |
239 | extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data, | 241 | extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data, |
240 | loff_t off, size_t len); | 242 | loff_t off, size_t len); |
241 | extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); | 243 | extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); |
242 | 244 | ||
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index cb15b5d867c7..87ed09f54800 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h | |||
@@ -29,8 +29,8 @@ struct ceph_mdsmap { | |||
29 | 29 | ||
30 | /* which object pools file data can be stored in */ | 30 | /* which object pools file data can be stored in */ |
31 | int m_num_data_pg_pools; | 31 | int m_num_data_pg_pools; |
32 | u32 *m_data_pg_pools; | 32 | u64 *m_data_pg_pools; |
33 | u32 m_cas_pg_pool; | 33 | u64 m_cas_pg_pool; |
34 | }; | 34 | }; |
35 | 35 | ||
36 | static inline struct ceph_entity_addr * | 36 | static inline struct ceph_entity_addr * |
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 14ba5ee738a9..60903e0f665c 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h | |||
@@ -83,9 +83,11 @@ struct ceph_msg { | |||
83 | struct list_head list_head; | 83 | struct list_head list_head; |
84 | 84 | ||
85 | struct kref kref; | 85 | struct kref kref; |
86 | #ifdef CONFIG_BLOCK | ||
86 | struct bio *bio; /* instead of pages/pagelist */ | 87 | struct bio *bio; /* instead of pages/pagelist */ |
87 | struct bio *bio_iter; /* bio iterator */ | 88 | struct bio *bio_iter; /* bio iterator */ |
88 | int bio_seg; /* current bio segment */ | 89 | int bio_seg; /* current bio segment */ |
90 | #endif /* CONFIG_BLOCK */ | ||
89 | struct ceph_pagelist *trail; /* the trailing part of the data */ | 91 | struct ceph_pagelist *trail; /* the trailing part of the data */ |
90 | bool front_is_vmalloc; | 92 | bool front_is_vmalloc; |
91 | bool more_to_follow; | 93 | bool more_to_follow; |
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d9b880e977e6..1dd5d466b6f9 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/ceph/osdmap.h> | 10 | #include <linux/ceph/osdmap.h> |
11 | #include <linux/ceph/messenger.h> | 11 | #include <linux/ceph/messenger.h> |
12 | #include <linux/ceph/auth.h> | 12 | #include <linux/ceph/auth.h> |
13 | #include <linux/ceph/pagelist.h> | ||
13 | 14 | ||
14 | /* | 15 | /* |
15 | * Maximum object name size | 16 | * Maximum object name size |
@@ -22,7 +23,6 @@ struct ceph_snap_context; | |||
22 | struct ceph_osd_request; | 23 | struct ceph_osd_request; |
23 | struct ceph_osd_client; | 24 | struct ceph_osd_client; |
24 | struct ceph_authorizer; | 25 | struct ceph_authorizer; |
25 | struct ceph_pagelist; | ||
26 | 26 | ||
27 | /* | 27 | /* |
28 | * completion callback for async writepages | 28 | * completion callback for async writepages |
@@ -47,6 +47,9 @@ struct ceph_osd { | |||
47 | struct list_head o_keepalive_item; | 47 | struct list_head o_keepalive_item; |
48 | }; | 48 | }; |
49 | 49 | ||
50 | |||
51 | #define CEPH_OSD_MAX_OP 10 | ||
52 | |||
50 | /* an in-flight request */ | 53 | /* an in-flight request */ |
51 | struct ceph_osd_request { | 54 | struct ceph_osd_request { |
52 | u64 r_tid; /* unique for this client */ | 55 | u64 r_tid; /* unique for this client */ |
@@ -63,9 +66,23 @@ struct ceph_osd_request { | |||
63 | struct ceph_connection *r_con_filling_msg; | 66 | struct ceph_connection *r_con_filling_msg; |
64 | 67 | ||
65 | struct ceph_msg *r_request, *r_reply; | 68 | struct ceph_msg *r_request, *r_reply; |
66 | int r_result; | ||
67 | int r_flags; /* any additional flags for the osd */ | 69 | int r_flags; /* any additional flags for the osd */ |
68 | u32 r_sent; /* >0 if r_request is sending/sent */ | 70 | u32 r_sent; /* >0 if r_request is sending/sent */ |
71 | int r_num_ops; | ||
72 | |||
73 | /* encoded message content */ | ||
74 | struct ceph_osd_op *r_request_ops; | ||
75 | /* these are updated on each send */ | ||
76 | __le32 *r_request_osdmap_epoch; | ||
77 | __le32 *r_request_flags; | ||
78 | __le64 *r_request_pool; | ||
79 | void *r_request_pgid; | ||
80 | __le32 *r_request_attempts; | ||
81 | struct ceph_eversion *r_request_reassert_version; | ||
82 | |||
83 | int r_result; | ||
84 | int r_reply_op_len[CEPH_OSD_MAX_OP]; | ||
85 | s32 r_reply_op_result[CEPH_OSD_MAX_OP]; | ||
69 | int r_got_reply; | 86 | int r_got_reply; |
70 | int r_linger; | 87 | int r_linger; |
71 | 88 | ||
@@ -82,6 +99,7 @@ struct ceph_osd_request { | |||
82 | 99 | ||
83 | char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ | 100 | char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ |
84 | int r_oid_len; | 101 | int r_oid_len; |
102 | u64 r_snapid; | ||
85 | unsigned long r_stamp; /* send OR check time */ | 103 | unsigned long r_stamp; /* send OR check time */ |
86 | 104 | ||
87 | struct ceph_file_layout r_file_layout; | 105 | struct ceph_file_layout r_file_layout; |
@@ -95,7 +113,7 @@ struct ceph_osd_request { | |||
95 | struct bio *r_bio; /* instead of pages */ | 113 | struct bio *r_bio; /* instead of pages */ |
96 | #endif | 114 | #endif |
97 | 115 | ||
98 | struct ceph_pagelist *r_trail; /* trailing part of the data */ | 116 | struct ceph_pagelist r_trail; /* trailing part of the data */ |
99 | }; | 117 | }; |
100 | 118 | ||
101 | struct ceph_osd_event { | 119 | struct ceph_osd_event { |
@@ -107,7 +125,6 @@ struct ceph_osd_event { | |||
107 | struct rb_node node; | 125 | struct rb_node node; |
108 | struct list_head osd_node; | 126 | struct list_head osd_node; |
109 | struct kref kref; | 127 | struct kref kref; |
110 | struct completion completion; | ||
111 | }; | 128 | }; |
112 | 129 | ||
113 | struct ceph_osd_event_work { | 130 | struct ceph_osd_event_work { |
@@ -157,7 +174,7 @@ struct ceph_osd_client { | |||
157 | 174 | ||
158 | struct ceph_osd_req_op { | 175 | struct ceph_osd_req_op { |
159 | u16 op; /* CEPH_OSD_OP_* */ | 176 | u16 op; /* CEPH_OSD_OP_* */ |
160 | u32 flags; /* CEPH_OSD_FLAG_* */ | 177 | u32 payload_len; |
161 | union { | 178 | union { |
162 | struct { | 179 | struct { |
163 | u64 offset, length; | 180 | u64 offset, length; |
@@ -166,23 +183,24 @@ struct ceph_osd_req_op { | |||
166 | } extent; | 183 | } extent; |
167 | struct { | 184 | struct { |
168 | const char *name; | 185 | const char *name; |
169 | u32 name_len; | ||
170 | const char *val; | 186 | const char *val; |
187 | u32 name_len; | ||
171 | u32 value_len; | 188 | u32 value_len; |
172 | __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ | 189 | __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ |
173 | __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ | 190 | __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ |
174 | } xattr; | 191 | } xattr; |
175 | struct { | 192 | struct { |
176 | const char *class_name; | 193 | const char *class_name; |
177 | __u8 class_len; | ||
178 | const char *method_name; | 194 | const char *method_name; |
179 | __u8 method_len; | ||
180 | __u8 argc; | ||
181 | const char *indata; | 195 | const char *indata; |
182 | u32 indata_len; | 196 | u32 indata_len; |
197 | __u8 class_len; | ||
198 | __u8 method_len; | ||
199 | __u8 argc; | ||
183 | } cls; | 200 | } cls; |
184 | struct { | 201 | struct { |
185 | u64 cookie, count; | 202 | u64 cookie; |
203 | u64 count; | ||
186 | } pgls; | 204 | } pgls; |
187 | struct { | 205 | struct { |
188 | u64 snapid; | 206 | u64 snapid; |
@@ -190,12 +208,11 @@ struct ceph_osd_req_op { | |||
190 | struct { | 208 | struct { |
191 | u64 cookie; | 209 | u64 cookie; |
192 | u64 ver; | 210 | u64 ver; |
193 | __u8 flag; | ||
194 | u32 prot_ver; | 211 | u32 prot_ver; |
195 | u32 timeout; | 212 | u32 timeout; |
213 | __u8 flag; | ||
196 | } watch; | 214 | } watch; |
197 | }; | 215 | }; |
198 | u32 payload_len; | ||
199 | }; | 216 | }; |
200 | 217 | ||
201 | extern int ceph_osdc_init(struct ceph_osd_client *osdc, | 218 | extern int ceph_osdc_init(struct ceph_osd_client *osdc, |
@@ -207,29 +224,19 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, | |||
207 | extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, | 224 | extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, |
208 | struct ceph_msg *msg); | 225 | struct ceph_msg *msg); |
209 | 226 | ||
210 | extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, | ||
211 | struct ceph_file_layout *layout, | ||
212 | u64 snapid, | ||
213 | u64 off, u64 *plen, u64 *bno, | ||
214 | struct ceph_osd_request *req, | ||
215 | struct ceph_osd_req_op *op); | ||
216 | |||
217 | extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | 227 | extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, |
218 | int flags, | ||
219 | struct ceph_snap_context *snapc, | 228 | struct ceph_snap_context *snapc, |
220 | struct ceph_osd_req_op *ops, | 229 | unsigned int num_op, |
221 | bool use_mempool, | 230 | bool use_mempool, |
222 | gfp_t gfp_flags, | 231 | gfp_t gfp_flags); |
223 | struct page **pages, | ||
224 | struct bio *bio); | ||
225 | 232 | ||
226 | extern void ceph_osdc_build_request(struct ceph_osd_request *req, | 233 | extern void ceph_osdc_build_request(struct ceph_osd_request *req, |
227 | u64 off, u64 *plen, | 234 | u64 off, u64 len, |
235 | unsigned int num_op, | ||
228 | struct ceph_osd_req_op *src_ops, | 236 | struct ceph_osd_req_op *src_ops, |
229 | struct ceph_snap_context *snapc, | 237 | struct ceph_snap_context *snapc, |
230 | struct timespec *mtime, | 238 | u64 snap_id, |
231 | const char *oid, | 239 | struct timespec *mtime); |
232 | int oid_len); | ||
233 | 240 | ||
234 | extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, | 241 | extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, |
235 | struct ceph_file_layout *layout, | 242 | struct ceph_file_layout *layout, |
@@ -239,8 +246,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, | |||
239 | int do_sync, u32 truncate_seq, | 246 | int do_sync, u32 truncate_seq, |
240 | u64 truncate_size, | 247 | u64 truncate_size, |
241 | struct timespec *mtime, | 248 | struct timespec *mtime, |
242 | bool use_mempool, int num_reply, | 249 | bool use_mempool, int page_align); |
243 | int page_align); | ||
244 | 250 | ||
245 | extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, | 251 | extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, |
246 | struct ceph_osd_request *req); | 252 | struct ceph_osd_request *req); |
@@ -279,17 +285,13 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, | |||
279 | u64 off, u64 len, | 285 | u64 off, u64 len, |
280 | u32 truncate_seq, u64 truncate_size, | 286 | u32 truncate_seq, u64 truncate_size, |
281 | struct timespec *mtime, | 287 | struct timespec *mtime, |
282 | struct page **pages, int nr_pages, | 288 | struct page **pages, int nr_pages); |
283 | int flags, int do_sync, bool nofail); | ||
284 | 289 | ||
285 | /* watch/notify events */ | 290 | /* watch/notify events */ |
286 | extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, | 291 | extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, |
287 | void (*event_cb)(u64, u64, u8, void *), | 292 | void (*event_cb)(u64, u64, u8, void *), |
288 | int one_shot, void *data, | 293 | void *data, struct ceph_osd_event **pevent); |
289 | struct ceph_osd_event **pevent); | ||
290 | extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); | 294 | extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); |
291 | extern int ceph_osdc_wait_event(struct ceph_osd_event *event, | ||
292 | unsigned long timeout); | ||
293 | extern void ceph_osdc_put_event(struct ceph_osd_event *event); | 295 | extern void ceph_osdc_put_event(struct ceph_osd_event *event); |
294 | #endif | 296 | #endif |
295 | 297 | ||
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 10a417f9f76f..c819190d1642 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h | |||
@@ -18,14 +18,31 @@ | |||
18 | * The map can be updated either via an incremental map (diff) describing | 18 | * The map can be updated either via an incremental map (diff) describing |
19 | * the change between two successive epochs, or as a fully encoded map. | 19 | * the change between two successive epochs, or as a fully encoded map. |
20 | */ | 20 | */ |
21 | struct ceph_pg { | ||
22 | uint64_t pool; | ||
23 | uint32_t seed; | ||
24 | }; | ||
25 | |||
26 | #define CEPH_POOL_FLAG_HASHPSPOOL 1 | ||
27 | |||
21 | struct ceph_pg_pool_info { | 28 | struct ceph_pg_pool_info { |
22 | struct rb_node node; | 29 | struct rb_node node; |
23 | int id; | 30 | s64 id; |
24 | struct ceph_pg_pool v; | 31 | u8 type; |
25 | int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; | 32 | u8 size; |
33 | u8 crush_ruleset; | ||
34 | u8 object_hash; | ||
35 | u32 pg_num, pgp_num; | ||
36 | int pg_num_mask, pgp_num_mask; | ||
37 | u64 flags; | ||
26 | char *name; | 38 | char *name; |
27 | }; | 39 | }; |
28 | 40 | ||
41 | struct ceph_object_locator { | ||
42 | uint64_t pool; | ||
43 | char *key; | ||
44 | }; | ||
45 | |||
29 | struct ceph_pg_mapping { | 46 | struct ceph_pg_mapping { |
30 | struct rb_node node; | 47 | struct rb_node node; |
31 | struct ceph_pg pgid; | 48 | struct ceph_pg pgid; |
@@ -110,15 +127,16 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map); | |||
110 | 127 | ||
111 | /* calculate mapping of a file extent to an object */ | 128 | /* calculate mapping of a file extent to an object */ |
112 | extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | 129 | extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, |
113 | u64 off, u64 *plen, | 130 | u64 off, u64 len, |
114 | u64 *bno, u64 *oxoff, u64 *oxlen); | 131 | u64 *bno, u64 *oxoff, u64 *oxlen); |
115 | 132 | ||
116 | /* calculate mapping of object to a placement group */ | 133 | /* calculate mapping of object to a placement group */ |
117 | extern int ceph_calc_object_layout(struct ceph_object_layout *ol, | 134 | extern int ceph_calc_object_layout(struct ceph_pg *pg, |
118 | const char *oid, | 135 | const char *oid, |
119 | struct ceph_file_layout *fl, | 136 | struct ceph_file_layout *fl, |
120 | struct ceph_osdmap *osdmap); | 137 | struct ceph_osdmap *osdmap); |
121 | extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | 138 | extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, |
139 | struct ceph_pg pgid, | ||
122 | int *acting); | 140 | int *acting); |
123 | extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, | 141 | extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, |
124 | struct ceph_pg pgid); | 142 | struct ceph_pg pgid); |
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 2c04afeead1c..68c96a508ac2 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h | |||
@@ -9,14 +9,6 @@ | |||
9 | #include <linux/ceph/msgr.h> | 9 | #include <linux/ceph/msgr.h> |
10 | 10 | ||
11 | /* | 11 | /* |
12 | * osdmap encoding versions | ||
13 | */ | ||
14 | #define CEPH_OSDMAP_INC_VERSION 5 | ||
15 | #define CEPH_OSDMAP_INC_VERSION_EXT 6 | ||
16 | #define CEPH_OSDMAP_VERSION 5 | ||
17 | #define CEPH_OSDMAP_VERSION_EXT 6 | ||
18 | |||
19 | /* | ||
20 | * fs id | 12 | * fs id |
21 | */ | 13 | */ |
22 | struct ceph_fsid { | 14 | struct ceph_fsid { |
@@ -64,7 +56,7 @@ struct ceph_timespec { | |||
64 | * placement group. | 56 | * placement group. |
65 | * we encode this into one __le64. | 57 | * we encode this into one __le64. |
66 | */ | 58 | */ |
67 | struct ceph_pg { | 59 | struct ceph_pg_v1 { |
68 | __le16 preferred; /* preferred primary osd */ | 60 | __le16 preferred; /* preferred primary osd */ |
69 | __le16 ps; /* placement seed */ | 61 | __le16 ps; /* placement seed */ |
70 | __le32 pool; /* object pool */ | 62 | __le32 pool; /* object pool */ |
@@ -91,21 +83,6 @@ struct ceph_pg { | |||
91 | 83 | ||
92 | #define CEPH_PG_TYPE_REP 1 | 84 | #define CEPH_PG_TYPE_REP 1 |
93 | #define CEPH_PG_TYPE_RAID4 2 | 85 | #define CEPH_PG_TYPE_RAID4 2 |
94 | #define CEPH_PG_POOL_VERSION 2 | ||
95 | struct ceph_pg_pool { | ||
96 | __u8 type; /* CEPH_PG_TYPE_* */ | ||
97 | __u8 size; /* number of osds in each pg */ | ||
98 | __u8 crush_ruleset; /* crush placement rule */ | ||
99 | __u8 object_hash; /* hash mapping object name to ps */ | ||
100 | __le32 pg_num, pgp_num; /* number of pg's */ | ||
101 | __le32 lpg_num, lpgp_num; /* number of localized pg's */ | ||
102 | __le32 last_change; /* most recent epoch changed */ | ||
103 | __le64 snap_seq; /* seq for per-pool snapshot */ | ||
104 | __le32 snap_epoch; /* epoch of last snap */ | ||
105 | __le32 num_snaps; | ||
106 | __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ | ||
107 | __le64 auid; /* who owns the pg */ | ||
108 | } __attribute__ ((packed)); | ||
109 | 86 | ||
110 | /* | 87 | /* |
111 | * stable_mod func is used to control number of placement groups. | 88 | * stable_mod func is used to control number of placement groups. |
@@ -128,7 +105,7 @@ static inline int ceph_stable_mod(int x, int b, int bmask) | |||
128 | * object layout - how a given object should be stored. | 105 | * object layout - how a given object should be stored. |
129 | */ | 106 | */ |
130 | struct ceph_object_layout { | 107 | struct ceph_object_layout { |
131 | struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ | 108 | struct ceph_pg_v1 ol_pgid; /* raw pg, with _full_ ps precision. */ |
132 | __le32 ol_stripe_unit; /* for per-object parity, if any */ | 109 | __le32 ol_stripe_unit; /* for per-object parity, if any */ |
133 | } __attribute__ ((packed)); | 110 | } __attribute__ ((packed)); |
134 | 111 | ||
@@ -145,8 +122,12 @@ struct ceph_eversion { | |||
145 | */ | 122 | */ |
146 | 123 | ||
147 | /* status bits */ | 124 | /* status bits */ |
148 | #define CEPH_OSD_EXISTS 1 | 125 | #define CEPH_OSD_EXISTS (1<<0) |
149 | #define CEPH_OSD_UP 2 | 126 | #define CEPH_OSD_UP (1<<1) |
127 | #define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */ | ||
128 | #define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */ | ||
129 | |||
130 | extern const char *ceph_osd_state_name(int s); | ||
150 | 131 | ||
151 | /* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ | 132 | /* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ |
152 | #define CEPH_OSD_IN 0x10000 | 133 | #define CEPH_OSD_IN 0x10000 |
@@ -161,9 +142,25 @@ struct ceph_eversion { | |||
161 | #define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ | 142 | #define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ |
162 | #define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ | 143 | #define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ |
163 | #define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ | 144 | #define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ |
145 | #define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */ | ||
146 | #define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */ | ||
147 | #define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */ | ||
148 | #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ | ||
149 | #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ | ||
150 | #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ | ||
151 | |||
152 | /* | ||
153 | * The error code to return when an OSD can't handle a write | ||
154 | * because it is too large. | ||
155 | */ | ||
156 | #define OSD_WRITETOOBIG EMSGSIZE | ||
164 | 157 | ||
165 | /* | 158 | /* |
166 | * osd ops | 159 | * osd ops |
160 | * | ||
161 | * WARNING: do not use these op codes directly. Use the helpers | ||
162 | * defined below instead. In certain cases, op code behavior was | ||
163 | * redefined, resulting in special-cases in the helpers. | ||
167 | */ | 164 | */ |
168 | #define CEPH_OSD_OP_MODE 0xf000 | 165 | #define CEPH_OSD_OP_MODE 0xf000 |
169 | #define CEPH_OSD_OP_MODE_RD 0x1000 | 166 | #define CEPH_OSD_OP_MODE_RD 0x1000 |
@@ -177,6 +174,7 @@ struct ceph_eversion { | |||
177 | #define CEPH_OSD_OP_TYPE_ATTR 0x0300 | 174 | #define CEPH_OSD_OP_TYPE_ATTR 0x0300 |
178 | #define CEPH_OSD_OP_TYPE_EXEC 0x0400 | 175 | #define CEPH_OSD_OP_TYPE_EXEC 0x0400 |
179 | #define CEPH_OSD_OP_TYPE_PG 0x0500 | 176 | #define CEPH_OSD_OP_TYPE_PG 0x0500 |
177 | #define CEPH_OSD_OP_TYPE_MULTI 0x0600 /* multiobject */ | ||
180 | 178 | ||
181 | enum { | 179 | enum { |
182 | /** data **/ | 180 | /** data **/ |
@@ -217,6 +215,23 @@ enum { | |||
217 | 215 | ||
218 | CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15, | 216 | CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15, |
219 | 217 | ||
218 | /* omap */ | ||
219 | CEPH_OSD_OP_OMAPGETKEYS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 17, | ||
220 | CEPH_OSD_OP_OMAPGETVALS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 18, | ||
221 | CEPH_OSD_OP_OMAPGETHEADER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 19, | ||
222 | CEPH_OSD_OP_OMAPGETVALSBYKEYS = | ||
223 | CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 20, | ||
224 | CEPH_OSD_OP_OMAPSETVALS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 21, | ||
225 | CEPH_OSD_OP_OMAPSETHEADER = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 22, | ||
226 | CEPH_OSD_OP_OMAPCLEAR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 23, | ||
227 | CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24, | ||
228 | CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25, | ||
229 | |||
230 | /** multi **/ | ||
231 | CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1, | ||
232 | CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2, | ||
233 | CEPH_OSD_OP_SRC_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 3, | ||
234 | |||
220 | /** attrs **/ | 235 | /** attrs **/ |
221 | /* read */ | 236 | /* read */ |
222 | CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, | 237 | CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, |
@@ -238,6 +253,7 @@ enum { | |||
238 | CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6, | 253 | CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6, |
239 | CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7, | 254 | CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7, |
240 | CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8, | 255 | CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8, |
256 | CEPH_OSD_OP_SCRUB_MAP = CEPH_OSD_OP_MODE_SUB | 9, | ||
241 | 257 | ||
242 | /** lock **/ | 258 | /** lock **/ |
243 | CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, | 259 | CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, |
@@ -248,10 +264,12 @@ enum { | |||
248 | CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, | 264 | CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, |
249 | 265 | ||
250 | /** exec **/ | 266 | /** exec **/ |
267 | /* note: the RD bit here is wrong; see special-case below in helper */ | ||
251 | CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, | 268 | CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, |
252 | 269 | ||
253 | /** pg **/ | 270 | /** pg **/ |
254 | CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, | 271 | CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, |
272 | CEPH_OSD_OP_PGLS_FILTER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 2, | ||
255 | }; | 273 | }; |
256 | 274 | ||
257 | static inline int ceph_osd_op_type_lock(int op) | 275 | static inline int ceph_osd_op_type_lock(int op) |
@@ -274,6 +292,10 @@ static inline int ceph_osd_op_type_pg(int op) | |||
274 | { | 292 | { |
275 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; | 293 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; |
276 | } | 294 | } |
295 | static inline int ceph_osd_op_type_multi(int op) | ||
296 | { | ||
297 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_MULTI; | ||
298 | } | ||
277 | 299 | ||
278 | static inline int ceph_osd_op_mode_subop(int op) | 300 | static inline int ceph_osd_op_mode_subop(int op) |
279 | { | 301 | { |
@@ -281,11 +303,12 @@ static inline int ceph_osd_op_mode_subop(int op) | |||
281 | } | 303 | } |
282 | static inline int ceph_osd_op_mode_read(int op) | 304 | static inline int ceph_osd_op_mode_read(int op) |
283 | { | 305 | { |
284 | return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; | 306 | return (op & CEPH_OSD_OP_MODE_RD) && |
307 | op != CEPH_OSD_OP_CALL; | ||
285 | } | 308 | } |
286 | static inline int ceph_osd_op_mode_modify(int op) | 309 | static inline int ceph_osd_op_mode_modify(int op) |
287 | { | 310 | { |
288 | return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; | 311 | return op & CEPH_OSD_OP_MODE_WR; |
289 | } | 312 | } |
290 | 313 | ||
291 | /* | 314 | /* |
@@ -294,34 +317,38 @@ static inline int ceph_osd_op_mode_modify(int op) | |||
294 | */ | 317 | */ |
295 | #define CEPH_OSD_TMAP_HDR 'h' | 318 | #define CEPH_OSD_TMAP_HDR 'h' |
296 | #define CEPH_OSD_TMAP_SET 's' | 319 | #define CEPH_OSD_TMAP_SET 's' |
320 | #define CEPH_OSD_TMAP_CREATE 'c' /* create key */ | ||
297 | #define CEPH_OSD_TMAP_RM 'r' | 321 | #define CEPH_OSD_TMAP_RM 'r' |
322 | #define CEPH_OSD_TMAP_RMSLOPPY 'R' | ||
298 | 323 | ||
299 | extern const char *ceph_osd_op_name(int op); | 324 | extern const char *ceph_osd_op_name(int op); |
300 | 325 | ||
301 | |||
302 | /* | 326 | /* |
303 | * osd op flags | 327 | * osd op flags |
304 | * | 328 | * |
305 | * An op may be READ, WRITE, or READ|WRITE. | 329 | * An op may be READ, WRITE, or READ|WRITE. |
306 | */ | 330 | */ |
307 | enum { | 331 | enum { |
308 | CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ | 332 | CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */ |
309 | CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ | 333 | CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */ |
310 | CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ | 334 | CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */ |
311 | CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ | 335 | CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */ |
312 | CEPH_OSD_FLAG_READ = 16, /* op may read */ | 336 | CEPH_OSD_FLAG_READ = 0x0010, /* op may read */ |
313 | CEPH_OSD_FLAG_WRITE = 32, /* op may write */ | 337 | CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */ |
314 | CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ | 338 | CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */ |
315 | CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ | 339 | CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */ |
316 | CEPH_OSD_FLAG_BALANCE_READS = 256, | 340 | CEPH_OSD_FLAG_BALANCE_READS = 0x0100, |
317 | CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ | 341 | CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */ |
318 | CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ | 342 | CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */ |
319 | CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ | 343 | CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */ |
320 | CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ | 344 | CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */ |
345 | CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */ | ||
346 | CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */ | ||
321 | }; | 347 | }; |
322 | 348 | ||
323 | enum { | 349 | enum { |
324 | CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ | 350 | CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ |
351 | CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */ | ||
325 | }; | 352 | }; |
326 | 353 | ||
327 | #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ | 354 | #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ |
@@ -381,48 +408,13 @@ struct ceph_osd_op { | |||
381 | __le64 ver; | 408 | __le64 ver; |
382 | __u8 flag; /* 0 = unwatch, 1 = watch */ | 409 | __u8 flag; /* 0 = unwatch, 1 = watch */ |
383 | } __attribute__ ((packed)) watch; | 410 | } __attribute__ ((packed)) watch; |
384 | }; | 411 | struct { |
412 | __le64 offset, length; | ||
413 | __le64 src_offset; | ||
414 | } __attribute__ ((packed)) clonerange; | ||
415 | }; | ||
385 | __le32 payload_len; | 416 | __le32 payload_len; |
386 | } __attribute__ ((packed)); | 417 | } __attribute__ ((packed)); |
387 | 418 | ||
388 | /* | ||
389 | * osd request message header. each request may include multiple | ||
390 | * ceph_osd_op object operations. | ||
391 | */ | ||
392 | struct ceph_osd_request_head { | ||
393 | __le32 client_inc; /* client incarnation */ | ||
394 | struct ceph_object_layout layout; /* pgid */ | ||
395 | __le32 osdmap_epoch; /* client's osdmap epoch */ | ||
396 | |||
397 | __le32 flags; | ||
398 | |||
399 | struct ceph_timespec mtime; /* for mutations only */ | ||
400 | struct ceph_eversion reassert_version; /* if we are replaying op */ | ||
401 | |||
402 | __le32 object_len; /* length of object name */ | ||
403 | |||
404 | __le64 snapid; /* snapid to read */ | ||
405 | __le64 snap_seq; /* writer's snap context */ | ||
406 | __le32 num_snaps; | ||
407 | |||
408 | __le16 num_ops; | ||
409 | struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ | ||
410 | } __attribute__ ((packed)); | ||
411 | |||
412 | struct ceph_osd_reply_head { | ||
413 | __le32 client_inc; /* client incarnation */ | ||
414 | __le32 flags; | ||
415 | struct ceph_object_layout layout; | ||
416 | __le32 osdmap_epoch; | ||
417 | struct ceph_eversion reassert_version; /* for replaying uncommitted */ | ||
418 | |||
419 | __le32 result; /* result code */ | ||
420 | |||
421 | __le32 object_len; /* length of object name */ | ||
422 | __le32 num_ops; | ||
423 | struct ceph_osd_op ops[0]; /* ops[], object */ | ||
424 | } __attribute__ ((packed)); | ||
425 | |||
426 | |||
427 | 419 | ||
428 | #endif | 420 | #endif |
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 25baa287cff7..6a1101f24cfb 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h | |||
@@ -162,6 +162,8 @@ struct crush_map { | |||
162 | __u32 choose_local_fallback_tries; | 162 | __u32 choose_local_fallback_tries; |
163 | /* choose attempts before giving up */ | 163 | /* choose attempts before giving up */ |
164 | __u32 choose_total_tries; | 164 | __u32 choose_total_tries; |
165 | /* attempt chooseleaf inner descent once; on failure retry outer descent */ | ||
166 | __u32 chooseleaf_descend_once; | ||
165 | }; | 167 | }; |
166 | 168 | ||
167 | 169 | ||
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 1deb29af82fd..e65e6e4be38b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c | |||
@@ -28,6 +28,22 @@ | |||
28 | #include "crypto.h" | 28 | #include "crypto.h" |
29 | 29 | ||
30 | 30 | ||
31 | /* | ||
32 | * Module compatibility interface. For now it doesn't do anything, | ||
33 | * but its existence signals a certain level of functionality. | ||
34 | * | ||
35 | * The data buffer is used to pass information both to and from | ||
36 | * libceph. The return value indicates whether libceph determines | ||
37 | * it is compatible with the caller (from another kernel module), | ||
38 | * given the provided data. | ||
39 | * | ||
40 | * The data pointer can be null. | ||
41 | */ | ||
42 | bool libceph_compatible(void *data) | ||
43 | { | ||
44 | return true; | ||
45 | } | ||
46 | EXPORT_SYMBOL(libceph_compatible); | ||
31 | 47 | ||
32 | /* | 48 | /* |
33 | * find filename portion of a path (/foo/bar/baz -> baz) | 49 | * find filename portion of a path (/foo/bar/baz -> baz) |
@@ -590,10 +606,8 @@ static int __init init_ceph_lib(void) | |||
590 | if (ret < 0) | 606 | if (ret < 0) |
591 | goto out_crypto; | 607 | goto out_crypto; |
592 | 608 | ||
593 | pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", | 609 | pr_info("loaded (mon/osd proto %d/%d)\n", |
594 | CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, | 610 | CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL); |
595 | CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, | ||
596 | CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); | ||
597 | 611 | ||
598 | return 0; | 612 | return 0; |
599 | 613 | ||
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 3fbda04de29c..1348df96fe15 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c | |||
@@ -21,9 +21,15 @@ const char *ceph_osd_op_name(int op) | |||
21 | switch (op) { | 21 | switch (op) { |
22 | case CEPH_OSD_OP_READ: return "read"; | 22 | case CEPH_OSD_OP_READ: return "read"; |
23 | case CEPH_OSD_OP_STAT: return "stat"; | 23 | case CEPH_OSD_OP_STAT: return "stat"; |
24 | case CEPH_OSD_OP_MAPEXT: return "mapext"; | ||
25 | case CEPH_OSD_OP_SPARSE_READ: return "sparse-read"; | ||
26 | case CEPH_OSD_OP_NOTIFY: return "notify"; | ||
27 | case CEPH_OSD_OP_NOTIFY_ACK: return "notify-ack"; | ||
28 | case CEPH_OSD_OP_ASSERT_VER: return "assert-version"; | ||
24 | 29 | ||
25 | case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; | 30 | case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; |
26 | 31 | ||
32 | case CEPH_OSD_OP_CREATE: return "create"; | ||
27 | case CEPH_OSD_OP_WRITE: return "write"; | 33 | case CEPH_OSD_OP_WRITE: return "write"; |
28 | case CEPH_OSD_OP_DELETE: return "delete"; | 34 | case CEPH_OSD_OP_DELETE: return "delete"; |
29 | case CEPH_OSD_OP_TRUNCATE: return "truncate"; | 35 | case CEPH_OSD_OP_TRUNCATE: return "truncate"; |
@@ -39,6 +45,11 @@ const char *ceph_osd_op_name(int op) | |||
39 | case CEPH_OSD_OP_TMAPUP: return "tmapup"; | 45 | case CEPH_OSD_OP_TMAPUP: return "tmapup"; |
40 | case CEPH_OSD_OP_TMAPGET: return "tmapget"; | 46 | case CEPH_OSD_OP_TMAPGET: return "tmapget"; |
41 | case CEPH_OSD_OP_TMAPPUT: return "tmapput"; | 47 | case CEPH_OSD_OP_TMAPPUT: return "tmapput"; |
48 | case CEPH_OSD_OP_WATCH: return "watch"; | ||
49 | |||
50 | case CEPH_OSD_OP_CLONERANGE: return "clonerange"; | ||
51 | case CEPH_OSD_OP_ASSERT_SRC_VERSION: return "assert-src-version"; | ||
52 | case CEPH_OSD_OP_SRC_CMPXATTR: return "src-cmpxattr"; | ||
42 | 53 | ||
43 | case CEPH_OSD_OP_GETXATTR: return "getxattr"; | 54 | case CEPH_OSD_OP_GETXATTR: return "getxattr"; |
44 | case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; | 55 | case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; |
@@ -53,6 +64,10 @@ const char *ceph_osd_op_name(int op) | |||
53 | case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; | 64 | case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; |
54 | case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; | 65 | case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; |
55 | case CEPH_OSD_OP_SCRUB: return "scrub"; | 66 | case CEPH_OSD_OP_SCRUB: return "scrub"; |
67 | case CEPH_OSD_OP_SCRUB_RESERVE: return "scrub-reserve"; | ||
68 | case CEPH_OSD_OP_SCRUB_UNRESERVE: return "scrub-unreserve"; | ||
69 | case CEPH_OSD_OP_SCRUB_STOP: return "scrub-stop"; | ||
70 | case CEPH_OSD_OP_SCRUB_MAP: return "scrub-map"; | ||
56 | 71 | ||
57 | case CEPH_OSD_OP_WRLOCK: return "wrlock"; | 72 | case CEPH_OSD_OP_WRLOCK: return "wrlock"; |
58 | case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; | 73 | case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; |
@@ -64,10 +79,34 @@ const char *ceph_osd_op_name(int op) | |||
64 | case CEPH_OSD_OP_CALL: return "call"; | 79 | case CEPH_OSD_OP_CALL: return "call"; |
65 | 80 | ||
66 | case CEPH_OSD_OP_PGLS: return "pgls"; | 81 | case CEPH_OSD_OP_PGLS: return "pgls"; |
82 | case CEPH_OSD_OP_PGLS_FILTER: return "pgls-filter"; | ||
83 | case CEPH_OSD_OP_OMAPGETKEYS: return "omap-get-keys"; | ||
84 | case CEPH_OSD_OP_OMAPGETVALS: return "omap-get-vals"; | ||
85 | case CEPH_OSD_OP_OMAPGETHEADER: return "omap-get-header"; | ||
86 | case CEPH_OSD_OP_OMAPGETVALSBYKEYS: return "omap-get-vals-by-keys"; | ||
87 | case CEPH_OSD_OP_OMAPSETVALS: return "omap-set-vals"; | ||
88 | case CEPH_OSD_OP_OMAPSETHEADER: return "omap-set-header"; | ||
89 | case CEPH_OSD_OP_OMAPCLEAR: return "omap-clear"; | ||
90 | case CEPH_OSD_OP_OMAPRMKEYS: return "omap-rm-keys"; | ||
67 | } | 91 | } |
68 | return "???"; | 92 | return "???"; |
69 | } | 93 | } |
70 | 94 | ||
95 | const char *ceph_osd_state_name(int s) | ||
96 | { | ||
97 | switch (s) { | ||
98 | case CEPH_OSD_EXISTS: | ||
99 | return "exists"; | ||
100 | case CEPH_OSD_UP: | ||
101 | return "up"; | ||
102 | case CEPH_OSD_AUTOOUT: | ||
103 | return "autoout"; | ||
104 | case CEPH_OSD_NEW: | ||
105 | return "new"; | ||
106 | default: | ||
107 | return "???"; | ||
108 | } | ||
109 | } | ||
71 | 110 | ||
72 | const char *ceph_pool_op_name(int op) | 111 | const char *ceph_pool_op_name(int op) |
73 | { | 112 | { |
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 35fce755ce10..cbd06a91941c 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c | |||
@@ -287,6 +287,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in | |||
287 | * @outpos: our position in that vector | 287 | * @outpos: our position in that vector |
288 | * @firstn: true if choosing "first n" items, false if choosing "indep" | 288 | * @firstn: true if choosing "first n" items, false if choosing "indep" |
289 | * @recurse_to_leaf: true if we want one device under each item of given type | 289 | * @recurse_to_leaf: true if we want one device under each item of given type |
290 | * @descend_once: true if we should only try one descent before giving up | ||
290 | * @out2: second output vector for leaf items (if @recurse_to_leaf) | 291 | * @out2: second output vector for leaf items (if @recurse_to_leaf) |
291 | */ | 292 | */ |
292 | static int crush_choose(const struct crush_map *map, | 293 | static int crush_choose(const struct crush_map *map, |
@@ -295,7 +296,7 @@ static int crush_choose(const struct crush_map *map, | |||
295 | int x, int numrep, int type, | 296 | int x, int numrep, int type, |
296 | int *out, int outpos, | 297 | int *out, int outpos, |
297 | int firstn, int recurse_to_leaf, | 298 | int firstn, int recurse_to_leaf, |
298 | int *out2) | 299 | int descend_once, int *out2) |
299 | { | 300 | { |
300 | int rep; | 301 | int rep; |
301 | unsigned int ftotal, flocal; | 302 | unsigned int ftotal, flocal; |
@@ -391,7 +392,7 @@ static int crush_choose(const struct crush_map *map, | |||
391 | } | 392 | } |
392 | 393 | ||
393 | reject = 0; | 394 | reject = 0; |
394 | if (recurse_to_leaf) { | 395 | if (!collide && recurse_to_leaf) { |
395 | if (item < 0) { | 396 | if (item < 0) { |
396 | if (crush_choose(map, | 397 | if (crush_choose(map, |
397 | map->buckets[-1-item], | 398 | map->buckets[-1-item], |
@@ -399,6 +400,7 @@ static int crush_choose(const struct crush_map *map, | |||
399 | x, outpos+1, 0, | 400 | x, outpos+1, 0, |
400 | out2, outpos, | 401 | out2, outpos, |
401 | firstn, 0, | 402 | firstn, 0, |
403 | map->chooseleaf_descend_once, | ||
402 | NULL) <= outpos) | 404 | NULL) <= outpos) |
403 | /* didn't get leaf */ | 405 | /* didn't get leaf */ |
404 | reject = 1; | 406 | reject = 1; |
@@ -422,7 +424,10 @@ reject: | |||
422 | ftotal++; | 424 | ftotal++; |
423 | flocal++; | 425 | flocal++; |
424 | 426 | ||
425 | if (collide && flocal <= map->choose_local_tries) | 427 | if (reject && descend_once) |
428 | /* let outer call try again */ | ||
429 | skip_rep = 1; | ||
430 | else if (collide && flocal <= map->choose_local_tries) | ||
426 | /* retry locally a few times */ | 431 | /* retry locally a few times */ |
427 | retry_bucket = 1; | 432 | retry_bucket = 1; |
428 | else if (map->choose_local_fallback_tries > 0 && | 433 | else if (map->choose_local_fallback_tries > 0 && |
@@ -485,6 +490,7 @@ int crush_do_rule(const struct crush_map *map, | |||
485 | int i, j; | 490 | int i, j; |
486 | int numrep; | 491 | int numrep; |
487 | int firstn; | 492 | int firstn; |
493 | const int descend_once = 0; | ||
488 | 494 | ||
489 | if ((__u32)ruleno >= map->max_rules) { | 495 | if ((__u32)ruleno >= map->max_rules) { |
490 | dprintk(" bad ruleno %d\n", ruleno); | 496 | dprintk(" bad ruleno %d\n", ruleno); |
@@ -544,7 +550,8 @@ int crush_do_rule(const struct crush_map *map, | |||
544 | curstep->arg2, | 550 | curstep->arg2, |
545 | o+osize, j, | 551 | o+osize, j, |
546 | firstn, | 552 | firstn, |
547 | recurse_to_leaf, c+osize); | 553 | recurse_to_leaf, |
554 | descend_once, c+osize); | ||
548 | } | 555 | } |
549 | 556 | ||
550 | if (recurse_to_leaf) | 557 | if (recurse_to_leaf) |
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index af14cb425164..6e7a236525b6 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c | |||
@@ -423,7 +423,8 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, | |||
423 | } | 423 | } |
424 | } | 424 | } |
425 | 425 | ||
426 | int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep) | 426 | static int ceph_key_instantiate(struct key *key, |
427 | struct key_preparsed_payload *prep) | ||
427 | { | 428 | { |
428 | struct ceph_crypto_key *ckey; | 429 | struct ceph_crypto_key *ckey; |
429 | size_t datalen = prep->datalen; | 430 | size_t datalen = prep->datalen; |
@@ -458,12 +459,12 @@ err: | |||
458 | return ret; | 459 | return ret; |
459 | } | 460 | } |
460 | 461 | ||
461 | int ceph_key_match(const struct key *key, const void *description) | 462 | static int ceph_key_match(const struct key *key, const void *description) |
462 | { | 463 | { |
463 | return strcmp(key->description, description) == 0; | 464 | return strcmp(key->description, description) == 0; |
464 | } | 465 | } |
465 | 466 | ||
466 | void ceph_key_destroy(struct key *key) { | 467 | static void ceph_key_destroy(struct key *key) { |
467 | struct ceph_crypto_key *ckey = key->payload.data; | 468 | struct ceph_crypto_key *ckey = key->payload.data; |
468 | 469 | ||
469 | ceph_crypto_key_destroy(ckey); | 470 | ceph_crypto_key_destroy(ckey); |
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 38b5dc1823d4..00d051f4894e 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c | |||
@@ -66,9 +66,9 @@ static int osdmap_show(struct seq_file *s, void *p) | |||
66 | for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { | 66 | for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { |
67 | struct ceph_pg_pool_info *pool = | 67 | struct ceph_pg_pool_info *pool = |
68 | rb_entry(n, struct ceph_pg_pool_info, node); | 68 | rb_entry(n, struct ceph_pg_pool_info, node); |
69 | seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", | 69 | seq_printf(s, "pg_pool %llu pg_num %d / %d\n", |
70 | pool->id, pool->v.pg_num, pool->pg_num_mask, | 70 | (unsigned long long)pool->id, pool->pg_num, |
71 | pool->v.lpg_num, pool->lpg_num_mask); | 71 | pool->pg_num_mask); |
72 | } | 72 | } |
73 | for (i = 0; i < client->osdc.osdmap->max_osd; i++) { | 73 | for (i = 0; i < client->osdc.osdmap->max_osd; i++) { |
74 | struct ceph_entity_addr *addr = | 74 | struct ceph_entity_addr *addr = |
@@ -123,26 +123,16 @@ static int osdc_show(struct seq_file *s, void *pp) | |||
123 | mutex_lock(&osdc->request_mutex); | 123 | mutex_lock(&osdc->request_mutex); |
124 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { | 124 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { |
125 | struct ceph_osd_request *req; | 125 | struct ceph_osd_request *req; |
126 | struct ceph_osd_request_head *head; | 126 | int opcode; |
127 | struct ceph_osd_op *op; | ||
128 | int num_ops; | ||
129 | int opcode, olen; | ||
130 | int i; | 127 | int i; |
131 | 128 | ||
132 | req = rb_entry(p, struct ceph_osd_request, r_node); | 129 | req = rb_entry(p, struct ceph_osd_request, r_node); |
133 | 130 | ||
134 | seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, | 131 | seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, |
135 | req->r_osd ? req->r_osd->o_osd : -1, | 132 | req->r_osd ? req->r_osd->o_osd : -1, |
136 | le32_to_cpu(req->r_pgid.pool), | 133 | req->r_pgid.pool, req->r_pgid.seed); |
137 | le16_to_cpu(req->r_pgid.ps)); | ||
138 | 134 | ||
139 | head = req->r_request->front.iov_base; | 135 | seq_printf(s, "%.*s", req->r_oid_len, req->r_oid); |
140 | op = (void *)(head + 1); | ||
141 | |||
142 | num_ops = le16_to_cpu(head->num_ops); | ||
143 | olen = le32_to_cpu(head->object_len); | ||
144 | seq_printf(s, "%.*s", olen, | ||
145 | (const char *)(head->ops + num_ops)); | ||
146 | 136 | ||
147 | if (req->r_reassert_version.epoch) | 137 | if (req->r_reassert_version.epoch) |
148 | seq_printf(s, "\t%u'%llu", | 138 | seq_printf(s, "\t%u'%llu", |
@@ -151,10 +141,9 @@ static int osdc_show(struct seq_file *s, void *pp) | |||
151 | else | 141 | else |
152 | seq_printf(s, "\t"); | 142 | seq_printf(s, "\t"); |
153 | 143 | ||
154 | for (i = 0; i < num_ops; i++) { | 144 | for (i = 0; i < req->r_num_ops; i++) { |
155 | opcode = le16_to_cpu(op->op); | 145 | opcode = le16_to_cpu(req->r_request_ops[i].op); |
156 | seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); | 146 | seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); |
157 | op++; | ||
158 | } | 147 | } |
159 | 148 | ||
160 | seq_printf(s, "\n"); | 149 | seq_printf(s, "\n"); |
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 5ccf87ed8d68..2c0669fb54e3 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c | |||
@@ -9,8 +9,9 @@ | |||
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | #include <linux/socket.h> | 10 | #include <linux/socket.h> |
11 | #include <linux/string.h> | 11 | #include <linux/string.h> |
12 | #ifdef CONFIG_BLOCK | ||
12 | #include <linux/bio.h> | 13 | #include <linux/bio.h> |
13 | #include <linux/blkdev.h> | 14 | #endif /* CONFIG_BLOCK */ |
14 | #include <linux/dns_resolver.h> | 15 | #include <linux/dns_resolver.h> |
15 | #include <net/tcp.h> | 16 | #include <net/tcp.h> |
16 | 17 | ||
@@ -97,6 +98,57 @@ | |||
97 | #define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ | 98 | #define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ |
98 | #define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ | 99 | #define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ |
99 | 100 | ||
101 | static bool con_flag_valid(unsigned long con_flag) | ||
102 | { | ||
103 | switch (con_flag) { | ||
104 | case CON_FLAG_LOSSYTX: | ||
105 | case CON_FLAG_KEEPALIVE_PENDING: | ||
106 | case CON_FLAG_WRITE_PENDING: | ||
107 | case CON_FLAG_SOCK_CLOSED: | ||
108 | case CON_FLAG_BACKOFF: | ||
109 | return true; | ||
110 | default: | ||
111 | return false; | ||
112 | } | ||
113 | } | ||
114 | |||
115 | static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag) | ||
116 | { | ||
117 | BUG_ON(!con_flag_valid(con_flag)); | ||
118 | |||
119 | clear_bit(con_flag, &con->flags); | ||
120 | } | ||
121 | |||
122 | static void con_flag_set(struct ceph_connection *con, unsigned long con_flag) | ||
123 | { | ||
124 | BUG_ON(!con_flag_valid(con_flag)); | ||
125 | |||
126 | set_bit(con_flag, &con->flags); | ||
127 | } | ||
128 | |||
129 | static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag) | ||
130 | { | ||
131 | BUG_ON(!con_flag_valid(con_flag)); | ||
132 | |||
133 | return test_bit(con_flag, &con->flags); | ||
134 | } | ||
135 | |||
136 | static bool con_flag_test_and_clear(struct ceph_connection *con, | ||
137 | unsigned long con_flag) | ||
138 | { | ||
139 | BUG_ON(!con_flag_valid(con_flag)); | ||
140 | |||
141 | return test_and_clear_bit(con_flag, &con->flags); | ||
142 | } | ||
143 | |||
144 | static bool con_flag_test_and_set(struct ceph_connection *con, | ||
145 | unsigned long con_flag) | ||
146 | { | ||
147 | BUG_ON(!con_flag_valid(con_flag)); | ||
148 | |||
149 | return test_and_set_bit(con_flag, &con->flags); | ||
150 | } | ||
151 | |||
100 | /* static tag bytes (protocol control messages) */ | 152 | /* static tag bytes (protocol control messages) */ |
101 | static char tag_msg = CEPH_MSGR_TAG_MSG; | 153 | static char tag_msg = CEPH_MSGR_TAG_MSG; |
102 | static char tag_ack = CEPH_MSGR_TAG_ACK; | 154 | static char tag_ack = CEPH_MSGR_TAG_ACK; |
@@ -114,7 +166,7 @@ static struct lock_class_key socket_class; | |||
114 | 166 | ||
115 | static void queue_con(struct ceph_connection *con); | 167 | static void queue_con(struct ceph_connection *con); |
116 | static void con_work(struct work_struct *); | 168 | static void con_work(struct work_struct *); |
117 | static void ceph_fault(struct ceph_connection *con); | 169 | static void con_fault(struct ceph_connection *con); |
118 | 170 | ||
119 | /* | 171 | /* |
120 | * Nicely render a sockaddr as a string. An array of formatted | 172 | * Nicely render a sockaddr as a string. An array of formatted |
@@ -171,7 +223,7 @@ static void encode_my_addr(struct ceph_messenger *msgr) | |||
171 | */ | 223 | */ |
172 | static struct workqueue_struct *ceph_msgr_wq; | 224 | static struct workqueue_struct *ceph_msgr_wq; |
173 | 225 | ||
174 | void _ceph_msgr_exit(void) | 226 | static void _ceph_msgr_exit(void) |
175 | { | 227 | { |
176 | if (ceph_msgr_wq) { | 228 | if (ceph_msgr_wq) { |
177 | destroy_workqueue(ceph_msgr_wq); | 229 | destroy_workqueue(ceph_msgr_wq); |
@@ -308,7 +360,7 @@ static void ceph_sock_write_space(struct sock *sk) | |||
308 | * buffer. See net/ipv4/tcp_input.c:tcp_check_space() | 360 | * buffer. See net/ipv4/tcp_input.c:tcp_check_space() |
309 | * and net/core/stream.c:sk_stream_write_space(). | 361 | * and net/core/stream.c:sk_stream_write_space(). |
310 | */ | 362 | */ |
311 | if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) { | 363 | if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { |
312 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { | 364 | if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { |
313 | dout("%s %p queueing write work\n", __func__, con); | 365 | dout("%s %p queueing write work\n", __func__, con); |
314 | clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | 366 | clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
@@ -333,7 +385,7 @@ static void ceph_sock_state_change(struct sock *sk) | |||
333 | case TCP_CLOSE_WAIT: | 385 | case TCP_CLOSE_WAIT: |
334 | dout("%s TCP_CLOSE_WAIT\n", __func__); | 386 | dout("%s TCP_CLOSE_WAIT\n", __func__); |
335 | con_sock_state_closing(con); | 387 | con_sock_state_closing(con); |
336 | set_bit(CON_FLAG_SOCK_CLOSED, &con->flags); | 388 | con_flag_set(con, CON_FLAG_SOCK_CLOSED); |
337 | queue_con(con); | 389 | queue_con(con); |
338 | break; | 390 | break; |
339 | case TCP_ESTABLISHED: | 391 | case TCP_ESTABLISHED: |
@@ -474,7 +526,7 @@ static int con_close_socket(struct ceph_connection *con) | |||
474 | * received a socket close event before we had the chance to | 526 | * received a socket close event before we had the chance to |
475 | * shut the socket down. | 527 | * shut the socket down. |
476 | */ | 528 | */ |
477 | clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags); | 529 | con_flag_clear(con, CON_FLAG_SOCK_CLOSED); |
478 | 530 | ||
479 | con_sock_state_closed(con); | 531 | con_sock_state_closed(con); |
480 | return rc; | 532 | return rc; |
@@ -538,11 +590,10 @@ void ceph_con_close(struct ceph_connection *con) | |||
538 | ceph_pr_addr(&con->peer_addr.in_addr)); | 590 | ceph_pr_addr(&con->peer_addr.in_addr)); |
539 | con->state = CON_STATE_CLOSED; | 591 | con->state = CON_STATE_CLOSED; |
540 | 592 | ||
541 | clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */ | 593 | con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ |
542 | clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); | 594 | con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); |
543 | clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 595 | con_flag_clear(con, CON_FLAG_WRITE_PENDING); |
544 | clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); | 596 | con_flag_clear(con, CON_FLAG_BACKOFF); |
545 | clear_bit(CON_FLAG_BACKOFF, &con->flags); | ||
546 | 597 | ||
547 | reset_connection(con); | 598 | reset_connection(con); |
548 | con->peer_global_seq = 0; | 599 | con->peer_global_seq = 0; |
@@ -798,7 +849,7 @@ static void prepare_write_message(struct ceph_connection *con) | |||
798 | /* no, queue up footer too and be done */ | 849 | /* no, queue up footer too and be done */ |
799 | prepare_write_message_footer(con); | 850 | prepare_write_message_footer(con); |
800 | 851 | ||
801 | set_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 852 | con_flag_set(con, CON_FLAG_WRITE_PENDING); |
802 | } | 853 | } |
803 | 854 | ||
804 | /* | 855 | /* |
@@ -819,7 +870,7 @@ static void prepare_write_ack(struct ceph_connection *con) | |||
819 | &con->out_temp_ack); | 870 | &con->out_temp_ack); |
820 | 871 | ||
821 | con->out_more = 1; /* more will follow.. eventually.. */ | 872 | con->out_more = 1; /* more will follow.. eventually.. */ |
822 | set_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 873 | con_flag_set(con, CON_FLAG_WRITE_PENDING); |
823 | } | 874 | } |
824 | 875 | ||
825 | /* | 876 | /* |
@@ -830,7 +881,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) | |||
830 | dout("prepare_write_keepalive %p\n", con); | 881 | dout("prepare_write_keepalive %p\n", con); |
831 | con_out_kvec_reset(con); | 882 | con_out_kvec_reset(con); |
832 | con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); | 883 | con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); |
833 | set_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 884 | con_flag_set(con, CON_FLAG_WRITE_PENDING); |
834 | } | 885 | } |
835 | 886 | ||
836 | /* | 887 | /* |
@@ -873,7 +924,7 @@ static void prepare_write_banner(struct ceph_connection *con) | |||
873 | &con->msgr->my_enc_addr); | 924 | &con->msgr->my_enc_addr); |
874 | 925 | ||
875 | con->out_more = 0; | 926 | con->out_more = 0; |
876 | set_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 927 | con_flag_set(con, CON_FLAG_WRITE_PENDING); |
877 | } | 928 | } |
878 | 929 | ||
879 | static int prepare_write_connect(struct ceph_connection *con) | 930 | static int prepare_write_connect(struct ceph_connection *con) |
@@ -923,7 +974,7 @@ static int prepare_write_connect(struct ceph_connection *con) | |||
923 | auth->authorizer_buf); | 974 | auth->authorizer_buf); |
924 | 975 | ||
925 | con->out_more = 0; | 976 | con->out_more = 0; |
926 | set_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 977 | con_flag_set(con, CON_FLAG_WRITE_PENDING); |
927 | 978 | ||
928 | return 0; | 979 | return 0; |
929 | } | 980 | } |
@@ -1643,7 +1694,7 @@ static int process_connect(struct ceph_connection *con) | |||
1643 | le32_to_cpu(con->in_reply.connect_seq)); | 1694 | le32_to_cpu(con->in_reply.connect_seq)); |
1644 | 1695 | ||
1645 | if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) | 1696 | if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) |
1646 | set_bit(CON_FLAG_LOSSYTX, &con->flags); | 1697 | con_flag_set(con, CON_FLAG_LOSSYTX); |
1647 | 1698 | ||
1648 | con->delay = 0; /* reset backoff memory */ | 1699 | con->delay = 0; /* reset backoff memory */ |
1649 | 1700 | ||
@@ -2080,15 +2131,14 @@ do_next: | |||
2080 | prepare_write_ack(con); | 2131 | prepare_write_ack(con); |
2081 | goto more; | 2132 | goto more; |
2082 | } | 2133 | } |
2083 | if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING, | 2134 | if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { |
2084 | &con->flags)) { | ||
2085 | prepare_write_keepalive(con); | 2135 | prepare_write_keepalive(con); |
2086 | goto more; | 2136 | goto more; |
2087 | } | 2137 | } |
2088 | } | 2138 | } |
2089 | 2139 | ||
2090 | /* Nothing to do! */ | 2140 | /* Nothing to do! */ |
2091 | clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 2141 | con_flag_clear(con, CON_FLAG_WRITE_PENDING); |
2092 | dout("try_write nothing else to write.\n"); | 2142 | dout("try_write nothing else to write.\n"); |
2093 | ret = 0; | 2143 | ret = 0; |
2094 | out: | 2144 | out: |
@@ -2268,7 +2318,7 @@ static void queue_con(struct ceph_connection *con) | |||
2268 | 2318 | ||
2269 | static bool con_sock_closed(struct ceph_connection *con) | 2319 | static bool con_sock_closed(struct ceph_connection *con) |
2270 | { | 2320 | { |
2271 | if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) | 2321 | if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) |
2272 | return false; | 2322 | return false; |
2273 | 2323 | ||
2274 | #define CASE(x) \ | 2324 | #define CASE(x) \ |
@@ -2295,6 +2345,41 @@ static bool con_sock_closed(struct ceph_connection *con) | |||
2295 | return true; | 2345 | return true; |
2296 | } | 2346 | } |
2297 | 2347 | ||
2348 | static bool con_backoff(struct ceph_connection *con) | ||
2349 | { | ||
2350 | int ret; | ||
2351 | |||
2352 | if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) | ||
2353 | return false; | ||
2354 | |||
2355 | ret = queue_con_delay(con, round_jiffies_relative(con->delay)); | ||
2356 | if (ret) { | ||
2357 | dout("%s: con %p FAILED to back off %lu\n", __func__, | ||
2358 | con, con->delay); | ||
2359 | BUG_ON(ret == -ENOENT); | ||
2360 | con_flag_set(con, CON_FLAG_BACKOFF); | ||
2361 | } | ||
2362 | |||
2363 | return true; | ||
2364 | } | ||
2365 | |||
2366 | /* Finish fault handling; con->mutex must *not* be held here */ | ||
2367 | |||
2368 | static void con_fault_finish(struct ceph_connection *con) | ||
2369 | { | ||
2370 | /* | ||
2371 | * in case we faulted due to authentication, invalidate our | ||
2372 | * current tickets so that we can get new ones. | ||
2373 | */ | ||
2374 | if (con->auth_retry && con->ops->invalidate_authorizer) { | ||
2375 | dout("calling invalidate_authorizer()\n"); | ||
2376 | con->ops->invalidate_authorizer(con); | ||
2377 | } | ||
2378 | |||
2379 | if (con->ops->fault) | ||
2380 | con->ops->fault(con); | ||
2381 | } | ||
2382 | |||
2298 | /* | 2383 | /* |
2299 | * Do some work on a connection. Drop a connection ref when we're done. | 2384 | * Do some work on a connection. Drop a connection ref when we're done. |
2300 | */ | 2385 | */ |
@@ -2302,73 +2387,68 @@ static void con_work(struct work_struct *work) | |||
2302 | { | 2387 | { |
2303 | struct ceph_connection *con = container_of(work, struct ceph_connection, | 2388 | struct ceph_connection *con = container_of(work, struct ceph_connection, |
2304 | work.work); | 2389 | work.work); |
2305 | int ret; | 2390 | bool fault; |
2306 | 2391 | ||
2307 | mutex_lock(&con->mutex); | 2392 | mutex_lock(&con->mutex); |
2308 | restart: | 2393 | while (true) { |
2309 | if (con_sock_closed(con)) | 2394 | int ret; |
2310 | goto fault; | ||
2311 | 2395 | ||
2312 | if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { | 2396 | if ((fault = con_sock_closed(con))) { |
2313 | dout("con_work %p backing off\n", con); | 2397 | dout("%s: con %p SOCK_CLOSED\n", __func__, con); |
2314 | ret = queue_con_delay(con, round_jiffies_relative(con->delay)); | 2398 | break; |
2315 | if (ret) { | 2399 | } |
2316 | dout("con_work %p FAILED to back off %lu\n", con, | 2400 | if (con_backoff(con)) { |
2317 | con->delay); | 2401 | dout("%s: con %p BACKOFF\n", __func__, con); |
2318 | BUG_ON(ret == -ENOENT); | 2402 | break; |
2319 | set_bit(CON_FLAG_BACKOFF, &con->flags); | 2403 | } |
2404 | if (con->state == CON_STATE_STANDBY) { | ||
2405 | dout("%s: con %p STANDBY\n", __func__, con); | ||
2406 | break; | ||
2407 | } | ||
2408 | if (con->state == CON_STATE_CLOSED) { | ||
2409 | dout("%s: con %p CLOSED\n", __func__, con); | ||
2410 | BUG_ON(con->sock); | ||
2411 | break; | ||
2412 | } | ||
2413 | if (con->state == CON_STATE_PREOPEN) { | ||
2414 | dout("%s: con %p PREOPEN\n", __func__, con); | ||
2415 | BUG_ON(con->sock); | ||
2320 | } | 2416 | } |
2321 | goto done; | ||
2322 | } | ||
2323 | 2417 | ||
2324 | if (con->state == CON_STATE_STANDBY) { | 2418 | ret = try_read(con); |
2325 | dout("con_work %p STANDBY\n", con); | 2419 | if (ret < 0) { |
2326 | goto done; | 2420 | if (ret == -EAGAIN) |
2327 | } | 2421 | continue; |
2328 | if (con->state == CON_STATE_CLOSED) { | 2422 | con->error_msg = "socket error on read"; |
2329 | dout("con_work %p CLOSED\n", con); | 2423 | fault = true; |
2330 | BUG_ON(con->sock); | 2424 | break; |
2331 | goto done; | 2425 | } |
2332 | } | ||
2333 | if (con->state == CON_STATE_PREOPEN) { | ||
2334 | dout("con_work OPENING\n"); | ||
2335 | BUG_ON(con->sock); | ||
2336 | } | ||
2337 | 2426 | ||
2338 | ret = try_read(con); | 2427 | ret = try_write(con); |
2339 | if (ret == -EAGAIN) | 2428 | if (ret < 0) { |
2340 | goto restart; | 2429 | if (ret == -EAGAIN) |
2341 | if (ret < 0) { | 2430 | continue; |
2342 | con->error_msg = "socket error on read"; | 2431 | con->error_msg = "socket error on write"; |
2343 | goto fault; | 2432 | fault = true; |
2344 | } | 2433 | } |
2345 | 2434 | ||
2346 | ret = try_write(con); | 2435 | break; /* If we make it to here, we're done */ |
2347 | if (ret == -EAGAIN) | ||
2348 | goto restart; | ||
2349 | if (ret < 0) { | ||
2350 | con->error_msg = "socket error on write"; | ||
2351 | goto fault; | ||
2352 | } | 2436 | } |
2353 | 2437 | if (fault) | |
2354 | done: | 2438 | con_fault(con); |
2355 | mutex_unlock(&con->mutex); | 2439 | mutex_unlock(&con->mutex); |
2356 | done_unlocked: | ||
2357 | con->ops->put(con); | ||
2358 | return; | ||
2359 | 2440 | ||
2360 | fault: | 2441 | if (fault) |
2361 | ceph_fault(con); /* error/fault path */ | 2442 | con_fault_finish(con); |
2362 | goto done_unlocked; | ||
2363 | } | ||
2364 | 2443 | ||
2444 | con->ops->put(con); | ||
2445 | } | ||
2365 | 2446 | ||
2366 | /* | 2447 | /* |
2367 | * Generic error/fault handler. A retry mechanism is used with | 2448 | * Generic error/fault handler. A retry mechanism is used with |
2368 | * exponential backoff | 2449 | * exponential backoff |
2369 | */ | 2450 | */ |
2370 | static void ceph_fault(struct ceph_connection *con) | 2451 | static void con_fault(struct ceph_connection *con) |
2371 | __releases(con->mutex) | ||
2372 | { | 2452 | { |
2373 | pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), | 2453 | pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), |
2374 | ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); | 2454 | ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); |
@@ -2381,10 +2461,10 @@ static void ceph_fault(struct ceph_connection *con) | |||
2381 | 2461 | ||
2382 | con_close_socket(con); | 2462 | con_close_socket(con); |
2383 | 2463 | ||
2384 | if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) { | 2464 | if (con_flag_test(con, CON_FLAG_LOSSYTX)) { |
2385 | dout("fault on LOSSYTX channel, marking CLOSED\n"); | 2465 | dout("fault on LOSSYTX channel, marking CLOSED\n"); |
2386 | con->state = CON_STATE_CLOSED; | 2466 | con->state = CON_STATE_CLOSED; |
2387 | goto out_unlock; | 2467 | return; |
2388 | } | 2468 | } |
2389 | 2469 | ||
2390 | if (con->in_msg) { | 2470 | if (con->in_msg) { |
@@ -2401,9 +2481,9 @@ static void ceph_fault(struct ceph_connection *con) | |||
2401 | /* If there are no messages queued or keepalive pending, place | 2481 | /* If there are no messages queued or keepalive pending, place |
2402 | * the connection in a STANDBY state */ | 2482 | * the connection in a STANDBY state */ |
2403 | if (list_empty(&con->out_queue) && | 2483 | if (list_empty(&con->out_queue) && |
2404 | !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) { | 2484 | !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { |
2405 | dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); | 2485 | dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); |
2406 | clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); | 2486 | con_flag_clear(con, CON_FLAG_WRITE_PENDING); |
2407 | con->state = CON_STATE_STANDBY; | 2487 | con->state = CON_STATE_STANDBY; |
2408 | } else { | 2488 | } else { |
2409 | /* retry after a delay. */ | 2489 | /* retry after a delay. */ |
@@ -2412,23 +2492,9 @@ static void ceph_fault(struct ceph_connection *con) | |||
2412 | con->delay = BASE_DELAY_INTERVAL; | 2492 | con->delay = BASE_DELAY_INTERVAL; |
2413 | else if (con->delay < MAX_DELAY_INTERVAL) | 2493 | else if (con->delay < MAX_DELAY_INTERVAL) |
2414 | con->delay *= 2; | 2494 | con->delay *= 2; |
2415 | set_bit(CON_FLAG_BACKOFF, &con->flags); | 2495 | con_flag_set(con, CON_FLAG_BACKOFF); |
2416 | queue_con(con); | 2496 | queue_con(con); |
2417 | } | 2497 | } |
2418 | |||
2419 | out_unlock: | ||
2420 | mutex_unlock(&con->mutex); | ||
2421 | /* | ||
2422 | * in case we faulted due to authentication, invalidate our | ||
2423 | * current tickets so that we can get new ones. | ||
2424 | */ | ||
2425 | if (con->auth_retry && con->ops->invalidate_authorizer) { | ||
2426 | dout("calling invalidate_authorizer()\n"); | ||
2427 | con->ops->invalidate_authorizer(con); | ||
2428 | } | ||
2429 | |||
2430 | if (con->ops->fault) | ||
2431 | con->ops->fault(con); | ||
2432 | } | 2498 | } |
2433 | 2499 | ||
2434 | 2500 | ||
@@ -2469,8 +2535,8 @@ static void clear_standby(struct ceph_connection *con) | |||
2469 | dout("clear_standby %p and ++connect_seq\n", con); | 2535 | dout("clear_standby %p and ++connect_seq\n", con); |
2470 | con->state = CON_STATE_PREOPEN; | 2536 | con->state = CON_STATE_PREOPEN; |
2471 | con->connect_seq++; | 2537 | con->connect_seq++; |
2472 | WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags)); | 2538 | WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); |
2473 | WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)); | 2539 | WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); |
2474 | } | 2540 | } |
2475 | } | 2541 | } |
2476 | 2542 | ||
@@ -2511,7 +2577,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) | |||
2511 | 2577 | ||
2512 | /* if there wasn't anything waiting to send before, queue | 2578 | /* if there wasn't anything waiting to send before, queue |
2513 | * new work */ | 2579 | * new work */ |
2514 | if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) | 2580 | if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) |
2515 | queue_con(con); | 2581 | queue_con(con); |
2516 | } | 2582 | } |
2517 | EXPORT_SYMBOL(ceph_con_send); | 2583 | EXPORT_SYMBOL(ceph_con_send); |
@@ -2600,8 +2666,8 @@ void ceph_con_keepalive(struct ceph_connection *con) | |||
2600 | mutex_lock(&con->mutex); | 2666 | mutex_lock(&con->mutex); |
2601 | clear_standby(con); | 2667 | clear_standby(con); |
2602 | mutex_unlock(&con->mutex); | 2668 | mutex_unlock(&con->mutex); |
2603 | if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 && | 2669 | if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 && |
2604 | test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) | 2670 | con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) |
2605 | queue_con(con); | 2671 | queue_con(con); |
2606 | } | 2672 | } |
2607 | EXPORT_SYMBOL(ceph_con_keepalive); | 2673 | EXPORT_SYMBOL(ceph_con_keepalive); |
@@ -2651,9 +2717,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | |||
2651 | m->page_alignment = 0; | 2717 | m->page_alignment = 0; |
2652 | m->pages = NULL; | 2718 | m->pages = NULL; |
2653 | m->pagelist = NULL; | 2719 | m->pagelist = NULL; |
2720 | #ifdef CONFIG_BLOCK | ||
2654 | m->bio = NULL; | 2721 | m->bio = NULL; |
2655 | m->bio_iter = NULL; | 2722 | m->bio_iter = NULL; |
2656 | m->bio_seg = 0; | 2723 | m->bio_seg = 0; |
2724 | #endif /* CONFIG_BLOCK */ | ||
2657 | m->trail = NULL; | 2725 | m->trail = NULL; |
2658 | 2726 | ||
2659 | /* front */ | 2727 | /* front */ |
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 812eb3b46c1f..aef5b1062bee 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c | |||
@@ -697,7 +697,7 @@ int ceph_monc_delete_snapid(struct ceph_mon_client *monc, | |||
697 | u32 pool, u64 snapid) | 697 | u32 pool, u64 snapid) |
698 | { | 698 | { |
699 | return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, | 699 | return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, |
700 | pool, snapid, 0, 0); | 700 | pool, snapid, NULL, 0); |
701 | 701 | ||
702 | } | 702 | } |
703 | 703 | ||
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index eb9a44478764..d730dd4d8eb2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c | |||
@@ -23,7 +23,7 @@ | |||
23 | 23 | ||
24 | static const struct ceph_connection_operations osd_con_ops; | 24 | static const struct ceph_connection_operations osd_con_ops; |
25 | 25 | ||
26 | static void send_queued(struct ceph_osd_client *osdc); | 26 | static void __send_queued(struct ceph_osd_client *osdc); |
27 | static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); | 27 | static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); |
28 | static void __register_request(struct ceph_osd_client *osdc, | 28 | static void __register_request(struct ceph_osd_client *osdc, |
29 | struct ceph_osd_request *req); | 29 | struct ceph_osd_request *req); |
@@ -32,64 +32,12 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, | |||
32 | static void __send_request(struct ceph_osd_client *osdc, | 32 | static void __send_request(struct ceph_osd_client *osdc, |
33 | struct ceph_osd_request *req); | 33 | struct ceph_osd_request *req); |
34 | 34 | ||
35 | static int op_needs_trail(int op) | ||
36 | { | ||
37 | switch (op) { | ||
38 | case CEPH_OSD_OP_GETXATTR: | ||
39 | case CEPH_OSD_OP_SETXATTR: | ||
40 | case CEPH_OSD_OP_CMPXATTR: | ||
41 | case CEPH_OSD_OP_CALL: | ||
42 | case CEPH_OSD_OP_NOTIFY: | ||
43 | return 1; | ||
44 | default: | ||
45 | return 0; | ||
46 | } | ||
47 | } | ||
48 | |||
49 | static int op_has_extent(int op) | 35 | static int op_has_extent(int op) |
50 | { | 36 | { |
51 | return (op == CEPH_OSD_OP_READ || | 37 | return (op == CEPH_OSD_OP_READ || |
52 | op == CEPH_OSD_OP_WRITE); | 38 | op == CEPH_OSD_OP_WRITE); |
53 | } | 39 | } |
54 | 40 | ||
55 | int ceph_calc_raw_layout(struct ceph_osd_client *osdc, | ||
56 | struct ceph_file_layout *layout, | ||
57 | u64 snapid, | ||
58 | u64 off, u64 *plen, u64 *bno, | ||
59 | struct ceph_osd_request *req, | ||
60 | struct ceph_osd_req_op *op) | ||
61 | { | ||
62 | struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; | ||
63 | u64 orig_len = *plen; | ||
64 | u64 objoff, objlen; /* extent in object */ | ||
65 | int r; | ||
66 | |||
67 | reqhead->snapid = cpu_to_le64(snapid); | ||
68 | |||
69 | /* object extent? */ | ||
70 | r = ceph_calc_file_object_mapping(layout, off, plen, bno, | ||
71 | &objoff, &objlen); | ||
72 | if (r < 0) | ||
73 | return r; | ||
74 | if (*plen < orig_len) | ||
75 | dout(" skipping last %llu, final file extent %llu~%llu\n", | ||
76 | orig_len - *plen, off, *plen); | ||
77 | |||
78 | if (op_has_extent(op->op)) { | ||
79 | op->extent.offset = objoff; | ||
80 | op->extent.length = objlen; | ||
81 | } | ||
82 | req->r_num_pages = calc_pages_for(off, *plen); | ||
83 | req->r_page_alignment = off & ~PAGE_MASK; | ||
84 | if (op->op == CEPH_OSD_OP_WRITE) | ||
85 | op->payload_len = *plen; | ||
86 | |||
87 | dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", | ||
88 | *bno, objoff, objlen, req->r_num_pages); | ||
89 | return 0; | ||
90 | } | ||
91 | EXPORT_SYMBOL(ceph_calc_raw_layout); | ||
92 | |||
93 | /* | 41 | /* |
94 | * Implement client access to distributed object storage cluster. | 42 | * Implement client access to distributed object storage cluster. |
95 | * | 43 | * |
@@ -115,20 +63,48 @@ EXPORT_SYMBOL(ceph_calc_raw_layout); | |||
115 | * | 63 | * |
116 | * fill osd op in request message. | 64 | * fill osd op in request message. |
117 | */ | 65 | */ |
118 | static int calc_layout(struct ceph_osd_client *osdc, | 66 | static int calc_layout(struct ceph_vino vino, |
119 | struct ceph_vino vino, | ||
120 | struct ceph_file_layout *layout, | 67 | struct ceph_file_layout *layout, |
121 | u64 off, u64 *plen, | 68 | u64 off, u64 *plen, |
122 | struct ceph_osd_request *req, | 69 | struct ceph_osd_request *req, |
123 | struct ceph_osd_req_op *op) | 70 | struct ceph_osd_req_op *op) |
124 | { | 71 | { |
125 | u64 bno; | 72 | u64 orig_len = *plen; |
73 | u64 bno = 0; | ||
74 | u64 objoff = 0; | ||
75 | u64 objlen = 0; | ||
126 | int r; | 76 | int r; |
127 | 77 | ||
128 | r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, | 78 | /* object extent? */ |
129 | plen, &bno, req, op); | 79 | r = ceph_calc_file_object_mapping(layout, off, orig_len, &bno, |
80 | &objoff, &objlen); | ||
130 | if (r < 0) | 81 | if (r < 0) |
131 | return r; | 82 | return r; |
83 | if (objlen < orig_len) { | ||
84 | *plen = objlen; | ||
85 | dout(" skipping last %llu, final file extent %llu~%llu\n", | ||
86 | orig_len - *plen, off, *plen); | ||
87 | } | ||
88 | |||
89 | if (op_has_extent(op->op)) { | ||
90 | u32 osize = le32_to_cpu(layout->fl_object_size); | ||
91 | op->extent.offset = objoff; | ||
92 | op->extent.length = objlen; | ||
93 | if (op->extent.truncate_size <= off - objoff) { | ||
94 | op->extent.truncate_size = 0; | ||
95 | } else { | ||
96 | op->extent.truncate_size -= off - objoff; | ||
97 | if (op->extent.truncate_size > osize) | ||
98 | op->extent.truncate_size = osize; | ||
99 | } | ||
100 | } | ||
101 | req->r_num_pages = calc_pages_for(off, *plen); | ||
102 | req->r_page_alignment = off & ~PAGE_MASK; | ||
103 | if (op->op == CEPH_OSD_OP_WRITE) | ||
104 | op->payload_len = *plen; | ||
105 | |||
106 | dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", | ||
107 | bno, objoff, objlen, req->r_num_pages); | ||
132 | 108 | ||
133 | snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); | 109 | snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); |
134 | req->r_oid_len = strlen(req->r_oid); | 110 | req->r_oid_len = strlen(req->r_oid); |
@@ -148,25 +124,19 @@ void ceph_osdc_release_request(struct kref *kref) | |||
148 | if (req->r_request) | 124 | if (req->r_request) |
149 | ceph_msg_put(req->r_request); | 125 | ceph_msg_put(req->r_request); |
150 | if (req->r_con_filling_msg) { | 126 | if (req->r_con_filling_msg) { |
151 | dout("%s revoking pages %p from con %p\n", __func__, | 127 | dout("%s revoking msg %p from con %p\n", __func__, |
152 | req->r_pages, req->r_con_filling_msg); | 128 | req->r_reply, req->r_con_filling_msg); |
153 | ceph_msg_revoke_incoming(req->r_reply); | 129 | ceph_msg_revoke_incoming(req->r_reply); |
154 | req->r_con_filling_msg->ops->put(req->r_con_filling_msg); | 130 | req->r_con_filling_msg->ops->put(req->r_con_filling_msg); |
131 | req->r_con_filling_msg = NULL; | ||
155 | } | 132 | } |
156 | if (req->r_reply) | 133 | if (req->r_reply) |
157 | ceph_msg_put(req->r_reply); | 134 | ceph_msg_put(req->r_reply); |
158 | if (req->r_own_pages) | 135 | if (req->r_own_pages) |
159 | ceph_release_page_vector(req->r_pages, | 136 | ceph_release_page_vector(req->r_pages, |
160 | req->r_num_pages); | 137 | req->r_num_pages); |
161 | #ifdef CONFIG_BLOCK | ||
162 | if (req->r_bio) | ||
163 | bio_put(req->r_bio); | ||
164 | #endif | ||
165 | ceph_put_snap_context(req->r_snapc); | 138 | ceph_put_snap_context(req->r_snapc); |
166 | if (req->r_trail) { | 139 | ceph_pagelist_release(&req->r_trail); |
167 | ceph_pagelist_release(req->r_trail); | ||
168 | kfree(req->r_trail); | ||
169 | } | ||
170 | if (req->r_mempool) | 140 | if (req->r_mempool) |
171 | mempool_free(req, req->r_osdc->req_mempool); | 141 | mempool_free(req, req->r_osdc->req_mempool); |
172 | else | 142 | else |
@@ -174,37 +144,25 @@ void ceph_osdc_release_request(struct kref *kref) | |||
174 | } | 144 | } |
175 | EXPORT_SYMBOL(ceph_osdc_release_request); | 145 | EXPORT_SYMBOL(ceph_osdc_release_request); |
176 | 146 | ||
177 | static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail) | ||
178 | { | ||
179 | int i = 0; | ||
180 | |||
181 | if (needs_trail) | ||
182 | *needs_trail = 0; | ||
183 | while (ops[i].op) { | ||
184 | if (needs_trail && op_needs_trail(ops[i].op)) | ||
185 | *needs_trail = 1; | ||
186 | i++; | ||
187 | } | ||
188 | |||
189 | return i; | ||
190 | } | ||
191 | |||
192 | struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | 147 | struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, |
193 | int flags, | ||
194 | struct ceph_snap_context *snapc, | 148 | struct ceph_snap_context *snapc, |
195 | struct ceph_osd_req_op *ops, | 149 | unsigned int num_ops, |
196 | bool use_mempool, | 150 | bool use_mempool, |
197 | gfp_t gfp_flags, | 151 | gfp_t gfp_flags) |
198 | struct page **pages, | ||
199 | struct bio *bio) | ||
200 | { | 152 | { |
201 | struct ceph_osd_request *req; | 153 | struct ceph_osd_request *req; |
202 | struct ceph_msg *msg; | 154 | struct ceph_msg *msg; |
203 | int needs_trail; | 155 | size_t msg_size; |
204 | int num_op = get_num_ops(ops, &needs_trail); | 156 | |
205 | size_t msg_size = sizeof(struct ceph_osd_request_head); | 157 | msg_size = 4 + 4 + 8 + 8 + 4+8; |
206 | 158 | msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ | |
207 | msg_size += num_op*sizeof(struct ceph_osd_op); | 159 | msg_size += 1 + 8 + 4 + 4; /* pg_t */ |
160 | msg_size += 4 + MAX_OBJ_NAME_SIZE; | ||
161 | msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); | ||
162 | msg_size += 8; /* snapid */ | ||
163 | msg_size += 8; /* snap_seq */ | ||
164 | msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ | ||
165 | msg_size += 4; | ||
208 | 166 | ||
209 | if (use_mempool) { | 167 | if (use_mempool) { |
210 | req = mempool_alloc(osdc->req_mempool, gfp_flags); | 168 | req = mempool_alloc(osdc->req_mempool, gfp_flags); |
@@ -228,10 +186,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | |||
228 | INIT_LIST_HEAD(&req->r_req_lru_item); | 186 | INIT_LIST_HEAD(&req->r_req_lru_item); |
229 | INIT_LIST_HEAD(&req->r_osd_item); | 187 | INIT_LIST_HEAD(&req->r_osd_item); |
230 | 188 | ||
231 | req->r_flags = flags; | ||
232 | |||
233 | WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); | ||
234 | |||
235 | /* create reply message */ | 189 | /* create reply message */ |
236 | if (use_mempool) | 190 | if (use_mempool) |
237 | msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); | 191 | msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); |
@@ -244,20 +198,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | |||
244 | } | 198 | } |
245 | req->r_reply = msg; | 199 | req->r_reply = msg; |
246 | 200 | ||
247 | /* allocate space for the trailing data */ | 201 | ceph_pagelist_init(&req->r_trail); |
248 | if (needs_trail) { | ||
249 | req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags); | ||
250 | if (!req->r_trail) { | ||
251 | ceph_osdc_put_request(req); | ||
252 | return NULL; | ||
253 | } | ||
254 | ceph_pagelist_init(req->r_trail); | ||
255 | } | ||
256 | 202 | ||
257 | /* create request message; allow space for oid */ | 203 | /* create request message; allow space for oid */ |
258 | msg_size += MAX_OBJ_NAME_SIZE; | ||
259 | if (snapc) | ||
260 | msg_size += sizeof(u64) * snapc->num_snaps; | ||
261 | if (use_mempool) | 204 | if (use_mempool) |
262 | msg = ceph_msgpool_get(&osdc->msgpool_op, 0); | 205 | msg = ceph_msgpool_get(&osdc->msgpool_op, 0); |
263 | else | 206 | else |
@@ -270,13 +213,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | |||
270 | memset(msg->front.iov_base, 0, msg->front.iov_len); | 213 | memset(msg->front.iov_base, 0, msg->front.iov_len); |
271 | 214 | ||
272 | req->r_request = msg; | 215 | req->r_request = msg; |
273 | req->r_pages = pages; | ||
274 | #ifdef CONFIG_BLOCK | ||
275 | if (bio) { | ||
276 | req->r_bio = bio; | ||
277 | bio_get(req->r_bio); | ||
278 | } | ||
279 | #endif | ||
280 | 216 | ||
281 | return req; | 217 | return req; |
282 | } | 218 | } |
@@ -289,6 +225,8 @@ static void osd_req_encode_op(struct ceph_osd_request *req, | |||
289 | dst->op = cpu_to_le16(src->op); | 225 | dst->op = cpu_to_le16(src->op); |
290 | 226 | ||
291 | switch (src->op) { | 227 | switch (src->op) { |
228 | case CEPH_OSD_OP_STAT: | ||
229 | break; | ||
292 | case CEPH_OSD_OP_READ: | 230 | case CEPH_OSD_OP_READ: |
293 | case CEPH_OSD_OP_WRITE: | 231 | case CEPH_OSD_OP_WRITE: |
294 | dst->extent.offset = | 232 | dst->extent.offset = |
@@ -300,52 +238,20 @@ static void osd_req_encode_op(struct ceph_osd_request *req, | |||
300 | dst->extent.truncate_seq = | 238 | dst->extent.truncate_seq = |
301 | cpu_to_le32(src->extent.truncate_seq); | 239 | cpu_to_le32(src->extent.truncate_seq); |
302 | break; | 240 | break; |
303 | |||
304 | case CEPH_OSD_OP_GETXATTR: | ||
305 | case CEPH_OSD_OP_SETXATTR: | ||
306 | case CEPH_OSD_OP_CMPXATTR: | ||
307 | BUG_ON(!req->r_trail); | ||
308 | |||
309 | dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); | ||
310 | dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); | ||
311 | dst->xattr.cmp_op = src->xattr.cmp_op; | ||
312 | dst->xattr.cmp_mode = src->xattr.cmp_mode; | ||
313 | ceph_pagelist_append(req->r_trail, src->xattr.name, | ||
314 | src->xattr.name_len); | ||
315 | ceph_pagelist_append(req->r_trail, src->xattr.val, | ||
316 | src->xattr.value_len); | ||
317 | break; | ||
318 | case CEPH_OSD_OP_CALL: | 241 | case CEPH_OSD_OP_CALL: |
319 | BUG_ON(!req->r_trail); | ||
320 | |||
321 | dst->cls.class_len = src->cls.class_len; | 242 | dst->cls.class_len = src->cls.class_len; |
322 | dst->cls.method_len = src->cls.method_len; | 243 | dst->cls.method_len = src->cls.method_len; |
323 | dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); | 244 | dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); |
324 | 245 | ||
325 | ceph_pagelist_append(req->r_trail, src->cls.class_name, | 246 | ceph_pagelist_append(&req->r_trail, src->cls.class_name, |
326 | src->cls.class_len); | 247 | src->cls.class_len); |
327 | ceph_pagelist_append(req->r_trail, src->cls.method_name, | 248 | ceph_pagelist_append(&req->r_trail, src->cls.method_name, |
328 | src->cls.method_len); | 249 | src->cls.method_len); |
329 | ceph_pagelist_append(req->r_trail, src->cls.indata, | 250 | ceph_pagelist_append(&req->r_trail, src->cls.indata, |
330 | src->cls.indata_len); | 251 | src->cls.indata_len); |
331 | break; | 252 | break; |
332 | case CEPH_OSD_OP_ROLLBACK: | ||
333 | dst->snap.snapid = cpu_to_le64(src->snap.snapid); | ||
334 | break; | ||
335 | case CEPH_OSD_OP_STARTSYNC: | 253 | case CEPH_OSD_OP_STARTSYNC: |
336 | break; | 254 | break; |
337 | case CEPH_OSD_OP_NOTIFY: | ||
338 | { | ||
339 | __le32 prot_ver = cpu_to_le32(src->watch.prot_ver); | ||
340 | __le32 timeout = cpu_to_le32(src->watch.timeout); | ||
341 | |||
342 | BUG_ON(!req->r_trail); | ||
343 | |||
344 | ceph_pagelist_append(req->r_trail, | ||
345 | &prot_ver, sizeof(prot_ver)); | ||
346 | ceph_pagelist_append(req->r_trail, | ||
347 | &timeout, sizeof(timeout)); | ||
348 | } | ||
349 | case CEPH_OSD_OP_NOTIFY_ACK: | 255 | case CEPH_OSD_OP_NOTIFY_ACK: |
350 | case CEPH_OSD_OP_WATCH: | 256 | case CEPH_OSD_OP_WATCH: |
351 | dst->watch.cookie = cpu_to_le64(src->watch.cookie); | 257 | dst->watch.cookie = cpu_to_le64(src->watch.cookie); |
@@ -356,6 +262,64 @@ static void osd_req_encode_op(struct ceph_osd_request *req, | |||
356 | pr_err("unrecognized osd opcode %d\n", dst->op); | 262 | pr_err("unrecognized osd opcode %d\n", dst->op); |
357 | WARN_ON(1); | 263 | WARN_ON(1); |
358 | break; | 264 | break; |
265 | case CEPH_OSD_OP_MAPEXT: | ||
266 | case CEPH_OSD_OP_MASKTRUNC: | ||
267 | case CEPH_OSD_OP_SPARSE_READ: | ||
268 | case CEPH_OSD_OP_NOTIFY: | ||
269 | case CEPH_OSD_OP_ASSERT_VER: | ||
270 | case CEPH_OSD_OP_WRITEFULL: | ||
271 | case CEPH_OSD_OP_TRUNCATE: | ||
272 | case CEPH_OSD_OP_ZERO: | ||
273 | case CEPH_OSD_OP_DELETE: | ||
274 | case CEPH_OSD_OP_APPEND: | ||
275 | case CEPH_OSD_OP_SETTRUNC: | ||
276 | case CEPH_OSD_OP_TRIMTRUNC: | ||
277 | case CEPH_OSD_OP_TMAPUP: | ||
278 | case CEPH_OSD_OP_TMAPPUT: | ||
279 | case CEPH_OSD_OP_TMAPGET: | ||
280 | case CEPH_OSD_OP_CREATE: | ||
281 | case CEPH_OSD_OP_ROLLBACK: | ||
282 | case CEPH_OSD_OP_OMAPGETKEYS: | ||
283 | case CEPH_OSD_OP_OMAPGETVALS: | ||
284 | case CEPH_OSD_OP_OMAPGETHEADER: | ||
285 | case CEPH_OSD_OP_OMAPGETVALSBYKEYS: | ||
286 | case CEPH_OSD_OP_MODE_RD: | ||
287 | case CEPH_OSD_OP_OMAPSETVALS: | ||
288 | case CEPH_OSD_OP_OMAPSETHEADER: | ||
289 | case CEPH_OSD_OP_OMAPCLEAR: | ||
290 | case CEPH_OSD_OP_OMAPRMKEYS: | ||
291 | case CEPH_OSD_OP_OMAP_CMP: | ||
292 | case CEPH_OSD_OP_CLONERANGE: | ||
293 | case CEPH_OSD_OP_ASSERT_SRC_VERSION: | ||
294 | case CEPH_OSD_OP_SRC_CMPXATTR: | ||
295 | case CEPH_OSD_OP_GETXATTR: | ||
296 | case CEPH_OSD_OP_GETXATTRS: | ||
297 | case CEPH_OSD_OP_CMPXATTR: | ||
298 | case CEPH_OSD_OP_SETXATTR: | ||
299 | case CEPH_OSD_OP_SETXATTRS: | ||
300 | case CEPH_OSD_OP_RESETXATTRS: | ||
301 | case CEPH_OSD_OP_RMXATTR: | ||
302 | case CEPH_OSD_OP_PULL: | ||
303 | case CEPH_OSD_OP_PUSH: | ||
304 | case CEPH_OSD_OP_BALANCEREADS: | ||
305 | case CEPH_OSD_OP_UNBALANCEREADS: | ||
306 | case CEPH_OSD_OP_SCRUB: | ||
307 | case CEPH_OSD_OP_SCRUB_RESERVE: | ||
308 | case CEPH_OSD_OP_SCRUB_UNRESERVE: | ||
309 | case CEPH_OSD_OP_SCRUB_STOP: | ||
310 | case CEPH_OSD_OP_SCRUB_MAP: | ||
311 | case CEPH_OSD_OP_WRLOCK: | ||
312 | case CEPH_OSD_OP_WRUNLOCK: | ||
313 | case CEPH_OSD_OP_RDLOCK: | ||
314 | case CEPH_OSD_OP_RDUNLOCK: | ||
315 | case CEPH_OSD_OP_UPLOCK: | ||
316 | case CEPH_OSD_OP_DNLOCK: | ||
317 | case CEPH_OSD_OP_PGLS: | ||
318 | case CEPH_OSD_OP_PGLS_FILTER: | ||
319 | pr_err("unsupported osd opcode %s\n", | ||
320 | ceph_osd_op_name(dst->op)); | ||
321 | WARN_ON(1); | ||
322 | break; | ||
359 | } | 323 | } |
360 | dst->payload_len = cpu_to_le32(src->payload_len); | 324 | dst->payload_len = cpu_to_le32(src->payload_len); |
361 | } | 325 | } |
@@ -365,75 +329,95 @@ static void osd_req_encode_op(struct ceph_osd_request *req, | |||
365 | * | 329 | * |
366 | */ | 330 | */ |
367 | void ceph_osdc_build_request(struct ceph_osd_request *req, | 331 | void ceph_osdc_build_request(struct ceph_osd_request *req, |
368 | u64 off, u64 *plen, | 332 | u64 off, u64 len, unsigned int num_ops, |
369 | struct ceph_osd_req_op *src_ops, | 333 | struct ceph_osd_req_op *src_ops, |
370 | struct ceph_snap_context *snapc, | 334 | struct ceph_snap_context *snapc, u64 snap_id, |
371 | struct timespec *mtime, | 335 | struct timespec *mtime) |
372 | const char *oid, | ||
373 | int oid_len) | ||
374 | { | 336 | { |
375 | struct ceph_msg *msg = req->r_request; | 337 | struct ceph_msg *msg = req->r_request; |
376 | struct ceph_osd_request_head *head; | ||
377 | struct ceph_osd_req_op *src_op; | 338 | struct ceph_osd_req_op *src_op; |
378 | struct ceph_osd_op *op; | ||
379 | void *p; | 339 | void *p; |
380 | int num_op = get_num_ops(src_ops, NULL); | 340 | size_t msg_size; |
381 | size_t msg_size = sizeof(*head) + num_op*sizeof(*op); | ||
382 | int flags = req->r_flags; | 341 | int flags = req->r_flags; |
383 | u64 data_len = 0; | 342 | u64 data_len; |
384 | int i; | 343 | int i; |
385 | 344 | ||
386 | head = msg->front.iov_base; | 345 | req->r_num_ops = num_ops; |
387 | op = (void *)(head + 1); | 346 | req->r_snapid = snap_id; |
388 | p = (void *)(op + num_op); | ||
389 | |||
390 | req->r_snapc = ceph_get_snap_context(snapc); | 347 | req->r_snapc = ceph_get_snap_context(snapc); |
391 | 348 | ||
392 | head->client_inc = cpu_to_le32(1); /* always, for now. */ | 349 | /* encode request */ |
393 | head->flags = cpu_to_le32(flags); | 350 | msg->hdr.version = cpu_to_le16(4); |
394 | if (flags & CEPH_OSD_FLAG_WRITE) | ||
395 | ceph_encode_timespec(&head->mtime, mtime); | ||
396 | head->num_ops = cpu_to_le16(num_op); | ||
397 | |||
398 | |||
399 | /* fill in oid */ | ||
400 | head->object_len = cpu_to_le32(oid_len); | ||
401 | memcpy(p, oid, oid_len); | ||
402 | p += oid_len; | ||
403 | 351 | ||
352 | p = msg->front.iov_base; | ||
353 | ceph_encode_32(&p, 1); /* client_inc is always 1 */ | ||
354 | req->r_request_osdmap_epoch = p; | ||
355 | p += 4; | ||
356 | req->r_request_flags = p; | ||
357 | p += 4; | ||
358 | if (req->r_flags & CEPH_OSD_FLAG_WRITE) | ||
359 | ceph_encode_timespec(p, mtime); | ||
360 | p += sizeof(struct ceph_timespec); | ||
361 | req->r_request_reassert_version = p; | ||
362 | p += sizeof(struct ceph_eversion); /* will get filled in */ | ||
363 | |||
364 | /* oloc */ | ||
365 | ceph_encode_8(&p, 4); | ||
366 | ceph_encode_8(&p, 4); | ||
367 | ceph_encode_32(&p, 8 + 4 + 4); | ||
368 | req->r_request_pool = p; | ||
369 | p += 8; | ||
370 | ceph_encode_32(&p, -1); /* preferred */ | ||
371 | ceph_encode_32(&p, 0); /* key len */ | ||
372 | |||
373 | ceph_encode_8(&p, 1); | ||
374 | req->r_request_pgid = p; | ||
375 | p += 8 + 4; | ||
376 | ceph_encode_32(&p, -1); /* preferred */ | ||
377 | |||
378 | /* oid */ | ||
379 | ceph_encode_32(&p, req->r_oid_len); | ||
380 | memcpy(p, req->r_oid, req->r_oid_len); | ||
381 | dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); | ||
382 | p += req->r_oid_len; | ||
383 | |||
384 | /* ops */ | ||
385 | ceph_encode_16(&p, num_ops); | ||
404 | src_op = src_ops; | 386 | src_op = src_ops; |
405 | while (src_op->op) { | 387 | req->r_request_ops = p; |
406 | osd_req_encode_op(req, op, src_op); | 388 | for (i = 0; i < num_ops; i++, src_op++) { |
407 | src_op++; | 389 | osd_req_encode_op(req, p, src_op); |
408 | op++; | 390 | p += sizeof(struct ceph_osd_op); |
409 | } | 391 | } |
410 | 392 | ||
411 | if (req->r_trail) | 393 | /* snaps */ |
412 | data_len += req->r_trail->length; | 394 | ceph_encode_64(&p, req->r_snapid); |
413 | 395 | ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); | |
414 | if (snapc) { | 396 | ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); |
415 | head->snap_seq = cpu_to_le64(snapc->seq); | 397 | if (req->r_snapc) { |
416 | head->num_snaps = cpu_to_le32(snapc->num_snaps); | ||
417 | for (i = 0; i < snapc->num_snaps; i++) { | 398 | for (i = 0; i < snapc->num_snaps; i++) { |
418 | put_unaligned_le64(snapc->snaps[i], p); | 399 | ceph_encode_64(&p, req->r_snapc->snaps[i]); |
419 | p += sizeof(u64); | ||
420 | } | 400 | } |
421 | } | 401 | } |
422 | 402 | ||
403 | req->r_request_attempts = p; | ||
404 | p += 4; | ||
405 | |||
406 | data_len = req->r_trail.length; | ||
423 | if (flags & CEPH_OSD_FLAG_WRITE) { | 407 | if (flags & CEPH_OSD_FLAG_WRITE) { |
424 | req->r_request->hdr.data_off = cpu_to_le16(off); | 408 | req->r_request->hdr.data_off = cpu_to_le16(off); |
425 | req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); | 409 | data_len += len; |
426 | } else if (data_len) { | ||
427 | req->r_request->hdr.data_off = 0; | ||
428 | req->r_request->hdr.data_len = cpu_to_le32(data_len); | ||
429 | } | 410 | } |
430 | 411 | req->r_request->hdr.data_len = cpu_to_le32(data_len); | |
431 | req->r_request->page_alignment = req->r_page_alignment; | 412 | req->r_request->page_alignment = req->r_page_alignment; |
432 | 413 | ||
433 | BUG_ON(p > msg->front.iov_base + msg->front.iov_len); | 414 | BUG_ON(p > msg->front.iov_base + msg->front.iov_len); |
434 | msg_size = p - msg->front.iov_base; | 415 | msg_size = p - msg->front.iov_base; |
435 | msg->front.iov_len = msg_size; | 416 | msg->front.iov_len = msg_size; |
436 | msg->hdr.front_len = cpu_to_le32(msg_size); | 417 | msg->hdr.front_len = cpu_to_le32(msg_size); |
418 | |||
419 | dout("build_request msg_size was %d num_ops %d\n", (int)msg_size, | ||
420 | num_ops); | ||
437 | return; | 421 | return; |
438 | } | 422 | } |
439 | EXPORT_SYMBOL(ceph_osdc_build_request); | 423 | EXPORT_SYMBOL(ceph_osdc_build_request); |
@@ -459,34 +443,33 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, | |||
459 | u32 truncate_seq, | 443 | u32 truncate_seq, |
460 | u64 truncate_size, | 444 | u64 truncate_size, |
461 | struct timespec *mtime, | 445 | struct timespec *mtime, |
462 | bool use_mempool, int num_reply, | 446 | bool use_mempool, |
463 | int page_align) | 447 | int page_align) |
464 | { | 448 | { |
465 | struct ceph_osd_req_op ops[3]; | 449 | struct ceph_osd_req_op ops[2]; |
466 | struct ceph_osd_request *req; | 450 | struct ceph_osd_request *req; |
451 | unsigned int num_op = 1; | ||
467 | int r; | 452 | int r; |
468 | 453 | ||
454 | memset(&ops, 0, sizeof ops); | ||
455 | |||
469 | ops[0].op = opcode; | 456 | ops[0].op = opcode; |
470 | ops[0].extent.truncate_seq = truncate_seq; | 457 | ops[0].extent.truncate_seq = truncate_seq; |
471 | ops[0].extent.truncate_size = truncate_size; | 458 | ops[0].extent.truncate_size = truncate_size; |
472 | ops[0].payload_len = 0; | ||
473 | 459 | ||
474 | if (do_sync) { | 460 | if (do_sync) { |
475 | ops[1].op = CEPH_OSD_OP_STARTSYNC; | 461 | ops[1].op = CEPH_OSD_OP_STARTSYNC; |
476 | ops[1].payload_len = 0; | 462 | num_op++; |
477 | ops[2].op = 0; | 463 | } |
478 | } else | 464 | |
479 | ops[1].op = 0; | 465 | req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, |
480 | 466 | GFP_NOFS); | |
481 | req = ceph_osdc_alloc_request(osdc, flags, | ||
482 | snapc, ops, | ||
483 | use_mempool, | ||
484 | GFP_NOFS, NULL, NULL); | ||
485 | if (!req) | 467 | if (!req) |
486 | return ERR_PTR(-ENOMEM); | 468 | return ERR_PTR(-ENOMEM); |
469 | req->r_flags = flags; | ||
487 | 470 | ||
488 | /* calculate max write size */ | 471 | /* calculate max write size */ |
489 | r = calc_layout(osdc, vino, layout, off, plen, req, ops); | 472 | r = calc_layout(vino, layout, off, plen, req, ops); |
490 | if (r < 0) | 473 | if (r < 0) |
491 | return ERR_PTR(r); | 474 | return ERR_PTR(r); |
492 | req->r_file_layout = *layout; /* keep a copy */ | 475 | req->r_file_layout = *layout; /* keep a copy */ |
@@ -496,10 +479,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, | |||
496 | req->r_num_pages = calc_pages_for(page_align, *plen); | 479 | req->r_num_pages = calc_pages_for(page_align, *plen); |
497 | req->r_page_alignment = page_align; | 480 | req->r_page_alignment = page_align; |
498 | 481 | ||
499 | ceph_osdc_build_request(req, off, plen, ops, | 482 | ceph_osdc_build_request(req, off, *plen, num_op, ops, |
500 | snapc, | 483 | snapc, vino.snap, mtime); |
501 | mtime, | ||
502 | req->r_oid, req->r_oid_len); | ||
503 | 484 | ||
504 | return req; | 485 | return req; |
505 | } | 486 | } |
@@ -623,8 +604,8 @@ static void osd_reset(struct ceph_connection *con) | |||
623 | down_read(&osdc->map_sem); | 604 | down_read(&osdc->map_sem); |
624 | mutex_lock(&osdc->request_mutex); | 605 | mutex_lock(&osdc->request_mutex); |
625 | __kick_osd_requests(osdc, osd); | 606 | __kick_osd_requests(osdc, osd); |
607 | __send_queued(osdc); | ||
626 | mutex_unlock(&osdc->request_mutex); | 608 | mutex_unlock(&osdc->request_mutex); |
627 | send_queued(osdc); | ||
628 | up_read(&osdc->map_sem); | 609 | up_read(&osdc->map_sem); |
629 | } | 610 | } |
630 | 611 | ||
@@ -739,31 +720,35 @@ static void remove_old_osds(struct ceph_osd_client *osdc) | |||
739 | */ | 720 | */ |
740 | static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) | 721 | static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) |
741 | { | 722 | { |
742 | struct ceph_osd_request *req; | 723 | struct ceph_entity_addr *peer_addr; |
743 | int ret = 0; | ||
744 | 724 | ||
745 | dout("__reset_osd %p osd%d\n", osd, osd->o_osd); | 725 | dout("__reset_osd %p osd%d\n", osd, osd->o_osd); |
746 | if (list_empty(&osd->o_requests) && | 726 | if (list_empty(&osd->o_requests) && |
747 | list_empty(&osd->o_linger_requests)) { | 727 | list_empty(&osd->o_linger_requests)) { |
748 | __remove_osd(osdc, osd); | 728 | __remove_osd(osdc, osd); |
749 | ret = -ENODEV; | 729 | |
750 | } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], | 730 | return -ENODEV; |
751 | &osd->o_con.peer_addr, | 731 | } |
752 | sizeof(osd->o_con.peer_addr)) == 0 && | 732 | |
753 | !ceph_con_opened(&osd->o_con)) { | 733 | peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; |
734 | if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && | ||
735 | !ceph_con_opened(&osd->o_con)) { | ||
736 | struct ceph_osd_request *req; | ||
737 | |||
754 | dout(" osd addr hasn't changed and connection never opened," | 738 | dout(" osd addr hasn't changed and connection never opened," |
755 | " letting msgr retry"); | 739 | " letting msgr retry"); |
756 | /* touch each r_stamp for handle_timeout()'s benfit */ | 740 | /* touch each r_stamp for handle_timeout()'s benfit */ |
757 | list_for_each_entry(req, &osd->o_requests, r_osd_item) | 741 | list_for_each_entry(req, &osd->o_requests, r_osd_item) |
758 | req->r_stamp = jiffies; | 742 | req->r_stamp = jiffies; |
759 | ret = -EAGAIN; | 743 | |
760 | } else { | 744 | return -EAGAIN; |
761 | ceph_con_close(&osd->o_con); | ||
762 | ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, | ||
763 | &osdc->osdmap->osd_addr[osd->o_osd]); | ||
764 | osd->o_incarnation++; | ||
765 | } | 745 | } |
766 | return ret; | 746 | |
747 | ceph_con_close(&osd->o_con); | ||
748 | ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr); | ||
749 | osd->o_incarnation++; | ||
750 | |||
751 | return 0; | ||
767 | } | 752 | } |
768 | 753 | ||
769 | static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) | 754 | static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) |
@@ -961,20 +946,18 @@ EXPORT_SYMBOL(ceph_osdc_set_request_linger); | |||
961 | static int __map_request(struct ceph_osd_client *osdc, | 946 | static int __map_request(struct ceph_osd_client *osdc, |
962 | struct ceph_osd_request *req, int force_resend) | 947 | struct ceph_osd_request *req, int force_resend) |
963 | { | 948 | { |
964 | struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; | ||
965 | struct ceph_pg pgid; | 949 | struct ceph_pg pgid; |
966 | int acting[CEPH_PG_MAX_SIZE]; | 950 | int acting[CEPH_PG_MAX_SIZE]; |
967 | int o = -1, num = 0; | 951 | int o = -1, num = 0; |
968 | int err; | 952 | int err; |
969 | 953 | ||
970 | dout("map_request %p tid %lld\n", req, req->r_tid); | 954 | dout("map_request %p tid %lld\n", req, req->r_tid); |
971 | err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, | 955 | err = ceph_calc_object_layout(&pgid, req->r_oid, |
972 | &req->r_file_layout, osdc->osdmap); | 956 | &req->r_file_layout, osdc->osdmap); |
973 | if (err) { | 957 | if (err) { |
974 | list_move(&req->r_req_lru_item, &osdc->req_notarget); | 958 | list_move(&req->r_req_lru_item, &osdc->req_notarget); |
975 | return err; | 959 | return err; |
976 | } | 960 | } |
977 | pgid = reqhead->layout.ol_pgid; | ||
978 | req->r_pgid = pgid; | 961 | req->r_pgid = pgid; |
979 | 962 | ||
980 | err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); | 963 | err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); |
@@ -991,8 +974,8 @@ static int __map_request(struct ceph_osd_client *osdc, | |||
991 | (req->r_osd == NULL && o == -1)) | 974 | (req->r_osd == NULL && o == -1)) |
992 | return 0; /* no change */ | 975 | return 0; /* no change */ |
993 | 976 | ||
994 | dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n", | 977 | dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", |
995 | req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, | 978 | req->r_tid, pgid.pool, pgid.seed, o, |
996 | req->r_osd ? req->r_osd->o_osd : -1); | 979 | req->r_osd ? req->r_osd->o_osd : -1); |
997 | 980 | ||
998 | /* record full pg acting set */ | 981 | /* record full pg acting set */ |
@@ -1041,15 +1024,22 @@ out: | |||
1041 | static void __send_request(struct ceph_osd_client *osdc, | 1024 | static void __send_request(struct ceph_osd_client *osdc, |
1042 | struct ceph_osd_request *req) | 1025 | struct ceph_osd_request *req) |
1043 | { | 1026 | { |
1044 | struct ceph_osd_request_head *reqhead; | 1027 | void *p; |
1045 | |||
1046 | dout("send_request %p tid %llu to osd%d flags %d\n", | ||
1047 | req, req->r_tid, req->r_osd->o_osd, req->r_flags); | ||
1048 | 1028 | ||
1049 | reqhead = req->r_request->front.iov_base; | 1029 | dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", |
1050 | reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); | 1030 | req, req->r_tid, req->r_osd->o_osd, req->r_flags, |
1051 | reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ | 1031 | (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); |
1052 | reqhead->reassert_version = req->r_reassert_version; | 1032 | |
1033 | /* fill in message content that changes each time we send it */ | ||
1034 | put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); | ||
1035 | put_unaligned_le32(req->r_flags, req->r_request_flags); | ||
1036 | put_unaligned_le64(req->r_pgid.pool, req->r_request_pool); | ||
1037 | p = req->r_request_pgid; | ||
1038 | ceph_encode_64(&p, req->r_pgid.pool); | ||
1039 | ceph_encode_32(&p, req->r_pgid.seed); | ||
1040 | put_unaligned_le64(1, req->r_request_attempts); /* FIXME */ | ||
1041 | memcpy(req->r_request_reassert_version, &req->r_reassert_version, | ||
1042 | sizeof(req->r_reassert_version)); | ||
1053 | 1043 | ||
1054 | req->r_stamp = jiffies; | 1044 | req->r_stamp = jiffies; |
1055 | list_move_tail(&req->r_req_lru_item, &osdc->req_lru); | 1045 | list_move_tail(&req->r_req_lru_item, &osdc->req_lru); |
@@ -1062,16 +1052,13 @@ static void __send_request(struct ceph_osd_client *osdc, | |||
1062 | /* | 1052 | /* |
1063 | * Send any requests in the queue (req_unsent). | 1053 | * Send any requests in the queue (req_unsent). |
1064 | */ | 1054 | */ |
1065 | static void send_queued(struct ceph_osd_client *osdc) | 1055 | static void __send_queued(struct ceph_osd_client *osdc) |
1066 | { | 1056 | { |
1067 | struct ceph_osd_request *req, *tmp; | 1057 | struct ceph_osd_request *req, *tmp; |
1068 | 1058 | ||
1069 | dout("send_queued\n"); | 1059 | dout("__send_queued\n"); |
1070 | mutex_lock(&osdc->request_mutex); | 1060 | list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) |
1071 | list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { | ||
1072 | __send_request(osdc, req); | 1061 | __send_request(osdc, req); |
1073 | } | ||
1074 | mutex_unlock(&osdc->request_mutex); | ||
1075 | } | 1062 | } |
1076 | 1063 | ||
1077 | /* | 1064 | /* |
@@ -1123,8 +1110,8 @@ static void handle_timeout(struct work_struct *work) | |||
1123 | } | 1110 | } |
1124 | 1111 | ||
1125 | __schedule_osd_timeout(osdc); | 1112 | __schedule_osd_timeout(osdc); |
1113 | __send_queued(osdc); | ||
1126 | mutex_unlock(&osdc->request_mutex); | 1114 | mutex_unlock(&osdc->request_mutex); |
1127 | send_queued(osdc); | ||
1128 | up_read(&osdc->map_sem); | 1115 | up_read(&osdc->map_sem); |
1129 | } | 1116 | } |
1130 | 1117 | ||
@@ -1152,6 +1139,26 @@ static void complete_request(struct ceph_osd_request *req) | |||
1152 | complete_all(&req->r_safe_completion); /* fsync waiter */ | 1139 | complete_all(&req->r_safe_completion); /* fsync waiter */ |
1153 | } | 1140 | } |
1154 | 1141 | ||
1142 | static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid) | ||
1143 | { | ||
1144 | __u8 v; | ||
1145 | |||
1146 | ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad); | ||
1147 | v = ceph_decode_8(p); | ||
1148 | if (v > 1) { | ||
1149 | pr_warning("do not understand pg encoding %d > 1", v); | ||
1150 | return -EINVAL; | ||
1151 | } | ||
1152 | pgid->pool = ceph_decode_64(p); | ||
1153 | pgid->seed = ceph_decode_32(p); | ||
1154 | *p += 4; | ||
1155 | return 0; | ||
1156 | |||
1157 | bad: | ||
1158 | pr_warning("incomplete pg encoding"); | ||
1159 | return -EINVAL; | ||
1160 | } | ||
1161 | |||
1155 | /* | 1162 | /* |
1156 | * handle osd op reply. either call the callback if it is specified, | 1163 | * handle osd op reply. either call the callback if it is specified, |
1157 | * or do the completion to wake up the waiting thread. | 1164 | * or do the completion to wake up the waiting thread. |
@@ -1159,22 +1166,42 @@ static void complete_request(struct ceph_osd_request *req) | |||
1159 | static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | 1166 | static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, |
1160 | struct ceph_connection *con) | 1167 | struct ceph_connection *con) |
1161 | { | 1168 | { |
1162 | struct ceph_osd_reply_head *rhead = msg->front.iov_base; | 1169 | void *p, *end; |
1163 | struct ceph_osd_request *req; | 1170 | struct ceph_osd_request *req; |
1164 | u64 tid; | 1171 | u64 tid; |
1165 | int numops, object_len, flags; | 1172 | int object_len; |
1173 | int numops, payload_len, flags; | ||
1166 | s32 result; | 1174 | s32 result; |
1175 | s32 retry_attempt; | ||
1176 | struct ceph_pg pg; | ||
1177 | int err; | ||
1178 | u32 reassert_epoch; | ||
1179 | u64 reassert_version; | ||
1180 | u32 osdmap_epoch; | ||
1181 | int i; | ||
1167 | 1182 | ||
1168 | tid = le64_to_cpu(msg->hdr.tid); | 1183 | tid = le64_to_cpu(msg->hdr.tid); |
1169 | if (msg->front.iov_len < sizeof(*rhead)) | 1184 | dout("handle_reply %p tid %llu\n", msg, tid); |
1170 | goto bad; | 1185 | |
1171 | numops = le32_to_cpu(rhead->num_ops); | 1186 | p = msg->front.iov_base; |
1172 | object_len = le32_to_cpu(rhead->object_len); | 1187 | end = p + msg->front.iov_len; |
1173 | result = le32_to_cpu(rhead->result); | 1188 | |
1174 | if (msg->front.iov_len != sizeof(*rhead) + object_len + | 1189 | ceph_decode_need(&p, end, 4, bad); |
1175 | numops * sizeof(struct ceph_osd_op)) | 1190 | object_len = ceph_decode_32(&p); |
1191 | ceph_decode_need(&p, end, object_len, bad); | ||
1192 | p += object_len; | ||
1193 | |||
1194 | err = __decode_pgid(&p, end, &pg); | ||
1195 | if (err) | ||
1176 | goto bad; | 1196 | goto bad; |
1177 | dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); | 1197 | |
1198 | ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); | ||
1199 | flags = ceph_decode_64(&p); | ||
1200 | result = ceph_decode_32(&p); | ||
1201 | reassert_epoch = ceph_decode_32(&p); | ||
1202 | reassert_version = ceph_decode_64(&p); | ||
1203 | osdmap_epoch = ceph_decode_32(&p); | ||
1204 | |||
1178 | /* lookup */ | 1205 | /* lookup */ |
1179 | mutex_lock(&osdc->request_mutex); | 1206 | mutex_lock(&osdc->request_mutex); |
1180 | req = __lookup_request(osdc, tid); | 1207 | req = __lookup_request(osdc, tid); |
@@ -1184,7 +1211,38 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | |||
1184 | return; | 1211 | return; |
1185 | } | 1212 | } |
1186 | ceph_osdc_get_request(req); | 1213 | ceph_osdc_get_request(req); |
1187 | flags = le32_to_cpu(rhead->flags); | 1214 | |
1215 | dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, | ||
1216 | req, result); | ||
1217 | |||
1218 | ceph_decode_need(&p, end, 4, bad); | ||
1219 | numops = ceph_decode_32(&p); | ||
1220 | if (numops > CEPH_OSD_MAX_OP) | ||
1221 | goto bad_put; | ||
1222 | if (numops != req->r_num_ops) | ||
1223 | goto bad_put; | ||
1224 | payload_len = 0; | ||
1225 | ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad); | ||
1226 | for (i = 0; i < numops; i++) { | ||
1227 | struct ceph_osd_op *op = p; | ||
1228 | int len; | ||
1229 | |||
1230 | len = le32_to_cpu(op->payload_len); | ||
1231 | req->r_reply_op_len[i] = len; | ||
1232 | dout(" op %d has %d bytes\n", i, len); | ||
1233 | payload_len += len; | ||
1234 | p += sizeof(*op); | ||
1235 | } | ||
1236 | if (payload_len != le32_to_cpu(msg->hdr.data_len)) { | ||
1237 | pr_warning("sum of op payload lens %d != data_len %d", | ||
1238 | payload_len, le32_to_cpu(msg->hdr.data_len)); | ||
1239 | goto bad_put; | ||
1240 | } | ||
1241 | |||
1242 | ceph_decode_need(&p, end, 4 + numops * 4, bad); | ||
1243 | retry_attempt = ceph_decode_32(&p); | ||
1244 | for (i = 0; i < numops; i++) | ||
1245 | req->r_reply_op_result[i] = ceph_decode_32(&p); | ||
1188 | 1246 | ||
1189 | /* | 1247 | /* |
1190 | * if this connection filled our message, drop our reference now, to | 1248 | * if this connection filled our message, drop our reference now, to |
@@ -1199,7 +1257,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | |||
1199 | if (!req->r_got_reply) { | 1257 | if (!req->r_got_reply) { |
1200 | unsigned int bytes; | 1258 | unsigned int bytes; |
1201 | 1259 | ||
1202 | req->r_result = le32_to_cpu(rhead->result); | 1260 | req->r_result = result; |
1203 | bytes = le32_to_cpu(msg->hdr.data_len); | 1261 | bytes = le32_to_cpu(msg->hdr.data_len); |
1204 | dout("handle_reply result %d bytes %d\n", req->r_result, | 1262 | dout("handle_reply result %d bytes %d\n", req->r_result, |
1205 | bytes); | 1263 | bytes); |
@@ -1207,7 +1265,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | |||
1207 | req->r_result = bytes; | 1265 | req->r_result = bytes; |
1208 | 1266 | ||
1209 | /* in case this is a write and we need to replay, */ | 1267 | /* in case this is a write and we need to replay, */ |
1210 | req->r_reassert_version = rhead->reassert_version; | 1268 | req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch); |
1269 | req->r_reassert_version.version = cpu_to_le64(reassert_version); | ||
1211 | 1270 | ||
1212 | req->r_got_reply = 1; | 1271 | req->r_got_reply = 1; |
1213 | } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { | 1272 | } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { |
@@ -1242,10 +1301,11 @@ done: | |||
1242 | ceph_osdc_put_request(req); | 1301 | ceph_osdc_put_request(req); |
1243 | return; | 1302 | return; |
1244 | 1303 | ||
1304 | bad_put: | ||
1305 | ceph_osdc_put_request(req); | ||
1245 | bad: | 1306 | bad: |
1246 | pr_err("corrupt osd_op_reply got %d %d expected %d\n", | 1307 | pr_err("corrupt osd_op_reply got %d %d\n", |
1247 | (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), | 1308 | (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); |
1248 | (int)sizeof(*rhead)); | ||
1249 | ceph_msg_dump(msg); | 1309 | ceph_msg_dump(msg); |
1250 | } | 1310 | } |
1251 | 1311 | ||
@@ -1462,7 +1522,9 @@ done: | |||
1462 | if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) | 1522 | if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) |
1463 | ceph_monc_request_next_osdmap(&osdc->client->monc); | 1523 | ceph_monc_request_next_osdmap(&osdc->client->monc); |
1464 | 1524 | ||
1465 | send_queued(osdc); | 1525 | mutex_lock(&osdc->request_mutex); |
1526 | __send_queued(osdc); | ||
1527 | mutex_unlock(&osdc->request_mutex); | ||
1466 | up_read(&osdc->map_sem); | 1528 | up_read(&osdc->map_sem); |
1467 | wake_up_all(&osdc->client->auth_wq); | 1529 | wake_up_all(&osdc->client->auth_wq); |
1468 | return; | 1530 | return; |
@@ -1556,8 +1618,7 @@ static void __remove_event(struct ceph_osd_event *event) | |||
1556 | 1618 | ||
1557 | int ceph_osdc_create_event(struct ceph_osd_client *osdc, | 1619 | int ceph_osdc_create_event(struct ceph_osd_client *osdc, |
1558 | void (*event_cb)(u64, u64, u8, void *), | 1620 | void (*event_cb)(u64, u64, u8, void *), |
1559 | int one_shot, void *data, | 1621 | void *data, struct ceph_osd_event **pevent) |
1560 | struct ceph_osd_event **pevent) | ||
1561 | { | 1622 | { |
1562 | struct ceph_osd_event *event; | 1623 | struct ceph_osd_event *event; |
1563 | 1624 | ||
@@ -1567,14 +1628,13 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc, | |||
1567 | 1628 | ||
1568 | dout("create_event %p\n", event); | 1629 | dout("create_event %p\n", event); |
1569 | event->cb = event_cb; | 1630 | event->cb = event_cb; |
1570 | event->one_shot = one_shot; | 1631 | event->one_shot = 0; |
1571 | event->data = data; | 1632 | event->data = data; |
1572 | event->osdc = osdc; | 1633 | event->osdc = osdc; |
1573 | INIT_LIST_HEAD(&event->osd_node); | 1634 | INIT_LIST_HEAD(&event->osd_node); |
1574 | RB_CLEAR_NODE(&event->node); | 1635 | RB_CLEAR_NODE(&event->node); |
1575 | kref_init(&event->kref); /* one ref for us */ | 1636 | kref_init(&event->kref); /* one ref for us */ |
1576 | kref_get(&event->kref); /* one ref for the caller */ | 1637 | kref_get(&event->kref); /* one ref for the caller */ |
1577 | init_completion(&event->completion); | ||
1578 | 1638 | ||
1579 | spin_lock(&osdc->event_lock); | 1639 | spin_lock(&osdc->event_lock); |
1580 | event->cookie = ++osdc->event_count; | 1640 | event->cookie = ++osdc->event_count; |
@@ -1610,7 +1670,6 @@ static void do_event_work(struct work_struct *work) | |||
1610 | 1670 | ||
1611 | dout("do_event_work completing %p\n", event); | 1671 | dout("do_event_work completing %p\n", event); |
1612 | event->cb(ver, notify_id, opcode, event->data); | 1672 | event->cb(ver, notify_id, opcode, event->data); |
1613 | complete(&event->completion); | ||
1614 | dout("do_event_work completed %p\n", event); | 1673 | dout("do_event_work completed %p\n", event); |
1615 | ceph_osdc_put_event(event); | 1674 | ceph_osdc_put_event(event); |
1616 | kfree(event_work); | 1675 | kfree(event_work); |
@@ -1620,7 +1679,8 @@ static void do_event_work(struct work_struct *work) | |||
1620 | /* | 1679 | /* |
1621 | * Process osd watch notifications | 1680 | * Process osd watch notifications |
1622 | */ | 1681 | */ |
1623 | void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) | 1682 | static void handle_watch_notify(struct ceph_osd_client *osdc, |
1683 | struct ceph_msg *msg) | ||
1624 | { | 1684 | { |
1625 | void *p, *end; | 1685 | void *p, *end; |
1626 | u8 proto_ver; | 1686 | u8 proto_ver; |
@@ -1641,9 +1701,8 @@ void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1641 | spin_lock(&osdc->event_lock); | 1701 | spin_lock(&osdc->event_lock); |
1642 | event = __find_event(osdc, cookie); | 1702 | event = __find_event(osdc, cookie); |
1643 | if (event) { | 1703 | if (event) { |
1704 | BUG_ON(event->one_shot); | ||
1644 | get_event(event); | 1705 | get_event(event); |
1645 | if (event->one_shot) | ||
1646 | __remove_event(event); | ||
1647 | } | 1706 | } |
1648 | spin_unlock(&osdc->event_lock); | 1707 | spin_unlock(&osdc->event_lock); |
1649 | dout("handle_watch_notify cookie %lld ver %lld event %p\n", | 1708 | dout("handle_watch_notify cookie %lld ver %lld event %p\n", |
@@ -1668,7 +1727,6 @@ void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1668 | return; | 1727 | return; |
1669 | 1728 | ||
1670 | done_err: | 1729 | done_err: |
1671 | complete(&event->completion); | ||
1672 | ceph_osdc_put_event(event); | 1730 | ceph_osdc_put_event(event); |
1673 | return; | 1731 | return; |
1674 | 1732 | ||
@@ -1677,21 +1735,6 @@ bad: | |||
1677 | return; | 1735 | return; |
1678 | } | 1736 | } |
1679 | 1737 | ||
1680 | int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout) | ||
1681 | { | ||
1682 | int err; | ||
1683 | |||
1684 | dout("wait_event %p\n", event); | ||
1685 | err = wait_for_completion_interruptible_timeout(&event->completion, | ||
1686 | timeout * HZ); | ||
1687 | ceph_osdc_put_event(event); | ||
1688 | if (err > 0) | ||
1689 | err = 0; | ||
1690 | dout("wait_event %p returns %d\n", event, err); | ||
1691 | return err; | ||
1692 | } | ||
1693 | EXPORT_SYMBOL(ceph_osdc_wait_event); | ||
1694 | |||
1695 | /* | 1738 | /* |
1696 | * Register request, send initial attempt. | 1739 | * Register request, send initial attempt. |
1697 | */ | 1740 | */ |
@@ -1706,7 +1749,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, | |||
1706 | #ifdef CONFIG_BLOCK | 1749 | #ifdef CONFIG_BLOCK |
1707 | req->r_request->bio = req->r_bio; | 1750 | req->r_request->bio = req->r_bio; |
1708 | #endif | 1751 | #endif |
1709 | req->r_request->trail = req->r_trail; | 1752 | req->r_request->trail = &req->r_trail; |
1710 | 1753 | ||
1711 | register_request(osdc, req); | 1754 | register_request(osdc, req); |
1712 | 1755 | ||
@@ -1865,7 +1908,6 @@ out_mempool: | |||
1865 | out: | 1908 | out: |
1866 | return err; | 1909 | return err; |
1867 | } | 1910 | } |
1868 | EXPORT_SYMBOL(ceph_osdc_init); | ||
1869 | 1911 | ||
1870 | void ceph_osdc_stop(struct ceph_osd_client *osdc) | 1912 | void ceph_osdc_stop(struct ceph_osd_client *osdc) |
1871 | { | 1913 | { |
@@ -1882,7 +1924,6 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) | |||
1882 | ceph_msgpool_destroy(&osdc->msgpool_op); | 1924 | ceph_msgpool_destroy(&osdc->msgpool_op); |
1883 | ceph_msgpool_destroy(&osdc->msgpool_op_reply); | 1925 | ceph_msgpool_destroy(&osdc->msgpool_op_reply); |
1884 | } | 1926 | } |
1885 | EXPORT_SYMBOL(ceph_osdc_stop); | ||
1886 | 1927 | ||
1887 | /* | 1928 | /* |
1888 | * Read some contiguous pages. If we cross a stripe boundary, shorten | 1929 | * Read some contiguous pages. If we cross a stripe boundary, shorten |
@@ -1902,7 +1943,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, | |||
1902 | req = ceph_osdc_new_request(osdc, layout, vino, off, plen, | 1943 | req = ceph_osdc_new_request(osdc, layout, vino, off, plen, |
1903 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, | 1944 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, |
1904 | NULL, 0, truncate_seq, truncate_size, NULL, | 1945 | NULL, 0, truncate_seq, truncate_size, NULL, |
1905 | false, 1, page_align); | 1946 | false, page_align); |
1906 | if (IS_ERR(req)) | 1947 | if (IS_ERR(req)) |
1907 | return PTR_ERR(req); | 1948 | return PTR_ERR(req); |
1908 | 1949 | ||
@@ -1931,8 +1972,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | |||
1931 | u64 off, u64 len, | 1972 | u64 off, u64 len, |
1932 | u32 truncate_seq, u64 truncate_size, | 1973 | u32 truncate_seq, u64 truncate_size, |
1933 | struct timespec *mtime, | 1974 | struct timespec *mtime, |
1934 | struct page **pages, int num_pages, | 1975 | struct page **pages, int num_pages) |
1935 | int flags, int do_sync, bool nofail) | ||
1936 | { | 1976 | { |
1937 | struct ceph_osd_request *req; | 1977 | struct ceph_osd_request *req; |
1938 | int rc = 0; | 1978 | int rc = 0; |
@@ -1941,11 +1981,10 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | |||
1941 | BUG_ON(vino.snap != CEPH_NOSNAP); | 1981 | BUG_ON(vino.snap != CEPH_NOSNAP); |
1942 | req = ceph_osdc_new_request(osdc, layout, vino, off, &len, | 1982 | req = ceph_osdc_new_request(osdc, layout, vino, off, &len, |
1943 | CEPH_OSD_OP_WRITE, | 1983 | CEPH_OSD_OP_WRITE, |
1944 | flags | CEPH_OSD_FLAG_ONDISK | | 1984 | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, |
1945 | CEPH_OSD_FLAG_WRITE, | 1985 | snapc, 0, |
1946 | snapc, do_sync, | ||
1947 | truncate_seq, truncate_size, mtime, | 1986 | truncate_seq, truncate_size, mtime, |
1948 | nofail, 1, page_align); | 1987 | true, page_align); |
1949 | if (IS_ERR(req)) | 1988 | if (IS_ERR(req)) |
1950 | return PTR_ERR(req); | 1989 | return PTR_ERR(req); |
1951 | 1990 | ||
@@ -1954,7 +1993,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | |||
1954 | dout("writepages %llu~%llu (%d pages)\n", off, len, | 1993 | dout("writepages %llu~%llu (%d pages)\n", off, len, |
1955 | req->r_num_pages); | 1994 | req->r_num_pages); |
1956 | 1995 | ||
1957 | rc = ceph_osdc_start_request(osdc, req, nofail); | 1996 | rc = ceph_osdc_start_request(osdc, req, true); |
1958 | if (!rc) | 1997 | if (!rc) |
1959 | rc = ceph_osdc_wait_request(osdc, req); | 1998 | rc = ceph_osdc_wait_request(osdc, req); |
1960 | 1999 | ||
@@ -2047,7 +2086,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, | |||
2047 | if (data_len > 0) { | 2086 | if (data_len > 0) { |
2048 | int want = calc_pages_for(req->r_page_alignment, data_len); | 2087 | int want = calc_pages_for(req->r_page_alignment, data_len); |
2049 | 2088 | ||
2050 | if (unlikely(req->r_num_pages < want)) { | 2089 | if (req->r_pages && unlikely(req->r_num_pages < want)) { |
2051 | pr_warning("tid %lld reply has %d bytes %d pages, we" | 2090 | pr_warning("tid %lld reply has %d bytes %d pages, we" |
2052 | " had only %d pages ready\n", tid, data_len, | 2091 | " had only %d pages ready\n", tid, data_len, |
2053 | want, req->r_num_pages); | 2092 | want, req->r_num_pages); |
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index de73214b5d26..69bc4bf89e3e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c | |||
@@ -13,26 +13,18 @@ | |||
13 | 13 | ||
14 | char *ceph_osdmap_state_str(char *str, int len, int state) | 14 | char *ceph_osdmap_state_str(char *str, int len, int state) |
15 | { | 15 | { |
16 | int flag = 0; | ||
17 | |||
18 | if (!len) | 16 | if (!len) |
19 | goto done; | 17 | return str; |
20 | 18 | ||
21 | *str = '\0'; | 19 | if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP)) |
22 | if (state) { | 20 | snprintf(str, len, "exists, up"); |
23 | if (state & CEPH_OSD_EXISTS) { | 21 | else if (state & CEPH_OSD_EXISTS) |
24 | snprintf(str, len, "exists"); | 22 | snprintf(str, len, "exists"); |
25 | flag = 1; | 23 | else if (state & CEPH_OSD_UP) |
26 | } | 24 | snprintf(str, len, "up"); |
27 | if (state & CEPH_OSD_UP) { | 25 | else |
28 | snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""), | ||
29 | "up"); | ||
30 | flag = 1; | ||
31 | } | ||
32 | } else { | ||
33 | snprintf(str, len, "doesn't exist"); | 26 | snprintf(str, len, "doesn't exist"); |
34 | } | 27 | |
35 | done: | ||
36 | return str; | 28 | return str; |
37 | } | 29 | } |
38 | 30 | ||
@@ -53,13 +45,8 @@ static int calc_bits_of(unsigned int t) | |||
53 | */ | 45 | */ |
54 | static void calc_pg_masks(struct ceph_pg_pool_info *pi) | 46 | static void calc_pg_masks(struct ceph_pg_pool_info *pi) |
55 | { | 47 | { |
56 | pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; | 48 | pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1; |
57 | pi->pgp_num_mask = | 49 | pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1; |
58 | (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; | ||
59 | pi->lpg_num_mask = | ||
60 | (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; | ||
61 | pi->lpgp_num_mask = | ||
62 | (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; | ||
63 | } | 50 | } |
64 | 51 | ||
65 | /* | 52 | /* |
@@ -170,6 +157,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
170 | c->choose_local_tries = 2; | 157 | c->choose_local_tries = 2; |
171 | c->choose_local_fallback_tries = 5; | 158 | c->choose_local_fallback_tries = 5; |
172 | c->choose_total_tries = 19; | 159 | c->choose_total_tries = 19; |
160 | c->chooseleaf_descend_once = 0; | ||
173 | 161 | ||
174 | ceph_decode_need(p, end, 4*sizeof(u32), bad); | 162 | ceph_decode_need(p, end, 4*sizeof(u32), bad); |
175 | magic = ceph_decode_32(p); | 163 | magic = ceph_decode_32(p); |
@@ -336,6 +324,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
336 | dout("crush decode tunable choose_total_tries = %d", | 324 | dout("crush decode tunable choose_total_tries = %d", |
337 | c->choose_total_tries); | 325 | c->choose_total_tries); |
338 | 326 | ||
327 | ceph_decode_need(p, end, sizeof(u32), done); | ||
328 | c->chooseleaf_descend_once = ceph_decode_32(p); | ||
329 | dout("crush decode tunable chooseleaf_descend_once = %d", | ||
330 | c->chooseleaf_descend_once); | ||
331 | |||
339 | done: | 332 | done: |
340 | dout("crush_decode success\n"); | 333 | dout("crush_decode success\n"); |
341 | return c; | 334 | return c; |
@@ -354,12 +347,13 @@ bad: | |||
354 | */ | 347 | */ |
355 | static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) | 348 | static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) |
356 | { | 349 | { |
357 | u64 a = *(u64 *)&l; | 350 | if (l.pool < r.pool) |
358 | u64 b = *(u64 *)&r; | 351 | return -1; |
359 | 352 | if (l.pool > r.pool) | |
360 | if (a < b) | 353 | return 1; |
354 | if (l.seed < r.seed) | ||
361 | return -1; | 355 | return -1; |
362 | if (a > b) | 356 | if (l.seed > r.seed) |
363 | return 1; | 357 | return 1; |
364 | return 0; | 358 | return 0; |
365 | } | 359 | } |
@@ -405,8 +399,8 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, | |||
405 | } else if (c > 0) { | 399 | } else if (c > 0) { |
406 | n = n->rb_right; | 400 | n = n->rb_right; |
407 | } else { | 401 | } else { |
408 | dout("__lookup_pg_mapping %llx got %p\n", | 402 | dout("__lookup_pg_mapping %lld.%x got %p\n", |
409 | *(u64 *)&pgid, pg); | 403 | pgid.pool, pgid.seed, pg); |
410 | return pg; | 404 | return pg; |
411 | } | 405 | } |
412 | } | 406 | } |
@@ -418,12 +412,13 @@ static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid) | |||
418 | struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); | 412 | struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); |
419 | 413 | ||
420 | if (pg) { | 414 | if (pg) { |
421 | dout("__remove_pg_mapping %llx %p\n", *(u64 *)&pgid, pg); | 415 | dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed, |
416 | pg); | ||
422 | rb_erase(&pg->node, root); | 417 | rb_erase(&pg->node, root); |
423 | kfree(pg); | 418 | kfree(pg); |
424 | return 0; | 419 | return 0; |
425 | } | 420 | } |
426 | dout("__remove_pg_mapping %llx dne\n", *(u64 *)&pgid); | 421 | dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed); |
427 | return -ENOENT; | 422 | return -ENOENT; |
428 | } | 423 | } |
429 | 424 | ||
@@ -452,7 +447,7 @@ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) | |||
452 | return 0; | 447 | return 0; |
453 | } | 448 | } |
454 | 449 | ||
455 | static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) | 450 | static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) |
456 | { | 451 | { |
457 | struct ceph_pg_pool_info *pi; | 452 | struct ceph_pg_pool_info *pi; |
458 | struct rb_node *n = root->rb_node; | 453 | struct rb_node *n = root->rb_node; |
@@ -508,24 +503,57 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) | |||
508 | 503 | ||
509 | static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) | 504 | static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) |
510 | { | 505 | { |
511 | unsigned int n, m; | 506 | u8 ev, cv; |
507 | unsigned len, num; | ||
508 | void *pool_end; | ||
509 | |||
510 | ceph_decode_need(p, end, 2 + 4, bad); | ||
511 | ev = ceph_decode_8(p); /* encoding version */ | ||
512 | cv = ceph_decode_8(p); /* compat version */ | ||
513 | if (ev < 5) { | ||
514 | pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); | ||
515 | return -EINVAL; | ||
516 | } | ||
517 | if (cv > 7) { | ||
518 | pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv); | ||
519 | return -EINVAL; | ||
520 | } | ||
521 | len = ceph_decode_32(p); | ||
522 | ceph_decode_need(p, end, len, bad); | ||
523 | pool_end = *p + len; | ||
512 | 524 | ||
513 | ceph_decode_copy(p, &pi->v, sizeof(pi->v)); | 525 | pi->type = ceph_decode_8(p); |
514 | calc_pg_masks(pi); | 526 | pi->size = ceph_decode_8(p); |
527 | pi->crush_ruleset = ceph_decode_8(p); | ||
528 | pi->object_hash = ceph_decode_8(p); | ||
529 | |||
530 | pi->pg_num = ceph_decode_32(p); | ||
531 | pi->pgp_num = ceph_decode_32(p); | ||
532 | |||
533 | *p += 4 + 4; /* skip lpg* */ | ||
534 | *p += 4; /* skip last_change */ | ||
535 | *p += 8 + 4; /* skip snap_seq, snap_epoch */ | ||
515 | 536 | ||
516 | /* num_snaps * snap_info_t */ | 537 | /* skip snaps */ |
517 | n = le32_to_cpu(pi->v.num_snaps); | 538 | num = ceph_decode_32(p); |
518 | while (n--) { | 539 | while (num--) { |
519 | ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + | 540 | *p += 8; /* snapid key */ |
520 | sizeof(struct ceph_timespec), bad); | 541 | *p += 1 + 1; /* versions */ |
521 | *p += sizeof(u64) + /* key */ | 542 | len = ceph_decode_32(p); |
522 | 1 + sizeof(u64) + /* u8, snapid */ | 543 | *p += len; |
523 | sizeof(struct ceph_timespec); | ||
524 | m = ceph_decode_32(p); /* snap name */ | ||
525 | *p += m; | ||
526 | } | 544 | } |
527 | 545 | ||
528 | *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; | 546 | /* skip removed snaps */ |
547 | num = ceph_decode_32(p); | ||
548 | *p += num * (8 + 8); | ||
549 | |||
550 | *p += 8; /* skip auid */ | ||
551 | pi->flags = ceph_decode_64(p); | ||
552 | |||
553 | /* ignore the rest */ | ||
554 | |||
555 | *p = pool_end; | ||
556 | calc_pg_masks(pi); | ||
529 | return 0; | 557 | return 0; |
530 | 558 | ||
531 | bad: | 559 | bad: |
@@ -535,14 +563,15 @@ bad: | |||
535 | static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) | 563 | static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) |
536 | { | 564 | { |
537 | struct ceph_pg_pool_info *pi; | 565 | struct ceph_pg_pool_info *pi; |
538 | u32 num, len, pool; | 566 | u32 num, len; |
567 | u64 pool; | ||
539 | 568 | ||
540 | ceph_decode_32_safe(p, end, num, bad); | 569 | ceph_decode_32_safe(p, end, num, bad); |
541 | dout(" %d pool names\n", num); | 570 | dout(" %d pool names\n", num); |
542 | while (num--) { | 571 | while (num--) { |
543 | ceph_decode_32_safe(p, end, pool, bad); | 572 | ceph_decode_64_safe(p, end, pool, bad); |
544 | ceph_decode_32_safe(p, end, len, bad); | 573 | ceph_decode_32_safe(p, end, len, bad); |
545 | dout(" pool %d len %d\n", pool, len); | 574 | dout(" pool %llu len %d\n", pool, len); |
546 | ceph_decode_need(p, end, len, bad); | 575 | ceph_decode_need(p, end, len, bad); |
547 | pi = __lookup_pg_pool(&map->pg_pools, pool); | 576 | pi = __lookup_pg_pool(&map->pg_pools, pool); |
548 | if (pi) { | 577 | if (pi) { |
@@ -633,7 +662,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
633 | struct ceph_osdmap *map; | 662 | struct ceph_osdmap *map; |
634 | u16 version; | 663 | u16 version; |
635 | u32 len, max, i; | 664 | u32 len, max, i; |
636 | u8 ev; | ||
637 | int err = -EINVAL; | 665 | int err = -EINVAL; |
638 | void *start = *p; | 666 | void *start = *p; |
639 | struct ceph_pg_pool_info *pi; | 667 | struct ceph_pg_pool_info *pi; |
@@ -646,9 +674,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
646 | map->pg_temp = RB_ROOT; | 674 | map->pg_temp = RB_ROOT; |
647 | 675 | ||
648 | ceph_decode_16_safe(p, end, version, bad); | 676 | ceph_decode_16_safe(p, end, version, bad); |
649 | if (version > CEPH_OSDMAP_VERSION) { | 677 | if (version > 6) { |
650 | pr_warning("got unknown v %d > %d of osdmap\n", version, | 678 | pr_warning("got unknown v %d > 6 of osdmap\n", version); |
651 | CEPH_OSDMAP_VERSION); | 679 | goto bad; |
680 | } | ||
681 | if (version < 6) { | ||
682 | pr_warning("got old v %d < 6 of osdmap\n", version); | ||
652 | goto bad; | 683 | goto bad; |
653 | } | 684 | } |
654 | 685 | ||
@@ -660,20 +691,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
660 | 691 | ||
661 | ceph_decode_32_safe(p, end, max, bad); | 692 | ceph_decode_32_safe(p, end, max, bad); |
662 | while (max--) { | 693 | while (max--) { |
663 | ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); | 694 | ceph_decode_need(p, end, 8 + 2, bad); |
664 | err = -ENOMEM; | 695 | err = -ENOMEM; |
665 | pi = kzalloc(sizeof(*pi), GFP_NOFS); | 696 | pi = kzalloc(sizeof(*pi), GFP_NOFS); |
666 | if (!pi) | 697 | if (!pi) |
667 | goto bad; | 698 | goto bad; |
668 | pi->id = ceph_decode_32(p); | 699 | pi->id = ceph_decode_64(p); |
669 | err = -EINVAL; | ||
670 | ev = ceph_decode_8(p); /* encoding version */ | ||
671 | if (ev > CEPH_PG_POOL_VERSION) { | ||
672 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", | ||
673 | ev, CEPH_PG_POOL_VERSION); | ||
674 | kfree(pi); | ||
675 | goto bad; | ||
676 | } | ||
677 | err = __decode_pool(p, end, pi); | 700 | err = __decode_pool(p, end, pi); |
678 | if (err < 0) { | 701 | if (err < 0) { |
679 | kfree(pi); | 702 | kfree(pi); |
@@ -682,12 +705,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
682 | __insert_pg_pool(&map->pg_pools, pi); | 705 | __insert_pg_pool(&map->pg_pools, pi); |
683 | } | 706 | } |
684 | 707 | ||
685 | if (version >= 5) { | 708 | err = __decode_pool_names(p, end, map); |
686 | err = __decode_pool_names(p, end, map); | 709 | if (err < 0) { |
687 | if (err < 0) { | 710 | dout("fail to decode pool names"); |
688 | dout("fail to decode pool names"); | 711 | goto bad; |
689 | goto bad; | ||
690 | } | ||
691 | } | 712 | } |
692 | 713 | ||
693 | ceph_decode_32_safe(p, end, map->pool_max, bad); | 714 | ceph_decode_32_safe(p, end, map->pool_max, bad); |
@@ -724,10 +745,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
724 | for (i = 0; i < len; i++) { | 745 | for (i = 0; i < len; i++) { |
725 | int n, j; | 746 | int n, j; |
726 | struct ceph_pg pgid; | 747 | struct ceph_pg pgid; |
748 | struct ceph_pg_v1 pgid_v1; | ||
727 | struct ceph_pg_mapping *pg; | 749 | struct ceph_pg_mapping *pg; |
728 | 750 | ||
729 | ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); | 751 | ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); |
730 | ceph_decode_copy(p, &pgid, sizeof(pgid)); | 752 | ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1)); |
753 | pgid.pool = le32_to_cpu(pgid_v1.pool); | ||
754 | pgid.seed = le16_to_cpu(pgid_v1.ps); | ||
731 | n = ceph_decode_32(p); | 755 | n = ceph_decode_32(p); |
732 | err = -EINVAL; | 756 | err = -EINVAL; |
733 | if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) | 757 | if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) |
@@ -745,7 +769,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) | |||
745 | err = __insert_pg_mapping(pg, &map->pg_temp); | 769 | err = __insert_pg_mapping(pg, &map->pg_temp); |
746 | if (err) | 770 | if (err) |
747 | goto bad; | 771 | goto bad; |
748 | dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); | 772 | dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed, |
773 | len); | ||
749 | } | 774 | } |
750 | 775 | ||
751 | /* crush */ | 776 | /* crush */ |
@@ -784,16 +809,17 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
784 | struct ceph_fsid fsid; | 809 | struct ceph_fsid fsid; |
785 | u32 epoch = 0; | 810 | u32 epoch = 0; |
786 | struct ceph_timespec modified; | 811 | struct ceph_timespec modified; |
787 | u32 len, pool; | 812 | s32 len; |
788 | __s32 new_pool_max, new_flags, max; | 813 | u64 pool; |
814 | __s64 new_pool_max; | ||
815 | __s32 new_flags, max; | ||
789 | void *start = *p; | 816 | void *start = *p; |
790 | int err = -EINVAL; | 817 | int err = -EINVAL; |
791 | u16 version; | 818 | u16 version; |
792 | 819 | ||
793 | ceph_decode_16_safe(p, end, version, bad); | 820 | ceph_decode_16_safe(p, end, version, bad); |
794 | if (version > CEPH_OSDMAP_INC_VERSION) { | 821 | if (version > 6) { |
795 | pr_warning("got unknown v %d > %d of inc osdmap\n", version, | 822 | pr_warning("got unknown v %d > %d of inc osdmap\n", version, 6); |
796 | CEPH_OSDMAP_INC_VERSION); | ||
797 | goto bad; | 823 | goto bad; |
798 | } | 824 | } |
799 | 825 | ||
@@ -803,7 +829,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
803 | epoch = ceph_decode_32(p); | 829 | epoch = ceph_decode_32(p); |
804 | BUG_ON(epoch != map->epoch+1); | 830 | BUG_ON(epoch != map->epoch+1); |
805 | ceph_decode_copy(p, &modified, sizeof(modified)); | 831 | ceph_decode_copy(p, &modified, sizeof(modified)); |
806 | new_pool_max = ceph_decode_32(p); | 832 | new_pool_max = ceph_decode_64(p); |
807 | new_flags = ceph_decode_32(p); | 833 | new_flags = ceph_decode_32(p); |
808 | 834 | ||
809 | /* full map? */ | 835 | /* full map? */ |
@@ -853,18 +879,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
853 | /* new_pool */ | 879 | /* new_pool */ |
854 | ceph_decode_32_safe(p, end, len, bad); | 880 | ceph_decode_32_safe(p, end, len, bad); |
855 | while (len--) { | 881 | while (len--) { |
856 | __u8 ev; | ||
857 | struct ceph_pg_pool_info *pi; | 882 | struct ceph_pg_pool_info *pi; |
858 | 883 | ||
859 | ceph_decode_32_safe(p, end, pool, bad); | 884 | ceph_decode_64_safe(p, end, pool, bad); |
860 | ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); | ||
861 | ev = ceph_decode_8(p); /* encoding version */ | ||
862 | if (ev > CEPH_PG_POOL_VERSION) { | ||
863 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", | ||
864 | ev, CEPH_PG_POOL_VERSION); | ||
865 | err = -EINVAL; | ||
866 | goto bad; | ||
867 | } | ||
868 | pi = __lookup_pg_pool(&map->pg_pools, pool); | 885 | pi = __lookup_pg_pool(&map->pg_pools, pool); |
869 | if (!pi) { | 886 | if (!pi) { |
870 | pi = kzalloc(sizeof(*pi), GFP_NOFS); | 887 | pi = kzalloc(sizeof(*pi), GFP_NOFS); |
@@ -890,7 +907,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
890 | while (len--) { | 907 | while (len--) { |
891 | struct ceph_pg_pool_info *pi; | 908 | struct ceph_pg_pool_info *pi; |
892 | 909 | ||
893 | ceph_decode_32_safe(p, end, pool, bad); | 910 | ceph_decode_64_safe(p, end, pool, bad); |
894 | pi = __lookup_pg_pool(&map->pg_pools, pool); | 911 | pi = __lookup_pg_pool(&map->pg_pools, pool); |
895 | if (pi) | 912 | if (pi) |
896 | __remove_pg_pool(&map->pg_pools, pi); | 913 | __remove_pg_pool(&map->pg_pools, pi); |
@@ -946,10 +963,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
946 | while (len--) { | 963 | while (len--) { |
947 | struct ceph_pg_mapping *pg; | 964 | struct ceph_pg_mapping *pg; |
948 | int j; | 965 | int j; |
966 | struct ceph_pg_v1 pgid_v1; | ||
949 | struct ceph_pg pgid; | 967 | struct ceph_pg pgid; |
950 | u32 pglen; | 968 | u32 pglen; |
951 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); | 969 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); |
952 | ceph_decode_copy(p, &pgid, sizeof(pgid)); | 970 | ceph_decode_copy(p, &pgid_v1, sizeof(pgid_v1)); |
971 | pgid.pool = le32_to_cpu(pgid_v1.pool); | ||
972 | pgid.seed = le16_to_cpu(pgid_v1.ps); | ||
953 | pglen = ceph_decode_32(p); | 973 | pglen = ceph_decode_32(p); |
954 | 974 | ||
955 | if (pglen) { | 975 | if (pglen) { |
@@ -975,8 +995,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
975 | kfree(pg); | 995 | kfree(pg); |
976 | goto bad; | 996 | goto bad; |
977 | } | 997 | } |
978 | dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, | 998 | dout(" added pg_temp %lld.%x len %d\n", pgid.pool, |
979 | pglen); | 999 | pgid.seed, pglen); |
980 | } else { | 1000 | } else { |
981 | /* remove */ | 1001 | /* remove */ |
982 | __remove_pg_mapping(&map->pg_temp, pgid); | 1002 | __remove_pg_mapping(&map->pg_temp, pgid); |
@@ -1010,7 +1030,7 @@ bad: | |||
1010 | * pass a stride back to the caller. | 1030 | * pass a stride back to the caller. |
1011 | */ | 1031 | */ |
1012 | int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | 1032 | int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, |
1013 | u64 off, u64 *plen, | 1033 | u64 off, u64 len, |
1014 | u64 *ono, | 1034 | u64 *ono, |
1015 | u64 *oxoff, u64 *oxlen) | 1035 | u64 *oxoff, u64 *oxlen) |
1016 | { | 1036 | { |
@@ -1021,7 +1041,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | |||
1021 | u32 su_per_object; | 1041 | u32 su_per_object; |
1022 | u64 t, su_offset; | 1042 | u64 t, su_offset; |
1023 | 1043 | ||
1024 | dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, | 1044 | dout("mapping %llu~%llu osize %u fl_su %u\n", off, len, |
1025 | osize, su); | 1045 | osize, su); |
1026 | if (su == 0 || sc == 0) | 1046 | if (su == 0 || sc == 0) |
1027 | goto invalid; | 1047 | goto invalid; |
@@ -1054,11 +1074,10 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | |||
1054 | 1074 | ||
1055 | /* | 1075 | /* |
1056 | * Calculate the length of the extent being written to the selected | 1076 | * Calculate the length of the extent being written to the selected |
1057 | * object. This is the minimum of the full length requested (plen) or | 1077 | * object. This is the minimum of the full length requested (len) or |
1058 | * the remainder of the current stripe being written to. | 1078 | * the remainder of the current stripe being written to. |
1059 | */ | 1079 | */ |
1060 | *oxlen = min_t(u64, *plen, su - su_offset); | 1080 | *oxlen = min_t(u64, len, su - su_offset); |
1061 | *plen = *oxlen; | ||
1062 | 1081 | ||
1063 | dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); | 1082 | dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); |
1064 | return 0; | 1083 | return 0; |
@@ -1076,33 +1095,24 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping); | |||
1076 | * calculate an object layout (i.e. pgid) from an oid, | 1095 | * calculate an object layout (i.e. pgid) from an oid, |
1077 | * file_layout, and osdmap | 1096 | * file_layout, and osdmap |
1078 | */ | 1097 | */ |
1079 | int ceph_calc_object_layout(struct ceph_object_layout *ol, | 1098 | int ceph_calc_object_layout(struct ceph_pg *pg, |
1080 | const char *oid, | 1099 | const char *oid, |
1081 | struct ceph_file_layout *fl, | 1100 | struct ceph_file_layout *fl, |
1082 | struct ceph_osdmap *osdmap) | 1101 | struct ceph_osdmap *osdmap) |
1083 | { | 1102 | { |
1084 | unsigned int num, num_mask; | 1103 | unsigned int num, num_mask; |
1085 | struct ceph_pg pgid; | ||
1086 | int poolid = le32_to_cpu(fl->fl_pg_pool); | ||
1087 | struct ceph_pg_pool_info *pool; | 1104 | struct ceph_pg_pool_info *pool; |
1088 | unsigned int ps; | ||
1089 | 1105 | ||
1090 | BUG_ON(!osdmap); | 1106 | BUG_ON(!osdmap); |
1091 | 1107 | pg->pool = le32_to_cpu(fl->fl_pg_pool); | |
1092 | pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); | 1108 | pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool); |
1093 | if (!pool) | 1109 | if (!pool) |
1094 | return -EIO; | 1110 | return -EIO; |
1095 | ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); | 1111 | pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid)); |
1096 | num = le32_to_cpu(pool->v.pg_num); | 1112 | num = pool->pg_num; |
1097 | num_mask = pool->pg_num_mask; | 1113 | num_mask = pool->pg_num_mask; |
1098 | 1114 | ||
1099 | pgid.ps = cpu_to_le16(ps); | 1115 | dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed); |
1100 | pgid.preferred = cpu_to_le16(-1); | ||
1101 | pgid.pool = fl->fl_pg_pool; | ||
1102 | dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); | ||
1103 | |||
1104 | ol->ol_pgid = pgid; | ||
1105 | ol->ol_stripe_unit = fl->fl_object_stripe_unit; | ||
1106 | return 0; | 1116 | return 0; |
1107 | } | 1117 | } |
1108 | EXPORT_SYMBOL(ceph_calc_object_layout); | 1118 | EXPORT_SYMBOL(ceph_calc_object_layout); |
@@ -1117,19 +1127,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | |||
1117 | struct ceph_pg_mapping *pg; | 1127 | struct ceph_pg_mapping *pg; |
1118 | struct ceph_pg_pool_info *pool; | 1128 | struct ceph_pg_pool_info *pool; |
1119 | int ruleno; | 1129 | int ruleno; |
1120 | unsigned int poolid, ps, pps, t, r; | 1130 | int r; |
1121 | 1131 | u32 pps; | |
1122 | poolid = le32_to_cpu(pgid.pool); | ||
1123 | ps = le16_to_cpu(pgid.ps); | ||
1124 | 1132 | ||
1125 | pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); | 1133 | pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); |
1126 | if (!pool) | 1134 | if (!pool) |
1127 | return NULL; | 1135 | return NULL; |
1128 | 1136 | ||
1129 | /* pg_temp? */ | 1137 | /* pg_temp? */ |
1130 | t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), | 1138 | pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, |
1131 | pool->pgp_num_mask); | 1139 | pool->pgp_num_mask); |
1132 | pgid.ps = cpu_to_le16(t); | ||
1133 | pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); | 1140 | pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); |
1134 | if (pg) { | 1141 | if (pg) { |
1135 | *num = pg->len; | 1142 | *num = pg->len; |
@@ -1137,26 +1144,39 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | |||
1137 | } | 1144 | } |
1138 | 1145 | ||
1139 | /* crush */ | 1146 | /* crush */ |
1140 | ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, | 1147 | ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, |
1141 | pool->v.type, pool->v.size); | 1148 | pool->type, pool->size); |
1142 | if (ruleno < 0) { | 1149 | if (ruleno < 0) { |
1143 | pr_err("no crush rule pool %d ruleset %d type %d size %d\n", | 1150 | pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", |
1144 | poolid, pool->v.crush_ruleset, pool->v.type, | 1151 | pgid.pool, pool->crush_ruleset, pool->type, |
1145 | pool->v.size); | 1152 | pool->size); |
1146 | return NULL; | 1153 | return NULL; |
1147 | } | 1154 | } |
1148 | 1155 | ||
1149 | pps = ceph_stable_mod(ps, | 1156 | if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { |
1150 | le32_to_cpu(pool->v.pgp_num), | 1157 | /* hash pool id and seed so that pool PGs do not overlap */ |
1151 | pool->pgp_num_mask); | 1158 | pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, |
1152 | pps += poolid; | 1159 | ceph_stable_mod(pgid.seed, pool->pgp_num, |
1160 | pool->pgp_num_mask), | ||
1161 | pgid.pool); | ||
1162 | } else { | ||
1163 | /* | ||
1164 | * legacy behavior: add ps and pool together. this is | ||
1165 | * not a great approach because the PGs from each pool | ||
1166 | * will overlap on top of each other: 0.5 == 1.4 == | ||
1167 | * 2.3 == ... | ||
1168 | */ | ||
1169 | pps = ceph_stable_mod(pgid.seed, pool->pgp_num, | ||
1170 | pool->pgp_num_mask) + | ||
1171 | (unsigned)pgid.pool; | ||
1172 | } | ||
1153 | r = crush_do_rule(osdmap->crush, ruleno, pps, osds, | 1173 | r = crush_do_rule(osdmap->crush, ruleno, pps, osds, |
1154 | min_t(int, pool->v.size, *num), | 1174 | min_t(int, pool->size, *num), |
1155 | osdmap->osd_weight); | 1175 | osdmap->osd_weight); |
1156 | if (r < 0) { | 1176 | if (r < 0) { |
1157 | pr_err("error %d from crush rule: pool %d ruleset %d type %d" | 1177 | pr_err("error %d from crush rule: pool %lld ruleset %d type %d" |
1158 | " size %d\n", r, poolid, pool->v.crush_ruleset, | 1178 | " size %d\n", r, pgid.pool, pool->crush_ruleset, |
1159 | pool->v.type, pool->v.size); | 1179 | pool->type, pool->size); |
1160 | return NULL; | 1180 | return NULL; |
1161 | } | 1181 | } |
1162 | *num = r; | 1182 | *num = r; |
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index cd9c21df87d1..815a2249cfa9 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c | |||
@@ -12,7 +12,7 @@ | |||
12 | /* | 12 | /* |
13 | * build a vector of user pages | 13 | * build a vector of user pages |
14 | */ | 14 | */ |
15 | struct page **ceph_get_direct_page_vector(const char __user *data, | 15 | struct page **ceph_get_direct_page_vector(const void __user *data, |
16 | int num_pages, bool write_page) | 16 | int num_pages, bool write_page) |
17 | { | 17 | { |
18 | struct page **pages; | 18 | struct page **pages; |
@@ -93,7 +93,7 @@ EXPORT_SYMBOL(ceph_alloc_page_vector); | |||
93 | * copy user data into a page vector | 93 | * copy user data into a page vector |
94 | */ | 94 | */ |
95 | int ceph_copy_user_to_page_vector(struct page **pages, | 95 | int ceph_copy_user_to_page_vector(struct page **pages, |
96 | const char __user *data, | 96 | const void __user *data, |
97 | loff_t off, size_t len) | 97 | loff_t off, size_t len) |
98 | { | 98 | { |
99 | int i = 0; | 99 | int i = 0; |
@@ -118,17 +118,17 @@ int ceph_copy_user_to_page_vector(struct page **pages, | |||
118 | } | 118 | } |
119 | EXPORT_SYMBOL(ceph_copy_user_to_page_vector); | 119 | EXPORT_SYMBOL(ceph_copy_user_to_page_vector); |
120 | 120 | ||
121 | int ceph_copy_to_page_vector(struct page **pages, | 121 | void ceph_copy_to_page_vector(struct page **pages, |
122 | const char *data, | 122 | const void *data, |
123 | loff_t off, size_t len) | 123 | loff_t off, size_t len) |
124 | { | 124 | { |
125 | int i = 0; | 125 | int i = 0; |
126 | size_t po = off & ~PAGE_CACHE_MASK; | 126 | size_t po = off & ~PAGE_CACHE_MASK; |
127 | size_t left = len; | 127 | size_t left = len; |
128 | size_t l; | ||
129 | 128 | ||
130 | while (left > 0) { | 129 | while (left > 0) { |
131 | l = min_t(size_t, PAGE_CACHE_SIZE-po, left); | 130 | size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); |
131 | |||
132 | memcpy(page_address(pages[i]) + po, data, l); | 132 | memcpy(page_address(pages[i]) + po, data, l); |
133 | data += l; | 133 | data += l; |
134 | left -= l; | 134 | left -= l; |
@@ -138,21 +138,20 @@ int ceph_copy_to_page_vector(struct page **pages, | |||
138 | i++; | 138 | i++; |
139 | } | 139 | } |
140 | } | 140 | } |
141 | return len; | ||
142 | } | 141 | } |
143 | EXPORT_SYMBOL(ceph_copy_to_page_vector); | 142 | EXPORT_SYMBOL(ceph_copy_to_page_vector); |
144 | 143 | ||
145 | int ceph_copy_from_page_vector(struct page **pages, | 144 | void ceph_copy_from_page_vector(struct page **pages, |
146 | char *data, | 145 | void *data, |
147 | loff_t off, size_t len) | 146 | loff_t off, size_t len) |
148 | { | 147 | { |
149 | int i = 0; | 148 | int i = 0; |
150 | size_t po = off & ~PAGE_CACHE_MASK; | 149 | size_t po = off & ~PAGE_CACHE_MASK; |
151 | size_t left = len; | 150 | size_t left = len; |
152 | size_t l; | ||
153 | 151 | ||
154 | while (left > 0) { | 152 | while (left > 0) { |
155 | l = min_t(size_t, PAGE_CACHE_SIZE-po, left); | 153 | size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); |
154 | |||
156 | memcpy(data, page_address(pages[i]) + po, l); | 155 | memcpy(data, page_address(pages[i]) + po, l); |
157 | data += l; | 156 | data += l; |
158 | left -= l; | 157 | left -= l; |
@@ -162,7 +161,6 @@ int ceph_copy_from_page_vector(struct page **pages, | |||
162 | i++; | 161 | i++; |
163 | } | 162 | } |
164 | } | 163 | } |
165 | return len; | ||
166 | } | 164 | } |
167 | EXPORT_SYMBOL(ceph_copy_from_page_vector); | 165 | EXPORT_SYMBOL(ceph_copy_from_page_vector); |
168 | 166 | ||
@@ -170,7 +168,7 @@ EXPORT_SYMBOL(ceph_copy_from_page_vector); | |||
170 | * copy user data from a page vector into a user pointer | 168 | * copy user data from a page vector into a user pointer |
171 | */ | 169 | */ |
172 | int ceph_copy_page_vector_to_user(struct page **pages, | 170 | int ceph_copy_page_vector_to_user(struct page **pages, |
173 | char __user *data, | 171 | void __user *data, |
174 | loff_t off, size_t len) | 172 | loff_t off, size_t len) |
175 | { | 173 | { |
176 | int i = 0; | 174 | int i = 0; |