aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-bus-rbd20
-rw-r--r--drivers/block/rbd.c2858
-rw-r--r--fs/ceph/addr.c222
-rw-r--r--fs/ceph/caps.c33
-rw-r--r--fs/ceph/dir.c65
-rw-r--r--fs/ceph/file.c241
-rw-r--r--fs/ceph/inode.c59
-rw-r--r--fs/ceph/ioctl.c5
-rw-r--r--fs/ceph/mds_client.c79
-rw-r--r--fs/ceph/mdsmap.c8
-rw-r--r--fs/ceph/snap.c3
-rw-r--r--fs/ceph/super.c7
-rw-r--r--fs/ceph/super.h65
-rw-r--r--include/linux/ceph/auth.h18
-rw-r--r--include/linux/ceph/ceph_features.h2
-rw-r--r--include/linux/ceph/decode.h30
-rw-r--r--include/linux/ceph/libceph.h31
-rw-r--r--include/linux/ceph/messenger.h104
-rw-r--r--include/linux/ceph/msgr.h1
-rw-r--r--include/linux/ceph/osd_client.h204
-rw-r--r--include/linux/ceph/osdmap.h30
-rw-r--r--net/ceph/Makefile2
-rw-r--r--net/ceph/auth.c117
-rw-r--r--net/ceph/auth_x.c24
-rw-r--r--net/ceph/auth_x.h1
-rw-r--r--net/ceph/ceph_common.c7
-rw-r--r--net/ceph/debugfs.c4
-rw-r--r--net/ceph/messenger.c1019
-rw-r--r--net/ceph/mon_client.c7
-rw-r--r--net/ceph/osd_client.c1087
-rw-r--r--net/ceph/osdmap.c45
-rw-r--r--net/ceph/snapshot.c78
32 files changed, 4196 insertions, 2280 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd
index cd9213ccf3dc..0a306476424e 100644
--- a/Documentation/ABI/testing/sysfs-bus-rbd
+++ b/Documentation/ABI/testing/sysfs-bus-rbd
@@ -66,27 +66,7 @@ current_snap
66 66
67 The current snapshot for which the device is mapped. 67 The current snapshot for which the device is mapped.
68 68
69snap_*
70
71 A directory per each snapshot
72
73parent 69parent
74 70
75 Information identifying the pool, image, and snapshot id for 71 Information identifying the pool, image, and snapshot id for
76 the parent image in a layered rbd image (format 2 only). 72 the parent image in a layered rbd image (format 2 only).
77
78Entries under /sys/bus/rbd/devices/<dev-id>/snap_<snap-name>
79-------------------------------------------------------------
80
81snap_id
82
83 The rados internal snapshot id assigned for this snapshot
84
85snap_size
86
87 The size of the image when this snapshot was taken.
88
89snap_features
90
91 A hexadecimal encoding of the feature bits for this snapshot.
92
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b7b7a88d9f68..c2ca1818f335 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1,3 +1,4 @@
1
1/* 2/*
2 rbd.c -- Export ceph rados objects as a Linux block device 3 rbd.c -- Export ceph rados objects as a Linux block device
3 4
@@ -32,12 +33,14 @@
32#include <linux/ceph/mon_client.h> 33#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h> 34#include <linux/ceph/decode.h>
34#include <linux/parser.h> 35#include <linux/parser.h>
36#include <linux/bsearch.h>
35 37
36#include <linux/kernel.h> 38#include <linux/kernel.h>
37#include <linux/device.h> 39#include <linux/device.h>
38#include <linux/module.h> 40#include <linux/module.h>
39#include <linux/fs.h> 41#include <linux/fs.h>
40#include <linux/blkdev.h> 42#include <linux/blkdev.h>
43#include <linux/slab.h>
41 44
42#include "rbd_types.h" 45#include "rbd_types.h"
43 46
@@ -52,13 +55,6 @@
52#define SECTOR_SHIFT 9 55#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 56#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54 57
55/* It might be useful to have these defined elsewhere */
56
57#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
61
62#define RBD_DRV_NAME "rbd" 58#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)" 59#define RBD_DRV_NAME_LONG "rbd (rados block device)"
64 60
@@ -72,6 +68,8 @@
72 68
73#define RBD_SNAP_HEAD_NAME "-" 69#define RBD_SNAP_HEAD_NAME "-"
74 70
71#define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */
72
75/* This allows a single page to hold an image name sent by OSD */ 73/* This allows a single page to hold an image name sent by OSD */
76#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 74#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
77#define RBD_IMAGE_ID_LEN_MAX 64 75#define RBD_IMAGE_ID_LEN_MAX 64
@@ -80,11 +78,14 @@
80 78
81/* Feature bits */ 79/* Feature bits */
82 80
83#define RBD_FEATURE_LAYERING 1 81#define RBD_FEATURE_LAYERING (1<<0)
82#define RBD_FEATURE_STRIPINGV2 (1<<1)
83#define RBD_FEATURES_ALL \
84 (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
84 85
85/* Features supported by this (client software) implementation. */ 86/* Features supported by this (client software) implementation. */
86 87
87#define RBD_FEATURES_ALL (0) 88#define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL)
88 89
89/* 90/*
90 * An RBD device name will be "rbd#", where the "rbd" comes from 91 * An RBD device name will be "rbd#", where the "rbd" comes from
@@ -112,7 +113,8 @@ struct rbd_image_header {
112 char *snap_names; 113 char *snap_names;
113 u64 *snap_sizes; 114 u64 *snap_sizes;
114 115
115 u64 obj_version; 116 u64 stripe_unit;
117 u64 stripe_count;
116}; 118};
117 119
118/* 120/*
@@ -142,13 +144,13 @@ struct rbd_image_header {
142 */ 144 */
143struct rbd_spec { 145struct rbd_spec {
144 u64 pool_id; 146 u64 pool_id;
145 char *pool_name; 147 const char *pool_name;
146 148
147 char *image_id; 149 const char *image_id;
148 char *image_name; 150 const char *image_name;
149 151
150 u64 snap_id; 152 u64 snap_id;
151 char *snap_name; 153 const char *snap_name;
152 154
153 struct kref kref; 155 struct kref kref;
154}; 156};
@@ -174,13 +176,44 @@ enum obj_request_type {
174 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 176 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
175}; 177};
176 178
179enum obj_req_flags {
180 OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */
181 OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */
182 OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */
183 OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */
184};
185
177struct rbd_obj_request { 186struct rbd_obj_request {
178 const char *object_name; 187 const char *object_name;
179 u64 offset; /* object start byte */ 188 u64 offset; /* object start byte */
180 u64 length; /* bytes from offset */ 189 u64 length; /* bytes from offset */
190 unsigned long flags;
181 191
182 struct rbd_img_request *img_request; 192 /*
183 struct list_head links; /* img_request->obj_requests */ 193 * An object request associated with an image will have its
194 * img_data flag set; a standalone object request will not.
195 *
196 * A standalone object request will have which == BAD_WHICH
197 * and a null obj_request pointer.
198 *
199 * An object request initiated in support of a layered image
200 * object (to check for its existence before a write) will
201 * have which == BAD_WHICH and a non-null obj_request pointer.
202 *
203 * Finally, an object request for rbd image data will have
204 * which != BAD_WHICH, and will have a non-null img_request
205 * pointer. The value of which will be in the range
206 * 0..(img_request->obj_request_count-1).
207 */
208 union {
209 struct rbd_obj_request *obj_request; /* STAT op */
210 struct {
211 struct rbd_img_request *img_request;
212 u64 img_offset;
213 /* links for img_request->obj_requests list */
214 struct list_head links;
215 };
216 };
184 u32 which; /* posn image request list */ 217 u32 which; /* posn image request list */
185 218
186 enum obj_request_type type; 219 enum obj_request_type type;
@@ -191,13 +224,12 @@ struct rbd_obj_request {
191 u32 page_count; 224 u32 page_count;
192 }; 225 };
193 }; 226 };
227 struct page **copyup_pages;
194 228
195 struct ceph_osd_request *osd_req; 229 struct ceph_osd_request *osd_req;
196 230
197 u64 xferred; /* bytes transferred */ 231 u64 xferred; /* bytes transferred */
198 u64 version;
199 int result; 232 int result;
200 atomic_t done;
201 233
202 rbd_obj_callback_t callback; 234 rbd_obj_callback_t callback;
203 struct completion completion; 235 struct completion completion;
@@ -205,19 +237,31 @@ struct rbd_obj_request {
205 struct kref kref; 237 struct kref kref;
206}; 238};
207 239
240enum img_req_flags {
241 IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */
242 IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
243 IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
244};
245
208struct rbd_img_request { 246struct rbd_img_request {
209 struct request *rq;
210 struct rbd_device *rbd_dev; 247 struct rbd_device *rbd_dev;
211 u64 offset; /* starting image byte offset */ 248 u64 offset; /* starting image byte offset */
212 u64 length; /* byte count from offset */ 249 u64 length; /* byte count from offset */
213 bool write_request; /* false for read */ 250 unsigned long flags;
214 union { 251 union {
252 u64 snap_id; /* for reads */
215 struct ceph_snap_context *snapc; /* for writes */ 253 struct ceph_snap_context *snapc; /* for writes */
216 u64 snap_id; /* for reads */
217 }; 254 };
255 union {
256 struct request *rq; /* block request */
257 struct rbd_obj_request *obj_request; /* obj req initiator */
258 };
259 struct page **copyup_pages;
218 spinlock_t completion_lock;/* protects next_completion */ 260 spinlock_t completion_lock;/* protects next_completion */
219 u32 next_completion; 261 u32 next_completion;
220 rbd_img_callback_t callback; 262 rbd_img_callback_t callback;
263 u64 xferred;/* aggregate bytes transferred */
264 int result; /* first nonzero obj_request result */
221 265
222 u32 obj_request_count; 266 u32 obj_request_count;
223 struct list_head obj_requests; /* rbd_obj_request structs */ 267 struct list_head obj_requests; /* rbd_obj_request structs */
@@ -232,15 +276,6 @@ struct rbd_img_request {
232#define for_each_obj_request_safe(ireq, oreq, n) \ 276#define for_each_obj_request_safe(ireq, oreq, n) \
233 list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 277 list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
234 278
235struct rbd_snap {
236 struct device dev;
237 const char *name;
238 u64 size;
239 struct list_head node;
240 u64 id;
241 u64 features;
242};
243
244struct rbd_mapping { 279struct rbd_mapping {
245 u64 size; 280 u64 size;
246 u64 features; 281 u64 features;
@@ -276,6 +311,7 @@ struct rbd_device {
276 311
277 struct rbd_spec *parent_spec; 312 struct rbd_spec *parent_spec;
278 u64 parent_overlap; 313 u64 parent_overlap;
314 struct rbd_device *parent;
279 315
280 /* protects updating the header */ 316 /* protects updating the header */
281 struct rw_semaphore header_rwsem; 317 struct rw_semaphore header_rwsem;
@@ -284,9 +320,6 @@ struct rbd_device {
284 320
285 struct list_head node; 321 struct list_head node;
286 322
287 /* list of snapshots */
288 struct list_head snaps;
289
290 /* sysfs related */ 323 /* sysfs related */
291 struct device dev; 324 struct device dev;
292 unsigned long open_count; /* protected by lock */ 325 unsigned long open_count; /* protected by lock */
@@ -312,16 +345,21 @@ static DEFINE_SPINLOCK(rbd_dev_list_lock);
312static LIST_HEAD(rbd_client_list); /* clients */ 345static LIST_HEAD(rbd_client_list); /* clients */
313static DEFINE_SPINLOCK(rbd_client_list_lock); 346static DEFINE_SPINLOCK(rbd_client_list_lock);
314 347
315static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 348/* Slab caches for frequently-allocated structures */
316static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 349
350static struct kmem_cache *rbd_img_request_cache;
351static struct kmem_cache *rbd_obj_request_cache;
352static struct kmem_cache *rbd_segment_name_cache;
317 353
318static void rbd_dev_release(struct device *dev); 354static int rbd_img_request_submit(struct rbd_img_request *img_request);
319static void rbd_remove_snap_dev(struct rbd_snap *snap); 355
356static void rbd_dev_device_release(struct device *dev);
320 357
321static ssize_t rbd_add(struct bus_type *bus, const char *buf, 358static ssize_t rbd_add(struct bus_type *bus, const char *buf,
322 size_t count); 359 size_t count);
323static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 360static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
324 size_t count); 361 size_t count);
362static int rbd_dev_image_probe(struct rbd_device *rbd_dev);
325 363
326static struct bus_attribute rbd_bus_attrs[] = { 364static struct bus_attribute rbd_bus_attrs[] = {
327 __ATTR(add, S_IWUSR, NULL, rbd_add), 365 __ATTR(add, S_IWUSR, NULL, rbd_add),
@@ -383,8 +421,19 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
383# define rbd_assert(expr) ((void) 0) 421# define rbd_assert(expr) ((void) 0)
384#endif /* !RBD_DEBUG */ 422#endif /* !RBD_DEBUG */
385 423
386static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); 424static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
387static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); 425static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
426static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
427
428static int rbd_dev_refresh(struct rbd_device *rbd_dev);
429static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev);
430static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
431 u64 snap_id);
432static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
433 u8 *order, u64 *snap_size);
434static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
435 u64 *snap_features);
436static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
388 437
389static int rbd_open(struct block_device *bdev, fmode_t mode) 438static int rbd_open(struct block_device *bdev, fmode_t mode)
390{ 439{
@@ -484,6 +533,13 @@ out_opt:
484 return ERR_PTR(ret); 533 return ERR_PTR(ret);
485} 534}
486 535
536static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
537{
538 kref_get(&rbdc->kref);
539
540 return rbdc;
541}
542
487/* 543/*
488 * Find a ceph client with specific addr and configuration. If 544 * Find a ceph client with specific addr and configuration. If
489 * found, bump its reference count. 545 * found, bump its reference count.
@@ -499,7 +555,8 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
499 spin_lock(&rbd_client_list_lock); 555 spin_lock(&rbd_client_list_lock);
500 list_for_each_entry(client_node, &rbd_client_list, node) { 556 list_for_each_entry(client_node, &rbd_client_list, node) {
501 if (!ceph_compare_options(ceph_opts, client_node->client)) { 557 if (!ceph_compare_options(ceph_opts, client_node->client)) {
502 kref_get(&client_node->kref); 558 __rbd_get_client(client_node);
559
503 found = true; 560 found = true;
504 break; 561 break;
505 } 562 }
@@ -722,7 +779,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
722 header->snap_sizes[i] = 779 header->snap_sizes[i] =
723 le64_to_cpu(ondisk->snaps[i].image_size); 780 le64_to_cpu(ondisk->snaps[i].image_size);
724 } else { 781 } else {
725 WARN_ON(ondisk->snap_names_len);
726 header->snap_names = NULL; 782 header->snap_names = NULL;
727 header->snap_sizes = NULL; 783 header->snap_sizes = NULL;
728 } 784 }
@@ -735,18 +791,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
735 /* Allocate and fill in the snapshot context */ 791 /* Allocate and fill in the snapshot context */
736 792
737 header->image_size = le64_to_cpu(ondisk->image_size); 793 header->image_size = le64_to_cpu(ondisk->image_size);
738 size = sizeof (struct ceph_snap_context); 794
739 size += snap_count * sizeof (header->snapc->snaps[0]); 795 header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
740 header->snapc = kzalloc(size, GFP_KERNEL);
741 if (!header->snapc) 796 if (!header->snapc)
742 goto out_err; 797 goto out_err;
743
744 atomic_set(&header->snapc->nref, 1);
745 header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 798 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
746 header->snapc->num_snaps = snap_count;
747 for (i = 0; i < snap_count; i++) 799 for (i = 0; i < snap_count; i++)
748 header->snapc->snaps[i] = 800 header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);
749 le64_to_cpu(ondisk->snaps[i].id);
750 801
751 return 0; 802 return 0;
752 803
@@ -761,70 +812,174 @@ out_err:
761 return -ENOMEM; 812 return -ENOMEM;
762} 813}
763 814
764static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 815static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
765{ 816{
766 struct rbd_snap *snap; 817 const char *snap_name;
767 818
819 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
820
821 /* Skip over names until we find the one we are looking for */
822
823 snap_name = rbd_dev->header.snap_names;
824 while (which--)
825 snap_name += strlen(snap_name) + 1;
826
827 return kstrdup(snap_name, GFP_KERNEL);
828}
829
830/*
831 * Snapshot id comparison function for use with qsort()/bsearch().
832 * Note that result is for snapshots in *descending* order.
833 */
834static int snapid_compare_reverse(const void *s1, const void *s2)
835{
836 u64 snap_id1 = *(u64 *)s1;
837 u64 snap_id2 = *(u64 *)s2;
838
839 if (snap_id1 < snap_id2)
840 return 1;
841 return snap_id1 == snap_id2 ? 0 : -1;
842}
843
844/*
845 * Search a snapshot context to see if the given snapshot id is
846 * present.
847 *
848 * Returns the position of the snapshot id in the array if it's found,
849 * or BAD_SNAP_INDEX otherwise.
850 *
851 * Note: The snapshot array is in kept sorted (by the osd) in
852 * reverse order, highest snapshot id first.
853 */
854static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
855{
856 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
857 u64 *found;
858
859 found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
860 sizeof (snap_id), snapid_compare_reverse);
861
862 return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
863}
864
865static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
866 u64 snap_id)
867{
868 u32 which;
869
870 which = rbd_dev_snap_index(rbd_dev, snap_id);
871 if (which == BAD_SNAP_INDEX)
872 return NULL;
873
874 return _rbd_dev_v1_snap_name(rbd_dev, which);
875}
876
877static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
878{
768 if (snap_id == CEPH_NOSNAP) 879 if (snap_id == CEPH_NOSNAP)
769 return RBD_SNAP_HEAD_NAME; 880 return RBD_SNAP_HEAD_NAME;
770 881
771 list_for_each_entry(snap, &rbd_dev->snaps, node) 882 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
772 if (snap_id == snap->id) 883 if (rbd_dev->image_format == 1)
773 return snap->name; 884 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
774 885
775 return NULL; 886 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
776} 887}
777 888
778static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) 889static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
890 u64 *snap_size)
779{ 891{
892 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
893 if (snap_id == CEPH_NOSNAP) {
894 *snap_size = rbd_dev->header.image_size;
895 } else if (rbd_dev->image_format == 1) {
896 u32 which;
780 897
781 struct rbd_snap *snap; 898 which = rbd_dev_snap_index(rbd_dev, snap_id);
899 if (which == BAD_SNAP_INDEX)
900 return -ENOENT;
782 901
783 list_for_each_entry(snap, &rbd_dev->snaps, node) { 902 *snap_size = rbd_dev->header.snap_sizes[which];
784 if (!strcmp(snap_name, snap->name)) { 903 } else {
785 rbd_dev->spec->snap_id = snap->id; 904 u64 size = 0;
786 rbd_dev->mapping.size = snap->size; 905 int ret;
787 rbd_dev->mapping.features = snap->features;
788 906
789 return 0; 907 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
790 } 908 if (ret)
909 return ret;
910
911 *snap_size = size;
791 } 912 }
913 return 0;
914}
915
916static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
917 u64 *snap_features)
918{
919 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
920 if (snap_id == CEPH_NOSNAP) {
921 *snap_features = rbd_dev->header.features;
922 } else if (rbd_dev->image_format == 1) {
923 *snap_features = 0; /* No features for format 1 */
924 } else {
925 u64 features = 0;
926 int ret;
927
928 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
929 if (ret)
930 return ret;
792 931
793 return -ENOENT; 932 *snap_features = features;
933 }
934 return 0;
794} 935}
795 936
796static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) 937static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
797{ 938{
939 const char *snap_name = rbd_dev->spec->snap_name;
940 u64 snap_id;
941 u64 size = 0;
942 u64 features = 0;
798 int ret; 943 int ret;
799 944
800 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, 945 if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) {
801 sizeof (RBD_SNAP_HEAD_NAME))) { 946 snap_id = rbd_snap_id_by_name(rbd_dev, snap_name);
802 rbd_dev->spec->snap_id = CEPH_NOSNAP; 947 if (snap_id == CEPH_NOSNAP)
803 rbd_dev->mapping.size = rbd_dev->header.image_size; 948 return -ENOENT;
804 rbd_dev->mapping.features = rbd_dev->header.features;
805 ret = 0;
806 } else { 949 } else {
807 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); 950 snap_id = CEPH_NOSNAP;
808 if (ret < 0)
809 goto done;
810 rbd_dev->mapping.read_only = true;
811 } 951 }
812 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
813 952
814done: 953 ret = rbd_snap_size(rbd_dev, snap_id, &size);
815 return ret; 954 if (ret)
955 return ret;
956 ret = rbd_snap_features(rbd_dev, snap_id, &features);
957 if (ret)
958 return ret;
959
960 rbd_dev->mapping.size = size;
961 rbd_dev->mapping.features = features;
962
963 /* If we are mapping a snapshot it must be marked read-only */
964
965 if (snap_id != CEPH_NOSNAP)
966 rbd_dev->mapping.read_only = true;
967
968 return 0;
816} 969}
817 970
818static void rbd_header_free(struct rbd_image_header *header) 971static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
819{ 972{
820 kfree(header->object_prefix); 973 rbd_dev->mapping.size = 0;
821 header->object_prefix = NULL; 974 rbd_dev->mapping.features = 0;
822 kfree(header->snap_sizes); 975 rbd_dev->mapping.read_only = true;
823 header->snap_sizes = NULL; 976}
824 kfree(header->snap_names); 977
825 header->snap_names = NULL; 978static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev)
826 ceph_put_snap_context(header->snapc); 979{
827 header->snapc = NULL; 980 rbd_dev->mapping.size = 0;
981 rbd_dev->mapping.features = 0;
982 rbd_dev->mapping.read_only = true;
828} 983}
829 984
830static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 985static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
@@ -833,7 +988,7 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
833 u64 segment; 988 u64 segment;
834 int ret; 989 int ret;
835 990
836 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); 991 name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
837 if (!name) 992 if (!name)
838 return NULL; 993 return NULL;
839 segment = offset >> rbd_dev->header.obj_order; 994 segment = offset >> rbd_dev->header.obj_order;
@@ -849,6 +1004,13 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
849 return name; 1004 return name;
850} 1005}
851 1006
1007static void rbd_segment_name_free(const char *name)
1008{
1009 /* The explicit cast here is needed to drop the const qualifier */
1010
1011 kmem_cache_free(rbd_segment_name_cache, (void *)name);
1012}
1013
852static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 1014static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
853{ 1015{
854 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 1016 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
@@ -921,6 +1083,37 @@ static void zero_bio_chain(struct bio *chain, int start_ofs)
921} 1083}
922 1084
923/* 1085/*
1086 * similar to zero_bio_chain(), zeros data defined by a page array,
1087 * starting at the given byte offset from the start of the array and
1088 * continuing up to the given end offset. The pages array is
1089 * assumed to be big enough to hold all bytes up to the end.
1090 */
1091static void zero_pages(struct page **pages, u64 offset, u64 end)
1092{
1093 struct page **page = &pages[offset >> PAGE_SHIFT];
1094
1095 rbd_assert(end > offset);
1096 rbd_assert(end - offset <= (u64)SIZE_MAX);
1097 while (offset < end) {
1098 size_t page_offset;
1099 size_t length;
1100 unsigned long flags;
1101 void *kaddr;
1102
1103 page_offset = (size_t)(offset & ~PAGE_MASK);
1104 length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
1105 local_irq_save(flags);
1106 kaddr = kmap_atomic(*page);
1107 memset(kaddr + page_offset, 0, length);
1108 kunmap_atomic(kaddr);
1109 local_irq_restore(flags);
1110
1111 offset += length;
1112 page++;
1113 }
1114}
1115
1116/*
924 * Clone a portion of a bio, starting at the given byte offset 1117 * Clone a portion of a bio, starting at the given byte offset
925 * and continuing for the number of bytes indicated. 1118 * and continuing for the number of bytes indicated.
926 */ 1119 */
@@ -1064,6 +1257,77 @@ out_err:
1064 return NULL; 1257 return NULL;
1065} 1258}
1066 1259
1260/*
1261 * The default/initial value for all object request flags is 0. For
1262 * each flag, once its value is set to 1 it is never reset to 0
1263 * again.
1264 */
1265static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1266{
1267 if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
1268 struct rbd_device *rbd_dev;
1269
1270 rbd_dev = obj_request->img_request->rbd_dev;
1271 rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1272 obj_request);
1273 }
1274}
1275
1276static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1277{
1278 smp_mb();
1279 return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1280}
1281
1282static void obj_request_done_set(struct rbd_obj_request *obj_request)
1283{
1284 if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1285 struct rbd_device *rbd_dev = NULL;
1286
1287 if (obj_request_img_data_test(obj_request))
1288 rbd_dev = obj_request->img_request->rbd_dev;
1289 rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1290 obj_request);
1291 }
1292}
1293
1294static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1295{
1296 smp_mb();
1297 return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1298}
1299
1300/*
1301 * This sets the KNOWN flag after (possibly) setting the EXISTS
1302 * flag. The latter is set based on the "exists" value provided.
1303 *
1304 * Note that for our purposes once an object exists it never goes
1305 * away again. It's possible that the response from two existence
1306 * checks are separated by the creation of the target object, and
1307 * the first ("doesn't exist") response arrives *after* the second
1308 * ("does exist"). In that case we ignore the second one.
1309 */
1310static void obj_request_existence_set(struct rbd_obj_request *obj_request,
1311 bool exists)
1312{
1313 if (exists)
1314 set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
1315 set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
1316 smp_mb();
1317}
1318
1319static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1320{
1321 smp_mb();
1322 return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1323}
1324
1325static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1326{
1327 smp_mb();
1328 return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1329}
1330
1067static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1331static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1068{ 1332{
1069 dout("%s: obj %p (was %d)\n", __func__, obj_request, 1333 dout("%s: obj %p (was %d)\n", __func__, obj_request,
@@ -1101,9 +1365,11 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1101{ 1365{
1102 rbd_assert(obj_request->img_request == NULL); 1366 rbd_assert(obj_request->img_request == NULL);
1103 1367
1104 rbd_obj_request_get(obj_request); 1368 /* Image request now owns object's original reference */
1105 obj_request->img_request = img_request; 1369 obj_request->img_request = img_request;
1106 obj_request->which = img_request->obj_request_count; 1370 obj_request->which = img_request->obj_request_count;
1371 rbd_assert(!obj_request_img_data_test(obj_request));
1372 obj_request_img_data_set(obj_request);
1107 rbd_assert(obj_request->which != BAD_WHICH); 1373 rbd_assert(obj_request->which != BAD_WHICH);
1108 img_request->obj_request_count++; 1374 img_request->obj_request_count++;
1109 list_add_tail(&obj_request->links, &img_request->obj_requests); 1375 list_add_tail(&obj_request->links, &img_request->obj_requests);
@@ -1123,6 +1389,7 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1123 img_request->obj_request_count--; 1389 img_request->obj_request_count--;
1124 rbd_assert(obj_request->which == img_request->obj_request_count); 1390 rbd_assert(obj_request->which == img_request->obj_request_count);
1125 obj_request->which = BAD_WHICH; 1391 obj_request->which = BAD_WHICH;
1392 rbd_assert(obj_request_img_data_test(obj_request));
1126 rbd_assert(obj_request->img_request == img_request); 1393 rbd_assert(obj_request->img_request == img_request);
1127 obj_request->img_request = NULL; 1394 obj_request->img_request = NULL;
1128 obj_request->callback = NULL; 1395 obj_request->callback = NULL;
@@ -1141,76 +1408,6 @@ static bool obj_request_type_valid(enum obj_request_type type)
1141 } 1408 }
1142} 1409}
1143 1410
1144static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1145{
1146 struct ceph_osd_req_op *op;
1147 va_list args;
1148 size_t size;
1149
1150 op = kzalloc(sizeof (*op), GFP_NOIO);
1151 if (!op)
1152 return NULL;
1153 op->op = opcode;
1154 va_start(args, opcode);
1155 switch (opcode) {
1156 case CEPH_OSD_OP_READ:
1157 case CEPH_OSD_OP_WRITE:
1158 /* rbd_osd_req_op_create(READ, offset, length) */
1159 /* rbd_osd_req_op_create(WRITE, offset, length) */
1160 op->extent.offset = va_arg(args, u64);
1161 op->extent.length = va_arg(args, u64);
1162 if (opcode == CEPH_OSD_OP_WRITE)
1163 op->payload_len = op->extent.length;
1164 break;
1165 case CEPH_OSD_OP_STAT:
1166 break;
1167 case CEPH_OSD_OP_CALL:
1168 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1169 op->cls.class_name = va_arg(args, char *);
1170 size = strlen(op->cls.class_name);
1171 rbd_assert(size <= (size_t) U8_MAX);
1172 op->cls.class_len = size;
1173 op->payload_len = size;
1174
1175 op->cls.method_name = va_arg(args, char *);
1176 size = strlen(op->cls.method_name);
1177 rbd_assert(size <= (size_t) U8_MAX);
1178 op->cls.method_len = size;
1179 op->payload_len += size;
1180
1181 op->cls.argc = 0;
1182 op->cls.indata = va_arg(args, void *);
1183 size = va_arg(args, size_t);
1184 rbd_assert(size <= (size_t) U32_MAX);
1185 op->cls.indata_len = (u32) size;
1186 op->payload_len += size;
1187 break;
1188 case CEPH_OSD_OP_NOTIFY_ACK:
1189 case CEPH_OSD_OP_WATCH:
1190 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1191 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1192 op->watch.cookie = va_arg(args, u64);
1193 op->watch.ver = va_arg(args, u64);
1194 op->watch.ver = cpu_to_le64(op->watch.ver);
1195 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1196 op->watch.flag = (u8) 1;
1197 break;
1198 default:
1199 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1200 kfree(op);
1201 op = NULL;
1202 break;
1203 }
1204 va_end(args);
1205
1206 return op;
1207}
1208
1209static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
1210{
1211 kfree(op);
1212}
1213
1214static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1411static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1215 struct rbd_obj_request *obj_request) 1412 struct rbd_obj_request *obj_request)
1216{ 1413{
@@ -1221,7 +1418,24 @@ static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1221 1418
1222static void rbd_img_request_complete(struct rbd_img_request *img_request) 1419static void rbd_img_request_complete(struct rbd_img_request *img_request)
1223{ 1420{
1421
1224 dout("%s: img %p\n", __func__, img_request); 1422 dout("%s: img %p\n", __func__, img_request);
1423
1424 /*
1425 * If no error occurred, compute the aggregate transfer
1426 * count for the image request. We could instead use
1427 * atomic64_cmpxchg() to update it as each object request
1428 * completes; not clear which way is better off hand.
1429 */
1430 if (!img_request->result) {
1431 struct rbd_obj_request *obj_request;
1432 u64 xferred = 0;
1433
1434 for_each_obj_request(img_request, obj_request)
1435 xferred += obj_request->xferred;
1436 img_request->xferred = xferred;
1437 }
1438
1225 if (img_request->callback) 1439 if (img_request->callback)
1226 img_request->callback(img_request); 1440 img_request->callback(img_request);
1227 else 1441 else
@@ -1237,39 +1451,56 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1237 return wait_for_completion_interruptible(&obj_request->completion); 1451 return wait_for_completion_interruptible(&obj_request->completion);
1238} 1452}
1239 1453
1240static void obj_request_done_init(struct rbd_obj_request *obj_request) 1454/*
1455 * The default/initial value for all image request flags is 0. Each
1456 * is conditionally set to 1 at image request initialization time
1457 * and currently never change thereafter.
1458 */
1459static void img_request_write_set(struct rbd_img_request *img_request)
1241{ 1460{
1242 atomic_set(&obj_request->done, 0); 1461 set_bit(IMG_REQ_WRITE, &img_request->flags);
1243 smp_wmb(); 1462 smp_mb();
1244} 1463}
1245 1464
1246static void obj_request_done_set(struct rbd_obj_request *obj_request) 1465static bool img_request_write_test(struct rbd_img_request *img_request)
1247{ 1466{
1248 int done; 1467 smp_mb();
1468 return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1469}
1249 1470
1250 done = atomic_inc_return(&obj_request->done); 1471static void img_request_child_set(struct rbd_img_request *img_request)
1251 if (done > 1) { 1472{
1252 struct rbd_img_request *img_request = obj_request->img_request; 1473 set_bit(IMG_REQ_CHILD, &img_request->flags);
1253 struct rbd_device *rbd_dev; 1474 smp_mb();
1475}
1254 1476
1255 rbd_dev = img_request ? img_request->rbd_dev : NULL; 1477static bool img_request_child_test(struct rbd_img_request *img_request)
1256 rbd_warn(rbd_dev, "obj_request %p was already done\n", 1478{
1257 obj_request); 1479 smp_mb();
1258 } 1480 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1259} 1481}
1260 1482
1261static bool obj_request_done_test(struct rbd_obj_request *obj_request) 1483static void img_request_layered_set(struct rbd_img_request *img_request)
1484{
1485 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1486 smp_mb();
1487}
1488
1489static bool img_request_layered_test(struct rbd_img_request *img_request)
1262{ 1490{
1263 smp_mb(); 1491 smp_mb();
1264 return atomic_read(&obj_request->done) != 0; 1492 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1265} 1493}
1266 1494
1267static void 1495static void
1268rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 1496rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1269{ 1497{
1498 u64 xferred = obj_request->xferred;
1499 u64 length = obj_request->length;
1500
1270 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 1501 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1271 obj_request, obj_request->img_request, obj_request->result, 1502 obj_request, obj_request->img_request, obj_request->result,
1272 obj_request->xferred, obj_request->length); 1503 xferred, length);
1273 /* 1504 /*
1274 * ENOENT means a hole in the image. We zero-fill the 1505 * ENOENT means a hole in the image. We zero-fill the
1275 * entire length of the request. A short read also implies 1506 * entire length of the request. A short read also implies
@@ -1277,15 +1508,20 @@ rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1277 * update the xferred count to indicate the whole request 1508 * update the xferred count to indicate the whole request
1278 * was satisfied. 1509 * was satisfied.
1279 */ 1510 */
1280 BUG_ON(obj_request->type != OBJ_REQUEST_BIO); 1511 rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
1281 if (obj_request->result == -ENOENT) { 1512 if (obj_request->result == -ENOENT) {
1282 zero_bio_chain(obj_request->bio_list, 0); 1513 if (obj_request->type == OBJ_REQUEST_BIO)
1514 zero_bio_chain(obj_request->bio_list, 0);
1515 else
1516 zero_pages(obj_request->pages, 0, length);
1283 obj_request->result = 0; 1517 obj_request->result = 0;
1284 obj_request->xferred = obj_request->length; 1518 obj_request->xferred = length;
1285 } else if (obj_request->xferred < obj_request->length && 1519 } else if (xferred < length && !obj_request->result) {
1286 !obj_request->result) { 1520 if (obj_request->type == OBJ_REQUEST_BIO)
1287 zero_bio_chain(obj_request->bio_list, obj_request->xferred); 1521 zero_bio_chain(obj_request->bio_list, xferred);
1288 obj_request->xferred = obj_request->length; 1522 else
1523 zero_pages(obj_request->pages, xferred, length);
1524 obj_request->xferred = length;
1289 } 1525 }
1290 obj_request_done_set(obj_request); 1526 obj_request_done_set(obj_request);
1291} 1527}
@@ -1308,9 +1544,23 @@ static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
1308 1544
1309static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1545static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1310{ 1546{
1311 dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, 1547 struct rbd_img_request *img_request = NULL;
1312 obj_request->result, obj_request->xferred, obj_request->length); 1548 struct rbd_device *rbd_dev = NULL;
1313 if (obj_request->img_request) 1549 bool layered = false;
1550
1551 if (obj_request_img_data_test(obj_request)) {
1552 img_request = obj_request->img_request;
1553 layered = img_request && img_request_layered_test(img_request);
1554 rbd_dev = img_request->rbd_dev;
1555 }
1556
1557 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1558 obj_request, img_request, obj_request->result,
1559 obj_request->xferred, obj_request->length);
1560 if (layered && obj_request->result == -ENOENT &&
1561 obj_request->img_offset < rbd_dev->parent_overlap)
1562 rbd_img_parent_read(obj_request);
1563 else if (img_request)
1314 rbd_img_obj_request_read_callback(obj_request); 1564 rbd_img_obj_request_read_callback(obj_request);
1315 else 1565 else
1316 obj_request_done_set(obj_request); 1566 obj_request_done_set(obj_request);
@@ -1321,9 +1571,8 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1321 dout("%s: obj %p result %d %llu\n", __func__, obj_request, 1571 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1322 obj_request->result, obj_request->length); 1572 obj_request->result, obj_request->length);
1323 /* 1573 /*
1324 * There is no such thing as a successful short write. 1574 * There is no such thing as a successful short write. Set
1325 * Our xferred value is the number of bytes transferred 1575 * it to our originally-requested length.
1326 * back. Set it to our originally-requested length.
1327 */ 1576 */
1328 obj_request->xferred = obj_request->length; 1577 obj_request->xferred = obj_request->length;
1329 obj_request_done_set(obj_request); 1578 obj_request_done_set(obj_request);
@@ -1347,22 +1596,25 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1347 1596
1348 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); 1597 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1349 rbd_assert(osd_req == obj_request->osd_req); 1598 rbd_assert(osd_req == obj_request->osd_req);
1350 rbd_assert(!!obj_request->img_request ^ 1599 if (obj_request_img_data_test(obj_request)) {
1351 (obj_request->which == BAD_WHICH)); 1600 rbd_assert(obj_request->img_request);
1601 rbd_assert(obj_request->which != BAD_WHICH);
1602 } else {
1603 rbd_assert(obj_request->which == BAD_WHICH);
1604 }
1352 1605
1353 if (osd_req->r_result < 0) 1606 if (osd_req->r_result < 0)
1354 obj_request->result = osd_req->r_result; 1607 obj_request->result = osd_req->r_result;
1355 obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1356 1608
1357 WARN_ON(osd_req->r_num_ops != 1); /* For now */ 1609 BUG_ON(osd_req->r_num_ops > 2);
1358 1610
1359 /* 1611 /*
1360 * We support a 64-bit length, but ultimately it has to be 1612 * We support a 64-bit length, but ultimately it has to be
1361 * passed to blk_end_request(), which takes an unsigned int. 1613 * passed to blk_end_request(), which takes an unsigned int.
1362 */ 1614 */
1363 obj_request->xferred = osd_req->r_reply_op_len[0]; 1615 obj_request->xferred = osd_req->r_reply_op_len[0];
1364 rbd_assert(obj_request->xferred < (u64) UINT_MAX); 1616 rbd_assert(obj_request->xferred < (u64)UINT_MAX);
1365 opcode = osd_req->r_request_ops[0].op; 1617 opcode = osd_req->r_ops[0].op;
1366 switch (opcode) { 1618 switch (opcode) {
1367 case CEPH_OSD_OP_READ: 1619 case CEPH_OSD_OP_READ:
1368 rbd_osd_read_callback(obj_request); 1620 rbd_osd_read_callback(obj_request);
@@ -1388,28 +1640,49 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1388 rbd_obj_request_complete(obj_request); 1640 rbd_obj_request_complete(obj_request);
1389} 1641}
1390 1642
1643static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1644{
1645 struct rbd_img_request *img_request = obj_request->img_request;
1646 struct ceph_osd_request *osd_req = obj_request->osd_req;
1647 u64 snap_id;
1648
1649 rbd_assert(osd_req != NULL);
1650
1651 snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
1652 ceph_osdc_build_request(osd_req, obj_request->offset,
1653 NULL, snap_id, NULL);
1654}
1655
1656static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1657{
1658 struct rbd_img_request *img_request = obj_request->img_request;
1659 struct ceph_osd_request *osd_req = obj_request->osd_req;
1660 struct ceph_snap_context *snapc;
1661 struct timespec mtime = CURRENT_TIME;
1662
1663 rbd_assert(osd_req != NULL);
1664
1665 snapc = img_request ? img_request->snapc : NULL;
1666 ceph_osdc_build_request(osd_req, obj_request->offset,
1667 snapc, CEPH_NOSNAP, &mtime);
1668}
1669
1391static struct ceph_osd_request *rbd_osd_req_create( 1670static struct ceph_osd_request *rbd_osd_req_create(
1392 struct rbd_device *rbd_dev, 1671 struct rbd_device *rbd_dev,
1393 bool write_request, 1672 bool write_request,
1394 struct rbd_obj_request *obj_request, 1673 struct rbd_obj_request *obj_request)
1395 struct ceph_osd_req_op *op)
1396{ 1674{
1397 struct rbd_img_request *img_request = obj_request->img_request;
1398 struct ceph_snap_context *snapc = NULL; 1675 struct ceph_snap_context *snapc = NULL;
1399 struct ceph_osd_client *osdc; 1676 struct ceph_osd_client *osdc;
1400 struct ceph_osd_request *osd_req; 1677 struct ceph_osd_request *osd_req;
1401 struct timespec now;
1402 struct timespec *mtime;
1403 u64 snap_id = CEPH_NOSNAP;
1404 u64 offset = obj_request->offset;
1405 u64 length = obj_request->length;
1406 1678
1407 if (img_request) { 1679 if (obj_request_img_data_test(obj_request)) {
1408 rbd_assert(img_request->write_request == write_request); 1680 struct rbd_img_request *img_request = obj_request->img_request;
1409 if (img_request->write_request) 1681
1682 rbd_assert(write_request ==
1683 img_request_write_test(img_request));
1684 if (write_request)
1410 snapc = img_request->snapc; 1685 snapc = img_request->snapc;
1411 else
1412 snap_id = img_request->snap_id;
1413 } 1686 }
1414 1687
1415 /* Allocate and initialize the request, for the single op */ 1688 /* Allocate and initialize the request, for the single op */
@@ -1419,31 +1692,10 @@ static struct ceph_osd_request *rbd_osd_req_create(
1419 if (!osd_req) 1692 if (!osd_req)
1420 return NULL; /* ENOMEM */ 1693 return NULL; /* ENOMEM */
1421 1694
1422 rbd_assert(obj_request_type_valid(obj_request->type)); 1695 if (write_request)
1423 switch (obj_request->type) {
1424 case OBJ_REQUEST_NODATA:
1425 break; /* Nothing to do */
1426 case OBJ_REQUEST_BIO:
1427 rbd_assert(obj_request->bio_list != NULL);
1428 osd_req->r_bio = obj_request->bio_list;
1429 break;
1430 case OBJ_REQUEST_PAGES:
1431 osd_req->r_pages = obj_request->pages;
1432 osd_req->r_num_pages = obj_request->page_count;
1433 osd_req->r_page_alignment = offset & ~PAGE_MASK;
1434 break;
1435 }
1436
1437 if (write_request) {
1438 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1696 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1439 now = CURRENT_TIME; 1697 else
1440 mtime = &now;
1441 } else {
1442 osd_req->r_flags = CEPH_OSD_FLAG_READ; 1698 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1443 mtime = NULL; /* not needed for reads */
1444 offset = 0; /* These are not used... */
1445 length = 0; /* ...for osd read requests */
1446 }
1447 1699
1448 osd_req->r_callback = rbd_osd_req_callback; 1700 osd_req->r_callback = rbd_osd_req_callback;
1449 osd_req->r_priv = obj_request; 1701 osd_req->r_priv = obj_request;
@@ -1454,14 +1706,51 @@ static struct ceph_osd_request *rbd_osd_req_create(
1454 1706
1455 osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1707 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1456 1708
1457 /* osd_req will get its own reference to snapc (if non-null) */ 1709 return osd_req;
1710}
1458 1711
1459 ceph_osdc_build_request(osd_req, offset, length, 1, op, 1712/*
1460 snapc, snap_id, mtime); 1713 * Create a copyup osd request based on the information in the
1714 * object request supplied. A copyup request has two osd ops,
1715 * a copyup method call, and a "normal" write request.
1716 */
1717static struct ceph_osd_request *
1718rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1719{
1720 struct rbd_img_request *img_request;
1721 struct ceph_snap_context *snapc;
1722 struct rbd_device *rbd_dev;
1723 struct ceph_osd_client *osdc;
1724 struct ceph_osd_request *osd_req;
1725
1726 rbd_assert(obj_request_img_data_test(obj_request));
1727 img_request = obj_request->img_request;
1728 rbd_assert(img_request);
1729 rbd_assert(img_request_write_test(img_request));
1730
1731 /* Allocate and initialize the request, for the two ops */
1732
1733 snapc = img_request->snapc;
1734 rbd_dev = img_request->rbd_dev;
1735 osdc = &rbd_dev->rbd_client->client->osdc;
1736 osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1737 if (!osd_req)
1738 return NULL; /* ENOMEM */
1739
1740 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1741 osd_req->r_callback = rbd_osd_req_callback;
1742 osd_req->r_priv = obj_request;
1743
1744 osd_req->r_oid_len = strlen(obj_request->object_name);
1745 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1746 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1747
1748 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1461 1749
1462 return osd_req; 1750 return osd_req;
1463} 1751}
1464 1752
1753
1465static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1754static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1466{ 1755{
1467 ceph_osdc_put_request(osd_req); 1756 ceph_osdc_put_request(osd_req);
@@ -1480,18 +1769,23 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1480 rbd_assert(obj_request_type_valid(type)); 1769 rbd_assert(obj_request_type_valid(type));
1481 1770
1482 size = strlen(object_name) + 1; 1771 size = strlen(object_name) + 1;
1483 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); 1772 name = kmalloc(size, GFP_KERNEL);
1484 if (!obj_request) 1773 if (!name)
1485 return NULL; 1774 return NULL;
1486 1775
1487 name = (char *)(obj_request + 1); 1776 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1777 if (!obj_request) {
1778 kfree(name);
1779 return NULL;
1780 }
1781
1488 obj_request->object_name = memcpy(name, object_name, size); 1782 obj_request->object_name = memcpy(name, object_name, size);
1489 obj_request->offset = offset; 1783 obj_request->offset = offset;
1490 obj_request->length = length; 1784 obj_request->length = length;
1785 obj_request->flags = 0;
1491 obj_request->which = BAD_WHICH; 1786 obj_request->which = BAD_WHICH;
1492 obj_request->type = type; 1787 obj_request->type = type;
1493 INIT_LIST_HEAD(&obj_request->links); 1788 INIT_LIST_HEAD(&obj_request->links);
1494 obj_request_done_init(obj_request);
1495 init_completion(&obj_request->completion); 1789 init_completion(&obj_request->completion);
1496 kref_init(&obj_request->kref); 1790 kref_init(&obj_request->kref);
1497 1791
@@ -1530,7 +1824,9 @@ static void rbd_obj_request_destroy(struct kref *kref)
1530 break; 1824 break;
1531 } 1825 }
1532 1826
1533 kfree(obj_request); 1827 kfree(obj_request->object_name);
1828 obj_request->object_name = NULL;
1829 kmem_cache_free(rbd_obj_request_cache, obj_request);
1534} 1830}
1535 1831
1536/* 1832/*
@@ -1541,37 +1837,40 @@ static void rbd_obj_request_destroy(struct kref *kref)
1541static struct rbd_img_request *rbd_img_request_create( 1837static struct rbd_img_request *rbd_img_request_create(
1542 struct rbd_device *rbd_dev, 1838 struct rbd_device *rbd_dev,
1543 u64 offset, u64 length, 1839 u64 offset, u64 length,
1544 bool write_request) 1840 bool write_request,
1841 bool child_request)
1545{ 1842{
1546 struct rbd_img_request *img_request; 1843 struct rbd_img_request *img_request;
1547 struct ceph_snap_context *snapc = NULL;
1548 1844
1549 img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); 1845 img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1550 if (!img_request) 1846 if (!img_request)
1551 return NULL; 1847 return NULL;
1552 1848
1553 if (write_request) { 1849 if (write_request) {
1554 down_read(&rbd_dev->header_rwsem); 1850 down_read(&rbd_dev->header_rwsem);
1555 snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1851 ceph_get_snap_context(rbd_dev->header.snapc);
1556 up_read(&rbd_dev->header_rwsem); 1852 up_read(&rbd_dev->header_rwsem);
1557 if (WARN_ON(!snapc)) {
1558 kfree(img_request);
1559 return NULL; /* Shouldn't happen */
1560 }
1561 } 1853 }
1562 1854
1563 img_request->rq = NULL; 1855 img_request->rq = NULL;
1564 img_request->rbd_dev = rbd_dev; 1856 img_request->rbd_dev = rbd_dev;
1565 img_request->offset = offset; 1857 img_request->offset = offset;
1566 img_request->length = length; 1858 img_request->length = length;
1567 img_request->write_request = write_request; 1859 img_request->flags = 0;
1568 if (write_request) 1860 if (write_request) {
1569 img_request->snapc = snapc; 1861 img_request_write_set(img_request);
1570 else 1862 img_request->snapc = rbd_dev->header.snapc;
1863 } else {
1571 img_request->snap_id = rbd_dev->spec->snap_id; 1864 img_request->snap_id = rbd_dev->spec->snap_id;
1865 }
1866 if (child_request)
1867 img_request_child_set(img_request);
1868 if (rbd_dev->parent_spec)
1869 img_request_layered_set(img_request);
1572 spin_lock_init(&img_request->completion_lock); 1870 spin_lock_init(&img_request->completion_lock);
1573 img_request->next_completion = 0; 1871 img_request->next_completion = 0;
1574 img_request->callback = NULL; 1872 img_request->callback = NULL;
1873 img_request->result = 0;
1575 img_request->obj_request_count = 0; 1874 img_request->obj_request_count = 0;
1576 INIT_LIST_HEAD(&img_request->obj_requests); 1875 INIT_LIST_HEAD(&img_request->obj_requests);
1577 kref_init(&img_request->kref); 1876 kref_init(&img_request->kref);
@@ -1600,78 +1899,204 @@ static void rbd_img_request_destroy(struct kref *kref)
1600 rbd_img_obj_request_del(img_request, obj_request); 1899 rbd_img_obj_request_del(img_request, obj_request);
1601 rbd_assert(img_request->obj_request_count == 0); 1900 rbd_assert(img_request->obj_request_count == 0);
1602 1901
1603 if (img_request->write_request) 1902 if (img_request_write_test(img_request))
1604 ceph_put_snap_context(img_request->snapc); 1903 ceph_put_snap_context(img_request->snapc);
1605 1904
1606 kfree(img_request); 1905 if (img_request_child_test(img_request))
1906 rbd_obj_request_put(img_request->obj_request);
1907
1908 kmem_cache_free(rbd_img_request_cache, img_request);
1909}
1910
1911static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1912{
1913 struct rbd_img_request *img_request;
1914 unsigned int xferred;
1915 int result;
1916 bool more;
1917
1918 rbd_assert(obj_request_img_data_test(obj_request));
1919 img_request = obj_request->img_request;
1920
1921 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1922 xferred = (unsigned int)obj_request->xferred;
1923 result = obj_request->result;
1924 if (result) {
1925 struct rbd_device *rbd_dev = img_request->rbd_dev;
1926
1927 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
1928 img_request_write_test(img_request) ? "write" : "read",
1929 obj_request->length, obj_request->img_offset,
1930 obj_request->offset);
1931 rbd_warn(rbd_dev, " result %d xferred %x\n",
1932 result, xferred);
1933 if (!img_request->result)
1934 img_request->result = result;
1935 }
1936
1937 /* Image object requests don't own their page array */
1938
1939 if (obj_request->type == OBJ_REQUEST_PAGES) {
1940 obj_request->pages = NULL;
1941 obj_request->page_count = 0;
1942 }
1943
1944 if (img_request_child_test(img_request)) {
1945 rbd_assert(img_request->obj_request != NULL);
1946 more = obj_request->which < img_request->obj_request_count - 1;
1947 } else {
1948 rbd_assert(img_request->rq != NULL);
1949 more = blk_end_request(img_request->rq, result, xferred);
1950 }
1951
1952 return more;
1607} 1953}
1608 1954
1609static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, 1955static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1610 struct bio *bio_list) 1956{
1957 struct rbd_img_request *img_request;
1958 u32 which = obj_request->which;
1959 bool more = true;
1960
1961 rbd_assert(obj_request_img_data_test(obj_request));
1962 img_request = obj_request->img_request;
1963
1964 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1965 rbd_assert(img_request != NULL);
1966 rbd_assert(img_request->obj_request_count > 0);
1967 rbd_assert(which != BAD_WHICH);
1968 rbd_assert(which < img_request->obj_request_count);
1969 rbd_assert(which >= img_request->next_completion);
1970
1971 spin_lock_irq(&img_request->completion_lock);
1972 if (which != img_request->next_completion)
1973 goto out;
1974
1975 for_each_obj_request_from(img_request, obj_request) {
1976 rbd_assert(more);
1977 rbd_assert(which < img_request->obj_request_count);
1978
1979 if (!obj_request_done_test(obj_request))
1980 break;
1981 more = rbd_img_obj_end_request(obj_request);
1982 which++;
1983 }
1984
1985 rbd_assert(more ^ (which == img_request->obj_request_count));
1986 img_request->next_completion = which;
1987out:
1988 spin_unlock_irq(&img_request->completion_lock);
1989
1990 if (!more)
1991 rbd_img_request_complete(img_request);
1992}
1993
1994/*
1995 * Split up an image request into one or more object requests, each
1996 * to a different object. The "type" parameter indicates whether
1997 * "data_desc" is the pointer to the head of a list of bio
1998 * structures, or the base of a page array. In either case this
1999 * function assumes data_desc describes memory sufficient to hold
2000 * all data described by the image request.
2001 */
2002static int rbd_img_request_fill(struct rbd_img_request *img_request,
2003 enum obj_request_type type,
2004 void *data_desc)
1611{ 2005{
1612 struct rbd_device *rbd_dev = img_request->rbd_dev; 2006 struct rbd_device *rbd_dev = img_request->rbd_dev;
1613 struct rbd_obj_request *obj_request = NULL; 2007 struct rbd_obj_request *obj_request = NULL;
1614 struct rbd_obj_request *next_obj_request; 2008 struct rbd_obj_request *next_obj_request;
1615 unsigned int bio_offset; 2009 bool write_request = img_request_write_test(img_request);
1616 u64 image_offset; 2010 struct bio *bio_list;
2011 unsigned int bio_offset = 0;
2012 struct page **pages;
2013 u64 img_offset;
1617 u64 resid; 2014 u64 resid;
1618 u16 opcode; 2015 u16 opcode;
1619 2016
1620 dout("%s: img %p bio %p\n", __func__, img_request, bio_list); 2017 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2018 (int)type, data_desc);
1621 2019
1622 opcode = img_request->write_request ? CEPH_OSD_OP_WRITE 2020 opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
1623 : CEPH_OSD_OP_READ; 2021 img_offset = img_request->offset;
1624 bio_offset = 0;
1625 image_offset = img_request->offset;
1626 rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1627 resid = img_request->length; 2022 resid = img_request->length;
1628 rbd_assert(resid > 0); 2023 rbd_assert(resid > 0);
2024
2025 if (type == OBJ_REQUEST_BIO) {
2026 bio_list = data_desc;
2027 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2028 } else {
2029 rbd_assert(type == OBJ_REQUEST_PAGES);
2030 pages = data_desc;
2031 }
2032
1629 while (resid) { 2033 while (resid) {
2034 struct ceph_osd_request *osd_req;
1630 const char *object_name; 2035 const char *object_name;
1631 unsigned int clone_size;
1632 struct ceph_osd_req_op *op;
1633 u64 offset; 2036 u64 offset;
1634 u64 length; 2037 u64 length;
1635 2038
1636 object_name = rbd_segment_name(rbd_dev, image_offset); 2039 object_name = rbd_segment_name(rbd_dev, img_offset);
1637 if (!object_name) 2040 if (!object_name)
1638 goto out_unwind; 2041 goto out_unwind;
1639 offset = rbd_segment_offset(rbd_dev, image_offset); 2042 offset = rbd_segment_offset(rbd_dev, img_offset);
1640 length = rbd_segment_length(rbd_dev, image_offset, resid); 2043 length = rbd_segment_length(rbd_dev, img_offset, resid);
1641 obj_request = rbd_obj_request_create(object_name, 2044 obj_request = rbd_obj_request_create(object_name,
1642 offset, length, 2045 offset, length, type);
1643 OBJ_REQUEST_BIO); 2046 /* object request has its own copy of the object name */
1644 kfree(object_name); /* object request has its own copy */ 2047 rbd_segment_name_free(object_name);
1645 if (!obj_request) 2048 if (!obj_request)
1646 goto out_unwind; 2049 goto out_unwind;
1647 2050
1648 rbd_assert(length <= (u64) UINT_MAX); 2051 if (type == OBJ_REQUEST_BIO) {
1649 clone_size = (unsigned int) length; 2052 unsigned int clone_size;
1650 obj_request->bio_list = bio_chain_clone_range(&bio_list, 2053
1651 &bio_offset, clone_size, 2054 rbd_assert(length <= (u64)UINT_MAX);
1652 GFP_ATOMIC); 2055 clone_size = (unsigned int)length;
1653 if (!obj_request->bio_list) 2056 obj_request->bio_list =
1654 goto out_partial; 2057 bio_chain_clone_range(&bio_list,
2058 &bio_offset,
2059 clone_size,
2060 GFP_ATOMIC);
2061 if (!obj_request->bio_list)
2062 goto out_partial;
2063 } else {
2064 unsigned int page_count;
2065
2066 obj_request->pages = pages;
2067 page_count = (u32)calc_pages_for(offset, length);
2068 obj_request->page_count = page_count;
2069 if ((offset + length) & ~PAGE_MASK)
2070 page_count--; /* more on last page */
2071 pages += page_count;
2072 }
1655 2073
1656 /* 2074 osd_req = rbd_osd_req_create(rbd_dev, write_request,
1657 * Build up the op to use in building the osd 2075 obj_request);
1658 * request. Note that the contents of the op are 2076 if (!osd_req)
1659 * copied by rbd_osd_req_create().
1660 */
1661 op = rbd_osd_req_op_create(opcode, offset, length);
1662 if (!op)
1663 goto out_partial;
1664 obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1665 img_request->write_request,
1666 obj_request, op);
1667 rbd_osd_req_op_destroy(op);
1668 if (!obj_request->osd_req)
1669 goto out_partial; 2077 goto out_partial;
1670 /* status and version are initially zero-filled */ 2078 obj_request->osd_req = osd_req;
2079 obj_request->callback = rbd_img_obj_callback;
1671 2080
2081 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
2082 0, 0);
2083 if (type == OBJ_REQUEST_BIO)
2084 osd_req_op_extent_osd_data_bio(osd_req, 0,
2085 obj_request->bio_list, length);
2086 else
2087 osd_req_op_extent_osd_data_pages(osd_req, 0,
2088 obj_request->pages, length,
2089 offset & ~PAGE_MASK, false, false);
2090
2091 if (write_request)
2092 rbd_osd_req_format_write(obj_request);
2093 else
2094 rbd_osd_req_format_read(obj_request);
2095
2096 obj_request->img_offset = img_offset;
1672 rbd_img_obj_request_add(img_request, obj_request); 2097 rbd_img_obj_request_add(img_request, obj_request);
1673 2098
1674 image_offset += length; 2099 img_offset += length;
1675 resid -= length; 2100 resid -= length;
1676 } 2101 }
1677 2102
@@ -1686,61 +2111,389 @@ out_unwind:
1686 return -ENOMEM; 2111 return -ENOMEM;
1687} 2112}
1688 2113
1689static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 2114static void
2115rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
1690{ 2116{
1691 struct rbd_img_request *img_request; 2117 struct rbd_img_request *img_request;
1692 u32 which = obj_request->which; 2118 struct rbd_device *rbd_dev;
1693 bool more = true; 2119 u64 length;
2120 u32 page_count;
1694 2121
2122 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2123 rbd_assert(obj_request_img_data_test(obj_request));
1695 img_request = obj_request->img_request; 2124 img_request = obj_request->img_request;
2125 rbd_assert(img_request);
1696 2126
1697 dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 2127 rbd_dev = img_request->rbd_dev;
2128 rbd_assert(rbd_dev);
2129 length = (u64)1 << rbd_dev->header.obj_order;
2130 page_count = (u32)calc_pages_for(0, length);
2131
2132 rbd_assert(obj_request->copyup_pages);
2133 ceph_release_page_vector(obj_request->copyup_pages, page_count);
2134 obj_request->copyup_pages = NULL;
2135
2136 /*
2137 * We want the transfer count to reflect the size of the
2138 * original write request. There is no such thing as a
2139 * successful short write, so if the request was successful
2140 * we can just set it to the originally-requested length.
2141 */
2142 if (!obj_request->result)
2143 obj_request->xferred = obj_request->length;
2144
2145 /* Finish up with the normal image object callback */
2146
2147 rbd_img_obj_callback(obj_request);
2148}
2149
2150static void
2151rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2152{
2153 struct rbd_obj_request *orig_request;
2154 struct ceph_osd_request *osd_req;
2155 struct ceph_osd_client *osdc;
2156 struct rbd_device *rbd_dev;
2157 struct page **pages;
2158 int result;
2159 u64 obj_size;
2160 u64 xferred;
2161
2162 rbd_assert(img_request_child_test(img_request));
2163
2164 /* First get what we need from the image request */
2165
2166 pages = img_request->copyup_pages;
2167 rbd_assert(pages != NULL);
2168 img_request->copyup_pages = NULL;
2169
2170 orig_request = img_request->obj_request;
2171 rbd_assert(orig_request != NULL);
2172 rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
2173 result = img_request->result;
2174 obj_size = img_request->length;
2175 xferred = img_request->xferred;
2176
2177 rbd_dev = img_request->rbd_dev;
2178 rbd_assert(rbd_dev);
2179 rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
2180
2181 rbd_img_request_put(img_request);
2182
2183 if (result)
2184 goto out_err;
2185
2186 /* Allocate the new copyup osd request for the original request */
2187
2188 result = -ENOMEM;
2189 rbd_assert(!orig_request->osd_req);
2190 osd_req = rbd_osd_req_create_copyup(orig_request);
2191 if (!osd_req)
2192 goto out_err;
2193 orig_request->osd_req = osd_req;
2194 orig_request->copyup_pages = pages;
2195
2196 /* Initialize the copyup op */
2197
2198 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2199 osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
2200 false, false);
2201
2202 /* Then the original write request op */
2203
2204 osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2205 orig_request->offset,
2206 orig_request->length, 0, 0);
2207 osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
2208 orig_request->length);
2209
2210 rbd_osd_req_format_write(orig_request);
2211
2212 /* All set, send it off. */
2213
2214 orig_request->callback = rbd_img_obj_copyup_callback;
2215 osdc = &rbd_dev->rbd_client->client->osdc;
2216 result = rbd_obj_request_submit(osdc, orig_request);
2217 if (!result)
2218 return;
2219out_err:
2220 /* Record the error code and complete the request */
2221
2222 orig_request->result = result;
2223 orig_request->xferred = 0;
2224 obj_request_done_set(orig_request);
2225 rbd_obj_request_complete(orig_request);
2226}
2227
2228/*
2229 * Read from the parent image the range of data that covers the
2230 * entire target of the given object request. This is used for
2231 * satisfying a layered image write request when the target of an
2232 * object request from the image request does not exist.
2233 *
2234 * A page array big enough to hold the returned data is allocated
2235 * and supplied to rbd_img_request_fill() as the "data descriptor."
2236 * When the read completes, this page array will be transferred to
2237 * the original object request for the copyup operation.
2238 *
2239 * If an error occurs, record it as the result of the original
2240 * object request and mark it done so it gets completed.
2241 */
2242static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2243{
2244 struct rbd_img_request *img_request = NULL;
2245 struct rbd_img_request *parent_request = NULL;
2246 struct rbd_device *rbd_dev;
2247 u64 img_offset;
2248 u64 length;
2249 struct page **pages = NULL;
2250 u32 page_count;
2251 int result;
2252
2253 rbd_assert(obj_request_img_data_test(obj_request));
2254 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2255
2256 img_request = obj_request->img_request;
1698 rbd_assert(img_request != NULL); 2257 rbd_assert(img_request != NULL);
1699 rbd_assert(img_request->rq != NULL); 2258 rbd_dev = img_request->rbd_dev;
1700 rbd_assert(img_request->obj_request_count > 0); 2259 rbd_assert(rbd_dev->parent != NULL);
1701 rbd_assert(which != BAD_WHICH);
1702 rbd_assert(which < img_request->obj_request_count);
1703 rbd_assert(which >= img_request->next_completion);
1704 2260
1705 spin_lock_irq(&img_request->completion_lock); 2261 /*
1706 if (which != img_request->next_completion) 2262 * First things first. The original osd request is of no
1707 goto out; 2263 * use to use any more, we'll need a new one that can hold
2264 * the two ops in a copyup request. We'll get that later,
2265 * but for now we can release the old one.
2266 */
2267 rbd_osd_req_destroy(obj_request->osd_req);
2268 obj_request->osd_req = NULL;
1708 2269
1709 for_each_obj_request_from(img_request, obj_request) { 2270 /*
1710 unsigned int xferred; 2271 * Determine the byte range covered by the object in the
1711 int result; 2272 * child image to which the original request was to be sent.
2273 */
2274 img_offset = obj_request->img_offset - obj_request->offset;
2275 length = (u64)1 << rbd_dev->header.obj_order;
1712 2276
1713 rbd_assert(more); 2277 /*
1714 rbd_assert(which < img_request->obj_request_count); 2278 * There is no defined parent data beyond the parent
2279 * overlap, so limit what we read at that boundary if
2280 * necessary.
2281 */
2282 if (img_offset + length > rbd_dev->parent_overlap) {
2283 rbd_assert(img_offset < rbd_dev->parent_overlap);
2284 length = rbd_dev->parent_overlap - img_offset;
2285 }
1715 2286
1716 if (!obj_request_done_test(obj_request)) 2287 /*
1717 break; 2288 * Allocate a page array big enough to receive the data read
2289 * from the parent.
2290 */
2291 page_count = (u32)calc_pages_for(0, length);
2292 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2293 if (IS_ERR(pages)) {
2294 result = PTR_ERR(pages);
2295 pages = NULL;
2296 goto out_err;
2297 }
1718 2298
1719 rbd_assert(obj_request->xferred <= (u64) UINT_MAX); 2299 result = -ENOMEM;
1720 xferred = (unsigned int) obj_request->xferred; 2300 parent_request = rbd_img_request_create(rbd_dev->parent,
1721 result = (int) obj_request->result; 2301 img_offset, length,
1722 if (result) 2302 false, true);
1723 rbd_warn(NULL, "obj_request %s result %d xferred %u\n", 2303 if (!parent_request)
1724 img_request->write_request ? "write" : "read", 2304 goto out_err;
1725 result, xferred); 2305 rbd_obj_request_get(obj_request);
2306 parent_request->obj_request = obj_request;
1726 2307
1727 more = blk_end_request(img_request->rq, result, xferred); 2308 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
1728 which++; 2309 if (result)
2310 goto out_err;
2311 parent_request->copyup_pages = pages;
2312
2313 parent_request->callback = rbd_img_obj_parent_read_full_callback;
2314 result = rbd_img_request_submit(parent_request);
2315 if (!result)
2316 return 0;
2317
2318 parent_request->copyup_pages = NULL;
2319 parent_request->obj_request = NULL;
2320 rbd_obj_request_put(obj_request);
2321out_err:
2322 if (pages)
2323 ceph_release_page_vector(pages, page_count);
2324 if (parent_request)
2325 rbd_img_request_put(parent_request);
2326 obj_request->result = result;
2327 obj_request->xferred = 0;
2328 obj_request_done_set(obj_request);
2329
2330 return result;
2331}
2332
2333static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2334{
2335 struct rbd_obj_request *orig_request;
2336 int result;
2337
2338 rbd_assert(!obj_request_img_data_test(obj_request));
2339
2340 /*
2341 * All we need from the object request is the original
2342 * request and the result of the STAT op. Grab those, then
2343 * we're done with the request.
2344 */
2345 orig_request = obj_request->obj_request;
2346 obj_request->obj_request = NULL;
2347 rbd_assert(orig_request);
2348 rbd_assert(orig_request->img_request);
2349
2350 result = obj_request->result;
2351 obj_request->result = 0;
2352
2353 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2354 obj_request, orig_request, result,
2355 obj_request->xferred, obj_request->length);
2356 rbd_obj_request_put(obj_request);
2357
2358 rbd_assert(orig_request);
2359 rbd_assert(orig_request->img_request);
2360
2361 /*
2362 * Our only purpose here is to determine whether the object
2363 * exists, and we don't want to treat the non-existence as
2364 * an error. If something else comes back, transfer the
2365 * error to the original request and complete it now.
2366 */
2367 if (!result) {
2368 obj_request_existence_set(orig_request, true);
2369 } else if (result == -ENOENT) {
2370 obj_request_existence_set(orig_request, false);
2371 } else if (result) {
2372 orig_request->result = result;
2373 goto out;
1729 } 2374 }
1730 2375
1731 rbd_assert(more ^ (which == img_request->obj_request_count)); 2376 /*
1732 img_request->next_completion = which; 2377 * Resubmit the original request now that we have recorded
2378 * whether the target object exists.
2379 */
2380 orig_request->result = rbd_img_obj_request_submit(orig_request);
1733out: 2381out:
1734 spin_unlock_irq(&img_request->completion_lock); 2382 if (orig_request->result)
2383 rbd_obj_request_complete(orig_request);
2384 rbd_obj_request_put(orig_request);
2385}
1735 2386
1736 if (!more) 2387static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
1737 rbd_img_request_complete(img_request); 2388{
2389 struct rbd_obj_request *stat_request;
2390 struct rbd_device *rbd_dev;
2391 struct ceph_osd_client *osdc;
2392 struct page **pages = NULL;
2393 u32 page_count;
2394 size_t size;
2395 int ret;
2396
2397 /*
2398 * The response data for a STAT call consists of:
2399 * le64 length;
2400 * struct {
2401 * le32 tv_sec;
2402 * le32 tv_nsec;
2403 * } mtime;
2404 */
2405 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2406 page_count = (u32)calc_pages_for(0, size);
2407 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2408 if (IS_ERR(pages))
2409 return PTR_ERR(pages);
2410
2411 ret = -ENOMEM;
2412 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2413 OBJ_REQUEST_PAGES);
2414 if (!stat_request)
2415 goto out;
2416
2417 rbd_obj_request_get(obj_request);
2418 stat_request->obj_request = obj_request;
2419 stat_request->pages = pages;
2420 stat_request->page_count = page_count;
2421
2422 rbd_assert(obj_request->img_request);
2423 rbd_dev = obj_request->img_request->rbd_dev;
2424 stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2425 stat_request);
2426 if (!stat_request->osd_req)
2427 goto out;
2428 stat_request->callback = rbd_img_obj_exists_callback;
2429
2430 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2431 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2432 false, false);
2433 rbd_osd_req_format_read(stat_request);
2434
2435 osdc = &rbd_dev->rbd_client->client->osdc;
2436 ret = rbd_obj_request_submit(osdc, stat_request);
2437out:
2438 if (ret)
2439 rbd_obj_request_put(obj_request);
2440
2441 return ret;
2442}
2443
2444static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2445{
2446 struct rbd_img_request *img_request;
2447 struct rbd_device *rbd_dev;
2448 bool known;
2449
2450 rbd_assert(obj_request_img_data_test(obj_request));
2451
2452 img_request = obj_request->img_request;
2453 rbd_assert(img_request);
2454 rbd_dev = img_request->rbd_dev;
2455
2456 /*
2457 * Only writes to layered images need special handling.
2458 * Reads and non-layered writes are simple object requests.
2459 * Layered writes that start beyond the end of the overlap
2460 * with the parent have no parent data, so they too are
2461 * simple object requests. Finally, if the target object is
2462 * known to already exist, its parent data has already been
2463 * copied, so a write to the object can also be handled as a
2464 * simple object request.
2465 */
2466 if (!img_request_write_test(img_request) ||
2467 !img_request_layered_test(img_request) ||
2468 rbd_dev->parent_overlap <= obj_request->img_offset ||
2469 ((known = obj_request_known_test(obj_request)) &&
2470 obj_request_exists_test(obj_request))) {
2471
2472 struct rbd_device *rbd_dev;
2473 struct ceph_osd_client *osdc;
2474
2475 rbd_dev = obj_request->img_request->rbd_dev;
2476 osdc = &rbd_dev->rbd_client->client->osdc;
2477
2478 return rbd_obj_request_submit(osdc, obj_request);
2479 }
2480
2481 /*
2482 * It's a layered write. The target object might exist but
2483 * we may not know that yet. If we know it doesn't exist,
2484 * start by reading the data for the full target object from
2485 * the parent so we can use it for a copyup to the target.
2486 */
2487 if (known)
2488 return rbd_img_obj_parent_read_full(obj_request);
2489
2490 /* We don't know whether the target exists. Go find out. */
2491
2492 return rbd_img_obj_exists_submit(obj_request);
1738} 2493}
1739 2494
1740static int rbd_img_request_submit(struct rbd_img_request *img_request) 2495static int rbd_img_request_submit(struct rbd_img_request *img_request)
1741{ 2496{
1742 struct rbd_device *rbd_dev = img_request->rbd_dev;
1743 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1744 struct rbd_obj_request *obj_request; 2497 struct rbd_obj_request *obj_request;
1745 struct rbd_obj_request *next_obj_request; 2498 struct rbd_obj_request *next_obj_request;
1746 2499
@@ -1748,27 +2501,105 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
1748 for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2501 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
1749 int ret; 2502 int ret;
1750 2503
1751 obj_request->callback = rbd_img_obj_callback; 2504 ret = rbd_img_obj_request_submit(obj_request);
1752 ret = rbd_obj_request_submit(osdc, obj_request);
1753 if (ret) 2505 if (ret)
1754 return ret; 2506 return ret;
1755 /*
1756 * The image request has its own reference to each
1757 * of its object requests, so we can safely drop the
1758 * initial one here.
1759 */
1760 rbd_obj_request_put(obj_request);
1761 } 2507 }
1762 2508
1763 return 0; 2509 return 0;
1764} 2510}
1765 2511
1766static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, 2512static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
1767 u64 ver, u64 notify_id)
1768{ 2513{
1769 struct rbd_obj_request *obj_request; 2514 struct rbd_obj_request *obj_request;
1770 struct ceph_osd_req_op *op; 2515 struct rbd_device *rbd_dev;
1771 struct ceph_osd_client *osdc; 2516 u64 obj_end;
2517
2518 rbd_assert(img_request_child_test(img_request));
2519
2520 obj_request = img_request->obj_request;
2521 rbd_assert(obj_request);
2522 rbd_assert(obj_request->img_request);
2523
2524 obj_request->result = img_request->result;
2525 if (obj_request->result)
2526 goto out;
2527
2528 /*
2529 * We need to zero anything beyond the parent overlap
2530 * boundary. Since rbd_img_obj_request_read_callback()
2531 * will zero anything beyond the end of a short read, an
2532 * easy way to do this is to pretend the data from the
2533 * parent came up short--ending at the overlap boundary.
2534 */
2535 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2536 obj_end = obj_request->img_offset + obj_request->length;
2537 rbd_dev = obj_request->img_request->rbd_dev;
2538 if (obj_end > rbd_dev->parent_overlap) {
2539 u64 xferred = 0;
2540
2541 if (obj_request->img_offset < rbd_dev->parent_overlap)
2542 xferred = rbd_dev->parent_overlap -
2543 obj_request->img_offset;
2544
2545 obj_request->xferred = min(img_request->xferred, xferred);
2546 } else {
2547 obj_request->xferred = img_request->xferred;
2548 }
2549out:
2550 rbd_img_request_put(img_request);
2551 rbd_img_obj_request_read_callback(obj_request);
2552 rbd_obj_request_complete(obj_request);
2553}
2554
2555static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2556{
2557 struct rbd_device *rbd_dev;
2558 struct rbd_img_request *img_request;
2559 int result;
2560
2561 rbd_assert(obj_request_img_data_test(obj_request));
2562 rbd_assert(obj_request->img_request != NULL);
2563 rbd_assert(obj_request->result == (s32) -ENOENT);
2564 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2565
2566 rbd_dev = obj_request->img_request->rbd_dev;
2567 rbd_assert(rbd_dev->parent != NULL);
2568 /* rbd_read_finish(obj_request, obj_request->length); */
2569 img_request = rbd_img_request_create(rbd_dev->parent,
2570 obj_request->img_offset,
2571 obj_request->length,
2572 false, true);
2573 result = -ENOMEM;
2574 if (!img_request)
2575 goto out_err;
2576
2577 rbd_obj_request_get(obj_request);
2578 img_request->obj_request = obj_request;
2579
2580 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2581 obj_request->bio_list);
2582 if (result)
2583 goto out_err;
2584
2585 img_request->callback = rbd_img_parent_read_callback;
2586 result = rbd_img_request_submit(img_request);
2587 if (result)
2588 goto out_err;
2589
2590 return;
2591out_err:
2592 if (img_request)
2593 rbd_img_request_put(img_request);
2594 obj_request->result = result;
2595 obj_request->xferred = 0;
2596 obj_request_done_set(obj_request);
2597}
2598
2599static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
2600{
2601 struct rbd_obj_request *obj_request;
2602 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1772 int ret; 2603 int ret;
1773 2604
1774 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 2605 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
@@ -1777,17 +2608,15 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1777 return -ENOMEM; 2608 return -ENOMEM;
1778 2609
1779 ret = -ENOMEM; 2610 ret = -ENOMEM;
1780 op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); 2611 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
1781 if (!op)
1782 goto out;
1783 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1784 obj_request, op);
1785 rbd_osd_req_op_destroy(op);
1786 if (!obj_request->osd_req) 2612 if (!obj_request->osd_req)
1787 goto out; 2613 goto out;
1788
1789 osdc = &rbd_dev->rbd_client->client->osdc;
1790 obj_request->callback = rbd_obj_request_put; 2614 obj_request->callback = rbd_obj_request_put;
2615
2616 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2617 notify_id, 0, 0);
2618 rbd_osd_req_format_read(obj_request);
2619
1791 ret = rbd_obj_request_submit(osdc, obj_request); 2620 ret = rbd_obj_request_submit(osdc, obj_request);
1792out: 2621out:
1793 if (ret) 2622 if (ret)
@@ -1799,21 +2628,16 @@ out:
1799static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 2628static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1800{ 2629{
1801 struct rbd_device *rbd_dev = (struct rbd_device *)data; 2630 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1802 u64 hver;
1803 int rc;
1804 2631
1805 if (!rbd_dev) 2632 if (!rbd_dev)
1806 return; 2633 return;
1807 2634
1808 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 2635 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1809 rbd_dev->header_name, (unsigned long long) notify_id, 2636 rbd_dev->header_name, (unsigned long long)notify_id,
1810 (unsigned int) opcode); 2637 (unsigned int)opcode);
1811 rc = rbd_dev_refresh(rbd_dev, &hver); 2638 (void)rbd_dev_refresh(rbd_dev);
1812 if (rc)
1813 rbd_warn(rbd_dev, "got notification but failed to "
1814 " update snaps: %d\n", rc);
1815 2639
1816 rbd_obj_notify_ack(rbd_dev, hver, notify_id); 2640 rbd_obj_notify_ack(rbd_dev, notify_id);
1817} 2641}
1818 2642
1819/* 2643/*
@@ -1824,7 +2648,6 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1824{ 2648{
1825 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2649 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1826 struct rbd_obj_request *obj_request; 2650 struct rbd_obj_request *obj_request;
1827 struct ceph_osd_req_op *op;
1828 int ret; 2651 int ret;
1829 2652
1830 rbd_assert(start ^ !!rbd_dev->watch_event); 2653 rbd_assert(start ^ !!rbd_dev->watch_event);
@@ -1844,14 +2667,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1844 if (!obj_request) 2667 if (!obj_request)
1845 goto out_cancel; 2668 goto out_cancel;
1846 2669
1847 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, 2670 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
1848 rbd_dev->watch_event->cookie,
1849 rbd_dev->header.obj_version, start);
1850 if (!op)
1851 goto out_cancel;
1852 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
1853 obj_request, op);
1854 rbd_osd_req_op_destroy(op);
1855 if (!obj_request->osd_req) 2671 if (!obj_request->osd_req)
1856 goto out_cancel; 2672 goto out_cancel;
1857 2673
@@ -1860,6 +2676,11 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1860 else 2676 else
1861 ceph_osdc_unregister_linger_request(osdc, 2677 ceph_osdc_unregister_linger_request(osdc,
1862 rbd_dev->watch_request->osd_req); 2678 rbd_dev->watch_request->osd_req);
2679
2680 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2681 rbd_dev->watch_event->cookie, 0, start);
2682 rbd_osd_req_format_write(obj_request);
2683
1863 ret = rbd_obj_request_submit(osdc, obj_request); 2684 ret = rbd_obj_request_submit(osdc, obj_request);
1864 if (ret) 2685 if (ret)
1865 goto out_cancel; 2686 goto out_cancel;
@@ -1899,40 +2720,38 @@ out_cancel:
1899} 2720}
1900 2721
1901/* 2722/*
1902 * Synchronous osd object method call 2723 * Synchronous osd object method call. Returns the number of bytes
2724 * returned in the outbound buffer, or a negative error code.
1903 */ 2725 */
1904static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 2726static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1905 const char *object_name, 2727 const char *object_name,
1906 const char *class_name, 2728 const char *class_name,
1907 const char *method_name, 2729 const char *method_name,
1908 const char *outbound, 2730 const void *outbound,
1909 size_t outbound_size, 2731 size_t outbound_size,
1910 char *inbound, 2732 void *inbound,
1911 size_t inbound_size, 2733 size_t inbound_size)
1912 u64 *version)
1913{ 2734{
2735 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1914 struct rbd_obj_request *obj_request; 2736 struct rbd_obj_request *obj_request;
1915 struct ceph_osd_client *osdc;
1916 struct ceph_osd_req_op *op;
1917 struct page **pages; 2737 struct page **pages;
1918 u32 page_count; 2738 u32 page_count;
1919 int ret; 2739 int ret;
1920 2740
1921 /* 2741 /*
1922 * Method calls are ultimately read operations but they 2742 * Method calls are ultimately read operations. The result
1923 * don't involve object data (so no offset or length). 2743 * should placed into the inbound buffer provided. They
1924 * The result should placed into the inbound buffer 2744 * also supply outbound data--parameters for the object
1925 * provided. They also supply outbound data--parameters for 2745 * method. Currently if this is present it will be a
1926 * the object method. Currently if this is present it will 2746 * snapshot id.
1927 * be a snapshot id.
1928 */ 2747 */
1929 page_count = (u32) calc_pages_for(0, inbound_size); 2748 page_count = (u32)calc_pages_for(0, inbound_size);
1930 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2749 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1931 if (IS_ERR(pages)) 2750 if (IS_ERR(pages))
1932 return PTR_ERR(pages); 2751 return PTR_ERR(pages);
1933 2752
1934 ret = -ENOMEM; 2753 ret = -ENOMEM;
1935 obj_request = rbd_obj_request_create(object_name, 0, 0, 2754 obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
1936 OBJ_REQUEST_PAGES); 2755 OBJ_REQUEST_PAGES);
1937 if (!obj_request) 2756 if (!obj_request)
1938 goto out; 2757 goto out;
@@ -1940,17 +2759,29 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1940 obj_request->pages = pages; 2759 obj_request->pages = pages;
1941 obj_request->page_count = page_count; 2760 obj_request->page_count = page_count;
1942 2761
1943 op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, 2762 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
1944 method_name, outbound, outbound_size);
1945 if (!op)
1946 goto out;
1947 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1948 obj_request, op);
1949 rbd_osd_req_op_destroy(op);
1950 if (!obj_request->osd_req) 2763 if (!obj_request->osd_req)
1951 goto out; 2764 goto out;
1952 2765
1953 osdc = &rbd_dev->rbd_client->client->osdc; 2766 osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
2767 class_name, method_name);
2768 if (outbound_size) {
2769 struct ceph_pagelist *pagelist;
2770
2771 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
2772 if (!pagelist)
2773 goto out;
2774
2775 ceph_pagelist_init(pagelist);
2776 ceph_pagelist_append(pagelist, outbound, outbound_size);
2777 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
2778 pagelist);
2779 }
2780 osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2781 obj_request->pages, inbound_size,
2782 0, false, false);
2783 rbd_osd_req_format_read(obj_request);
2784
1954 ret = rbd_obj_request_submit(osdc, obj_request); 2785 ret = rbd_obj_request_submit(osdc, obj_request);
1955 if (ret) 2786 if (ret)
1956 goto out; 2787 goto out;
@@ -1961,10 +2792,10 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1961 ret = obj_request->result; 2792 ret = obj_request->result;
1962 if (ret < 0) 2793 if (ret < 0)
1963 goto out; 2794 goto out;
1964 ret = 0; 2795
2796 rbd_assert(obj_request->xferred < (u64)INT_MAX);
2797 ret = (int)obj_request->xferred;
1965 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 2798 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
1966 if (version)
1967 *version = obj_request->version;
1968out: 2799out:
1969 if (obj_request) 2800 if (obj_request)
1970 rbd_obj_request_put(obj_request); 2801 rbd_obj_request_put(obj_request);
@@ -2034,18 +2865,22 @@ static void rbd_request_fn(struct request_queue *q)
2034 } 2865 }
2035 2866
2036 result = -EINVAL; 2867 result = -EINVAL;
2037 if (WARN_ON(offset && length > U64_MAX - offset + 1)) 2868 if (offset && length > U64_MAX - offset + 1) {
2869 rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
2870 offset, length);
2038 goto end_request; /* Shouldn't happen */ 2871 goto end_request; /* Shouldn't happen */
2872 }
2039 2873
2040 result = -ENOMEM; 2874 result = -ENOMEM;
2041 img_request = rbd_img_request_create(rbd_dev, offset, length, 2875 img_request = rbd_img_request_create(rbd_dev, offset, length,
2042 write_request); 2876 write_request, false);
2043 if (!img_request) 2877 if (!img_request)
2044 goto end_request; 2878 goto end_request;
2045 2879
2046 img_request->rq = rq; 2880 img_request->rq = rq;
2047 2881
2048 result = rbd_img_request_fill_bio(img_request, rq->bio); 2882 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2883 rq->bio);
2049 if (!result) 2884 if (!result)
2050 result = rbd_img_request_submit(img_request); 2885 result = rbd_img_request_submit(img_request);
2051 if (result) 2886 if (result)
@@ -2053,8 +2888,10 @@ static void rbd_request_fn(struct request_queue *q)
2053end_request: 2888end_request:
2054 spin_lock_irq(q->queue_lock); 2889 spin_lock_irq(q->queue_lock);
2055 if (result < 0) { 2890 if (result < 0) {
2056 rbd_warn(rbd_dev, "obj_request %s result %d\n", 2891 rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2057 write_request ? "write" : "read", result); 2892 write_request ? "write" : "read",
2893 length, offset, result);
2894
2058 __blk_end_request_all(rq, result); 2895 __blk_end_request_all(rq, result);
2059 } 2896 }
2060 } 2897 }
@@ -2113,22 +2950,22 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
2113 if (!disk) 2950 if (!disk)
2114 return; 2951 return;
2115 2952
2116 if (disk->flags & GENHD_FL_UP) 2953 rbd_dev->disk = NULL;
2954 if (disk->flags & GENHD_FL_UP) {
2117 del_gendisk(disk); 2955 del_gendisk(disk);
2118 if (disk->queue) 2956 if (disk->queue)
2119 blk_cleanup_queue(disk->queue); 2957 blk_cleanup_queue(disk->queue);
2958 }
2120 put_disk(disk); 2959 put_disk(disk);
2121} 2960}
2122 2961
2123static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 2962static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2124 const char *object_name, 2963 const char *object_name,
2125 u64 offset, u64 length, 2964 u64 offset, u64 length, void *buf)
2126 char *buf, u64 *version)
2127 2965
2128{ 2966{
2129 struct ceph_osd_req_op *op; 2967 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2130 struct rbd_obj_request *obj_request; 2968 struct rbd_obj_request *obj_request;
2131 struct ceph_osd_client *osdc;
2132 struct page **pages = NULL; 2969 struct page **pages = NULL;
2133 u32 page_count; 2970 u32 page_count;
2134 size_t size; 2971 size_t size;
@@ -2148,16 +2985,19 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2148 obj_request->pages = pages; 2985 obj_request->pages = pages;
2149 obj_request->page_count = page_count; 2986 obj_request->page_count = page_count;
2150 2987
2151 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length); 2988 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2152 if (!op)
2153 goto out;
2154 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2155 obj_request, op);
2156 rbd_osd_req_op_destroy(op);
2157 if (!obj_request->osd_req) 2989 if (!obj_request->osd_req)
2158 goto out; 2990 goto out;
2159 2991
2160 osdc = &rbd_dev->rbd_client->client->osdc; 2992 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2993 offset, length, 0, 0);
2994 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
2995 obj_request->pages,
2996 obj_request->length,
2997 obj_request->offset & ~PAGE_MASK,
2998 false, false);
2999 rbd_osd_req_format_read(obj_request);
3000
2161 ret = rbd_obj_request_submit(osdc, obj_request); 3001 ret = rbd_obj_request_submit(osdc, obj_request);
2162 if (ret) 3002 if (ret)
2163 goto out; 3003 goto out;
@@ -2172,10 +3012,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2172 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 3012 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2173 size = (size_t) obj_request->xferred; 3013 size = (size_t) obj_request->xferred;
2174 ceph_copy_from_page_vector(pages, buf, 0, size); 3014 ceph_copy_from_page_vector(pages, buf, 0, size);
2175 rbd_assert(size <= (size_t) INT_MAX); 3015 rbd_assert(size <= (size_t)INT_MAX);
2176 ret = (int) size; 3016 ret = (int)size;
2177 if (version)
2178 *version = obj_request->version;
2179out: 3017out:
2180 if (obj_request) 3018 if (obj_request)
2181 rbd_obj_request_put(obj_request); 3019 rbd_obj_request_put(obj_request);
@@ -2196,7 +3034,7 @@ out:
2196 * Returns a pointer-coded errno if a failure occurs. 3034 * Returns a pointer-coded errno if a failure occurs.
2197 */ 3035 */
2198static struct rbd_image_header_ondisk * 3036static struct rbd_image_header_ondisk *
2199rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 3037rbd_dev_v1_header_read(struct rbd_device *rbd_dev)
2200{ 3038{
2201 struct rbd_image_header_ondisk *ondisk = NULL; 3039 struct rbd_image_header_ondisk *ondisk = NULL;
2202 u32 snap_count = 0; 3040 u32 snap_count = 0;
@@ -2224,11 +3062,10 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2224 return ERR_PTR(-ENOMEM); 3062 return ERR_PTR(-ENOMEM);
2225 3063
2226 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 3064 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
2227 0, size, 3065 0, size, ondisk);
2228 (char *) ondisk, version);
2229 if (ret < 0) 3066 if (ret < 0)
2230 goto out_err; 3067 goto out_err;
2231 if (WARN_ON((size_t) ret < size)) { 3068 if ((size_t)ret < size) {
2232 ret = -ENXIO; 3069 ret = -ENXIO;
2233 rbd_warn(rbd_dev, "short header read (want %zd got %d)", 3070 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2234 size, ret); 3071 size, ret);
@@ -2260,46 +3097,36 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
2260 struct rbd_image_header *header) 3097 struct rbd_image_header *header)
2261{ 3098{
2262 struct rbd_image_header_ondisk *ondisk; 3099 struct rbd_image_header_ondisk *ondisk;
2263 u64 ver = 0;
2264 int ret; 3100 int ret;
2265 3101
2266 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 3102 ondisk = rbd_dev_v1_header_read(rbd_dev);
2267 if (IS_ERR(ondisk)) 3103 if (IS_ERR(ondisk))
2268 return PTR_ERR(ondisk); 3104 return PTR_ERR(ondisk);
2269 ret = rbd_header_from_disk(header, ondisk); 3105 ret = rbd_header_from_disk(header, ondisk);
2270 if (ret >= 0)
2271 header->obj_version = ver;
2272 kfree(ondisk); 3106 kfree(ondisk);
2273 3107
2274 return ret; 3108 return ret;
2275} 3109}
2276 3110
2277static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2278{
2279 struct rbd_snap *snap;
2280 struct rbd_snap *next;
2281
2282 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
2283 rbd_remove_snap_dev(snap);
2284}
2285
2286static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 3111static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2287{ 3112{
2288 sector_t size;
2289
2290 if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 3113 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
2291 return; 3114 return;
2292 3115
2293 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; 3116 if (rbd_dev->mapping.size != rbd_dev->header.image_size) {
2294 dout("setting size to %llu sectors", (unsigned long long) size); 3117 sector_t size;
2295 rbd_dev->mapping.size = (u64) size; 3118
2296 set_capacity(rbd_dev->disk, size); 3119 rbd_dev->mapping.size = rbd_dev->header.image_size;
3120 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3121 dout("setting size to %llu sectors", (unsigned long long)size);
3122 set_capacity(rbd_dev->disk, size);
3123 }
2297} 3124}
2298 3125
2299/* 3126/*
2300 * only read the first part of the ondisk header, without the snaps info 3127 * only read the first part of the ondisk header, without the snaps info
2301 */ 3128 */
2302static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) 3129static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev)
2303{ 3130{
2304 int ret; 3131 int ret;
2305 struct rbd_image_header h; 3132 struct rbd_image_header h;
@@ -2320,37 +3147,61 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
2320 /* osd requests may still refer to snapc */ 3147 /* osd requests may still refer to snapc */
2321 ceph_put_snap_context(rbd_dev->header.snapc); 3148 ceph_put_snap_context(rbd_dev->header.snapc);
2322 3149
2323 if (hver)
2324 *hver = h.obj_version;
2325 rbd_dev->header.obj_version = h.obj_version;
2326 rbd_dev->header.image_size = h.image_size; 3150 rbd_dev->header.image_size = h.image_size;
2327 rbd_dev->header.snapc = h.snapc; 3151 rbd_dev->header.snapc = h.snapc;
2328 rbd_dev->header.snap_names = h.snap_names; 3152 rbd_dev->header.snap_names = h.snap_names;
2329 rbd_dev->header.snap_sizes = h.snap_sizes; 3153 rbd_dev->header.snap_sizes = h.snap_sizes;
2330 /* Free the extra copy of the object prefix */ 3154 /* Free the extra copy of the object prefix */
2331 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 3155 if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
3156 rbd_warn(rbd_dev, "object prefix changed (ignoring)");
2332 kfree(h.object_prefix); 3157 kfree(h.object_prefix);
2333 3158
2334 ret = rbd_dev_snaps_update(rbd_dev);
2335 if (!ret)
2336 ret = rbd_dev_snaps_register(rbd_dev);
2337
2338 up_write(&rbd_dev->header_rwsem); 3159 up_write(&rbd_dev->header_rwsem);
2339 3160
2340 return ret; 3161 return ret;
2341} 3162}
2342 3163
2343static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) 3164/*
3165 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3166 * has disappeared from the (just updated) snapshot context.
3167 */
3168static void rbd_exists_validate(struct rbd_device *rbd_dev)
3169{
3170 u64 snap_id;
3171
3172 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3173 return;
3174
3175 snap_id = rbd_dev->spec->snap_id;
3176 if (snap_id == CEPH_NOSNAP)
3177 return;
3178
3179 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3180 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3181}
3182
3183static int rbd_dev_refresh(struct rbd_device *rbd_dev)
2344{ 3184{
3185 u64 image_size;
2345 int ret; 3186 int ret;
2346 3187
2347 rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 3188 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3189 image_size = rbd_dev->header.image_size;
2348 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3190 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2349 if (rbd_dev->image_format == 1) 3191 if (rbd_dev->image_format == 1)
2350 ret = rbd_dev_v1_refresh(rbd_dev, hver); 3192 ret = rbd_dev_v1_refresh(rbd_dev);
2351 else 3193 else
2352 ret = rbd_dev_v2_refresh(rbd_dev, hver); 3194 ret = rbd_dev_v2_refresh(rbd_dev);
3195
3196 /* If it's a mapped snapshot, validate its EXISTS flag */
3197
3198 rbd_exists_validate(rbd_dev);
2353 mutex_unlock(&ctl_mutex); 3199 mutex_unlock(&ctl_mutex);
3200 if (ret)
3201 rbd_warn(rbd_dev, "got notification but failed to "
3202 " update snaps: %d\n", ret);
3203 if (image_size != rbd_dev->header.image_size)
3204 revalidate_disk(rbd_dev->disk);
2354 3205
2355 return ret; 3206 return ret;
2356} 3207}
@@ -2394,8 +3245,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
2394 3245
2395 rbd_dev->disk = disk; 3246 rbd_dev->disk = disk;
2396 3247
2397 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2398
2399 return 0; 3248 return 0;
2400out_disk: 3249out_disk:
2401 put_disk(disk); 3250 put_disk(disk);
@@ -2416,13 +3265,9 @@ static ssize_t rbd_size_show(struct device *dev,
2416 struct device_attribute *attr, char *buf) 3265 struct device_attribute *attr, char *buf)
2417{ 3266{
2418 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3267 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2419 sector_t size;
2420 3268
2421 down_read(&rbd_dev->header_rwsem); 3269 return sprintf(buf, "%llu\n",
2422 size = get_capacity(rbd_dev->disk); 3270 (unsigned long long)rbd_dev->mapping.size);
2423 up_read(&rbd_dev->header_rwsem);
2424
2425 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2426} 3271}
2427 3272
2428/* 3273/*
@@ -2435,7 +3280,7 @@ static ssize_t rbd_features_show(struct device *dev,
2435 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3280 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2436 3281
2437 return sprintf(buf, "0x%016llx\n", 3282 return sprintf(buf, "0x%016llx\n",
2438 (unsigned long long) rbd_dev->mapping.features); 3283 (unsigned long long)rbd_dev->mapping.features);
2439} 3284}
2440 3285
2441static ssize_t rbd_major_show(struct device *dev, 3286static ssize_t rbd_major_show(struct device *dev,
@@ -2443,7 +3288,11 @@ static ssize_t rbd_major_show(struct device *dev,
2443{ 3288{
2444 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3289 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2445 3290
2446 return sprintf(buf, "%d\n", rbd_dev->major); 3291 if (rbd_dev->major)
3292 return sprintf(buf, "%d\n", rbd_dev->major);
3293
3294 return sprintf(buf, "(none)\n");
3295
2447} 3296}
2448 3297
2449static ssize_t rbd_client_id_show(struct device *dev, 3298static ssize_t rbd_client_id_show(struct device *dev,
@@ -2469,7 +3318,7 @@ static ssize_t rbd_pool_id_show(struct device *dev,
2469 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3318 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2470 3319
2471 return sprintf(buf, "%llu\n", 3320 return sprintf(buf, "%llu\n",
2472 (unsigned long long) rbd_dev->spec->pool_id); 3321 (unsigned long long) rbd_dev->spec->pool_id);
2473} 3322}
2474 3323
2475static ssize_t rbd_name_show(struct device *dev, 3324static ssize_t rbd_name_show(struct device *dev,
@@ -2555,7 +3404,7 @@ static ssize_t rbd_image_refresh(struct device *dev,
2555 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3404 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2556 int ret; 3405 int ret;
2557 3406
2558 ret = rbd_dev_refresh(rbd_dev, NULL); 3407 ret = rbd_dev_refresh(rbd_dev);
2559 3408
2560 return ret < 0 ? ret : size; 3409 return ret < 0 ? ret : size;
2561} 3410}
@@ -2606,71 +3455,6 @@ static struct device_type rbd_device_type = {
2606 .release = rbd_sysfs_dev_release, 3455 .release = rbd_sysfs_dev_release,
2607}; 3456};
2608 3457
2609
2610/*
2611 sysfs - snapshots
2612*/
2613
2614static ssize_t rbd_snap_size_show(struct device *dev,
2615 struct device_attribute *attr,
2616 char *buf)
2617{
2618 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2619
2620 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2621}
2622
2623static ssize_t rbd_snap_id_show(struct device *dev,
2624 struct device_attribute *attr,
2625 char *buf)
2626{
2627 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2628
2629 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2630}
2631
2632static ssize_t rbd_snap_features_show(struct device *dev,
2633 struct device_attribute *attr,
2634 char *buf)
2635{
2636 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2637
2638 return sprintf(buf, "0x%016llx\n",
2639 (unsigned long long) snap->features);
2640}
2641
2642static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2643static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2644static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2645
2646static struct attribute *rbd_snap_attrs[] = {
2647 &dev_attr_snap_size.attr,
2648 &dev_attr_snap_id.attr,
2649 &dev_attr_snap_features.attr,
2650 NULL,
2651};
2652
2653static struct attribute_group rbd_snap_attr_group = {
2654 .attrs = rbd_snap_attrs,
2655};
2656
2657static void rbd_snap_dev_release(struct device *dev)
2658{
2659 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2660 kfree(snap->name);
2661 kfree(snap);
2662}
2663
2664static const struct attribute_group *rbd_snap_attr_groups[] = {
2665 &rbd_snap_attr_group,
2666 NULL
2667};
2668
2669static struct device_type rbd_snap_device_type = {
2670 .groups = rbd_snap_attr_groups,
2671 .release = rbd_snap_dev_release,
2672};
2673
2674static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 3458static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2675{ 3459{
2676 kref_get(&spec->kref); 3460 kref_get(&spec->kref);
@@ -2694,8 +3478,6 @@ static struct rbd_spec *rbd_spec_alloc(void)
2694 return NULL; 3478 return NULL;
2695 kref_init(&spec->kref); 3479 kref_init(&spec->kref);
2696 3480
2697 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2698
2699 return spec; 3481 return spec;
2700} 3482}
2701 3483
@@ -2722,7 +3504,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2722 spin_lock_init(&rbd_dev->lock); 3504 spin_lock_init(&rbd_dev->lock);
2723 rbd_dev->flags = 0; 3505 rbd_dev->flags = 0;
2724 INIT_LIST_HEAD(&rbd_dev->node); 3506 INIT_LIST_HEAD(&rbd_dev->node);
2725 INIT_LIST_HEAD(&rbd_dev->snaps);
2726 init_rwsem(&rbd_dev->header_rwsem); 3507 init_rwsem(&rbd_dev->header_rwsem);
2727 3508
2728 rbd_dev->spec = spec; 3509 rbd_dev->spec = spec;
@@ -2740,96 +3521,11 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2740 3521
2741static void rbd_dev_destroy(struct rbd_device *rbd_dev) 3522static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2742{ 3523{
2743 rbd_spec_put(rbd_dev->parent_spec);
2744 kfree(rbd_dev->header_name);
2745 rbd_put_client(rbd_dev->rbd_client); 3524 rbd_put_client(rbd_dev->rbd_client);
2746 rbd_spec_put(rbd_dev->spec); 3525 rbd_spec_put(rbd_dev->spec);
2747 kfree(rbd_dev); 3526 kfree(rbd_dev);
2748} 3527}
2749 3528
2750static bool rbd_snap_registered(struct rbd_snap *snap)
2751{
2752 bool ret = snap->dev.type == &rbd_snap_device_type;
2753 bool reg = device_is_registered(&snap->dev);
2754
2755 rbd_assert(!ret ^ reg);
2756
2757 return ret;
2758}
2759
2760static void rbd_remove_snap_dev(struct rbd_snap *snap)
2761{
2762 list_del(&snap->node);
2763 if (device_is_registered(&snap->dev))
2764 device_unregister(&snap->dev);
2765}
2766
2767static int rbd_register_snap_dev(struct rbd_snap *snap,
2768 struct device *parent)
2769{
2770 struct device *dev = &snap->dev;
2771 int ret;
2772
2773 dev->type = &rbd_snap_device_type;
2774 dev->parent = parent;
2775 dev->release = rbd_snap_dev_release;
2776 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2777 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2778
2779 ret = device_register(dev);
2780
2781 return ret;
2782}
2783
2784static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2785 const char *snap_name,
2786 u64 snap_id, u64 snap_size,
2787 u64 snap_features)
2788{
2789 struct rbd_snap *snap;
2790 int ret;
2791
2792 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2793 if (!snap)
2794 return ERR_PTR(-ENOMEM);
2795
2796 ret = -ENOMEM;
2797 snap->name = kstrdup(snap_name, GFP_KERNEL);
2798 if (!snap->name)
2799 goto err;
2800
2801 snap->id = snap_id;
2802 snap->size = snap_size;
2803 snap->features = snap_features;
2804
2805 return snap;
2806
2807err:
2808 kfree(snap->name);
2809 kfree(snap);
2810
2811 return ERR_PTR(ret);
2812}
2813
2814static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2815 u64 *snap_size, u64 *snap_features)
2816{
2817 char *snap_name;
2818
2819 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2820
2821 *snap_size = rbd_dev->header.snap_sizes[which];
2822 *snap_features = 0; /* No features for v1 */
2823
2824 /* Skip over names until we find the one we are looking for */
2825
2826 snap_name = rbd_dev->header.snap_names;
2827 while (which--)
2828 snap_name += strlen(snap_name) + 1;
2829
2830 return snap_name;
2831}
2832
2833/* 3529/*
2834 * Get the size and object order for an image snapshot, or if 3530 * Get the size and object order for an image snapshot, or if
2835 * snap_id is CEPH_NOSNAP, gets this information for the base 3531 * snap_id is CEPH_NOSNAP, gets this information for the base
@@ -2847,18 +3543,21 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2847 3543
2848 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3544 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2849 "rbd", "get_size", 3545 "rbd", "get_size",
2850 (char *) &snapid, sizeof (snapid), 3546 &snapid, sizeof (snapid),
2851 (char *) &size_buf, sizeof (size_buf), NULL); 3547 &size_buf, sizeof (size_buf));
2852 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3548 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2853 if (ret < 0) 3549 if (ret < 0)
2854 return ret; 3550 return ret;
3551 if (ret < sizeof (size_buf))
3552 return -ERANGE;
2855 3553
2856 *order = size_buf.order; 3554 if (order)
3555 *order = size_buf.order;
2857 *snap_size = le64_to_cpu(size_buf.size); 3556 *snap_size = le64_to_cpu(size_buf.size);
2858 3557
2859 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 3558 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2860 (unsigned long long) snap_id, (unsigned int) *order, 3559 (unsigned long long)snap_id, (unsigned int)*order,
2861 (unsigned long long) *snap_size); 3560 (unsigned long long)*snap_size);
2862 3561
2863 return 0; 3562 return 0;
2864} 3563}
@@ -2881,17 +3580,16 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2881 return -ENOMEM; 3580 return -ENOMEM;
2882 3581
2883 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3582 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2884 "rbd", "get_object_prefix", 3583 "rbd", "get_object_prefix", NULL, 0,
2885 NULL, 0, 3584 reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
2886 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
2887 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3585 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2888 if (ret < 0) 3586 if (ret < 0)
2889 goto out; 3587 goto out;
2890 3588
2891 p = reply_buf; 3589 p = reply_buf;
2892 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 3590 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2893 p + RBD_OBJ_PREFIX_LEN_MAX, 3591 p + ret, NULL, GFP_NOIO);
2894 NULL, GFP_NOIO); 3592 ret = 0;
2895 3593
2896 if (IS_ERR(rbd_dev->header.object_prefix)) { 3594 if (IS_ERR(rbd_dev->header.object_prefix)) {
2897 ret = PTR_ERR(rbd_dev->header.object_prefix); 3595 ret = PTR_ERR(rbd_dev->header.object_prefix);
@@ -2899,7 +3597,6 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2899 } else { 3597 } else {
2900 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 3598 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2901 } 3599 }
2902
2903out: 3600out:
2904 kfree(reply_buf); 3601 kfree(reply_buf);
2905 3602
@@ -2913,29 +3610,30 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2913 struct { 3610 struct {
2914 __le64 features; 3611 __le64 features;
2915 __le64 incompat; 3612 __le64 incompat;
2916 } features_buf = { 0 }; 3613 } __attribute__ ((packed)) features_buf = { 0 };
2917 u64 incompat; 3614 u64 incompat;
2918 int ret; 3615 int ret;
2919 3616
2920 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3617 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2921 "rbd", "get_features", 3618 "rbd", "get_features",
2922 (char *) &snapid, sizeof (snapid), 3619 &snapid, sizeof (snapid),
2923 (char *) &features_buf, sizeof (features_buf), 3620 &features_buf, sizeof (features_buf));
2924 NULL);
2925 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3621 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2926 if (ret < 0) 3622 if (ret < 0)
2927 return ret; 3623 return ret;
3624 if (ret < sizeof (features_buf))
3625 return -ERANGE;
2928 3626
2929 incompat = le64_to_cpu(features_buf.incompat); 3627 incompat = le64_to_cpu(features_buf.incompat);
2930 if (incompat & ~RBD_FEATURES_ALL) 3628 if (incompat & ~RBD_FEATURES_SUPPORTED)
2931 return -ENXIO; 3629 return -ENXIO;
2932 3630
2933 *snap_features = le64_to_cpu(features_buf.features); 3631 *snap_features = le64_to_cpu(features_buf.features);
2934 3632
2935 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 3633 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2936 (unsigned long long) snap_id, 3634 (unsigned long long)snap_id,
2937 (unsigned long long) *snap_features, 3635 (unsigned long long)*snap_features,
2938 (unsigned long long) le64_to_cpu(features_buf.incompat)); 3636 (unsigned long long)le64_to_cpu(features_buf.incompat));
2939 3637
2940 return 0; 3638 return 0;
2941} 3639}
@@ -2975,15 +3673,15 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2975 snapid = cpu_to_le64(CEPH_NOSNAP); 3673 snapid = cpu_to_le64(CEPH_NOSNAP);
2976 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3674 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2977 "rbd", "get_parent", 3675 "rbd", "get_parent",
2978 (char *) &snapid, sizeof (snapid), 3676 &snapid, sizeof (snapid),
2979 (char *) reply_buf, size, NULL); 3677 reply_buf, size);
2980 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3678 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2981 if (ret < 0) 3679 if (ret < 0)
2982 goto out_err; 3680 goto out_err;
2983 3681
2984 ret = -ERANGE;
2985 p = reply_buf; 3682 p = reply_buf;
2986 end = (char *) reply_buf + size; 3683 end = reply_buf + ret;
3684 ret = -ERANGE;
2987 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 3685 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2988 if (parent_spec->pool_id == CEPH_NOPOOL) 3686 if (parent_spec->pool_id == CEPH_NOPOOL)
2989 goto out; /* No parent? No problem. */ 3687 goto out; /* No parent? No problem. */
@@ -2991,8 +3689,11 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2991 /* The ceph file layout needs to fit pool id in 32 bits */ 3689 /* The ceph file layout needs to fit pool id in 32 bits */
2992 3690
2993 ret = -EIO; 3691 ret = -EIO;
2994 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) 3692 if (parent_spec->pool_id > (u64)U32_MAX) {
2995 goto out; 3693 rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3694 (unsigned long long)parent_spec->pool_id, U32_MAX);
3695 goto out_err;
3696 }
2996 3697
2997 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 3698 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2998 if (IS_ERR(image_id)) { 3699 if (IS_ERR(image_id)) {
@@ -3015,6 +3716,56 @@ out_err:
3015 return ret; 3716 return ret;
3016} 3717}
3017 3718
3719static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3720{
3721 struct {
3722 __le64 stripe_unit;
3723 __le64 stripe_count;
3724 } __attribute__ ((packed)) striping_info_buf = { 0 };
3725 size_t size = sizeof (striping_info_buf);
3726 void *p;
3727 u64 obj_size;
3728 u64 stripe_unit;
3729 u64 stripe_count;
3730 int ret;
3731
3732 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3733 "rbd", "get_stripe_unit_count", NULL, 0,
3734 (char *)&striping_info_buf, size);
3735 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3736 if (ret < 0)
3737 return ret;
3738 if (ret < size)
3739 return -ERANGE;
3740
3741 /*
3742 * We don't actually support the "fancy striping" feature
3743 * (STRIPINGV2) yet, but if the striping sizes are the
3744 * defaults the behavior is the same as before. So find
3745 * out, and only fail if the image has non-default values.
3746 */
3747 ret = -EINVAL;
3748 obj_size = (u64)1 << rbd_dev->header.obj_order;
3749 p = &striping_info_buf;
3750 stripe_unit = ceph_decode_64(&p);
3751 if (stripe_unit != obj_size) {
3752 rbd_warn(rbd_dev, "unsupported stripe unit "
3753 "(got %llu want %llu)",
3754 stripe_unit, obj_size);
3755 return -EINVAL;
3756 }
3757 stripe_count = ceph_decode_64(&p);
3758 if (stripe_count != 1) {
3759 rbd_warn(rbd_dev, "unsupported stripe count "
3760 "(got %llu want 1)", stripe_count);
3761 return -EINVAL;
3762 }
3763 rbd_dev->header.stripe_unit = stripe_unit;
3764 rbd_dev->header.stripe_count = stripe_count;
3765
3766 return 0;
3767}
3768
3018static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 3769static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3019{ 3770{
3020 size_t image_id_size; 3771 size_t image_id_size;
@@ -3036,8 +3787,8 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3036 return NULL; 3787 return NULL;
3037 3788
3038 p = image_id; 3789 p = image_id;
3039 end = (char *) image_id + image_id_size; 3790 end = image_id + image_id_size;
3040 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); 3791 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
3041 3792
3042 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 3793 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3043 reply_buf = kmalloc(size, GFP_KERNEL); 3794 reply_buf = kmalloc(size, GFP_KERNEL);
@@ -3047,11 +3798,12 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3047 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 3798 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
3048 "rbd", "dir_get_name", 3799 "rbd", "dir_get_name",
3049 image_id, image_id_size, 3800 image_id, image_id_size,
3050 (char *) reply_buf, size, NULL); 3801 reply_buf, size);
3051 if (ret < 0) 3802 if (ret < 0)
3052 goto out; 3803 goto out;
3053 p = reply_buf; 3804 p = reply_buf;
3054 end = (char *) reply_buf + size; 3805 end = reply_buf + ret;
3806
3055 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 3807 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3056 if (IS_ERR(image_name)) 3808 if (IS_ERR(image_name))
3057 image_name = NULL; 3809 image_name = NULL;
@@ -3064,69 +3816,134 @@ out:
3064 return image_name; 3816 return image_name;
3065} 3817}
3066 3818
3819static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3820{
3821 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3822 const char *snap_name;
3823 u32 which = 0;
3824
3825 /* Skip over names until we find the one we are looking for */
3826
3827 snap_name = rbd_dev->header.snap_names;
3828 while (which < snapc->num_snaps) {
3829 if (!strcmp(name, snap_name))
3830 return snapc->snaps[which];
3831 snap_name += strlen(snap_name) + 1;
3832 which++;
3833 }
3834 return CEPH_NOSNAP;
3835}
3836
3837static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3838{
3839 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3840 u32 which;
3841 bool found = false;
3842 u64 snap_id;
3843
3844 for (which = 0; !found && which < snapc->num_snaps; which++) {
3845 const char *snap_name;
3846
3847 snap_id = snapc->snaps[which];
3848 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
3849 if (IS_ERR(snap_name))
3850 break;
3851 found = !strcmp(name, snap_name);
3852 kfree(snap_name);
3853 }
3854 return found ? snap_id : CEPH_NOSNAP;
3855}
3856
3067/* 3857/*
3068 * When a parent image gets probed, we only have the pool, image, 3858 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
3069 * and snapshot ids but not the names of any of them. This call 3859 * no snapshot by that name is found, or if an error occurs.
3070 * is made later to fill in those names. It has to be done after
3071 * rbd_dev_snaps_update() has completed because some of the
3072 * information (in particular, snapshot name) is not available
3073 * until then.
3074 */ 3860 */
3075static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) 3861static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3076{ 3862{
3077 struct ceph_osd_client *osdc; 3863 if (rbd_dev->image_format == 1)
3078 const char *name; 3864 return rbd_v1_snap_id_by_name(rbd_dev, name);
3079 void *reply_buf = NULL; 3865
3866 return rbd_v2_snap_id_by_name(rbd_dev, name);
3867}
3868
3869/*
3870 * When an rbd image has a parent image, it is identified by the
3871 * pool, image, and snapshot ids (not names). This function fills
3872 * in the names for those ids. (It's OK if we can't figure out the
3873 * name for an image id, but the pool and snapshot ids should always
3874 * exist and have names.) All names in an rbd spec are dynamically
3875 * allocated.
3876 *
3877 * When an image being mapped (not a parent) is probed, we have the
3878 * pool name and pool id, image name and image id, and the snapshot
3879 * name. The only thing we're missing is the snapshot id.
3880 */
3881static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
3882{
3883 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3884 struct rbd_spec *spec = rbd_dev->spec;
3885 const char *pool_name;
3886 const char *image_name;
3887 const char *snap_name;
3080 int ret; 3888 int ret;
3081 3889
3082 if (rbd_dev->spec->pool_name) 3890 /*
3083 return 0; /* Already have the names */ 3891 * An image being mapped will have the pool name (etc.), but
3892 * we need to look up the snapshot id.
3893 */
3894 if (spec->pool_name) {
3895 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
3896 u64 snap_id;
3897
3898 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
3899 if (snap_id == CEPH_NOSNAP)
3900 return -ENOENT;
3901 spec->snap_id = snap_id;
3902 } else {
3903 spec->snap_id = CEPH_NOSNAP;
3904 }
3084 3905
3085 /* Look up the pool name */ 3906 return 0;
3907 }
3086 3908
3087 osdc = &rbd_dev->rbd_client->client->osdc; 3909 /* Get the pool name; we have to make our own copy of this */
3088 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); 3910
3089 if (!name) { 3911 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
3090 rbd_warn(rbd_dev, "there is no pool with id %llu", 3912 if (!pool_name) {
3091 rbd_dev->spec->pool_id); /* Really a BUG() */ 3913 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
3092 return -EIO; 3914 return -EIO;
3093 } 3915 }
3094 3916 pool_name = kstrdup(pool_name, GFP_KERNEL);
3095 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); 3917 if (!pool_name)
3096 if (!rbd_dev->spec->pool_name)
3097 return -ENOMEM; 3918 return -ENOMEM;
3098 3919
3099 /* Fetch the image name; tolerate failure here */ 3920 /* Fetch the image name; tolerate failure here */
3100 3921
3101 name = rbd_dev_image_name(rbd_dev); 3922 image_name = rbd_dev_image_name(rbd_dev);
3102 if (name) 3923 if (!image_name)
3103 rbd_dev->spec->image_name = (char *) name;
3104 else
3105 rbd_warn(rbd_dev, "unable to get image name"); 3924 rbd_warn(rbd_dev, "unable to get image name");
3106 3925
3107 /* Look up the snapshot name. */ 3926 /* Look up the snapshot name, and make a copy */
3108 3927
3109 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); 3928 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
3110 if (!name) { 3929 if (!snap_name) {
3111 rbd_warn(rbd_dev, "no snapshot with id %llu", 3930 ret = -ENOMEM;
3112 rbd_dev->spec->snap_id); /* Really a BUG() */
3113 ret = -EIO;
3114 goto out_err; 3931 goto out_err;
3115 } 3932 }
3116 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); 3933
3117 if(!rbd_dev->spec->snap_name) 3934 spec->pool_name = pool_name;
3118 goto out_err; 3935 spec->image_name = image_name;
3936 spec->snap_name = snap_name;
3119 3937
3120 return 0; 3938 return 0;
3121out_err: 3939out_err:
3122 kfree(reply_buf); 3940 kfree(image_name);
3123 kfree(rbd_dev->spec->pool_name); 3941 kfree(pool_name);
3124 rbd_dev->spec->pool_name = NULL;
3125 3942
3126 return ret; 3943 return ret;
3127} 3944}
3128 3945
3129static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 3946static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
3130{ 3947{
3131 size_t size; 3948 size_t size;
3132 int ret; 3949 int ret;
@@ -3151,16 +3968,15 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
3151 return -ENOMEM; 3968 return -ENOMEM;
3152 3969
3153 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3970 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3154 "rbd", "get_snapcontext", 3971 "rbd", "get_snapcontext", NULL, 0,
3155 NULL, 0, 3972 reply_buf, size);
3156 reply_buf, size, ver);
3157 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3973 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3158 if (ret < 0) 3974 if (ret < 0)
3159 goto out; 3975 goto out;
3160 3976
3161 ret = -ERANGE;
3162 p = reply_buf; 3977 p = reply_buf;
3163 end = (char *) reply_buf + size; 3978 end = reply_buf + ret;
3979 ret = -ERANGE;
3164 ceph_decode_64_safe(&p, end, seq, out); 3980 ceph_decode_64_safe(&p, end, seq, out);
3165 ceph_decode_32_safe(&p, end, snap_count, out); 3981 ceph_decode_32_safe(&p, end, snap_count, out);
3166 3982
@@ -3177,37 +3993,33 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
3177 } 3993 }
3178 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 3994 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3179 goto out; 3995 goto out;
3996 ret = 0;
3180 3997
3181 size = sizeof (struct ceph_snap_context) + 3998 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
3182 snap_count * sizeof (snapc->snaps[0]);
3183 snapc = kmalloc(size, GFP_KERNEL);
3184 if (!snapc) { 3999 if (!snapc) {
3185 ret = -ENOMEM; 4000 ret = -ENOMEM;
3186 goto out; 4001 goto out;
3187 } 4002 }
3188
3189 atomic_set(&snapc->nref, 1);
3190 snapc->seq = seq; 4003 snapc->seq = seq;
3191 snapc->num_snaps = snap_count;
3192 for (i = 0; i < snap_count; i++) 4004 for (i = 0; i < snap_count; i++)
3193 snapc->snaps[i] = ceph_decode_64(&p); 4005 snapc->snaps[i] = ceph_decode_64(&p);
3194 4006
3195 rbd_dev->header.snapc = snapc; 4007 rbd_dev->header.snapc = snapc;
3196 4008
3197 dout(" snap context seq = %llu, snap_count = %u\n", 4009 dout(" snap context seq = %llu, snap_count = %u\n",
3198 (unsigned long long) seq, (unsigned int) snap_count); 4010 (unsigned long long)seq, (unsigned int)snap_count);
3199
3200out: 4011out:
3201 kfree(reply_buf); 4012 kfree(reply_buf);
3202 4013
3203 return 0; 4014 return ret;
3204} 4015}
3205 4016
3206static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 4017static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4018 u64 snap_id)
3207{ 4019{
3208 size_t size; 4020 size_t size;
3209 void *reply_buf; 4021 void *reply_buf;
3210 __le64 snap_id; 4022 __le64 snapid;
3211 int ret; 4023 int ret;
3212 void *p; 4024 void *p;
3213 void *end; 4025 void *end;
@@ -3218,236 +4030,52 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3218 if (!reply_buf) 4030 if (!reply_buf)
3219 return ERR_PTR(-ENOMEM); 4031 return ERR_PTR(-ENOMEM);
3220 4032
3221 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 4033 snapid = cpu_to_le64(snap_id);
3222 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4034 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3223 "rbd", "get_snapshot_name", 4035 "rbd", "get_snapshot_name",
3224 (char *) &snap_id, sizeof (snap_id), 4036 &snapid, sizeof (snapid),
3225 reply_buf, size, NULL); 4037 reply_buf, size);
3226 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4038 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3227 if (ret < 0) 4039 if (ret < 0) {
4040 snap_name = ERR_PTR(ret);
3228 goto out; 4041 goto out;
4042 }
3229 4043
3230 p = reply_buf; 4044 p = reply_buf;
3231 end = (char *) reply_buf + size; 4045 end = reply_buf + ret;
3232 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 4046 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3233 if (IS_ERR(snap_name)) { 4047 if (IS_ERR(snap_name))
3234 ret = PTR_ERR(snap_name);
3235 goto out; 4048 goto out;
3236 } else {
3237 dout(" snap_id 0x%016llx snap_name = %s\n",
3238 (unsigned long long) le64_to_cpu(snap_id), snap_name);
3239 }
3240 kfree(reply_buf);
3241 4049
3242 return snap_name; 4050 dout(" snap_id 0x%016llx snap_name = %s\n",
4051 (unsigned long long)snap_id, snap_name);
3243out: 4052out:
3244 kfree(reply_buf); 4053 kfree(reply_buf);
3245 4054
3246 return ERR_PTR(ret); 4055 return snap_name;
3247}
3248
3249static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3250 u64 *snap_size, u64 *snap_features)
3251{
3252 u64 snap_id;
3253 u8 order;
3254 int ret;
3255
3256 snap_id = rbd_dev->header.snapc->snaps[which];
3257 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3258 if (ret)
3259 return ERR_PTR(ret);
3260 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3261 if (ret)
3262 return ERR_PTR(ret);
3263
3264 return rbd_dev_v2_snap_name(rbd_dev, which);
3265}
3266
3267static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3268 u64 *snap_size, u64 *snap_features)
3269{
3270 if (rbd_dev->image_format == 1)
3271 return rbd_dev_v1_snap_info(rbd_dev, which,
3272 snap_size, snap_features);
3273 if (rbd_dev->image_format == 2)
3274 return rbd_dev_v2_snap_info(rbd_dev, which,
3275 snap_size, snap_features);
3276 return ERR_PTR(-EINVAL);
3277} 4056}
3278 4057
3279static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) 4058static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev)
3280{ 4059{
3281 int ret; 4060 int ret;
3282 __u8 obj_order;
3283 4061
3284 down_write(&rbd_dev->header_rwsem); 4062 down_write(&rbd_dev->header_rwsem);
3285 4063
3286 /* Grab old order first, to see if it changes */
3287
3288 obj_order = rbd_dev->header.obj_order,
3289 ret = rbd_dev_v2_image_size(rbd_dev); 4064 ret = rbd_dev_v2_image_size(rbd_dev);
3290 if (ret) 4065 if (ret)
3291 goto out; 4066 goto out;
3292 if (rbd_dev->header.obj_order != obj_order) {
3293 ret = -EIO;
3294 goto out;
3295 }
3296 rbd_update_mapping_size(rbd_dev); 4067 rbd_update_mapping_size(rbd_dev);
3297 4068
3298 ret = rbd_dev_v2_snap_context(rbd_dev, hver); 4069 ret = rbd_dev_v2_snap_context(rbd_dev);
3299 dout("rbd_dev_v2_snap_context returned %d\n", ret); 4070 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3300 if (ret) 4071 if (ret)
3301 goto out; 4072 goto out;
3302 ret = rbd_dev_snaps_update(rbd_dev);
3303 dout("rbd_dev_snaps_update returned %d\n", ret);
3304 if (ret)
3305 goto out;
3306 ret = rbd_dev_snaps_register(rbd_dev);
3307 dout("rbd_dev_snaps_register returned %d\n", ret);
3308out: 4073out:
3309 up_write(&rbd_dev->header_rwsem); 4074 up_write(&rbd_dev->header_rwsem);
3310 4075
3311 return ret; 4076 return ret;
3312} 4077}
3313 4078
3314/*
3315 * Scan the rbd device's current snapshot list and compare it to the
3316 * newly-received snapshot context. Remove any existing snapshots
3317 * not present in the new snapshot context. Add a new snapshot for
3318 * any snaphots in the snapshot context not in the current list.
3319 * And verify there are no changes to snapshots we already know
3320 * about.
3321 *
3322 * Assumes the snapshots in the snapshot context are sorted by
3323 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
3324 * are also maintained in that order.)
3325 */
3326static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3327{
3328 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3329 const u32 snap_count = snapc->num_snaps;
3330 struct list_head *head = &rbd_dev->snaps;
3331 struct list_head *links = head->next;
3332 u32 index = 0;
3333
3334 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
3335 while (index < snap_count || links != head) {
3336 u64 snap_id;
3337 struct rbd_snap *snap;
3338 char *snap_name;
3339 u64 snap_size = 0;
3340 u64 snap_features = 0;
3341
3342 snap_id = index < snap_count ? snapc->snaps[index]
3343 : CEPH_NOSNAP;
3344 snap = links != head ? list_entry(links, struct rbd_snap, node)
3345 : NULL;
3346 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3347
3348 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3349 struct list_head *next = links->next;
3350
3351 /*
3352 * A previously-existing snapshot is not in
3353 * the new snap context.
3354 *
3355 * If the now missing snapshot is the one the
3356 * image is mapped to, clear its exists flag
3357 * so we can avoid sending any more requests
3358 * to it.
3359 */
3360 if (rbd_dev->spec->snap_id == snap->id)
3361 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3362 rbd_remove_snap_dev(snap);
3363 dout("%ssnap id %llu has been removed\n",
3364 rbd_dev->spec->snap_id == snap->id ?
3365 "mapped " : "",
3366 (unsigned long long) snap->id);
3367
3368 /* Done with this list entry; advance */
3369
3370 links = next;
3371 continue;
3372 }
3373
3374 snap_name = rbd_dev_snap_info(rbd_dev, index,
3375 &snap_size, &snap_features);
3376 if (IS_ERR(snap_name))
3377 return PTR_ERR(snap_name);
3378
3379 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3380 (unsigned long long) snap_id);
3381 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3382 struct rbd_snap *new_snap;
3383
3384 /* We haven't seen this snapshot before */
3385
3386 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3387 snap_id, snap_size, snap_features);
3388 if (IS_ERR(new_snap)) {
3389 int err = PTR_ERR(new_snap);
3390
3391 dout(" failed to add dev, error %d\n", err);
3392
3393 return err;
3394 }
3395
3396 /* New goes before existing, or at end of list */
3397
3398 dout(" added dev%s\n", snap ? "" : " at end\n");
3399 if (snap)
3400 list_add_tail(&new_snap->node, &snap->node);
3401 else
3402 list_add_tail(&new_snap->node, head);
3403 } else {
3404 /* Already have this one */
3405
3406 dout(" already present\n");
3407
3408 rbd_assert(snap->size == snap_size);
3409 rbd_assert(!strcmp(snap->name, snap_name));
3410 rbd_assert(snap->features == snap_features);
3411
3412 /* Done with this list entry; advance */
3413
3414 links = links->next;
3415 }
3416
3417 /* Advance to the next entry in the snapshot context */
3418
3419 index++;
3420 }
3421 dout("%s: done\n", __func__);
3422
3423 return 0;
3424}
3425
3426/*
3427 * Scan the list of snapshots and register the devices for any that
3428 * have not already been registered.
3429 */
3430static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3431{
3432 struct rbd_snap *snap;
3433 int ret = 0;
3434
3435 dout("%s:\n", __func__);
3436 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3437 return -EIO;
3438
3439 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3440 if (!rbd_snap_registered(snap)) {
3441 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3442 if (ret < 0)
3443 break;
3444 }
3445 }
3446 dout("%s: returning %d\n", __func__, ret);
3447
3448 return ret;
3449}
3450
3451static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 4079static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3452{ 4080{
3453 struct device *dev; 4081 struct device *dev;
@@ -3459,7 +4087,7 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3459 dev->bus = &rbd_bus_type; 4087 dev->bus = &rbd_bus_type;
3460 dev->type = &rbd_device_type; 4088 dev->type = &rbd_device_type;
3461 dev->parent = &rbd_root_dev; 4089 dev->parent = &rbd_root_dev;
3462 dev->release = rbd_dev_release; 4090 dev->release = rbd_dev_device_release;
3463 dev_set_name(dev, "%d", rbd_dev->dev_id); 4091 dev_set_name(dev, "%d", rbd_dev->dev_id);
3464 ret = device_register(dev); 4092 ret = device_register(dev);
3465 4093
@@ -3673,6 +4301,7 @@ static int rbd_add_parse_args(const char *buf,
3673 size_t len; 4301 size_t len;
3674 char *options; 4302 char *options;
3675 const char *mon_addrs; 4303 const char *mon_addrs;
4304 char *snap_name;
3676 size_t mon_addrs_size; 4305 size_t mon_addrs_size;
3677 struct rbd_spec *spec = NULL; 4306 struct rbd_spec *spec = NULL;
3678 struct rbd_options *rbd_opts = NULL; 4307 struct rbd_options *rbd_opts = NULL;
@@ -3731,10 +4360,11 @@ static int rbd_add_parse_args(const char *buf,
3731 ret = -ENAMETOOLONG; 4360 ret = -ENAMETOOLONG;
3732 goto out_err; 4361 goto out_err;
3733 } 4362 }
3734 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 4363 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3735 if (!spec->snap_name) 4364 if (!snap_name)
3736 goto out_mem; 4365 goto out_mem;
3737 *(spec->snap_name + len) = '\0'; 4366 *(snap_name + len) = '\0';
4367 spec->snap_name = snap_name;
3738 4368
3739 /* Initialize all rbd options to the defaults */ 4369 /* Initialize all rbd options to the defaults */
3740 4370
@@ -3788,15 +4418,19 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3788 size_t size; 4418 size_t size;
3789 char *object_name; 4419 char *object_name;
3790 void *response; 4420 void *response;
3791 void *p; 4421 char *image_id;
3792 4422
3793 /* 4423 /*
3794 * When probing a parent image, the image id is already 4424 * When probing a parent image, the image id is already
3795 * known (and the image name likely is not). There's no 4425 * known (and the image name likely is not). There's no
3796 * need to fetch the image id again in this case. 4426 * need to fetch the image id again in this case. We
4427 * do still need to set the image format though.
3797 */ 4428 */
3798 if (rbd_dev->spec->image_id) 4429 if (rbd_dev->spec->image_id) {
4430 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4431
3799 return 0; 4432 return 0;
4433 }
3800 4434
3801 /* 4435 /*
3802 * First, see if the format 2 image id file exists, and if 4436 * First, see if the format 2 image id file exists, and if
@@ -3818,23 +4452,32 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3818 goto out; 4452 goto out;
3819 } 4453 }
3820 4454
4455 /* If it doesn't exist we'll assume it's a format 1 image */
4456
3821 ret = rbd_obj_method_sync(rbd_dev, object_name, 4457 ret = rbd_obj_method_sync(rbd_dev, object_name,
3822 "rbd", "get_id", 4458 "rbd", "get_id", NULL, 0,
3823 NULL, 0, 4459 response, RBD_IMAGE_ID_LEN_MAX);
3824 response, RBD_IMAGE_ID_LEN_MAX, NULL);
3825 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4460 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3826 if (ret < 0) 4461 if (ret == -ENOENT) {
3827 goto out; 4462 image_id = kstrdup("", GFP_KERNEL);
3828 4463 ret = image_id ? 0 : -ENOMEM;
3829 p = response; 4464 if (!ret)
3830 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 4465 rbd_dev->image_format = 1;
3831 p + RBD_IMAGE_ID_LEN_MAX, 4466 } else if (ret > sizeof (__le32)) {
4467 void *p = response;
4468
4469 image_id = ceph_extract_encoded_string(&p, p + ret,
3832 NULL, GFP_NOIO); 4470 NULL, GFP_NOIO);
3833 if (IS_ERR(rbd_dev->spec->image_id)) { 4471 ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
3834 ret = PTR_ERR(rbd_dev->spec->image_id); 4472 if (!ret)
3835 rbd_dev->spec->image_id = NULL; 4473 rbd_dev->image_format = 2;
3836 } else { 4474 } else {
3837 dout("image_id is %s\n", rbd_dev->spec->image_id); 4475 ret = -EINVAL;
4476 }
4477
4478 if (!ret) {
4479 rbd_dev->spec->image_id = image_id;
4480 dout("image_id is %s\n", image_id);
3838 } 4481 }
3839out: 4482out:
3840 kfree(response); 4483 kfree(response);
@@ -3843,27 +4486,30 @@ out:
3843 return ret; 4486 return ret;
3844} 4487}
3845 4488
3846static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 4489/* Undo whatever state changes are made by v1 or v2 image probe */
4490
4491static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
3847{ 4492{
3848 int ret; 4493 struct rbd_image_header *header;
3849 size_t size;
3850 4494
3851 /* Version 1 images have no id; empty string is used */ 4495 rbd_dev_remove_parent(rbd_dev);
4496 rbd_spec_put(rbd_dev->parent_spec);
4497 rbd_dev->parent_spec = NULL;
4498 rbd_dev->parent_overlap = 0;
3852 4499
3853 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); 4500 /* Free dynamic fields from the header, then zero it out */
3854 if (!rbd_dev->spec->image_id)
3855 return -ENOMEM;
3856 4501
3857 /* Record the header object name for this rbd image. */ 4502 header = &rbd_dev->header;
4503 ceph_put_snap_context(header->snapc);
4504 kfree(header->snap_sizes);
4505 kfree(header->snap_names);
4506 kfree(header->object_prefix);
4507 memset(header, 0, sizeof (*header));
4508}
3858 4509
3859 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); 4510static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3860 rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 4511{
3861 if (!rbd_dev->header_name) { 4512 int ret;
3862 ret = -ENOMEM;
3863 goto out_err;
3864 }
3865 sprintf(rbd_dev->header_name, "%s%s",
3866 rbd_dev->spec->image_name, RBD_SUFFIX);
3867 4513
3868 /* Populate rbd image metadata */ 4514 /* Populate rbd image metadata */
3869 4515
@@ -3876,8 +4522,6 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3876 rbd_dev->parent_spec = NULL; 4522 rbd_dev->parent_spec = NULL;
3877 rbd_dev->parent_overlap = 0; 4523 rbd_dev->parent_overlap = 0;
3878 4524
3879 rbd_dev->image_format = 1;
3880
3881 dout("discovered version 1 image, header name is %s\n", 4525 dout("discovered version 1 image, header name is %s\n",
3882 rbd_dev->header_name); 4526 rbd_dev->header_name);
3883 4527
@@ -3894,43 +4538,45 @@ out_err:
3894 4538
3895static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 4539static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3896{ 4540{
3897 size_t size;
3898 int ret; 4541 int ret;
3899 u64 ver = 0;
3900
3901 /*
3902 * Image id was filled in by the caller. Record the header
3903 * object name for this rbd image.
3904 */
3905 size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3906 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3907 if (!rbd_dev->header_name)
3908 return -ENOMEM;
3909 sprintf(rbd_dev->header_name, "%s%s",
3910 RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
3911
3912 /* Get the size and object order for the image */
3913 4542
3914 ret = rbd_dev_v2_image_size(rbd_dev); 4543 ret = rbd_dev_v2_image_size(rbd_dev);
3915 if (ret < 0) 4544 if (ret)
3916 goto out_err; 4545 goto out_err;
3917 4546
3918 /* Get the object prefix (a.k.a. block_name) for the image */ 4547 /* Get the object prefix (a.k.a. block_name) for the image */
3919 4548
3920 ret = rbd_dev_v2_object_prefix(rbd_dev); 4549 ret = rbd_dev_v2_object_prefix(rbd_dev);
3921 if (ret < 0) 4550 if (ret)
3922 goto out_err; 4551 goto out_err;
3923 4552
3924 /* Get the and check features for the image */ 4553 /* Get the and check features for the image */
3925 4554
3926 ret = rbd_dev_v2_features(rbd_dev); 4555 ret = rbd_dev_v2_features(rbd_dev);
3927 if (ret < 0) 4556 if (ret)
3928 goto out_err; 4557 goto out_err;
3929 4558
3930 /* If the image supports layering, get the parent info */ 4559 /* If the image supports layering, get the parent info */
3931 4560
3932 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 4561 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
3933 ret = rbd_dev_v2_parent_info(rbd_dev); 4562 ret = rbd_dev_v2_parent_info(rbd_dev);
4563 if (ret)
4564 goto out_err;
4565
4566 /*
4567 * Don't print a warning for parent images. We can
4568 * tell this point because we won't know its pool
4569 * name yet (just its pool id).
4570 */
4571 if (rbd_dev->spec->pool_name)
4572 rbd_warn(rbd_dev, "WARNING: kernel layering "
4573 "is EXPERIMENTAL!");
4574 }
4575
4576 /* If the image supports fancy striping, get its parameters */
4577
4578 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4579 ret = rbd_dev_v2_striping_info(rbd_dev);
3934 if (ret < 0) 4580 if (ret < 0)
3935 goto out_err; 4581 goto out_err;
3936 } 4582 }
@@ -3942,12 +4588,9 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3942 4588
3943 /* Get the snapshot context, plus the header version */ 4589 /* Get the snapshot context, plus the header version */
3944 4590
3945 ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 4591 ret = rbd_dev_v2_snap_context(rbd_dev);
3946 if (ret) 4592 if (ret)
3947 goto out_err; 4593 goto out_err;
3948 rbd_dev->header.obj_version = ver;
3949
3950 rbd_dev->image_format = 2;
3951 4594
3952 dout("discovered version 2 image, header name is %s\n", 4595 dout("discovered version 2 image, header name is %s\n",
3953 rbd_dev->header_name); 4596 rbd_dev->header_name);
@@ -3965,22 +4608,54 @@ out_err:
3965 return ret; 4608 return ret;
3966} 4609}
3967 4610
3968static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) 4611static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
3969{ 4612{
4613 struct rbd_device *parent = NULL;
4614 struct rbd_spec *parent_spec;
4615 struct rbd_client *rbdc;
3970 int ret; 4616 int ret;
3971 4617
3972 /* no need to lock here, as rbd_dev is not registered yet */ 4618 if (!rbd_dev->parent_spec)
3973 ret = rbd_dev_snaps_update(rbd_dev); 4619 return 0;
3974 if (ret) 4620 /*
3975 return ret; 4621 * We need to pass a reference to the client and the parent
4622 * spec when creating the parent rbd_dev. Images related by
4623 * parent/child relationships always share both.
4624 */
4625 parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4626 rbdc = __rbd_get_client(rbd_dev->rbd_client);
3976 4627
3977 ret = rbd_dev_probe_update_spec(rbd_dev); 4628 ret = -ENOMEM;
3978 if (ret) 4629 parent = rbd_dev_create(rbdc, parent_spec);
3979 goto err_out_snaps; 4630 if (!parent)
4631 goto out_err;
4632
4633 ret = rbd_dev_image_probe(parent);
4634 if (ret < 0)
4635 goto out_err;
4636 rbd_dev->parent = parent;
4637
4638 return 0;
4639out_err:
4640 if (parent) {
4641 rbd_spec_put(rbd_dev->parent_spec);
4642 kfree(rbd_dev->header_name);
4643 rbd_dev_destroy(parent);
4644 } else {
4645 rbd_put_client(rbdc);
4646 rbd_spec_put(parent_spec);
4647 }
4648
4649 return ret;
4650}
4651
4652static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4653{
4654 int ret;
3980 4655
3981 ret = rbd_dev_set_mapping(rbd_dev); 4656 ret = rbd_dev_mapping_set(rbd_dev);
3982 if (ret) 4657 if (ret)
3983 goto err_out_snaps; 4658 return ret;
3984 4659
3985 /* generate unique id: find highest unique id, add one */ 4660 /* generate unique id: find highest unique id, add one */
3986 rbd_dev_id_get(rbd_dev); 4661 rbd_dev_id_get(rbd_dev);
@@ -4007,54 +4682,81 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
4007 if (ret) 4682 if (ret)
4008 goto err_out_disk; 4683 goto err_out_disk;
4009 4684
4010 /*
4011 * At this point cleanup in the event of an error is the job
4012 * of the sysfs code (initiated by rbd_bus_del_dev()).
4013 */
4014 down_write(&rbd_dev->header_rwsem);
4015 ret = rbd_dev_snaps_register(rbd_dev);
4016 up_write(&rbd_dev->header_rwsem);
4017 if (ret)
4018 goto err_out_bus;
4019
4020 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4021 if (ret)
4022 goto err_out_bus;
4023
4024 /* Everything's ready. Announce the disk to the world. */ 4685 /* Everything's ready. Announce the disk to the world. */
4025 4686
4687 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4688 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4026 add_disk(rbd_dev->disk); 4689 add_disk(rbd_dev->disk);
4027 4690
4028 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 4691 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4029 (unsigned long long) rbd_dev->mapping.size); 4692 (unsigned long long) rbd_dev->mapping.size);
4030 4693
4031 return ret; 4694 return ret;
4032err_out_bus:
4033 /* this will also clean up rest of rbd_dev stuff */
4034 4695
4035 rbd_bus_del_dev(rbd_dev);
4036
4037 return ret;
4038err_out_disk: 4696err_out_disk:
4039 rbd_free_disk(rbd_dev); 4697 rbd_free_disk(rbd_dev);
4040err_out_blkdev: 4698err_out_blkdev:
4041 unregister_blkdev(rbd_dev->major, rbd_dev->name); 4699 unregister_blkdev(rbd_dev->major, rbd_dev->name);
4042err_out_id: 4700err_out_id:
4043 rbd_dev_id_put(rbd_dev); 4701 rbd_dev_id_put(rbd_dev);
4044err_out_snaps: 4702 rbd_dev_mapping_clear(rbd_dev);
4045 rbd_remove_all_snaps(rbd_dev);
4046 4703
4047 return ret; 4704 return ret;
4048} 4705}
4049 4706
4707static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4708{
4709 struct rbd_spec *spec = rbd_dev->spec;
4710 size_t size;
4711
4712 /* Record the header object name for this rbd image. */
4713
4714 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4715
4716 if (rbd_dev->image_format == 1)
4717 size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4718 else
4719 size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4720
4721 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4722 if (!rbd_dev->header_name)
4723 return -ENOMEM;
4724
4725 if (rbd_dev->image_format == 1)
4726 sprintf(rbd_dev->header_name, "%s%s",
4727 spec->image_name, RBD_SUFFIX);
4728 else
4729 sprintf(rbd_dev->header_name, "%s%s",
4730 RBD_HEADER_PREFIX, spec->image_id);
4731 return 0;
4732}
4733
4734static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4735{
4736 int ret;
4737
4738 rbd_dev_unprobe(rbd_dev);
4739 ret = rbd_dev_header_watch_sync(rbd_dev, 0);
4740 if (ret)
4741 rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
4742 kfree(rbd_dev->header_name);
4743 rbd_dev->header_name = NULL;
4744 rbd_dev->image_format = 0;
4745 kfree(rbd_dev->spec->image_id);
4746 rbd_dev->spec->image_id = NULL;
4747
4748 rbd_dev_destroy(rbd_dev);
4749}
4750
4050/* 4751/*
4051 * Probe for the existence of the header object for the given rbd 4752 * Probe for the existence of the header object for the given rbd
4052 * device. For format 2 images this includes determining the image 4753 * device. For format 2 images this includes determining the image
4053 * id. 4754 * id.
4054 */ 4755 */
4055static int rbd_dev_probe(struct rbd_device *rbd_dev) 4756static int rbd_dev_image_probe(struct rbd_device *rbd_dev)
4056{ 4757{
4057 int ret; 4758 int ret;
4759 int tmp;
4058 4760
4059 /* 4761 /*
4060 * Get the id from the image id object. If it's not a 4762 * Get the id from the image id object. If it's not a
@@ -4063,18 +4765,48 @@ static int rbd_dev_probe(struct rbd_device *rbd_dev)
4063 */ 4765 */
4064 ret = rbd_dev_image_id(rbd_dev); 4766 ret = rbd_dev_image_id(rbd_dev);
4065 if (ret) 4767 if (ret)
4768 return ret;
4769 rbd_assert(rbd_dev->spec->image_id);
4770 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4771
4772 ret = rbd_dev_header_name(rbd_dev);
4773 if (ret)
4774 goto err_out_format;
4775
4776 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4777 if (ret)
4778 goto out_header_name;
4779
4780 if (rbd_dev->image_format == 1)
4066 ret = rbd_dev_v1_probe(rbd_dev); 4781 ret = rbd_dev_v1_probe(rbd_dev);
4067 else 4782 else
4068 ret = rbd_dev_v2_probe(rbd_dev); 4783 ret = rbd_dev_v2_probe(rbd_dev);
4069 if (ret) { 4784 if (ret)
4070 dout("probe failed, returning %d\n", ret); 4785 goto err_out_watch;
4071
4072 return ret;
4073 }
4074 4786
4075 ret = rbd_dev_probe_finish(rbd_dev); 4787 ret = rbd_dev_spec_update(rbd_dev);
4076 if (ret) 4788 if (ret)
4077 rbd_header_free(&rbd_dev->header); 4789 goto err_out_probe;
4790
4791 ret = rbd_dev_probe_parent(rbd_dev);
4792 if (!ret)
4793 return 0;
4794
4795err_out_probe:
4796 rbd_dev_unprobe(rbd_dev);
4797err_out_watch:
4798 tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
4799 if (tmp)
4800 rbd_warn(rbd_dev, "unable to tear down watch request\n");
4801out_header_name:
4802 kfree(rbd_dev->header_name);
4803 rbd_dev->header_name = NULL;
4804err_out_format:
4805 rbd_dev->image_format = 0;
4806 kfree(rbd_dev->spec->image_id);
4807 rbd_dev->spec->image_id = NULL;
4808
4809 dout("probe failed, returning %d\n", ret);
4078 4810
4079 return ret; 4811 return ret;
4080} 4812}
@@ -4111,11 +4843,13 @@ static ssize_t rbd_add(struct bus_type *bus,
4111 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 4843 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4112 if (rc < 0) 4844 if (rc < 0)
4113 goto err_out_client; 4845 goto err_out_client;
4114 spec->pool_id = (u64) rc; 4846 spec->pool_id = (u64)rc;
4115 4847
4116 /* The ceph file layout needs to fit pool id in 32 bits */ 4848 /* The ceph file layout needs to fit pool id in 32 bits */
4117 4849
4118 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { 4850 if (spec->pool_id > (u64)U32_MAX) {
4851 rbd_warn(NULL, "pool id too large (%llu > %u)\n",
4852 (unsigned long long)spec->pool_id, U32_MAX);
4119 rc = -EIO; 4853 rc = -EIO;
4120 goto err_out_client; 4854 goto err_out_client;
4121 } 4855 }
@@ -4130,11 +4864,15 @@ static ssize_t rbd_add(struct bus_type *bus,
4130 kfree(rbd_opts); 4864 kfree(rbd_opts);
4131 rbd_opts = NULL; /* done with this */ 4865 rbd_opts = NULL; /* done with this */
4132 4866
4133 rc = rbd_dev_probe(rbd_dev); 4867 rc = rbd_dev_image_probe(rbd_dev);
4134 if (rc < 0) 4868 if (rc < 0)
4135 goto err_out_rbd_dev; 4869 goto err_out_rbd_dev;
4136 4870
4137 return count; 4871 rc = rbd_dev_device_setup(rbd_dev);
4872 if (!rc)
4873 return count;
4874
4875 rbd_dev_image_release(rbd_dev);
4138err_out_rbd_dev: 4876err_out_rbd_dev:
4139 rbd_dev_destroy(rbd_dev); 4877 rbd_dev_destroy(rbd_dev);
4140err_out_client: 4878err_out_client:
@@ -4149,7 +4887,7 @@ err_out_module:
4149 4887
4150 dout("Error adding device %s\n", buf); 4888 dout("Error adding device %s\n", buf);
4151 4889
4152 return (ssize_t) rc; 4890 return (ssize_t)rc;
4153} 4891}
4154 4892
4155static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 4893static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
@@ -4169,27 +4907,43 @@ static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4169 return NULL; 4907 return NULL;
4170} 4908}
4171 4909
4172static void rbd_dev_release(struct device *dev) 4910static void rbd_dev_device_release(struct device *dev)
4173{ 4911{
4174 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4912 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4175 4913
4176 if (rbd_dev->watch_event)
4177 rbd_dev_header_watch_sync(rbd_dev, 0);
4178
4179 /* clean up and free blkdev */
4180 rbd_free_disk(rbd_dev); 4914 rbd_free_disk(rbd_dev);
4915 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4916 rbd_dev_clear_mapping(rbd_dev);
4181 unregister_blkdev(rbd_dev->major, rbd_dev->name); 4917 unregister_blkdev(rbd_dev->major, rbd_dev->name);
4182 4918 rbd_dev->major = 0;
4183 /* release allocated disk header fields */
4184 rbd_header_free(&rbd_dev->header);
4185
4186 /* done with the id, and with the rbd_dev */
4187 rbd_dev_id_put(rbd_dev); 4919 rbd_dev_id_put(rbd_dev);
4188 rbd_assert(rbd_dev->rbd_client != NULL); 4920 rbd_dev_mapping_clear(rbd_dev);
4189 rbd_dev_destroy(rbd_dev); 4921}
4190 4922
4191 /* release module ref */ 4923static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
4192 module_put(THIS_MODULE); 4924{
4925 while (rbd_dev->parent) {
4926 struct rbd_device *first = rbd_dev;
4927 struct rbd_device *second = first->parent;
4928 struct rbd_device *third;
4929
4930 /*
4931 * Follow to the parent with no grandparent and
4932 * remove it.
4933 */
4934 while (second && (third = second->parent)) {
4935 first = second;
4936 second = third;
4937 }
4938 rbd_assert(second);
4939 rbd_dev_image_release(second);
4940 first->parent = NULL;
4941 first->parent_overlap = 0;
4942
4943 rbd_assert(first->parent_spec);
4944 rbd_spec_put(first->parent_spec);
4945 first->parent_spec = NULL;
4946 }
4193} 4947}
4194 4948
4195static ssize_t rbd_remove(struct bus_type *bus, 4949static ssize_t rbd_remove(struct bus_type *bus,
@@ -4197,13 +4951,13 @@ static ssize_t rbd_remove(struct bus_type *bus,
4197 size_t count) 4951 size_t count)
4198{ 4952{
4199 struct rbd_device *rbd_dev = NULL; 4953 struct rbd_device *rbd_dev = NULL;
4200 int target_id, rc; 4954 int target_id;
4201 unsigned long ul; 4955 unsigned long ul;
4202 int ret = count; 4956 int ret;
4203 4957
4204 rc = strict_strtoul(buf, 10, &ul); 4958 ret = strict_strtoul(buf, 10, &ul);
4205 if (rc) 4959 if (ret)
4206 return rc; 4960 return ret;
4207 4961
4208 /* convert to int; abort if we lost anything in the conversion */ 4962 /* convert to int; abort if we lost anything in the conversion */
4209 target_id = (int) ul; 4963 target_id = (int) ul;
@@ -4226,10 +4980,10 @@ static ssize_t rbd_remove(struct bus_type *bus,
4226 spin_unlock_irq(&rbd_dev->lock); 4980 spin_unlock_irq(&rbd_dev->lock);
4227 if (ret < 0) 4981 if (ret < 0)
4228 goto done; 4982 goto done;
4229 4983 ret = count;
4230 rbd_remove_all_snaps(rbd_dev);
4231 rbd_bus_del_dev(rbd_dev); 4984 rbd_bus_del_dev(rbd_dev);
4232 4985 rbd_dev_image_release(rbd_dev);
4986 module_put(THIS_MODULE);
4233done: 4987done:
4234 mutex_unlock(&ctl_mutex); 4988 mutex_unlock(&ctl_mutex);
4235 4989
@@ -4261,6 +5015,56 @@ static void rbd_sysfs_cleanup(void)
4261 device_unregister(&rbd_root_dev); 5015 device_unregister(&rbd_root_dev);
4262} 5016}
4263 5017
5018static int rbd_slab_init(void)
5019{
5020 rbd_assert(!rbd_img_request_cache);
5021 rbd_img_request_cache = kmem_cache_create("rbd_img_request",
5022 sizeof (struct rbd_img_request),
5023 __alignof__(struct rbd_img_request),
5024 0, NULL);
5025 if (!rbd_img_request_cache)
5026 return -ENOMEM;
5027
5028 rbd_assert(!rbd_obj_request_cache);
5029 rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5030 sizeof (struct rbd_obj_request),
5031 __alignof__(struct rbd_obj_request),
5032 0, NULL);
5033 if (!rbd_obj_request_cache)
5034 goto out_err;
5035
5036 rbd_assert(!rbd_segment_name_cache);
5037 rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
5038 MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
5039 if (rbd_segment_name_cache)
5040 return 0;
5041out_err:
5042 if (rbd_obj_request_cache) {
5043 kmem_cache_destroy(rbd_obj_request_cache);
5044 rbd_obj_request_cache = NULL;
5045 }
5046
5047 kmem_cache_destroy(rbd_img_request_cache);
5048 rbd_img_request_cache = NULL;
5049
5050 return -ENOMEM;
5051}
5052
5053static void rbd_slab_exit(void)
5054{
5055 rbd_assert(rbd_segment_name_cache);
5056 kmem_cache_destroy(rbd_segment_name_cache);
5057 rbd_segment_name_cache = NULL;
5058
5059 rbd_assert(rbd_obj_request_cache);
5060 kmem_cache_destroy(rbd_obj_request_cache);
5061 rbd_obj_request_cache = NULL;
5062
5063 rbd_assert(rbd_img_request_cache);
5064 kmem_cache_destroy(rbd_img_request_cache);
5065 rbd_img_request_cache = NULL;
5066}
5067
4264static int __init rbd_init(void) 5068static int __init rbd_init(void)
4265{ 5069{
4266 int rc; 5070 int rc;
@@ -4270,16 +5074,22 @@ static int __init rbd_init(void)
4270 5074
4271 return -EINVAL; 5075 return -EINVAL;
4272 } 5076 }
4273 rc = rbd_sysfs_init(); 5077 rc = rbd_slab_init();
4274 if (rc) 5078 if (rc)
4275 return rc; 5079 return rc;
4276 pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 5080 rc = rbd_sysfs_init();
4277 return 0; 5081 if (rc)
5082 rbd_slab_exit();
5083 else
5084 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
5085
5086 return rc;
4278} 5087}
4279 5088
4280static void __exit rbd_exit(void) 5089static void __exit rbd_exit(void)
4281{ 5090{
4282 rbd_sysfs_cleanup(); 5091 rbd_sysfs_cleanup();
5092 rbd_slab_exit();
4283} 5093}
4284 5094
4285module_init(rbd_init); 5095module_init(rbd_init);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a60ea977af6f..3e68ac101040 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -236,15 +236,21 @@ static int ceph_readpage(struct file *filp, struct page *page)
236static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) 236static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
237{ 237{
238 struct inode *inode = req->r_inode; 238 struct inode *inode = req->r_inode;
239 struct ceph_osd_data *osd_data;
239 int rc = req->r_result; 240 int rc = req->r_result;
240 int bytes = le32_to_cpu(msg->hdr.data_len); 241 int bytes = le32_to_cpu(msg->hdr.data_len);
242 int num_pages;
241 int i; 243 int i;
242 244
243 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); 245 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
244 246
245 /* unlock all pages, zeroing any data we didn't read */ 247 /* unlock all pages, zeroing any data we didn't read */
246 for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { 248 osd_data = osd_req_op_extent_osd_data(req, 0);
247 struct page *page = req->r_pages[i]; 249 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
250 num_pages = calc_pages_for((u64)osd_data->alignment,
251 (u64)osd_data->length);
252 for (i = 0; i < num_pages; i++) {
253 struct page *page = osd_data->pages[i];
248 254
249 if (bytes < (int)PAGE_CACHE_SIZE) { 255 if (bytes < (int)PAGE_CACHE_SIZE) {
250 /* zero (remainder of) page */ 256 /* zero (remainder of) page */
@@ -257,8 +263,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
257 SetPageUptodate(page); 263 SetPageUptodate(page);
258 unlock_page(page); 264 unlock_page(page);
259 page_cache_release(page); 265 page_cache_release(page);
266 bytes -= PAGE_CACHE_SIZE;
260 } 267 }
261 kfree(req->r_pages); 268 kfree(osd_data->pages);
262} 269}
263 270
264static void ceph_unlock_page_vector(struct page **pages, int num_pages) 271static void ceph_unlock_page_vector(struct page **pages, int num_pages)
@@ -279,6 +286,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
279 &ceph_inode_to_client(inode)->client->osdc; 286 &ceph_inode_to_client(inode)->client->osdc;
280 struct ceph_inode_info *ci = ceph_inode(inode); 287 struct ceph_inode_info *ci = ceph_inode(inode);
281 struct page *page = list_entry(page_list->prev, struct page, lru); 288 struct page *page = list_entry(page_list->prev, struct page, lru);
289 struct ceph_vino vino;
282 struct ceph_osd_request *req; 290 struct ceph_osd_request *req;
283 u64 off; 291 u64 off;
284 u64 len; 292 u64 len;
@@ -303,18 +311,17 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
303 len = nr_pages << PAGE_CACHE_SHIFT; 311 len = nr_pages << PAGE_CACHE_SHIFT;
304 dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, 312 dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
305 off, len); 313 off, len);
306 314 vino = ceph_vino(inode);
307 req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), 315 req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
308 off, &len, 316 1, CEPH_OSD_OP_READ,
309 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 317 CEPH_OSD_FLAG_READ, NULL,
310 NULL, 0,
311 ci->i_truncate_seq, ci->i_truncate_size, 318 ci->i_truncate_seq, ci->i_truncate_size,
312 NULL, false, 0); 319 false);
313 if (IS_ERR(req)) 320 if (IS_ERR(req))
314 return PTR_ERR(req); 321 return PTR_ERR(req);
315 322
316 /* build page vector */ 323 /* build page vector */
317 nr_pages = len >> PAGE_CACHE_SHIFT; 324 nr_pages = calc_pages_for(0, len);
318 pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); 325 pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
319 ret = -ENOMEM; 326 ret = -ENOMEM;
320 if (!pages) 327 if (!pages)
@@ -336,11 +343,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
336 } 343 }
337 pages[i] = page; 344 pages[i] = page;
338 } 345 }
339 req->r_pages = pages; 346 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
340 req->r_num_pages = nr_pages;
341 req->r_callback = finish_read; 347 req->r_callback = finish_read;
342 req->r_inode = inode; 348 req->r_inode = inode;
343 349
350 ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
351
344 dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); 352 dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
345 ret = ceph_osdc_start_request(osdc, req, false); 353 ret = ceph_osdc_start_request(osdc, req, false);
346 if (ret < 0) 354 if (ret < 0)
@@ -373,7 +381,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
373 max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) 381 max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
374 >> PAGE_SHIFT; 382 >> PAGE_SHIFT;
375 383
376 dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, 384 dout("readpages %p file %p nr_pages %d max %d\n", inode,
385 file, nr_pages,
377 max); 386 max);
378 while (!list_empty(page_list)) { 387 while (!list_empty(page_list)) {
379 rc = start_read(inode, page_list, max); 388 rc = start_read(inode, page_list, max);
@@ -548,17 +557,23 @@ static void writepages_finish(struct ceph_osd_request *req,
548{ 557{
549 struct inode *inode = req->r_inode; 558 struct inode *inode = req->r_inode;
550 struct ceph_inode_info *ci = ceph_inode(inode); 559 struct ceph_inode_info *ci = ceph_inode(inode);
560 struct ceph_osd_data *osd_data;
551 unsigned wrote; 561 unsigned wrote;
552 struct page *page; 562 struct page *page;
563 int num_pages;
553 int i; 564 int i;
554 struct ceph_snap_context *snapc = req->r_snapc; 565 struct ceph_snap_context *snapc = req->r_snapc;
555 struct address_space *mapping = inode->i_mapping; 566 struct address_space *mapping = inode->i_mapping;
556 int rc = req->r_result; 567 int rc = req->r_result;
557 u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); 568 u64 bytes = req->r_ops[0].extent.length;
558 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 569 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
559 long writeback_stat; 570 long writeback_stat;
560 unsigned issued = ceph_caps_issued(ci); 571 unsigned issued = ceph_caps_issued(ci);
561 572
573 osd_data = osd_req_op_extent_osd_data(req, 0);
574 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
575 num_pages = calc_pages_for((u64)osd_data->alignment,
576 (u64)osd_data->length);
562 if (rc >= 0) { 577 if (rc >= 0) {
563 /* 578 /*
564 * Assume we wrote the pages we originally sent. The 579 * Assume we wrote the pages we originally sent. The
@@ -566,7 +581,7 @@ static void writepages_finish(struct ceph_osd_request *req,
566 * raced with a truncation and was adjusted at the osd, 581 * raced with a truncation and was adjusted at the osd,
567 * so don't believe the reply. 582 * so don't believe the reply.
568 */ 583 */
569 wrote = req->r_num_pages; 584 wrote = num_pages;
570 } else { 585 } else {
571 wrote = 0; 586 wrote = 0;
572 mapping_set_error(mapping, rc); 587 mapping_set_error(mapping, rc);
@@ -575,8 +590,8 @@ static void writepages_finish(struct ceph_osd_request *req,
575 inode, rc, bytes, wrote); 590 inode, rc, bytes, wrote);
576 591
577 /* clean all pages */ 592 /* clean all pages */
578 for (i = 0; i < req->r_num_pages; i++) { 593 for (i = 0; i < num_pages; i++) {
579 page = req->r_pages[i]; 594 page = osd_data->pages[i];
580 BUG_ON(!page); 595 BUG_ON(!page);
581 WARN_ON(!PageUptodate(page)); 596 WARN_ON(!PageUptodate(page));
582 597
@@ -605,32 +620,34 @@ static void writepages_finish(struct ceph_osd_request *req,
605 unlock_page(page); 620 unlock_page(page);
606 } 621 }
607 dout("%p wrote+cleaned %d pages\n", inode, wrote); 622 dout("%p wrote+cleaned %d pages\n", inode, wrote);
608 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); 623 ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
609 624
610 ceph_release_pages(req->r_pages, req->r_num_pages); 625 ceph_release_pages(osd_data->pages, num_pages);
611 if (req->r_pages_from_pool) 626 if (osd_data->pages_from_pool)
612 mempool_free(req->r_pages, 627 mempool_free(osd_data->pages,
613 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); 628 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
614 else 629 else
615 kfree(req->r_pages); 630 kfree(osd_data->pages);
616 ceph_osdc_put_request(req); 631 ceph_osdc_put_request(req);
617} 632}
618 633
619/* 634static struct ceph_osd_request *
620 * allocate a page vec, either directly, or if necessary, via a the 635ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len,
621 * mempool. we avoid the mempool if we can because req->r_num_pages 636 struct ceph_snap_context *snapc, int num_ops)
622 * may be less than the maximum write size.
623 */
624static void alloc_page_vec(struct ceph_fs_client *fsc,
625 struct ceph_osd_request *req)
626{ 637{
627 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, 638 struct ceph_fs_client *fsc;
628 GFP_NOFS); 639 struct ceph_inode_info *ci;
629 if (!req->r_pages) { 640 struct ceph_vino vino;
630 req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); 641
631 req->r_pages_from_pool = 1; 642 fsc = ceph_inode_to_client(inode);
632 WARN_ON(!req->r_pages); 643 ci = ceph_inode(inode);
633 } 644 vino = ceph_vino(inode);
645 /* BUG_ON(vino.snap != CEPH_NOSNAP); */
646
647 return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
648 vino, offset, len, num_ops, CEPH_OSD_OP_WRITE,
649 CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK,
650 snapc, ci->i_truncate_seq, ci->i_truncate_size, true);
634} 651}
635 652
636/* 653/*
@@ -653,7 +670,7 @@ static int ceph_writepages_start(struct address_space *mapping,
653 unsigned wsize = 1 << inode->i_blkbits; 670 unsigned wsize = 1 << inode->i_blkbits;
654 struct ceph_osd_request *req = NULL; 671 struct ceph_osd_request *req = NULL;
655 int do_sync; 672 int do_sync;
656 u64 snap_size = 0; 673 u64 snap_size;
657 674
658 /* 675 /*
659 * Include a 'sync' in the OSD request if this is a data 676 * Include a 'sync' in the OSD request if this is a data
@@ -699,6 +716,7 @@ static int ceph_writepages_start(struct address_space *mapping,
699retry: 716retry:
700 /* find oldest snap context with dirty data */ 717 /* find oldest snap context with dirty data */
701 ceph_put_snap_context(snapc); 718 ceph_put_snap_context(snapc);
719 snap_size = 0;
702 snapc = get_oldest_context(inode, &snap_size); 720 snapc = get_oldest_context(inode, &snap_size);
703 if (!snapc) { 721 if (!snapc) {
704 /* hmm, why does writepages get called when there 722 /* hmm, why does writepages get called when there
@@ -706,6 +724,8 @@ retry:
706 dout(" no snap context with dirty data?\n"); 724 dout(" no snap context with dirty data?\n");
707 goto out; 725 goto out;
708 } 726 }
727 if (snap_size == 0)
728 snap_size = i_size_read(inode);
709 dout(" oldest snapc is %p seq %lld (%d snaps)\n", 729 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
710 snapc, snapc->seq, snapc->num_snaps); 730 snapc, snapc->seq, snapc->num_snaps);
711 if (last_snapc && snapc != last_snapc) { 731 if (last_snapc && snapc != last_snapc) {
@@ -718,10 +738,14 @@ retry:
718 last_snapc = snapc; 738 last_snapc = snapc;
719 739
720 while (!done && index <= end) { 740 while (!done && index <= end) {
741 int num_ops = do_sync ? 2 : 1;
742 struct ceph_vino vino;
721 unsigned i; 743 unsigned i;
722 int first; 744 int first;
723 pgoff_t next; 745 pgoff_t next;
724 int pvec_pages, locked_pages; 746 int pvec_pages, locked_pages;
747 struct page **pages = NULL;
748 mempool_t *pool = NULL; /* Becomes non-null if mempool used */
725 struct page *page; 749 struct page *page;
726 int want; 750 int want;
727 u64 offset, len; 751 u64 offset, len;
@@ -773,11 +797,8 @@ get_more_pages:
773 dout("waiting on writeback %p\n", page); 797 dout("waiting on writeback %p\n", page);
774 wait_on_page_writeback(page); 798 wait_on_page_writeback(page);
775 } 799 }
776 if ((snap_size && page_offset(page) > snap_size) || 800 if (page_offset(page) >= snap_size) {
777 (!snap_size && 801 dout("%p page eof %llu\n", page, snap_size);
778 page_offset(page) > i_size_read(inode))) {
779 dout("%p page eof %llu\n", page, snap_size ?
780 snap_size : i_size_read(inode));
781 done = 1; 802 done = 1;
782 unlock_page(page); 803 unlock_page(page);
783 break; 804 break;
@@ -805,22 +826,23 @@ get_more_pages:
805 break; 826 break;
806 } 827 }
807 828
808 /* ok */ 829 /*
830 * We have something to write. If this is
831 * the first locked page this time through,
832 * allocate an osd request and a page array
833 * that it will use.
834 */
809 if (locked_pages == 0) { 835 if (locked_pages == 0) {
836 size_t size;
837
838 BUG_ON(pages);
839
810 /* prepare async write request */ 840 /* prepare async write request */
811 offset = (u64) page_offset(page); 841 offset = (u64)page_offset(page);
812 len = wsize; 842 len = wsize;
813 req = ceph_osdc_new_request(&fsc->client->osdc, 843 req = ceph_writepages_osd_request(inode,
814 &ci->i_layout, 844 offset, &len, snapc,
815 ceph_vino(inode), 845 num_ops);
816 offset, &len,
817 CEPH_OSD_OP_WRITE,
818 CEPH_OSD_FLAG_WRITE |
819 CEPH_OSD_FLAG_ONDISK,
820 snapc, do_sync,
821 ci->i_truncate_seq,
822 ci->i_truncate_size,
823 &inode->i_mtime, true, 0);
824 846
825 if (IS_ERR(req)) { 847 if (IS_ERR(req)) {
826 rc = PTR_ERR(req); 848 rc = PTR_ERR(req);
@@ -828,11 +850,17 @@ get_more_pages:
828 break; 850 break;
829 } 851 }
830 852
831 max_pages = req->r_num_pages;
832
833 alloc_page_vec(fsc, req);
834 req->r_callback = writepages_finish; 853 req->r_callback = writepages_finish;
835 req->r_inode = inode; 854 req->r_inode = inode;
855
856 max_pages = calc_pages_for(0, (u64)len);
857 size = max_pages * sizeof (*pages);
858 pages = kmalloc(size, GFP_NOFS);
859 if (!pages) {
860 pool = fsc->wb_pagevec_pool;
861 pages = mempool_alloc(pool, GFP_NOFS);
862 BUG_ON(!pages);
863 }
836 } 864 }
837 865
838 /* note position of first page in pvec */ 866 /* note position of first page in pvec */
@@ -850,7 +878,7 @@ get_more_pages:
850 } 878 }
851 879
852 set_page_writeback(page); 880 set_page_writeback(page);
853 req->r_pages[locked_pages] = page; 881 pages[locked_pages] = page;
854 locked_pages++; 882 locked_pages++;
855 next = page->index + 1; 883 next = page->index + 1;
856 } 884 }
@@ -879,18 +907,27 @@ get_more_pages:
879 pvec.nr -= i-first; 907 pvec.nr -= i-first;
880 } 908 }
881 909
882 /* submit the write */ 910 /* Format the osd request message and submit the write */
883 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; 911
884 len = min((snap_size ? snap_size : i_size_read(inode)) - offset, 912 offset = page_offset(pages[0]);
913 len = min(snap_size - offset,
885 (u64)locked_pages << PAGE_CACHE_SHIFT); 914 (u64)locked_pages << PAGE_CACHE_SHIFT);
886 dout("writepages got %d pages at %llu~%llu\n", 915 dout("writepages got %d pages at %llu~%llu\n",
887 locked_pages, offset, len); 916 locked_pages, offset, len);
888 917
889 /* revise final length, page count */ 918 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
890 req->r_num_pages = locked_pages; 919 !!pool, false);
891 req->r_request_ops[0].extent.length = cpu_to_le64(len); 920
892 req->r_request_ops[0].payload_len = cpu_to_le32(len); 921 pages = NULL; /* request message now owns the pages array */
893 req->r_request->hdr.data_len = cpu_to_le32(len); 922 pool = NULL;
923
924 /* Update the write op length in case we changed it */
925
926 osd_req_op_extent_update(req, 0, len);
927
928 vino = ceph_vino(inode);
929 ceph_osdc_build_request(req, offset, snapc, vino.snap,
930 &inode->i_mtime);
894 931
895 rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); 932 rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
896 BUG_ON(rc); 933 BUG_ON(rc);
@@ -1067,51 +1104,23 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1067 struct page **pagep, void **fsdata) 1104 struct page **pagep, void **fsdata)
1068{ 1105{
1069 struct inode *inode = file_inode(file); 1106 struct inode *inode = file_inode(file);
1070 struct ceph_inode_info *ci = ceph_inode(inode);
1071 struct ceph_file_info *fi = file->private_data;
1072 struct page *page; 1107 struct page *page;
1073 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1108 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1074 int r, want, got = 0; 1109 int r;
1075
1076 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1077 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
1078 else
1079 want = CEPH_CAP_FILE_BUFFER;
1080
1081 dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
1082 inode, ceph_vinop(inode), pos, len, inode->i_size);
1083 r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
1084 if (r < 0)
1085 return r;
1086 dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n",
1087 inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
1088 if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
1089 ceph_put_cap_refs(ci, got);
1090 return -EAGAIN;
1091 }
1092 1110
1093 do { 1111 do {
1094 /* get a page */ 1112 /* get a page */
1095 page = grab_cache_page_write_begin(mapping, index, 0); 1113 page = grab_cache_page_write_begin(mapping, index, 0);
1096 if (!page) { 1114 if (!page)
1097 r = -ENOMEM; 1115 return -ENOMEM;
1098 break; 1116 *pagep = page;
1099 }
1100 1117
1101 dout("write_begin file %p inode %p page %p %d~%d\n", file, 1118 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1102 inode, page, (int)pos, (int)len); 1119 inode, page, (int)pos, (int)len);
1103 1120
1104 r = ceph_update_writeable_page(file, pos, len, page); 1121 r = ceph_update_writeable_page(file, pos, len, page);
1105 if (r)
1106 page_cache_release(page);
1107 } while (r == -EAGAIN); 1122 } while (r == -EAGAIN);
1108 1123
1109 if (r) {
1110 ceph_put_cap_refs(ci, got);
1111 } else {
1112 *pagep = page;
1113 *(int *)fsdata = got;
1114 }
1115 return r; 1124 return r;
1116} 1125}
1117 1126
@@ -1125,12 +1134,10 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1125 struct page *page, void *fsdata) 1134 struct page *page, void *fsdata)
1126{ 1135{
1127 struct inode *inode = file_inode(file); 1136 struct inode *inode = file_inode(file);
1128 struct ceph_inode_info *ci = ceph_inode(inode);
1129 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1137 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1130 struct ceph_mds_client *mdsc = fsc->mdsc; 1138 struct ceph_mds_client *mdsc = fsc->mdsc;
1131 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1139 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1132 int check_cap = 0; 1140 int check_cap = 0;
1133 int got = (unsigned long)fsdata;
1134 1141
1135 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, 1142 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1136 inode, page, (int)pos, (int)copied, (int)len); 1143 inode, page, (int)pos, (int)copied, (int)len);
@@ -1153,19 +1160,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1153 up_read(&mdsc->snap_rwsem); 1160 up_read(&mdsc->snap_rwsem);
1154 page_cache_release(page); 1161 page_cache_release(page);
1155 1162
1156 if (copied > 0) {
1157 int dirty;
1158 spin_lock(&ci->i_ceph_lock);
1159 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
1160 spin_unlock(&ci->i_ceph_lock);
1161 if (dirty)
1162 __mark_inode_dirty(inode, dirty);
1163 }
1164
1165 dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n",
1166 inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
1167 ceph_put_cap_refs(ci, got);
1168
1169 if (check_cap) 1163 if (check_cap)
1170 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); 1164 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1171 1165
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 78e2f575247d..da0f9b8a3bcb 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -490,15 +490,17 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
490 ci->i_rdcache_gen++; 490 ci->i_rdcache_gen++;
491 491
492 /* 492 /*
493 * if we are newly issued FILE_SHARED, clear D_COMPLETE; we 493 * if we are newly issued FILE_SHARED, mark dir not complete; we
494 * don't know what happened to this directory while we didn't 494 * don't know what happened to this directory while we didn't
495 * have the cap. 495 * have the cap.
496 */ 496 */
497 if ((issued & CEPH_CAP_FILE_SHARED) && 497 if ((issued & CEPH_CAP_FILE_SHARED) &&
498 (had & CEPH_CAP_FILE_SHARED) == 0) { 498 (had & CEPH_CAP_FILE_SHARED) == 0) {
499 ci->i_shared_gen++; 499 ci->i_shared_gen++;
500 if (S_ISDIR(ci->vfs_inode.i_mode)) 500 if (S_ISDIR(ci->vfs_inode.i_mode)) {
501 ceph_dir_clear_complete(&ci->vfs_inode); 501 dout(" marking %p NOT complete\n", &ci->vfs_inode);
502 __ceph_dir_clear_complete(ci);
503 }
502 } 504 }
503} 505}
504 506
@@ -553,6 +555,7 @@ retry:
553 cap->implemented = 0; 555 cap->implemented = 0;
554 cap->mds = mds; 556 cap->mds = mds;
555 cap->mds_wanted = 0; 557 cap->mds_wanted = 0;
558 cap->mseq = 0;
556 559
557 cap->ci = ci; 560 cap->ci = ci;
558 __insert_cap_node(ci, cap); 561 __insert_cap_node(ci, cap);
@@ -628,7 +631,10 @@ retry:
628 cap->cap_id = cap_id; 631 cap->cap_id = cap_id;
629 cap->issued = issued; 632 cap->issued = issued;
630 cap->implemented |= issued; 633 cap->implemented |= issued;
631 cap->mds_wanted |= wanted; 634 if (mseq > cap->mseq)
635 cap->mds_wanted = wanted;
636 else
637 cap->mds_wanted |= wanted;
632 cap->seq = seq; 638 cap->seq = seq;
633 cap->issue_seq = seq; 639 cap->issue_seq = seq;
634 cap->mseq = mseq; 640 cap->mseq = mseq;
@@ -997,9 +1003,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
997 return 0; 1003 return 0;
998} 1004}
999 1005
1000static void __queue_cap_release(struct ceph_mds_session *session, 1006void __queue_cap_release(struct ceph_mds_session *session,
1001 u64 ino, u64 cap_id, u32 migrate_seq, 1007 u64 ino, u64 cap_id, u32 migrate_seq,
1002 u32 issue_seq) 1008 u32 issue_seq)
1003{ 1009{
1004 struct ceph_msg *msg; 1010 struct ceph_msg *msg;
1005 struct ceph_mds_cap_release *head; 1011 struct ceph_mds_cap_release *head;
@@ -2046,6 +2052,13 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2046 goto out; 2052 goto out;
2047 } 2053 }
2048 2054
2055 /* finish pending truncate */
2056 while (ci->i_truncate_pending) {
2057 spin_unlock(&ci->i_ceph_lock);
2058 __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR));
2059 spin_lock(&ci->i_ceph_lock);
2060 }
2061
2049 if (need & CEPH_CAP_FILE_WR) { 2062 if (need & CEPH_CAP_FILE_WR) {
2050 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { 2063 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
2051 dout("get_cap_refs %p endoff %llu > maxsize %llu\n", 2064 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
@@ -2067,12 +2080,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2067 } 2080 }
2068 have = __ceph_caps_issued(ci, &implemented); 2081 have = __ceph_caps_issued(ci, &implemented);
2069 2082
2070 /*
2071 * disallow writes while a truncate is pending
2072 */
2073 if (ci->i_truncate_pending)
2074 have &= ~CEPH_CAP_FILE_WR;
2075
2076 if ((have & need) == need) { 2083 if ((have & need) == need) {
2077 /* 2084 /*
2078 * Look at (implemented & ~have & not) so that we keep waiting 2085 * Look at (implemented & ~have & not) so that we keep waiting
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 6d797f46d772..f02d82b7933e 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -107,7 +107,7 @@ static unsigned fpos_off(loff_t p)
107 * falling back to a "normal" sync readdir if any dentries in the dir 107 * falling back to a "normal" sync readdir if any dentries in the dir
108 * are dropped. 108 * are dropped.
109 * 109 *
110 * D_COMPLETE tells indicates we have all dentries in the dir. It is 110 * Complete dir indicates that we have all dentries in the dir. It is
111 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 111 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
112 * the MDS if/when the directory is modified). 112 * the MDS if/when the directory is modified).
113 */ 113 */
@@ -198,8 +198,8 @@ more:
198 filp->f_pos++; 198 filp->f_pos++;
199 199
200 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 200 /* make sure a dentry wasn't dropped while we didn't have parent lock */
201 if (!ceph_dir_test_complete(dir)) { 201 if (!ceph_dir_is_complete(dir)) {
202 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); 202 dout(" lost dir complete on %p; falling back to mds\n", dir);
203 err = -EAGAIN; 203 err = -EAGAIN;
204 goto out; 204 goto out;
205 } 205 }
@@ -258,7 +258,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
258 if (filp->f_pos == 0) { 258 if (filp->f_pos == 0) {
259 /* note dir version at start of readdir so we can tell 259 /* note dir version at start of readdir so we can tell
260 * if any dentries get dropped */ 260 * if any dentries get dropped */
261 fi->dir_release_count = ci->i_release_count; 261 fi->dir_release_count = atomic_read(&ci->i_release_count);
262 262
263 dout("readdir off 0 -> '.'\n"); 263 dout("readdir off 0 -> '.'\n");
264 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 264 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
@@ -284,7 +284,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
284 if ((filp->f_pos == 2 || fi->dentry) && 284 if ((filp->f_pos == 2 || fi->dentry) &&
285 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 285 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
286 ceph_snap(inode) != CEPH_SNAPDIR && 286 ceph_snap(inode) != CEPH_SNAPDIR &&
287 ceph_dir_test_complete(inode) && 287 __ceph_dir_is_complete(ci) &&
288 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 288 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
289 spin_unlock(&ci->i_ceph_lock); 289 spin_unlock(&ci->i_ceph_lock);
290 err = __dcache_readdir(filp, dirent, filldir); 290 err = __dcache_readdir(filp, dirent, filldir);
@@ -350,7 +350,8 @@ more:
350 350
351 if (!req->r_did_prepopulate) { 351 if (!req->r_did_prepopulate) {
352 dout("readdir !did_prepopulate"); 352 dout("readdir !did_prepopulate");
353 fi->dir_release_count--; /* preclude D_COMPLETE */ 353 /* preclude from marking dir complete */
354 fi->dir_release_count--;
354 } 355 }
355 356
356 /* note next offset and last dentry name */ 357 /* note next offset and last dentry name */
@@ -428,8 +429,9 @@ more:
428 * the complete dir contents in our cache. 429 * the complete dir contents in our cache.
429 */ 430 */
430 spin_lock(&ci->i_ceph_lock); 431 spin_lock(&ci->i_ceph_lock);
431 if (ci->i_release_count == fi->dir_release_count) { 432 if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
432 ceph_dir_set_complete(inode); 433 dout(" marking %p complete\n", inode);
434 __ceph_dir_set_complete(ci, fi->dir_release_count);
433 ci->i_max_offset = filp->f_pos; 435 ci->i_max_offset = filp->f_pos;
434 } 436 }
435 spin_unlock(&ci->i_ceph_lock); 437 spin_unlock(&ci->i_ceph_lock);
@@ -604,7 +606,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
604 fsc->mount_options->snapdir_name, 606 fsc->mount_options->snapdir_name,
605 dentry->d_name.len) && 607 dentry->d_name.len) &&
606 !is_root_ceph_dentry(dir, dentry) && 608 !is_root_ceph_dentry(dir, dentry) &&
607 ceph_dir_test_complete(dir) && 609 __ceph_dir_is_complete(ci) &&
608 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 610 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
609 spin_unlock(&ci->i_ceph_lock); 611 spin_unlock(&ci->i_ceph_lock);
610 dout(" dir %p complete, -ENOENT\n", dir); 612 dout(" dir %p complete, -ENOENT\n", dir);
@@ -1065,44 +1067,6 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1065} 1067}
1066 1068
1067/* 1069/*
1068 * Set/clear/test dir complete flag on the dir's dentry.
1069 */
1070void ceph_dir_set_complete(struct inode *inode)
1071{
1072 struct dentry *dentry = d_find_any_alias(inode);
1073
1074 if (dentry && ceph_dentry(dentry) &&
1075 ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
1076 dout(" marking %p (%p) complete\n", inode, dentry);
1077 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1078 }
1079 dput(dentry);
1080}
1081
1082void ceph_dir_clear_complete(struct inode *inode)
1083{
1084 struct dentry *dentry = d_find_any_alias(inode);
1085
1086 if (dentry && ceph_dentry(dentry)) {
1087 dout(" marking %p (%p) complete\n", inode, dentry);
1088 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1089 }
1090 dput(dentry);
1091}
1092
1093bool ceph_dir_test_complete(struct inode *inode)
1094{
1095 struct dentry *dentry = d_find_any_alias(inode);
1096
1097 if (dentry && ceph_dentry(dentry)) {
1098 dout(" marking %p (%p) NOT complete\n", inode, dentry);
1099 clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1100 }
1101 dput(dentry);
1102 return false;
1103}
1104
1105/*
1106 * When the VFS prunes a dentry from the cache, we need to clear the 1070 * When the VFS prunes a dentry from the cache, we need to clear the
1107 * complete flag on the parent directory. 1071 * complete flag on the parent directory.
1108 * 1072 *
@@ -1110,15 +1074,13 @@ bool ceph_dir_test_complete(struct inode *inode)
1110 */ 1074 */
1111static void ceph_d_prune(struct dentry *dentry) 1075static void ceph_d_prune(struct dentry *dentry)
1112{ 1076{
1113 struct ceph_dentry_info *di;
1114
1115 dout("ceph_d_prune %p\n", dentry); 1077 dout("ceph_d_prune %p\n", dentry);
1116 1078
1117 /* do we have a valid parent? */ 1079 /* do we have a valid parent? */
1118 if (IS_ROOT(dentry)) 1080 if (IS_ROOT(dentry))
1119 return; 1081 return;
1120 1082
1121 /* if we are not hashed, we don't affect D_COMPLETE */ 1083 /* if we are not hashed, we don't affect dir's completeness */
1122 if (d_unhashed(dentry)) 1084 if (d_unhashed(dentry))
1123 return; 1085 return;
1124 1086
@@ -1126,8 +1088,7 @@ static void ceph_d_prune(struct dentry *dentry)
1126 * we hold d_lock, so d_parent is stable, and d_fsdata is never 1088 * we hold d_lock, so d_parent is stable, and d_fsdata is never
1127 * cleared until d_release 1089 * cleared until d_release
1128 */ 1090 */
1129 di = ceph_dentry(dentry->d_parent); 1091 ceph_dir_clear_complete(dentry->d_parent->d_inode);
1130 clear_bit(CEPH_D_COMPLETE, &di->flags);
1131} 1092}
1132 1093
1133/* 1094/*
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index bf338d9b67e3..d70830c66833 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -446,19 +446,35 @@ done:
446} 446}
447 447
448/* 448/*
449 * Write commit callback, called if we requested both an ACK and 449 * Write commit request unsafe callback, called to tell us when a
450 * ONDISK commit reply from the OSD. 450 * request is unsafe (that is, in flight--has been handed to the
451 * messenger to send to its target osd). It is called again when
452 * we've received a response message indicating the request is
453 * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
454 * is completed early (and unsuccessfully) due to a timeout or
455 * interrupt.
456 *
457 * This is used if we requested both an ACK and ONDISK commit reply
458 * from the OSD.
451 */ 459 */
452static void sync_write_commit(struct ceph_osd_request *req, 460static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
453 struct ceph_msg *msg)
454{ 461{
455 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 462 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
456 463
457 dout("sync_write_commit %p tid %llu\n", req, req->r_tid); 464 dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
458 spin_lock(&ci->i_unsafe_lock); 465 unsafe ? "un" : "");
459 list_del_init(&req->r_unsafe_item); 466 if (unsafe) {
460 spin_unlock(&ci->i_unsafe_lock); 467 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
461 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); 468 spin_lock(&ci->i_unsafe_lock);
469 list_add_tail(&req->r_unsafe_item,
470 &ci->i_unsafe_writes);
471 spin_unlock(&ci->i_unsafe_lock);
472 } else {
473 spin_lock(&ci->i_unsafe_lock);
474 list_del_init(&req->r_unsafe_item);
475 spin_unlock(&ci->i_unsafe_lock);
476 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
477 }
462} 478}
463 479
464/* 480/*
@@ -470,36 +486,33 @@ static void sync_write_commit(struct ceph_osd_request *req,
470 * objects, rollback on failure, etc.) 486 * objects, rollback on failure, etc.)
471 */ 487 */
472static ssize_t ceph_sync_write(struct file *file, const char __user *data, 488static ssize_t ceph_sync_write(struct file *file, const char __user *data,
473 size_t left, loff_t *offset) 489 size_t left, loff_t pos, loff_t *ppos)
474{ 490{
475 struct inode *inode = file_inode(file); 491 struct inode *inode = file_inode(file);
476 struct ceph_inode_info *ci = ceph_inode(inode); 492 struct ceph_inode_info *ci = ceph_inode(inode);
477 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 493 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
494 struct ceph_snap_context *snapc;
495 struct ceph_vino vino;
478 struct ceph_osd_request *req; 496 struct ceph_osd_request *req;
497 int num_ops = 1;
479 struct page **pages; 498 struct page **pages;
480 int num_pages; 499 int num_pages;
481 long long unsigned pos;
482 u64 len; 500 u64 len;
483 int written = 0; 501 int written = 0;
484 int flags; 502 int flags;
485 int do_sync = 0;
486 int check_caps = 0; 503 int check_caps = 0;
487 int page_align, io_align; 504 int page_align, io_align;
488 unsigned long buf_align; 505 unsigned long buf_align;
489 int ret; 506 int ret;
490 struct timespec mtime = CURRENT_TIME; 507 struct timespec mtime = CURRENT_TIME;
508 bool own_pages = false;
491 509
492 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 510 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
493 return -EROFS; 511 return -EROFS;
494 512
495 dout("sync_write on file %p %lld~%u %s\n", file, *offset, 513 dout("sync_write on file %p %lld~%u %s\n", file, pos,
496 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 514 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
497 515
498 if (file->f_flags & O_APPEND)
499 pos = i_size_read(inode);
500 else
501 pos = *offset;
502
503 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 516 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
504 if (ret < 0) 517 if (ret < 0)
505 return ret; 518 return ret;
@@ -516,7 +529,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
516 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) 529 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
517 flags |= CEPH_OSD_FLAG_ACK; 530 flags |= CEPH_OSD_FLAG_ACK;
518 else 531 else
519 do_sync = 1; 532 num_ops++; /* Also include a 'startsync' command. */
520 533
521 /* 534 /*
522 * we may need to do multiple writes here if we span an object 535 * we may need to do multiple writes here if we span an object
@@ -526,25 +539,20 @@ more:
526 io_align = pos & ~PAGE_MASK; 539 io_align = pos & ~PAGE_MASK;
527 buf_align = (unsigned long)data & ~PAGE_MASK; 540 buf_align = (unsigned long)data & ~PAGE_MASK;
528 len = left; 541 len = left;
529 if (file->f_flags & O_DIRECT) { 542
530 /* write from beginning of first page, regardless of 543 snapc = ci->i_snap_realm->cached_context;
531 io alignment */ 544 vino = ceph_vino(inode);
532 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
533 num_pages = calc_pages_for((unsigned long)data, len);
534 } else {
535 page_align = pos & ~PAGE_MASK;
536 num_pages = calc_pages_for(pos, len);
537 }
538 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 545 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
539 ceph_vino(inode), pos, &len, 546 vino, pos, &len, num_ops,
540 CEPH_OSD_OP_WRITE, flags, 547 CEPH_OSD_OP_WRITE, flags, snapc,
541 ci->i_snap_realm->cached_context,
542 do_sync,
543 ci->i_truncate_seq, ci->i_truncate_size, 548 ci->i_truncate_seq, ci->i_truncate_size,
544 &mtime, false, page_align); 549 false);
545 if (IS_ERR(req)) 550 if (IS_ERR(req))
546 return PTR_ERR(req); 551 return PTR_ERR(req);
547 552
553 /* write from beginning of first page, regardless of io alignment */
554 page_align = file->f_flags & O_DIRECT ? buf_align : io_align;
555 num_pages = calc_pages_for(page_align, len);
548 if (file->f_flags & O_DIRECT) { 556 if (file->f_flags & O_DIRECT) {
549 pages = ceph_get_direct_page_vector(data, num_pages, false); 557 pages = ceph_get_direct_page_vector(data, num_pages, false);
550 if (IS_ERR(pages)) { 558 if (IS_ERR(pages)) {
@@ -572,36 +580,20 @@ more:
572 580
573 if ((file->f_flags & O_SYNC) == 0) { 581 if ((file->f_flags & O_SYNC) == 0) {
574 /* get a second commit callback */ 582 /* get a second commit callback */
575 req->r_safe_callback = sync_write_commit; 583 req->r_unsafe_callback = ceph_sync_write_unsafe;
576 req->r_own_pages = 1; 584 req->r_inode = inode;
585 own_pages = true;
577 } 586 }
578 } 587 }
579 req->r_pages = pages; 588 osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
580 req->r_num_pages = num_pages; 589 false, own_pages);
581 req->r_inode = inode; 590
591 /* BUG_ON(vino.snap != CEPH_NOSNAP); */
592 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
582 593
583 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 594 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
584 if (!ret) { 595 if (!ret)
585 if (req->r_safe_callback) {
586 /*
587 * Add to inode unsafe list only after we
588 * start_request so that a tid has been assigned.
589 */
590 spin_lock(&ci->i_unsafe_lock);
591 list_add_tail(&req->r_unsafe_item,
592 &ci->i_unsafe_writes);
593 spin_unlock(&ci->i_unsafe_lock);
594 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
595 }
596
597 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 596 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
598 if (ret < 0 && req->r_safe_callback) {
599 spin_lock(&ci->i_unsafe_lock);
600 list_del_init(&req->r_unsafe_item);
601 spin_unlock(&ci->i_unsafe_lock);
602 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
603 }
604 }
605 597
606 if (file->f_flags & O_DIRECT) 598 if (file->f_flags & O_DIRECT)
607 ceph_put_page_vector(pages, num_pages, false); 599 ceph_put_page_vector(pages, num_pages, false);
@@ -614,12 +606,12 @@ out:
614 pos += len; 606 pos += len;
615 written += len; 607 written += len;
616 left -= len; 608 left -= len;
617 data += written; 609 data += len;
618 if (left) 610 if (left)
619 goto more; 611 goto more;
620 612
621 ret = written; 613 ret = written;
622 *offset = pos; 614 *ppos = pos;
623 if (pos > i_size_read(inode)) 615 if (pos > i_size_read(inode))
624 check_caps = ceph_inode_set_size(inode, pos); 616 check_caps = ceph_inode_set_size(inode, pos);
625 if (check_caps) 617 if (check_caps)
@@ -653,7 +645,6 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
653 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 645 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
654 inode, ceph_vinop(inode), pos, (unsigned)len, inode); 646 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
655again: 647again:
656 __ceph_do_pending_vmtruncate(inode);
657 if (fi->fmode & CEPH_FILE_MODE_LAZY) 648 if (fi->fmode & CEPH_FILE_MODE_LAZY)
658 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 649 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
659 else 650 else
@@ -717,55 +708,75 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
717 struct ceph_inode_info *ci = ceph_inode(inode); 708 struct ceph_inode_info *ci = ceph_inode(inode);
718 struct ceph_osd_client *osdc = 709 struct ceph_osd_client *osdc =
719 &ceph_sb_to_client(inode->i_sb)->client->osdc; 710 &ceph_sb_to_client(inode->i_sb)->client->osdc;
720 loff_t endoff = pos + iov->iov_len; 711 ssize_t count, written = 0;
721 int got = 0; 712 int err, want, got;
722 int ret, err, written; 713 bool hold_mutex;
723 714
724 if (ceph_snap(inode) != CEPH_NOSNAP) 715 if (ceph_snap(inode) != CEPH_NOSNAP)
725 return -EROFS; 716 return -EROFS;
726 717
727retry_snap: 718 sb_start_write(inode->i_sb);
728 written = 0; 719 mutex_lock(&inode->i_mutex);
729 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) 720 hold_mutex = true;
730 return -ENOSPC;
731 __ceph_do_pending_vmtruncate(inode);
732 721
733 /* 722 err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
734 * try to do a buffered write. if we don't have sufficient 723 if (err)
735 * caps, we'll get -EAGAIN from generic_file_aio_write, or a 724 goto out;
736 * short write if we only get caps for some pages. 725
737 */ 726 /* We can write back this queue in page reclaim */
738 if (!(iocb->ki_filp->f_flags & O_DIRECT) && 727 current->backing_dev_info = file->f_mapping->backing_dev_info;
739 !(inode->i_sb->s_flags & MS_SYNCHRONOUS) && 728
740 !(fi->flags & CEPH_F_SYNC)) { 729 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
741 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 730 if (err)
742 if (ret >= 0) 731 goto out;
743 written = ret; 732
744 733 if (count == 0)
745 if ((ret >= 0 || ret == -EIOCBQUEUED) && 734 goto out;
746 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 735
747 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 736 err = file_remove_suid(file);
748 err = vfs_fsync_range(file, pos, pos + written - 1, 1); 737 if (err)
749 if (err < 0) 738 goto out;
750 ret = err; 739
751 } 740 err = file_update_time(file);
752 if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff) 741 if (err)
753 goto out; 742 goto out;
743
744retry_snap:
745 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) {
746 err = -ENOSPC;
747 goto out;
754 } 748 }
755 749
756 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", 750 dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
757 inode, ceph_vinop(inode), pos + written, 751 inode, ceph_vinop(inode), pos, count, inode->i_size);
758 (unsigned)iov->iov_len - written, inode->i_size); 752 if (fi->fmode & CEPH_FILE_MODE_LAZY)
759 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff); 753 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
760 if (ret < 0) 754 else
755 want = CEPH_CAP_FILE_BUFFER;
756 got = 0;
757 err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count);
758 if (err < 0)
761 goto out; 759 goto out;
762 760
763 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", 761 dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
764 inode, ceph_vinop(inode), pos + written, 762 inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
765 (unsigned)iov->iov_len - written, ceph_cap_string(got)); 763
766 ret = ceph_sync_write(file, iov->iov_base + written, 764 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
767 iov->iov_len - written, &iocb->ki_pos); 765 (iocb->ki_filp->f_flags & O_DIRECT) ||
768 if (ret >= 0) { 766 (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
767 (fi->flags & CEPH_F_SYNC)) {
768 mutex_unlock(&inode->i_mutex);
769 written = ceph_sync_write(file, iov->iov_base, count,
770 pos, &iocb->ki_pos);
771 } else {
772 written = generic_file_buffered_write(iocb, iov, nr_segs,
773 pos, &iocb->ki_pos,
774 count, 0);
775 mutex_unlock(&inode->i_mutex);
776 }
777 hold_mutex = false;
778
779 if (written >= 0) {
769 int dirty; 780 int dirty;
770 spin_lock(&ci->i_ceph_lock); 781 spin_lock(&ci->i_ceph_lock);
771 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 782 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
@@ -773,18 +784,34 @@ retry_snap:
773 if (dirty) 784 if (dirty)
774 __mark_inode_dirty(inode, dirty); 785 __mark_inode_dirty(inode, dirty);
775 } 786 }
787
776 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", 788 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
777 inode, ceph_vinop(inode), pos + written, 789 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
778 (unsigned)iov->iov_len - written, ceph_cap_string(got)); 790 ceph_cap_string(got));
779 ceph_put_cap_refs(ci, got); 791 ceph_put_cap_refs(ci, got);
780out: 792
781 if (ret == -EOLDSNAPC) { 793 if (written >= 0 &&
794 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) ||
795 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
796 err = vfs_fsync_range(file, pos, pos + written - 1, 1);
797 if (err < 0)
798 written = err;
799 }
800
801 if (written == -EOLDSNAPC) {
782 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", 802 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
783 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); 803 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
804 mutex_lock(&inode->i_mutex);
805 hold_mutex = true;
784 goto retry_snap; 806 goto retry_snap;
785 } 807 }
808out:
809 if (hold_mutex)
810 mutex_unlock(&inode->i_mutex);
811 sb_end_write(inode->i_sb);
812 current->backing_dev_info = NULL;
786 813
787 return ret; 814 return written ? written : err;
788} 815}
789 816
790/* 817/*
@@ -796,7 +823,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
796 int ret; 823 int ret;
797 824
798 mutex_lock(&inode->i_mutex); 825 mutex_lock(&inode->i_mutex);
799 __ceph_do_pending_vmtruncate(inode); 826 __ceph_do_pending_vmtruncate(inode, false);
800 827
801 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { 828 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
802 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 829 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 851814d951cd..be0f7e20d62e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -302,7 +302,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
302 ci->i_version = 0; 302 ci->i_version = 0;
303 ci->i_time_warp_seq = 0; 303 ci->i_time_warp_seq = 0;
304 ci->i_ceph_flags = 0; 304 ci->i_ceph_flags = 0;
305 ci->i_release_count = 0; 305 atomic_set(&ci->i_release_count, 1);
306 atomic_set(&ci->i_complete_count, 0);
306 ci->i_symlink = NULL; 307 ci->i_symlink = NULL;
307 308
308 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); 309 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
@@ -561,7 +562,6 @@ static int fill_inode(struct inode *inode,
561 struct ceph_inode_info *ci = ceph_inode(inode); 562 struct ceph_inode_info *ci = ceph_inode(inode);
562 int i; 563 int i;
563 int issued = 0, implemented; 564 int issued = 0, implemented;
564 int updating_inode = 0;
565 struct timespec mtime, atime, ctime; 565 struct timespec mtime, atime, ctime;
566 u32 nsplits; 566 u32 nsplits;
567 struct ceph_buffer *xattr_blob = NULL; 567 struct ceph_buffer *xattr_blob = NULL;
@@ -601,7 +601,6 @@ static int fill_inode(struct inode *inode,
601 (ci->i_version & ~1) >= le64_to_cpu(info->version)) 601 (ci->i_version & ~1) >= le64_to_cpu(info->version))
602 goto no_change; 602 goto no_change;
603 603
604 updating_inode = 1;
605 issued = __ceph_caps_issued(ci, &implemented); 604 issued = __ceph_caps_issued(ci, &implemented);
606 issued |= implemented | __ceph_caps_dirty(ci); 605 issued |= implemented | __ceph_caps_dirty(ci);
607 606
@@ -717,6 +716,17 @@ static int fill_inode(struct inode *inode,
717 ceph_vinop(inode), inode->i_mode); 716 ceph_vinop(inode), inode->i_mode);
718 } 717 }
719 718
719 /* set dir completion flag? */
720 if (S_ISDIR(inode->i_mode) &&
721 ci->i_files == 0 && ci->i_subdirs == 0 &&
722 ceph_snap(inode) == CEPH_NOSNAP &&
723 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
724 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
725 !__ceph_dir_is_complete(ci)) {
726 dout(" marking %p complete (empty)\n", inode);
727 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
728 ci->i_max_offset = 2;
729 }
720no_change: 730no_change:
721 spin_unlock(&ci->i_ceph_lock); 731 spin_unlock(&ci->i_ceph_lock);
722 732
@@ -767,19 +777,6 @@ no_change:
767 __ceph_get_fmode(ci, cap_fmode); 777 __ceph_get_fmode(ci, cap_fmode);
768 } 778 }
769 779
770 /* set dir completion flag? */
771 if (S_ISDIR(inode->i_mode) &&
772 updating_inode && /* didn't jump to no_change */
773 ci->i_files == 0 && ci->i_subdirs == 0 &&
774 ceph_snap(inode) == CEPH_NOSNAP &&
775 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
776 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
777 !ceph_dir_test_complete(inode)) {
778 dout(" marking %p complete (empty)\n", inode);
779 ceph_dir_set_complete(inode);
780 ci->i_max_offset = 2;
781 }
782
783 /* update delegation info? */ 780 /* update delegation info? */
784 if (dirinfo) 781 if (dirinfo)
785 ceph_fill_dirfrag(inode, dirinfo); 782 ceph_fill_dirfrag(inode, dirinfo);
@@ -861,7 +858,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
861 di = ceph_dentry(dn); 858 di = ceph_dentry(dn);
862 859
863 spin_lock(&ci->i_ceph_lock); 860 spin_lock(&ci->i_ceph_lock);
864 if (!ceph_dir_test_complete(inode)) { 861 if (!__ceph_dir_is_complete(ci)) {
865 spin_unlock(&ci->i_ceph_lock); 862 spin_unlock(&ci->i_ceph_lock);
866 return; 863 return;
867 } 864 }
@@ -1065,8 +1062,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1065 /* 1062 /*
1066 * d_move() puts the renamed dentry at the end of 1063 * d_move() puts the renamed dentry at the end of
1067 * d_subdirs. We need to assign it an appropriate 1064 * d_subdirs. We need to assign it an appropriate
1068 * directory offset so we can behave when holding 1065 * directory offset so we can behave when dir is
1069 * D_COMPLETE. 1066 * complete.
1070 */ 1067 */
1071 ceph_set_dentry_offset(req->r_old_dentry); 1068 ceph_set_dentry_offset(req->r_old_dentry);
1072 dout("dn %p gets new offset %lld\n", req->r_old_dentry, 1069 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
@@ -1457,7 +1454,7 @@ out:
1457 1454
1458 1455
1459/* 1456/*
1460 * called by trunc_wq; take i_mutex ourselves 1457 * called by trunc_wq;
1461 * 1458 *
1462 * We also truncate in a separate thread as well. 1459 * We also truncate in a separate thread as well.
1463 */ 1460 */
@@ -1468,9 +1465,7 @@ static void ceph_vmtruncate_work(struct work_struct *work)
1468 struct inode *inode = &ci->vfs_inode; 1465 struct inode *inode = &ci->vfs_inode;
1469 1466
1470 dout("vmtruncate_work %p\n", inode); 1467 dout("vmtruncate_work %p\n", inode);
1471 mutex_lock(&inode->i_mutex); 1468 __ceph_do_pending_vmtruncate(inode, true);
1472 __ceph_do_pending_vmtruncate(inode);
1473 mutex_unlock(&inode->i_mutex);
1474 iput(inode); 1469 iput(inode);
1475} 1470}
1476 1471
@@ -1494,12 +1489,10 @@ void ceph_queue_vmtruncate(struct inode *inode)
1494} 1489}
1495 1490
1496/* 1491/*
1497 * called with i_mutex held.
1498 *
1499 * Make sure any pending truncation is applied before doing anything 1492 * Make sure any pending truncation is applied before doing anything
1500 * that may depend on it. 1493 * that may depend on it.
1501 */ 1494 */
1502void __ceph_do_pending_vmtruncate(struct inode *inode) 1495void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock)
1503{ 1496{
1504 struct ceph_inode_info *ci = ceph_inode(inode); 1497 struct ceph_inode_info *ci = ceph_inode(inode);
1505 u64 to; 1498 u64 to;
@@ -1532,7 +1525,11 @@ retry:
1532 ci->i_truncate_pending, to); 1525 ci->i_truncate_pending, to);
1533 spin_unlock(&ci->i_ceph_lock); 1526 spin_unlock(&ci->i_ceph_lock);
1534 1527
1528 if (needlock)
1529 mutex_lock(&inode->i_mutex);
1535 truncate_inode_pages(inode->i_mapping, to); 1530 truncate_inode_pages(inode->i_mapping, to);
1531 if (needlock)
1532 mutex_unlock(&inode->i_mutex);
1536 1533
1537 spin_lock(&ci->i_ceph_lock); 1534 spin_lock(&ci->i_ceph_lock);
1538 if (to == ci->i_truncate_size) { 1535 if (to == ci->i_truncate_size) {
@@ -1563,6 +1560,12 @@ static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1563static const struct inode_operations ceph_symlink_iops = { 1560static const struct inode_operations ceph_symlink_iops = {
1564 .readlink = generic_readlink, 1561 .readlink = generic_readlink,
1565 .follow_link = ceph_sym_follow_link, 1562 .follow_link = ceph_sym_follow_link,
1563 .setattr = ceph_setattr,
1564 .getattr = ceph_getattr,
1565 .setxattr = ceph_setxattr,
1566 .getxattr = ceph_getxattr,
1567 .listxattr = ceph_listxattr,
1568 .removexattr = ceph_removexattr,
1566}; 1569};
1567 1570
1568/* 1571/*
@@ -1585,7 +1588,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1585 if (ceph_snap(inode) != CEPH_NOSNAP) 1588 if (ceph_snap(inode) != CEPH_NOSNAP)
1586 return -EROFS; 1589 return -EROFS;
1587 1590
1588 __ceph_do_pending_vmtruncate(inode); 1591 __ceph_do_pending_vmtruncate(inode, false);
1589 1592
1590 err = inode_change_ok(inode, attr); 1593 err = inode_change_ok(inode, attr);
1591 if (err != 0) 1594 if (err != 0)
@@ -1767,7 +1770,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1767 ceph_cap_string(dirtied), mask); 1770 ceph_cap_string(dirtied), mask);
1768 1771
1769 ceph_mdsc_put_request(req); 1772 ceph_mdsc_put_request(req);
1770 __ceph_do_pending_vmtruncate(inode); 1773 __ceph_do_pending_vmtruncate(inode, false);
1771 return err; 1774 return err;
1772out: 1775out:
1773 spin_unlock(&ci->i_ceph_lock); 1776 spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 4a989345b37b..e0b4ef31d3c8 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -208,8 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
208 208
209 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", 209 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
210 ceph_ino(inode), dl.object_no); 210 ceph_ino(inode), dl.object_no);
211 ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout, 211
212 osdc->osdmap); 212 ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
213 ceph_file_layout_pg_pool(ci->i_layout));
213 214
214 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); 215 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
215 if (dl.osd >= 0) { 216 if (dl.osd >= 0) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 442880d099c9..4f22671a5bd4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -265,7 +265,8 @@ static int parse_reply_info_extra(void **p, void *end,
265{ 265{
266 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 266 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
267 return parse_reply_info_filelock(p, end, info, features); 267 return parse_reply_info_filelock(p, end, info, features);
268 else if (info->head->op == CEPH_MDS_OP_READDIR) 268 else if (info->head->op == CEPH_MDS_OP_READDIR ||
269 info->head->op == CEPH_MDS_OP_LSSNAP)
269 return parse_reply_info_dir(p, end, info, features); 270 return parse_reply_info_dir(p, end, info, features);
270 else if (info->head->op == CEPH_MDS_OP_CREATE) 271 else if (info->head->op == CEPH_MDS_OP_CREATE)
271 return parse_reply_info_create(p, end, info, features); 272 return parse_reply_info_create(p, end, info, features);
@@ -364,9 +365,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
364 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); 365 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
365 if (atomic_dec_and_test(&s->s_ref)) { 366 if (atomic_dec_and_test(&s->s_ref)) {
366 if (s->s_auth.authorizer) 367 if (s->s_auth.authorizer)
367 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( 368 ceph_auth_destroy_authorizer(
368 s->s_mdsc->fsc->client->monc.auth, 369 s->s_mdsc->fsc->client->monc.auth,
369 s->s_auth.authorizer); 370 s->s_auth.authorizer);
370 kfree(s); 371 kfree(s);
371 } 372 }
372} 373}
@@ -1196,6 +1197,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1196 session->s_trim_caps--; 1197 session->s_trim_caps--;
1197 if (oissued) { 1198 if (oissued) {
1198 /* we aren't the only cap.. just remove us */ 1199 /* we aren't the only cap.. just remove us */
1200 __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
1201 cap->mseq, cap->issue_seq);
1199 __ceph_remove_cap(cap); 1202 __ceph_remove_cap(cap);
1200 } else { 1203 } else {
1201 /* try to drop referring dentries */ 1204 /* try to drop referring dentries */
@@ -1718,8 +1721,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1718 msg->front.iov_len = p - msg->front.iov_base; 1721 msg->front.iov_len = p - msg->front.iov_base;
1719 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1722 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1720 1723
1721 msg->pages = req->r_pages; 1724 if (req->r_data_len) {
1722 msg->nr_pages = req->r_num_pages; 1725 /* outbound data set only by ceph_sync_setxattr() */
1726 BUG_ON(!req->r_pages);
1727 ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0);
1728 }
1729
1723 msg->hdr.data_len = cpu_to_le32(req->r_data_len); 1730 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1724 msg->hdr.data_off = cpu_to_le16(0); 1731 msg->hdr.data_off = cpu_to_le16(0);
1725 1732
@@ -1913,6 +1920,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
1913 req = list_entry(tmp_list.next, 1920 req = list_entry(tmp_list.next,
1914 struct ceph_mds_request, r_wait); 1921 struct ceph_mds_request, r_wait);
1915 list_del_init(&req->r_wait); 1922 list_del_init(&req->r_wait);
1923 dout(" wake request %p tid %llu\n", req, req->r_tid);
1916 __do_request(mdsc, req); 1924 __do_request(mdsc, req);
1917 } 1925 }
1918} 1926}
@@ -2026,20 +2034,16 @@ out:
2026} 2034}
2027 2035
2028/* 2036/*
2029 * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS 2037 * Invalidate dir's completeness, dentry lease state on an aborted MDS
2030 * namespace request. 2038 * namespace request.
2031 */ 2039 */
2032void ceph_invalidate_dir_request(struct ceph_mds_request *req) 2040void ceph_invalidate_dir_request(struct ceph_mds_request *req)
2033{ 2041{
2034 struct inode *inode = req->r_locked_dir; 2042 struct inode *inode = req->r_locked_dir;
2035 struct ceph_inode_info *ci = ceph_inode(inode);
2036 2043
2037 dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); 2044 dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
2038 spin_lock(&ci->i_ceph_lock);
2039 ceph_dir_clear_complete(inode);
2040 ci->i_release_count++;
2041 spin_unlock(&ci->i_ceph_lock);
2042 2045
2046 ceph_dir_clear_complete(inode);
2043 if (req->r_dentry) 2047 if (req->r_dentry)
2044 ceph_invalidate_dentry_lease(req->r_dentry); 2048 ceph_invalidate_dentry_lease(req->r_dentry);
2045 if (req->r_old_dentry) 2049 if (req->r_old_dentry)
@@ -2599,11 +2603,13 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2599 goto fail; 2603 goto fail;
2600 } 2604 }
2601 2605
2602 reply->pagelist = pagelist;
2603 if (recon_state.flock) 2606 if (recon_state.flock)
2604 reply->hdr.version = cpu_to_le16(2); 2607 reply->hdr.version = cpu_to_le16(2);
2605 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2608 if (pagelist->length) {
2606 reply->nr_pages = calc_pages_for(0, pagelist->length); 2609 /* set up outbound data if we have any */
2610 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2611 ceph_msg_data_add_pagelist(reply, pagelist);
2612 }
2607 ceph_con_send(&session->s_con, reply); 2613 ceph_con_send(&session->s_con, reply);
2608 2614
2609 mutex_unlock(&session->s_mutex); 2615 mutex_unlock(&session->s_mutex);
@@ -3433,13 +3439,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
3433 struct ceph_auth_handshake *auth = &s->s_auth; 3439 struct ceph_auth_handshake *auth = &s->s_auth;
3434 3440
3435 if (force_new && auth->authorizer) { 3441 if (force_new && auth->authorizer) {
3436 if (ac->ops && ac->ops->destroy_authorizer) 3442 ceph_auth_destroy_authorizer(ac, auth->authorizer);
3437 ac->ops->destroy_authorizer(ac, auth->authorizer);
3438 auth->authorizer = NULL; 3443 auth->authorizer = NULL;
3439 } 3444 }
3440 if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { 3445 if (!auth->authorizer) {
3441 int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, 3446 int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
3442 auth); 3447 auth);
3448 if (ret)
3449 return ERR_PTR(ret);
3450 } else {
3451 int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
3452 auth);
3443 if (ret) 3453 if (ret)
3444 return ERR_PTR(ret); 3454 return ERR_PTR(ret);
3445 } 3455 }
@@ -3455,7 +3465,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
3455 struct ceph_mds_client *mdsc = s->s_mdsc; 3465 struct ceph_mds_client *mdsc = s->s_mdsc;
3456 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; 3466 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3457 3467
3458 return ac->ops->verify_authorizer_reply(ac, s->s_auth.authorizer, len); 3468 return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len);
3459} 3469}
3460 3470
3461static int invalidate_authorizer(struct ceph_connection *con) 3471static int invalidate_authorizer(struct ceph_connection *con)
@@ -3464,12 +3474,32 @@ static int invalidate_authorizer(struct ceph_connection *con)
3464 struct ceph_mds_client *mdsc = s->s_mdsc; 3474 struct ceph_mds_client *mdsc = s->s_mdsc;
3465 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; 3475 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3466 3476
3467 if (ac->ops->invalidate_authorizer) 3477 ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3468 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3469 3478
3470 return ceph_monc_validate_auth(&mdsc->fsc->client->monc); 3479 return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
3471} 3480}
3472 3481
3482static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
3483 struct ceph_msg_header *hdr, int *skip)
3484{
3485 struct ceph_msg *msg;
3486 int type = (int) le16_to_cpu(hdr->type);
3487 int front_len = (int) le32_to_cpu(hdr->front_len);
3488
3489 if (con->in_msg)
3490 return con->in_msg;
3491
3492 *skip = 0;
3493 msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
3494 if (!msg) {
3495 pr_err("unable to allocate msg type %d len %d\n",
3496 type, front_len);
3497 return NULL;
3498 }
3499
3500 return msg;
3501}
3502
3473static const struct ceph_connection_operations mds_con_ops = { 3503static const struct ceph_connection_operations mds_con_ops = {
3474 .get = con_get, 3504 .get = con_get,
3475 .put = con_put, 3505 .put = con_put,
@@ -3478,6 +3508,7 @@ static const struct ceph_connection_operations mds_con_ops = {
3478 .verify_authorizer_reply = verify_authorizer_reply, 3508 .verify_authorizer_reply = verify_authorizer_reply,
3479 .invalidate_authorizer = invalidate_authorizer, 3509 .invalidate_authorizer = invalidate_authorizer,
3480 .peer_reset = peer_reset, 3510 .peer_reset = peer_reset,
3511 .alloc_msg = mds_alloc_msg,
3481}; 3512};
3482 3513
3483/* eof */ 3514/* eof */
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 0d3c9240c61b..9278dec9e940 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -20,7 +20,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{ 20{
21 int n = 0; 21 int n = 0;
22 int i; 22 int i;
23 char r; 23
24 /* special case for one mds */
25 if (1 == m->m_max_mds && m->m_info[0].state > 0)
26 return 0;
24 27
25 /* count */ 28 /* count */
26 for (i = 0; i < m->m_max_mds; i++) 29 for (i = 0; i < m->m_max_mds; i++)
@@ -30,8 +33,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
30 return -1; 33 return -1;
31 34
32 /* pick */ 35 /* pick */
33 get_random_bytes(&r, 1); 36 n = prandom_u32() % n;
34 n = r % n;
35 i = 0; 37 i = 0;
36 for (i = 0; n > 0; i++, n--) 38 for (i = 0; n > 0; i++, n--)
37 while (m->m_info[i].state <= 0) 39 while (m->m_info[i].state <= 0)
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index cbb2f54a3019..f01645a27752 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -332,10 +332,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
332 err = -ENOMEM; 332 err = -ENOMEM;
333 if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) 333 if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
334 goto fail; 334 goto fail;
335 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); 335 snapc = ceph_create_snap_context(num, GFP_NOFS);
336 if (!snapc) 336 if (!snapc)
337 goto fail; 337 goto fail;
338 atomic_set(&snapc->nref, 1);
339 338
340 /* build (reverse sorted) snap vector */ 339 /* build (reverse sorted) snap vector */
341 num = 0; 340 num = 0;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6ddc0bca56b2..7d377c9a5e35 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -479,6 +479,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
479 CEPH_FEATURE_FLOCK | 479 CEPH_FEATURE_FLOCK |
480 CEPH_FEATURE_DIRLAYOUTHASH; 480 CEPH_FEATURE_DIRLAYOUTHASH;
481 const unsigned required_features = 0; 481 const unsigned required_features = 0;
482 int page_count;
483 size_t size;
482 int err = -ENOMEM; 484 int err = -ENOMEM;
483 485
484 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); 486 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
@@ -522,8 +524,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
522 524
523 /* set up mempools */ 525 /* set up mempools */
524 err = -ENOMEM; 526 err = -ENOMEM;
525 fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, 527 page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT;
526 fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); 528 size = sizeof (struct page *) * (page_count ? page_count : 1);
529 fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
527 if (!fsc->wb_pagevec_pool) 530 if (!fsc->wb_pagevec_pool)
528 goto fail_trunc_wq; 531 goto fail_trunc_wq;
529 532
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c7b309723dcc..8696be2ff679 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -204,7 +204,6 @@ struct ceph_inode_xattr {
204 * Ceph dentry state 204 * Ceph dentry state
205 */ 205 */
206struct ceph_dentry_info { 206struct ceph_dentry_info {
207 unsigned long flags;
208 struct ceph_mds_session *lease_session; 207 struct ceph_mds_session *lease_session;
209 u32 lease_gen, lease_shared_gen; 208 u32 lease_gen, lease_shared_gen;
210 u32 lease_seq; 209 u32 lease_seq;
@@ -215,18 +214,6 @@ struct ceph_dentry_info {
215 u64 offset; 214 u64 offset;
216}; 215};
217 216
218/*
219 * dentry flags
220 *
221 * The locking for D_COMPLETE is a bit odd:
222 * - we can clear it at almost any time (see ceph_d_prune)
223 * - it is only meaningful if:
224 * - we hold dir inode i_ceph_lock
225 * - we hold dir FILE_SHARED caps
226 * - the dentry D_COMPLETE is set
227 */
228#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */
229
230struct ceph_inode_xattrs_info { 217struct ceph_inode_xattrs_info {
231 /* 218 /*
232 * (still encoded) xattr blob. we avoid the overhead of parsing 219 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -257,7 +244,8 @@ struct ceph_inode_info {
257 u32 i_time_warp_seq; 244 u32 i_time_warp_seq;
258 245
259 unsigned i_ceph_flags; 246 unsigned i_ceph_flags;
260 unsigned long i_release_count; 247 atomic_t i_release_count;
248 atomic_t i_complete_count;
261 249
262 struct ceph_dir_layout i_dir_layout; 250 struct ceph_dir_layout i_dir_layout;
263 struct ceph_file_layout i_layout; 251 struct ceph_file_layout i_layout;
@@ -267,7 +255,7 @@ struct ceph_inode_info {
267 struct timespec i_rctime; 255 struct timespec i_rctime;
268 u64 i_rbytes, i_rfiles, i_rsubdirs; 256 u64 i_rbytes, i_rfiles, i_rsubdirs;
269 u64 i_files, i_subdirs; 257 u64 i_files, i_subdirs;
270 u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */ 258 u64 i_max_offset; /* largest readdir offset, set with complete dir */
271 259
272 struct rb_root i_fragtree; 260 struct rb_root i_fragtree;
273 struct mutex i_fragtree_mutex; 261 struct mutex i_fragtree_mutex;
@@ -436,33 +424,31 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
436#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ 424#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
437#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ 425#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
438 426
439static inline void ceph_i_clear(struct inode *inode, unsigned mask) 427static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
428 int release_count)
440{ 429{
441 struct ceph_inode_info *ci = ceph_inode(inode); 430 atomic_set(&ci->i_complete_count, release_count);
442
443 spin_lock(&ci->i_ceph_lock);
444 ci->i_ceph_flags &= ~mask;
445 spin_unlock(&ci->i_ceph_lock);
446} 431}
447 432
448static inline void ceph_i_set(struct inode *inode, unsigned mask) 433static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
449{ 434{
450 struct ceph_inode_info *ci = ceph_inode(inode); 435 atomic_inc(&ci->i_release_count);
436}
451 437
452 spin_lock(&ci->i_ceph_lock); 438static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
453 ci->i_ceph_flags |= mask; 439{
454 spin_unlock(&ci->i_ceph_lock); 440 return atomic_read(&ci->i_complete_count) ==
441 atomic_read(&ci->i_release_count);
455} 442}
456 443
457static inline bool ceph_i_test(struct inode *inode, unsigned mask) 444static inline void ceph_dir_clear_complete(struct inode *inode)
458{ 445{
459 struct ceph_inode_info *ci = ceph_inode(inode); 446 __ceph_dir_clear_complete(ceph_inode(inode));
460 bool r; 447}
461 448
462 spin_lock(&ci->i_ceph_lock); 449static inline bool ceph_dir_is_complete(struct inode *inode)
463 r = (ci->i_ceph_flags & mask) == mask; 450{
464 spin_unlock(&ci->i_ceph_lock); 451 return __ceph_dir_is_complete(ceph_inode(inode));
465 return r;
466} 452}
467 453
468 454
@@ -489,13 +475,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
489} 475}
490 476
491/* 477/*
492 * set/clear directory D_COMPLETE flag
493 */
494void ceph_dir_set_complete(struct inode *inode);
495void ceph_dir_clear_complete(struct inode *inode);
496bool ceph_dir_test_complete(struct inode *inode);
497
498/*
499 * caps helpers 478 * caps helpers
500 */ 479 */
501static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) 480static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
@@ -584,7 +563,7 @@ struct ceph_file_info {
584 u64 next_offset; /* offset of next chunk (last_name's + 1) */ 563 u64 next_offset; /* offset of next chunk (last_name's + 1) */
585 char *last_name; /* last entry in previous chunk */ 564 char *last_name; /* last entry in previous chunk */
586 struct dentry *dentry; /* next dentry (for dcache readdir) */ 565 struct dentry *dentry; /* next dentry (for dcache readdir) */
587 unsigned long dir_release_count; 566 int dir_release_count;
588 567
589 /* used for -o dirstat read() on directory thing */ 568 /* used for -o dirstat read() on directory thing */
590 char *dir_info; 569 char *dir_info;
@@ -713,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
713extern int ceph_inode_holds_cap(struct inode *inode, int mask); 692extern int ceph_inode_holds_cap(struct inode *inode, int mask);
714 693
715extern int ceph_inode_set_size(struct inode *inode, loff_t size); 694extern int ceph_inode_set_size(struct inode *inode, loff_t size);
716extern void __ceph_do_pending_vmtruncate(struct inode *inode); 695extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock);
717extern void ceph_queue_vmtruncate(struct inode *inode); 696extern void ceph_queue_vmtruncate(struct inode *inode);
718 697
719extern void ceph_queue_invalidate(struct inode *inode); 698extern void ceph_queue_invalidate(struct inode *inode);
@@ -755,6 +734,8 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
755extern void ceph_put_cap(struct ceph_mds_client *mdsc, 734extern void ceph_put_cap(struct ceph_mds_client *mdsc,
756 struct ceph_cap *cap); 735 struct ceph_cap *cap);
757 736
737extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
738 u64 cap_id, u32 migrate_seq, u32 issue_seq);
758extern void ceph_queue_caps_release(struct inode *inode); 739extern void ceph_queue_caps_release(struct inode *inode);
759extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 740extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
760extern int ceph_fsync(struct file *file, loff_t start, loff_t end, 741extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index d4080f309b56..5f3386844134 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -52,6 +52,9 @@ struct ceph_auth_client_ops {
52 */ 52 */
53 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type, 53 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
54 struct ceph_auth_handshake *auth); 54 struct ceph_auth_handshake *auth);
55 /* ensure that an existing authorizer is up to date */
56 int (*update_authorizer)(struct ceph_auth_client *ac, int peer_type,
57 struct ceph_auth_handshake *auth);
55 int (*verify_authorizer_reply)(struct ceph_auth_client *ac, 58 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
56 struct ceph_authorizer *a, size_t len); 59 struct ceph_authorizer *a, size_t len);
57 void (*destroy_authorizer)(struct ceph_auth_client *ac, 60 void (*destroy_authorizer)(struct ceph_auth_client *ac,
@@ -75,6 +78,8 @@ struct ceph_auth_client {
75 u64 global_id; /* our unique id in system */ 78 u64 global_id; /* our unique id in system */
76 const struct ceph_crypto_key *key; /* our secret key */ 79 const struct ceph_crypto_key *key; /* our secret key */
77 unsigned want_keys; /* which services we want */ 80 unsigned want_keys; /* which services we want */
81
82 struct mutex mutex;
78}; 83};
79 84
80extern struct ceph_auth_client *ceph_auth_init(const char *name, 85extern struct ceph_auth_client *ceph_auth_init(const char *name,
@@ -94,5 +99,18 @@ extern int ceph_build_auth(struct ceph_auth_client *ac,
94 void *msg_buf, size_t msg_len); 99 void *msg_buf, size_t msg_len);
95 100
96extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); 101extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
102extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
103 int peer_type,
104 struct ceph_auth_handshake *auth);
105extern void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
106 struct ceph_authorizer *a);
107extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
108 int peer_type,
109 struct ceph_auth_handshake *a);
110extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
111 struct ceph_authorizer *a,
112 size_t len);
113extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac,
114 int peer_type);
97 115
98#endif 116#endif
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 76554cecaab2..4c42080347af 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -41,6 +41,7 @@
41 */ 41 */
42#define CEPH_FEATURES_SUPPORTED_DEFAULT \ 42#define CEPH_FEATURES_SUPPORTED_DEFAULT \
43 (CEPH_FEATURE_NOSRCADDR | \ 43 (CEPH_FEATURE_NOSRCADDR | \
44 CEPH_FEATURE_RECONNECT_SEQ | \
44 CEPH_FEATURE_PGID64 | \ 45 CEPH_FEATURE_PGID64 | \
45 CEPH_FEATURE_PGPOOL3 | \ 46 CEPH_FEATURE_PGPOOL3 | \
46 CEPH_FEATURE_OSDENC | \ 47 CEPH_FEATURE_OSDENC | \
@@ -51,6 +52,7 @@
51 52
52#define CEPH_FEATURES_REQUIRED_DEFAULT \ 53#define CEPH_FEATURES_REQUIRED_DEFAULT \
53 (CEPH_FEATURE_NOSRCADDR | \ 54 (CEPH_FEATURE_NOSRCADDR | \
55 CEPH_FEATURE_RECONNECT_SEQ | \
54 CEPH_FEATURE_PGID64 | \ 56 CEPH_FEATURE_PGID64 | \
55 CEPH_FEATURE_PGPOOL3 | \ 57 CEPH_FEATURE_PGPOOL3 | \
56 CEPH_FEATURE_OSDENC) 58 CEPH_FEATURE_OSDENC)
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 360d9d08ca9e..379f71508995 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -8,6 +8,23 @@
8 8
9#include <linux/ceph/types.h> 9#include <linux/ceph/types.h>
10 10
11/* This seemed to be the easiest place to define these */
12
13#define U8_MAX ((u8)(~0U))
14#define U16_MAX ((u16)(~0U))
15#define U32_MAX ((u32)(~0U))
16#define U64_MAX ((u64)(~0ULL))
17
18#define S8_MAX ((s8)(U8_MAX >> 1))
19#define S16_MAX ((s16)(U16_MAX >> 1))
20#define S32_MAX ((s32)(U32_MAX >> 1))
21#define S64_MAX ((s64)(U64_MAX >> 1LL))
22
23#define S8_MIN ((s8)(-S8_MAX - 1))
24#define S16_MIN ((s16)(-S16_MAX - 1))
25#define S32_MIN ((s32)(-S32_MAX - 1))
26#define S64_MIN ((s64)(-S64_MAX - 1LL))
27
11/* 28/*
12 * in all cases, 29 * in all cases,
13 * void **p pointer to position pointer 30 * void **p pointer to position pointer
@@ -137,14 +154,19 @@ bad:
137static inline void ceph_decode_timespec(struct timespec *ts, 154static inline void ceph_decode_timespec(struct timespec *ts,
138 const struct ceph_timespec *tv) 155 const struct ceph_timespec *tv)
139{ 156{
140 ts->tv_sec = le32_to_cpu(tv->tv_sec); 157 ts->tv_sec = (__kernel_time_t)le32_to_cpu(tv->tv_sec);
141 ts->tv_nsec = le32_to_cpu(tv->tv_nsec); 158 ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec);
142} 159}
143static inline void ceph_encode_timespec(struct ceph_timespec *tv, 160static inline void ceph_encode_timespec(struct ceph_timespec *tv,
144 const struct timespec *ts) 161 const struct timespec *ts)
145{ 162{
146 tv->tv_sec = cpu_to_le32(ts->tv_sec); 163 BUG_ON(ts->tv_sec < 0);
147 tv->tv_nsec = cpu_to_le32(ts->tv_nsec); 164 BUG_ON(ts->tv_sec > (__kernel_time_t)U32_MAX);
165 BUG_ON(ts->tv_nsec < 0);
166 BUG_ON(ts->tv_nsec > (long)U32_MAX);
167
168 tv->tv_sec = cpu_to_le32((u32)ts->tv_sec);
169 tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec);
148} 170}
149 171
150/* 172/*
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 29818fc3fa49..2e3024881a5e 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -66,6 +66,7 @@ struct ceph_options {
66#define CEPH_OSD_IDLE_TTL_DEFAULT 60 66#define CEPH_OSD_IDLE_TTL_DEFAULT 60
67 67
68#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 68#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
69#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
69#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) 70#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
70 71
71#define CEPH_AUTH_NAME_DEFAULT "guest" 72#define CEPH_AUTH_NAME_DEFAULT "guest"
@@ -156,31 +157,11 @@ struct ceph_snap_context {
156 u64 snaps[]; 157 u64 snaps[];
157}; 158};
158 159
159static inline struct ceph_snap_context * 160extern struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
160ceph_get_snap_context(struct ceph_snap_context *sc) 161 gfp_t gfp_flags);
161{ 162extern struct ceph_snap_context *ceph_get_snap_context(
162 /* 163 struct ceph_snap_context *sc);
163 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), 164extern void ceph_put_snap_context(struct ceph_snap_context *sc);
164 atomic_read(&sc->nref)+1);
165 */
166 if (sc)
167 atomic_inc(&sc->nref);
168 return sc;
169}
170
171static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
172{
173 if (!sc)
174 return;
175 /*
176 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
177 atomic_read(&sc->nref)-1);
178 */
179 if (atomic_dec_and_test(&sc->nref)) {
180 /*printk(" deleting snap_context %p\n", sc);*/
181 kfree(sc);
182 }
183}
184 165
185/* 166/*
186 * calculate the number of pages a given length and offset map onto, 167 * calculate the number of pages a given length and offset map onto,
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 60903e0f665c..7c1420bb1dce 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -64,6 +64,77 @@ struct ceph_messenger {
64 u32 required_features; 64 u32 required_features;
65}; 65};
66 66
67enum ceph_msg_data_type {
68 CEPH_MSG_DATA_NONE, /* message contains no data payload */
69 CEPH_MSG_DATA_PAGES, /* data source/destination is a page array */
70 CEPH_MSG_DATA_PAGELIST, /* data source/destination is a pagelist */
71#ifdef CONFIG_BLOCK
72 CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */
73#endif /* CONFIG_BLOCK */
74};
75
76static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
77{
78 switch (type) {
79 case CEPH_MSG_DATA_NONE:
80 case CEPH_MSG_DATA_PAGES:
81 case CEPH_MSG_DATA_PAGELIST:
82#ifdef CONFIG_BLOCK
83 case CEPH_MSG_DATA_BIO:
84#endif /* CONFIG_BLOCK */
85 return true;
86 default:
87 return false;
88 }
89}
90
91struct ceph_msg_data {
92 struct list_head links; /* ceph_msg->data */
93 enum ceph_msg_data_type type;
94 union {
95#ifdef CONFIG_BLOCK
96 struct {
97 struct bio *bio;
98 size_t bio_length;
99 };
100#endif /* CONFIG_BLOCK */
101 struct {
102 struct page **pages; /* NOT OWNER. */
103 size_t length; /* total # bytes */
104 unsigned int alignment; /* first page */
105 };
106 struct ceph_pagelist *pagelist;
107 };
108};
109
110struct ceph_msg_data_cursor {
111 size_t total_resid; /* across all data items */
112 struct list_head *data_head; /* = &ceph_msg->data */
113
114 struct ceph_msg_data *data; /* current data item */
115 size_t resid; /* bytes not yet consumed */
116 bool last_piece; /* current is last piece */
117 bool need_crc; /* crc update needed */
118 union {
119#ifdef CONFIG_BLOCK
120 struct { /* bio */
121 struct bio *bio; /* bio from list */
122 unsigned int vector_index; /* vector from bio */
123 unsigned int vector_offset; /* bytes from vector */
124 };
125#endif /* CONFIG_BLOCK */
126 struct { /* pages */
127 unsigned int page_offset; /* offset in page */
128 unsigned short page_index; /* index in array */
129 unsigned short page_count; /* pages in array */
130 };
131 struct { /* pagelist */
132 struct page *page; /* page from list */
133 size_t offset; /* bytes from list */
134 };
135 };
136};
137
67/* 138/*
68 * a single message. it contains a header (src, dest, message type, etc.), 139 * a single message. it contains a header (src, dest, message type, etc.),
69 * footer (crc values, mainly), a "front" message body, and possibly a 140 * footer (crc values, mainly), a "front" message body, and possibly a
@@ -74,21 +145,15 @@ struct ceph_msg {
74 struct ceph_msg_footer footer; /* footer */ 145 struct ceph_msg_footer footer; /* footer */
75 struct kvec front; /* unaligned blobs of message */ 146 struct kvec front; /* unaligned blobs of message */
76 struct ceph_buffer *middle; 147 struct ceph_buffer *middle;
77 struct page **pages; /* data payload. NOT OWNER. */ 148
78 unsigned nr_pages; /* size of page array */ 149 size_t data_length;
79 unsigned page_alignment; /* io offset in first page */ 150 struct list_head data;
80 struct ceph_pagelist *pagelist; /* instead of pages */ 151 struct ceph_msg_data_cursor cursor;
81 152
82 struct ceph_connection *con; 153 struct ceph_connection *con;
83 struct list_head list_head; 154 struct list_head list_head; /* links for connection lists */
84 155
85 struct kref kref; 156 struct kref kref;
86#ifdef CONFIG_BLOCK
87 struct bio *bio; /* instead of pages/pagelist */
88 struct bio *bio_iter; /* bio iterator */
89 int bio_seg; /* current bio segment */
90#endif /* CONFIG_BLOCK */
91 struct ceph_pagelist *trail; /* the trailing part of the data */
92 bool front_is_vmalloc; 157 bool front_is_vmalloc;
93 bool more_to_follow; 158 bool more_to_follow;
94 bool needs_out_seq; 159 bool needs_out_seq;
@@ -98,12 +163,6 @@ struct ceph_msg {
98 struct ceph_msgpool *pool; 163 struct ceph_msgpool *pool;
99}; 164};
100 165
101struct ceph_msg_pos {
102 int page, page_pos; /* which page; offset in page */
103 int data_pos; /* offset in data payload */
104 bool did_page_crc; /* true if we've calculated crc for current page */
105};
106
107/* ceph connection fault delay defaults, for exponential backoff */ 166/* ceph connection fault delay defaults, for exponential backoff */
108#define BASE_DELAY_INTERVAL (HZ/2) 167#define BASE_DELAY_INTERVAL (HZ/2)
109#define MAX_DELAY_INTERVAL (5 * 60 * HZ) 168#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
@@ -161,7 +220,6 @@ struct ceph_connection {
161 struct ceph_msg *out_msg; /* sending message (== tail of 220 struct ceph_msg *out_msg; /* sending message (== tail of
162 out_sent) */ 221 out_sent) */
163 bool out_msg_done; 222 bool out_msg_done;
164 struct ceph_msg_pos out_msg_pos;
165 223
166 struct kvec out_kvec[8], /* sending header/footer data */ 224 struct kvec out_kvec[8], /* sending header/footer data */
167 *out_kvec_cur; 225 *out_kvec_cur;
@@ -175,7 +233,6 @@ struct ceph_connection {
175 /* message in temps */ 233 /* message in temps */
176 struct ceph_msg_header in_hdr; 234 struct ceph_msg_header in_hdr;
177 struct ceph_msg *in_msg; 235 struct ceph_msg *in_msg;
178 struct ceph_msg_pos in_msg_pos;
179 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ 236 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
180 237
181 char in_tag; /* protocol control byte */ 238 char in_tag; /* protocol control byte */
@@ -218,6 +275,15 @@ extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
218 275
219extern void ceph_con_keepalive(struct ceph_connection *con); 276extern void ceph_con_keepalive(struct ceph_connection *con);
220 277
278extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
279 size_t length, size_t alignment);
280extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
281 struct ceph_pagelist *pagelist);
282#ifdef CONFIG_BLOCK
283extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
284 size_t length);
285#endif /* CONFIG_BLOCK */
286
221extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, 287extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
222 bool can_fail); 288 bool can_fail);
223extern void ceph_msg_kfree(struct ceph_msg *m); 289extern void ceph_msg_kfree(struct ceph_msg *m);
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
index 680d3d648cac..3d94a73b5f30 100644
--- a/include/linux/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
@@ -87,6 +87,7 @@ struct ceph_entity_inst {
87#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ 87#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
88#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ 88#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
89#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ 89#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
90#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */
90 91
91 92
92/* 93/*
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 1dd5d466b6f9..186db0bf4951 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -29,6 +29,7 @@ struct ceph_authorizer;
29 */ 29 */
30typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, 30typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
31 struct ceph_msg *); 31 struct ceph_msg *);
32typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
32 33
33/* a given osd we're communicating with */ 34/* a given osd we're communicating with */
34struct ceph_osd { 35struct ceph_osd {
@@ -48,7 +49,67 @@ struct ceph_osd {
48}; 49};
49 50
50 51
51#define CEPH_OSD_MAX_OP 10 52#define CEPH_OSD_MAX_OP 2
53
54enum ceph_osd_data_type {
55 CEPH_OSD_DATA_TYPE_NONE = 0,
56 CEPH_OSD_DATA_TYPE_PAGES,
57 CEPH_OSD_DATA_TYPE_PAGELIST,
58#ifdef CONFIG_BLOCK
59 CEPH_OSD_DATA_TYPE_BIO,
60#endif /* CONFIG_BLOCK */
61};
62
63struct ceph_osd_data {
64 enum ceph_osd_data_type type;
65 union {
66 struct {
67 struct page **pages;
68 u64 length;
69 u32 alignment;
70 bool pages_from_pool;
71 bool own_pages;
72 };
73 struct ceph_pagelist *pagelist;
74#ifdef CONFIG_BLOCK
75 struct {
76 struct bio *bio; /* list of bios */
77 size_t bio_length; /* total in list */
78 };
79#endif /* CONFIG_BLOCK */
80 };
81};
82
83struct ceph_osd_req_op {
84 u16 op; /* CEPH_OSD_OP_* */
85 u32 payload_len;
86 union {
87 struct ceph_osd_data raw_data_in;
88 struct {
89 u64 offset, length;
90 u64 truncate_size;
91 u32 truncate_seq;
92 struct ceph_osd_data osd_data;
93 } extent;
94 struct {
95 const char *class_name;
96 const char *method_name;
97 struct ceph_osd_data request_info;
98 struct ceph_osd_data request_data;
99 struct ceph_osd_data response_data;
100 __u8 class_len;
101 __u8 method_len;
102 __u8 argc;
103 } cls;
104 struct {
105 u64 cookie;
106 u64 ver;
107 u32 prot_ver;
108 u32 timeout;
109 __u8 flag;
110 } watch;
111 };
112};
52 113
53/* an in-flight request */ 114/* an in-flight request */
54struct ceph_osd_request { 115struct ceph_osd_request {
@@ -63,15 +124,14 @@ struct ceph_osd_request {
63 int r_pg_osds[CEPH_PG_MAX_SIZE]; 124 int r_pg_osds[CEPH_PG_MAX_SIZE];
64 int r_num_pg_osds; 125 int r_num_pg_osds;
65 126
66 struct ceph_connection *r_con_filling_msg;
67
68 struct ceph_msg *r_request, *r_reply; 127 struct ceph_msg *r_request, *r_reply;
69 int r_flags; /* any additional flags for the osd */ 128 int r_flags; /* any additional flags for the osd */
70 u32 r_sent; /* >0 if r_request is sending/sent */ 129 u32 r_sent; /* >0 if r_request is sending/sent */
71 int r_num_ops;
72 130
73 /* encoded message content */ 131 /* request osd ops array */
74 struct ceph_osd_op *r_request_ops; 132 unsigned int r_num_ops;
133 struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP];
134
75 /* these are updated on each send */ 135 /* these are updated on each send */
76 __le32 *r_request_osdmap_epoch; 136 __le32 *r_request_osdmap_epoch;
77 __le32 *r_request_flags; 137 __le32 *r_request_flags;
@@ -85,12 +145,14 @@ struct ceph_osd_request {
85 s32 r_reply_op_result[CEPH_OSD_MAX_OP]; 145 s32 r_reply_op_result[CEPH_OSD_MAX_OP];
86 int r_got_reply; 146 int r_got_reply;
87 int r_linger; 147 int r_linger;
148 int r_completed;
88 149
89 struct ceph_osd_client *r_osdc; 150 struct ceph_osd_client *r_osdc;
90 struct kref r_kref; 151 struct kref r_kref;
91 bool r_mempool; 152 bool r_mempool;
92 struct completion r_completion, r_safe_completion; 153 struct completion r_completion, r_safe_completion;
93 ceph_osdc_callback_t r_callback, r_safe_callback; 154 ceph_osdc_callback_t r_callback;
155 ceph_osdc_unsafe_callback_t r_unsafe_callback;
94 struct ceph_eversion r_reassert_version; 156 struct ceph_eversion r_reassert_version;
95 struct list_head r_unsafe_item; 157 struct list_head r_unsafe_item;
96 158
@@ -104,16 +166,6 @@ struct ceph_osd_request {
104 166
105 struct ceph_file_layout r_file_layout; 167 struct ceph_file_layout r_file_layout;
106 struct ceph_snap_context *r_snapc; /* snap context for writes */ 168 struct ceph_snap_context *r_snapc; /* snap context for writes */
107 unsigned r_num_pages; /* size of page array (follows) */
108 unsigned r_page_alignment; /* io offset in first page */
109 struct page **r_pages; /* pages for data payload */
110 int r_pages_from_pool;
111 int r_own_pages; /* if true, i own page list */
112#ifdef CONFIG_BLOCK
113 struct bio *r_bio; /* instead of pages */
114#endif
115
116 struct ceph_pagelist r_trail; /* trailing part of the data */
117}; 169};
118 170
119struct ceph_osd_event { 171struct ceph_osd_event {
@@ -172,48 +224,8 @@ struct ceph_osd_client {
172 struct workqueue_struct *notify_wq; 224 struct workqueue_struct *notify_wq;
173}; 225};
174 226
175struct ceph_osd_req_op { 227extern int ceph_osdc_setup(void);
176 u16 op; /* CEPH_OSD_OP_* */ 228extern void ceph_osdc_cleanup(void);
177 u32 payload_len;
178 union {
179 struct {
180 u64 offset, length;
181 u64 truncate_size;
182 u32 truncate_seq;
183 } extent;
184 struct {
185 const char *name;
186 const char *val;
187 u32 name_len;
188 u32 value_len;
189 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
190 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
191 } xattr;
192 struct {
193 const char *class_name;
194 const char *method_name;
195 const char *indata;
196 u32 indata_len;
197 __u8 class_len;
198 __u8 method_len;
199 __u8 argc;
200 } cls;
201 struct {
202 u64 cookie;
203 u64 count;
204 } pgls;
205 struct {
206 u64 snapid;
207 } snap;
208 struct {
209 u64 cookie;
210 u64 ver;
211 u32 prot_ver;
212 u32 timeout;
213 __u8 flag;
214 } watch;
215 };
216};
217 229
218extern int ceph_osdc_init(struct ceph_osd_client *osdc, 230extern int ceph_osdc_init(struct ceph_osd_client *osdc,
219 struct ceph_client *client); 231 struct ceph_client *client);
@@ -224,16 +236,71 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
224extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 236extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
225 struct ceph_msg *msg); 237 struct ceph_msg *msg);
226 238
239extern void osd_req_op_init(struct ceph_osd_request *osd_req,
240 unsigned int which, u16 opcode);
241
242extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
243 unsigned int which,
244 struct page **pages, u64 length,
245 u32 alignment, bool pages_from_pool,
246 bool own_pages);
247
248extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
249 unsigned int which, u16 opcode,
250 u64 offset, u64 length,
251 u64 truncate_size, u32 truncate_seq);
252extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
253 unsigned int which, u64 length);
254
255extern struct ceph_osd_data *osd_req_op_extent_osd_data(
256 struct ceph_osd_request *osd_req,
257 unsigned int which);
258extern struct ceph_osd_data *osd_req_op_cls_response_data(
259 struct ceph_osd_request *osd_req,
260 unsigned int which);
261
262extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
263 unsigned int which,
264 struct page **pages, u64 length,
265 u32 alignment, bool pages_from_pool,
266 bool own_pages);
267extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *,
268 unsigned int which,
269 struct ceph_pagelist *pagelist);
270#ifdef CONFIG_BLOCK
271extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *,
272 unsigned int which,
273 struct bio *bio, size_t bio_length);
274#endif /* CONFIG_BLOCK */
275
276extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
277 unsigned int which,
278 struct ceph_pagelist *pagelist);
279extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *,
280 unsigned int which,
281 struct page **pages, u64 length,
282 u32 alignment, bool pages_from_pool,
283 bool own_pages);
284extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *,
285 unsigned int which,
286 struct page **pages, u64 length,
287 u32 alignment, bool pages_from_pool,
288 bool own_pages);
289
290extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
291 unsigned int which, u16 opcode,
292 const char *class, const char *method);
293extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
294 unsigned int which, u16 opcode,
295 u64 cookie, u64 version, int flag);
296
227extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 297extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
228 struct ceph_snap_context *snapc, 298 struct ceph_snap_context *snapc,
229 unsigned int num_op, 299 unsigned int num_ops,
230 bool use_mempool, 300 bool use_mempool,
231 gfp_t gfp_flags); 301 gfp_t gfp_flags);
232 302
233extern void ceph_osdc_build_request(struct ceph_osd_request *req, 303extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
234 u64 off, u64 len,
235 unsigned int num_op,
236 struct ceph_osd_req_op *src_ops,
237 struct ceph_snap_context *snapc, 304 struct ceph_snap_context *snapc,
238 u64 snap_id, 305 u64 snap_id,
239 struct timespec *mtime); 306 struct timespec *mtime);
@@ -241,12 +308,11 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req,
241extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 308extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
242 struct ceph_file_layout *layout, 309 struct ceph_file_layout *layout,
243 struct ceph_vino vino, 310 struct ceph_vino vino,
244 u64 offset, u64 *len, int op, int flags, 311 u64 offset, u64 *len,
312 int num_ops, int opcode, int flags,
245 struct ceph_snap_context *snapc, 313 struct ceph_snap_context *snapc,
246 int do_sync, u32 truncate_seq, 314 u32 truncate_seq, u64 truncate_size,
247 u64 truncate_size, 315 bool use_mempool);
248 struct timespec *mtime,
249 bool use_mempool, int page_align);
250 316
251extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 317extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
252 struct ceph_osd_request *req); 318 struct ceph_osd_request *req);
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index c819190d1642..d05cc4451af6 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -3,6 +3,7 @@
3 3
4#include <linux/rbtree.h> 4#include <linux/rbtree.h>
5#include <linux/ceph/types.h> 5#include <linux/ceph/types.h>
6#include <linux/ceph/decode.h>
6#include <linux/ceph/ceph_fs.h> 7#include <linux/ceph/ceph_fs.h>
7#include <linux/crush/crush.h> 8#include <linux/crush/crush.h>
8 9
@@ -119,6 +120,29 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
119 return &map->osd_addr[osd]; 120 return &map->osd_addr[osd];
120} 121}
121 122
123static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
124{
125 __u8 version;
126
127 if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) {
128 pr_warning("incomplete pg encoding");
129
130 return -EINVAL;
131 }
132 version = ceph_decode_8(p);
133 if (version > 1) {
134 pr_warning("do not understand pg encoding %d > 1",
135 (int)version);
136 return -EINVAL;
137 }
138
139 pgid->pool = ceph_decode_64(p);
140 pgid->seed = ceph_decode_32(p);
141 *p += 4; /* skip deprecated preferred value */
142
143 return 0;
144}
145
122extern struct ceph_osdmap *osdmap_decode(void **p, void *end); 146extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
123extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 147extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
124 struct ceph_osdmap *map, 148 struct ceph_osdmap *map,
@@ -131,10 +155,8 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
131 u64 *bno, u64 *oxoff, u64 *oxlen); 155 u64 *bno, u64 *oxoff, u64 *oxlen);
132 156
133/* calculate mapping of object to a placement group */ 157/* calculate mapping of object to a placement group */
134extern int ceph_calc_object_layout(struct ceph_pg *pg, 158extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid,
135 const char *oid, 159 struct ceph_osdmap *osdmap, uint64_t pool);
136 struct ceph_file_layout *fl,
137 struct ceph_osdmap *osdmap);
138extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, 160extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
139 struct ceph_pg pgid, 161 struct ceph_pg pgid,
140 int *acting); 162 int *acting);
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index e87ef435e11b..958d9856912c 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
11 crypto.o armor.o \ 11 crypto.o armor.o \
12 auth_x.o \ 12 auth_x.o \
13 ceph_fs.o ceph_strings.o ceph_hash.o \ 13 ceph_fs.o ceph_strings.o ceph_hash.o \
14 pagevec.o 14 pagevec.o snapshot.o
15 15
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index b4bf4ac090f1..6b923bcaa2a4 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -47,6 +47,7 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp
47 if (!ac) 47 if (!ac)
48 goto out; 48 goto out;
49 49
50 mutex_init(&ac->mutex);
50 ac->negotiating = true; 51 ac->negotiating = true;
51 if (name) 52 if (name)
52 ac->name = name; 53 ac->name = name;
@@ -73,10 +74,12 @@ void ceph_auth_destroy(struct ceph_auth_client *ac)
73 */ 74 */
74void ceph_auth_reset(struct ceph_auth_client *ac) 75void ceph_auth_reset(struct ceph_auth_client *ac)
75{ 76{
77 mutex_lock(&ac->mutex);
76 dout("auth_reset %p\n", ac); 78 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating) 79 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac); 80 ac->ops->reset(ac);
79 ac->negotiating = true; 81 ac->negotiating = true;
82 mutex_unlock(&ac->mutex);
80} 83}
81 84
82int ceph_entity_name_encode(const char *name, void **p, void *end) 85int ceph_entity_name_encode(const char *name, void **p, void *end)
@@ -102,6 +105,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
102 int i, num; 105 int i, num;
103 int ret; 106 int ret;
104 107
108 mutex_lock(&ac->mutex);
105 dout("auth_build_hello\n"); 109 dout("auth_build_hello\n");
106 monhdr->have_version = 0; 110 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1); 111 monhdr->session_mon = cpu_to_le16(-1);
@@ -122,15 +126,19 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
122 126
123 ret = ceph_entity_name_encode(ac->name, &p, end); 127 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0) 128 if (ret < 0)
125 return ret; 129 goto out;
126 ceph_decode_need(&p, end, sizeof(u64), bad); 130 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id); 131 ceph_encode_64(&p, ac->global_id);
128 132
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32)); 133 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf; 134 ret = p - buf;
135out:
136 mutex_unlock(&ac->mutex);
137 return ret;
131 138
132bad: 139bad:
133 return -ERANGE; 140 ret = -ERANGE;
141 goto out;
134} 142}
135 143
136static int ceph_build_auth_request(struct ceph_auth_client *ac, 144static int ceph_build_auth_request(struct ceph_auth_client *ac,
@@ -151,11 +159,13 @@ static int ceph_build_auth_request(struct ceph_auth_client *ac,
151 if (ret < 0) { 159 if (ret < 0) {
152 pr_err("error %d building auth method %s request\n", ret, 160 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name); 161 ac->ops->name);
154 return ret; 162 goto out;
155 } 163 }
156 dout(" built request %d bytes\n", ret); 164 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret); 165 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf; 166 ret = p + ret - msg_buf;
167out:
168 return ret;
159} 169}
160 170
161/* 171/*
@@ -176,6 +186,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
176 int result_msg_len; 186 int result_msg_len;
177 int ret = -EINVAL; 187 int ret = -EINVAL;
178 188
189 mutex_lock(&ac->mutex);
179 dout("handle_auth_reply %p %p\n", p, end); 190 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); 191 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p); 192 protocol = ceph_decode_32(&p);
@@ -227,33 +238,103 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
227 238
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end); 239 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) { 240 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len); 241 ret = ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) { 242 } else if (ret) {
232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret); 243 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret;
234 } 244 }
235 return 0;
236 245
237bad:
238 pr_err("failed to decode auth msg\n");
239out: 246out:
247 mutex_unlock(&ac->mutex);
240 return ret; 248 return ret;
249
250bad:
251 pr_err("failed to decode auth msg\n");
252 ret = -EINVAL;
253 goto out;
241} 254}
242 255
243int ceph_build_auth(struct ceph_auth_client *ac, 256int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len) 257 void *msg_buf, size_t msg_len)
245{ 258{
259 int ret = 0;
260
261 mutex_lock(&ac->mutex);
246 if (!ac->protocol) 262 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len); 263 ret = ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops); 264 else if (ac->ops->should_authenticate(ac))
249 if (ac->ops->should_authenticate(ac)) 265 ret = ceph_build_auth_request(ac, msg_buf, msg_len);
250 return ceph_build_auth_request(ac, msg_buf, msg_len); 266 mutex_unlock(&ac->mutex);
251 return 0; 267 return ret;
252} 268}
253 269
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac) 270int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{ 271{
256 if (!ac->ops) 272 int ret = 0;
257 return 0; 273
258 return ac->ops->is_authenticated(ac); 274 mutex_lock(&ac->mutex);
275 if (ac->ops)
276 ret = ac->ops->is_authenticated(ac);
277 mutex_unlock(&ac->mutex);
278 return ret;
279}
280EXPORT_SYMBOL(ceph_auth_is_authenticated);
281
282int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
283 int peer_type,
284 struct ceph_auth_handshake *auth)
285{
286 int ret = 0;
287
288 mutex_lock(&ac->mutex);
289 if (ac->ops && ac->ops->create_authorizer)
290 ret = ac->ops->create_authorizer(ac, peer_type, auth);
291 mutex_unlock(&ac->mutex);
292 return ret;
293}
294EXPORT_SYMBOL(ceph_auth_create_authorizer);
295
296void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
297 struct ceph_authorizer *a)
298{
299 mutex_lock(&ac->mutex);
300 if (ac->ops && ac->ops->destroy_authorizer)
301 ac->ops->destroy_authorizer(ac, a);
302 mutex_unlock(&ac->mutex);
303}
304EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
305
306int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
307 int peer_type,
308 struct ceph_auth_handshake *a)
309{
310 int ret = 0;
311
312 mutex_lock(&ac->mutex);
313 if (ac->ops && ac->ops->update_authorizer)
314 ret = ac->ops->update_authorizer(ac, peer_type, a);
315 mutex_unlock(&ac->mutex);
316 return ret;
317}
318EXPORT_SYMBOL(ceph_auth_update_authorizer);
319
320int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
321 struct ceph_authorizer *a, size_t len)
322{
323 int ret = 0;
324
325 mutex_lock(&ac->mutex);
326 if (ac->ops && ac->ops->verify_authorizer_reply)
327 ret = ac->ops->verify_authorizer_reply(ac, a, len);
328 mutex_unlock(&ac->mutex);
329 return ret;
330}
331EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply);
332
333void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type)
334{
335 mutex_lock(&ac->mutex);
336 if (ac->ops && ac->ops->invalidate_authorizer)
337 ac->ops->invalidate_authorizer(ac, peer_type);
338 mutex_unlock(&ac->mutex);
259} 339}
340EXPORT_SYMBOL(ceph_auth_invalidate_authorizer);
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index a16bf14eb027..96238ba95f2b 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -298,6 +298,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
298 return -ENOMEM; 298 return -ENOMEM;
299 } 299 }
300 au->service = th->service; 300 au->service = th->service;
301 au->secret_id = th->secret_id;
301 302
302 msg_a = au->buf->vec.iov_base; 303 msg_a = au->buf->vec.iov_base;
303 msg_a->struct_v = 1; 304 msg_a->struct_v = 1;
@@ -555,6 +556,26 @@ static int ceph_x_create_authorizer(
555 return 0; 556 return 0;
556} 557}
557 558
559static int ceph_x_update_authorizer(
560 struct ceph_auth_client *ac, int peer_type,
561 struct ceph_auth_handshake *auth)
562{
563 struct ceph_x_authorizer *au;
564 struct ceph_x_ticket_handler *th;
565
566 th = get_ticket_handler(ac, peer_type);
567 if (IS_ERR(th))
568 return PTR_ERR(th);
569
570 au = (struct ceph_x_authorizer *)auth->authorizer;
571 if (au->secret_id < th->secret_id) {
572 dout("ceph_x_update_authorizer service %u secret %llu < %llu\n",
573 au->service, au->secret_id, th->secret_id);
574 return ceph_x_build_authorizer(ac, th, au);
575 }
576 return 0;
577}
578
558static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, 579static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
559 struct ceph_authorizer *a, size_t len) 580 struct ceph_authorizer *a, size_t len)
560{ 581{
@@ -630,7 +651,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
630 651
631 th = get_ticket_handler(ac, peer_type); 652 th = get_ticket_handler(ac, peer_type);
632 if (!IS_ERR(th)) 653 if (!IS_ERR(th))
633 remove_ticket_handler(ac, th); 654 memset(&th->validity, 0, sizeof(th->validity));
634} 655}
635 656
636 657
@@ -641,6 +662,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = {
641 .build_request = ceph_x_build_request, 662 .build_request = ceph_x_build_request,
642 .handle_reply = ceph_x_handle_reply, 663 .handle_reply = ceph_x_handle_reply,
643 .create_authorizer = ceph_x_create_authorizer, 664 .create_authorizer = ceph_x_create_authorizer,
665 .update_authorizer = ceph_x_update_authorizer,
644 .verify_authorizer_reply = ceph_x_verify_authorizer_reply, 666 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
645 .destroy_authorizer = ceph_x_destroy_authorizer, 667 .destroy_authorizer = ceph_x_destroy_authorizer,
646 .invalidate_authorizer = ceph_x_invalidate_authorizer, 668 .invalidate_authorizer = ceph_x_invalidate_authorizer,
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index f459e93b774f..c5a058da7ac8 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -29,6 +29,7 @@ struct ceph_x_authorizer {
29 struct ceph_buffer *buf; 29 struct ceph_buffer *buf;
30 unsigned int service; 30 unsigned int service;
31 u64 nonce; 31 u64 nonce;
32 u64 secret_id;
32 char reply_buf[128]; /* big enough for encrypted blob */ 33 char reply_buf[128]; /* big enough for encrypted blob */
33}; 34};
34 35
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index e65e6e4be38b..34b11ee8124e 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -606,11 +606,17 @@ static int __init init_ceph_lib(void)
606 if (ret < 0) 606 if (ret < 0)
607 goto out_crypto; 607 goto out_crypto;
608 608
609 ret = ceph_osdc_setup();
610 if (ret < 0)
611 goto out_msgr;
612
609 pr_info("loaded (mon/osd proto %d/%d)\n", 613 pr_info("loaded (mon/osd proto %d/%d)\n",
610 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL); 614 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
611 615
612 return 0; 616 return 0;
613 617
618out_msgr:
619 ceph_msgr_exit();
614out_crypto: 620out_crypto:
615 ceph_crypto_shutdown(); 621 ceph_crypto_shutdown();
616out_debugfs: 622out_debugfs:
@@ -622,6 +628,7 @@ out:
622static void __exit exit_ceph_lib(void) 628static void __exit exit_ceph_lib(void)
623{ 629{
624 dout("exit_ceph_lib\n"); 630 dout("exit_ceph_lib\n");
631 ceph_osdc_cleanup();
625 ceph_msgr_exit(); 632 ceph_msgr_exit();
626 ceph_crypto_shutdown(); 633 ceph_crypto_shutdown();
627 ceph_debugfs_cleanup(); 634 ceph_debugfs_cleanup();
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 00d051f4894e..83661cdc0766 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -123,8 +123,8 @@ static int osdc_show(struct seq_file *s, void *pp)
123 mutex_lock(&osdc->request_mutex); 123 mutex_lock(&osdc->request_mutex);
124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { 124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
125 struct ceph_osd_request *req; 125 struct ceph_osd_request *req;
126 unsigned int i;
126 int opcode; 127 int opcode;
127 int i;
128 128
129 req = rb_entry(p, struct ceph_osd_request, r_node); 129 req = rb_entry(p, struct ceph_osd_request, r_node);
130 130
@@ -142,7 +142,7 @@ static int osdc_show(struct seq_file *s, void *pp)
142 seq_printf(s, "\t"); 142 seq_printf(s, "\t");
143 143
144 for (i = 0; i < req->r_num_ops; i++) { 144 for (i = 0; i < req->r_num_ops; i++) {
145 opcode = le16_to_cpu(req->r_request_ops[i].op); 145 opcode = req->r_ops[i].op;
146 seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); 146 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
147 } 147 }
148 148
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 2c0669fb54e3..eb0a46a49bd4 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -21,6 +21,9 @@
21#include <linux/ceph/pagelist.h> 21#include <linux/ceph/pagelist.h>
22#include <linux/export.h> 22#include <linux/export.h>
23 23
24#define list_entry_next(pos, member) \
25 list_entry(pos->member.next, typeof(*pos), member)
26
24/* 27/*
25 * Ceph uses the messenger to exchange ceph_msg messages with other 28 * Ceph uses the messenger to exchange ceph_msg messages with other
26 * hosts in the system. The messenger provides ordered and reliable 29 * hosts in the system. The messenger provides ordered and reliable
@@ -149,6 +152,11 @@ static bool con_flag_test_and_set(struct ceph_connection *con,
149 return test_and_set_bit(con_flag, &con->flags); 152 return test_and_set_bit(con_flag, &con->flags);
150} 153}
151 154
155/* Slab caches for frequently-allocated structures */
156
157static struct kmem_cache *ceph_msg_cache;
158static struct kmem_cache *ceph_msg_data_cache;
159
152/* static tag bytes (protocol control messages) */ 160/* static tag bytes (protocol control messages) */
153static char tag_msg = CEPH_MSGR_TAG_MSG; 161static char tag_msg = CEPH_MSGR_TAG_MSG;
154static char tag_ack = CEPH_MSGR_TAG_ACK; 162static char tag_ack = CEPH_MSGR_TAG_ACK;
@@ -223,6 +231,41 @@ static void encode_my_addr(struct ceph_messenger *msgr)
223 */ 231 */
224static struct workqueue_struct *ceph_msgr_wq; 232static struct workqueue_struct *ceph_msgr_wq;
225 233
234static int ceph_msgr_slab_init(void)
235{
236 BUG_ON(ceph_msg_cache);
237 ceph_msg_cache = kmem_cache_create("ceph_msg",
238 sizeof (struct ceph_msg),
239 __alignof__(struct ceph_msg), 0, NULL);
240
241 if (!ceph_msg_cache)
242 return -ENOMEM;
243
244 BUG_ON(ceph_msg_data_cache);
245 ceph_msg_data_cache = kmem_cache_create("ceph_msg_data",
246 sizeof (struct ceph_msg_data),
247 __alignof__(struct ceph_msg_data),
248 0, NULL);
249 if (ceph_msg_data_cache)
250 return 0;
251
252 kmem_cache_destroy(ceph_msg_cache);
253 ceph_msg_cache = NULL;
254
255 return -ENOMEM;
256}
257
258static void ceph_msgr_slab_exit(void)
259{
260 BUG_ON(!ceph_msg_data_cache);
261 kmem_cache_destroy(ceph_msg_data_cache);
262 ceph_msg_data_cache = NULL;
263
264 BUG_ON(!ceph_msg_cache);
265 kmem_cache_destroy(ceph_msg_cache);
266 ceph_msg_cache = NULL;
267}
268
226static void _ceph_msgr_exit(void) 269static void _ceph_msgr_exit(void)
227{ 270{
228 if (ceph_msgr_wq) { 271 if (ceph_msgr_wq) {
@@ -230,6 +273,8 @@ static void _ceph_msgr_exit(void)
230 ceph_msgr_wq = NULL; 273 ceph_msgr_wq = NULL;
231 } 274 }
232 275
276 ceph_msgr_slab_exit();
277
233 BUG_ON(zero_page == NULL); 278 BUG_ON(zero_page == NULL);
234 kunmap(zero_page); 279 kunmap(zero_page);
235 page_cache_release(zero_page); 280 page_cache_release(zero_page);
@@ -242,6 +287,9 @@ int ceph_msgr_init(void)
242 zero_page = ZERO_PAGE(0); 287 zero_page = ZERO_PAGE(0);
243 page_cache_get(zero_page); 288 page_cache_get(zero_page);
244 289
290 if (ceph_msgr_slab_init())
291 return -ENOMEM;
292
245 ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); 293 ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
246 if (ceph_msgr_wq) 294 if (ceph_msgr_wq)
247 return 0; 295 return 0;
@@ -471,6 +519,22 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
471 return r; 519 return r;
472} 520}
473 521
522static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
523 int page_offset, size_t length)
524{
525 void *kaddr;
526 int ret;
527
528 BUG_ON(page_offset + length > PAGE_SIZE);
529
530 kaddr = kmap(page);
531 BUG_ON(!kaddr);
532 ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length);
533 kunmap(page);
534
535 return ret;
536}
537
474/* 538/*
475 * write something. @more is true if caller will be sending more data 539 * write something. @more is true if caller will be sending more data
476 * shortly. 540 * shortly.
@@ -493,7 +557,7 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
493} 557}
494 558
495static int ceph_tcp_sendpage(struct socket *sock, struct page *page, 559static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
496 int offset, size_t size, int more) 560 int offset, size_t size, bool more)
497{ 561{
498 int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); 562 int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR);
499 int ret; 563 int ret;
@@ -697,50 +761,397 @@ static void con_out_kvec_add(struct ceph_connection *con,
697} 761}
698 762
699#ifdef CONFIG_BLOCK 763#ifdef CONFIG_BLOCK
700static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) 764
765/*
766 * For a bio data item, a piece is whatever remains of the next
767 * entry in the current bio iovec, or the first entry in the next
768 * bio in the list.
769 */
770static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
771 size_t length)
701{ 772{
702 if (!bio) { 773 struct ceph_msg_data *data = cursor->data;
703 *iter = NULL; 774 struct bio *bio;
704 *seg = 0; 775
705 return; 776 BUG_ON(data->type != CEPH_MSG_DATA_BIO);
777
778 bio = data->bio;
779 BUG_ON(!bio);
780 BUG_ON(!bio->bi_vcnt);
781
782 cursor->resid = min(length, data->bio_length);
783 cursor->bio = bio;
784 cursor->vector_index = 0;
785 cursor->vector_offset = 0;
786 cursor->last_piece = length <= bio->bi_io_vec[0].bv_len;
787}
788
789static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
790 size_t *page_offset,
791 size_t *length)
792{
793 struct ceph_msg_data *data = cursor->data;
794 struct bio *bio;
795 struct bio_vec *bio_vec;
796 unsigned int index;
797
798 BUG_ON(data->type != CEPH_MSG_DATA_BIO);
799
800 bio = cursor->bio;
801 BUG_ON(!bio);
802
803 index = cursor->vector_index;
804 BUG_ON(index >= (unsigned int) bio->bi_vcnt);
805
806 bio_vec = &bio->bi_io_vec[index];
807 BUG_ON(cursor->vector_offset >= bio_vec->bv_len);
808 *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset);
809 BUG_ON(*page_offset >= PAGE_SIZE);
810 if (cursor->last_piece) /* pagelist offset is always 0 */
811 *length = cursor->resid;
812 else
813 *length = (size_t) (bio_vec->bv_len - cursor->vector_offset);
814 BUG_ON(*length > cursor->resid);
815 BUG_ON(*page_offset + *length > PAGE_SIZE);
816
817 return bio_vec->bv_page;
818}
819
820static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
821 size_t bytes)
822{
823 struct bio *bio;
824 struct bio_vec *bio_vec;
825 unsigned int index;
826
827 BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
828
829 bio = cursor->bio;
830 BUG_ON(!bio);
831
832 index = cursor->vector_index;
833 BUG_ON(index >= (unsigned int) bio->bi_vcnt);
834 bio_vec = &bio->bi_io_vec[index];
835
836 /* Advance the cursor offset */
837
838 BUG_ON(cursor->resid < bytes);
839 cursor->resid -= bytes;
840 cursor->vector_offset += bytes;
841 if (cursor->vector_offset < bio_vec->bv_len)
842 return false; /* more bytes to process in this segment */
843 BUG_ON(cursor->vector_offset != bio_vec->bv_len);
844
845 /* Move on to the next segment, and possibly the next bio */
846
847 if (++index == (unsigned int) bio->bi_vcnt) {
848 bio = bio->bi_next;
849 index = 0;
706 } 850 }
707 *iter = bio; 851 cursor->bio = bio;
708 *seg = bio->bi_idx; 852 cursor->vector_index = index;
853 cursor->vector_offset = 0;
854
855 if (!cursor->last_piece) {
856 BUG_ON(!cursor->resid);
857 BUG_ON(!bio);
858 /* A short read is OK, so use <= rather than == */
859 if (cursor->resid <= bio->bi_io_vec[index].bv_len)
860 cursor->last_piece = true;
861 }
862
863 return true;
709} 864}
865#endif /* CONFIG_BLOCK */
710 866
711static void iter_bio_next(struct bio **bio_iter, int *seg) 867/*
868 * For a page array, a piece comes from the first page in the array
869 * that has not already been fully consumed.
870 */
871static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
872 size_t length)
712{ 873{
713 if (*bio_iter == NULL) 874 struct ceph_msg_data *data = cursor->data;
714 return; 875 int page_count;
876
877 BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
715 878
716 BUG_ON(*seg >= (*bio_iter)->bi_vcnt); 879 BUG_ON(!data->pages);
880 BUG_ON(!data->length);
717 881
718 (*seg)++; 882 cursor->resid = min(length, data->length);
719 if (*seg == (*bio_iter)->bi_vcnt) 883 page_count = calc_pages_for(data->alignment, (u64)data->length);
720 init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); 884 cursor->page_offset = data->alignment & ~PAGE_MASK;
885 cursor->page_index = 0;
886 BUG_ON(page_count > (int)USHRT_MAX);
887 cursor->page_count = (unsigned short)page_count;
888 BUG_ON(length > SIZE_MAX - cursor->page_offset);
889 cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE;
721} 890}
722#endif
723 891
724static void prepare_write_message_data(struct ceph_connection *con) 892static struct page *
893ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor,
894 size_t *page_offset, size_t *length)
725{ 895{
726 struct ceph_msg *msg = con->out_msg; 896 struct ceph_msg_data *data = cursor->data;
727 897
728 BUG_ON(!msg); 898 BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
729 BUG_ON(!msg->hdr.data_len); 899
900 BUG_ON(cursor->page_index >= cursor->page_count);
901 BUG_ON(cursor->page_offset >= PAGE_SIZE);
902
903 *page_offset = cursor->page_offset;
904 if (cursor->last_piece)
905 *length = cursor->resid;
906 else
907 *length = PAGE_SIZE - *page_offset;
908
909 return data->pages[cursor->page_index];
910}
911
912static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
913 size_t bytes)
914{
915 BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES);
916
917 BUG_ON(cursor->page_offset + bytes > PAGE_SIZE);
918
919 /* Advance the cursor page offset */
920
921 cursor->resid -= bytes;
922 cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK;
923 if (!bytes || cursor->page_offset)
924 return false; /* more bytes to process in the current page */
925
926 /* Move on to the next page; offset is already at 0 */
927
928 BUG_ON(cursor->page_index >= cursor->page_count);
929 cursor->page_index++;
930 cursor->last_piece = cursor->resid <= PAGE_SIZE;
931
932 return true;
933}
934
935/*
936 * For a pagelist, a piece is whatever remains to be consumed in the
937 * first page in the list, or the front of the next page.
938 */
939static void
940ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor,
941 size_t length)
942{
943 struct ceph_msg_data *data = cursor->data;
944 struct ceph_pagelist *pagelist;
945 struct page *page;
946
947 BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
948
949 pagelist = data->pagelist;
950 BUG_ON(!pagelist);
951
952 if (!length)
953 return; /* pagelist can be assigned but empty */
954
955 BUG_ON(list_empty(&pagelist->head));
956 page = list_first_entry(&pagelist->head, struct page, lru);
957
958 cursor->resid = min(length, pagelist->length);
959 cursor->page = page;
960 cursor->offset = 0;
961 cursor->last_piece = cursor->resid <= PAGE_SIZE;
962}
963
964static struct page *
965ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor,
966 size_t *page_offset, size_t *length)
967{
968 struct ceph_msg_data *data = cursor->data;
969 struct ceph_pagelist *pagelist;
970
971 BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
730 972
731 /* initialize page iterator */ 973 pagelist = data->pagelist;
732 con->out_msg_pos.page = 0; 974 BUG_ON(!pagelist);
733 if (msg->pages) 975
734 con->out_msg_pos.page_pos = msg->page_alignment; 976 BUG_ON(!cursor->page);
977 BUG_ON(cursor->offset + cursor->resid != pagelist->length);
978
979 /* offset of first page in pagelist is always 0 */
980 *page_offset = cursor->offset & ~PAGE_MASK;
981 if (cursor->last_piece)
982 *length = cursor->resid;
735 else 983 else
736 con->out_msg_pos.page_pos = 0; 984 *length = PAGE_SIZE - *page_offset;
985
986 return cursor->page;
987}
988
989static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
990 size_t bytes)
991{
992 struct ceph_msg_data *data = cursor->data;
993 struct ceph_pagelist *pagelist;
994
995 BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
996
997 pagelist = data->pagelist;
998 BUG_ON(!pagelist);
999
1000 BUG_ON(cursor->offset + cursor->resid != pagelist->length);
1001 BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE);
1002
1003 /* Advance the cursor offset */
1004
1005 cursor->resid -= bytes;
1006 cursor->offset += bytes;
1007 /* offset of first page in pagelist is always 0 */
1008 if (!bytes || cursor->offset & ~PAGE_MASK)
1009 return false; /* more bytes to process in the current page */
1010
1011 /* Move on to the next page */
1012
1013 BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
1014 cursor->page = list_entry_next(cursor->page, lru);
1015 cursor->last_piece = cursor->resid <= PAGE_SIZE;
1016
1017 return true;
1018}
1019
1020/*
1021 * Message data is handled (sent or received) in pieces, where each
1022 * piece resides on a single page. The network layer might not
1023 * consume an entire piece at once. A data item's cursor keeps
1024 * track of which piece is next to process and how much remains to
1025 * be processed in that piece. It also tracks whether the current
1026 * piece is the last one in the data item.
1027 */
1028static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
1029{
1030 size_t length = cursor->total_resid;
1031
1032 switch (cursor->data->type) {
1033 case CEPH_MSG_DATA_PAGELIST:
1034 ceph_msg_data_pagelist_cursor_init(cursor, length);
1035 break;
1036 case CEPH_MSG_DATA_PAGES:
1037 ceph_msg_data_pages_cursor_init(cursor, length);
1038 break;
737#ifdef CONFIG_BLOCK 1039#ifdef CONFIG_BLOCK
738 if (msg->bio) 1040 case CEPH_MSG_DATA_BIO:
739 init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); 1041 ceph_msg_data_bio_cursor_init(cursor, length);
740#endif 1042 break;
741 con->out_msg_pos.data_pos = 0; 1043#endif /* CONFIG_BLOCK */
742 con->out_msg_pos.did_page_crc = false; 1044 case CEPH_MSG_DATA_NONE:
743 con->out_more = 1; /* data + footer will follow */ 1045 default:
1046 /* BUG(); */
1047 break;
1048 }
1049 cursor->need_crc = true;
1050}
1051
1052static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
1053{
1054 struct ceph_msg_data_cursor *cursor = &msg->cursor;
1055 struct ceph_msg_data *data;
1056
1057 BUG_ON(!length);
1058 BUG_ON(length > msg->data_length);
1059 BUG_ON(list_empty(&msg->data));
1060
1061 cursor->data_head = &msg->data;
1062 cursor->total_resid = length;
1063 data = list_first_entry(&msg->data, struct ceph_msg_data, links);
1064 cursor->data = data;
1065
1066 __ceph_msg_data_cursor_init(cursor);
1067}
1068
1069/*
1070 * Return the page containing the next piece to process for a given
1071 * data item, and supply the page offset and length of that piece.
1072 * Indicate whether this is the last piece in this data item.
1073 */
1074static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
1075 size_t *page_offset, size_t *length,
1076 bool *last_piece)
1077{
1078 struct page *page;
1079
1080 switch (cursor->data->type) {
1081 case CEPH_MSG_DATA_PAGELIST:
1082 page = ceph_msg_data_pagelist_next(cursor, page_offset, length);
1083 break;
1084 case CEPH_MSG_DATA_PAGES:
1085 page = ceph_msg_data_pages_next(cursor, page_offset, length);
1086 break;
1087#ifdef CONFIG_BLOCK
1088 case CEPH_MSG_DATA_BIO:
1089 page = ceph_msg_data_bio_next(cursor, page_offset, length);
1090 break;
1091#endif /* CONFIG_BLOCK */
1092 case CEPH_MSG_DATA_NONE:
1093 default:
1094 page = NULL;
1095 break;
1096 }
1097 BUG_ON(!page);
1098 BUG_ON(*page_offset + *length > PAGE_SIZE);
1099 BUG_ON(!*length);
1100 if (last_piece)
1101 *last_piece = cursor->last_piece;
1102
1103 return page;
1104}
1105
1106/*
1107 * Returns true if the result moves the cursor on to the next piece
1108 * of the data item.
1109 */
1110static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
1111 size_t bytes)
1112{
1113 bool new_piece;
1114
1115 BUG_ON(bytes > cursor->resid);
1116 switch (cursor->data->type) {
1117 case CEPH_MSG_DATA_PAGELIST:
1118 new_piece = ceph_msg_data_pagelist_advance(cursor, bytes);
1119 break;
1120 case CEPH_MSG_DATA_PAGES:
1121 new_piece = ceph_msg_data_pages_advance(cursor, bytes);
1122 break;
1123#ifdef CONFIG_BLOCK
1124 case CEPH_MSG_DATA_BIO:
1125 new_piece = ceph_msg_data_bio_advance(cursor, bytes);
1126 break;
1127#endif /* CONFIG_BLOCK */
1128 case CEPH_MSG_DATA_NONE:
1129 default:
1130 BUG();
1131 break;
1132 }
1133 cursor->total_resid -= bytes;
1134
1135 if (!cursor->resid && cursor->total_resid) {
1136 WARN_ON(!cursor->last_piece);
1137 BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
1138 cursor->data = list_entry_next(cursor->data, links);
1139 __ceph_msg_data_cursor_init(cursor);
1140 new_piece = true;
1141 }
1142 cursor->need_crc = new_piece;
1143
1144 return new_piece;
1145}
1146
1147static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
1148{
1149 BUG_ON(!msg);
1150 BUG_ON(!data_len);
1151
1152 /* Initialize data cursor */
1153
1154 ceph_msg_data_cursor_init(msg, (size_t)data_len);
744} 1155}
745 1156
746/* 1157/*
@@ -803,16 +1214,12 @@ static void prepare_write_message(struct ceph_connection *con)
803 m->hdr.seq = cpu_to_le64(++con->out_seq); 1214 m->hdr.seq = cpu_to_le64(++con->out_seq);
804 m->needs_out_seq = false; 1215 m->needs_out_seq = false;
805 } 1216 }
806#ifdef CONFIG_BLOCK 1217 WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
807 else
808 m->bio_iter = NULL;
809#endif
810 1218
811 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", 1219 dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
812 m, con->out_seq, le16_to_cpu(m->hdr.type), 1220 m, con->out_seq, le16_to_cpu(m->hdr.type),
813 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), 1221 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
814 le32_to_cpu(m->hdr.data_len), 1222 m->data_length);
815 m->nr_pages);
816 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); 1223 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
817 1224
818 /* tag + hdr + front + middle */ 1225 /* tag + hdr + front + middle */
@@ -843,11 +1250,13 @@ static void prepare_write_message(struct ceph_connection *con)
843 1250
844 /* is there a data payload? */ 1251 /* is there a data payload? */
845 con->out_msg->footer.data_crc = 0; 1252 con->out_msg->footer.data_crc = 0;
846 if (m->hdr.data_len) 1253 if (m->data_length) {
847 prepare_write_message_data(con); 1254 prepare_message_data(con->out_msg, m->data_length);
848 else 1255 con->out_more = 1; /* data + footer will follow */
1256 } else {
849 /* no, queue up footer too and be done */ 1257 /* no, queue up footer too and be done */
850 prepare_write_message_footer(con); 1258 prepare_write_message_footer(con);
1259 }
851 1260
852 con_flag_set(con, CON_FLAG_WRITE_PENDING); 1261 con_flag_set(con, CON_FLAG_WRITE_PENDING);
853} 1262}
@@ -874,6 +1283,24 @@ static void prepare_write_ack(struct ceph_connection *con)
874} 1283}
875 1284
876/* 1285/*
1286 * Prepare to share the seq during handshake
1287 */
1288static void prepare_write_seq(struct ceph_connection *con)
1289{
1290 dout("prepare_write_seq %p %llu -> %llu\n", con,
1291 con->in_seq_acked, con->in_seq);
1292 con->in_seq_acked = con->in_seq;
1293
1294 con_out_kvec_reset(con);
1295
1296 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
1297 con_out_kvec_add(con, sizeof (con->out_temp_ack),
1298 &con->out_temp_ack);
1299
1300 con_flag_set(con, CON_FLAG_WRITE_PENDING);
1301}
1302
1303/*
877 * Prepare to write keepalive byte. 1304 * Prepare to write keepalive byte.
878 */ 1305 */
879static void prepare_write_keepalive(struct ceph_connection *con) 1306static void prepare_write_keepalive(struct ceph_connection *con)
@@ -1022,35 +1449,19 @@ out:
1022 return ret; /* done! */ 1449 return ret; /* done! */
1023} 1450}
1024 1451
1025static void out_msg_pos_next(struct ceph_connection *con, struct page *page, 1452static u32 ceph_crc32c_page(u32 crc, struct page *page,
1026 size_t len, size_t sent, bool in_trail) 1453 unsigned int page_offset,
1454 unsigned int length)
1027{ 1455{
1028 struct ceph_msg *msg = con->out_msg; 1456 char *kaddr;
1029 1457
1030 BUG_ON(!msg); 1458 kaddr = kmap(page);
1031 BUG_ON(!sent); 1459 BUG_ON(kaddr == NULL);
1032 1460 crc = crc32c(crc, kaddr + page_offset, length);
1033 con->out_msg_pos.data_pos += sent; 1461 kunmap(page);
1034 con->out_msg_pos.page_pos += sent;
1035 if (sent < len)
1036 return;
1037 1462
1038 BUG_ON(sent != len); 1463 return crc;
1039 con->out_msg_pos.page_pos = 0;
1040 con->out_msg_pos.page++;
1041 con->out_msg_pos.did_page_crc = false;
1042 if (in_trail)
1043 list_move_tail(&page->lru,
1044 &msg->trail->head);
1045 else if (msg->pagelist)
1046 list_move_tail(&page->lru,
1047 &msg->pagelist->head);
1048#ifdef CONFIG_BLOCK
1049 else if (msg->bio)
1050 iter_bio_next(&msg->bio_iter, &msg->bio_seg);
1051#endif
1052} 1464}
1053
1054/* 1465/*
1055 * Write as much message data payload as we can. If we finish, queue 1466 * Write as much message data payload as we can. If we finish, queue
1056 * up the footer. 1467 * up the footer.
@@ -1058,21 +1469,17 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
1058 * 0 -> socket full, but more to do 1469 * 0 -> socket full, but more to do
1059 * <0 -> error 1470 * <0 -> error
1060 */ 1471 */
1061static int write_partial_msg_pages(struct ceph_connection *con) 1472static int write_partial_message_data(struct ceph_connection *con)
1062{ 1473{
1063 struct ceph_msg *msg = con->out_msg; 1474 struct ceph_msg *msg = con->out_msg;
1064 unsigned int data_len = le32_to_cpu(msg->hdr.data_len); 1475 struct ceph_msg_data_cursor *cursor = &msg->cursor;
1065 size_t len;
1066 bool do_datacrc = !con->msgr->nocrc; 1476 bool do_datacrc = !con->msgr->nocrc;
1067 int ret; 1477 u32 crc;
1068 int total_max_write;
1069 bool in_trail = false;
1070 const size_t trail_len = (msg->trail ? msg->trail->length : 0);
1071 const size_t trail_off = data_len - trail_len;
1072 1478
1073 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", 1479 dout("%s %p msg %p\n", __func__, con, msg);
1074 con, msg, con->out_msg_pos.page, msg->nr_pages, 1480
1075 con->out_msg_pos.page_pos); 1481 if (list_empty(&msg->data))
1482 return -EINVAL;
1076 1483
1077 /* 1484 /*
1078 * Iterate through each page that contains data to be 1485 * Iterate through each page that contains data to be
@@ -1082,72 +1489,41 @@ static int write_partial_msg_pages(struct ceph_connection *con)
1082 * need to map the page. If we have no pages, they have 1489 * need to map the page. If we have no pages, they have
1083 * been revoked, so use the zero page. 1490 * been revoked, so use the zero page.
1084 */ 1491 */
1085 while (data_len > con->out_msg_pos.data_pos) { 1492 crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
1086 struct page *page = NULL; 1493 while (cursor->resid) {
1087 int max_write = PAGE_SIZE; 1494 struct page *page;
1088 int bio_offset = 0; 1495 size_t page_offset;
1089 1496 size_t length;
1090 in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; 1497 bool last_piece;
1091 if (!in_trail) 1498 bool need_crc;
1092 total_max_write = trail_off - con->out_msg_pos.data_pos; 1499 int ret;
1093
1094 if (in_trail) {
1095 total_max_write = data_len - con->out_msg_pos.data_pos;
1096
1097 page = list_first_entry(&msg->trail->head,
1098 struct page, lru);
1099 } else if (msg->pages) {
1100 page = msg->pages[con->out_msg_pos.page];
1101 } else if (msg->pagelist) {
1102 page = list_first_entry(&msg->pagelist->head,
1103 struct page, lru);
1104#ifdef CONFIG_BLOCK
1105 } else if (msg->bio) {
1106 struct bio_vec *bv;
1107 1500
1108 bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); 1501 page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
1109 page = bv->bv_page; 1502 &last_piece);
1110 bio_offset = bv->bv_offset; 1503 ret = ceph_tcp_sendpage(con->sock, page, page_offset,
1111 max_write = bv->bv_len; 1504 length, last_piece);
1112#endif 1505 if (ret <= 0) {
1113 } else { 1506 if (do_datacrc)
1114 page = zero_page; 1507 msg->footer.data_crc = cpu_to_le32(crc);
1115 }
1116 len = min_t(int, max_write - con->out_msg_pos.page_pos,
1117 total_max_write);
1118
1119 if (do_datacrc && !con->out_msg_pos.did_page_crc) {
1120 void *base;
1121 u32 crc = le32_to_cpu(msg->footer.data_crc);
1122 char *kaddr;
1123
1124 kaddr = kmap(page);
1125 BUG_ON(kaddr == NULL);
1126 base = kaddr + con->out_msg_pos.page_pos + bio_offset;
1127 crc = crc32c(crc, base, len);
1128 kunmap(page);
1129 msg->footer.data_crc = cpu_to_le32(crc);
1130 con->out_msg_pos.did_page_crc = true;
1131 }
1132 ret = ceph_tcp_sendpage(con->sock, page,
1133 con->out_msg_pos.page_pos + bio_offset,
1134 len, 1);
1135 if (ret <= 0)
1136 goto out;
1137 1508
1138 out_msg_pos_next(con, page, len, (size_t) ret, in_trail); 1509 return ret;
1510 }
1511 if (do_datacrc && cursor->need_crc)
1512 crc = ceph_crc32c_page(crc, page, page_offset, length);
1513 need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret);
1139 } 1514 }
1140 1515
1141 dout("write_partial_msg_pages %p msg %p done\n", con, msg); 1516 dout("%s %p msg %p done\n", __func__, con, msg);
1142 1517
1143 /* prepare and queue up footer, too */ 1518 /* prepare and queue up footer, too */
1144 if (!do_datacrc) 1519 if (do_datacrc)
1520 msg->footer.data_crc = cpu_to_le32(crc);
1521 else
1145 msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; 1522 msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
1146 con_out_kvec_reset(con); 1523 con_out_kvec_reset(con);
1147 prepare_write_message_footer(con); 1524 prepare_write_message_footer(con);
1148 ret = 1; 1525
1149out: 1526 return 1; /* must return > 0 to indicate success */
1150 return ret;
1151} 1527}
1152 1528
1153/* 1529/*
@@ -1160,7 +1536,7 @@ static int write_partial_skip(struct ceph_connection *con)
1160 while (con->out_skip > 0) { 1536 while (con->out_skip > 0) {
1161 size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); 1537 size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
1162 1538
1163 ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1); 1539 ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);
1164 if (ret <= 0) 1540 if (ret <= 0)
1165 goto out; 1541 goto out;
1166 con->out_skip -= ret; 1542 con->out_skip -= ret;
@@ -1191,6 +1567,13 @@ static void prepare_read_ack(struct ceph_connection *con)
1191 con->in_base_pos = 0; 1567 con->in_base_pos = 0;
1192} 1568}
1193 1569
1570static void prepare_read_seq(struct ceph_connection *con)
1571{
1572 dout("prepare_read_seq %p\n", con);
1573 con->in_base_pos = 0;
1574 con->in_tag = CEPH_MSGR_TAG_SEQ;
1575}
1576
1194static void prepare_read_tag(struct ceph_connection *con) 1577static void prepare_read_tag(struct ceph_connection *con)
1195{ 1578{
1196 dout("prepare_read_tag %p\n", con); 1579 dout("prepare_read_tag %p\n", con);
@@ -1597,7 +1980,6 @@ static int process_connect(struct ceph_connection *con)
1597 con->error_msg = "connect authorization failure"; 1980 con->error_msg = "connect authorization failure";
1598 return -1; 1981 return -1;
1599 } 1982 }
1600 con->auth_retry = 1;
1601 con_out_kvec_reset(con); 1983 con_out_kvec_reset(con);
1602 ret = prepare_write_connect(con); 1984 ret = prepare_write_connect(con);
1603 if (ret < 0) 1985 if (ret < 0)
@@ -1668,6 +2050,7 @@ static int process_connect(struct ceph_connection *con)
1668 prepare_read_connect(con); 2050 prepare_read_connect(con);
1669 break; 2051 break;
1670 2052
2053 case CEPH_MSGR_TAG_SEQ:
1671 case CEPH_MSGR_TAG_READY: 2054 case CEPH_MSGR_TAG_READY:
1672 if (req_feat & ~server_feat) { 2055 if (req_feat & ~server_feat) {
1673 pr_err("%s%lld %s protocol feature mismatch," 2056 pr_err("%s%lld %s protocol feature mismatch,"
@@ -1682,7 +2065,7 @@ static int process_connect(struct ceph_connection *con)
1682 2065
1683 WARN_ON(con->state != CON_STATE_NEGOTIATING); 2066 WARN_ON(con->state != CON_STATE_NEGOTIATING);
1684 con->state = CON_STATE_OPEN; 2067 con->state = CON_STATE_OPEN;
1685 2068 con->auth_retry = 0; /* we authenticated; clear flag */
1686 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 2069 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1687 con->connect_seq++; 2070 con->connect_seq++;
1688 con->peer_features = server_feat; 2071 con->peer_features = server_feat;
@@ -1698,7 +2081,12 @@ static int process_connect(struct ceph_connection *con)
1698 2081
1699 con->delay = 0; /* reset backoff memory */ 2082 con->delay = 0; /* reset backoff memory */
1700 2083
1701 prepare_read_tag(con); 2084 if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) {
2085 prepare_write_seq(con);
2086 prepare_read_seq(con);
2087 } else {
2088 prepare_read_tag(con);
2089 }
1702 break; 2090 break;
1703 2091
1704 case CEPH_MSGR_TAG_WAIT: 2092 case CEPH_MSGR_TAG_WAIT:
@@ -1732,7 +2120,6 @@ static int read_partial_ack(struct ceph_connection *con)
1732 return read_partial(con, end, size, &con->in_temp_ack); 2120 return read_partial(con, end, size, &con->in_temp_ack);
1733} 2121}
1734 2122
1735
1736/* 2123/*
1737 * We can finally discard anything that's been acked. 2124 * We can finally discard anything that's been acked.
1738 */ 2125 */
@@ -1757,8 +2144,6 @@ static void process_ack(struct ceph_connection *con)
1757} 2144}
1758 2145
1759 2146
1760
1761
1762static int read_partial_message_section(struct ceph_connection *con, 2147static int read_partial_message_section(struct ceph_connection *con,
1763 struct kvec *section, 2148 struct kvec *section,
1764 unsigned int sec_len, u32 *crc) 2149 unsigned int sec_len, u32 *crc)
@@ -1782,77 +2167,49 @@ static int read_partial_message_section(struct ceph_connection *con,
1782 return 1; 2167 return 1;
1783} 2168}
1784 2169
1785static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); 2170static int read_partial_msg_data(struct ceph_connection *con)
1786
1787static int read_partial_message_pages(struct ceph_connection *con,
1788 struct page **pages,
1789 unsigned int data_len, bool do_datacrc)
1790{ 2171{
1791 void *p; 2172 struct ceph_msg *msg = con->in_msg;
2173 struct ceph_msg_data_cursor *cursor = &msg->cursor;
2174 const bool do_datacrc = !con->msgr->nocrc;
2175 struct page *page;
2176 size_t page_offset;
2177 size_t length;
2178 u32 crc = 0;
1792 int ret; 2179 int ret;
1793 int left;
1794 2180
1795 left = min((int)(data_len - con->in_msg_pos.data_pos), 2181 BUG_ON(!msg);
1796 (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); 2182 if (list_empty(&msg->data))
1797 /* (page) data */ 2183 return -EIO;
1798 BUG_ON(pages == NULL);
1799 p = kmap(pages[con->in_msg_pos.page]);
1800 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1801 left);
1802 if (ret > 0 && do_datacrc)
1803 con->in_data_crc =
1804 crc32c(con->in_data_crc,
1805 p + con->in_msg_pos.page_pos, ret);
1806 kunmap(pages[con->in_msg_pos.page]);
1807 if (ret <= 0)
1808 return ret;
1809 con->in_msg_pos.data_pos += ret;
1810 con->in_msg_pos.page_pos += ret;
1811 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1812 con->in_msg_pos.page_pos = 0;
1813 con->in_msg_pos.page++;
1814 }
1815
1816 return ret;
1817}
1818
1819#ifdef CONFIG_BLOCK
1820static int read_partial_message_bio(struct ceph_connection *con,
1821 struct bio **bio_iter, int *bio_seg,
1822 unsigned int data_len, bool do_datacrc)
1823{
1824 struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
1825 void *p;
1826 int ret, left;
1827 2184
1828 left = min((int)(data_len - con->in_msg_pos.data_pos), 2185 if (do_datacrc)
1829 (int)(bv->bv_len - con->in_msg_pos.page_pos)); 2186 crc = con->in_data_crc;
2187 while (cursor->resid) {
2188 page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
2189 NULL);
2190 ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
2191 if (ret <= 0) {
2192 if (do_datacrc)
2193 con->in_data_crc = crc;
1830 2194
1831 p = kmap(bv->bv_page) + bv->bv_offset; 2195 return ret;
2196 }
1832 2197
1833 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, 2198 if (do_datacrc)
1834 left); 2199 crc = ceph_crc32c_page(crc, page, page_offset, ret);
1835 if (ret > 0 && do_datacrc) 2200 (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret);
1836 con->in_data_crc =
1837 crc32c(con->in_data_crc,
1838 p + con->in_msg_pos.page_pos, ret);
1839 kunmap(bv->bv_page);
1840 if (ret <= 0)
1841 return ret;
1842 con->in_msg_pos.data_pos += ret;
1843 con->in_msg_pos.page_pos += ret;
1844 if (con->in_msg_pos.page_pos == bv->bv_len) {
1845 con->in_msg_pos.page_pos = 0;
1846 iter_bio_next(bio_iter, bio_seg);
1847 } 2201 }
2202 if (do_datacrc)
2203 con->in_data_crc = crc;
1848 2204
1849 return ret; 2205 return 1; /* must return > 0 to indicate success */
1850} 2206}
1851#endif
1852 2207
1853/* 2208/*
1854 * read (part of) a message. 2209 * read (part of) a message.
1855 */ 2210 */
2211static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
2212
1856static int read_partial_message(struct ceph_connection *con) 2213static int read_partial_message(struct ceph_connection *con)
1857{ 2214{
1858 struct ceph_msg *m = con->in_msg; 2215 struct ceph_msg *m = con->in_msg;
@@ -1885,7 +2242,7 @@ static int read_partial_message(struct ceph_connection *con)
1885 if (front_len > CEPH_MSG_MAX_FRONT_LEN) 2242 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1886 return -EIO; 2243 return -EIO;
1887 middle_len = le32_to_cpu(con->in_hdr.middle_len); 2244 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1888 if (middle_len > CEPH_MSG_MAX_DATA_LEN) 2245 if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
1889 return -EIO; 2246 return -EIO;
1890 data_len = le32_to_cpu(con->in_hdr.data_len); 2247 data_len = le32_to_cpu(con->in_hdr.data_len);
1891 if (data_len > CEPH_MSG_MAX_DATA_LEN) 2248 if (data_len > CEPH_MSG_MAX_DATA_LEN)
@@ -1914,14 +2271,22 @@ static int read_partial_message(struct ceph_connection *con)
1914 int skip = 0; 2271 int skip = 0;
1915 2272
1916 dout("got hdr type %d front %d data %d\n", con->in_hdr.type, 2273 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1917 con->in_hdr.front_len, con->in_hdr.data_len); 2274 front_len, data_len);
1918 ret = ceph_con_in_msg_alloc(con, &skip); 2275 ret = ceph_con_in_msg_alloc(con, &skip);
1919 if (ret < 0) 2276 if (ret < 0)
1920 return ret; 2277 return ret;
2278
2279 BUG_ON(!con->in_msg ^ skip);
2280 if (con->in_msg && data_len > con->in_msg->data_length) {
2281 pr_warning("%s skipping long message (%u > %zd)\n",
2282 __func__, data_len, con->in_msg->data_length);
2283 ceph_msg_put(con->in_msg);
2284 con->in_msg = NULL;
2285 skip = 1;
2286 }
1921 if (skip) { 2287 if (skip) {
1922 /* skip this message */ 2288 /* skip this message */
1923 dout("alloc_msg said skip message\n"); 2289 dout("alloc_msg said skip message\n");
1924 BUG_ON(con->in_msg);
1925 con->in_base_pos = -front_len - middle_len - data_len - 2290 con->in_base_pos = -front_len - middle_len - data_len -
1926 sizeof(m->footer); 2291 sizeof(m->footer);
1927 con->in_tag = CEPH_MSGR_TAG_READY; 2292 con->in_tag = CEPH_MSGR_TAG_READY;
@@ -1936,17 +2301,10 @@ static int read_partial_message(struct ceph_connection *con)
1936 if (m->middle) 2301 if (m->middle)
1937 m->middle->vec.iov_len = 0; 2302 m->middle->vec.iov_len = 0;
1938 2303
1939 con->in_msg_pos.page = 0; 2304 /* prepare for data payload, if any */
1940 if (m->pages)
1941 con->in_msg_pos.page_pos = m->page_alignment;
1942 else
1943 con->in_msg_pos.page_pos = 0;
1944 con->in_msg_pos.data_pos = 0;
1945 2305
1946#ifdef CONFIG_BLOCK 2306 if (data_len)
1947 if (m->bio) 2307 prepare_message_data(con->in_msg, data_len);
1948 init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
1949#endif
1950 } 2308 }
1951 2309
1952 /* front */ 2310 /* front */
@@ -1965,24 +2323,10 @@ static int read_partial_message(struct ceph_connection *con)
1965 } 2323 }
1966 2324
1967 /* (page) data */ 2325 /* (page) data */
1968 while (con->in_msg_pos.data_pos < data_len) { 2326 if (data_len) {
1969 if (m->pages) { 2327 ret = read_partial_msg_data(con);
1970 ret = read_partial_message_pages(con, m->pages, 2328 if (ret <= 0)
1971 data_len, do_datacrc); 2329 return ret;
1972 if (ret <= 0)
1973 return ret;
1974#ifdef CONFIG_BLOCK
1975 } else if (m->bio) {
1976 BUG_ON(!m->bio_iter);
1977 ret = read_partial_message_bio(con,
1978 &m->bio_iter, &m->bio_seg,
1979 data_len, do_datacrc);
1980 if (ret <= 0)
1981 return ret;
1982#endif
1983 } else {
1984 BUG_ON(1);
1985 }
1986 } 2330 }
1987 2331
1988 /* footer */ 2332 /* footer */
@@ -2108,13 +2452,13 @@ more_kvec:
2108 goto do_next; 2452 goto do_next;
2109 } 2453 }
2110 2454
2111 ret = write_partial_msg_pages(con); 2455 ret = write_partial_message_data(con);
2112 if (ret == 1) 2456 if (ret == 1)
2113 goto more_kvec; /* we need to send the footer, too! */ 2457 goto more_kvec; /* we need to send the footer, too! */
2114 if (ret == 0) 2458 if (ret == 0)
2115 goto out; 2459 goto out;
2116 if (ret < 0) { 2460 if (ret < 0) {
2117 dout("try_write write_partial_msg_pages err %d\n", 2461 dout("try_write write_partial_message_data err %d\n",
2118 ret); 2462 ret);
2119 goto out; 2463 goto out;
2120 } 2464 }
@@ -2266,7 +2610,12 @@ more:
2266 prepare_read_tag(con); 2610 prepare_read_tag(con);
2267 goto more; 2611 goto more;
2268 } 2612 }
2269 if (con->in_tag == CEPH_MSGR_TAG_ACK) { 2613 if (con->in_tag == CEPH_MSGR_TAG_ACK ||
2614 con->in_tag == CEPH_MSGR_TAG_SEQ) {
2615 /*
2616 * the final handshake seq exchange is semantically
2617 * equivalent to an ACK
2618 */
2270 ret = read_partial_ack(con); 2619 ret = read_partial_ack(con);
2271 if (ret <= 0) 2620 if (ret <= 0)
2272 goto out; 2621 goto out;
@@ -2672,6 +3021,88 @@ void ceph_con_keepalive(struct ceph_connection *con)
2672} 3021}
2673EXPORT_SYMBOL(ceph_con_keepalive); 3022EXPORT_SYMBOL(ceph_con_keepalive);
2674 3023
3024static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
3025{
3026 struct ceph_msg_data *data;
3027
3028 if (WARN_ON(!ceph_msg_data_type_valid(type)))
3029 return NULL;
3030
3031 data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS);
3032 if (data)
3033 data->type = type;
3034 INIT_LIST_HEAD(&data->links);
3035
3036 return data;
3037}
3038
3039static void ceph_msg_data_destroy(struct ceph_msg_data *data)
3040{
3041 if (!data)
3042 return;
3043
3044 WARN_ON(!list_empty(&data->links));
3045 if (data->type == CEPH_MSG_DATA_PAGELIST) {
3046 ceph_pagelist_release(data->pagelist);
3047 kfree(data->pagelist);
3048 }
3049 kmem_cache_free(ceph_msg_data_cache, data);
3050}
3051
3052void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
3053 size_t length, size_t alignment)
3054{
3055 struct ceph_msg_data *data;
3056
3057 BUG_ON(!pages);
3058 BUG_ON(!length);
3059
3060 data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES);
3061 BUG_ON(!data);
3062 data->pages = pages;
3063 data->length = length;
3064 data->alignment = alignment & ~PAGE_MASK;
3065
3066 list_add_tail(&data->links, &msg->data);
3067 msg->data_length += length;
3068}
3069EXPORT_SYMBOL(ceph_msg_data_add_pages);
3070
3071void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
3072 struct ceph_pagelist *pagelist)
3073{
3074 struct ceph_msg_data *data;
3075
3076 BUG_ON(!pagelist);
3077 BUG_ON(!pagelist->length);
3078
3079 data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST);
3080 BUG_ON(!data);
3081 data->pagelist = pagelist;
3082
3083 list_add_tail(&data->links, &msg->data);
3084 msg->data_length += pagelist->length;
3085}
3086EXPORT_SYMBOL(ceph_msg_data_add_pagelist);
3087
3088#ifdef CONFIG_BLOCK
3089void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
3090 size_t length)
3091{
3092 struct ceph_msg_data *data;
3093
3094 BUG_ON(!bio);
3095
3096 data = ceph_msg_data_create(CEPH_MSG_DATA_BIO);
3097 BUG_ON(!data);
3098 data->bio = bio;
3099 data->bio_length = length;
3100
3101 list_add_tail(&data->links, &msg->data);
3102 msg->data_length += length;
3103}
3104EXPORT_SYMBOL(ceph_msg_data_add_bio);
3105#endif /* CONFIG_BLOCK */
2675 3106
2676/* 3107/*
2677 * construct a new message with given type, size 3108 * construct a new message with given type, size
@@ -2682,49 +3113,20 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
2682{ 3113{
2683 struct ceph_msg *m; 3114 struct ceph_msg *m;
2684 3115
2685 m = kmalloc(sizeof(*m), flags); 3116 m = kmem_cache_zalloc(ceph_msg_cache, flags);
2686 if (m == NULL) 3117 if (m == NULL)
2687 goto out; 3118 goto out;
2688 kref_init(&m->kref);
2689 3119
2690 m->con = NULL;
2691 INIT_LIST_HEAD(&m->list_head);
2692
2693 m->hdr.tid = 0;
2694 m->hdr.type = cpu_to_le16(type); 3120 m->hdr.type = cpu_to_le16(type);
2695 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); 3121 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2696 m->hdr.version = 0;
2697 m->hdr.front_len = cpu_to_le32(front_len); 3122 m->hdr.front_len = cpu_to_le32(front_len);
2698 m->hdr.middle_len = 0;
2699 m->hdr.data_len = 0;
2700 m->hdr.data_off = 0;
2701 m->hdr.reserved = 0;
2702 m->footer.front_crc = 0;
2703 m->footer.middle_crc = 0;
2704 m->footer.data_crc = 0;
2705 m->footer.flags = 0;
2706 m->front_max = front_len;
2707 m->front_is_vmalloc = false;
2708 m->more_to_follow = false;
2709 m->ack_stamp = 0;
2710 m->pool = NULL;
2711
2712 /* middle */
2713 m->middle = NULL;
2714 3123
2715 /* data */ 3124 INIT_LIST_HEAD(&m->list_head);
2716 m->nr_pages = 0; 3125 kref_init(&m->kref);
2717 m->page_alignment = 0; 3126 INIT_LIST_HEAD(&m->data);
2718 m->pages = NULL;
2719 m->pagelist = NULL;
2720#ifdef CONFIG_BLOCK
2721 m->bio = NULL;
2722 m->bio_iter = NULL;
2723 m->bio_seg = 0;
2724#endif /* CONFIG_BLOCK */
2725 m->trail = NULL;
2726 3127
2727 /* front */ 3128 /* front */
3129 m->front_max = front_len;
2728 if (front_len) { 3130 if (front_len) {
2729 if (front_len > PAGE_CACHE_SIZE) { 3131 if (front_len > PAGE_CACHE_SIZE) {
2730 m->front.iov_base = __vmalloc(front_len, flags, 3132 m->front.iov_base = __vmalloc(front_len, flags,
@@ -2802,49 +3204,37 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2802static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) 3204static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
2803{ 3205{
2804 struct ceph_msg_header *hdr = &con->in_hdr; 3206 struct ceph_msg_header *hdr = &con->in_hdr;
2805 int type = le16_to_cpu(hdr->type);
2806 int front_len = le32_to_cpu(hdr->front_len);
2807 int middle_len = le32_to_cpu(hdr->middle_len); 3207 int middle_len = le32_to_cpu(hdr->middle_len);
3208 struct ceph_msg *msg;
2808 int ret = 0; 3209 int ret = 0;
2809 3210
2810 BUG_ON(con->in_msg != NULL); 3211 BUG_ON(con->in_msg != NULL);
3212 BUG_ON(!con->ops->alloc_msg);
2811 3213
2812 if (con->ops->alloc_msg) { 3214 mutex_unlock(&con->mutex);
2813 struct ceph_msg *msg; 3215 msg = con->ops->alloc_msg(con, hdr, skip);
2814 3216 mutex_lock(&con->mutex);
2815 mutex_unlock(&con->mutex); 3217 if (con->state != CON_STATE_OPEN) {
2816 msg = con->ops->alloc_msg(con, hdr, skip); 3218 if (msg)
2817 mutex_lock(&con->mutex); 3219 ceph_msg_put(msg);
2818 if (con->state != CON_STATE_OPEN) { 3220 return -EAGAIN;
2819 if (msg)
2820 ceph_msg_put(msg);
2821 return -EAGAIN;
2822 }
2823 con->in_msg = msg;
2824 if (con->in_msg) {
2825 con->in_msg->con = con->ops->get(con);
2826 BUG_ON(con->in_msg->con == NULL);
2827 }
2828 if (*skip) {
2829 con->in_msg = NULL;
2830 return 0;
2831 }
2832 if (!con->in_msg) {
2833 con->error_msg =
2834 "error allocating memory for incoming message";
2835 return -ENOMEM;
2836 }
2837 } 3221 }
2838 if (!con->in_msg) { 3222 if (msg) {
2839 con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false); 3223 BUG_ON(*skip);
2840 if (!con->in_msg) { 3224 con->in_msg = msg;
2841 pr_err("unable to allocate msg type %d len %d\n",
2842 type, front_len);
2843 return -ENOMEM;
2844 }
2845 con->in_msg->con = con->ops->get(con); 3225 con->in_msg->con = con->ops->get(con);
2846 BUG_ON(con->in_msg->con == NULL); 3226 BUG_ON(con->in_msg->con == NULL);
2847 con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); 3227 } else {
3228 /*
3229 * Null message pointer means either we should skip
3230 * this message or we couldn't allocate memory. The
3231 * former is not an error.
3232 */
3233 if (*skip)
3234 return 0;
3235 con->error_msg = "error allocating memory for incoming message";
3236
3237 return -ENOMEM;
2848 } 3238 }
2849 memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 3239 memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2850 3240
@@ -2870,7 +3260,7 @@ void ceph_msg_kfree(struct ceph_msg *m)
2870 vfree(m->front.iov_base); 3260 vfree(m->front.iov_base);
2871 else 3261 else
2872 kfree(m->front.iov_base); 3262 kfree(m->front.iov_base);
2873 kfree(m); 3263 kmem_cache_free(ceph_msg_cache, m);
2874} 3264}
2875 3265
2876/* 3266/*
@@ -2879,6 +3269,9 @@ void ceph_msg_kfree(struct ceph_msg *m)
2879void ceph_msg_last_put(struct kref *kref) 3269void ceph_msg_last_put(struct kref *kref)
2880{ 3270{
2881 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); 3271 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
3272 LIST_HEAD(data);
3273 struct list_head *links;
3274 struct list_head *next;
2882 3275
2883 dout("ceph_msg_put last one on %p\n", m); 3276 dout("ceph_msg_put last one on %p\n", m);
2884 WARN_ON(!list_empty(&m->list_head)); 3277 WARN_ON(!list_empty(&m->list_head));
@@ -2888,16 +3281,16 @@ void ceph_msg_last_put(struct kref *kref)
2888 ceph_buffer_put(m->middle); 3281 ceph_buffer_put(m->middle);
2889 m->middle = NULL; 3282 m->middle = NULL;
2890 } 3283 }
2891 m->nr_pages = 0;
2892 m->pages = NULL;
2893 3284
2894 if (m->pagelist) { 3285 list_splice_init(&m->data, &data);
2895 ceph_pagelist_release(m->pagelist); 3286 list_for_each_safe(links, next, &data) {
2896 kfree(m->pagelist); 3287 struct ceph_msg_data *data;
2897 m->pagelist = NULL;
2898 }
2899 3288
2900 m->trail = NULL; 3289 data = list_entry(links, struct ceph_msg_data, links);
3290 list_del_init(links);
3291 ceph_msg_data_destroy(data);
3292 }
3293 m->data_length = 0;
2901 3294
2902 if (m->pool) 3295 if (m->pool)
2903 ceph_msgpool_put(m->pool, m); 3296 ceph_msgpool_put(m->pool, m);
@@ -2908,8 +3301,8 @@ EXPORT_SYMBOL(ceph_msg_last_put);
2908 3301
2909void ceph_msg_dump(struct ceph_msg *msg) 3302void ceph_msg_dump(struct ceph_msg *msg)
2910{ 3303{
2911 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, 3304 pr_debug("msg_dump %p (front_max %d length %zd)\n", msg,
2912 msg->front_max, msg->nr_pages); 3305 msg->front_max, msg->data_length);
2913 print_hex_dump(KERN_DEBUG, "header: ", 3306 print_hex_dump(KERN_DEBUG, "header: ",
2914 DUMP_PREFIX_OFFSET, 16, 1, 3307 DUMP_PREFIX_OFFSET, 16, 1,
2915 &msg->hdr, sizeof(msg->hdr), true); 3308 &msg->hdr, sizeof(msg->hdr), true);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index aef5b1062bee..1fe25cd29d0e 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -737,7 +737,7 @@ static void delayed_work(struct work_struct *work)
737 737
738 __validate_auth(monc); 738 __validate_auth(monc);
739 739
740 if (monc->auth->ops->is_authenticated(monc->auth)) 740 if (ceph_auth_is_authenticated(monc->auth))
741 __send_subscribe(monc); 741 __send_subscribe(monc);
742 } 742 }
743 __schedule_delayed(monc); 743 __schedule_delayed(monc);
@@ -892,8 +892,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
892 892
893 mutex_lock(&monc->mutex); 893 mutex_lock(&monc->mutex);
894 had_debugfs_info = have_debugfs_info(monc); 894 had_debugfs_info = have_debugfs_info(monc);
895 if (monc->auth->ops) 895 was_auth = ceph_auth_is_authenticated(monc->auth);
896 was_auth = monc->auth->ops->is_authenticated(monc->auth);
897 monc->pending_auth = 0; 896 monc->pending_auth = 0;
898 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 897 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
899 msg->front.iov_len, 898 msg->front.iov_len,
@@ -904,7 +903,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
904 wake_up_all(&monc->client->auth_wq); 903 wake_up_all(&monc->client->auth_wq);
905 } else if (ret > 0) { 904 } else if (ret > 0) {
906 __send_prepared_auth_request(monc, ret); 905 __send_prepared_auth_request(monc, ret);
907 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { 906 } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
908 dout("authenticated, starting session\n"); 907 dout("authenticated, starting session\n");
909 908
910 monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 909 monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d730dd4d8eb2..a3395fdfbd4f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1,3 +1,4 @@
1
1#include <linux/ceph/ceph_debug.h> 2#include <linux/ceph/ceph_debug.h>
2 3
3#include <linux/module.h> 4#include <linux/module.h>
@@ -21,6 +22,8 @@
21#define OSD_OP_FRONT_LEN 4096 22#define OSD_OP_FRONT_LEN 4096
22#define OSD_OPREPLY_FRONT_LEN 512 23#define OSD_OPREPLY_FRONT_LEN 512
23 24
25static struct kmem_cache *ceph_osd_request_cache;
26
24static const struct ceph_connection_operations osd_con_ops; 27static const struct ceph_connection_operations osd_con_ops;
25 28
26static void __send_queued(struct ceph_osd_client *osdc); 29static void __send_queued(struct ceph_osd_client *osdc);
@@ -32,12 +35,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
32static void __send_request(struct ceph_osd_client *osdc, 35static void __send_request(struct ceph_osd_client *osdc,
33 struct ceph_osd_request *req); 36 struct ceph_osd_request *req);
34 37
35static int op_has_extent(int op)
36{
37 return (op == CEPH_OSD_OP_READ ||
38 op == CEPH_OSD_OP_WRITE);
39}
40
41/* 38/*
42 * Implement client access to distributed object storage cluster. 39 * Implement client access to distributed object storage cluster.
43 * 40 *
@@ -63,53 +60,238 @@ static int op_has_extent(int op)
63 * 60 *
64 * fill osd op in request message. 61 * fill osd op in request message.
65 */ 62 */
66static int calc_layout(struct ceph_vino vino, 63static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
67 struct ceph_file_layout *layout, 64 u64 *objnum, u64 *objoff, u64 *objlen)
68 u64 off, u64 *plen,
69 struct ceph_osd_request *req,
70 struct ceph_osd_req_op *op)
71{ 65{
72 u64 orig_len = *plen; 66 u64 orig_len = *plen;
73 u64 bno = 0;
74 u64 objoff = 0;
75 u64 objlen = 0;
76 int r; 67 int r;
77 68
78 /* object extent? */ 69 /* object extent? */
79 r = ceph_calc_file_object_mapping(layout, off, orig_len, &bno, 70 r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
80 &objoff, &objlen); 71 objoff, objlen);
81 if (r < 0) 72 if (r < 0)
82 return r; 73 return r;
83 if (objlen < orig_len) { 74 if (*objlen < orig_len) {
84 *plen = objlen; 75 *plen = *objlen;
85 dout(" skipping last %llu, final file extent %llu~%llu\n", 76 dout(" skipping last %llu, final file extent %llu~%llu\n",
86 orig_len - *plen, off, *plen); 77 orig_len - *plen, off, *plen);
87 } 78 }
88 79
89 if (op_has_extent(op->op)) { 80 dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);
90 u32 osize = le32_to_cpu(layout->fl_object_size); 81
91 op->extent.offset = objoff; 82 return 0;
92 op->extent.length = objlen; 83}
93 if (op->extent.truncate_size <= off - objoff) { 84
94 op->extent.truncate_size = 0; 85static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
95 } else { 86{
96 op->extent.truncate_size -= off - objoff; 87 memset(osd_data, 0, sizeof (*osd_data));
97 if (op->extent.truncate_size > osize) 88 osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
98 op->extent.truncate_size = osize; 89}
99 } 90
91static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
92 struct page **pages, u64 length, u32 alignment,
93 bool pages_from_pool, bool own_pages)
94{
95 osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
96 osd_data->pages = pages;
97 osd_data->length = length;
98 osd_data->alignment = alignment;
99 osd_data->pages_from_pool = pages_from_pool;
100 osd_data->own_pages = own_pages;
101}
102
103static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
104 struct ceph_pagelist *pagelist)
105{
106 osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
107 osd_data->pagelist = pagelist;
108}
109
110#ifdef CONFIG_BLOCK
111static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
112 struct bio *bio, size_t bio_length)
113{
114 osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
115 osd_data->bio = bio;
116 osd_data->bio_length = bio_length;
117}
118#endif /* CONFIG_BLOCK */
119
120#define osd_req_op_data(oreq, whch, typ, fld) \
121 ({ \
122 BUG_ON(whch >= (oreq)->r_num_ops); \
123 &(oreq)->r_ops[whch].typ.fld; \
124 })
125
126static struct ceph_osd_data *
127osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
128{
129 BUG_ON(which >= osd_req->r_num_ops);
130
131 return &osd_req->r_ops[which].raw_data_in;
132}
133
134struct ceph_osd_data *
135osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
136 unsigned int which)
137{
138 return osd_req_op_data(osd_req, which, extent, osd_data);
139}
140EXPORT_SYMBOL(osd_req_op_extent_osd_data);
141
142struct ceph_osd_data *
143osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
144 unsigned int which)
145{
146 return osd_req_op_data(osd_req, which, cls, response_data);
147}
148EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */
149
150void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
151 unsigned int which, struct page **pages,
152 u64 length, u32 alignment,
153 bool pages_from_pool, bool own_pages)
154{
155 struct ceph_osd_data *osd_data;
156
157 osd_data = osd_req_op_raw_data_in(osd_req, which);
158 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
159 pages_from_pool, own_pages);
160}
161EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);
162
163void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
164 unsigned int which, struct page **pages,
165 u64 length, u32 alignment,
166 bool pages_from_pool, bool own_pages)
167{
168 struct ceph_osd_data *osd_data;
169
170 osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
171 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
172 pages_from_pool, own_pages);
173}
174EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
175
176void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
177 unsigned int which, struct ceph_pagelist *pagelist)
178{
179 struct ceph_osd_data *osd_data;
180
181 osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
182 ceph_osd_data_pagelist_init(osd_data, pagelist);
183}
184EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
185
186#ifdef CONFIG_BLOCK
187void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
188 unsigned int which, struct bio *bio, size_t bio_length)
189{
190 struct ceph_osd_data *osd_data;
191
192 osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
193 ceph_osd_data_bio_init(osd_data, bio, bio_length);
194}
195EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
196#endif /* CONFIG_BLOCK */
197
198static void osd_req_op_cls_request_info_pagelist(
199 struct ceph_osd_request *osd_req,
200 unsigned int which, struct ceph_pagelist *pagelist)
201{
202 struct ceph_osd_data *osd_data;
203
204 osd_data = osd_req_op_data(osd_req, which, cls, request_info);
205 ceph_osd_data_pagelist_init(osd_data, pagelist);
206}
207
208void osd_req_op_cls_request_data_pagelist(
209 struct ceph_osd_request *osd_req,
210 unsigned int which, struct ceph_pagelist *pagelist)
211{
212 struct ceph_osd_data *osd_data;
213
214 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
215 ceph_osd_data_pagelist_init(osd_data, pagelist);
216}
217EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
218
219void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
220 unsigned int which, struct page **pages, u64 length,
221 u32 alignment, bool pages_from_pool, bool own_pages)
222{
223 struct ceph_osd_data *osd_data;
224
225 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
226 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
227 pages_from_pool, own_pages);
228}
229EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
230
231void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
232 unsigned int which, struct page **pages, u64 length,
233 u32 alignment, bool pages_from_pool, bool own_pages)
234{
235 struct ceph_osd_data *osd_data;
236
237 osd_data = osd_req_op_data(osd_req, which, cls, response_data);
238 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
239 pages_from_pool, own_pages);
240}
241EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
242
243static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
244{
245 switch (osd_data->type) {
246 case CEPH_OSD_DATA_TYPE_NONE:
247 return 0;
248 case CEPH_OSD_DATA_TYPE_PAGES:
249 return osd_data->length;
250 case CEPH_OSD_DATA_TYPE_PAGELIST:
251 return (u64)osd_data->pagelist->length;
252#ifdef CONFIG_BLOCK
253 case CEPH_OSD_DATA_TYPE_BIO:
254 return (u64)osd_data->bio_length;
255#endif /* CONFIG_BLOCK */
256 default:
257 WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
258 return 0;
100 } 259 }
101 req->r_num_pages = calc_pages_for(off, *plen); 260}
102 req->r_page_alignment = off & ~PAGE_MASK;
103 if (op->op == CEPH_OSD_OP_WRITE)
104 op->payload_len = *plen;
105 261
106 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", 262static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
107 bno, objoff, objlen, req->r_num_pages); 263{
264 if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
265 int num_pages;
108 266
109 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); 267 num_pages = calc_pages_for((u64)osd_data->alignment,
110 req->r_oid_len = strlen(req->r_oid); 268 (u64)osd_data->length);
269 ceph_release_page_vector(osd_data->pages, num_pages);
270 }
271 ceph_osd_data_init(osd_data);
272}
273
274static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
275 unsigned int which)
276{
277 struct ceph_osd_req_op *op;
278
279 BUG_ON(which >= osd_req->r_num_ops);
280 op = &osd_req->r_ops[which];
111 281
112 return r; 282 switch (op->op) {
283 case CEPH_OSD_OP_READ:
284 case CEPH_OSD_OP_WRITE:
285 ceph_osd_data_release(&op->extent.osd_data);
286 break;
287 case CEPH_OSD_OP_CALL:
288 ceph_osd_data_release(&op->cls.request_info);
289 ceph_osd_data_release(&op->cls.request_data);
290 ceph_osd_data_release(&op->cls.response_data);
291 break;
292 default:
293 break;
294 }
113} 295}
114 296
115/* 297/*
@@ -117,30 +299,26 @@ static int calc_layout(struct ceph_vino vino,
117 */ 299 */
118void ceph_osdc_release_request(struct kref *kref) 300void ceph_osdc_release_request(struct kref *kref)
119{ 301{
120 struct ceph_osd_request *req = container_of(kref, 302 struct ceph_osd_request *req;
121 struct ceph_osd_request, 303 unsigned int which;
122 r_kref);
123 304
305 req = container_of(kref, struct ceph_osd_request, r_kref);
124 if (req->r_request) 306 if (req->r_request)
125 ceph_msg_put(req->r_request); 307 ceph_msg_put(req->r_request);
126 if (req->r_con_filling_msg) { 308 if (req->r_reply) {
127 dout("%s revoking msg %p from con %p\n", __func__,
128 req->r_reply, req->r_con_filling_msg);
129 ceph_msg_revoke_incoming(req->r_reply); 309 ceph_msg_revoke_incoming(req->r_reply);
130 req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
131 req->r_con_filling_msg = NULL;
132 }
133 if (req->r_reply)
134 ceph_msg_put(req->r_reply); 310 ceph_msg_put(req->r_reply);
135 if (req->r_own_pages) 311 }
136 ceph_release_page_vector(req->r_pages, 312
137 req->r_num_pages); 313 for (which = 0; which < req->r_num_ops; which++)
314 osd_req_op_data_release(req, which);
315
138 ceph_put_snap_context(req->r_snapc); 316 ceph_put_snap_context(req->r_snapc);
139 ceph_pagelist_release(&req->r_trail);
140 if (req->r_mempool) 317 if (req->r_mempool)
141 mempool_free(req, req->r_osdc->req_mempool); 318 mempool_free(req, req->r_osdc->req_mempool);
142 else 319 else
143 kfree(req); 320 kmem_cache_free(ceph_osd_request_cache, req);
321
144} 322}
145EXPORT_SYMBOL(ceph_osdc_release_request); 323EXPORT_SYMBOL(ceph_osdc_release_request);
146 324
@@ -154,6 +332,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
154 struct ceph_msg *msg; 332 struct ceph_msg *msg;
155 size_t msg_size; 333 size_t msg_size;
156 334
335 BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX);
336 BUG_ON(num_ops > CEPH_OSD_MAX_OP);
337
157 msg_size = 4 + 4 + 8 + 8 + 4+8; 338 msg_size = 4 + 4 + 8 + 8 + 4+8;
158 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 339 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
159 msg_size += 1 + 8 + 4 + 4; /* pg_t */ 340 msg_size += 1 + 8 + 4 + 4; /* pg_t */
@@ -168,13 +349,14 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
168 req = mempool_alloc(osdc->req_mempool, gfp_flags); 349 req = mempool_alloc(osdc->req_mempool, gfp_flags);
169 memset(req, 0, sizeof(*req)); 350 memset(req, 0, sizeof(*req));
170 } else { 351 } else {
171 req = kzalloc(sizeof(*req), gfp_flags); 352 req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags);
172 } 353 }
173 if (req == NULL) 354 if (req == NULL)
174 return NULL; 355 return NULL;
175 356
176 req->r_osdc = osdc; 357 req->r_osdc = osdc;
177 req->r_mempool = use_mempool; 358 req->r_mempool = use_mempool;
359 req->r_num_ops = num_ops;
178 360
179 kref_init(&req->r_kref); 361 kref_init(&req->r_kref);
180 init_completion(&req->r_completion); 362 init_completion(&req->r_completion);
@@ -198,8 +380,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
198 } 380 }
199 req->r_reply = msg; 381 req->r_reply = msg;
200 382
201 ceph_pagelist_init(&req->r_trail);
202
203 /* create request message; allow space for oid */ 383 /* create request message; allow space for oid */
204 if (use_mempool) 384 if (use_mempool)
205 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 385 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
@@ -218,60 +398,24 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
218} 398}
219EXPORT_SYMBOL(ceph_osdc_alloc_request); 399EXPORT_SYMBOL(ceph_osdc_alloc_request);
220 400
221static void osd_req_encode_op(struct ceph_osd_request *req, 401static bool osd_req_opcode_valid(u16 opcode)
222 struct ceph_osd_op *dst,
223 struct ceph_osd_req_op *src)
224{ 402{
225 dst->op = cpu_to_le16(src->op); 403 switch (opcode) {
226
227 switch (src->op) {
228 case CEPH_OSD_OP_STAT:
229 break;
230 case CEPH_OSD_OP_READ: 404 case CEPH_OSD_OP_READ:
231 case CEPH_OSD_OP_WRITE: 405 case CEPH_OSD_OP_STAT:
232 dst->extent.offset =
233 cpu_to_le64(src->extent.offset);
234 dst->extent.length =
235 cpu_to_le64(src->extent.length);
236 dst->extent.truncate_size =
237 cpu_to_le64(src->extent.truncate_size);
238 dst->extent.truncate_seq =
239 cpu_to_le32(src->extent.truncate_seq);
240 break;
241 case CEPH_OSD_OP_CALL:
242 dst->cls.class_len = src->cls.class_len;
243 dst->cls.method_len = src->cls.method_len;
244 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
245
246 ceph_pagelist_append(&req->r_trail, src->cls.class_name,
247 src->cls.class_len);
248 ceph_pagelist_append(&req->r_trail, src->cls.method_name,
249 src->cls.method_len);
250 ceph_pagelist_append(&req->r_trail, src->cls.indata,
251 src->cls.indata_len);
252 break;
253 case CEPH_OSD_OP_STARTSYNC:
254 break;
255 case CEPH_OSD_OP_NOTIFY_ACK:
256 case CEPH_OSD_OP_WATCH:
257 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
258 dst->watch.ver = cpu_to_le64(src->watch.ver);
259 dst->watch.flag = src->watch.flag;
260 break;
261 default:
262 pr_err("unrecognized osd opcode %d\n", dst->op);
263 WARN_ON(1);
264 break;
265 case CEPH_OSD_OP_MAPEXT: 406 case CEPH_OSD_OP_MAPEXT:
266 case CEPH_OSD_OP_MASKTRUNC: 407 case CEPH_OSD_OP_MASKTRUNC:
267 case CEPH_OSD_OP_SPARSE_READ: 408 case CEPH_OSD_OP_SPARSE_READ:
268 case CEPH_OSD_OP_NOTIFY: 409 case CEPH_OSD_OP_NOTIFY:
410 case CEPH_OSD_OP_NOTIFY_ACK:
269 case CEPH_OSD_OP_ASSERT_VER: 411 case CEPH_OSD_OP_ASSERT_VER:
412 case CEPH_OSD_OP_WRITE:
270 case CEPH_OSD_OP_WRITEFULL: 413 case CEPH_OSD_OP_WRITEFULL:
271 case CEPH_OSD_OP_TRUNCATE: 414 case CEPH_OSD_OP_TRUNCATE:
272 case CEPH_OSD_OP_ZERO: 415 case CEPH_OSD_OP_ZERO:
273 case CEPH_OSD_OP_DELETE: 416 case CEPH_OSD_OP_DELETE:
274 case CEPH_OSD_OP_APPEND: 417 case CEPH_OSD_OP_APPEND:
418 case CEPH_OSD_OP_STARTSYNC:
275 case CEPH_OSD_OP_SETTRUNC: 419 case CEPH_OSD_OP_SETTRUNC:
276 case CEPH_OSD_OP_TRIMTRUNC: 420 case CEPH_OSD_OP_TRIMTRUNC:
277 case CEPH_OSD_OP_TMAPUP: 421 case CEPH_OSD_OP_TMAPUP:
@@ -279,11 +423,11 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
279 case CEPH_OSD_OP_TMAPGET: 423 case CEPH_OSD_OP_TMAPGET:
280 case CEPH_OSD_OP_CREATE: 424 case CEPH_OSD_OP_CREATE:
281 case CEPH_OSD_OP_ROLLBACK: 425 case CEPH_OSD_OP_ROLLBACK:
426 case CEPH_OSD_OP_WATCH:
282 case CEPH_OSD_OP_OMAPGETKEYS: 427 case CEPH_OSD_OP_OMAPGETKEYS:
283 case CEPH_OSD_OP_OMAPGETVALS: 428 case CEPH_OSD_OP_OMAPGETVALS:
284 case CEPH_OSD_OP_OMAPGETHEADER: 429 case CEPH_OSD_OP_OMAPGETHEADER:
285 case CEPH_OSD_OP_OMAPGETVALSBYKEYS: 430 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
286 case CEPH_OSD_OP_MODE_RD:
287 case CEPH_OSD_OP_OMAPSETVALS: 431 case CEPH_OSD_OP_OMAPSETVALS:
288 case CEPH_OSD_OP_OMAPSETHEADER: 432 case CEPH_OSD_OP_OMAPSETHEADER:
289 case CEPH_OSD_OP_OMAPCLEAR: 433 case CEPH_OSD_OP_OMAPCLEAR:
@@ -314,113 +458,233 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
314 case CEPH_OSD_OP_RDUNLOCK: 458 case CEPH_OSD_OP_RDUNLOCK:
315 case CEPH_OSD_OP_UPLOCK: 459 case CEPH_OSD_OP_UPLOCK:
316 case CEPH_OSD_OP_DNLOCK: 460 case CEPH_OSD_OP_DNLOCK:
461 case CEPH_OSD_OP_CALL:
317 case CEPH_OSD_OP_PGLS: 462 case CEPH_OSD_OP_PGLS:
318 case CEPH_OSD_OP_PGLS_FILTER: 463 case CEPH_OSD_OP_PGLS_FILTER:
319 pr_err("unsupported osd opcode %s\n", 464 return true;
320 ceph_osd_op_name(dst->op)); 465 default:
321 WARN_ON(1); 466 return false;
322 break;
323 } 467 }
324 dst->payload_len = cpu_to_le32(src->payload_len);
325} 468}
326 469
327/* 470/*
328 * build new request AND message 471 * This is an osd op init function for opcodes that have no data or
329 * 472 * other information associated with them. It also serves as a
473 * common init routine for all the other init functions, below.
330 */ 474 */
331void ceph_osdc_build_request(struct ceph_osd_request *req, 475static struct ceph_osd_req_op *
332 u64 off, u64 len, unsigned int num_ops, 476_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
333 struct ceph_osd_req_op *src_ops, 477 u16 opcode)
334 struct ceph_snap_context *snapc, u64 snap_id,
335 struct timespec *mtime)
336{ 478{
337 struct ceph_msg *msg = req->r_request; 479 struct ceph_osd_req_op *op;
338 struct ceph_osd_req_op *src_op;
339 void *p;
340 size_t msg_size;
341 int flags = req->r_flags;
342 u64 data_len;
343 int i;
344 480
345 req->r_num_ops = num_ops; 481 BUG_ON(which >= osd_req->r_num_ops);
346 req->r_snapid = snap_id; 482 BUG_ON(!osd_req_opcode_valid(opcode));
347 req->r_snapc = ceph_get_snap_context(snapc);
348 483
349 /* encode request */ 484 op = &osd_req->r_ops[which];
350 msg->hdr.version = cpu_to_le16(4); 485 memset(op, 0, sizeof (*op));
486 op->op = opcode;
351 487
352 p = msg->front.iov_base; 488 return op;
353 ceph_encode_32(&p, 1); /* client_inc is always 1 */ 489}
354 req->r_request_osdmap_epoch = p;
355 p += 4;
356 req->r_request_flags = p;
357 p += 4;
358 if (req->r_flags & CEPH_OSD_FLAG_WRITE)
359 ceph_encode_timespec(p, mtime);
360 p += sizeof(struct ceph_timespec);
361 req->r_request_reassert_version = p;
362 p += sizeof(struct ceph_eversion); /* will get filled in */
363 490
364 /* oloc */ 491void osd_req_op_init(struct ceph_osd_request *osd_req,
365 ceph_encode_8(&p, 4); 492 unsigned int which, u16 opcode)
366 ceph_encode_8(&p, 4); 493{
367 ceph_encode_32(&p, 8 + 4 + 4); 494 (void)_osd_req_op_init(osd_req, which, opcode);
368 req->r_request_pool = p; 495}
369 p += 8; 496EXPORT_SYMBOL(osd_req_op_init);
370 ceph_encode_32(&p, -1); /* preferred */
371 ceph_encode_32(&p, 0); /* key len */
372 497
373 ceph_encode_8(&p, 1); 498void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
374 req->r_request_pgid = p; 499 unsigned int which, u16 opcode,
375 p += 8 + 4; 500 u64 offset, u64 length,
376 ceph_encode_32(&p, -1); /* preferred */ 501 u64 truncate_size, u32 truncate_seq)
502{
503 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
504 size_t payload_len = 0;
377 505
378 /* oid */ 506 BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE);
379 ceph_encode_32(&p, req->r_oid_len);
380 memcpy(p, req->r_oid, req->r_oid_len);
381 dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
382 p += req->r_oid_len;
383 507
384 /* ops */ 508 op->extent.offset = offset;
385 ceph_encode_16(&p, num_ops); 509 op->extent.length = length;
386 src_op = src_ops; 510 op->extent.truncate_size = truncate_size;
387 req->r_request_ops = p; 511 op->extent.truncate_seq = truncate_seq;
388 for (i = 0; i < num_ops; i++, src_op++) { 512 if (opcode == CEPH_OSD_OP_WRITE)
389 osd_req_encode_op(req, p, src_op); 513 payload_len += length;
390 p += sizeof(struct ceph_osd_op);
391 }
392 514
393 /* snaps */ 515 op->payload_len = payload_len;
394 ceph_encode_64(&p, req->r_snapid); 516}
395 ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); 517EXPORT_SYMBOL(osd_req_op_extent_init);
396 ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); 518
397 if (req->r_snapc) { 519void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
398 for (i = 0; i < snapc->num_snaps; i++) { 520 unsigned int which, u64 length)
399 ceph_encode_64(&p, req->r_snapc->snaps[i]); 521{
400 } 522 struct ceph_osd_req_op *op;
523 u64 previous;
524
525 BUG_ON(which >= osd_req->r_num_ops);
526 op = &osd_req->r_ops[which];
527 previous = op->extent.length;
528
529 if (length == previous)
530 return; /* Nothing to do */
531 BUG_ON(length > previous);
532
533 op->extent.length = length;
534 op->payload_len -= previous - length;
535}
536EXPORT_SYMBOL(osd_req_op_extent_update);
537
538void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
539 u16 opcode, const char *class, const char *method)
540{
541 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
542 struct ceph_pagelist *pagelist;
543 size_t payload_len = 0;
544 size_t size;
545
546 BUG_ON(opcode != CEPH_OSD_OP_CALL);
547
548 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
549 BUG_ON(!pagelist);
550 ceph_pagelist_init(pagelist);
551
552 op->cls.class_name = class;
553 size = strlen(class);
554 BUG_ON(size > (size_t) U8_MAX);
555 op->cls.class_len = size;
556 ceph_pagelist_append(pagelist, class, size);
557 payload_len += size;
558
559 op->cls.method_name = method;
560 size = strlen(method);
561 BUG_ON(size > (size_t) U8_MAX);
562 op->cls.method_len = size;
563 ceph_pagelist_append(pagelist, method, size);
564 payload_len += size;
565
566 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
567
568 op->cls.argc = 0; /* currently unused */
569
570 op->payload_len = payload_len;
571}
572EXPORT_SYMBOL(osd_req_op_cls_init);
573
574void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
575 unsigned int which, u16 opcode,
576 u64 cookie, u64 version, int flag)
577{
578 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
579
580 BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
581
582 op->watch.cookie = cookie;
583 op->watch.ver = version;
584 if (opcode == CEPH_OSD_OP_WATCH && flag)
585 op->watch.flag = (u8)1;
586}
587EXPORT_SYMBOL(osd_req_op_watch_init);
588
589static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
590 struct ceph_osd_data *osd_data)
591{
592 u64 length = ceph_osd_data_length(osd_data);
593
594 if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
595 BUG_ON(length > (u64) SIZE_MAX);
596 if (length)
597 ceph_msg_data_add_pages(msg, osd_data->pages,
598 length, osd_data->alignment);
599 } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
600 BUG_ON(!length);
601 ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
602#ifdef CONFIG_BLOCK
603 } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
604 ceph_msg_data_add_bio(msg, osd_data->bio, length);
605#endif
606 } else {
607 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
401 } 608 }
609}
402 610
403 req->r_request_attempts = p; 611static u64 osd_req_encode_op(struct ceph_osd_request *req,
404 p += 4; 612 struct ceph_osd_op *dst, unsigned int which)
613{
614 struct ceph_osd_req_op *src;
615 struct ceph_osd_data *osd_data;
616 u64 request_data_len = 0;
617 u64 data_length;
405 618
406 data_len = req->r_trail.length; 619 BUG_ON(which >= req->r_num_ops);
407 if (flags & CEPH_OSD_FLAG_WRITE) { 620 src = &req->r_ops[which];
408 req->r_request->hdr.data_off = cpu_to_le16(off); 621 if (WARN_ON(!osd_req_opcode_valid(src->op))) {
409 data_len += len; 622 pr_err("unrecognized osd opcode %d\n", src->op);
623
624 return 0;
410 } 625 }
411 req->r_request->hdr.data_len = cpu_to_le32(data_len);
412 req->r_request->page_alignment = req->r_page_alignment;
413 626
414 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 627 switch (src->op) {
415 msg_size = p - msg->front.iov_base; 628 case CEPH_OSD_OP_STAT:
416 msg->front.iov_len = msg_size; 629 osd_data = &src->raw_data_in;
417 msg->hdr.front_len = cpu_to_le32(msg_size); 630 ceph_osdc_msg_data_add(req->r_reply, osd_data);
631 break;
632 case CEPH_OSD_OP_READ:
633 case CEPH_OSD_OP_WRITE:
634 if (src->op == CEPH_OSD_OP_WRITE)
635 request_data_len = src->extent.length;
636 dst->extent.offset = cpu_to_le64(src->extent.offset);
637 dst->extent.length = cpu_to_le64(src->extent.length);
638 dst->extent.truncate_size =
639 cpu_to_le64(src->extent.truncate_size);
640 dst->extent.truncate_seq =
641 cpu_to_le32(src->extent.truncate_seq);
642 osd_data = &src->extent.osd_data;
643 if (src->op == CEPH_OSD_OP_WRITE)
644 ceph_osdc_msg_data_add(req->r_request, osd_data);
645 else
646 ceph_osdc_msg_data_add(req->r_reply, osd_data);
647 break;
648 case CEPH_OSD_OP_CALL:
649 dst->cls.class_len = src->cls.class_len;
650 dst->cls.method_len = src->cls.method_len;
651 osd_data = &src->cls.request_info;
652 ceph_osdc_msg_data_add(req->r_request, osd_data);
653 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
654 request_data_len = osd_data->pagelist->length;
655
656 osd_data = &src->cls.request_data;
657 data_length = ceph_osd_data_length(osd_data);
658 if (data_length) {
659 BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
660 dst->cls.indata_len = cpu_to_le32(data_length);
661 ceph_osdc_msg_data_add(req->r_request, osd_data);
662 src->payload_len += data_length;
663 request_data_len += data_length;
664 }
665 osd_data = &src->cls.response_data;
666 ceph_osdc_msg_data_add(req->r_reply, osd_data);
667 break;
668 case CEPH_OSD_OP_STARTSYNC:
669 break;
670 case CEPH_OSD_OP_NOTIFY_ACK:
671 case CEPH_OSD_OP_WATCH:
672 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
673 dst->watch.ver = cpu_to_le64(src->watch.ver);
674 dst->watch.flag = src->watch.flag;
675 break;
676 default:
677 pr_err("unsupported osd opcode %s\n",
678 ceph_osd_op_name(src->op));
679 WARN_ON(1);
418 680
419 dout("build_request msg_size was %d num_ops %d\n", (int)msg_size, 681 return 0;
420 num_ops); 682 }
421 return; 683 dst->op = cpu_to_le16(src->op);
684 dst->payload_len = cpu_to_le32(src->payload_len);
685
686 return request_data_len;
422} 687}
423EXPORT_SYMBOL(ceph_osdc_build_request);
424 688
425/* 689/*
426 * build new request AND message, calculate layout, and adjust file 690 * build new request AND message, calculate layout, and adjust file
@@ -436,51 +700,63 @@ EXPORT_SYMBOL(ceph_osdc_build_request);
436struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, 700struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
437 struct ceph_file_layout *layout, 701 struct ceph_file_layout *layout,
438 struct ceph_vino vino, 702 struct ceph_vino vino,
439 u64 off, u64 *plen, 703 u64 off, u64 *plen, int num_ops,
440 int opcode, int flags, 704 int opcode, int flags,
441 struct ceph_snap_context *snapc, 705 struct ceph_snap_context *snapc,
442 int do_sync,
443 u32 truncate_seq, 706 u32 truncate_seq,
444 u64 truncate_size, 707 u64 truncate_size,
445 struct timespec *mtime, 708 bool use_mempool)
446 bool use_mempool,
447 int page_align)
448{ 709{
449 struct ceph_osd_req_op ops[2];
450 struct ceph_osd_request *req; 710 struct ceph_osd_request *req;
451 unsigned int num_op = 1; 711 u64 objnum = 0;
712 u64 objoff = 0;
713 u64 objlen = 0;
714 u32 object_size;
715 u64 object_base;
452 int r; 716 int r;
453 717
454 memset(&ops, 0, sizeof ops); 718 BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE);
455
456 ops[0].op = opcode;
457 ops[0].extent.truncate_seq = truncate_seq;
458 ops[0].extent.truncate_size = truncate_size;
459
460 if (do_sync) {
461 ops[1].op = CEPH_OSD_OP_STARTSYNC;
462 num_op++;
463 }
464 719
465 req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, 720 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
466 GFP_NOFS); 721 GFP_NOFS);
467 if (!req) 722 if (!req)
468 return ERR_PTR(-ENOMEM); 723 return ERR_PTR(-ENOMEM);
724
469 req->r_flags = flags; 725 req->r_flags = flags;
470 726
471 /* calculate max write size */ 727 /* calculate max write size */
472 r = calc_layout(vino, layout, off, plen, req, ops); 728 r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
473 if (r < 0) 729 if (r < 0) {
730 ceph_osdc_put_request(req);
474 return ERR_PTR(r); 731 return ERR_PTR(r);
475 req->r_file_layout = *layout; /* keep a copy */ 732 }
476 733
477 /* in case it differs from natural (file) alignment that 734 object_size = le32_to_cpu(layout->fl_object_size);
478 calc_layout filled in for us */ 735 object_base = off - objoff;
479 req->r_num_pages = calc_pages_for(page_align, *plen); 736 if (truncate_size <= object_base) {
480 req->r_page_alignment = page_align; 737 truncate_size = 0;
738 } else {
739 truncate_size -= object_base;
740 if (truncate_size > object_size)
741 truncate_size = object_size;
742 }
743
744 osd_req_op_extent_init(req, 0, opcode, objoff, objlen,
745 truncate_size, truncate_seq);
746
747 /*
748 * A second op in the ops array means the caller wants to
749 * also issue a include a 'startsync' command so that the
750 * osd will flush data quickly.
751 */
752 if (num_ops > 1)
753 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
754
755 req->r_file_layout = *layout; /* keep a copy */
481 756
482 ceph_osdc_build_request(req, off, *plen, num_op, ops, 757 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx",
483 snapc, vino.snap, mtime); 758 vino.ino, objnum);
759 req->r_oid_len = strlen(req->r_oid);
484 760
485 return req; 761 return req;
486} 762}
@@ -558,21 +834,46 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
558 struct ceph_osd *osd) 834 struct ceph_osd *osd)
559{ 835{
560 struct ceph_osd_request *req, *nreq; 836 struct ceph_osd_request *req, *nreq;
837 LIST_HEAD(resend);
561 int err; 838 int err;
562 839
563 dout("__kick_osd_requests osd%d\n", osd->o_osd); 840 dout("__kick_osd_requests osd%d\n", osd->o_osd);
564 err = __reset_osd(osdc, osd); 841 err = __reset_osd(osdc, osd);
565 if (err) 842 if (err)
566 return; 843 return;
567 844 /*
845 * Build up a list of requests to resend by traversing the
846 * osd's list of requests. Requests for a given object are
847 * sent in tid order, and that is also the order they're
848 * kept on this list. Therefore all requests that are in
849 * flight will be found first, followed by all requests that
850 * have not yet been sent. And to resend requests while
851 * preserving this order we will want to put any sent
852 * requests back on the front of the osd client's unsent
853 * list.
854 *
855 * So we build a separate ordered list of already-sent
856 * requests for the affected osd and splice it onto the
857 * front of the osd client's unsent list. Once we've seen a
858 * request that has not yet been sent we're done. Those
859 * requests are already sitting right where they belong.
860 */
568 list_for_each_entry(req, &osd->o_requests, r_osd_item) { 861 list_for_each_entry(req, &osd->o_requests, r_osd_item) {
569 list_move(&req->r_req_lru_item, &osdc->req_unsent); 862 if (!req->r_sent)
570 dout("requeued %p tid %llu osd%d\n", req, req->r_tid, 863 break;
864 list_move_tail(&req->r_req_lru_item, &resend);
865 dout("requeueing %p tid %llu osd%d\n", req, req->r_tid,
571 osd->o_osd); 866 osd->o_osd);
572 if (!req->r_linger) 867 if (!req->r_linger)
573 req->r_flags |= CEPH_OSD_FLAG_RETRY; 868 req->r_flags |= CEPH_OSD_FLAG_RETRY;
574 } 869 }
870 list_splice(&resend, &osdc->req_unsent);
575 871
872 /*
873 * Linger requests are re-registered before sending, which
874 * sets up a new tid for each. We add them to the unsent
875 * list at the end to keep things in tid order.
876 */
576 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, 877 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
577 r_linger_osd) { 878 r_linger_osd) {
578 /* 879 /*
@@ -581,8 +882,8 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
581 */ 882 */
582 BUG_ON(!list_empty(&req->r_req_lru_item)); 883 BUG_ON(!list_empty(&req->r_req_lru_item));
583 __register_request(osdc, req); 884 __register_request(osdc, req);
584 list_add(&req->r_req_lru_item, &osdc->req_unsent); 885 list_add_tail(&req->r_req_lru_item, &osdc->req_unsent);
585 list_add(&req->r_osd_item, &req->r_osd->o_requests); 886 list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
586 __unregister_linger_request(osdc, req); 887 __unregister_linger_request(osdc, req);
587 dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, 888 dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid,
588 osd->o_osd); 889 osd->o_osd);
@@ -654,8 +955,7 @@ static void put_osd(struct ceph_osd *osd)
654 if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { 955 if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) {
655 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; 956 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
656 957
657 if (ac->ops && ac->ops->destroy_authorizer) 958 ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer);
658 ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer);
659 kfree(osd); 959 kfree(osd);
660 } 960 }
661} 961}
@@ -820,14 +1120,6 @@ static void __register_request(struct ceph_osd_client *osdc,
820 } 1120 }
821} 1121}
822 1122
823static void register_request(struct ceph_osd_client *osdc,
824 struct ceph_osd_request *req)
825{
826 mutex_lock(&osdc->request_mutex);
827 __register_request(osdc, req);
828 mutex_unlock(&osdc->request_mutex);
829}
830
831/* 1123/*
832 * called under osdc->request_mutex 1124 * called under osdc->request_mutex
833 */ 1125 */
@@ -952,8 +1244,8 @@ static int __map_request(struct ceph_osd_client *osdc,
952 int err; 1244 int err;
953 1245
954 dout("map_request %p tid %lld\n", req, req->r_tid); 1246 dout("map_request %p tid %lld\n", req, req->r_tid);
955 err = ceph_calc_object_layout(&pgid, req->r_oid, 1247 err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap,
956 &req->r_file_layout, osdc->osdmap); 1248 ceph_file_layout_pg_pool(req->r_file_layout));
957 if (err) { 1249 if (err) {
958 list_move(&req->r_req_lru_item, &osdc->req_notarget); 1250 list_move(&req->r_req_lru_item, &osdc->req_notarget);
959 return err; 1251 return err;
@@ -1007,10 +1299,10 @@ static int __map_request(struct ceph_osd_client *osdc,
1007 1299
1008 if (req->r_osd) { 1300 if (req->r_osd) {
1009 __remove_osd_from_lru(req->r_osd); 1301 __remove_osd_from_lru(req->r_osd);
1010 list_add(&req->r_osd_item, &req->r_osd->o_requests); 1302 list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
1011 list_move(&req->r_req_lru_item, &osdc->req_unsent); 1303 list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
1012 } else { 1304 } else {
1013 list_move(&req->r_req_lru_item, &osdc->req_notarget); 1305 list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
1014 } 1306 }
1015 err = 1; /* osd or pg changed */ 1307 err = 1; /* osd or pg changed */
1016 1308
@@ -1045,8 +1337,14 @@ static void __send_request(struct ceph_osd_client *osdc,
1045 list_move_tail(&req->r_req_lru_item, &osdc->req_lru); 1337 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
1046 1338
1047 ceph_msg_get(req->r_request); /* send consumes a ref */ 1339 ceph_msg_get(req->r_request); /* send consumes a ref */
1048 ceph_con_send(&req->r_osd->o_con, req->r_request); 1340
 1341	/* Mark the request unsafe if this is the first time it's being sent. */
1342
1343 if (!req->r_sent && req->r_unsafe_callback)
1344 req->r_unsafe_callback(req, true);
1049 req->r_sent = req->r_osd->o_incarnation; 1345 req->r_sent = req->r_osd->o_incarnation;
1346
1347 ceph_con_send(&req->r_osd->o_con, req->r_request);
1050} 1348}
1051 1349
1052/* 1350/*
@@ -1134,31 +1432,11 @@ static void handle_osds_timeout(struct work_struct *work)
1134 1432
1135static void complete_request(struct ceph_osd_request *req) 1433static void complete_request(struct ceph_osd_request *req)
1136{ 1434{
1137 if (req->r_safe_callback) 1435 if (req->r_unsafe_callback)
1138 req->r_safe_callback(req, NULL); 1436 req->r_unsafe_callback(req, false);
1139 complete_all(&req->r_safe_completion); /* fsync waiter */ 1437 complete_all(&req->r_safe_completion); /* fsync waiter */
1140} 1438}
1141 1439
1142static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid)
1143{
1144 __u8 v;
1145
1146 ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad);
1147 v = ceph_decode_8(p);
1148 if (v > 1) {
1149 pr_warning("do not understand pg encoding %d > 1", v);
1150 return -EINVAL;
1151 }
1152 pgid->pool = ceph_decode_64(p);
1153 pgid->seed = ceph_decode_32(p);
1154 *p += 4;
1155 return 0;
1156
1157bad:
1158 pr_warning("incomplete pg encoding");
1159 return -EINVAL;
1160}
1161
1162/* 1440/*
1163 * handle osd op reply. either call the callback if it is specified, 1441 * handle osd op reply. either call the callback if it is specified,
1164 * or do the completion to wake up the waiting thread. 1442 * or do the completion to wake up the waiting thread.
@@ -1170,7 +1448,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1170 struct ceph_osd_request *req; 1448 struct ceph_osd_request *req;
1171 u64 tid; 1449 u64 tid;
1172 int object_len; 1450 int object_len;
1173 int numops, payload_len, flags; 1451 unsigned int numops;
1452 int payload_len, flags;
1174 s32 result; 1453 s32 result;
1175 s32 retry_attempt; 1454 s32 retry_attempt;
1176 struct ceph_pg pg; 1455 struct ceph_pg pg;
@@ -1178,7 +1457,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1178 u32 reassert_epoch; 1457 u32 reassert_epoch;
1179 u64 reassert_version; 1458 u64 reassert_version;
1180 u32 osdmap_epoch; 1459 u32 osdmap_epoch;
1181 int i; 1460 int already_completed;
1461 u32 bytes;
1462 unsigned int i;
1182 1463
1183 tid = le64_to_cpu(msg->hdr.tid); 1464 tid = le64_to_cpu(msg->hdr.tid);
1184 dout("handle_reply %p tid %llu\n", msg, tid); 1465 dout("handle_reply %p tid %llu\n", msg, tid);
@@ -1191,7 +1472,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1191 ceph_decode_need(&p, end, object_len, bad); 1472 ceph_decode_need(&p, end, object_len, bad);
1192 p += object_len; 1473 p += object_len;
1193 1474
1194 err = __decode_pgid(&p, end, &pg); 1475 err = ceph_decode_pgid(&p, end, &pg);
1195 if (err) 1476 if (err)
1196 goto bad; 1477 goto bad;
1197 1478
@@ -1207,8 +1488,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1207 req = __lookup_request(osdc, tid); 1488 req = __lookup_request(osdc, tid);
1208 if (req == NULL) { 1489 if (req == NULL) {
1209 dout("handle_reply tid %llu dne\n", tid); 1490 dout("handle_reply tid %llu dne\n", tid);
1210 mutex_unlock(&osdc->request_mutex); 1491 goto bad_mutex;
1211 return;
1212 } 1492 }
1213 ceph_osdc_get_request(req); 1493 ceph_osdc_get_request(req);
1214 1494
@@ -1233,9 +1513,10 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1233 payload_len += len; 1513 payload_len += len;
1234 p += sizeof(*op); 1514 p += sizeof(*op);
1235 } 1515 }
1236 if (payload_len != le32_to_cpu(msg->hdr.data_len)) { 1516 bytes = le32_to_cpu(msg->hdr.data_len);
1517 if (payload_len != bytes) {
1237 pr_warning("sum of op payload lens %d != data_len %d", 1518 pr_warning("sum of op payload lens %d != data_len %d",
1238 payload_len, le32_to_cpu(msg->hdr.data_len)); 1519 payload_len, bytes);
1239 goto bad_put; 1520 goto bad_put;
1240 } 1521 }
1241 1522
@@ -1244,21 +1525,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1244 for (i = 0; i < numops; i++) 1525 for (i = 0; i < numops; i++)
1245 req->r_reply_op_result[i] = ceph_decode_32(&p); 1526 req->r_reply_op_result[i] = ceph_decode_32(&p);
1246 1527
1247 /*
1248 * if this connection filled our message, drop our reference now, to
1249 * avoid a (safe but slower) revoke later.
1250 */
1251 if (req->r_con_filling_msg == con && req->r_reply == msg) {
1252 dout(" dropping con_filling_msg ref %p\n", con);
1253 req->r_con_filling_msg = NULL;
1254 con->ops->put(con);
1255 }
1256
1257 if (!req->r_got_reply) { 1528 if (!req->r_got_reply) {
1258 unsigned int bytes;
1259 1529
1260 req->r_result = result; 1530 req->r_result = result;
1261 bytes = le32_to_cpu(msg->hdr.data_len);
1262 dout("handle_reply result %d bytes %d\n", req->r_result, 1531 dout("handle_reply result %d bytes %d\n", req->r_result,
1263 bytes); 1532 bytes);
1264 if (req->r_result == 0) 1533 if (req->r_result == 0)
@@ -1286,7 +1555,11 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1286 ((flags & CEPH_OSD_FLAG_WRITE) == 0)) 1555 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
1287 __unregister_request(osdc, req); 1556 __unregister_request(osdc, req);
1288 1557
1558 already_completed = req->r_completed;
1559 req->r_completed = 1;
1289 mutex_unlock(&osdc->request_mutex); 1560 mutex_unlock(&osdc->request_mutex);
1561 if (already_completed)
1562 goto done;
1290 1563
1291 if (req->r_callback) 1564 if (req->r_callback)
1292 req->r_callback(req, msg); 1565 req->r_callback(req, msg);
@@ -1303,6 +1576,8 @@ done:
1303 1576
1304bad_put: 1577bad_put:
1305 ceph_osdc_put_request(req); 1578 ceph_osdc_put_request(req);
1579bad_mutex:
1580 mutex_unlock(&osdc->request_mutex);
1306bad: 1581bad:
1307 pr_err("corrupt osd_op_reply got %d %d\n", 1582 pr_err("corrupt osd_op_reply got %d %d\n",
1308 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); 1583 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
@@ -1736,6 +2011,104 @@ bad:
1736} 2011}
1737 2012
1738/* 2013/*
2014 * build new request AND message
2015 *
2016 */
2017void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
2018 struct ceph_snap_context *snapc, u64 snap_id,
2019 struct timespec *mtime)
2020{
2021 struct ceph_msg *msg = req->r_request;
2022 void *p;
2023 size_t msg_size;
2024 int flags = req->r_flags;
2025 u64 data_len;
2026 unsigned int i;
2027
2028 req->r_snapid = snap_id;
2029 req->r_snapc = ceph_get_snap_context(snapc);
2030
2031 /* encode request */
2032 msg->hdr.version = cpu_to_le16(4);
2033
2034 p = msg->front.iov_base;
2035 ceph_encode_32(&p, 1); /* client_inc is always 1 */
2036 req->r_request_osdmap_epoch = p;
2037 p += 4;
2038 req->r_request_flags = p;
2039 p += 4;
2040 if (req->r_flags & CEPH_OSD_FLAG_WRITE)
2041 ceph_encode_timespec(p, mtime);
2042 p += sizeof(struct ceph_timespec);
2043 req->r_request_reassert_version = p;
2044 p += sizeof(struct ceph_eversion); /* will get filled in */
2045
2046 /* oloc */
2047 ceph_encode_8(&p, 4);
2048 ceph_encode_8(&p, 4);
2049 ceph_encode_32(&p, 8 + 4 + 4);
2050 req->r_request_pool = p;
2051 p += 8;
2052 ceph_encode_32(&p, -1); /* preferred */
2053 ceph_encode_32(&p, 0); /* key len */
2054
2055 ceph_encode_8(&p, 1);
2056 req->r_request_pgid = p;
2057 p += 8 + 4;
2058 ceph_encode_32(&p, -1); /* preferred */
2059
2060 /* oid */
2061 ceph_encode_32(&p, req->r_oid_len);
2062 memcpy(p, req->r_oid, req->r_oid_len);
2063 dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
2064 p += req->r_oid_len;
2065
2066 /* ops--can imply data */
2067 ceph_encode_16(&p, (u16)req->r_num_ops);
2068 data_len = 0;
2069 for (i = 0; i < req->r_num_ops; i++) {
2070 data_len += osd_req_encode_op(req, p, i);
2071 p += sizeof(struct ceph_osd_op);
2072 }
2073
2074 /* snaps */
2075 ceph_encode_64(&p, req->r_snapid);
2076 ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
2077 ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
2078 if (req->r_snapc) {
2079 for (i = 0; i < snapc->num_snaps; i++) {
2080 ceph_encode_64(&p, req->r_snapc->snaps[i]);
2081 }
2082 }
2083
2084 req->r_request_attempts = p;
2085 p += 4;
2086
2087 /* data */
2088 if (flags & CEPH_OSD_FLAG_WRITE) {
2089 u16 data_off;
2090
2091 /*
2092 * The header "data_off" is a hint to the receiver
2093 * allowing it to align received data into its
2094 * buffers such that there's no need to re-copy
2095 * it before writing it to disk (direct I/O).
2096 */
2097 data_off = (u16) (off & 0xffff);
2098 req->r_request->hdr.data_off = cpu_to_le16(data_off);
2099 }
2100 req->r_request->hdr.data_len = cpu_to_le32(data_len);
2101
2102 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
2103 msg_size = p - msg->front.iov_base;
2104 msg->front.iov_len = msg_size;
2105 msg->hdr.front_len = cpu_to_le32(msg_size);
2106
2107 dout("build_request msg_size was %d\n", (int)msg_size);
2108}
2109EXPORT_SYMBOL(ceph_osdc_build_request);
2110
2111/*
1739 * Register request, send initial attempt. 2112 * Register request, send initial attempt.
1740 */ 2113 */
1741int ceph_osdc_start_request(struct ceph_osd_client *osdc, 2114int ceph_osdc_start_request(struct ceph_osd_client *osdc,
@@ -1744,41 +2117,26 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1744{ 2117{
1745 int rc = 0; 2118 int rc = 0;
1746 2119
1747 req->r_request->pages = req->r_pages;
1748 req->r_request->nr_pages = req->r_num_pages;
1749#ifdef CONFIG_BLOCK
1750 req->r_request->bio = req->r_bio;
1751#endif
1752 req->r_request->trail = &req->r_trail;
1753
1754 register_request(osdc, req);
1755
1756 down_read(&osdc->map_sem); 2120 down_read(&osdc->map_sem);
1757 mutex_lock(&osdc->request_mutex); 2121 mutex_lock(&osdc->request_mutex);
1758 /* 2122 __register_request(osdc, req);
1759 * a racing kick_requests() may have sent the message for us 2123 WARN_ON(req->r_sent);
1760 * while we dropped request_mutex above, so only send now if 2124 rc = __map_request(osdc, req, 0);
1761	 * the request still hasn't been touched yet.                           2125	rc = __map_request(osdc, req, 0);
1762 */ 2126 if (nofail) {
1763 if (req->r_sent == 0) { 2127 dout("osdc_start_request failed map, "
1764 rc = __map_request(osdc, req, 0); 2128 " will retry %lld\n", req->r_tid);
1765 if (rc < 0) { 2129 rc = 0;
1766 if (nofail) {
1767 dout("osdc_start_request failed map, "
1768 " will retry %lld\n", req->r_tid);
1769 rc = 0;
1770 }
1771 goto out_unlock;
1772 }
1773 if (req->r_osd == NULL) {
1774 dout("send_request %p no up osds in pg\n", req);
1775 ceph_monc_request_next_osdmap(&osdc->client->monc);
1776 } else {
1777 __send_request(osdc, req);
1778 } 2130 }
1779 rc = 0; 2131 goto out_unlock;
1780 } 2132 }
1781 2133 if (req->r_osd == NULL) {
2134 dout("send_request %p no up osds in pg\n", req);
2135 ceph_monc_request_next_osdmap(&osdc->client->monc);
2136 } else {
2137 __send_queued(osdc);
2138 }
2139 rc = 0;
1782out_unlock: 2140out_unlock:
1783 mutex_unlock(&osdc->request_mutex); 2141 mutex_unlock(&osdc->request_mutex);
1784 up_read(&osdc->map_sem); 2142 up_read(&osdc->map_sem);
@@ -1940,18 +2298,22 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1940 2298
1941 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, 2299 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1942 vino.snap, off, *plen); 2300 vino.snap, off, *plen);
1943 req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 2301 req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1,
1944 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 2302 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1945 NULL, 0, truncate_seq, truncate_size, NULL, 2303 NULL, truncate_seq, truncate_size,
1946 false, page_align); 2304 false);
1947 if (IS_ERR(req)) 2305 if (IS_ERR(req))
1948 return PTR_ERR(req); 2306 return PTR_ERR(req);
1949 2307
1950 /* it may be a short read due to an object boundary */ 2308 /* it may be a short read due to an object boundary */
1951 req->r_pages = pages;
1952 2309
1953 dout("readpages final extent is %llu~%llu (%d pages align %d)\n", 2310 osd_req_op_extent_osd_data_pages(req, 0,
1954 off, *plen, req->r_num_pages, page_align); 2311 pages, *plen, page_align, false, false);
2312
2313 dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
2314 off, *plen, *plen, page_align);
2315
2316 ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
1955 2317
1956 rc = ceph_osdc_start_request(osdc, req, false); 2318 rc = ceph_osdc_start_request(osdc, req, false);
1957 if (!rc) 2319 if (!rc)
@@ -1978,20 +2340,21 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1978 int rc = 0; 2340 int rc = 0;
1979 int page_align = off & ~PAGE_MASK; 2341 int page_align = off & ~PAGE_MASK;
1980 2342
1981 BUG_ON(vino.snap != CEPH_NOSNAP); 2343 BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */
1982 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 2344 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1,
1983 CEPH_OSD_OP_WRITE, 2345 CEPH_OSD_OP_WRITE,
1984 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 2346 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
1985 snapc, 0, 2347 snapc, truncate_seq, truncate_size,
1986 truncate_seq, truncate_size, mtime, 2348 true);
1987 true, page_align);
1988 if (IS_ERR(req)) 2349 if (IS_ERR(req))
1989 return PTR_ERR(req); 2350 return PTR_ERR(req);
1990 2351
1991 /* it may be a short write due to an object boundary */ 2352 /* it may be a short write due to an object boundary */
1992 req->r_pages = pages; 2353 osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
1993 dout("writepages %llu~%llu (%d pages)\n", off, len, 2354 false, false);
1994 req->r_num_pages); 2355 dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
2356
2357 ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
1995 2358
1996 rc = ceph_osdc_start_request(osdc, req, true); 2359 rc = ceph_osdc_start_request(osdc, req, true);
1997 if (!rc) 2360 if (!rc)
@@ -2005,6 +2368,26 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
2005} 2368}
2006EXPORT_SYMBOL(ceph_osdc_writepages); 2369EXPORT_SYMBOL(ceph_osdc_writepages);
2007 2370
2371int ceph_osdc_setup(void)
2372{
2373 BUG_ON(ceph_osd_request_cache);
2374 ceph_osd_request_cache = kmem_cache_create("ceph_osd_request",
2375 sizeof (struct ceph_osd_request),
2376 __alignof__(struct ceph_osd_request),
2377 0, NULL);
2378
2379 return ceph_osd_request_cache ? 0 : -ENOMEM;
2380}
2381EXPORT_SYMBOL(ceph_osdc_setup);
2382
2383void ceph_osdc_cleanup(void)
2384{
2385 BUG_ON(!ceph_osd_request_cache);
2386 kmem_cache_destroy(ceph_osd_request_cache);
2387 ceph_osd_request_cache = NULL;
2388}
2389EXPORT_SYMBOL(ceph_osdc_cleanup);
2390
2008/* 2391/*
2009 * handle incoming message 2392 * handle incoming message
2010 */ 2393 */
@@ -2064,13 +2447,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2064 goto out; 2447 goto out;
2065 } 2448 }
2066 2449
2067 if (req->r_con_filling_msg) { 2450 if (req->r_reply->con)
2068 dout("%s revoking msg %p from old con %p\n", __func__, 2451 dout("%s revoking msg %p from old con %p\n", __func__,
2069 req->r_reply, req->r_con_filling_msg); 2452 req->r_reply, req->r_reply->con);
2070 ceph_msg_revoke_incoming(req->r_reply); 2453 ceph_msg_revoke_incoming(req->r_reply);
2071 req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
2072 req->r_con_filling_msg = NULL;
2073 }
2074 2454
2075 if (front > req->r_reply->front.iov_len) { 2455 if (front > req->r_reply->front.iov_len) {
2076 pr_warning("get_reply front %d > preallocated %d\n", 2456 pr_warning("get_reply front %d > preallocated %d\n",
@@ -2084,26 +2464,29 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2084 m = ceph_msg_get(req->r_reply); 2464 m = ceph_msg_get(req->r_reply);
2085 2465
2086 if (data_len > 0) { 2466 if (data_len > 0) {
2087 int want = calc_pages_for(req->r_page_alignment, data_len); 2467 struct ceph_osd_data *osd_data;
2088 2468
2089 if (req->r_pages && unlikely(req->r_num_pages < want)) { 2469 /*
2090 pr_warning("tid %lld reply has %d bytes %d pages, we" 2470 * XXX This is assuming there is only one op containing
2091 " had only %d pages ready\n", tid, data_len, 2471 * XXX page data. Probably OK for reads, but this
2092 want, req->r_num_pages); 2472 * XXX ought to be done more generally.
2093 *skip = 1; 2473 */
2094 ceph_msg_put(m); 2474 osd_data = osd_req_op_extent_osd_data(req, 0);
2095 m = NULL; 2475 if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
2096 goto out; 2476 if (osd_data->pages &&
2477 unlikely(osd_data->length < data_len)) {
2478
2479 pr_warning("tid %lld reply has %d bytes "
2480 "we had only %llu bytes ready\n",
2481 tid, data_len, osd_data->length);
2482 *skip = 1;
2483 ceph_msg_put(m);
2484 m = NULL;
2485 goto out;
2486 }
2097 } 2487 }
2098 m->pages = req->r_pages;
2099 m->nr_pages = req->r_num_pages;
2100 m->page_alignment = req->r_page_alignment;
2101#ifdef CONFIG_BLOCK
2102 m->bio = req->r_bio;
2103#endif
2104 } 2488 }
2105 *skip = 0; 2489 *skip = 0;
2106 req->r_con_filling_msg = con->ops->get(con);
2107 dout("get_reply tid %lld %p\n", tid, m); 2490 dout("get_reply tid %lld %p\n", tid, m);
2108 2491
2109out: 2492out:
@@ -2168,13 +2551,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
2168 struct ceph_auth_handshake *auth = &o->o_auth; 2551 struct ceph_auth_handshake *auth = &o->o_auth;
2169 2552
2170 if (force_new && auth->authorizer) { 2553 if (force_new && auth->authorizer) {
2171 if (ac->ops && ac->ops->destroy_authorizer) 2554 ceph_auth_destroy_authorizer(ac, auth->authorizer);
2172 ac->ops->destroy_authorizer(ac, auth->authorizer);
2173 auth->authorizer = NULL; 2555 auth->authorizer = NULL;
2174 } 2556 }
2175 if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { 2557 if (!auth->authorizer) {
2176 int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, 2558 int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
2177 auth); 2559 auth);
2560 if (ret)
2561 return ERR_PTR(ret);
2562 } else {
2563 int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
2564 auth);
2178 if (ret) 2565 if (ret)
2179 return ERR_PTR(ret); 2566 return ERR_PTR(ret);
2180 } 2567 }
@@ -2190,11 +2577,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
2190 struct ceph_osd_client *osdc = o->o_osdc; 2577 struct ceph_osd_client *osdc = o->o_osdc;
2191 struct ceph_auth_client *ac = osdc->client->monc.auth; 2578 struct ceph_auth_client *ac = osdc->client->monc.auth;
2192 2579
2193 /* 2580 return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len);
2194 * XXX If ac->ops or ac->ops->verify_authorizer_reply is null,
2195 * XXX which do we do: succeed or fail?
2196 */
2197 return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len);
2198} 2581}
2199 2582
2200static int invalidate_authorizer(struct ceph_connection *con) 2583static int invalidate_authorizer(struct ceph_connection *con)
@@ -2203,9 +2586,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
2203 struct ceph_osd_client *osdc = o->o_osdc; 2586 struct ceph_osd_client *osdc = o->o_osdc;
2204 struct ceph_auth_client *ac = osdc->client->monc.auth; 2587 struct ceph_auth_client *ac = osdc->client->monc.auth;
2205 2588
2206 if (ac->ops && ac->ops->invalidate_authorizer) 2589 ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
2207 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
2208
2209 return ceph_monc_validate_auth(&osdc->client->monc); 2590 return ceph_monc_validate_auth(&osdc->client->monc);
2210} 2591}
2211 2592
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 4543b9aba40c..603ddd92db19 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -654,24 +654,6 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
654 return 0; 654 return 0;
655} 655}
656 656
657static int __decode_pgid(void **p, void *end, struct ceph_pg *pg)
658{
659 u8 v;
660
661 ceph_decode_need(p, end, 1+8+4+4, bad);
662 v = ceph_decode_8(p);
663 if (v != 1)
664 goto bad;
665 pg->pool = ceph_decode_64(p);
666 pg->seed = ceph_decode_32(p);
667 *p += 4; /* skip preferred */
668 return 0;
669
670bad:
671 dout("error decoding pgid\n");
672 return -EINVAL;
673}
674
675/* 657/*
676 * decode a full map. 658 * decode a full map.
677 */ 659 */
@@ -765,7 +747,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
765 struct ceph_pg pgid; 747 struct ceph_pg pgid;
766 struct ceph_pg_mapping *pg; 748 struct ceph_pg_mapping *pg;
767 749
768 err = __decode_pgid(p, end, &pgid); 750 err = ceph_decode_pgid(p, end, &pgid);
769 if (err) 751 if (err)
770 goto bad; 752 goto bad;
771 ceph_decode_need(p, end, sizeof(u32), bad); 753 ceph_decode_need(p, end, sizeof(u32), bad);
@@ -983,7 +965,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
983 struct ceph_pg pgid; 965 struct ceph_pg pgid;
984 u32 pglen; 966 u32 pglen;
985 967
986 err = __decode_pgid(p, end, &pgid); 968 err = ceph_decode_pgid(p, end, &pgid);
987 if (err) 969 if (err)
988 goto bad; 970 goto bad;
989 ceph_decode_need(p, end, sizeof(u32), bad); 971 ceph_decode_need(p, end, sizeof(u32), bad);
@@ -1111,27 +1093,22 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping);
1111 * calculate an object layout (i.e. pgid) from an oid, 1093 * calculate an object layout (i.e. pgid) from an oid,
1112 * file_layout, and osdmap 1094 * file_layout, and osdmap
1113 */ 1095 */
1114int ceph_calc_object_layout(struct ceph_pg *pg, 1096int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid,
1115 const char *oid, 1097 struct ceph_osdmap *osdmap, uint64_t pool)
1116 struct ceph_file_layout *fl,
1117 struct ceph_osdmap *osdmap)
1118{ 1098{
1119 unsigned int num, num_mask; 1099 struct ceph_pg_pool_info *pool_info;
1120 struct ceph_pg_pool_info *pool;
1121 1100
1122 BUG_ON(!osdmap); 1101 BUG_ON(!osdmap);
1123 pg->pool = le32_to_cpu(fl->fl_pg_pool); 1102 pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool);
1124 pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool); 1103 if (!pool_info)
1125 if (!pool)
1126 return -EIO; 1104 return -EIO;
1127 pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid)); 1105 pg->pool = pool;
1128 num = pool->pg_num; 1106 pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid));
1129 num_mask = pool->pg_num_mask;
1130 1107
1131 dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed); 1108 dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed);
1132 return 0; 1109 return 0;
1133} 1110}
1134EXPORT_SYMBOL(ceph_calc_object_layout); 1111EXPORT_SYMBOL(ceph_calc_ceph_pg);
1135 1112
1136/* 1113/*
1137 * Calculate raw osd vector for the given pgid. Return pointer to osd 1114 * Calculate raw osd vector for the given pgid. Return pointer to osd
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
new file mode 100644
index 000000000000..154683f5f14c
--- /dev/null
+++ b/net/ceph/snapshot.c
@@ -0,0 +1,78 @@
1/*
2 * snapshot.c Ceph snapshot context utility routines (part of libceph)
3 *
4 * Copyright (C) 2013 Inktank Storage, Inc.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * version 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
18 * 02110-1301, USA.
19 */
20
21#include <stddef.h>
22
23#include <linux/types.h>
24#include <linux/export.h>
25#include <linux/ceph/libceph.h>
26
27/*
28 * Ceph snapshot contexts are reference counted objects, and the
29 * returned structure holds a single reference. Acquire additional
30 * references with ceph_get_snap_context(), and release them with
31 * ceph_put_snap_context(). When the reference count reaches zero
32 * the entire structure is freed.
33 */
34
35/*
36 * Create a new ceph snapshot context large enough to hold the
37 * indicated number of snapshot ids (which can be 0). Caller has
38 * to fill in snapc->seq and snapc->snaps[0..snap_count-1].
39 *
40 * Returns a null pointer if an error occurs.
41 */
42struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
43 gfp_t gfp_flags)
44{
45 struct ceph_snap_context *snapc;
46 size_t size;
47
48 size = sizeof (struct ceph_snap_context);
49 size += snap_count * sizeof (snapc->snaps[0]);
50 snapc = kzalloc(size, gfp_flags);
51 if (!snapc)
52 return NULL;
53
54 atomic_set(&snapc->nref, 1);
55 snapc->num_snaps = snap_count;
56
57 return snapc;
58}
59EXPORT_SYMBOL(ceph_create_snap_context);
60
61struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc)
62{
63 if (sc)
64 atomic_inc(&sc->nref);
65 return sc;
66}
67EXPORT_SYMBOL(ceph_get_snap_context);
68
69void ceph_put_snap_context(struct ceph_snap_context *sc)
70{
71 if (!sc)
72 return;
73 if (atomic_dec_and_test(&sc->nref)) {
74 /*printk(" deleting snap_context %p\n", sc);*/
75 kfree(sc);
76 }
77}
78EXPORT_SYMBOL(ceph_put_snap_context);