author		Linus Torvalds <torvalds@linux-foundation.org>	2018-04-10 15:25:30 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-04-10 15:25:30 -0400
commit		b284d4d5a6785f8cd07eda2646a95782373cd01e (patch)
tree		62d835dcb6a6eb30fe9b0ebad7aeba4b4234b1d6
parent		a7726f6b61e8917e73101bb8071facdff7ec5d72 (diff)
parent		9122eed5281e89bdb02162a8ecb3cc13ffc8985e (diff)
Merge tag 'ceph-for-4.17-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
 "The big ticket items are:

   - support for rbd "fancy" striping (myself).

     The striping feature bit is now fully implemented, allowing mapping
     v2 images with non-default striping patterns. This completes
     support for --image-format 2.

   - CephFS quota support (Luis Henriques and Zheng Yan).

     This set is based on the new SnapRealm code in the upcoming v13.y.z
     ("Mimic") release. Quota handling will be rejected on older
     filesystems.

   - memory usage improvements in CephFS (Chengguang Xu).

     Directory specific bits have been split out of ceph_file_info and
     some effort went into improving cap reservation code to avoid OOM
     crashes.

  Also included a bunch of assorted fixes all over the place from
  Chengguang and others"

* tag 'ceph-for-4.17-rc1' of git://github.com/ceph/ceph-client: (67 commits)
  ceph: quota: report root dir quota usage in statfs
  ceph: quota: add counter for snaprealms with quota
  ceph: quota: cache inode pointer in ceph_snap_realm
  ceph: fix root quota realm check
  ceph: don't check quota for snap inode
  ceph: quota: update MDS when max_bytes is approaching
  ceph: quota: support for ceph.quota.max_bytes
  ceph: quota: don't allow cross-quota renames
  ceph: quota: support for ceph.quota.max_files
  ceph: quota: add initial infrastructure to support cephfs quotas
  rbd: remove VLA usage
  rbd: fix spelling mistake: "reregisteration" -> "reregistration"
  ceph: rename function drop_leases() to a more descriptive name
  ceph: fix invalid point dereference for error case in mdsc destroy
  ceph: return proper bool type to caller instead of pointer
  ceph: optimize memory usage
  ceph: optimize mds session register
  libceph, ceph: add __init attribution to init funcitons
  ceph: filter out used flags when printing unused open flags
  ceph: don't wait on writeback when there is no more dirty pages
  ...
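As a rough illustration of the fancy striping item above (a sketch only; the
pool and image names are hypothetical, while --stripe-unit and --stripe-count
are standard 'rbd create' options), a v2 image with a non-default striping
pattern can now be mapped by the kernel client:

    # create an --image-format 2 image with a non-default striping pattern
    rbd create --size 10G --stripe-unit 64K --stripe-count 8 mypool/img
    # before this merge the map was rejected for such images; with the
    # striping feature bit implemented it is expected to succeed
    rbd map mypool/img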
-rw-r--r--  Documentation/filesystems/ceph.txt  |   16
-rw-r--r--  drivers/block/rbd.c                 | 2452
-rw-r--r--  fs/ceph/Makefile                    |    2
-rw-r--r--  fs/ceph/addr.c                      |   63
-rw-r--r--  fs/ceph/cache.c                     |    4
-rw-r--r--  fs/ceph/caps.c                      |  128
-rw-r--r--  fs/ceph/debugfs.c                   |    8
-rw-r--r--  fs/ceph/dir.c                       |  204
-rw-r--r--  fs/ceph/file.c                      |  117
-rw-r--r--  fs/ceph/inode.c                     |   26
-rw-r--r--  fs/ceph/ioctl.c                     |   13
-rw-r--r--  fs/ceph/locks.c                     |   20
-rw-r--r--  fs/ceph/mds_client.c                |   87
-rw-r--r--  fs/ceph/mds_client.h                |    4
-rw-r--r--  fs/ceph/quota.c                     |  361
-rw-r--r--  fs/ceph/snap.c                      |    2
-rw-r--r--  fs/ceph/super.c                     |   50
-rw-r--r--  fs/ceph/super.h                     |   42
-rw-r--r--  fs/ceph/xattr.c                     |   44
-rw-r--r--  include/linux/ceph/ceph_features.h  |    1
-rw-r--r--  include/linux/ceph/ceph_fs.h        |   17
-rw-r--r--  include/linux/ceph/libceph.h        |    1
-rw-r--r--  include/linux/ceph/messenger.h      |  101
-rw-r--r--  include/linux/ceph/osd_client.h     |   19
-rw-r--r--  include/linux/ceph/osdmap.h         |    6
-rw-r--r--  include/linux/ceph/striper.h        |   69
-rw-r--r--  net/ceph/Makefile                   |    1
-rw-r--r--  net/ceph/ceph_common.c              |    8
-rw-r--r--  net/ceph/crypto.c                   |    6
-rw-r--r--  net/ceph/debugfs.c                  |   17
-rw-r--r--  net/ceph/messenger.c                |  188
-rw-r--r--  net/ceph/mon_client.c               |    2
-rw-r--r--  net/ceph/osd_client.c               |   67
-rw-r--r--  net/ceph/osdmap.c                   |   71
-rw-r--r--  net/ceph/striper.c                  |  261
35 files changed, 2558 insertions(+), 1920 deletions(-)
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
index 0b302a11718a..d7f011ddc150 100644
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -62,6 +62,18 @@ subdirectories, and a summation of all nested file sizes. This makes
 the identification of large disk space consumers relatively quick, as
 no 'du' or similar recursive scan of the file system is required.
 
+Finally, Ceph also allows quotas to be set on any directory in the system.
+The quota can restrict the number of bytes or the number of files stored
+beneath that point in the directory hierarchy.  Quotas can be set using
+extended attributes 'ceph.quota.max_files' and 'ceph.quota.max_bytes', eg:
+
+ setfattr -n ceph.quota.max_bytes -v 100000000 /some/dir
+ getfattr -n ceph.quota.max_bytes /some/dir
+
+A limitation of the current quotas implementation is that it relies on the
+cooperation of the client mounting the file system to stop writers when a
+limit is reached.  A modified or adversarial client cannot be prevented
+from writing as much data as it needs.
 
 Mount Syntax
 ============
@@ -137,6 +149,10 @@ Mount Options
   noasyncreaddir
 	Do not use the dcache as above for readdir.
 
+  noquotadf
+	Report overall filesystem usage in statfs instead of using the root
+	directory quota.
+
 More Information
 ================
 
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 1e03b04819c8..07dc5419bd63 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -32,6 +32,7 @@
 #include <linux/ceph/osd_client.h>
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/cls_lock_client.h>
+#include <linux/ceph/striper.h>
 #include <linux/ceph/decode.h>
 #include <linux/parser.h>
 #include <linux/bsearch.h>
@@ -200,95 +201,81 @@ struct rbd_client {
 };
 
 struct rbd_img_request;
-typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
-
-#define BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
-
-struct rbd_obj_request;
-typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
 
 enum obj_request_type {
-	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
+	OBJ_REQUEST_NODATA = 1,
+	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
+	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
+	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
 };
 
 enum obj_operation_type {
+	OBJ_OP_READ = 1,
 	OBJ_OP_WRITE,
-	OBJ_OP_READ,
 	OBJ_OP_DISCARD,
 };
 
-enum obj_req_flags {
-	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
-	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
-	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
-	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
+/*
+ * Writes go through the following state machine to deal with
+ * layering:
+ *
+ *                       need copyup
+ * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
+ *        |     ^                              |
+ *        v     \------------------------------/
+ *      done
+ *        ^
+ *        |
+ * RBD_OBJ_WRITE_FLAT
+ *
+ * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
+ * there is a parent or not.
+ */
+enum rbd_obj_write_state {
+	RBD_OBJ_WRITE_FLAT = 1,
+	RBD_OBJ_WRITE_GUARD,
+	RBD_OBJ_WRITE_COPYUP,
 };
 
 struct rbd_obj_request {
-	u64			object_no;
-	u64			offset;		/* object start byte */
-	u64			length;		/* bytes from offset */
-	unsigned long		flags;
-
-	/*
-	 * An object request associated with an image will have its
-	 * img_data flag set; a standalone object request will not.
-	 *
-	 * A standalone object request will have which == BAD_WHICH
-	 * and a null obj_request pointer.
-	 *
-	 * An object request initiated in support of a layered image
-	 * object (to check for its existence before a write) will
-	 * have which == BAD_WHICH and a non-null obj_request pointer.
-	 *
-	 * Finally, an object request for rbd image data will have
-	 * which != BAD_WHICH, and will have a non-null img_request
-	 * pointer.  The value of which will be in the range
-	 * 0..(img_request->obj_request_count-1).
-	 */
+	struct ceph_object_extent ex;
 	union {
-		struct rbd_obj_request	*obj_request;	/* STAT op */
-		struct {
-			struct rbd_img_request	*img_request;
-			u64			img_offset;
-			/* links for img_request->obj_requests list */
-			struct list_head	links;
-		};
+		bool			tried_parent;	/* for reads */
+		enum rbd_obj_write_state write_state;	/* for writes */
 	};
-	u32			which;		/* posn image request list */
 
-	enum obj_request_type	type;
+	struct rbd_img_request	*img_request;
+	struct ceph_file_extent	*img_extents;
+	u32			num_img_extents;
+
 	union {
-		struct bio	*bio_list;
+		struct ceph_bio_iter	bio_pos;
 		struct {
-			struct page	**pages;
-			u32		page_count;
+			struct ceph_bvec_iter	bvec_pos;
+			u32			bvec_count;
+			u32			bvec_idx;
 		};
 	};
-	struct page		**copyup_pages;
-	u32			copyup_page_count;
+	struct bio_vec		*copyup_bvecs;
+	u32			copyup_bvec_count;
 
 	struct ceph_osd_request	*osd_req;
 
 	u64			xferred;	/* bytes transferred */
 	int			result;
 
-	rbd_obj_callback_t	callback;
-
 	struct kref		kref;
 };
 
 enum img_req_flags {
-	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
 	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
 	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
-	IMG_REQ_DISCARD,	/* discard: normal = 0, discard request = 1 */
 };
 
 struct rbd_img_request {
 	struct rbd_device	*rbd_dev;
-	u64			offset;	/* starting image byte offset */
-	u64			length;	/* byte count from offset */
+	enum obj_operation_type	op_type;
+	enum obj_request_type	data_type;
 	unsigned long		flags;
 	union {
 		u64			snap_id;	/* for reads */
@@ -298,26 +285,21 @@ struct rbd_img_request {
 		struct request		*rq;		/* block request */
 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
 	};
-	struct page		**copyup_pages;
-	u32			copyup_page_count;
-	spinlock_t		completion_lock;/* protects next_completion */
-	u32			next_completion;
-	rbd_img_callback_t	callback;
+	spinlock_t		completion_lock;
 	u64			xferred;/* aggregate bytes transferred */
 	int			result;	/* first nonzero obj_request result */
 
+	struct list_head	object_extents;	/* obj_req.ex structs */
 	u32			obj_request_count;
-	struct list_head	obj_requests;	/* rbd_obj_request structs */
+	u32			pending_count;
 
 	struct kref		kref;
 };
 
 #define for_each_obj_request(ireq, oreq) \
-	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
-#define for_each_obj_request_from(ireq, oreq) \
-	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
+	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
 #define for_each_obj_request_safe(ireq, oreq, n) \
-	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
+	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
 
 enum rbd_watch_state {
 	RBD_WATCH_STATE_UNREGISTERED,
@@ -433,8 +415,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock);
 static struct kmem_cache	*rbd_img_request_cache;
 static struct kmem_cache	*rbd_obj_request_cache;
 
-static struct bio_set		*rbd_bio_clone;
-
 static int rbd_major;
 static DEFINE_IDA(rbd_dev_id_ida);
 
@@ -447,8 +427,6 @@ static bool single_major = true;
 module_param(single_major, bool, S_IRUGO);
 MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
 
-static int rbd_img_request_submit(struct rbd_img_request *img_request);
-
 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 		       size_t count);
 static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
@@ -458,7 +436,6 @@ static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
 static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
 				       size_t count);
 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
-static void rbd_spec_put(struct rbd_spec *spec);
 
 static int rbd_dev_id_to_minor(int dev_id)
 {
@@ -577,9 +554,6 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
 #  define rbd_assert(expr)	((void) 0)
 #endif /* !RBD_DEBUG */
 
-static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
-static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
-static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
 
 static int rbd_dev_refresh(struct rbd_device *rbd_dev);
@@ -857,26 +831,6 @@ static char* obj_op_name(enum obj_operation_type op_type)
 }
 
 /*
- * Get a ceph client with specific addr and configuration, if one does
- * not exist create it.  Either way, ceph_opts is consumed by this
- * function.
- */
-static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
-{
-	struct rbd_client *rbdc;
-
-	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
-	rbdc = rbd_client_find(ceph_opts);
-	if (rbdc)	/* using an existing client */
-		ceph_destroy_options(ceph_opts);
-	else
-		rbdc = rbd_client_create(ceph_opts);
-	mutex_unlock(&client_mutex);
-
-	return rbdc;
-}
-
-/*
  * Destroy ceph client
  *
  * Caller must hold rbd_client_list_lock.
@@ -904,6 +858,56 @@ static void rbd_put_client(struct rbd_client *rbdc)
 	kref_put(&rbdc->kref, rbd_client_release);
 }
 
+static int wait_for_latest_osdmap(struct ceph_client *client)
+{
+	u64 newest_epoch;
+	int ret;
+
+	ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
+	if (ret)
+		return ret;
+
+	if (client->osdc.osdmap->epoch >= newest_epoch)
+		return 0;
+
+	ceph_osdc_maybe_request_map(&client->osdc);
+	return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
+				     client->options->mount_timeout);
+}
+
+/*
+ * Get a ceph client with specific addr and configuration, if one does
+ * not exist create it.  Either way, ceph_opts is consumed by this
+ * function.
+ */
+static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
+{
+	struct rbd_client *rbdc;
+	int ret;
+
+	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
+	rbdc = rbd_client_find(ceph_opts);
+	if (rbdc) {
+		ceph_destroy_options(ceph_opts);
+
+		/*
+		 * Using an existing client.  Make sure ->pg_pools is up to
+		 * date before we look up the pool id in do_rbd_add().
+		 */
+		ret = wait_for_latest_osdmap(rbdc->client);
+		if (ret) {
+			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
+			rbd_put_client(rbdc);
+			rbdc = ERR_PTR(ret);
+		}
+	} else {
+		rbdc = rbd_client_create(ceph_opts);
+	}
+	mutex_unlock(&client_mutex);
+
+	return rbdc;
+}
+
 static bool rbd_image_format_valid(u32 image_format)
 {
 	return image_format == 1 || image_format == 2;
@@ -1223,272 +1227,59 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
 	rbd_dev->mapping.features = 0;
 }
 
-static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
-{
-	u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
-
-	return offset & (segment_size - 1);
-}
-
-static u64 rbd_segment_length(struct rbd_device *rbd_dev,
-			      u64 offset, u64 length)
-{
-	u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
-
-	offset &= segment_size - 1;
-
-	rbd_assert(length <= U64_MAX - offset);
-	if (offset + length > segment_size)
-		length = segment_size - offset;
-
-	return length;
-}
-
-/*
- * bio helpers
- */
-
-static void bio_chain_put(struct bio *chain)
-{
-	struct bio *tmp;
-
-	while (chain) {
-		tmp = chain;
-		chain = chain->bi_next;
-		bio_put(tmp);
-	}
-}
-
-/*
- * zeros a bio chain, starting at specific offset
- */
-static void zero_bio_chain(struct bio *chain, int start_ofs)
+static void zero_bvec(struct bio_vec *bv)
 {
-	struct bio_vec bv;
-	struct bvec_iter iter;
-	unsigned long flags;
 	void *buf;
-	int pos = 0;
-
-	while (chain) {
-		bio_for_each_segment(bv, chain, iter) {
-			if (pos + bv.bv_len > start_ofs) {
-				int remainder = max(start_ofs - pos, 0);
-				buf = bvec_kmap_irq(&bv, &flags);
-				memset(buf + remainder, 0,
-				       bv.bv_len - remainder);
-				flush_dcache_page(bv.bv_page);
-				bvec_kunmap_irq(buf, &flags);
-			}
-			pos += bv.bv_len;
-		}
+	unsigned long flags;
 
-		chain = chain->bi_next;
-	}
+	buf = bvec_kmap_irq(bv, &flags);
+	memset(buf, 0, bv->bv_len);
+	flush_dcache_page(bv->bv_page);
+	bvec_kunmap_irq(buf, &flags);
 }
 
-/*
- * similar to zero_bio_chain(), zeros data defined by a page array,
- * starting at the given byte offset from the start of the array and
- * continuing up to the given end offset.  The pages array is
- * assumed to be big enough to hold all bytes up to the end.
- */
-static void zero_pages(struct page **pages, u64 offset, u64 end)
+static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
 {
-	struct page **page = &pages[offset >> PAGE_SHIFT];
+	struct ceph_bio_iter it = *bio_pos;
 
-	rbd_assert(end > offset);
-	rbd_assert(end - offset <= (u64)SIZE_MAX);
-	while (offset < end) {
-		size_t page_offset;
-		size_t length;
-		unsigned long flags;
-		void *kaddr;
-
-		page_offset = offset & ~PAGE_MASK;
-		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
-		local_irq_save(flags);
-		kaddr = kmap_atomic(*page);
-		memset(kaddr + page_offset, 0, length);
-		flush_dcache_page(*page);
-		kunmap_atomic(kaddr);
-		local_irq_restore(flags);
-
-		offset += length;
-		page++;
-	}
+	ceph_bio_iter_advance(&it, off);
+	ceph_bio_iter_advance_step(&it, bytes, ({
+		zero_bvec(&bv);
+	}));
 }
 
-/*
- * Clone a portion of a bio, starting at the given byte offset
- * and continuing for the number of bytes indicated.
- */
-static struct bio *bio_clone_range(struct bio *bio_src,
-				   unsigned int offset,
-				   unsigned int len,
-				   gfp_t gfpmask)
+static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
 {
-	struct bio *bio;
-
-	bio = bio_clone_fast(bio_src, gfpmask, rbd_bio_clone);
-	if (!bio)
-		return NULL;	/* ENOMEM */
+	struct ceph_bvec_iter it = *bvec_pos;
 
-	bio_advance(bio, offset);
-	bio->bi_iter.bi_size = len;
-
-	return bio;
+	ceph_bvec_iter_advance(&it, off);
+	ceph_bvec_iter_advance_step(&it, bytes, ({
+		zero_bvec(&bv);
+	}));
 }
 
 /*
- * Clone a portion of a bio chain, starting at the given byte offset
- * into the first bio in the source chain and continuing for the
- * number of bytes indicated.  The result is another bio chain of
- * exactly the given length, or a null pointer on error.
- *
- * The bio_src and offset parameters are both in-out.  On entry they
- * refer to the first source bio and the offset into that bio where
- * the start of data to be cloned is located.
+ * Zero a range in @obj_req data buffer defined by a bio (list) or
+ * (private) bio_vec array.
 *
- * On return, bio_src is updated to refer to the bio in the source
- * chain that contains first un-cloned byte, and *offset will
- * contain the offset of that byte within that bio.
+ * @off is relative to the start of the data buffer.
 */
-static struct bio *bio_chain_clone_range(struct bio **bio_src,
-					 unsigned int *offset,
-					 unsigned int len,
-					 gfp_t gfpmask)
+static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
+			       u32 bytes)
 {
-	struct bio *bi = *bio_src;
-	unsigned int off = *offset;
-	struct bio *chain = NULL;
-	struct bio **end;
-
-	/* Build up a chain of clone bios up to the limit */
-
-	if (!bi || off >= bi->bi_iter.bi_size || !len)
-		return NULL;		/* Nothing to clone */
-
-	end = &chain;
-	while (len) {
-		unsigned int bi_size;
-		struct bio *bio;
-
-		if (!bi) {
-			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
-			goto out_err;	/* EINVAL; ran out of bio's */
-		}
-		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
-		bio = bio_clone_range(bi, off, bi_size, gfpmask);
-		if (!bio)
-			goto out_err;	/* ENOMEM */
-
-		*end = bio;
-		end = &bio->bi_next;
-
-		off += bi_size;
-		if (off == bi->bi_iter.bi_size) {
-			bi = bi->bi_next;
-			off = 0;
-		}
-		len -= bi_size;
-	}
-	*bio_src = bi;
-	*offset = off;
-
-	return chain;
-out_err:
-	bio_chain_put(chain);
-
-	return NULL;
-}
-
-/*
- * The default/initial value for all object request flags is 0.  For
- * each flag, once its value is set to 1 it is never reset to 0
- * again.
- */
-static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
-{
-	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
-		struct rbd_device *rbd_dev;
-
-		rbd_dev = obj_request->img_request->rbd_dev;
-		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
-			obj_request);
-	}
-}
-
-static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
-{
-	smp_mb();
-	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
-}
-
-static void obj_request_done_set(struct rbd_obj_request *obj_request)
-{
-	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
-		struct rbd_device *rbd_dev = NULL;
-
-		if (obj_request_img_data_test(obj_request))
-			rbd_dev = obj_request->img_request->rbd_dev;
-		rbd_warn(rbd_dev, "obj_request %p already marked done",
-			obj_request);
+	switch (obj_req->img_request->data_type) {
+	case OBJ_REQUEST_BIO:
+		zero_bios(&obj_req->bio_pos, off, bytes);
+		break;
+	case OBJ_REQUEST_BVECS:
+	case OBJ_REQUEST_OWN_BVECS:
+		zero_bvecs(&obj_req->bvec_pos, off, bytes);
+		break;
+	default:
+		rbd_assert(0);
 	}
 }
 
-static bool obj_request_done_test(struct rbd_obj_request *obj_request)
-{
-	smp_mb();
-	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
-}
-
-/*
- * This sets the KNOWN flag after (possibly) setting the EXISTS
- * flag.  The latter is set based on the "exists" value provided.
- *
- * Note that for our purposes once an object exists it never goes
- * away again.  It's possible that the response from two existence
- * checks are separated by the creation of the target object, and
- * the first ("doesn't exist") response arrives *after* the second
- * ("does exist").  In that case we ignore the second one.
- */
-static void obj_request_existence_set(struct rbd_obj_request *obj_request,
-				      bool exists)
-{
-	if (exists)
-		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
-	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
-	smp_mb();
-}
-
-static bool obj_request_known_test(struct rbd_obj_request *obj_request)
-{
-	smp_mb();
-	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
-}
-
-static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
-{
-	smp_mb();
-	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
-}
-
-static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
-{
-	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
-
-	return obj_request->img_offset <
-	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
-}
-
-static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
-{
-	dout("%s: obj %p (was %d)\n", __func__, obj_request,
-	     kref_read(&obj_request->kref));
-	kref_get(&obj_request->kref);
-}
-
 static void rbd_obj_request_destroy(struct kref *kref);
 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
 {
@@ -1505,18 +1296,13 @@ static void rbd_img_request_get(struct rbd_img_request *img_request)
 	kref_get(&img_request->kref);
 }
 
-static bool img_request_child_test(struct rbd_img_request *img_request);
-static void rbd_parent_request_destroy(struct kref *kref);
 static void rbd_img_request_destroy(struct kref *kref);
 static void rbd_img_request_put(struct rbd_img_request *img_request)
 {
 	rbd_assert(img_request != NULL);
 	dout("%s: img %p (was %d)\n", __func__, img_request,
 	     kref_read(&img_request->kref));
-	if (img_request_child_test(img_request))
-		kref_put(&img_request->kref, rbd_parent_request_destroy);
-	else
-		kref_put(&img_request->kref, rbd_img_request_destroy);
+	kref_put(&img_request->kref, rbd_img_request_destroy);
 }
 
 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
@@ -1526,139 +1312,37 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
 
 	/* Image request now owns object's original reference */
 	obj_request->img_request = img_request;
-	obj_request->which = img_request->obj_request_count;
-	rbd_assert(!obj_request_img_data_test(obj_request));
-	obj_request_img_data_set(obj_request);
-	rbd_assert(obj_request->which != BAD_WHICH);
 	img_request->obj_request_count++;
-	list_add_tail(&obj_request->links, &img_request->obj_requests);
-	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
-	     obj_request->which);
+	img_request->pending_count++;
+	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
 }
 
 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
 					   struct rbd_obj_request *obj_request)
 {
-	rbd_assert(obj_request->which != BAD_WHICH);
-
-	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
-	     obj_request->which);
-	list_del(&obj_request->links);
+	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
+	list_del(&obj_request->ex.oe_item);
 	rbd_assert(img_request->obj_request_count > 0);
 	img_request->obj_request_count--;
-	rbd_assert(obj_request->which == img_request->obj_request_count);
-	obj_request->which = BAD_WHICH;
-	rbd_assert(obj_request_img_data_test(obj_request));
 	rbd_assert(obj_request->img_request == img_request);
-	obj_request->img_request = NULL;
-	obj_request->callback = NULL;
 	rbd_obj_request_put(obj_request);
 }
 
-static bool obj_request_type_valid(enum obj_request_type type)
-{
-	switch (type) {
-	case OBJ_REQUEST_NODATA:
-	case OBJ_REQUEST_BIO:
-	case OBJ_REQUEST_PAGES:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static void rbd_img_obj_callback(struct rbd_obj_request *obj_request);
-
 static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
 {
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
 
 	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
-	     obj_request, obj_request->object_no, obj_request->offset,
-	     obj_request->length, osd_req);
-	if (obj_request_img_data_test(obj_request)) {
-		WARN_ON(obj_request->callback != rbd_img_obj_callback);
-		rbd_img_request_get(obj_request->img_request);
-	}
+	     obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
+	     obj_request->ex.oe_len, osd_req);
 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
 }
 
-static void rbd_img_request_complete(struct rbd_img_request *img_request)
-{
-
-	dout("%s: img %p\n", __func__, img_request);
-
-	/*
-	 * If no error occurred, compute the aggregate transfer
-	 * count for the image request.  We could instead use
-	 * atomic64_cmpxchg() to update it as each object request
-	 * completes; not clear which way is better off hand.
-	 */
-	if (!img_request->result) {
-		struct rbd_obj_request *obj_request;
-		u64 xferred = 0;
-
-		for_each_obj_request(img_request, obj_request)
-			xferred += obj_request->xferred;
-		img_request->xferred = xferred;
-	}
-
-	if (img_request->callback)
-		img_request->callback(img_request);
-	else
-		rbd_img_request_put(img_request);
-}
-
 /*
  * The default/initial value for all image request flags is 0.  Each
  * is conditionally set to 1 at image request initialization time
  * and currently never change thereafter.
  */
-static void img_request_write_set(struct rbd_img_request *img_request)
-{
-	set_bit(IMG_REQ_WRITE, &img_request->flags);
-	smp_mb();
-}
-
-static bool img_request_write_test(struct rbd_img_request *img_request)
-{
-	smp_mb();
-	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
-}
-
-/*
- * Set the discard flag when the img_request is an discard request
- */
-static void img_request_discard_set(struct rbd_img_request *img_request)
-{
-	set_bit(IMG_REQ_DISCARD, &img_request->flags);
-	smp_mb();
-}
-
-static bool img_request_discard_test(struct rbd_img_request *img_request)
-{
-	smp_mb();
-	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
-}
-
-static void img_request_child_set(struct rbd_img_request *img_request)
-{
-	set_bit(IMG_REQ_CHILD, &img_request->flags);
-	smp_mb();
-}
-
-static void img_request_child_clear(struct rbd_img_request *img_request)
-{
-	clear_bit(IMG_REQ_CHILD, &img_request->flags);
-	smp_mb();
-}
-
-static bool img_request_child_test(struct rbd_img_request *img_request)
-{
-	smp_mb();
-	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
-}
-
 static void img_request_layered_set(struct rbd_img_request *img_request)
 {
 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
@@ -1677,209 +1361,70 @@ static bool img_request_layered_test(struct rbd_img_request *img_request)
 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
 }
 
-static enum obj_operation_type
-rbd_img_request_op_type(struct rbd_img_request *img_request)
-{
-	if (img_request_write_test(img_request))
-		return OBJ_OP_WRITE;
-	else if (img_request_discard_test(img_request))
-		return OBJ_OP_DISCARD;
-	else
-		return OBJ_OP_READ;
-}
-
-static void
-rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
-{
-	u64 xferred = obj_request->xferred;
-	u64 length = obj_request->length;
-
-	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
-		obj_request, obj_request->img_request, obj_request->result,
-		xferred, length);
-	/*
-	 * ENOENT means a hole in the image.  We zero-fill the entire
-	 * length of the request.  A short read also implies zero-fill
-	 * to the end of the request.  An error requires the whole
-	 * length of the request to be reported finished with an error
-	 * to the block layer.  In each case we update the xferred
-	 * count to indicate the whole request was satisfied.
-	 */
-	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
-	if (obj_request->result == -ENOENT) {
-		if (obj_request->type == OBJ_REQUEST_BIO)
-			zero_bio_chain(obj_request->bio_list, 0);
-		else
-			zero_pages(obj_request->pages, 0, length);
-		obj_request->result = 0;
-	} else if (xferred < length && !obj_request->result) {
-		if (obj_request->type == OBJ_REQUEST_BIO)
-			zero_bio_chain(obj_request->bio_list, xferred);
-		else
-			zero_pages(obj_request->pages, xferred, length);
-	}
-	obj_request->xferred = length;
-	obj_request_done_set(obj_request);
-}
-
-static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
+static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
 {
-	dout("%s: obj %p cb %p\n", __func__, obj_request,
-		obj_request->callback);
-	obj_request->callback(obj_request);
-}
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
 
-static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err)
-{
-	obj_request->result = err;
-	obj_request->xferred = 0;
-	/*
-	 * kludge - mirror rbd_obj_request_submit() to match a put in
-	 * rbd_img_obj_callback()
-	 */
-	if (obj_request_img_data_test(obj_request)) {
-		WARN_ON(obj_request->callback != rbd_img_obj_callback);
-		rbd_img_request_get(obj_request->img_request);
-	}
-	obj_request_done_set(obj_request);
-	rbd_obj_request_complete(obj_request);
+	return !obj_req->ex.oe_off &&
+	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
 }
 
-static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
+static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
 {
-	struct rbd_img_request *img_request = NULL;
-	struct rbd_device *rbd_dev = NULL;
-	bool layered = false;
-
-	if (obj_request_img_data_test(obj_request)) {
-		img_request = obj_request->img_request;
-		layered = img_request && img_request_layered_test(img_request);
-		rbd_dev = img_request->rbd_dev;
-	}
-
-	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
-		obj_request, img_request, obj_request->result,
-		obj_request->xferred, obj_request->length);
-	if (layered && obj_request->result == -ENOENT &&
-	    obj_request->img_offset < rbd_dev->parent_overlap)
-		rbd_img_parent_read(obj_request);
-	else if (img_request)
-		rbd_img_obj_request_read_callback(obj_request);
-	else
-		obj_request_done_set(obj_request);
-}
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
 
-static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
-{
-	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
-		obj_request->result, obj_request->length);
-	/*
-	 * There is no such thing as a successful short write.  Set
-	 * it to our originally-requested length.
-	 */
-	obj_request->xferred = obj_request->length;
-	obj_request_done_set(obj_request);
+	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
+	       rbd_dev->layout.object_size;
 }
 
-static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
+static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
 {
-	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
-		obj_request->result, obj_request->length);
-	/*
-	 * There is no such thing as a successful short discard.  Set
-	 * it to our originally-requested length.
-	 */
-	obj_request->xferred = obj_request->length;
-	/* discarding a non-existent object is not a problem */
-	if (obj_request->result == -ENOENT)
-		obj_request->result = 0;
-	obj_request_done_set(obj_request);
+	return ceph_file_extents_bytes(obj_req->img_extents,
+				       obj_req->num_img_extents);
 }
 
-/*
- * For a simple stat call there's nothing to do.  We'll do more if
- * this is part of a write sequence for a layered image.
- */
-static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
+static bool rbd_img_is_write(struct rbd_img_request *img_req)
 {
-	dout("%s: obj %p\n", __func__, obj_request);
-	obj_request_done_set(obj_request);
+	switch (img_req->op_type) {
+	case OBJ_OP_READ:
+		return false;
+	case OBJ_OP_WRITE:
+	case OBJ_OP_DISCARD:
+		return true;
+	default:
+		rbd_assert(0);
+	}
 }
 
-static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
-{
-	dout("%s: obj %p\n", __func__, obj_request);
-
-	if (obj_request_img_data_test(obj_request))
-		rbd_osd_copyup_callback(obj_request);
-	else
-		obj_request_done_set(obj_request);
-}
+static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
 
 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
-	struct rbd_obj_request *obj_request = osd_req->r_priv;
-	u16 opcode;
-
-	dout("%s: osd_req %p\n", __func__, osd_req);
-	rbd_assert(osd_req == obj_request->osd_req);
-	if (obj_request_img_data_test(obj_request)) {
-		rbd_assert(obj_request->img_request);
-		rbd_assert(obj_request->which != BAD_WHICH);
-	} else {
-		rbd_assert(obj_request->which == BAD_WHICH);
-	}
-
-	if (osd_req->r_result < 0)
-		obj_request->result = osd_req->r_result;
-
-	/*
-	 * We support a 64-bit length, but ultimately it has to be
-	 * passed to the block layer, which just supports a 32-bit
-	 * length field.
-	 */
-	obj_request->xferred = osd_req->r_ops[0].outdata_len;
-	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
 
-	opcode = osd_req->r_ops[0].op;
-	switch (opcode) {
-	case CEPH_OSD_OP_READ:
-		rbd_osd_read_callback(obj_request);
-		break;
-	case CEPH_OSD_OP_SETALLOCHINT:
-		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
-			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
-		/* fall through */
-	case CEPH_OSD_OP_WRITE:
-	case CEPH_OSD_OP_WRITEFULL:
-		rbd_osd_write_callback(obj_request);
-		break;
-	case CEPH_OSD_OP_STAT:
-		rbd_osd_stat_callback(obj_request);
-		break;
-	case CEPH_OSD_OP_DELETE:
-	case CEPH_OSD_OP_TRUNCATE:
-	case CEPH_OSD_OP_ZERO:
-		rbd_osd_discard_callback(obj_request);
-		break;
-	case CEPH_OSD_OP_CALL:
-		rbd_osd_call_callback(obj_request);
-		break;
-	default:
-		rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d",
-			 obj_request->object_no, opcode);
-		break;
-	}
+	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
+	     osd_req->r_result, obj_req);
+	rbd_assert(osd_req == obj_req->osd_req);
 
-	if (obj_request_done_test(obj_request))
-		rbd_obj_request_complete(obj_request);
+	obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
+	if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
+		obj_req->xferred = osd_req->r_result;
+	else
+		/*
+		 * Writes aren't allowed to return a data payload.  In some
+		 * guarded write cases (e.g. stat + zero on an empty object)
+		 * a stat response makes it through, but we don't care.
+		 */
+		obj_req->xferred = 0;
+
+	rbd_obj_handle_request(obj_req);
 }
 
 static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 {
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
 
-	rbd_assert(obj_request_img_data_test(obj_request));
+	osd_req->r_flags = CEPH_OSD_FLAG_READ;
 	osd_req->r_snapid = obj_request->img_request->snap_id;
 }
 
@@ -1887,32 +1432,33 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 {
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
 
+	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
 	ktime_get_real_ts(&osd_req->r_mtime);
-	osd_req->r_data_offset = obj_request->offset;
+	osd_req->r_data_offset = obj_request->ex.oe_off;
 }
 
 static struct ceph_osd_request *
-__rbd_osd_req_create(struct rbd_device *rbd_dev,
-		     struct ceph_snap_context *snapc,
-		     int num_ops, unsigned int flags,
-		     struct rbd_obj_request *obj_request)
+rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
 {
+	struct rbd_img_request *img_req = obj_req->img_request;
+	struct rbd_device *rbd_dev = img_req->rbd_dev;
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	struct ceph_osd_request *req;
 	const char *name_format = rbd_dev->image_format == 1 ?
				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
 
-	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
+	req = ceph_osdc_alloc_request(osdc,
+			(rbd_img_is_write(img_req) ? img_req->snapc : NULL),
+			num_ops, false, GFP_NOIO);
 	if (!req)
 		return NULL;
 
-	req->r_flags = flags;
 	req->r_callback = rbd_osd_req_callback;
-	req->r_priv = obj_request;
+	req->r_priv = obj_req;
 
 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
 	if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
-			rbd_dev->header.object_prefix, obj_request->object_no))
+			rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
 		goto err_req;
 
 	if (ceph_osdc_alloc_messages(req, GFP_NOIO))
@@ -1925,83 +1471,20 @@ err_req:
 	return NULL;
 }
 
-/*
- * Create an osd request.  A read request has one osd op (read).
- * A write request has either one (watch) or two (hint+write) osd ops.
- * (All rbd data writes are prefixed with an allocation hint op, but
- * technically osd watch is a write request, hence this distinction.)
- */
-static struct ceph_osd_request *rbd_osd_req_create(
-					struct rbd_device *rbd_dev,
-					enum obj_operation_type op_type,
-					unsigned int num_ops,
-					struct rbd_obj_request *obj_request)
-{
-	struct ceph_snap_context *snapc = NULL;
-
-	if (obj_request_img_data_test(obj_request) &&
-		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
-		struct rbd_img_request *img_request = obj_request->img_request;
-		if (op_type == OBJ_OP_WRITE) {
-			rbd_assert(img_request_write_test(img_request));
-		} else {
-			rbd_assert(img_request_discard_test(img_request));
-		}
-		snapc = img_request->snapc;
-	}
-
-	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
-
-	return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
-	    (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
-	    CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request);
-}
-
-/*
- * Create a copyup osd request based on the information in the object
- * request supplied.  A copyup request has two or three osd ops, a
- * copyup method call, potentially a hint op, and a write or truncate
- * or zero op.
- */
-static struct ceph_osd_request *
-rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
-{
-	struct rbd_img_request *img_request;
-	int num_osd_ops = 3;
-
-	rbd_assert(obj_request_img_data_test(obj_request));
-	img_request = obj_request->img_request;
-	rbd_assert(img_request);
-	rbd_assert(img_request_write_test(img_request) ||
-		   img_request_discard_test(img_request));
-
-	if (img_request_discard_test(img_request))
-		num_osd_ops = 2;
-
-	return __rbd_osd_req_create(img_request->rbd_dev,
-				    img_request->snapc, num_osd_ops,
-				    CEPH_OSD_FLAG_WRITE, obj_request);
-}
-
 static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
 {
 	ceph_osdc_put_request(osd_req);
 }
 
-static struct rbd_obj_request *
-rbd_obj_request_create(enum obj_request_type type)
+static struct rbd_obj_request *rbd_obj_request_create(void)
 {
 	struct rbd_obj_request *obj_request;
 
-	rbd_assert(obj_request_type_valid(type));
-
 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
 	if (!obj_request)
 		return NULL;
 
-	obj_request->which = BAD_WHICH;
-	obj_request->type = type;
-	INIT_LIST_HEAD(&obj_request->links);
+	ceph_object_extent_init(&obj_request->ex);
 	kref_init(&obj_request->kref);
 
 	dout("%s %p\n", __func__, obj_request);
@@ -2011,32 +1494,34 @@ rbd_obj_request_create(enum obj_request_type type)
 static void rbd_obj_request_destroy(struct kref *kref)
 {
 	struct rbd_obj_request *obj_request;
+	u32 i;
 
 	obj_request = container_of(kref, struct rbd_obj_request, kref);
 
 	dout("%s: obj %p\n", __func__, obj_request);
 
-	rbd_assert(obj_request->img_request == NULL);
-	rbd_assert(obj_request->which == BAD_WHICH);
-
 	if (obj_request->osd_req)
 		rbd_osd_req_destroy(obj_request->osd_req);
 
-	rbd_assert(obj_request_type_valid(obj_request->type));
-	switch (obj_request->type) {
+	switch (obj_request->img_request->data_type) {
 	case OBJ_REQUEST_NODATA:
-		break;		/* Nothing to do */
 	case OBJ_REQUEST_BIO:
-		if (obj_request->bio_list)
-			bio_chain_put(obj_request->bio_list);
-		break;
-	case OBJ_REQUEST_PAGES:
-		/* img_data requests don't own their page array */
-		if (obj_request->pages &&
-		    !obj_request_img_data_test(obj_request))
-			ceph_release_page_vector(obj_request->pages,
-						obj_request->page_count);
+	case OBJ_REQUEST_BVECS:
+		break;		/* Nothing to do */
+	case OBJ_REQUEST_OWN_BVECS:
+		kfree(obj_request->bvec_pos.bvecs);
 		break;
+	default:
+		rbd_assert(0);
+	}
+
+	kfree(obj_request->img_extents);
+	if (obj_request->copyup_bvecs) {
+		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
+			if (obj_request->copyup_bvecs[i].bv_page)
+				__free_page(obj_request->copyup_bvecs[i].bv_page);
+		}
+		kfree(obj_request->copyup_bvecs);
 	}
 
 	kmem_cache_free(rbd_obj_request_cache, obj_request);
@@ -2111,7 +1596,6 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
  */
 static struct rbd_img_request *rbd_img_request_create(
 					struct rbd_device *rbd_dev,
-					u64 offset, u64 length,
 					enum obj_operation_type op_type,
 					struct ceph_snap_context *snapc)
 {
@@ -2122,27 +1606,21 @@ static struct rbd_img_request *rbd_img_request_create(
 		return NULL;
 
 	img_request->rbd_dev = rbd_dev;
-	img_request->offset = offset;
-	img_request->length = length;
-	if (op_type == OBJ_OP_DISCARD) {
-		img_request_discard_set(img_request);
-		img_request->snapc = snapc;
-	} else if (op_type == OBJ_OP_WRITE) {
-		img_request_write_set(img_request);
-		img_request->snapc = snapc;
-	} else {
+	img_request->op_type = op_type;
+	if (!rbd_img_is_write(img_request))
 		img_request->snap_id = rbd_dev->spec->snap_id;
-	}
+	else
+		img_request->snapc = snapc;
+
 	if (rbd_dev_parent_get(rbd_dev))
 		img_request_layered_set(img_request);
 
 	spin_lock_init(&img_request->completion_lock);
-	INIT_LIST_HEAD(&img_request->obj_requests);
+	INIT_LIST_HEAD(&img_request->object_extents);
 	kref_init(&img_request->kref);
 
-	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
-		obj_op_name(op_type), offset, length, img_request);
-
+	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
+	     obj_op_name(op_type), img_request);
 	return img_request;
 }
 
@@ -2165,829 +1643,934 @@ static void rbd_img_request_destroy(struct kref *kref)
2165 rbd_dev_parent_put(img_request->rbd_dev); 1643 rbd_dev_parent_put(img_request->rbd_dev);
2166 } 1644 }
2167 1645
2168 if (img_request_write_test(img_request) || 1646 if (rbd_img_is_write(img_request))
2169 img_request_discard_test(img_request))
2170 ceph_put_snap_context(img_request->snapc); 1647 ceph_put_snap_context(img_request->snapc);
2171 1648
2172 kmem_cache_free(rbd_img_request_cache, img_request); 1649 kmem_cache_free(rbd_img_request_cache, img_request);
2173} 1650}
2174 1651
2175static struct rbd_img_request *rbd_parent_request_create( 1652static void prune_extents(struct ceph_file_extent *img_extents,
2176 struct rbd_obj_request *obj_request, 1653 u32 *num_img_extents, u64 overlap)
2177 u64 img_offset, u64 length)
2178{ 1654{
2179 struct rbd_img_request *parent_request; 1655 u32 cnt = *num_img_extents;
2180 struct rbd_device *rbd_dev;
2181 1656
2182 rbd_assert(obj_request->img_request); 1657 /* drop extents completely beyond the overlap */
2183 rbd_dev = obj_request->img_request->rbd_dev; 1658 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
1659 cnt--;
2184 1660
2185 parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, 1661 if (cnt) {
2186 length, OBJ_OP_READ, NULL); 1662 struct ceph_file_extent *ex = &img_extents[cnt - 1];
2187 if (!parent_request)
2188 return NULL;
2189 1663
2190 img_request_child_set(parent_request); 1664 /* trim final overlapping extent */
2191 rbd_obj_request_get(obj_request); 1665 if (ex->fe_off + ex->fe_len > overlap)
2192 parent_request->obj_request = obj_request; 1666 ex->fe_len = overlap - ex->fe_off;
1667 }
2193 1668
2194 return parent_request; 1669 *num_img_extents = cnt;
2195} 1670}
2196 1671
2197static void rbd_parent_request_destroy(struct kref *kref) 1672/*
1673 * Determine the byte range(s) covered by either just the object extent
1674 * or the entire object in the parent image.
1675 */
1676static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
1677 bool entire)
2198{ 1678{
2199 struct rbd_img_request *parent_request; 1679 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2200 struct rbd_obj_request *orig_request; 1680 int ret;
2201 1681
2202 parent_request = container_of(kref, struct rbd_img_request, kref); 1682 if (!rbd_dev->parent_overlap)
2203 orig_request = parent_request->obj_request; 1683 return 0;
2204 1684
2205 parent_request->obj_request = NULL; 1685 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
2206 rbd_obj_request_put(orig_request); 1686 entire ? 0 : obj_req->ex.oe_off,
2207 img_request_child_clear(parent_request); 1687 entire ? rbd_dev->layout.object_size :
1688 obj_req->ex.oe_len,
1689 &obj_req->img_extents,
1690 &obj_req->num_img_extents);
1691 if (ret)
1692 return ret;
2208 1693
2209 rbd_img_request_destroy(kref); 1694 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
1695 rbd_dev->parent_overlap);
1696 return 0;
2210} 1697}
2211 1698
2212static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 1699static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
2213{ 1700{
2214 struct rbd_img_request *img_request; 1701 switch (obj_req->img_request->data_type) {
2215 unsigned int xferred; 1702 case OBJ_REQUEST_BIO:
2216 int result; 1703 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
2217 bool more; 1704 &obj_req->bio_pos,
2218 1705 obj_req->ex.oe_len);
2219 rbd_assert(obj_request_img_data_test(obj_request)); 1706 break;
2220 img_request = obj_request->img_request; 1707 case OBJ_REQUEST_BVECS:
2221 1708 case OBJ_REQUEST_OWN_BVECS:
2222 rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 1709 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
2223 xferred = (unsigned int)obj_request->xferred; 1710 obj_req->ex.oe_len);
2224 result = obj_request->result; 1711 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
2225 if (result) { 1712 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
2226 struct rbd_device *rbd_dev = img_request->rbd_dev; 1713 &obj_req->bvec_pos);
2227 enum obj_operation_type op_type; 1714 break;
2228 1715 default:
2229 if (img_request_discard_test(img_request)) 1716 rbd_assert(0);
2230 op_type = OBJ_OP_DISCARD;
2231 else if (img_request_write_test(img_request))
2232 op_type = OBJ_OP_WRITE;
2233 else
2234 op_type = OBJ_OP_READ;
2235
2236 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
2237 obj_op_name(op_type), obj_request->length,
2238 obj_request->img_offset, obj_request->offset);
2239 rbd_warn(rbd_dev, " result %d xferred %x",
2240 result, xferred);
2241 if (!img_request->result)
2242 img_request->result = result;
2243 /*
2244 * Need to end I/O on the entire obj_request worth of
2245 * bytes in case of error.
2246 */
2247 xferred = obj_request->length;
2248 } 1717 }
1718}
2249 1719
2250 if (img_request_child_test(img_request)) { 1720static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
2251 rbd_assert(img_request->obj_request != NULL); 1721{
2252 more = obj_request->which < img_request->obj_request_count - 1; 1722 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
2253 } else { 1723 if (!obj_req->osd_req)
2254 blk_status_t status = errno_to_blk_status(result); 1724 return -ENOMEM;
2255 1725
2256 rbd_assert(img_request->rq != NULL); 1726 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
1727 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
1728 rbd_osd_req_setup_data(obj_req, 0);
2257 1729
2258 more = blk_update_request(img_request->rq, status, xferred); 1730 rbd_osd_req_format_read(obj_req);
2259 if (!more) 1731 return 0;
2260 __blk_mq_end_request(img_request->rq, status); 1732}
2261 } 1733
1734static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
1735 unsigned int which)
1736{
1737 struct page **pages;
2262 1738
2263 return more; 1739 /*
1740 * The response data for a STAT call consists of:
1741 * le64 length;
1742 * struct {
1743 * le32 tv_sec;
1744 * le32 tv_nsec;
1745 * } mtime;
1746 */
1747 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1748 if (IS_ERR(pages))
1749 return PTR_ERR(pages);
1750
1751 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
1752 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
1753 8 + sizeof(struct ceph_timespec),
1754 0, false, true);
1755 return 0;
2264} 1756}
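The 8 + sizeof(struct ceph_timespec) reply buffer sized above would decode as shown below; a sketch with a hypothetical decoder, assuming a little-endian host (in the kernel this would go through le64_to_cpu()/le32_to_cpu()). Here the driver never parses the payload itself: only the op's return code, 0 versus -ENOENT, matters.

#include <stdint.h>
#include <string.h>

/* Wire layout of a CEPH_OSD_OP_STAT reply, per the comment above. */
struct stat_reply {
	uint64_t length;	/* le64 object size in bytes */
	uint32_t tv_sec;	/* le32 mtime seconds */
	uint32_t tv_nsec;	/* le32 mtime nanoseconds */
};

static void decode_stat(const unsigned char *buf, struct stat_reply *out)
{
	memcpy(&out->length, buf, 8);
	memcpy(&out->tv_sec, buf + 8, 4);
	memcpy(&out->tv_nsec, buf + 12, 4);
}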
2265 1757
2266static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 1758static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
1759 unsigned int which)
2267{ 1760{
2268 struct rbd_img_request *img_request; 1761 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2269 u32 which = obj_request->which; 1762 u16 opcode;
2270 bool more = true;
2271 1763
2272 rbd_assert(obj_request_img_data_test(obj_request)); 1764 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
2273 img_request = obj_request->img_request; 1765 rbd_dev->layout.object_size,
1766 rbd_dev->layout.object_size);
2274 1767
2275 dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 1768 if (rbd_obj_is_entire(obj_req))
2276 rbd_assert(img_request != NULL); 1769 opcode = CEPH_OSD_OP_WRITEFULL;
2277 rbd_assert(img_request->obj_request_count > 0); 1770 else
2278 rbd_assert(which != BAD_WHICH); 1771 opcode = CEPH_OSD_OP_WRITE;
2279 rbd_assert(which < img_request->obj_request_count);
2280 1772
2281 spin_lock_irq(&img_request->completion_lock); 1773 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
2282 if (which != img_request->next_completion) 1774 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2283 goto out; 1775 rbd_osd_req_setup_data(obj_req, which++);
1776
1777 rbd_assert(which == obj_req->osd_req->r_num_ops);
1778 rbd_osd_req_format_write(obj_req);
1779}
2284 1780
2285 for_each_obj_request_from(img_request, obj_request) { 1781static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
2286 rbd_assert(more); 1782{
2287 rbd_assert(which < img_request->obj_request_count); 1783 unsigned int num_osd_ops, which = 0;
1784 int ret;
2288 1785
2289 if (!obj_request_done_test(obj_request)) 1786 /* reverse map the entire object onto the parent */
2290 break; 1787 ret = rbd_obj_calc_img_extents(obj_req, true);
2291 more = rbd_img_obj_end_request(obj_request); 1788 if (ret)
2292 which++; 1789 return ret;
1790
1791 if (obj_req->num_img_extents) {
1792 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1793 num_osd_ops = 3; /* stat + setallochint + write/writefull */
1794 } else {
1795 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1796 num_osd_ops = 2; /* setallochint + write/writefull */
2293 } 1797 }
2294 1798
2295 rbd_assert(more ^ (which == img_request->obj_request_count)); 1799 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
2296 img_request->next_completion = which; 1800 if (!obj_req->osd_req)
2297out: 1801 return -ENOMEM;
2298 spin_unlock_irq(&img_request->completion_lock);
2299 rbd_img_request_put(img_request);
2300 1802
2301 if (!more) 1803 if (obj_req->num_img_extents) {
2302 rbd_img_request_complete(img_request); 1804 ret = __rbd_obj_setup_stat(obj_req, which++);
1805 if (ret)
1806 return ret;
1807 }
1808
1809 __rbd_obj_setup_write(obj_req, which);
1810 return 0;
2303} 1811}
2304 1812
2305/* 1813static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req,
2306 * Add individual osd ops to the given ceph_osd_request and prepare 1814 unsigned int which)
2307 * them for submission. num_ops is the current number of 1815{
2308 * osd operations already added to the object request.
2309 */
2310static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
2311 struct ceph_osd_request *osd_request,
2312 enum obj_operation_type op_type,
2313 unsigned int num_ops)
2314{
2315 struct rbd_img_request *img_request = obj_request->img_request;
2316 struct rbd_device *rbd_dev = img_request->rbd_dev;
2317 u64 object_size = rbd_obj_bytes(&rbd_dev->header);
2318 u64 offset = obj_request->offset;
2319 u64 length = obj_request->length;
2320 u64 img_end;
2321 u16 opcode; 1816 u16 opcode;
2322 1817
2323 if (op_type == OBJ_OP_DISCARD) { 1818 if (rbd_obj_is_entire(obj_req)) {
2324 if (!offset && length == object_size && 1819 if (obj_req->num_img_extents) {
2325 (!img_request_layered_test(img_request) || 1820 osd_req_op_init(obj_req->osd_req, which++,
2326 !obj_request_overlaps_parent(obj_request))) { 1821 CEPH_OSD_OP_CREATE, 0);
2327 opcode = CEPH_OSD_OP_DELETE;
2328 } else if ((offset + length == object_size)) {
2329 opcode = CEPH_OSD_OP_TRUNCATE; 1822 opcode = CEPH_OSD_OP_TRUNCATE;
2330 } else { 1823 } else {
2331 down_read(&rbd_dev->header_rwsem); 1824 osd_req_op_init(obj_req->osd_req, which++,
2332 img_end = rbd_dev->header.image_size; 1825 CEPH_OSD_OP_DELETE, 0);
2333 up_read(&rbd_dev->header_rwsem); 1826 opcode = 0;
2334
2335 if (obj_request->img_offset + length == img_end)
2336 opcode = CEPH_OSD_OP_TRUNCATE;
2337 else
2338 opcode = CEPH_OSD_OP_ZERO;
2339 } 1827 }
2340 } else if (op_type == OBJ_OP_WRITE) { 1828 } else if (rbd_obj_is_tail(obj_req)) {
2341 if (!offset && length == object_size) 1829 opcode = CEPH_OSD_OP_TRUNCATE;
2342 opcode = CEPH_OSD_OP_WRITEFULL;
2343 else
2344 opcode = CEPH_OSD_OP_WRITE;
2345 osd_req_op_alloc_hint_init(osd_request, num_ops,
2346 object_size, object_size);
2347 num_ops++;
2348 } else { 1830 } else {
2349 opcode = CEPH_OSD_OP_READ; 1831 opcode = CEPH_OSD_OP_ZERO;
2350 } 1832 }
2351 1833
2352 if (opcode == CEPH_OSD_OP_DELETE) 1834 if (opcode)
2353 osd_req_op_init(osd_request, num_ops, opcode, 0); 1835 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
2354 else 1836 obj_req->ex.oe_off, obj_req->ex.oe_len,
2355 osd_req_op_extent_init(osd_request, num_ops, opcode, 1837 0, 0);
2356 offset, length, 0, 0); 1838
2357 1839 rbd_assert(which == obj_req->osd_req->r_num_ops);
2358 if (obj_request->type == OBJ_REQUEST_BIO) 1840 rbd_osd_req_format_write(obj_req);
2359 osd_req_op_extent_osd_data_bio(osd_request, num_ops,
2360 obj_request->bio_list, length);
2361 else if (obj_request->type == OBJ_REQUEST_PAGES)
2362 osd_req_op_extent_osd_data_pages(osd_request, num_ops,
2363 obj_request->pages, length,
2364 offset & ~PAGE_MASK, false, false);
2365
2366 /* Discards are also writes */
2367 if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
2368 rbd_osd_req_format_write(obj_request);
2369 else
2370 rbd_osd_req_format_read(obj_request);
2371} 1841}
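The opcode choice in __rbd_obj_setup_discard() compresses to a small decision table. A sketch with hypothetical names; "parent data" means the reverse-mapped image extents computed above are non-empty:

#include <stdbool.h>

enum discard_op {
	D_DELETE,		/* whole object, no parent data under it */
	D_CREATE_TRUNCATE,	/* whole object, parent could show through */
	D_TRUNCATE,		/* discard runs to the end of the object */
	D_ZERO,			/* punch a hole in the middle */
};

static enum discard_op pick_discard_op(bool entire, bool tail,
				       bool has_parent_data)
{
	if (entire)
		return has_parent_data ? D_CREATE_TRUNCATE : D_DELETE;
	if (tail)
		return D_TRUNCATE;
	return D_ZERO;
}

Deleting the whole object is only safe when nothing from the parent would become visible through the resulting hole; otherwise the object is created (to mask the parent) and truncated to zero length.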
2372 1842
2373/* 1843static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
2374 * Split up an image request into one or more object requests, each
2375 * to a different object. The "type" parameter indicates whether
2376 * "data_desc" is the pointer to the head of a list of bio
2377 * structures, or the base of a page array. In either case this
2378 * function assumes data_desc describes memory sufficient to hold
2379 * all data described by the image request.
2380 */
2381static int rbd_img_request_fill(struct rbd_img_request *img_request,
2382 enum obj_request_type type,
2383 void *data_desc)
2384{ 1844{
2385 struct rbd_device *rbd_dev = img_request->rbd_dev; 1845 unsigned int num_osd_ops, which = 0;
2386 struct rbd_obj_request *obj_request = NULL; 1846 int ret;
2387 struct rbd_obj_request *next_obj_request;
2388 struct bio *bio_list = NULL;
2389 unsigned int bio_offset = 0;
2390 struct page **pages = NULL;
2391 enum obj_operation_type op_type;
2392 u64 img_offset;
2393 u64 resid;
2394
2395 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2396 (int)type, data_desc);
2397 1847
2398 img_offset = img_request->offset; 1848 /* reverse map the entire object onto the parent */
2399 resid = img_request->length; 1849 ret = rbd_obj_calc_img_extents(obj_req, true);
2400 rbd_assert(resid > 0); 1850 if (ret)
2401 op_type = rbd_img_request_op_type(img_request); 1851 return ret;
2402 1852
2403 if (type == OBJ_REQUEST_BIO) { 1853 if (rbd_obj_is_entire(obj_req)) {
2404 bio_list = data_desc; 1854 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2405 rbd_assert(img_offset == 1855 if (obj_req->num_img_extents)
2406 bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 1856 num_osd_ops = 2; /* create + truncate */
2407 } else if (type == OBJ_REQUEST_PAGES) { 1857 else
2408 pages = data_desc; 1858 num_osd_ops = 1; /* delete */
1859 } else {
1860 if (obj_req->num_img_extents) {
1861 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1862 num_osd_ops = 2; /* stat + truncate/zero */
1863 } else {
1864 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1865 num_osd_ops = 1; /* truncate/zero */
1866 }
2409 } 1867 }
2410 1868
2411 while (resid) { 1869 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
2412 struct ceph_osd_request *osd_req; 1870 if (!obj_req->osd_req)
2413 u64 object_no = img_offset >> rbd_dev->header.obj_order; 1871 return -ENOMEM;
2414 u64 offset = rbd_segment_offset(rbd_dev, img_offset);
2415 u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
2416
2417 obj_request = rbd_obj_request_create(type);
2418 if (!obj_request)
2419 goto out_unwind;
2420
2421 obj_request->object_no = object_no;
2422 obj_request->offset = offset;
2423 obj_request->length = length;
2424
2425 /*
2426 * set obj_request->img_request before creating the
2427 * osd_request so that it gets the right snapc
2428 */
2429 rbd_img_obj_request_add(img_request, obj_request);
2430
2431 if (type == OBJ_REQUEST_BIO) {
2432 unsigned int clone_size;
2433
2434 rbd_assert(length <= (u64)UINT_MAX);
2435 clone_size = (unsigned int)length;
2436 obj_request->bio_list =
2437 bio_chain_clone_range(&bio_list,
2438 &bio_offset,
2439 clone_size,
2440 GFP_NOIO);
2441 if (!obj_request->bio_list)
2442 goto out_unwind;
2443 } else if (type == OBJ_REQUEST_PAGES) {
2444 unsigned int page_count;
2445
2446 obj_request->pages = pages;
2447 page_count = (u32)calc_pages_for(offset, length);
2448 obj_request->page_count = page_count;
2449 if ((offset + length) & ~PAGE_MASK)
2450 page_count--; /* more on last page */
2451 pages += page_count;
2452 }
2453 1872
2454 osd_req = rbd_osd_req_create(rbd_dev, op_type, 1873 if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) {
2455 (op_type == OBJ_OP_WRITE) ? 2 : 1, 1874 ret = __rbd_obj_setup_stat(obj_req, which++);
2456 obj_request); 1875 if (ret)
2457 if (!osd_req) 1876 return ret;
2458 goto out_unwind; 1877 }
2459 1878
2460 obj_request->osd_req = osd_req; 1879 __rbd_obj_setup_discard(obj_req, which);
2461 obj_request->callback = rbd_img_obj_callback; 1880 return 0;
2462 obj_request->img_offset = img_offset; 1881}
2463 1882
2464 rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); 1883/*
1884 * For each object request in @img_req, allocate an OSD request, add
1885 * individual OSD ops and prepare them for submission. The number of
1886 * OSD ops depends on op_type and the overlap point (if any).
1887 */
1888static int __rbd_img_fill_request(struct rbd_img_request *img_req)
1889{
1890 struct rbd_obj_request *obj_req;
1891 int ret;
2465 1892
2466 img_offset += length; 1893 for_each_obj_request(img_req, obj_req) {
2467 resid -= length; 1894 switch (img_req->op_type) {
1895 case OBJ_OP_READ:
1896 ret = rbd_obj_setup_read(obj_req);
1897 break;
1898 case OBJ_OP_WRITE:
1899 ret = rbd_obj_setup_write(obj_req);
1900 break;
1901 case OBJ_OP_DISCARD:
1902 ret = rbd_obj_setup_discard(obj_req);
1903 break;
1904 default:
1905 rbd_assert(0);
1906 }
1907 if (ret)
1908 return ret;
2468 } 1909 }
2469 1910
2470 return 0; 1911 return 0;
1912}
2471 1913
2472out_unwind: 1914union rbd_img_fill_iter {
2473 for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1915 struct ceph_bio_iter bio_iter;
2474 rbd_img_obj_request_del(img_request, obj_request); 1916 struct ceph_bvec_iter bvec_iter;
1917};
2475 1918
2476 return -ENOMEM; 1919struct rbd_img_fill_ctx {
2477} 1920 enum obj_request_type pos_type;
1921 union rbd_img_fill_iter *pos;
1922 union rbd_img_fill_iter iter;
1923 ceph_object_extent_fn_t set_pos_fn;
1924 ceph_object_extent_fn_t count_fn;
1925 ceph_object_extent_fn_t copy_fn;
1926};
2478 1927
2479static void 1928static struct ceph_object_extent *alloc_object_extent(void *arg)
2480rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
2481{ 1929{
2482 struct rbd_img_request *img_request; 1930 struct rbd_img_request *img_req = arg;
2483 struct rbd_device *rbd_dev; 1931 struct rbd_obj_request *obj_req;
2484 struct page **pages;
2485 u32 page_count;
2486 1932
2487 dout("%s: obj %p\n", __func__, obj_request); 1933 obj_req = rbd_obj_request_create();
1934 if (!obj_req)
1935 return NULL;
2488 1936
2489 rbd_assert(obj_request->type == OBJ_REQUEST_BIO || 1937 rbd_img_obj_request_add(img_req, obj_req);
2490 obj_request->type == OBJ_REQUEST_NODATA); 1938 return &obj_req->ex;
2491 rbd_assert(obj_request_img_data_test(obj_request)); 1939}
2492 img_request = obj_request->img_request;
2493 rbd_assert(img_request);
2494 1940
2495 rbd_dev = img_request->rbd_dev; 1941/*
2496 rbd_assert(rbd_dev); 1942 * While su != os && sc == 1 is technically not fancy (it's the same
1943 * layout as su == os && sc == 1), we can't use the nocopy path for it
1944 * because ->set_pos_fn() should be called only once per object.
1945 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
1946 * treat su != os && sc == 1 as fancy.
1947 */
1948static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
1949{
1950 return l->stripe_unit != l->object_size;
1951}
2497 1952
2498 pages = obj_request->copyup_pages; 1953static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2499 rbd_assert(pages != NULL); 1954 struct ceph_file_extent *img_extents,
2500 obj_request->copyup_pages = NULL; 1955 u32 num_img_extents,
2501 page_count = obj_request->copyup_page_count; 1956 struct rbd_img_fill_ctx *fctx)
2502 rbd_assert(page_count); 1957{
2503 obj_request->copyup_page_count = 0; 1958 u32 i;
2504 ceph_release_page_vector(pages, page_count); 1959 int ret;
1960
1961 img_req->data_type = fctx->pos_type;
2505 1962
2506 /* 1963 /*
2507 * We want the transfer count to reflect the size of the 1964 * Create object requests and set each object request's starting
2508 * original write request. There is no such thing as a 1965 * position in the provided bio (list) or bio_vec array.
2509 * successful short write, so if the request was successful
2510 * we can just set it to the originally-requested length.
2511 */ 1966 */
2512 if (!obj_request->result) 1967 fctx->iter = *fctx->pos;
2513 obj_request->xferred = obj_request->length; 1968 for (i = 0; i < num_img_extents; i++) {
1969 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
1970 img_extents[i].fe_off,
1971 img_extents[i].fe_len,
1972 &img_req->object_extents,
1973 alloc_object_extent, img_req,
1974 fctx->set_pos_fn, &fctx->iter);
1975 if (ret)
1976 return ret;
1977 }
2514 1978
2515 obj_request_done_set(obj_request); 1979 return __rbd_img_fill_request(img_req);
2516} 1980}
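To see why the comment above rbd_layout_is_fancy() treats su != os with sc == 1 as fancy: ceph_file_to_extents() fires its action callback once per stripe unit, so a fully covered object would see object_size / stripe_unit calls, while ->set_pos_fn() tolerates exactly one. A worked example with hypothetical sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t object_size = 4ULL << 20;	/* 4 MiB objects */
	uint64_t stripe_unit = 1ULL << 20;	/* 1 MiB stripe unit */

	/* four callbacks would land on the same object, so the nocopy
	 * path (one set_pos_fn() call per object) cannot be used */
	printf("callbacks per object: %llu\n",
	       (unsigned long long)(object_size / stripe_unit));
	return 0;
}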
2517 1981
2518static void 1982/*
2519rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 1983 * Map a list of image extents to a list of object extents, create the
1984 * corresponding object requests (normally each to a different object,
1985 * but not always) and add them to @img_req. For each object request,
1986 * set up its data descriptor to point to the corresponding chunk(s) of
1987 * @fctx->pos data buffer.
1988 *
1989 * Because ceph_file_to_extents() will merge adjacent object extents
1990 * together, each object request's data descriptor may point to multiple
1991 * different chunks of @fctx->pos data buffer.
1992 *
1993 * @fctx->pos data buffer is assumed to be large enough.
1994 */
1995static int rbd_img_fill_request(struct rbd_img_request *img_req,
1996 struct ceph_file_extent *img_extents,
1997 u32 num_img_extents,
1998 struct rbd_img_fill_ctx *fctx)
2520{ 1999{
2521 struct rbd_obj_request *orig_request; 2000 struct rbd_device *rbd_dev = img_req->rbd_dev;
2522 struct ceph_osd_request *osd_req; 2001 struct rbd_obj_request *obj_req;
2523 struct rbd_device *rbd_dev; 2002 u32 i;
2524 struct page **pages; 2003 int ret;
2525 enum obj_operation_type op_type;
2526 u32 page_count;
2527 int img_result;
2528 u64 parent_length;
2529
2530 rbd_assert(img_request_child_test(img_request));
2531
2532 /* First get what we need from the image request */
2533
2534 pages = img_request->copyup_pages;
2535 rbd_assert(pages != NULL);
2536 img_request->copyup_pages = NULL;
2537 page_count = img_request->copyup_page_count;
2538 rbd_assert(page_count);
2539 img_request->copyup_page_count = 0;
2540
2541 orig_request = img_request->obj_request;
2542 rbd_assert(orig_request != NULL);
2543 rbd_assert(obj_request_type_valid(orig_request->type));
2544 img_result = img_request->result;
2545 parent_length = img_request->length;
2546 rbd_assert(img_result || parent_length == img_request->xferred);
2547 rbd_img_request_put(img_request);
2548 2004
2549 rbd_assert(orig_request->img_request); 2005 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2550 rbd_dev = orig_request->img_request->rbd_dev; 2006 !rbd_layout_is_fancy(&rbd_dev->layout))
2551 rbd_assert(rbd_dev); 2007 return rbd_img_fill_request_nocopy(img_req, img_extents,
2008 num_img_extents, fctx);
2009
2010 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2552 2011
2553 /* 2012 /*
2554 * If the overlap has become 0 (most likely because the 2013 * Create object requests and determine ->bvec_count for each object
2555 * image has been flattened) we need to free the pages 2014 * request. Note that ->bvec_count sum over all object requests may
2556 * and re-submit the original write request. 2015 * be greater than the number of bio_vecs in the provided bio (list)
2016 * or bio_vec array because when mapped, those bio_vecs can straddle
2017 * stripe unit boundaries.
2557 */ 2018 */
2558 if (!rbd_dev->parent_overlap) { 2019 fctx->iter = *fctx->pos;
2559 ceph_release_page_vector(pages, page_count); 2020 for (i = 0; i < num_img_extents; i++) {
2560 rbd_obj_request_submit(orig_request); 2021 ret = ceph_file_to_extents(&rbd_dev->layout,
2561 return; 2022 img_extents[i].fe_off,
2023 img_extents[i].fe_len,
2024 &img_req->object_extents,
2025 alloc_object_extent, img_req,
2026 fctx->count_fn, &fctx->iter);
2027 if (ret)
2028 return ret;
2562 } 2029 }
2563 2030
2564 if (img_result) 2031 for_each_obj_request(img_req, obj_req) {
2565 goto out_err; 2032 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2033 sizeof(*obj_req->bvec_pos.bvecs),
2034 GFP_NOIO);
2035 if (!obj_req->bvec_pos.bvecs)
2036 return -ENOMEM;
2037 }
2566 2038
2567 /* 2039 /*
2568 * The original osd request is of no use to us any more. 2040 * Fill in each object request's private bio_vec array, splitting and
2569 * We need a new one that can hold the three ops in a copyup 2041 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2570 * request. Allocate the new copyup osd request for the
2571 * original request, and release the old one.
2572 */ 2042 */
2573 img_result = -ENOMEM; 2043 fctx->iter = *fctx->pos;
2574 osd_req = rbd_osd_req_create_copyup(orig_request); 2044 for (i = 0; i < num_img_extents; i++) {
2575 if (!osd_req) 2045 ret = ceph_iterate_extents(&rbd_dev->layout,
2576 goto out_err; 2046 img_extents[i].fe_off,
2577 rbd_osd_req_destroy(orig_request->osd_req); 2047 img_extents[i].fe_len,
2578 orig_request->osd_req = osd_req; 2048 &img_req->object_extents,
2579 orig_request->copyup_pages = pages; 2049 fctx->copy_fn, &fctx->iter);
2580 orig_request->copyup_page_count = page_count; 2050 if (ret)
2051 return ret;
2052 }
2581 2053
2582 /* Initialize the copyup op */ 2054 return __rbd_img_fill_request(img_req);
2055}
2583 2056
2584 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2057static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2585 osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 2058 u64 off, u64 len)
2586 false, false); 2059{
2060 struct ceph_file_extent ex = { off, len };
2061 union rbd_img_fill_iter dummy;
2062 struct rbd_img_fill_ctx fctx = {
2063 .pos_type = OBJ_REQUEST_NODATA,
2064 .pos = &dummy,
2065 };
2587 2066
2588 /* Add the other op(s) */ 2067 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2068}
2589 2069
2590 op_type = rbd_img_request_op_type(orig_request->img_request); 2070static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2591 rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); 2071{
2072 struct rbd_obj_request *obj_req =
2073 container_of(ex, struct rbd_obj_request, ex);
2074 struct ceph_bio_iter *it = arg;
2592 2075
2593 /* All set, send it off. */ 2076 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2077 obj_req->bio_pos = *it;
2078 ceph_bio_iter_advance(it, bytes);
2079}
2594 2080
2595 rbd_obj_request_submit(orig_request); 2081static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2596 return; 2082{
2083 struct rbd_obj_request *obj_req =
2084 container_of(ex, struct rbd_obj_request, ex);
2085 struct ceph_bio_iter *it = arg;
2086
2087 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2088 ceph_bio_iter_advance_step(it, bytes, ({
2089 obj_req->bvec_count++;
2090 }));
2597 2091
2598out_err:
2599 ceph_release_page_vector(pages, page_count);
2600 rbd_obj_request_error(orig_request, img_result);
2601} 2092}
2602 2093
2603/* 2094static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2604 * Read from the parent image the range of data that covers the 2095{
2605 * entire target of the given object request. This is used for 2096 struct rbd_obj_request *obj_req =
2606 * satisfying a layered image write request when the target of an 2097 container_of(ex, struct rbd_obj_request, ex);
2607 * object request from the image request does not exist. 2098 struct ceph_bio_iter *it = arg;
2608 *
2609 * A page array big enough to hold the returned data is allocated
2610 * and supplied to rbd_img_request_fill() as the "data descriptor."
2611 * When the read completes, this page array will be transferred to
2612 * the original object request for the copyup operation.
2613 *
2614 * If an error occurs, it is recorded as the result of the original
2615 * object request in rbd_img_obj_exists_callback().
2616 */
2617static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2618{
2619 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
2620 struct rbd_img_request *parent_request = NULL;
2621 u64 img_offset;
2622 u64 length;
2623 struct page **pages = NULL;
2624 u32 page_count;
2625 int result;
2626 2099
2627 rbd_assert(rbd_dev->parent != NULL); 2100 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2101 ceph_bio_iter_advance_step(it, bytes, ({
2102 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2103 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2104 }));
2105}
2628 2106
2629 /* 2107static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2630 * Determine the byte range covered by the object in the 2108 struct ceph_file_extent *img_extents,
2631 * child image to which the original request was to be sent. 2109 u32 num_img_extents,
2632 */ 2110 struct ceph_bio_iter *bio_pos)
2633 img_offset = obj_request->img_offset - obj_request->offset; 2111{
2634 length = rbd_obj_bytes(&rbd_dev->header); 2112 struct rbd_img_fill_ctx fctx = {
2113 .pos_type = OBJ_REQUEST_BIO,
2114 .pos = (union rbd_img_fill_iter *)bio_pos,
2115 .set_pos_fn = set_bio_pos,
2116 .count_fn = count_bio_bvecs,
2117 .copy_fn = copy_bio_bvecs,
2118 };
2635 2119
2636 /* 2120 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2637 * There is no defined parent data beyond the parent 2121 &fctx);
2638 * overlap, so limit what we read at that boundary if 2122}
2639 * necessary.
2640 */
2641 if (img_offset + length > rbd_dev->parent_overlap) {
2642 rbd_assert(img_offset < rbd_dev->parent_overlap);
2643 length = rbd_dev->parent_overlap - img_offset;
2644 }
2645 2123
2646 /* 2124static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2647 * Allocate a page array big enough to receive the data read 2125 u64 off, u64 len, struct bio *bio)
2648 * from the parent. 2126{
2649 */ 2127 struct ceph_file_extent ex = { off, len };
2650 page_count = (u32)calc_pages_for(0, length); 2128 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2651 pages = ceph_alloc_page_vector(page_count, GFP_NOIO);
2652 if (IS_ERR(pages)) {
2653 result = PTR_ERR(pages);
2654 pages = NULL;
2655 goto out_err;
2656 }
2657 2129
2658 result = -ENOMEM; 2130 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2659 parent_request = rbd_parent_request_create(obj_request, 2131}
2660 img_offset, length);
2661 if (!parent_request)
2662 goto out_err;
2663 2132
2664 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 2133static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2665 if (result) 2134{
2666 goto out_err; 2135 struct rbd_obj_request *obj_req =
2136 container_of(ex, struct rbd_obj_request, ex);
2137 struct ceph_bvec_iter *it = arg;
2667 2138
2668 parent_request->copyup_pages = pages; 2139 obj_req->bvec_pos = *it;
2669 parent_request->copyup_page_count = page_count; 2140 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2670 parent_request->callback = rbd_img_obj_parent_read_full_callback; 2141 ceph_bvec_iter_advance(it, bytes);
2142}
2671 2143
2672 result = rbd_img_request_submit(parent_request); 2144static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2673 if (!result) 2145{
2674 return 0; 2146 struct rbd_obj_request *obj_req =
2147 container_of(ex, struct rbd_obj_request, ex);
2148 struct ceph_bvec_iter *it = arg;
2675 2149
2676 parent_request->copyup_pages = NULL; 2150 ceph_bvec_iter_advance_step(it, bytes, ({
2677 parent_request->copyup_page_count = 0; 2151 obj_req->bvec_count++;
2678out_err: 2152 }));
2679 if (pages)
2680 ceph_release_page_vector(pages, page_count);
2681 if (parent_request)
2682 rbd_img_request_put(parent_request);
2683 return result;
2684} 2153}
2685 2154
2686static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2155static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2687{ 2156{
2688 struct rbd_obj_request *orig_request; 2157 struct rbd_obj_request *obj_req =
2689 struct rbd_device *rbd_dev; 2158 container_of(ex, struct rbd_obj_request, ex);
2690 int result; 2159 struct ceph_bvec_iter *it = arg;
2691 2160
2692 rbd_assert(!obj_request_img_data_test(obj_request)); 2161 ceph_bvec_iter_advance_step(it, bytes, ({
2162 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2163 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2164 }));
2165}
2693 2166
2694 /* 2167static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2695 * All we need from the object request is the original 2168 struct ceph_file_extent *img_extents,
2696 * request and the result of the STAT op. Grab those, then 2169 u32 num_img_extents,
2697 * we're done with the request. 2170 struct ceph_bvec_iter *bvec_pos)
2698 */ 2171{
2699 orig_request = obj_request->obj_request; 2172 struct rbd_img_fill_ctx fctx = {
2700 obj_request->obj_request = NULL; 2173 .pos_type = OBJ_REQUEST_BVECS,
2701 rbd_obj_request_put(orig_request); 2174 .pos = (union rbd_img_fill_iter *)bvec_pos,
2702 rbd_assert(orig_request); 2175 .set_pos_fn = set_bvec_pos,
2703 rbd_assert(orig_request->img_request); 2176 .count_fn = count_bvecs,
2704 2177 .copy_fn = copy_bvecs,
2705 result = obj_request->result; 2178 };
2706 obj_request->result = 0;
2707
2708 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2709 obj_request, orig_request, result,
2710 obj_request->xferred, obj_request->length);
2711 rbd_obj_request_put(obj_request);
2712 2179
2713 /* 2180 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2714 * If the overlap has become 0 (most likely because the 2181 &fctx);
2715 * image has been flattened) we need to re-submit the 2182}
2716 * original request.
2717 */
2718 rbd_dev = orig_request->img_request->rbd_dev;
2719 if (!rbd_dev->parent_overlap) {
2720 rbd_obj_request_submit(orig_request);
2721 return;
2722 }
2723 2183
2724 /* 2184static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2725 * Our only purpose here is to determine whether the object 2185 struct ceph_file_extent *img_extents,
2726 * exists, and we don't want to treat the non-existence as 2186 u32 num_img_extents,
2727 * an error. If something else comes back, transfer the 2187 struct bio_vec *bvecs)
2728 * error to the original request and complete it now. 2188{
2729 */ 2189 struct ceph_bvec_iter it = {
2730 if (!result) { 2190 .bvecs = bvecs,
2731 obj_request_existence_set(orig_request, true); 2191 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2732 } else if (result == -ENOENT) { 2192 num_img_extents) },
2733 obj_request_existence_set(orig_request, false); 2193 };
2734 } else {
2735 goto fail_orig_request;
2736 }
2737 2194
2738 /* 2195 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2739 * Resubmit the original request now that we have recorded 2196 &it);
2740 * whether the target object exists. 2197}
2741 */
2742 result = rbd_img_obj_request_submit(orig_request);
2743 if (result)
2744 goto fail_orig_request;
2745 2198
2746 return; 2199static void rbd_img_request_submit(struct rbd_img_request *img_request)
2200{
2201 struct rbd_obj_request *obj_request;
2202
2203 dout("%s: img %p\n", __func__, img_request);
2204
2205 rbd_img_request_get(img_request);
2206 for_each_obj_request(img_request, obj_request)
2207 rbd_obj_request_submit(obj_request);
2747 2208
2748fail_orig_request: 2209 rbd_img_request_put(img_request);
2749 rbd_obj_request_error(orig_request, result);
2750} 2210}
2751 2211
2752static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2212static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2753{ 2213{
2754 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 2214 struct rbd_img_request *img_req = obj_req->img_request;
2755 struct rbd_obj_request *stat_request; 2215 struct rbd_img_request *child_img_req;
2756 struct page **pages;
2757 u32 page_count;
2758 size_t size;
2759 int ret; 2216 int ret;
2760 2217
2761 stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES); 2218 child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2762 if (!stat_request) 2219 OBJ_OP_READ, NULL);
2220 if (!child_img_req)
2763 return -ENOMEM; 2221 return -ENOMEM;
2764 2222
2765 stat_request->object_no = obj_request->object_no; 2223 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2224 child_img_req->obj_request = obj_req;
2766 2225
2767 stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 2226 if (!rbd_img_is_write(img_req)) {
2768 stat_request); 2227 switch (img_req->data_type) {
2769 if (!stat_request->osd_req) { 2228 case OBJ_REQUEST_BIO:
2770 ret = -ENOMEM; 2229 ret = __rbd_img_fill_from_bio(child_img_req,
2771 goto fail_stat_request; 2230 obj_req->img_extents,
2231 obj_req->num_img_extents,
2232 &obj_req->bio_pos);
2233 break;
2234 case OBJ_REQUEST_BVECS:
2235 case OBJ_REQUEST_OWN_BVECS:
2236 ret = __rbd_img_fill_from_bvecs(child_img_req,
2237 obj_req->img_extents,
2238 obj_req->num_img_extents,
2239 &obj_req->bvec_pos);
2240 break;
2241 default:
2242 rbd_assert(0);
2243 }
2244 } else {
2245 ret = rbd_img_fill_from_bvecs(child_img_req,
2246 obj_req->img_extents,
2247 obj_req->num_img_extents,
2248 obj_req->copyup_bvecs);
2249 }
2250 if (ret) {
2251 rbd_img_request_put(child_img_req);
2252 return ret;
2253 }
2254
2255 rbd_img_request_submit(child_img_req);
2256 return 0;
2257}
2258
2259static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2260{
2261 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2262 int ret;
2263
2264 if (obj_req->result == -ENOENT &&
2265 rbd_dev->parent_overlap && !obj_req->tried_parent) {
2266 /* reverse map this object extent onto the parent */
2267 ret = rbd_obj_calc_img_extents(obj_req, false);
2268 if (ret) {
2269 obj_req->result = ret;
2270 return true;
2271 }
2272
2273 if (obj_req->num_img_extents) {
2274 obj_req->tried_parent = true;
2275 ret = rbd_obj_read_from_parent(obj_req);
2276 if (ret) {
2277 obj_req->result = ret;
2278 return true;
2279 }
2280 return false;
2281 }
2772 } 2282 }
2773 2283
2774 /* 2284 /*
2775 * The response data for a STAT call consists of: 2285 * -ENOENT means a hole in the image -- zero-fill the entire
2776 * le64 length; 2286 * length of the request. A short read also implies zero-fill
2777 * struct { 2287 * to the end of the request. In both cases we update xferred
2778 * le32 tv_sec; 2288 * count to indicate the whole request was satisfied.
2779 * le32 tv_nsec;
2780 * } mtime;
2781 */ 2289 */
2782 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2290 if (obj_req->result == -ENOENT ||
2783 page_count = (u32)calc_pages_for(0, size); 2291 (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
2784 pages = ceph_alloc_page_vector(page_count, GFP_NOIO); 2292 rbd_assert(!obj_req->xferred || !obj_req->result);
2785 if (IS_ERR(pages)) { 2293 rbd_obj_zero_range(obj_req, obj_req->xferred,
2786 ret = PTR_ERR(pages); 2294 obj_req->ex.oe_len - obj_req->xferred);
2787 goto fail_stat_request; 2295 obj_req->result = 0;
2296 obj_req->xferred = obj_req->ex.oe_len;
2788 } 2297 }
2789 2298
2790 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); 2299 return true;
2791 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2300}
2792 false, false);
2793
2794 rbd_obj_request_get(obj_request);
2795 stat_request->obj_request = obj_request;
2796 stat_request->pages = pages;
2797 stat_request->page_count = page_count;
2798 stat_request->callback = rbd_img_obj_exists_callback;
2799 2301
2800 rbd_obj_request_submit(stat_request); 2302/*
2801 return 0; 2303 * copyup_bvecs pages are never highmem pages
2304 */
2305static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2306{
2307 struct ceph_bvec_iter it = {
2308 .bvecs = bvecs,
2309 .iter = { .bi_size = bytes },
2310 };
2802 2311
2803fail_stat_request: 2312 ceph_bvec_iter_advance_step(&it, bytes, ({
2804 rbd_obj_request_put(stat_request); 2313 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2805 return ret; 2314 bv.bv_len))
2315 return false;
2316 }));
2317 return true;
2806} 2318}
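is_zero_bvecs() leans on memchr_inv() to find the first non-zero byte in each bio_vec. A plain-C stand-in for the scan (the bvec iteration itself is elided):

#include <stdbool.h>
#include <stddef.h>

static bool buf_is_zero(const unsigned char *p, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		if (p[i])
			return false;	/* first non-zero byte found */
	return true;
}

A zero result lets the caller send bytes == 0: all-zero copyup data is equivalent to the child object simply not existing, so the payload can be dropped.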
2807 2319
2808static bool img_obj_request_simple(struct rbd_obj_request *obj_request) 2320static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2809{ 2321{
2810 struct rbd_img_request *img_request = obj_request->img_request; 2322 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
2811 struct rbd_device *rbd_dev = img_request->rbd_dev;
2812 2323
2813 /* Reads */ 2324 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2814 if (!img_request_write_test(img_request) && 2325 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2815 !img_request_discard_test(img_request)) 2326 rbd_osd_req_destroy(obj_req->osd_req);
2816 return true;
2817
2818 /* Non-layered writes */
2819 if (!img_request_layered_test(img_request))
2820 return true;
2821 2327
2822 /* 2328 /*
2823 * Layered writes outside of the parent overlap range don't 2329 * Create a copyup request with the same number of OSD ops as
2824 * share any data with the parent. 2330 * the original request. The original request was stat + op(s),
2331 * the new copyup request will be copyup + the same op(s).
2825 */ 2332 */
2826 if (!obj_request_overlaps_parent(obj_request)) 2333 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
2827 return true; 2334 if (!obj_req->osd_req)
2335 return -ENOMEM;
2828 2336
2829 /* 2337 /*
2830 * Entire-object layered writes - we will overwrite whatever 2338 * Only send non-zero copyup data to save some I/O and network
2831 * parent data there is anyway. 2339 * bandwidth -- zero copyup data is equivalent to the object not
2340 * existing.
2832 */ 2341 */
2833 if (!obj_request->offset && 2342 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2834 obj_request->length == rbd_obj_bytes(&rbd_dev->header)) 2343 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2835 return true; 2344 bytes = 0;
2345 }
2836 2346
2837 /* 2347 osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
2838 * If the object is known to already exist, its parent data has 2348 "copyup");
2839 * already been copied. 2349 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
2840 */ 2350 obj_req->copyup_bvecs, bytes);
2841 if (obj_request_known_test(obj_request) && 2351
2842 obj_request_exists_test(obj_request)) 2352 switch (obj_req->img_request->op_type) {
2843 return true; 2353 case OBJ_OP_WRITE:
2354 __rbd_obj_setup_write(obj_req, 1);
2355 break;
2356 case OBJ_OP_DISCARD:
2357 rbd_assert(!rbd_obj_is_entire(obj_req));
2358 __rbd_obj_setup_discard(obj_req, 1);
2359 break;
2360 default:
2361 rbd_assert(0);
2362 }
2844 2363
2845 return false; 2364 rbd_obj_request_submit(obj_req);
2365 return 0;
2846} 2366}
2847 2367
2848static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 2368static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2849{ 2369{
2850 rbd_assert(obj_request_img_data_test(obj_request)); 2370 u32 i;
2851 rbd_assert(obj_request_type_valid(obj_request->type));
2852 rbd_assert(obj_request->img_request);
2853 2371
2854 if (img_obj_request_simple(obj_request)) { 2372 rbd_assert(!obj_req->copyup_bvecs);
2855 rbd_obj_request_submit(obj_request); 2373 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2856 return 0; 2374 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2857 } 2375 sizeof(*obj_req->copyup_bvecs),
2376 GFP_NOIO);
2377 if (!obj_req->copyup_bvecs)
2378 return -ENOMEM;
2858 2379
2859 /* 2380 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2860 * It's a layered write. The target object might exist but 2381 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2861 * we may not know that yet. If we know it doesn't exist,
2862 * start by reading the data for the full target object from
2863 * the parent so we can use it for a copyup to the target.
2864 */
2865 if (obj_request_known_test(obj_request))
2866 return rbd_img_obj_parent_read_full(obj_request);
2867 2382
2868 /* We don't know whether the target exists. Go find out. */ 2383 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2384 if (!obj_req->copyup_bvecs[i].bv_page)
2385 return -ENOMEM;
2386
2387 obj_req->copyup_bvecs[i].bv_offset = 0;
2388 obj_req->copyup_bvecs[i].bv_len = len;
2389 obj_overlap -= len;
2390 }
2869 2391
2870 return rbd_img_obj_exists_submit(obj_request); 2392 rbd_assert(!obj_overlap);
2393 return 0;
2871} 2394}
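The allocation loop in setup_copyup_bvecs() slices the object overlap into page-sized chunks, with only the last chunk allowed to be short. The arithmetic in isolation, assuming 4 KiB pages and a hypothetical overlap value:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ 4096ULL

int main(void)
{
	uint64_t overlap = 10000;	/* hypothetical obj_overlap */
	unsigned int i, n = (overlap + PAGE_SZ - 1) / PAGE_SZ;

	for (i = 0; i < n; i++) {
		uint64_t len = overlap < PAGE_SZ ? overlap : PAGE_SZ;

		printf("bvec %u: len %llu\n", i, (unsigned long long)len);
		overlap -= len;
	}
	/* overlap is now 0, matching the final rbd_assert() above */
	return 0;
}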
2872 2395
2873static int rbd_img_request_submit(struct rbd_img_request *img_request) 2396static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2874{ 2397{
2875 struct rbd_obj_request *obj_request; 2398 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2876 struct rbd_obj_request *next_obj_request; 2399 int ret;
2877 int ret = 0;
2878
2879 dout("%s: img %p\n", __func__, img_request);
2880 2400
2881 rbd_img_request_get(img_request); 2401 rbd_assert(obj_req->num_img_extents);
2882 for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2402 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2883 ret = rbd_img_obj_request_submit(obj_request); 2403 rbd_dev->parent_overlap);
2884 if (ret) 2404 if (!obj_req->num_img_extents) {
2885 goto out_put_ireq; 2405 /*
2406 * The overlap has become 0 (most likely because the
2407 * image has been flattened). Use rbd_obj_issue_copyup()
2408 * to re-submit the original write request -- the copyup
2409 * operation itself will be a no-op, since someone must
2410 * have populated the child object while we weren't
2411 * looking. Move to WRITE_FLAT state as we'll be done
2412 * with the operation once the null copyup completes.
2413 */
2414 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2415 return rbd_obj_issue_copyup(obj_req, 0);
2886 } 2416 }
2887 2417
2888out_put_ireq: 2418 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
2889 rbd_img_request_put(img_request); 2419 if (ret)
2890 return ret; 2420 return ret;
2421
2422 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
2423 return rbd_obj_read_from_parent(obj_req);
2891} 2424}
2892 2425
2893static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 2426static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
2894{ 2427{
2895 struct rbd_obj_request *obj_request; 2428 int ret;
2896 struct rbd_device *rbd_dev;
2897 u64 obj_end;
2898 u64 img_xferred;
2899 int img_result;
2900 2429
2901 rbd_assert(img_request_child_test(img_request)); 2430again:
2431 switch (obj_req->write_state) {
2432 case RBD_OBJ_WRITE_GUARD:
2433 rbd_assert(!obj_req->xferred);
2434 if (obj_req->result == -ENOENT) {
2435 /*
2436 * The target object doesn't exist. Read the data for
2437 * the entire target object up to the overlap point (if
2438 * any) from the parent, so we can use it for a copyup.
2439 */
2440 ret = rbd_obj_handle_write_guard(obj_req);
2441 if (ret) {
2442 obj_req->result = ret;
2443 return true;
2444 }
2445 return false;
2446 }
2447 /* fall through */
2448 case RBD_OBJ_WRITE_FLAT:
2449 if (!obj_req->result)
2450 /*
2451 * There is no such thing as a successful short
2452 * write -- indicate the whole request was satisfied.
2453 */
2454 obj_req->xferred = obj_req->ex.oe_len;
2455 return true;
2456 case RBD_OBJ_WRITE_COPYUP:
2457 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2458 if (obj_req->result)
2459 goto again;
2902 2460
2903 /* First get what we need from the image request and release it */ 2461 rbd_assert(obj_req->xferred);
2462 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
2463 if (ret) {
2464 obj_req->result = ret;
2465 return true;
2466 }
2467 return false;
2468 default:
2469 rbd_assert(0);
2470 }
2471}
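rbd_obj_handle_write() is a three-state completion handler. A condensed userspace model is sketched below, with hypothetical names and the actual I/O submission stubbed out: it returns true when the object request is finished and false when another round of I/O (parent read or copyup) was started.

#include <errno.h>
#include <stdbool.h>

enum wstate { W_FLAT, W_GUARD, W_COPYUP };

static bool handle_write(enum wstate *st, int result)
{
again:
	switch (*st) {
	case W_GUARD:
		if (result == -ENOENT) {
			*st = W_COPYUP;	/* go read from the parent */
			return false;
		}
		/* fall through */
	case W_FLAT:
		return true;		/* done, with or without error */
	case W_COPYUP:
		*st = W_GUARD;
		if (result)
			goto again;	/* parent read failed: finish up */
		return false;		/* copyup (+ original ops) issued */
	}
	return true;
}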
2904 2472
2905 obj_request = img_request->obj_request; 2473/*
2906 img_xferred = img_request->xferred; 2474 * Returns true if @obj_req is completed, or false otherwise.
2907 img_result = img_request->result; 2475 */
2908 rbd_img_request_put(img_request); 2476static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2477{
2478 switch (obj_req->img_request->op_type) {
2479 case OBJ_OP_READ:
2480 return rbd_obj_handle_read(obj_req);
2481 case OBJ_OP_WRITE:
2482 return rbd_obj_handle_write(obj_req);
2483 case OBJ_OP_DISCARD:
2484 if (rbd_obj_handle_write(obj_req)) {
2485 /*
2486 * Hide -ENOENT from delete/truncate/zero -- discarding
2487 * a non-existent object is not a problem.
2488 */
2489 if (obj_req->result == -ENOENT) {
2490 obj_req->result = 0;
2491 obj_req->xferred = obj_req->ex.oe_len;
2492 }
2493 return true;
2494 }
2495 return false;
2496 default:
2497 rbd_assert(0);
2498 }
2499}
2909 2500
2910 /* 2501static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
2911 * If the overlap has become 0 (most likely because the 2502{
2912 * image has been flattened) we need to re-submit the 2503 struct rbd_img_request *img_req = obj_req->img_request;
2913 * original request. 2504
2914 */ 2505 rbd_assert((!obj_req->result &&
2915 rbd_assert(obj_request); 2506 obj_req->xferred == obj_req->ex.oe_len) ||
2916 rbd_assert(obj_request->img_request); 2507 (obj_req->result < 0 && !obj_req->xferred));
2917 rbd_dev = obj_request->img_request->rbd_dev; 2508 if (!obj_req->result) {
2918 if (!rbd_dev->parent_overlap) { 2509 img_req->xferred += obj_req->xferred;
2919 rbd_obj_request_submit(obj_request);
2920 return; 2510 return;
2921 } 2511 }
2922 2512
2923 obj_request->result = img_result; 2513 rbd_warn(img_req->rbd_dev,
2924 if (obj_request->result) 2514 "%s at objno %llu %llu~%llu result %d xferred %llu",
2925 goto out; 2515 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
2516 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
2517 obj_req->xferred);
2518 if (!img_req->result) {
2519 img_req->result = obj_req->result;
2520 img_req->xferred = 0;
2521 }
2522}
2926 2523
2927 /* 2524static void rbd_img_end_child_request(struct rbd_img_request *img_req)
2928 * We need to zero anything beyond the parent overlap 2525{
2929 * boundary. Since rbd_img_obj_request_read_callback() 2526 struct rbd_obj_request *obj_req = img_req->obj_request;
2930 * will zero anything beyond the end of a short read, an
2931 * easy way to do this is to pretend the data from the
2932 * parent came up short--ending at the overlap boundary.
2933 */
2934 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2935 obj_end = obj_request->img_offset + obj_request->length;
2936 if (obj_end > rbd_dev->parent_overlap) {
2937 u64 xferred = 0;
2938 2527
2939 if (obj_request->img_offset < rbd_dev->parent_overlap) 2528 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
2940 xferred = rbd_dev->parent_overlap - 2529 rbd_assert((!img_req->result &&
2941 obj_request->img_offset; 2530 img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
2531 (img_req->result < 0 && !img_req->xferred));
2942 2532
2943 obj_request->xferred = min(img_xferred, xferred); 2533 obj_req->result = img_req->result;
2944 } else { 2534 obj_req->xferred = img_req->xferred;
2945 obj_request->xferred = img_xferred; 2535 rbd_img_request_put(img_req);
2946 }
2947out:
2948 rbd_img_obj_request_read_callback(obj_request);
2949 rbd_obj_request_complete(obj_request);
2950} 2536}
2951 2537
2952static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 2538static void rbd_img_end_request(struct rbd_img_request *img_req)
2953{ 2539{
2954 struct rbd_img_request *img_request; 2540 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
2955 int result; 2541 rbd_assert((!img_req->result &&
2542 img_req->xferred == blk_rq_bytes(img_req->rq)) ||
2543 (img_req->result < 0 && !img_req->xferred));
2956 2544
2957 rbd_assert(obj_request_img_data_test(obj_request)); 2545 blk_mq_end_request(img_req->rq,
2958 rbd_assert(obj_request->img_request != NULL); 2546 errno_to_blk_status(img_req->result));
2959 rbd_assert(obj_request->result == (s32) -ENOENT); 2547 rbd_img_request_put(img_req);
2960 rbd_assert(obj_request_type_valid(obj_request->type)); 2548}
2961 2549
2962 /* rbd_read_finish(obj_request, obj_request->length); */ 2550static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2963 img_request = rbd_parent_request_create(obj_request, 2551{
2964 obj_request->img_offset, 2552 struct rbd_img_request *img_req;
2965 obj_request->length);
2966 result = -ENOMEM;
2967 if (!img_request)
2968 goto out_err;
2969 2553
2970 if (obj_request->type == OBJ_REQUEST_BIO) 2554again:
2971 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 2555 if (!__rbd_obj_handle_request(obj_req))
2972 obj_request->bio_list); 2556 return;
2973 else
2974 result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
2975 obj_request->pages);
2976 if (result)
2977 goto out_err;
2978 2557
2979 img_request->callback = rbd_img_parent_read_callback; 2558 img_req = obj_req->img_request;
2980 result = rbd_img_request_submit(img_request); 2559 spin_lock(&img_req->completion_lock);
2981 if (result) 2560 rbd_obj_end_request(obj_req);
2982 goto out_err; 2561 rbd_assert(img_req->pending_count);
2562 if (--img_req->pending_count) {
2563 spin_unlock(&img_req->completion_lock);
2564 return;
2565 }
2983 2566
2984 return; 2567 spin_unlock(&img_req->completion_lock);
2985out_err: 2568 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
2986 if (img_request) 2569 obj_req = img_req->obj_request;
2987 rbd_img_request_put(img_request); 2570 rbd_img_end_child_request(img_req);
2988 obj_request->result = result; 2571 goto again;
2989 obj_request->xferred = 0; 2572 }
2990 obj_request_done_set(obj_request); 2573 rbd_img_end_request(img_req);
2991} 2574}
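The completion accounting in the new rbd_obj_handle_request() is a last-one-out pattern: every object request records its outcome under completion_lock, and only the one that drops pending_count to zero ends the image request. A minimal pthread model (hypothetical types; the kernel uses a spinlock):

#include <pthread.h>
#include <stdbool.h>

struct img { pthread_mutex_t lock; unsigned int pending; int result; };

/* Returns true for the caller that must end the whole image request. */
static bool obj_done(struct img *im, int result)
{
	bool last;

	pthread_mutex_lock(&im->lock);
	if (result && !im->result)
		im->result = result;	/* first error wins */
	last = (--im->pending == 0);
	pthread_mutex_unlock(&im->lock);
	return last;
}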
2992 2575
2993static const struct rbd_client_id rbd_empty_cid; 2576static const struct rbd_client_id rbd_empty_cid;
@@ -3091,8 +2674,8 @@ static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3091{ 2674{
3092 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2675 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3093 struct rbd_client_id cid = rbd_get_cid(rbd_dev); 2676 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3094 int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN; 2677 char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3095 char buf[buf_size]; 2678 int buf_size = sizeof(buf);
3096 void *p = buf; 2679 void *p = buf;
3097 2680
3098 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); 2681 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
@@ -3610,8 +3193,8 @@ static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3610 u64 notify_id, u64 cookie, s32 *result) 3193 u64 notify_id, u64 cookie, s32 *result)
3611{ 3194{
3612 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3195 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3613 int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN; 3196 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
3614 char buf[buf_size]; 3197 int buf_size = sizeof(buf);
3615 int ret; 3198 int ret;
3616 3199
3617 if (result) { 3200 if (result) {
@@ -3887,7 +3470,7 @@ static void rbd_reregister_watch(struct work_struct *work)
3887 3470
3888 ret = rbd_dev_refresh(rbd_dev); 3471 ret = rbd_dev_refresh(rbd_dev);
3889 if (ret) 3472 if (ret)
3890 rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); 3473 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
3891} 3474}
3892 3475
3893/* 3476/*
@@ -4070,8 +3653,7 @@ static void rbd_queue_workfn(struct work_struct *work)
4070 } 3653 }
4071 } 3654 }
4072 3655
4073 img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, 3656 img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
4074 snapc);
4075 if (!img_request) { 3657 if (!img_request) {
4076 result = -ENOMEM; 3658 result = -ENOMEM;
4077 goto err_unlock; 3659 goto err_unlock;
@@ -4080,18 +3662,14 @@ static void rbd_queue_workfn(struct work_struct *work)
4080 snapc = NULL; /* img_request consumes a ref */ 3662 snapc = NULL; /* img_request consumes a ref */
4081 3663
4082 if (op_type == OBJ_OP_DISCARD) 3664 if (op_type == OBJ_OP_DISCARD)
4083 result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 3665 result = rbd_img_fill_nodata(img_request, offset, length);
4084 NULL);
4085 else 3666 else
4086 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3667 result = rbd_img_fill_from_bio(img_request, offset, length,
4087 rq->bio); 3668 rq->bio);
4088 if (result)
4089 goto err_img_request;
4090
4091 result = rbd_img_request_submit(img_request);
4092 if (result) 3669 if (result)
4093 goto err_img_request; 3670 goto err_img_request;
4094 3671
3672 rbd_img_request_submit(img_request);
4095 if (must_be_locked) 3673 if (must_be_locked)
4096 up_read(&rbd_dev->lock_rwsem); 3674 up_read(&rbd_dev->lock_rwsem);
4097 return; 3675 return;
@@ -4369,7 +3947,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
4369 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 3947 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
4370 q->limits.max_sectors = queue_max_hw_sectors(q); 3948 q->limits.max_sectors = queue_max_hw_sectors(q);
4371 blk_queue_max_segments(q, USHRT_MAX); 3949 blk_queue_max_segments(q, USHRT_MAX);
4372 blk_queue_max_segment_size(q, segment_size); 3950 blk_queue_max_segment_size(q, UINT_MAX);
4373 blk_queue_io_min(q, segment_size); 3951 blk_queue_io_min(q, segment_size);
4374 blk_queue_io_opt(q, segment_size); 3952 blk_queue_io_opt(q, segment_size);
4375 3953
@@ -5057,9 +4635,6 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5057 } __attribute__ ((packed)) striping_info_buf = { 0 }; 4635 } __attribute__ ((packed)) striping_info_buf = { 0 };
5058 size_t size = sizeof (striping_info_buf); 4636 size_t size = sizeof (striping_info_buf);
5059 void *p; 4637 void *p;
5060 u64 obj_size;
5061 u64 stripe_unit;
5062 u64 stripe_count;
5063 int ret; 4638 int ret;
5064 4639
5065 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4640 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
@@ -5071,31 +4646,9 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5071 if (ret < size) 4646 if (ret < size)
5072 return -ERANGE; 4647 return -ERANGE;
5073 4648
5074 /*
5075 * We don't actually support the "fancy striping" feature
5076 * (STRIPINGV2) yet, but if the striping sizes are the
5077 * defaults the behavior is the same as before. So find
5078 * out, and only fail if the image has non-default values.
5079 */
5080 ret = -EINVAL;
5081 obj_size = rbd_obj_bytes(&rbd_dev->header);
5082 p = &striping_info_buf; 4649 p = &striping_info_buf;
5083 stripe_unit = ceph_decode_64(&p); 4650 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5084 if (stripe_unit != obj_size) { 4651 rbd_dev->header.stripe_count = ceph_decode_64(&p);
5085 rbd_warn(rbd_dev, "unsupported stripe unit "
5086 "(got %llu want %llu)",
5087 stripe_unit, obj_size);
5088 return -EINVAL;
5089 }
5090 stripe_count = ceph_decode_64(&p);
5091 if (stripe_count != 1) {
5092 rbd_warn(rbd_dev, "unsupported stripe count "
5093 "(got %llu want 1)", stripe_count);
5094 return -EINVAL;
5095 }
5096 rbd_dev->header.stripe_unit = stripe_unit;
5097 rbd_dev->header.stripe_count = stripe_count;
5098
5099 return 0; 4652 return 0;
5100} 4653}
5101 4654
@@ -5653,39 +5206,6 @@ out_err:
5653 return ret; 5206 return ret;
5654} 5207}
5655 5208
5656/*
5657 * Return pool id (>= 0) or a negative error code.
5658 */
5659static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
5660{
5661 struct ceph_options *opts = rbdc->client->options;
5662 u64 newest_epoch;
5663 int tries = 0;
5664 int ret;
5665
5666again:
5667 ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
5668 if (ret == -ENOENT && tries++ < 1) {
5669 ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
5670 &newest_epoch);
5671 if (ret < 0)
5672 return ret;
5673
5674 if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
5675 ceph_osdc_maybe_request_map(&rbdc->client->osdc);
5676 (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
5677 newest_epoch,
5678 opts->mount_timeout);
5679 goto again;
5680 } else {
5681 /* the osdmap we have is new enough */
5682 return -ENOENT;
5683 }
5684 }
5685
5686 return ret;
5687}
5688
5689static void rbd_dev_image_unlock(struct rbd_device *rbd_dev) 5209static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5690{ 5210{
5691 down_write(&rbd_dev->lock_rwsem); 5211 down_write(&rbd_dev->lock_rwsem);
@@ -6114,7 +5634,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
6114 } 5634 }
6115 5635
6116 /* pick the pool */ 5636 /* pick the pool */
6117 rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 5637 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
6118 if (rc < 0) { 5638 if (rc < 0) {
6119 if (rc == -ENOENT) 5639 if (rc == -ENOENT)
6120 pr_info("pool %s does not exist\n", spec->pool_name); 5640 pr_info("pool %s does not exist\n", spec->pool_name);
@@ -6366,16 +5886,8 @@ static int rbd_slab_init(void)
6366 if (!rbd_obj_request_cache) 5886 if (!rbd_obj_request_cache)
6367 goto out_err; 5887 goto out_err;
6368 5888
6369 rbd_assert(!rbd_bio_clone);
6370 rbd_bio_clone = bioset_create(BIO_POOL_SIZE, 0, 0);
6371 if (!rbd_bio_clone)
6372 goto out_err_clone;
6373
6374 return 0; 5889 return 0;
6375 5890
6376out_err_clone:
6377 kmem_cache_destroy(rbd_obj_request_cache);
6378 rbd_obj_request_cache = NULL;
6379out_err: 5891out_err:
6380 kmem_cache_destroy(rbd_img_request_cache); 5892 kmem_cache_destroy(rbd_img_request_cache);
6381 rbd_img_request_cache = NULL; 5893 rbd_img_request_cache = NULL;
@@ -6391,10 +5903,6 @@ static void rbd_slab_exit(void)
6391 rbd_assert(rbd_img_request_cache); 5903 rbd_assert(rbd_img_request_cache);
6392 kmem_cache_destroy(rbd_img_request_cache); 5904 kmem_cache_destroy(rbd_img_request_cache);
6393 rbd_img_request_cache = NULL; 5905 rbd_img_request_cache = NULL;
6394
6395 rbd_assert(rbd_bio_clone);
6396 bioset_free(rbd_bio_clone);
6397 rbd_bio_clone = NULL;
6398} 5906}
6399 5907
6400static int __init rbd_init(void) 5908static int __init rbd_init(void)
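
The striping hunks above are the core of the rbd fancy-striping support: rbd_dev_v2_striping_info() now records stripe_unit and stripe_count unconditionally instead of failing with -EINVAL on non-default values, with the mapping work handled by the shared striper code (note the new <linux/ceph/striper.h> includes further down). For reference, here is a minimal userspace sketch of the file-to-object mapping such a layout implies. The function and parameter names are invented for illustration, the in-kernel version being ceph_calc_file_object_mapping(), and the sketch assumes object_size is a multiple of stripe_unit:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Toy model of Ceph striping: map a file offset to
     * (object number, offset within object, length of the first strip).
     */
    static void map_offset(uint64_t off, uint64_t len,
                           uint64_t su, uint64_t sc, uint64_t os,
                           uint64_t *objno, uint64_t *objoff, uint32_t *xlen)
    {
        uint64_t stripes_per_object = os / su;
        uint64_t blockno = off / su;        /* stripe-unit index in the file */
        uint64_t stripeno = blockno / sc;   /* which stripe (row) */
        uint64_t stripepos = blockno % sc;  /* which object column in the set */
        uint64_t objsetno = stripeno / stripes_per_object;
        uint64_t blockoff = off % su;

        *objno = objsetno * sc + stripepos;
        *objoff = (stripeno % stripes_per_object) * su + blockoff;
        *xlen = (uint32_t)(len < su - blockoff ? len : su - blockoff);
    }

    int main(void)
    {
        uint64_t objno, objoff;
        uint32_t xlen;

        /* 64K stripe unit, 4 objects per set, 4M objects */
        map_offset(5 * 65536 + 100, 200000, 65536, 4, 4 << 20,
                   &objno, &objoff, &xlen);
        printf("objno=%llu objoff=%llu xlen=%u\n",
               (unsigned long long)objno, (unsigned long long)objoff, xlen);
        return 0;
    }

With stripe_unit == object_size and stripe_count == 1 this degenerates to one contiguous chunk per object, which is why the removed compatibility check could get away with rejecting everything else.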
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 174f5709e508..a699e320393f 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@
6obj-$(CONFIG_CEPH_FS) += ceph.o 6obj-$(CONFIG_CEPH_FS) += ceph.o
7 7
8ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 8ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
9 export.o caps.o snap.o xattr.o \ 9 export.o caps.o snap.o xattr.o quota.o \
10 mds_client.o mdsmap.o strings.o ceph_frag.o \ 10 mds_client.o mdsmap.o strings.o ceph_frag.o \
11 debugfs.o 11 debugfs.o
12 12
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index b4336b42ce3b..5f7ad3d0df2e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -15,6 +15,7 @@
15#include "mds_client.h" 15#include "mds_client.h"
16#include "cache.h" 16#include "cache.h"
17#include <linux/ceph/osd_client.h> 17#include <linux/ceph/osd_client.h>
18#include <linux/ceph/striper.h>
18 19
19/* 20/*
20 * Ceph address space ops. 21 * Ceph address space ops.
@@ -438,7 +439,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
438{ 439{
439 struct inode *inode = file_inode(file); 440 struct inode *inode = file_inode(file);
440 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 441 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
441 struct ceph_file_info *ci = file->private_data; 442 struct ceph_file_info *fi = file->private_data;
442 struct ceph_rw_context *rw_ctx; 443 struct ceph_rw_context *rw_ctx;
443 int rc = 0; 444 int rc = 0;
444 int max = 0; 445 int max = 0;
@@ -452,7 +453,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
452 if (rc == 0) 453 if (rc == 0)
453 goto out; 454 goto out;
454 455
455 rw_ctx = ceph_find_rw_context(ci); 456 rw_ctx = ceph_find_rw_context(fi);
456 max = fsc->mount_options->rsize >> PAGE_SHIFT; 457 max = fsc->mount_options->rsize >> PAGE_SHIFT;
457 dout("readpages %p file %p ctx %p nr_pages %d max %d\n", 458 dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
458 inode, file, rw_ctx, nr_pages, max); 459 inode, file, rw_ctx, nr_pages, max);
@@ -800,7 +801,7 @@ static int ceph_writepages_start(struct address_space *mapping,
800 struct ceph_osd_request *req = NULL; 801 struct ceph_osd_request *req = NULL;
801 struct ceph_writeback_ctl ceph_wbc; 802 struct ceph_writeback_ctl ceph_wbc;
802 bool should_loop, range_whole = false; 803 bool should_loop, range_whole = false;
803 bool stop, done = false; 804 bool done = false;
804 805
805 dout("writepages_start %p (mode=%s)\n", inode, 806 dout("writepages_start %p (mode=%s)\n", inode,
806 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 807 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
@@ -856,7 +857,7 @@ retry:
856 * in that range can be associated with newer snapc. 857 * in that range can be associated with newer snapc.
857 * They are not writeable until we write all dirty pages 858 * They are not writeable until we write all dirty pages
858 * associated with 'snapc' get written */ 859 * associated with 'snapc' get written */
859 if (index > 0 || wbc->sync_mode != WB_SYNC_NONE) 860 if (index > 0)
860 should_loop = true; 861 should_loop = true;
861 dout(" non-head snapc, range whole\n"); 862 dout(" non-head snapc, range whole\n");
862 } 863 }
@@ -864,8 +865,7 @@ retry:
864 ceph_put_snap_context(last_snapc); 865 ceph_put_snap_context(last_snapc);
865 last_snapc = snapc; 866 last_snapc = snapc;
866 867
867 stop = false; 868 while (!done && index <= end) {
868 while (!stop && index <= end) {
869 int num_ops = 0, op_idx; 869 int num_ops = 0, op_idx;
870 unsigned i, pvec_pages, max_pages, locked_pages = 0; 870 unsigned i, pvec_pages, max_pages, locked_pages = 0;
871 struct page **pages = NULL, **data_pages; 871 struct page **pages = NULL, **data_pages;
@@ -898,16 +898,30 @@ get_more_pages:
898 unlock_page(page); 898 unlock_page(page);
899 continue; 899 continue;
900 } 900 }
901 if (strip_unit_end && (page->index > strip_unit_end)) { 901 /* only if matching snap context */
902 dout("end of strip unit %p\n", page); 902 pgsnapc = page_snap_context(page);
903 if (pgsnapc != snapc) {
904 dout("page snapc %p %lld != oldest %p %lld\n",
905 pgsnapc, pgsnapc->seq, snapc, snapc->seq);
906 if (!should_loop &&
907 !ceph_wbc.head_snapc &&
908 wbc->sync_mode != WB_SYNC_NONE)
909 should_loop = true;
903 unlock_page(page); 910 unlock_page(page);
904 break; 911 continue;
905 } 912 }
906 if (page_offset(page) >= ceph_wbc.i_size) { 913 if (page_offset(page) >= ceph_wbc.i_size) {
907 dout("%p page eof %llu\n", 914 dout("%p page eof %llu\n",
908 page, ceph_wbc.i_size); 915 page, ceph_wbc.i_size);
909 /* not done if range_cyclic */ 916 if (ceph_wbc.size_stable ||
910 stop = true; 917 page_offset(page) >= i_size_read(inode))
918 mapping->a_ops->invalidatepage(page,
919 0, PAGE_SIZE);
920 unlock_page(page);
921 continue;
922 }
923 if (strip_unit_end && (page->index > strip_unit_end)) {
924 dout("end of strip unit %p\n", page);
911 unlock_page(page); 925 unlock_page(page);
912 break; 926 break;
913 } 927 }
@@ -921,15 +935,6 @@ get_more_pages:
921 wait_on_page_writeback(page); 935 wait_on_page_writeback(page);
922 } 936 }
923 937
924 /* only if matching snap context */
925 pgsnapc = page_snap_context(page);
926 if (pgsnapc != snapc) {
927 dout("page snapc %p %lld != oldest %p %lld\n",
928 pgsnapc, pgsnapc->seq, snapc, snapc->seq);
929 unlock_page(page);
930 continue;
931 }
932
933 if (!clear_page_dirty_for_io(page)) { 938 if (!clear_page_dirty_for_io(page)) {
934 dout("%p !clear_page_dirty_for_io\n", page); 939 dout("%p !clear_page_dirty_for_io\n", page);
935 unlock_page(page); 940 unlock_page(page);
@@ -945,19 +950,15 @@ get_more_pages:
945 if (locked_pages == 0) { 950 if (locked_pages == 0) {
946 u64 objnum; 951 u64 objnum;
947 u64 objoff; 952 u64 objoff;
953 u32 xlen;
948 954
949 /* prepare async write request */ 955 /* prepare async write request */
950 offset = (u64)page_offset(page); 956 offset = (u64)page_offset(page);
951 len = wsize; 957 ceph_calc_file_object_mapping(&ci->i_layout,
952 958 offset, wsize,
953 rc = ceph_calc_file_object_mapping(&ci->i_layout, 959 &objnum, &objoff,
954 offset, len, 960 &xlen);
955 &objnum, &objoff, 961 len = xlen;
956 &len);
957 if (rc < 0) {
958 unlock_page(page);
959 break;
960 }
961 962
962 num_ops = 1; 963 num_ops = 1;
963 strip_unit_end = page->index + 964 strip_unit_end = page->index +
@@ -1146,7 +1147,7 @@ new_request:
1146 * we tagged for writeback prior to entering this loop. 1147 * we tagged for writeback prior to entering this loop.
1147 */ 1148 */
1148 if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) 1149 if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
1149 done = stop = true; 1150 done = true;
1150 1151
1151release_pvec_pages: 1152release_pvec_pages:
1152 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, 1153 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 33a211b364ed..bb524c880b1e 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -51,7 +51,7 @@ static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
51 .type = FSCACHE_COOKIE_TYPE_INDEX, 51 .type = FSCACHE_COOKIE_TYPE_INDEX,
52}; 52};
53 53
54int ceph_fscache_register(void) 54int __init ceph_fscache_register(void)
55{ 55{
56 return fscache_register_netfs(&ceph_cache_netfs); 56 return fscache_register_netfs(&ceph_cache_netfs);
57} 57}
@@ -135,7 +135,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
135 if (memcmp(data, &aux, sizeof(aux)) != 0) 135 if (memcmp(data, &aux, sizeof(aux)) != 0)
136 return FSCACHE_CHECKAUX_OBSOLETE; 136 return FSCACHE_CHECKAUX_OBSOLETE;
137 137
138 dout("ceph inode 0x%p cached okay", ci); 138 dout("ceph inode 0x%p cached okay\n", ci);
139 return FSCACHE_CHECKAUX_OKAY; 139 return FSCACHE_CHECKAUX_OKAY;
140} 140}
141 141
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0e5bd3e3344e..23dbfae16156 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -184,36 +184,54 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
184 mdsc->caps_avail_count); 184 mdsc->caps_avail_count);
185 spin_unlock(&mdsc->caps_list_lock); 185 spin_unlock(&mdsc->caps_list_lock);
186 186
187 for (i = have; i < need; i++) { 187 for (i = have; i < need; ) {
188retry:
189 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 188 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
190 if (!cap) { 189 if (cap) {
191 if (!trimmed) { 190 list_add(&cap->caps_item, &newcaps);
192 for (j = 0; j < mdsc->max_sessions; j++) { 191 alloc++;
193 s = __ceph_lookup_mds_session(mdsc, j); 192 i++;
194 if (!s) 193 continue;
195 continue; 194 }
196 mutex_unlock(&mdsc->mutex);
197 195
198 mutex_lock(&s->s_mutex); 196 if (!trimmed) {
199 max_caps = s->s_nr_caps - (need - i); 197 for (j = 0; j < mdsc->max_sessions; j++) {
200 ceph_trim_caps(mdsc, s, max_caps); 198 s = __ceph_lookup_mds_session(mdsc, j);
201 mutex_unlock(&s->s_mutex); 199 if (!s)
200 continue;
201 mutex_unlock(&mdsc->mutex);
202 202
203 ceph_put_mds_session(s); 203 mutex_lock(&s->s_mutex);
204 mutex_lock(&mdsc->mutex); 204 max_caps = s->s_nr_caps - (need - i);
205 } 205 ceph_trim_caps(mdsc, s, max_caps);
206 trimmed = true; 206 mutex_unlock(&s->s_mutex);
207 goto retry; 207
208 } else { 208 ceph_put_mds_session(s);
209 pr_warn("reserve caps ctx=%p ENOMEM " 209 mutex_lock(&mdsc->mutex);
210 "need=%d got=%d\n",
211 ctx, need, have + alloc);
212 goto out_nomem;
213 } 210 }
211 trimmed = true;
212
213 spin_lock(&mdsc->caps_list_lock);
214 if (mdsc->caps_avail_count) {
215 int more_have;
216 if (mdsc->caps_avail_count >= need - i)
217 more_have = need - i;
218 else
219 more_have = mdsc->caps_avail_count;
220
221 i += more_have;
222 have += more_have;
223 mdsc->caps_avail_count -= more_have;
224 mdsc->caps_reserve_count += more_have;
225
226 }
227 spin_unlock(&mdsc->caps_list_lock);
228
229 continue;
214 } 230 }
215 list_add(&cap->caps_item, &newcaps); 231
216 alloc++; 232 pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
233 ctx, need, have + alloc);
234 goto out_nomem;
217 } 235 }
218 BUG_ON(have + alloc != need); 236 BUG_ON(have + alloc != need);
219 237
@@ -234,16 +252,28 @@ retry:
234 return 0; 252 return 0;
235 253
236out_nomem: 254out_nomem:
255
256 spin_lock(&mdsc->caps_list_lock);
257 mdsc->caps_avail_count += have;
258 mdsc->caps_reserve_count -= have;
259
237 while (!list_empty(&newcaps)) { 260 while (!list_empty(&newcaps)) {
238 cap = list_first_entry(&newcaps, 261 cap = list_first_entry(&newcaps,
239 struct ceph_cap, caps_item); 262 struct ceph_cap, caps_item);
240 list_del(&cap->caps_item); 263 list_del(&cap->caps_item);
241 kmem_cache_free(ceph_cap_cachep, cap); 264
265 /* Keep some preallocated caps around (ceph_min_count), to
266 * avoid lots of free/alloc churn. */
267 if (mdsc->caps_avail_count >=
268 mdsc->caps_reserve_count + mdsc->caps_min_count) {
269 kmem_cache_free(ceph_cap_cachep, cap);
270 } else {
271 mdsc->caps_avail_count++;
272 mdsc->caps_total_count++;
273 list_add(&cap->caps_item, &mdsc->caps_list);
274 }
242 } 275 }
243 276
244 spin_lock(&mdsc->caps_list_lock);
245 mdsc->caps_avail_count += have;
246 mdsc->caps_reserve_count -= have;
247 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 277 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
248 mdsc->caps_reserve_count + 278 mdsc->caps_reserve_count +
249 mdsc->caps_avail_count); 279 mdsc->caps_avail_count);
@@ -254,12 +284,26 @@ out_nomem:
254int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 284int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
255 struct ceph_cap_reservation *ctx) 285 struct ceph_cap_reservation *ctx)
256{ 286{
287 int i;
288 struct ceph_cap *cap;
289
257 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); 290 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
258 if (ctx->count) { 291 if (ctx->count) {
259 spin_lock(&mdsc->caps_list_lock); 292 spin_lock(&mdsc->caps_list_lock);
260 BUG_ON(mdsc->caps_reserve_count < ctx->count); 293 BUG_ON(mdsc->caps_reserve_count < ctx->count);
261 mdsc->caps_reserve_count -= ctx->count; 294 mdsc->caps_reserve_count -= ctx->count;
262 mdsc->caps_avail_count += ctx->count; 295 if (mdsc->caps_avail_count >=
296 mdsc->caps_reserve_count + mdsc->caps_min_count) {
297 mdsc->caps_total_count -= ctx->count;
298 for (i = 0; i < ctx->count; i++) {
299 cap = list_first_entry(&mdsc->caps_list,
300 struct ceph_cap, caps_item);
301 list_del(&cap->caps_item);
302 kmem_cache_free(ceph_cap_cachep, cap);
303 }
304 } else {
305 mdsc->caps_avail_count += ctx->count;
306 }
263 ctx->count = 0; 307 ctx->count = 0;
264 dout("unreserve caps %d = %d used + %d resv + %d avail\n", 308 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
265 mdsc->caps_total_count, mdsc->caps_use_count, 309 mdsc->caps_total_count, mdsc->caps_use_count,
@@ -285,7 +329,23 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
285 mdsc->caps_use_count++; 329 mdsc->caps_use_count++;
286 mdsc->caps_total_count++; 330 mdsc->caps_total_count++;
287 spin_unlock(&mdsc->caps_list_lock); 331 spin_unlock(&mdsc->caps_list_lock);
332 } else {
333 spin_lock(&mdsc->caps_list_lock);
334 if (mdsc->caps_avail_count) {
335 BUG_ON(list_empty(&mdsc->caps_list));
336
337 mdsc->caps_avail_count--;
338 mdsc->caps_use_count++;
339 cap = list_first_entry(&mdsc->caps_list,
340 struct ceph_cap, caps_item);
341 list_del(&cap->caps_item);
342
343 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
344 mdsc->caps_reserve_count + mdsc->caps_avail_count);
345 }
346 spin_unlock(&mdsc->caps_list_lock);
288 } 347 }
348
289 return cap; 349 return cap;
290 } 350 }
291 351
@@ -341,6 +401,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
341{ 401{
342 struct ceph_mds_client *mdsc = fsc->mdsc; 402 struct ceph_mds_client *mdsc = fsc->mdsc;
343 403
404 spin_lock(&mdsc->caps_list_lock);
405
344 if (total) 406 if (total)
345 *total = mdsc->caps_total_count; 407 *total = mdsc->caps_total_count;
346 if (avail) 408 if (avail)
@@ -351,6 +413,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
351 *reserved = mdsc->caps_reserve_count; 413 *reserved = mdsc->caps_reserve_count;
352 if (min) 414 if (min)
353 *min = mdsc->caps_min_count; 415 *min = mdsc->caps_min_count;
416
417 spin_unlock(&mdsc->caps_list_lock);
354} 418}
355 419
356/* 420/*
@@ -639,9 +703,11 @@ void ceph_add_cap(struct inode *inode,
639 } 703 }
640 704
641 spin_lock(&realm->inodes_with_caps_lock); 705 spin_lock(&realm->inodes_with_caps_lock);
642 ci->i_snap_realm = realm;
643 list_add(&ci->i_snap_realm_item, 706 list_add(&ci->i_snap_realm_item,
644 &realm->inodes_with_caps); 707 &realm->inodes_with_caps);
708 ci->i_snap_realm = realm;
709 if (realm->ino == ci->i_vino.ino)
710 realm->inode = inode;
645 spin_unlock(&realm->inodes_with_caps_lock); 711 spin_unlock(&realm->inodes_with_caps_lock);
646 712
647 if (oldrealm) 713 if (oldrealm)
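
The ceph_reserve_caps() rewrite above restructures the ENOMEM path: on allocation failure it first trims MDS sessions, then satisfies the remainder from the shared caps_avail pool, and only reports failure once both run dry; ceph_unreserve_caps() now keeps up to caps_min_count caps cached rather than freeing everything. The bookkeeping invariant, asserted by the BUG_ON()s, is total == use + reserve + avail. Below is a toy ledger that mimics that accounting; it folds the trim step away and the names are made up:

    #include <assert.h>
    #include <stdio.h>

    struct ledger { int total, use, reserve, avail, min; };

    static void check(struct ledger *l)
    {
        assert(l->total == l->use + l->reserve + l->avail);
    }

    /* Reserve n caps: take from avail first, "allocate" the rest. */
    static void reserve(struct ledger *l, int n)
    {
        int from_avail = n < l->avail ? n : l->avail;

        l->avail -= from_avail;
        l->total += n - from_avail;   /* freshly allocated caps */
        l->reserve += n;
        check(l);
    }

    /* Unreserve n caps: keep up to min cached, free the surplus. */
    static void unreserve(struct ledger *l, int n)
    {
        l->reserve -= n;
        l->avail += n;
        if (l->avail > l->reserve + l->min) {
            int surplus = l->avail - (l->reserve + l->min);
            l->avail -= surplus;
            l->total -= surplus;      /* freed back to the allocator */
        }
        check(l);
    }

    int main(void)
    {
        struct ledger l = { .total = 0, .min = 2 };

        reserve(&l, 5);
        unreserve(&l, 5);
        printf("total=%d use=%d reserve=%d avail=%d\n",
               l.total, l.use, l.reserve, l.avail);
        return 0;
    }

The point of the min-count floor, as the new comment says, is to avoid free/alloc churn when reservations come and go in bursts.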
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 644def813754..abdf98deeec4 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -260,7 +260,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
260 goto out; 260 goto out;
261 261
262 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", 262 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
263 0600, 263 0400,
264 fsc->client->debugfs_dir, 264 fsc->client->debugfs_dir,
265 fsc, 265 fsc,
266 &mdsmap_show_fops); 266 &mdsmap_show_fops);
@@ -268,7 +268,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
268 goto out; 268 goto out;
269 269
270 fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions", 270 fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
271 0600, 271 0400,
272 fsc->client->debugfs_dir, 272 fsc->client->debugfs_dir,
273 fsc, 273 fsc,
274 &mds_sessions_show_fops); 274 &mds_sessions_show_fops);
@@ -276,7 +276,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
276 goto out; 276 goto out;
277 277
278 fsc->debugfs_mdsc = debugfs_create_file("mdsc", 278 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
279 0600, 279 0400,
280 fsc->client->debugfs_dir, 280 fsc->client->debugfs_dir,
281 fsc, 281 fsc,
282 &mdsc_show_fops); 282 &mdsc_show_fops);
@@ -292,7 +292,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
292 goto out; 292 goto out;
293 293
294 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 294 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
295 0600, 295 0400,
296 fsc->client->debugfs_dir, 296 fsc->client->debugfs_dir,
297 fsc, 297 fsc,
298 &dentry_lru_show_fops); 298 &dentry_lru_show_fops);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 2bdd561c4c68..1a78dd6f8bf2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -101,18 +101,18 @@ static int fpos_cmp(loff_t l, loff_t r)
101 * regardless of what dir changes take place on the 101 * regardless of what dir changes take place on the
102 * server. 102 * server.
103 */ 103 */
104static int note_last_dentry(struct ceph_file_info *fi, const char *name, 104static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name,
105 int len, unsigned next_offset) 105 int len, unsigned next_offset)
106{ 106{
107 char *buf = kmalloc(len+1, GFP_KERNEL); 107 char *buf = kmalloc(len+1, GFP_KERNEL);
108 if (!buf) 108 if (!buf)
109 return -ENOMEM; 109 return -ENOMEM;
110 kfree(fi->last_name); 110 kfree(dfi->last_name);
111 fi->last_name = buf; 111 dfi->last_name = buf;
112 memcpy(fi->last_name, name, len); 112 memcpy(dfi->last_name, name, len);
113 fi->last_name[len] = 0; 113 dfi->last_name[len] = 0;
114 fi->next_offset = next_offset; 114 dfi->next_offset = next_offset;
115 dout("note_last_dentry '%s'\n", fi->last_name); 115 dout("note_last_dentry '%s'\n", dfi->last_name);
116 return 0; 116 return 0;
117} 117}
118 118
@@ -174,7 +174,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
174static int __dcache_readdir(struct file *file, struct dir_context *ctx, 174static int __dcache_readdir(struct file *file, struct dir_context *ctx,
175 int shared_gen) 175 int shared_gen)
176{ 176{
177 struct ceph_file_info *fi = file->private_data; 177 struct ceph_dir_file_info *dfi = file->private_data;
178 struct dentry *parent = file->f_path.dentry; 178 struct dentry *parent = file->f_path.dentry;
179 struct inode *dir = d_inode(parent); 179 struct inode *dir = d_inode(parent);
180 struct dentry *dentry, *last = NULL; 180 struct dentry *dentry, *last = NULL;
@@ -221,7 +221,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
221 bool emit_dentry = false; 221 bool emit_dentry = false;
222 dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); 222 dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
223 if (!dentry) { 223 if (!dentry) {
224 fi->flags |= CEPH_F_ATEND; 224 dfi->file_info.flags |= CEPH_F_ATEND;
225 err = 0; 225 err = 0;
226 break; 226 break;
227 } 227 }
@@ -272,33 +272,33 @@ out:
272 if (last) { 272 if (last) {
273 int ret; 273 int ret;
274 di = ceph_dentry(last); 274 di = ceph_dentry(last);
275 ret = note_last_dentry(fi, last->d_name.name, last->d_name.len, 275 ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len,
276 fpos_off(di->offset) + 1); 276 fpos_off(di->offset) + 1);
277 if (ret < 0) 277 if (ret < 0)
278 err = ret; 278 err = ret;
279 dput(last); 279 dput(last);
280 /* last_name no longer match cache index */ 280 /* last_name no longer match cache index */
281 if (fi->readdir_cache_idx >= 0) { 281 if (dfi->readdir_cache_idx >= 0) {
282 fi->readdir_cache_idx = -1; 282 dfi->readdir_cache_idx = -1;
283 fi->dir_release_count = 0; 283 dfi->dir_release_count = 0;
284 } 284 }
285 } 285 }
286 return err; 286 return err;
287} 287}
288 288
289static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) 289static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos)
290{ 290{
291 if (!fi->last_readdir) 291 if (!dfi->last_readdir)
292 return true; 292 return true;
293 if (is_hash_order(pos)) 293 if (is_hash_order(pos))
294 return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); 294 return !ceph_frag_contains_value(dfi->frag, fpos_hash(pos));
295 else 295 else
296 return fi->frag != fpos_frag(pos); 296 return dfi->frag != fpos_frag(pos);
297} 297}
298 298
299static int ceph_readdir(struct file *file, struct dir_context *ctx) 299static int ceph_readdir(struct file *file, struct dir_context *ctx)
300{ 300{
301 struct ceph_file_info *fi = file->private_data; 301 struct ceph_dir_file_info *dfi = file->private_data;
302 struct inode *inode = file_inode(file); 302 struct inode *inode = file_inode(file);
303 struct ceph_inode_info *ci = ceph_inode(inode); 303 struct ceph_inode_info *ci = ceph_inode(inode);
304 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 304 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -309,7 +309,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
309 struct ceph_mds_reply_info_parsed *rinfo; 309 struct ceph_mds_reply_info_parsed *rinfo;
310 310
311 dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); 311 dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
312 if (fi->flags & CEPH_F_ATEND) 312 if (dfi->file_info.flags & CEPH_F_ATEND)
313 return 0; 313 return 0;
314 314
315 /* always start with . and .. */ 315 /* always start with . and .. */
@@ -350,15 +350,15 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
350 /* proceed with a normal readdir */ 350 /* proceed with a normal readdir */
351more: 351more:
352 /* do we have the correct frag content buffered? */ 352 /* do we have the correct frag content buffered? */
353 if (need_send_readdir(fi, ctx->pos)) { 353 if (need_send_readdir(dfi, ctx->pos)) {
354 struct ceph_mds_request *req; 354 struct ceph_mds_request *req;
355 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 355 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
356 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 356 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
357 357
358 /* discard old result, if any */ 358 /* discard old result, if any */
359 if (fi->last_readdir) { 359 if (dfi->last_readdir) {
360 ceph_mdsc_put_request(fi->last_readdir); 360 ceph_mdsc_put_request(dfi->last_readdir);
361 fi->last_readdir = NULL; 361 dfi->last_readdir = NULL;
362 } 362 }
363 363
364 if (is_hash_order(ctx->pos)) { 364 if (is_hash_order(ctx->pos)) {
@@ -372,7 +372,7 @@ more:
372 } 372 }
373 373
374 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 374 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
375 ceph_vinop(inode), frag, fi->last_name); 375 ceph_vinop(inode), frag, dfi->last_name);
376 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 376 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
377 if (IS_ERR(req)) 377 if (IS_ERR(req))
378 return PTR_ERR(req); 378 return PTR_ERR(req);
@@ -388,8 +388,8 @@ more:
388 __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); 388 __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
389 req->r_inode_drop = CEPH_CAP_FILE_EXCL; 389 req->r_inode_drop = CEPH_CAP_FILE_EXCL;
390 } 390 }
391 if (fi->last_name) { 391 if (dfi->last_name) {
392 req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); 392 req->r_path2 = kstrdup(dfi->last_name, GFP_KERNEL);
393 if (!req->r_path2) { 393 if (!req->r_path2) {
394 ceph_mdsc_put_request(req); 394 ceph_mdsc_put_request(req);
395 return -ENOMEM; 395 return -ENOMEM;
@@ -399,10 +399,10 @@ more:
399 cpu_to_le32(fpos_hash(ctx->pos)); 399 cpu_to_le32(fpos_hash(ctx->pos));
400 } 400 }
401 401
402 req->r_dir_release_cnt = fi->dir_release_count; 402 req->r_dir_release_cnt = dfi->dir_release_count;
403 req->r_dir_ordered_cnt = fi->dir_ordered_count; 403 req->r_dir_ordered_cnt = dfi->dir_ordered_count;
404 req->r_readdir_cache_idx = fi->readdir_cache_idx; 404 req->r_readdir_cache_idx = dfi->readdir_cache_idx;
405 req->r_readdir_offset = fi->next_offset; 405 req->r_readdir_offset = dfi->next_offset;
406 req->r_args.readdir.frag = cpu_to_le32(frag); 406 req->r_args.readdir.frag = cpu_to_le32(frag);
407 req->r_args.readdir.flags = 407 req->r_args.readdir.flags =
408 cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); 408 cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
@@ -426,35 +426,35 @@ more:
426 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { 426 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
427 frag = le32_to_cpu(rinfo->dir_dir->frag); 427 frag = le32_to_cpu(rinfo->dir_dir->frag);
428 if (!rinfo->hash_order) { 428 if (!rinfo->hash_order) {
429 fi->next_offset = req->r_readdir_offset; 429 dfi->next_offset = req->r_readdir_offset;
430 /* adjust ctx->pos to beginning of frag */ 430 /* adjust ctx->pos to beginning of frag */
431 ctx->pos = ceph_make_fpos(frag, 431 ctx->pos = ceph_make_fpos(frag,
432 fi->next_offset, 432 dfi->next_offset,
433 false); 433 false);
434 } 434 }
435 } 435 }
436 436
437 fi->frag = frag; 437 dfi->frag = frag;
438 fi->last_readdir = req; 438 dfi->last_readdir = req;
439 439
440 if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) { 440 if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
441 fi->readdir_cache_idx = req->r_readdir_cache_idx; 441 dfi->readdir_cache_idx = req->r_readdir_cache_idx;
442 if (fi->readdir_cache_idx < 0) { 442 if (dfi->readdir_cache_idx < 0) {
443 /* preclude from marking dir ordered */ 443 /* preclude from marking dir ordered */
444 fi->dir_ordered_count = 0; 444 dfi->dir_ordered_count = 0;
445 } else if (ceph_frag_is_leftmost(frag) && 445 } else if (ceph_frag_is_leftmost(frag) &&
446 fi->next_offset == 2) { 446 dfi->next_offset == 2) {
447 /* note dir version at start of readdir so 447 /* note dir version at start of readdir so
448 * we can tell if any dentries get dropped */ 448 * we can tell if any dentries get dropped */
449 fi->dir_release_count = req->r_dir_release_cnt; 449 dfi->dir_release_count = req->r_dir_release_cnt;
450 fi->dir_ordered_count = req->r_dir_ordered_cnt; 450 dfi->dir_ordered_count = req->r_dir_ordered_cnt;
451 } 451 }
452 } else { 452 } else {
453 dout("readdir !did_prepopulate"); 453 dout("readdir !did_prepopulate\n");
454 /* disable readdir cache */ 454 /* disable readdir cache */
455 fi->readdir_cache_idx = -1; 455 dfi->readdir_cache_idx = -1;
456 /* preclude from marking dir complete */ 456 /* preclude from marking dir complete */
457 fi->dir_release_count = 0; 457 dfi->dir_release_count = 0;
458 } 458 }
459 459
460 /* note next offset and last dentry name */ 460 /* note next offset and last dentry name */
@@ -463,19 +463,19 @@ more:
463 rinfo->dir_entries + (rinfo->dir_nr-1); 463 rinfo->dir_entries + (rinfo->dir_nr-1);
464 unsigned next_offset = req->r_reply_info.dir_end ? 464 unsigned next_offset = req->r_reply_info.dir_end ?
465 2 : (fpos_off(rde->offset) + 1); 465 2 : (fpos_off(rde->offset) + 1);
466 err = note_last_dentry(fi, rde->name, rde->name_len, 466 err = note_last_dentry(dfi, rde->name, rde->name_len,
467 next_offset); 467 next_offset);
468 if (err) 468 if (err)
469 return err; 469 return err;
470 } else if (req->r_reply_info.dir_end) { 470 } else if (req->r_reply_info.dir_end) {
471 fi->next_offset = 2; 471 dfi->next_offset = 2;
472 /* keep last name */ 472 /* keep last name */
473 } 473 }
474 } 474 }
475 475
476 rinfo = &fi->last_readdir->r_reply_info; 476 rinfo = &dfi->last_readdir->r_reply_info;
477 dout("readdir frag %x num %d pos %llx chunk first %llx\n", 477 dout("readdir frag %x num %d pos %llx chunk first %llx\n",
478 fi->frag, rinfo->dir_nr, ctx->pos, 478 dfi->frag, rinfo->dir_nr, ctx->pos,
479 rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); 479 rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
480 480
481 i = 0; 481 i = 0;
@@ -519,52 +519,55 @@ more:
519 ctx->pos++; 519 ctx->pos++;
520 } 520 }
521 521
522 ceph_mdsc_put_request(fi->last_readdir); 522 ceph_mdsc_put_request(dfi->last_readdir);
523 fi->last_readdir = NULL; 523 dfi->last_readdir = NULL;
524 524
525 if (fi->next_offset > 2) { 525 if (dfi->next_offset > 2) {
526 frag = fi->frag; 526 frag = dfi->frag;
527 goto more; 527 goto more;
528 } 528 }
529 529
530 /* more frags? */ 530 /* more frags? */
531 if (!ceph_frag_is_rightmost(fi->frag)) { 531 if (!ceph_frag_is_rightmost(dfi->frag)) {
532 frag = ceph_frag_next(fi->frag); 532 frag = ceph_frag_next(dfi->frag);
533 if (is_hash_order(ctx->pos)) { 533 if (is_hash_order(ctx->pos)) {
534 loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), 534 loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
535 fi->next_offset, true); 535 dfi->next_offset, true);
536 if (new_pos > ctx->pos) 536 if (new_pos > ctx->pos)
537 ctx->pos = new_pos; 537 ctx->pos = new_pos;
538 /* keep last_name */ 538 /* keep last_name */
539 } else { 539 } else {
540 ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); 540 ctx->pos = ceph_make_fpos(frag, dfi->next_offset,
541 kfree(fi->last_name); 541 false);
542 fi->last_name = NULL; 542 kfree(dfi->last_name);
543 dfi->last_name = NULL;
543 } 544 }
544 dout("readdir next frag is %x\n", frag); 545 dout("readdir next frag is %x\n", frag);
545 goto more; 546 goto more;
546 } 547 }
547 fi->flags |= CEPH_F_ATEND; 548 dfi->file_info.flags |= CEPH_F_ATEND;
548 549
549 /* 550 /*
550 * if dir_release_count still matches the dir, no dentries 551 * if dir_release_count still matches the dir, no dentries
551 * were released during the whole readdir, and we should have 552 * were released during the whole readdir, and we should have
552 * the complete dir contents in our cache. 553 * the complete dir contents in our cache.
553 */ 554 */
554 if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) { 555 if (atomic64_read(&ci->i_release_count) ==
556 dfi->dir_release_count) {
555 spin_lock(&ci->i_ceph_lock); 557 spin_lock(&ci->i_ceph_lock);
556 if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { 558 if (dfi->dir_ordered_count ==
559 atomic64_read(&ci->i_ordered_count)) {
557 dout(" marking %p complete and ordered\n", inode); 560 dout(" marking %p complete and ordered\n", inode);
558 /* use i_size to track number of entries in 561 /* use i_size to track number of entries in
559 * readdir cache */ 562 * readdir cache */
560 BUG_ON(fi->readdir_cache_idx < 0); 563 BUG_ON(dfi->readdir_cache_idx < 0);
561 i_size_write(inode, fi->readdir_cache_idx * 564 i_size_write(inode, dfi->readdir_cache_idx *
562 sizeof(struct dentry*)); 565 sizeof(struct dentry*));
563 } else { 566 } else {
564 dout(" marking %p complete\n", inode); 567 dout(" marking %p complete\n", inode);
565 } 568 }
566 __ceph_dir_set_complete(ci, fi->dir_release_count, 569 __ceph_dir_set_complete(ci, dfi->dir_release_count,
567 fi->dir_ordered_count); 570 dfi->dir_ordered_count);
568 spin_unlock(&ci->i_ceph_lock); 571 spin_unlock(&ci->i_ceph_lock);
569 } 572 }
570 573
@@ -572,25 +575,25 @@ more:
572 return 0; 575 return 0;
573} 576}
574 577
575static void reset_readdir(struct ceph_file_info *fi) 578static void reset_readdir(struct ceph_dir_file_info *dfi)
576{ 579{
577 if (fi->last_readdir) { 580 if (dfi->last_readdir) {
578 ceph_mdsc_put_request(fi->last_readdir); 581 ceph_mdsc_put_request(dfi->last_readdir);
579 fi->last_readdir = NULL; 582 dfi->last_readdir = NULL;
580 } 583 }
581 kfree(fi->last_name); 584 kfree(dfi->last_name);
582 fi->last_name = NULL; 585 dfi->last_name = NULL;
583 fi->dir_release_count = 0; 586 dfi->dir_release_count = 0;
584 fi->readdir_cache_idx = -1; 587 dfi->readdir_cache_idx = -1;
585 fi->next_offset = 2; /* compensate for . and .. */ 588 dfi->next_offset = 2; /* compensate for . and .. */
586 fi->flags &= ~CEPH_F_ATEND; 589 dfi->file_info.flags &= ~CEPH_F_ATEND;
587} 590}
588 591
589/* 592/*
590 * discard buffered readdir content on seekdir(0), or seek to new frag, 593 * discard buffered readdir content on seekdir(0), or seek to new frag,
591 * or seek prior to current chunk 594 * or seek prior to current chunk
592 */ 595 */
593static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) 596static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos)
594{ 597{
595 struct ceph_mds_reply_info_parsed *rinfo; 598 struct ceph_mds_reply_info_parsed *rinfo;
596 loff_t chunk_offset; 599 loff_t chunk_offset;
@@ -599,10 +602,10 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
599 if (is_hash_order(new_pos)) { 602 if (is_hash_order(new_pos)) {
600 /* no need to reset last_name for a forward seek when 603 /* no need to reset last_name for a forward seek when
601 * dentries are sotred in hash order */ 604 * dentries are sotred in hash order */
602 } else if (fi->frag != fpos_frag(new_pos)) { 605 } else if (dfi->frag != fpos_frag(new_pos)) {
603 return true; 606 return true;
604 } 607 }
605 rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; 608 rinfo = dfi->last_readdir ? &dfi->last_readdir->r_reply_info : NULL;
606 if (!rinfo || !rinfo->dir_nr) 609 if (!rinfo || !rinfo->dir_nr)
607 return true; 610 return true;
608 chunk_offset = rinfo->dir_entries[0].offset; 611 chunk_offset = rinfo->dir_entries[0].offset;
@@ -612,7 +615,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
612 615
613static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) 616static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
614{ 617{
615 struct ceph_file_info *fi = file->private_data; 618 struct ceph_dir_file_info *dfi = file->private_data;
616 struct inode *inode = file->f_mapping->host; 619 struct inode *inode = file->f_mapping->host;
617 loff_t retval; 620 loff_t retval;
618 621
@@ -630,20 +633,20 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
630 } 633 }
631 634
632 if (offset >= 0) { 635 if (offset >= 0) {
633 if (need_reset_readdir(fi, offset)) { 636 if (need_reset_readdir(dfi, offset)) {
634 dout("dir_llseek dropping %p content\n", file); 637 dout("dir_llseek dropping %p content\n", file);
635 reset_readdir(fi); 638 reset_readdir(dfi);
636 } else if (is_hash_order(offset) && offset > file->f_pos) { 639 } else if (is_hash_order(offset) && offset > file->f_pos) {
637 /* for hash offset, we don't know if a forward seek 640 /* for hash offset, we don't know if a forward seek
638 * is within same frag */ 641 * is within same frag */
639 fi->dir_release_count = 0; 642 dfi->dir_release_count = 0;
640 fi->readdir_cache_idx = -1; 643 dfi->readdir_cache_idx = -1;
641 } 644 }
642 645
643 if (offset != file->f_pos) { 646 if (offset != file->f_pos) {
644 file->f_pos = offset; 647 file->f_pos = offset;
645 file->f_version = 0; 648 file->f_version = 0;
646 fi->flags &= ~CEPH_F_ATEND; 649 dfi->file_info.flags &= ~CEPH_F_ATEND;
647 } 650 }
648 retval = offset; 651 retval = offset;
649 } 652 }
@@ -824,6 +827,9 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
824 if (ceph_snap(dir) != CEPH_NOSNAP) 827 if (ceph_snap(dir) != CEPH_NOSNAP)
825 return -EROFS; 828 return -EROFS;
826 829
830 if (ceph_quota_is_max_files_exceeded(dir))
831 return -EDQUOT;
832
827 err = ceph_pre_init_acls(dir, &mode, &acls); 833 err = ceph_pre_init_acls(dir, &mode, &acls);
828 if (err < 0) 834 if (err < 0)
829 return err; 835 return err;
@@ -877,6 +883,9 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
877 if (ceph_snap(dir) != CEPH_NOSNAP) 883 if (ceph_snap(dir) != CEPH_NOSNAP)
878 return -EROFS; 884 return -EROFS;
879 885
886 if (ceph_quota_is_max_files_exceeded(dir))
887 return -EDQUOT;
888
880 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); 889 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
881 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); 890 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
882 if (IS_ERR(req)) { 891 if (IS_ERR(req)) {
@@ -926,6 +935,12 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
926 goto out; 935 goto out;
927 } 936 }
928 937
938 if (op == CEPH_MDS_OP_MKDIR &&
939 ceph_quota_is_max_files_exceeded(dir)) {
940 err = -EDQUOT;
941 goto out;
942 }
943
929 mode |= S_IFDIR; 944 mode |= S_IFDIR;
930 err = ceph_pre_init_acls(dir, &mode, &acls); 945 err = ceph_pre_init_acls(dir, &mode, &acls);
931 if (err < 0) 946 if (err < 0)
@@ -1065,6 +1080,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
1065 else 1080 else
1066 return -EROFS; 1081 return -EROFS;
1067 } 1082 }
1083 /* don't allow cross-quota renames */
1084 if ((old_dir != new_dir) &&
1085 (!ceph_quota_is_same_realm(old_dir, new_dir)))
1086 return -EXDEV;
1087
1068 dout("rename dir %p dentry %p to dir %p dentry %p\n", 1088 dout("rename dir %p dentry %p to dir %p dentry %p\n",
1069 old_dir, old_dentry, new_dir, new_dentry); 1089 old_dir, old_dentry, new_dir, new_dentry);
1070 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 1090 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1351,7 +1371,7 @@ static void ceph_d_prune(struct dentry *dentry)
1351static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, 1371static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1352 loff_t *ppos) 1372 loff_t *ppos)
1353{ 1373{
1354 struct ceph_file_info *cf = file->private_data; 1374 struct ceph_dir_file_info *dfi = file->private_data;
1355 struct inode *inode = file_inode(file); 1375 struct inode *inode = file_inode(file);
1356 struct ceph_inode_info *ci = ceph_inode(inode); 1376 struct ceph_inode_info *ci = ceph_inode(inode);
1357 int left; 1377 int left;
@@ -1360,12 +1380,12 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1360 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1380 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1361 return -EISDIR; 1381 return -EISDIR;
1362 1382
1363 if (!cf->dir_info) { 1383 if (!dfi->dir_info) {
1364 cf->dir_info = kmalloc(bufsize, GFP_KERNEL); 1384 dfi->dir_info = kmalloc(bufsize, GFP_KERNEL);
1365 if (!cf->dir_info) 1385 if (!dfi->dir_info)
1366 return -ENOMEM; 1386 return -ENOMEM;
1367 cf->dir_info_len = 1387 dfi->dir_info_len =
1368 snprintf(cf->dir_info, bufsize, 1388 snprintf(dfi->dir_info, bufsize,
1369 "entries: %20lld\n" 1389 "entries: %20lld\n"
1370 " files: %20lld\n" 1390 " files: %20lld\n"
1371 " subdirs: %20lld\n" 1391 " subdirs: %20lld\n"
@@ -1385,10 +1405,10 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1385 (long)ci->i_rctime.tv_nsec); 1405 (long)ci->i_rctime.tv_nsec);
1386 } 1406 }
1387 1407
1388 if (*ppos >= cf->dir_info_len) 1408 if (*ppos >= dfi->dir_info_len)
1389 return 0; 1409 return 0;
1390 size = min_t(unsigned, size, cf->dir_info_len-*ppos); 1410 size = min_t(unsigned, size, dfi->dir_info_len-*ppos);
1391 left = copy_to_user(buf, cf->dir_info + *ppos, size); 1411 left = copy_to_user(buf, dfi->dir_info + *ppos, size);
1392 if (left == size) 1412 if (left == size)
1393 return -EFAULT; 1413 return -EFAULT;
1394 *ppos += (size - left); 1414 *ppos += (size - left);
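
All of the fi-to-dfi churn above is mechanical fallout from one structural change: directory-only readdir state (last_readdir, last_name, frag, the cache index and counters) moved out of ceph_file_info into a new ceph_dir_file_info that embeds the common struct as its first member, so plain files stop paying for it. Because the embedded struct sits at offset zero, file->private_data can point to either type, which is why shared code can still say dfi->file_info.flags. A minimal userspace sketch of the pattern, with an invented field subset:

    #include <stdio.h>
    #include <stdlib.h>

    /* Common per-open state, shared by files and directories. */
    struct file_info {
        int fmode;
        unsigned flags;
    };

    /* Directory opens carry extra readdir state on top. */
    struct dir_file_info {
        struct file_info file_info;   /* must stay first */
        int readdir_cache_idx;
        unsigned next_offset;
    };

    int main(void)
    {
        struct dir_file_info *dfi = calloc(1, sizeof(*dfi));
        void *private_data;

        dfi->next_offset = 2;
        dfi->readdir_cache_idx = -1;
        private_data = dfi;

        /* Code that only needs common state can treat the pointer as a
         * plain file_info, since it lives at offset 0. */
        struct file_info *fi = private_data;
        fi->fmode = 1;

        printf("fmode=%d next_offset=%u\n",
               dfi->file_info.fmode, dfi->next_offset);
        free(dfi);
        return 0;
    }

ceph_init_file_info() in file.c below allocates from ceph_dir_file_cachep or ceph_file_cachep accordingly, and ceph_release() branches on S_ISDIR() to free through the right cache.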
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b67eec3532a1..f85040d73e3d 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -30,6 +30,8 @@ static __le32 ceph_flags_sys2wire(u32 flags)
30 break; 30 break;
31 } 31 }
32 32
33 flags &= ~O_ACCMODE;
34
33#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } 35#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
34 36
35 ceph_sys2wire(O_CREAT); 37 ceph_sys2wire(O_CREAT);
@@ -41,7 +43,7 @@ static __le32 ceph_flags_sys2wire(u32 flags)
41#undef ceph_sys2wire 43#undef ceph_sys2wire
42 44
43 if (flags) 45 if (flags)
44 dout("unused open flags: %x", flags); 46 dout("unused open flags: %x\n", flags);
45 47
46 return cpu_to_le32(wire_flags); 48 return cpu_to_le32(wire_flags);
47} 49}
@@ -159,13 +161,50 @@ out:
159 return req; 161 return req;
160} 162}
161 163
164static int ceph_init_file_info(struct inode *inode, struct file *file,
165 int fmode, bool isdir)
166{
167 struct ceph_file_info *fi;
168
169 dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
170 inode->i_mode, isdir ? "dir" : "regular");
171 BUG_ON(inode->i_fop->release != ceph_release);
172
173 if (isdir) {
174 struct ceph_dir_file_info *dfi =
175 kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
176 if (!dfi) {
177 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
178 return -ENOMEM;
179 }
180
181 file->private_data = dfi;
182 fi = &dfi->file_info;
183 dfi->next_offset = 2;
184 dfi->readdir_cache_idx = -1;
185 } else {
186 fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
187 if (!fi) {
188 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
189 return -ENOMEM;
190 }
191
192 file->private_data = fi;
193 }
194
195 fi->fmode = fmode;
196 spin_lock_init(&fi->rw_contexts_lock);
197 INIT_LIST_HEAD(&fi->rw_contexts);
198
199 return 0;
200}
201
162/* 202/*
163 * initialize private struct file data. 203 * initialize private struct file data.
164 * if we fail, clean up by dropping fmode reference on the ceph_inode 204 * if we fail, clean up by dropping fmode reference on the ceph_inode
165 */ 205 */
166static int ceph_init_file(struct inode *inode, struct file *file, int fmode) 206static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
167{ 207{
168 struct ceph_file_info *cf;
169 int ret = 0; 208 int ret = 0;
170 209
171 switch (inode->i_mode & S_IFMT) { 210 switch (inode->i_mode & S_IFMT) {
@@ -173,22 +212,10 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
173 ceph_fscache_register_inode_cookie(inode); 212 ceph_fscache_register_inode_cookie(inode);
174 ceph_fscache_file_set_cookie(inode, file); 213 ceph_fscache_file_set_cookie(inode, file);
175 case S_IFDIR: 214 case S_IFDIR:
176 dout("init_file %p %p 0%o (regular)\n", inode, file, 215 ret = ceph_init_file_info(inode, file, fmode,
177 inode->i_mode); 216 S_ISDIR(inode->i_mode));
178 cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 217 if (ret)
179 if (!cf) { 218 return ret;
180 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
181 return -ENOMEM;
182 }
183 cf->fmode = fmode;
184
185 spin_lock_init(&cf->rw_contexts_lock);
186 INIT_LIST_HEAD(&cf->rw_contexts);
187
188 cf->next_offset = 2;
189 cf->readdir_cache_idx = -1;
190 file->private_data = cf;
191 BUG_ON(inode->i_fop->release != ceph_release);
192 break; 219 break;
193 220
194 case S_IFLNK: 221 case S_IFLNK:
@@ -278,11 +305,11 @@ int ceph_open(struct inode *inode, struct file *file)
278 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 305 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
279 struct ceph_mds_client *mdsc = fsc->mdsc; 306 struct ceph_mds_client *mdsc = fsc->mdsc;
280 struct ceph_mds_request *req; 307 struct ceph_mds_request *req;
281 struct ceph_file_info *cf = file->private_data; 308 struct ceph_file_info *fi = file->private_data;
282 int err; 309 int err;
283 int flags, fmode, wanted; 310 int flags, fmode, wanted;
284 311
285 if (cf) { 312 if (fi) {
286 dout("open file %p is already opened\n", file); 313 dout("open file %p is already opened\n", file);
287 return 0; 314 return 0;
288 } 315 }
@@ -375,7 +402,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
375 struct ceph_mds_request *req; 402 struct ceph_mds_request *req;
376 struct dentry *dn; 403 struct dentry *dn;
377 struct ceph_acls_info acls = {}; 404 struct ceph_acls_info acls = {};
378 int mask; 405 int mask;
379 int err; 406 int err;
380 407
381 dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", 408 dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
@@ -386,6 +413,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
386 return -ENAMETOOLONG; 413 return -ENAMETOOLONG;
387 414
388 if (flags & O_CREAT) { 415 if (flags & O_CREAT) {
416 if (ceph_quota_is_max_files_exceeded(dir))
417 return -EDQUOT;
389 err = ceph_pre_init_acls(dir, &mode, &acls); 418 err = ceph_pre_init_acls(dir, &mode, &acls);
390 if (err < 0) 419 if (err < 0)
391 return err; 420 return err;
@@ -460,16 +489,27 @@ out_acl:
460int ceph_release(struct inode *inode, struct file *file) 489int ceph_release(struct inode *inode, struct file *file)
461{ 490{
462 struct ceph_inode_info *ci = ceph_inode(inode); 491 struct ceph_inode_info *ci = ceph_inode(inode);
463 struct ceph_file_info *cf = file->private_data;
464 492
465 dout("release inode %p file %p\n", inode, file); 493 if (S_ISDIR(inode->i_mode)) {
466 ceph_put_fmode(ci, cf->fmode); 494 struct ceph_dir_file_info *dfi = file->private_data;
467 if (cf->last_readdir) 495 dout("release inode %p dir file %p\n", inode, file);
468 ceph_mdsc_put_request(cf->last_readdir); 496 WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
469 kfree(cf->last_name); 497
470 kfree(cf->dir_info); 498 ceph_put_fmode(ci, dfi->file_info.fmode);
471 WARN_ON(!list_empty(&cf->rw_contexts)); 499
472 kmem_cache_free(ceph_file_cachep, cf); 500 if (dfi->last_readdir)
501 ceph_mdsc_put_request(dfi->last_readdir);
502 kfree(dfi->last_name);
503 kfree(dfi->dir_info);
504 kmem_cache_free(ceph_dir_file_cachep, dfi);
505 } else {
506 struct ceph_file_info *fi = file->private_data;
507 dout("release inode %p regular file %p\n", inode, file);
508 WARN_ON(!list_empty(&fi->rw_contexts));
509
510 ceph_put_fmode(ci, fi->fmode);
511 kmem_cache_free(ceph_file_cachep, fi);
512 }
473 513
474 /* wake up anyone waiting for caps on this inode */ 514 /* wake up anyone waiting for caps on this inode */
475 wake_up_all(&ci->i_cap_wq); 515 wake_up_all(&ci->i_cap_wq);
@@ -1338,6 +1378,11 @@ retry_snap:
1338 1378
1339 pos = iocb->ki_pos; 1379 pos = iocb->ki_pos;
1340 count = iov_iter_count(from); 1380 count = iov_iter_count(from);
1381 if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) {
1382 err = -EDQUOT;
1383 goto out;
1384 }
1385
1341 err = file_remove_privs(file); 1386 err = file_remove_privs(file);
1342 if (err) 1387 if (err)
1343 goto out; 1388 goto out;
@@ -1419,6 +1464,7 @@ retry_snap:
1419 1464
1420 if (written >= 0) { 1465 if (written >= 0) {
1421 int dirty; 1466 int dirty;
1467
1422 spin_lock(&ci->i_ceph_lock); 1468 spin_lock(&ci->i_ceph_lock);
1423 ci->i_inline_version = CEPH_INLINE_NONE; 1469 ci->i_inline_version = CEPH_INLINE_NONE;
1424 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 1470 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
@@ -1426,6 +1472,8 @@ retry_snap:
1426 spin_unlock(&ci->i_ceph_lock); 1472 spin_unlock(&ci->i_ceph_lock);
1427 if (dirty) 1473 if (dirty)
1428 __mark_inode_dirty(inode, dirty); 1474 __mark_inode_dirty(inode, dirty);
1475 if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
1476 ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
1429 } 1477 }
1430 1478
1431 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", 1479 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -1668,6 +1716,12 @@ static long ceph_fallocate(struct file *file, int mode,
1668 goto unlock; 1716 goto unlock;
1669 } 1717 }
1670 1718
1719 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
1720 ceph_quota_is_max_bytes_exceeded(inode, offset + length)) {
1721 ret = -EDQUOT;
1722 goto unlock;
1723 }
1724
1671 if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && 1725 if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) &&
1672 !(mode & FALLOC_FL_PUNCH_HOLE)) { 1726 !(mode & FALLOC_FL_PUNCH_HOLE)) {
1673 ret = -ENOSPC; 1727 ret = -ENOSPC;
@@ -1716,6 +1770,9 @@ static long ceph_fallocate(struct file *file, int mode,
1716 spin_unlock(&ci->i_ceph_lock); 1770 spin_unlock(&ci->i_ceph_lock);
1717 if (dirty) 1771 if (dirty)
1718 __mark_inode_dirty(inode, dirty); 1772 __mark_inode_dirty(inode, dirty);
1773 if ((endoff > size) &&
1774 ceph_quota_is_max_bytes_approaching(inode, endoff))
1775 ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
1719 } 1776 }
1720 1777
1721 ceph_put_cap_refs(ci, got); 1778 ceph_put_cap_refs(ci, got);
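
file.c enforces quotas at two strengths: before a write, -EDQUOT if pos + count would cross max_bytes; after a successful write (and after a size-extending fallocate, hence the FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE exclusion), a cap check to tell the MDS that usage is merely approaching the limit so it can react before the hard failure. A toy model of the two-step policy follows; the 1/8-headroom threshold is invented for illustration, the real rule living in fs/ceph/quota.c:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool max_bytes_exceeded(uint64_t max, uint64_t new_size)
    {
        return max && new_size > max;
    }

    static bool max_bytes_approaching(uint64_t max, uint64_t new_size)
    {
        return max && new_size > max - max / 8;
    }

    int main(void)
    {
        uint64_t max = 1 << 20;                 /* 1 MiB quota */
        uint64_t pos = 900 * 1024, count = 64 * 1024;

        if (max_bytes_exceeded(max, pos + count)) {
            puts("-EDQUOT: refuse the write up front");
            return 1;
        }
        /* ... write happens ... */
        if (max_bytes_approaching(max, pos + count))
            puts("notify MDS: usage approaching max_bytes");
        return 0;
    }

A quota of zero means no limit in both helpers, matching the max && ... guards the ceph checks would need as well.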
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index c6ec5aa46100..8bf60250309e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -441,6 +441,9 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
441 atomic64_set(&ci->i_complete_seq[1], 0); 441 atomic64_set(&ci->i_complete_seq[1], 0);
442 ci->i_symlink = NULL; 442 ci->i_symlink = NULL;
443 443
444 ci->i_max_bytes = 0;
445 ci->i_max_files = 0;
446
444 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); 447 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
445 RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); 448 RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
446 449
@@ -536,6 +539,9 @@ void ceph_destroy_inode(struct inode *inode)
536 539
537 ceph_queue_caps_release(inode); 540 ceph_queue_caps_release(inode);
538 541
542 if (__ceph_has_any_quota(ci))
543 ceph_adjust_quota_realms_count(inode, false);
544
539 /* 545 /*
540 * we may still have a snap_realm reference if there are stray 546 * we may still have a snap_realm reference if there are stray
541 * caps in i_snap_caps. 547 * caps in i_snap_caps.
@@ -548,6 +554,9 @@ void ceph_destroy_inode(struct inode *inode)
548 dout(" dropping residual ref to snap realm %p\n", realm); 554 dout(" dropping residual ref to snap realm %p\n", realm);
549 spin_lock(&realm->inodes_with_caps_lock); 555 spin_lock(&realm->inodes_with_caps_lock);
550 list_del_init(&ci->i_snap_realm_item); 556 list_del_init(&ci->i_snap_realm_item);
557 ci->i_snap_realm = NULL;
558 if (realm->ino == ci->i_vino.ino)
559 realm->inode = NULL;
551 spin_unlock(&realm->inodes_with_caps_lock); 560 spin_unlock(&realm->inodes_with_caps_lock);
552 ceph_put_snap_realm(mdsc, realm); 561 ceph_put_snap_realm(mdsc, realm);
553 } 562 }
@@ -790,6 +799,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
790 inode->i_rdev = le32_to_cpu(info->rdev); 799 inode->i_rdev = le32_to_cpu(info->rdev);
791 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 800 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
792 801
802 __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
803
793 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && 804 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
794 (issued & CEPH_CAP_AUTH_EXCL) == 0) { 805 (issued & CEPH_CAP_AUTH_EXCL) == 0) {
795 inode->i_mode = le32_to_cpu(info->mode); 806 inode->i_mode = le32_to_cpu(info->mode);
@@ -1867,20 +1878,9 @@ retry:
1867 * possibly truncate them.. so write AND block! 1878 * possibly truncate them.. so write AND block!
1868 */ 1879 */
1869 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { 1880 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1870 struct ceph_cap_snap *capsnap;
1871 to = ci->i_truncate_size;
1872 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1873 // MDS should have revoked Frw caps
1874 WARN_ON_ONCE(capsnap->writing);
1875 if (capsnap->dirty_pages && capsnap->size > to)
1876 to = capsnap->size;
1877 }
1878 spin_unlock(&ci->i_ceph_lock); 1881 spin_unlock(&ci->i_ceph_lock);
1879 dout("__do_pending_vmtruncate %p flushing snaps first\n", 1882 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1880 inode); 1883 inode);
1881
1882 truncate_pagecache(inode, to);
1883
1884 filemap_write_and_wait_range(&inode->i_data, 0, 1884 filemap_write_and_wait_range(&inode->i_data, 0,
1885 inode->i_sb->s_maxbytes); 1885 inode->i_sb->s_maxbytes);
1886 goto retry; 1886 goto retry;
@@ -2152,6 +2152,10 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
2152 if (err != 0) 2152 if (err != 0)
2153 return err; 2153 return err;
2154 2154
2155 if ((attr->ia_valid & ATTR_SIZE) &&
2156 ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
2157 return -EDQUOT;
2158
2155 err = __ceph_setattr(inode, attr); 2159 err = __ceph_setattr(inode, attr);
2156 2160
2157 if (err >= 0 && (attr->ia_valid & ATTR_MODE)) 2161 if (err >= 0 && (attr->ia_valid & ATTR_MODE))
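
inode.c ties quotas into the inode lifecycle: fill_inode() records max_bytes/max_files from the MDS reply, ceph_destroy_inode() drops the realm quota count, ceph_setattr() rejects size changes that would exceed max_bytes, and the snap realm now caches a pointer to its root inode (set in ceph_add_cap() earlier, cleared here) so statfs can report quota usage for the mounted subtree. Conceptually, the ceph_quota_is_* helpers seen in dir.c and file.c decide which limits govern an inode by walking up to the nearest snap realm with a quota set. A toy version of that idea, with invented structs:

    #include <stdint.h>
    #include <stdio.h>

    struct realm {
        struct realm *parent;
        uint64_t max_bytes;
        uint64_t max_files;
    };

    static struct realm *nearest_quota_realm(struct realm *r)
    {
        for (; r; r = r->parent)
            if (r->max_bytes || r->max_files)
                return r;
        return NULL;    /* no quota anywhere above: unrestricted */
    }

    int main(void)
    {
        struct realm root = { NULL, 0, 0 };
        struct realm project = { &root, 1ULL << 30, 0 };  /* 1 GiB quota */
        struct realm scratch = { &project, 0, 0 };

        struct realm *q = nearest_quota_realm(&scratch);
        if (q)
            printf("governing quota: max_bytes=%llu\n",
                   (unsigned long long)q->max_bytes);
        return 0;
    }

This is also why cross-quota renames return -EXDEV in the dir.c hunk above: moving an inode between realms with different governing quotas cannot be accounted atomically.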
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 851aa69ec8f0..c90f03beb15d 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -5,7 +5,7 @@
5#include "super.h" 5#include "super.h"
6#include "mds_client.h" 6#include "mds_client.h"
7#include "ioctl.h" 7#include "ioctl.h"
8 8#include <linux/ceph/striper.h>
9 9
10/* 10/*
11 * ioctls 11 * ioctls
@@ -185,7 +185,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
185 &ceph_sb_to_client(inode->i_sb)->client->osdc; 185 &ceph_sb_to_client(inode->i_sb)->client->osdc;
186 struct ceph_object_locator oloc; 186 struct ceph_object_locator oloc;
187 CEPH_DEFINE_OID_ONSTACK(oid); 187 CEPH_DEFINE_OID_ONSTACK(oid);
188 u64 len = 1, olen; 188 u32 xlen;
189 u64 tmp; 189 u64 tmp;
190 struct ceph_pg pgid; 190 struct ceph_pg pgid;
191 int r; 191 int r;
@@ -195,13 +195,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
195 return -EFAULT; 195 return -EFAULT;
196 196
197 down_read(&osdc->lock); 197 down_read(&osdc->lock);
198 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, 198 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, 1,
199 &dl.object_no, &dl.object_offset, 199 &dl.object_no, &dl.object_offset, &xlen);
200 &olen);
201 if (r < 0) {
202 up_read(&osdc->lock);
203 return -EIO;
204 }
205 dl.file_offset -= dl.object_offset; 200 dl.file_offset -= dl.object_offset;
206 dl.object_size = ci->i_layout.object_size; 201 dl.object_size = ci->i_layout.object_size;
207 dl.block_size = ci->i_layout.stripe_unit; 202 dl.block_size = ci->i_layout.stripe_unit;
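
The ioctl hunk shows the new ceph_calc_file_object_mapping() contract: it can no longer fail (the -EIO path is gone) and, instead of returning the mapped length of the whole request, it reports through u32 *xlen only how much lands in the first strip; callers that need a full extent iterate, as the writepages code above now does with its xlen variable. A sketch of that caller-side loop, with the striping math cut down to just the strip length (see the rbd note above for the full mapping):

    #include <stdint.h>
    #include <stdio.h>

    /* Length of the first strip at file offset off, for a request of
     * len bytes and stripe unit su: everything up to the next stripe
     * unit boundary, capped by len. */
    static uint32_t first_strip_len(uint64_t off, uint64_t len, uint64_t su)
    {
        uint64_t room = su - off % su;

        return (uint32_t)(len < room ? len : room);
    }

    int main(void)
    {
        uint64_t su = 65536;            /* stripe unit */
        uint64_t off = 100000, len = 300000;

        while (len) {
            uint32_t xlen = first_strip_len(off, len, su);

            printf("strip: off=%llu len=%u\n",
                   (unsigned long long)off, xlen);
            off += xlen;
            len -= xlen;
        }
        return 0;
    }

For the single-byte query in ceph_ioctl_get_dataloc() the loop is unnecessary, which is why the ioctl passes a length of 1 and discards xlen.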
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 9e66f69ee8a5..9dae2ec7e1fa 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -95,7 +95,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
95 owner = secure_addr(fl->fl_owner); 95 owner = secure_addr(fl->fl_owner);
96 96
97 dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " 97 dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
98 "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, 98 "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
99 (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, 99 (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
100 wait, fl->fl_type); 100 wait, fl->fl_type);
101 101
@@ -132,7 +132,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
132 } 132 }
133 ceph_mdsc_put_request(req); 133 ceph_mdsc_put_request(req);
134 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 134 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
135 "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, 135 "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
136 (int)operation, (u64)fl->fl_pid, fl->fl_start, 136 (int)operation, (u64)fl->fl_pid, fl->fl_start,
137 length, wait, fl->fl_type, err); 137 length, wait, fl->fl_type, err);
138 return err; 138 return err;
@@ -226,7 +226,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
226 if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) 226 if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
227 return -ENOLCK; 227 return -ENOLCK;
228 228
229 dout("ceph_lock, fl_owner: %p", fl->fl_owner); 229 dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
230 230
231 /* set wait bit as appropriate, then make command as Ceph expects it*/ 231 /* set wait bit as appropriate, then make command as Ceph expects it*/
232 if (IS_GETLK(cmd)) 232 if (IS_GETLK(cmd))
@@ -264,7 +264,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
264 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); 264 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
265 if (!err) { 265 if (!err) {
266 if (op == CEPH_MDS_OP_SETFILELOCK) { 266 if (op == CEPH_MDS_OP_SETFILELOCK) {
267 dout("mds locked, locking locally"); 267 dout("mds locked, locking locally\n");
268 err = posix_lock_file(file, fl, NULL); 268 err = posix_lock_file(file, fl, NULL);
269 if (err) { 269 if (err) {
270 /* undo! This should only happen if 270 /* undo! This should only happen if
@@ -272,7 +272,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
272 * deadlock. */ 272 * deadlock. */
273 ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, 273 ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
274 CEPH_LOCK_UNLOCK, 0, fl); 274 CEPH_LOCK_UNLOCK, 0, fl);
275 dout("got %d on posix_lock_file, undid lock", 275 dout("got %d on posix_lock_file, undid lock\n",
276 err); 276 err);
277 } 277 }
278 } 278 }
@@ -294,7 +294,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
294 if (fl->fl_type & LOCK_MAND) 294 if (fl->fl_type & LOCK_MAND)
295 return -EOPNOTSUPP; 295 return -EOPNOTSUPP;
296 296
297 dout("ceph_flock, fl_file: %p", fl->fl_file); 297 dout("ceph_flock, fl_file: %p\n", fl->fl_file);
298 298
299 spin_lock(&ci->i_ceph_lock); 299 spin_lock(&ci->i_ceph_lock);
300 if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { 300 if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
@@ -329,7 +329,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
329 ceph_lock_message(CEPH_LOCK_FLOCK, 329 ceph_lock_message(CEPH_LOCK_FLOCK,
330 CEPH_MDS_OP_SETFILELOCK, 330 CEPH_MDS_OP_SETFILELOCK,
331 inode, CEPH_LOCK_UNLOCK, 0, fl); 331 inode, CEPH_LOCK_UNLOCK, 0, fl);
332 dout("got %d on locks_lock_file_wait, undid lock", err); 332 dout("got %d on locks_lock_file_wait, undid lock\n", err);
333 } 333 }
334 } 334 }
335 return err; 335 return err;
@@ -356,7 +356,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
356 ++(*flock_count); 356 ++(*flock_count);
357 spin_unlock(&ctx->flc_lock); 357 spin_unlock(&ctx->flc_lock);
358 } 358 }
359 dout("counted %d flock locks and %d fcntl locks", 359 dout("counted %d flock locks and %d fcntl locks\n",
360 *flock_count, *fcntl_count); 360 *flock_count, *fcntl_count);
361} 361}
362 362
@@ -384,7 +384,7 @@ static int lock_to_ceph_filelock(struct file_lock *lock,
384 cephlock->type = CEPH_LOCK_UNLOCK; 384 cephlock->type = CEPH_LOCK_UNLOCK;
385 break; 385 break;
386 default: 386 default:
387 dout("Have unknown lock type %d", lock->fl_type); 387 dout("Have unknown lock type %d\n", lock->fl_type);
388 err = -EINVAL; 388 err = -EINVAL;
389 } 389 }
390 390
@@ -407,7 +407,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
407 int seen_flock = 0; 407 int seen_flock = 0;
408 int l = 0; 408 int l = 0;
409 409
410 dout("encoding %d flock and %d fcntl locks", num_flock_locks, 410 dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
411 num_fcntl_locks); 411 num_fcntl_locks);
412 412
413 if (!ctx) 413 if (!ctx)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 2e8f90f96540..5ece2e6ad154 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -100,6 +100,26 @@ static int parse_reply_info_in(void **p, void *end,
100 } else 100 } else
101 info->inline_version = CEPH_INLINE_NONE; 101 info->inline_version = CEPH_INLINE_NONE;
102 102
103 if (features & CEPH_FEATURE_MDS_QUOTA) {
104 u8 struct_v, struct_compat;
105 u32 struct_len;
106
107 /*
108 * both struct_v and struct_compat are expected to be >= 1
109 */
110 ceph_decode_8_safe(p, end, struct_v, bad);
111 ceph_decode_8_safe(p, end, struct_compat, bad);
112 if (!struct_v || !struct_compat)
113 goto bad;
114 ceph_decode_32_safe(p, end, struct_len, bad);
115 ceph_decode_need(p, end, struct_len, bad);
116 ceph_decode_64_safe(p, end, info->max_bytes, bad);
117 ceph_decode_64_safe(p, end, info->max_files, bad);
118 } else {
119 info->max_bytes = 0;
120 info->max_files = 0;
121 }
122
103 info->pool_ns_len = 0; 123 info->pool_ns_len = 0;
104 info->pool_ns_data = NULL; 124 info->pool_ns_data = NULL;
105 if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { 125 if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
@@ -384,7 +404,7 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
384 refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref)); 404 refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
385 return s; 405 return s;
386 } else { 406 } else {
387 dout("mdsc get_session %p 0 -- FAIL", s); 407 dout("mdsc get_session %p 0 -- FAIL\n", s);
388 return NULL; 408 return NULL;
389 } 409 }
390} 410}
@@ -419,9 +439,10 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
419 439
420static bool __have_session(struct ceph_mds_client *mdsc, int mds) 440static bool __have_session(struct ceph_mds_client *mdsc, int mds)
421{ 441{
422 if (mds >= mdsc->max_sessions) 442 if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
423 return false; 443 return false;
424 return mdsc->sessions[mds]; 444 else
445 return true;
425} 446}
426 447
427static int __verify_registered_session(struct ceph_mds_client *mdsc, 448static int __verify_registered_session(struct ceph_mds_client *mdsc,
@@ -448,6 +469,25 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
448 s = kzalloc(sizeof(*s), GFP_NOFS); 469 s = kzalloc(sizeof(*s), GFP_NOFS);
449 if (!s) 470 if (!s)
450 return ERR_PTR(-ENOMEM); 471 return ERR_PTR(-ENOMEM);
472
473 if (mds >= mdsc->max_sessions) {
474 int newmax = 1 << get_count_order(mds + 1);
475 struct ceph_mds_session **sa;
476
477 dout("%s: realloc to %d\n", __func__, newmax);
478 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
479 if (!sa)
480 goto fail_realloc;
481 if (mdsc->sessions) {
482 memcpy(sa, mdsc->sessions,
483 mdsc->max_sessions * sizeof(void *));
484 kfree(mdsc->sessions);
485 }
486 mdsc->sessions = sa;
487 mdsc->max_sessions = newmax;
488 }
489
490 dout("%s: mds%d\n", __func__, mds);
451 s->s_mdsc = mdsc; 491 s->s_mdsc = mdsc;
452 s->s_mds = mds; 492 s->s_mds = mds;
453 s->s_state = CEPH_MDS_SESSION_NEW; 493 s->s_state = CEPH_MDS_SESSION_NEW;
@@ -476,23 +516,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
476 INIT_LIST_HEAD(&s->s_cap_releases); 516 INIT_LIST_HEAD(&s->s_cap_releases);
477 INIT_LIST_HEAD(&s->s_cap_flushing); 517 INIT_LIST_HEAD(&s->s_cap_flushing);
478 518
479 dout("register_session mds%d\n", mds);
480 if (mds >= mdsc->max_sessions) {
481 int newmax = 1 << get_count_order(mds+1);
482 struct ceph_mds_session **sa;
483
484 dout("register_session realloc to %d\n", newmax);
485 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
486 if (!sa)
487 goto fail_realloc;
488 if (mdsc->sessions) {
489 memcpy(sa, mdsc->sessions,
490 mdsc->max_sessions * sizeof(void *));
491 kfree(mdsc->sessions);
492 }
493 mdsc->sessions = sa;
494 mdsc->max_sessions = newmax;
495 }
496 mdsc->sessions[mds] = s; 519 mdsc->sessions[mds] = s;
497 atomic_inc(&mdsc->num_sessions); 520 atomic_inc(&mdsc->num_sessions);
498 refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */ 521 refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */
@@ -2531,10 +2554,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2531 * Otherwise we just have to return an ESTALE 2554 * Otherwise we just have to return an ESTALE
2532 */ 2555 */
2533 if (result == -ESTALE) { 2556 if (result == -ESTALE) {
2534 dout("got ESTALE on request %llu", req->r_tid); 2557 dout("got ESTALE on request %llu\n", req->r_tid);
2535 req->r_resend_mds = -1; 2558 req->r_resend_mds = -1;
2536 if (req->r_direct_mode != USE_AUTH_MDS) { 2559 if (req->r_direct_mode != USE_AUTH_MDS) {
2537 dout("not using auth, setting for that now"); 2560 dout("not using auth, setting for that now\n");
2538 req->r_direct_mode = USE_AUTH_MDS; 2561 req->r_direct_mode = USE_AUTH_MDS;
2539 __do_request(mdsc, req); 2562 __do_request(mdsc, req);
2540 mutex_unlock(&mdsc->mutex); 2563 mutex_unlock(&mdsc->mutex);
@@ -2542,13 +2565,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2542 } else { 2565 } else {
2543 int mds = __choose_mds(mdsc, req); 2566 int mds = __choose_mds(mdsc, req);
2544 if (mds >= 0 && mds != req->r_session->s_mds) { 2567 if (mds >= 0 && mds != req->r_session->s_mds) {
2545 dout("but auth changed, so resending"); 2568 dout("but auth changed, so resending\n");
2546 __do_request(mdsc, req); 2569 __do_request(mdsc, req);
2547 mutex_unlock(&mdsc->mutex); 2570 mutex_unlock(&mdsc->mutex);
2548 goto out; 2571 goto out;
2549 } 2572 }
2550 } 2573 }
2551 dout("have to return ESTALE on request %llu", req->r_tid); 2574 dout("have to return ESTALE on request %llu\n", req->r_tid);
2552 } 2575 }
2553 2576
2554 2577
@@ -3470,13 +3493,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
3470} 3493}
3471 3494
3472/* 3495/*
3473 * drop all leases (and dentry refs) in preparation for umount 3496 * lock/unlock all sessions, to wait for ongoing session activity to finish
3474 */ 3497 */
3475static void drop_leases(struct ceph_mds_client *mdsc) 3498static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
3476{ 3499{
3477 int i; 3500 int i;
3478 3501
3479 dout("drop_leases\n");
3480 mutex_lock(&mdsc->mutex); 3502 mutex_lock(&mdsc->mutex);
3481 for (i = 0; i < mdsc->max_sessions; i++) { 3503 for (i = 0; i < mdsc->max_sessions; i++) {
3482 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); 3504 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
@@ -3572,7 +3594,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
3572 if (!mdsc) 3594 if (!mdsc)
3573 return -ENOMEM; 3595 return -ENOMEM;
3574 mdsc->fsc = fsc; 3596 mdsc->fsc = fsc;
3575 fsc->mdsc = mdsc;
3576 mutex_init(&mdsc->mutex); 3597 mutex_init(&mdsc->mutex);
3577 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 3598 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
3578 if (!mdsc->mdsmap) { 3599 if (!mdsc->mdsmap) {
@@ -3580,6 +3601,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
3580 return -ENOMEM; 3601 return -ENOMEM;
3581 } 3602 }
3582 3603
3604 fsc->mdsc = mdsc;
3583 init_completion(&mdsc->safe_umount_waiters); 3605 init_completion(&mdsc->safe_umount_waiters);
3584 init_waitqueue_head(&mdsc->session_close_wq); 3606 init_waitqueue_head(&mdsc->session_close_wq);
3585 INIT_LIST_HEAD(&mdsc->waiting_for_map); 3607 INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -3587,6 +3609,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
3587 atomic_set(&mdsc->num_sessions, 0); 3609 atomic_set(&mdsc->num_sessions, 0);
3588 mdsc->max_sessions = 0; 3610 mdsc->max_sessions = 0;
3589 mdsc->stopping = 0; 3611 mdsc->stopping = 0;
3612 atomic64_set(&mdsc->quotarealms_count, 0);
3590 mdsc->last_snap_seq = 0; 3613 mdsc->last_snap_seq = 0;
3591 init_rwsem(&mdsc->snap_rwsem); 3614 init_rwsem(&mdsc->snap_rwsem);
3592 mdsc->snap_realms = RB_ROOT; 3615 mdsc->snap_realms = RB_ROOT;
@@ -3660,7 +3683,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
3660 dout("pre_umount\n"); 3683 dout("pre_umount\n");
3661 mdsc->stopping = 1; 3684 mdsc->stopping = 1;
3662 3685
3663 drop_leases(mdsc); 3686 lock_unlock_sessions(mdsc);
3664 ceph_flush_dirty_caps(mdsc); 3687 ceph_flush_dirty_caps(mdsc);
3665 wait_requests(mdsc); 3688 wait_requests(mdsc);
3666 3689
@@ -3858,6 +3881,9 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3858 struct ceph_mds_client *mdsc = fsc->mdsc; 3881 struct ceph_mds_client *mdsc = fsc->mdsc;
3859 dout("mdsc_destroy %p\n", mdsc); 3882 dout("mdsc_destroy %p\n", mdsc);
3860 3883
3884 if (!mdsc)
3885 return;
3886
3861 /* flush out any connection work with references to us */ 3887 /* flush out any connection work with references to us */
3862 ceph_msgr_flush(); 3888 ceph_msgr_flush();
3863 3889
@@ -4077,6 +4103,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
4077 case CEPH_MSG_CLIENT_LEASE: 4103 case CEPH_MSG_CLIENT_LEASE:
4078 handle_lease(mdsc, s, msg); 4104 handle_lease(mdsc, s, msg);
4079 break; 4105 break;
4106 case CEPH_MSG_CLIENT_QUOTA:
4107 ceph_handle_quota(mdsc, s, msg);
4108 break;
4080 4109
4081 default: 4110 default:
4082 pr_err("received unknown message type %d %s\n", type, 4111 pr_err("received unknown message type %d %s\n", type,
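
The quota hunk in parse_reply_info_in() follows ceph's versioned-encoding convention: a version byte and a compat byte (both must be >= 1), a 32-bit payload length, then the payload, with every read bounds-checked against 'end' by the _safe decode helpers. A self-contained userspace sketch of the same pattern, assuming a little-endian host (ceph's wire order):

#include <stdint.h>
#include <string.h>

static int decode_quota_sketch(const uint8_t *p, const uint8_t *end,
			       uint64_t *max_bytes, uint64_t *max_files)
{
	uint8_t struct_v, struct_compat;
	uint32_t struct_len;

	if (end - p < 2 + 4)
		return -1;			/* truncated header */
	struct_v = *p++;
	struct_compat = *p++;
	if (!struct_v || !struct_compat)
		return -1;			/* both must be >= 1 */
	memcpy(&struct_len, p, 4);
	p += 4;
	if ((uint64_t)(end - p) < struct_len || struct_len < 16)
		return -1;			/* truncated payload */
	memcpy(max_bytes, p, 8);
	memcpy(max_files, p + 8, 8);
	return 0;
}
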
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 71e3b783ee6f..2ec3b5b35067 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -49,6 +49,8 @@ struct ceph_mds_reply_info_in {
49 char *inline_data; 49 char *inline_data;
50 u32 pool_ns_len; 50 u32 pool_ns_len;
51 char *pool_ns_data; 51 char *pool_ns_data;
52 u64 max_bytes;
53 u64 max_files;
52}; 54};
53 55
54struct ceph_mds_reply_dir_entry { 56struct ceph_mds_reply_dir_entry {
@@ -312,6 +314,8 @@ struct ceph_mds_client {
312 int max_sessions; /* len of s_mds_sessions */ 314 int max_sessions; /* len of s_mds_sessions */
313 int stopping; /* true if shutting down */ 315 int stopping; /* true if shutting down */
314 316
317 atomic64_t quotarealms_count; /* # realms with quota */
318
315 /* 319 /*
316 * snap_rwsem will cover cap linkage into snaprealms, and 320 * snap_rwsem will cover cap linkage into snaprealms, and
317 * realm snap contexts. (later, we can do per-realm snap 321 * realm snap contexts. (later, we can do per-realm snap
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
new file mode 100644
index 000000000000..242bfa5c0539
--- /dev/null
+++ b/fs/ceph/quota.c
@@ -0,0 +1,361 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * quota.c - CephFS quota
4 *
5 * Copyright (C) 2017-2018 SUSE
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21#include <linux/statfs.h>
22
23#include "super.h"
24#include "mds_client.h"
25
26void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
27{
28 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
29 if (inc)
30 atomic64_inc(&mdsc->quotarealms_count);
31 else
32 atomic64_dec(&mdsc->quotarealms_count);
33}
34
35static inline bool ceph_has_realms_with_quotas(struct inode *inode)
36{
37 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
38 return atomic64_read(&mdsc->quotarealms_count) > 0;
39}
40
41void ceph_handle_quota(struct ceph_mds_client *mdsc,
42 struct ceph_mds_session *session,
43 struct ceph_msg *msg)
44{
45 struct super_block *sb = mdsc->fsc->sb;
46 struct ceph_mds_quota *h = msg->front.iov_base;
47 struct ceph_vino vino;
48 struct inode *inode;
49 struct ceph_inode_info *ci;
50
51 if (msg->front.iov_len != sizeof(*h)) {
52 pr_err("%s corrupt message mds%d len %d\n", __func__,
53 session->s_mds, (int)msg->front.iov_len);
54 ceph_msg_dump(msg);
55 return;
56 }
57
58 /* increment msg sequence number */
59 mutex_lock(&session->s_mutex);
60 session->s_seq++;
61 mutex_unlock(&session->s_mutex);
62
63 /* lookup inode */
64 vino.ino = le64_to_cpu(h->ino);
65 vino.snap = CEPH_NOSNAP;
66 inode = ceph_find_inode(sb, vino);
67 if (!inode) {
68 pr_warn("Failed to find inode %llu\n", vino.ino);
69 return;
70 }
71 ci = ceph_inode(inode);
72
73 spin_lock(&ci->i_ceph_lock);
74 ci->i_rbytes = le64_to_cpu(h->rbytes);
75 ci->i_rfiles = le64_to_cpu(h->rfiles);
76 ci->i_rsubdirs = le64_to_cpu(h->rsubdirs);
77 __ceph_update_quota(ci, le64_to_cpu(h->max_bytes),
78 le64_to_cpu(h->max_files));
79 spin_unlock(&ci->i_ceph_lock);
80
81 iput(inode);
82}
83
84/*
85 * This function walks through the snaprealm for an inode and returns the
86 * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
87 * or max_bytes). If the root is reached, return the root ceph_snap_realm
88 * instead.
89 *
90 * Note that the caller is responsible for calling ceph_put_snap_realm() on the
91 * returned realm.
92 */
93static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
94 struct inode *inode)
95{
96 struct ceph_inode_info *ci = NULL;
97 struct ceph_snap_realm *realm, *next;
98 struct inode *in;
99 bool has_quota;
100
101 if (ceph_snap(inode) != CEPH_NOSNAP)
102 return NULL;
103
104 realm = ceph_inode(inode)->i_snap_realm;
105 if (realm)
106 ceph_get_snap_realm(mdsc, realm);
107 else
108 pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
109 "null i_snap_realm\n", ceph_vinop(inode));
110 while (realm) {
111 spin_lock(&realm->inodes_with_caps_lock);
112 in = realm->inode ? igrab(realm->inode) : NULL;
113 spin_unlock(&realm->inodes_with_caps_lock);
114 if (!in)
115 break;
116
117 ci = ceph_inode(in);
118 has_quota = __ceph_has_any_quota(ci);
119 iput(in);
120
121 next = realm->parent;
122 if (has_quota || !next)
123 return realm;
124
125 ceph_get_snap_realm(mdsc, next);
126 ceph_put_snap_realm(mdsc, realm);
127 realm = next;
128 }
129 if (realm)
130 ceph_put_snap_realm(mdsc, realm);
131
132 return NULL;
133}
134
135bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
136{
137 struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc;
138 struct ceph_snap_realm *old_realm, *new_realm;
139 bool is_same;
140
141 down_read(&mdsc->snap_rwsem);
142 old_realm = get_quota_realm(mdsc, old);
143 new_realm = get_quota_realm(mdsc, new);
144 is_same = (old_realm == new_realm);
145 up_read(&mdsc->snap_rwsem);
146
147 if (old_realm)
148 ceph_put_snap_realm(mdsc, old_realm);
149 if (new_realm)
150 ceph_put_snap_realm(mdsc, new_realm);
151
152 return is_same;
153}
154
155enum quota_check_op {
156 QUOTA_CHECK_MAX_FILES_OP, /* check quota max_files limit */
157 QUOTA_CHECK_MAX_BYTES_OP, /* check quota max_bytes limit */
158 QUOTA_CHECK_MAX_BYTES_APPROACHING_OP /* check if quota max_bytes
159 limit is approaching */
160};
161
162/*
163 * check_quota_exceeded() will walk up the snaprealm hierarchy and, for each
164 * realm, it will execute the quota check operation defined by the 'op' parameter.
165 * The snaprealm walk is interrupted if the quota check detects that the quota
166 * is exceeded or if the root inode is reached.
167 */
168static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
169 loff_t delta)
170{
171 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
172 struct ceph_inode_info *ci;
173 struct ceph_snap_realm *realm, *next;
174 struct inode *in;
175 u64 max, rvalue;
176 bool exceeded = false;
177
178 if (ceph_snap(inode) != CEPH_NOSNAP)
179 return false;
180
181 down_read(&mdsc->snap_rwsem);
182 realm = ceph_inode(inode)->i_snap_realm;
183 if (realm)
184 ceph_get_snap_realm(mdsc, realm);
185 else
186 pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
187 "null i_snap_realm\n", ceph_vinop(inode));
188 while (realm) {
189 spin_lock(&realm->inodes_with_caps_lock);
190 in = realm->inode ? igrab(realm->inode) : NULL;
191 spin_unlock(&realm->inodes_with_caps_lock);
192 if (!in)
193 break;
194
195 ci = ceph_inode(in);
196 spin_lock(&ci->i_ceph_lock);
197 if (op == QUOTA_CHECK_MAX_FILES_OP) {
198 max = ci->i_max_files;
199 rvalue = ci->i_rfiles + ci->i_rsubdirs;
200 } else {
201 max = ci->i_max_bytes;
202 rvalue = ci->i_rbytes;
203 }
204 spin_unlock(&ci->i_ceph_lock);
205 switch (op) {
206 case QUOTA_CHECK_MAX_FILES_OP:
207 exceeded = (max && (rvalue >= max));
208 break;
209 case QUOTA_CHECK_MAX_BYTES_OP:
210 exceeded = (max && (rvalue + delta > max));
211 break;
212 case QUOTA_CHECK_MAX_BYTES_APPROACHING_OP:
213 if (max) {
214 if (rvalue >= max)
215 exceeded = true;
216 else {
217 /*
218 * when we're writing more than 1/16th
219 * of the available space
220 */
221 exceeded =
222 (((max - rvalue) >> 4) < delta);
223 }
224 }
225 break;
226 default:
227 /* Shouldn't happen */
228 pr_warn("Invalid quota check op (%d)\n", op);
229 exceeded = true; /* Just break the loop */
230 }
231 iput(in);
232
233 next = realm->parent;
234 if (exceeded || !next)
235 break;
236 ceph_get_snap_realm(mdsc, next);
237 ceph_put_snap_realm(mdsc, realm);
238 realm = next;
239 }
240 ceph_put_snap_realm(mdsc, realm);
241 up_read(&mdsc->snap_rwsem);
242
243 return exceeded;
244}
245
246/*
247 * ceph_quota_is_max_files_exceeded - check if we can create a new file
248 * @inode: directory where a new file is being created
249 *
250 * This function returns true if creating a new file would exceed the
251 * max_files quota. It is necessary to walk through the snaprealm
252 * hierarchy (until the FS root) to check all realms with quotas set.
253 */
254bool ceph_quota_is_max_files_exceeded(struct inode *inode)
255{
256 if (!ceph_has_realms_with_quotas(inode))
257 return false;
258
259 WARN_ON(!S_ISDIR(inode->i_mode));
260
261 return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 0);
262}
263
264/*
265 * ceph_quota_is_max_bytes_exceeded - check if we can write to a file
266 * @inode: inode being written
267 * @newsize: new size if write succeeds
268 *
269 * This function returns true if the max_bytes quota does not allow the file
270 * size to reach @newsize; it returns false otherwise.
271 */
272bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newsize)
273{
274 loff_t size = i_size_read(inode);
275
276 if (!ceph_has_realms_with_quotas(inode))
277 return false;
278
279 /* return immediately if we're decreasing file size */
280 if (newsize <= size)
281 return false;
282
283 return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_OP, (newsize - size));
284}
285
286/*
287 * ceph_quota_is_max_bytes_approaching - check if we're reaching max_bytes
288 * @inode: inode being written
289 * @newsize: new size if write succeeds
290 *
291 * This function returns true if growing the file to @newsize would consume
292 * more than 1/16th of the space still available under the quota.
293 */
294bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newsize)
295{
296 loff_t size = ceph_inode(inode)->i_reported_size;
297
298 if (!ceph_has_realms_with_quotas(inode))
299 return false;
300
301 /* return immediately if we're decreasing file size */
302 if (newsize <= size)
303 return false;
304
305 return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_APPROACHING_OP,
306 (newsize - size));
307}
308
309/*
310 * ceph_quota_update_statfs - if root has quota, update statfs with quota status
311 * @fsc: filesystem client instance
312 * @buf: statfs to update
313 *
314 * If the mounted filesystem root has max_bytes quota set, update the filesystem
315 * statistics with the quota status.
316 *
317 * This function returns true if the stats have been updated, false otherwise.
318 */
319bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
320{
321 struct ceph_mds_client *mdsc = fsc->mdsc;
322 struct ceph_inode_info *ci;
323 struct ceph_snap_realm *realm;
324 struct inode *in;
325 u64 total = 0, used, free;
326 bool is_updated = false;
327
328 down_read(&mdsc->snap_rwsem);
329 realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root));
330 up_read(&mdsc->snap_rwsem);
331 if (!realm)
332 return false;
333
334 spin_lock(&realm->inodes_with_caps_lock);
335 in = realm->inode ? igrab(realm->inode) : NULL;
336 spin_unlock(&realm->inodes_with_caps_lock);
337 if (in) {
338 ci = ceph_inode(in);
339 spin_lock(&ci->i_ceph_lock);
340 if (ci->i_max_bytes) {
341 total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT;
342 used = ci->i_rbytes >> CEPH_BLOCK_SHIFT;
343 /* It is possible for a quota to be exceeded.
344 * Report 'zero' in that case
345 */
346 free = total > used ? total - used : 0;
347 }
348 spin_unlock(&ci->i_ceph_lock);
349 if (total) {
350 buf->f_blocks = total;
351 buf->f_bfree = free;
352 buf->f_bavail = free;
353 is_updated = true;
354 }
355 iput(in);
356 }
357 ceph_put_snap_realm(mdsc, realm);
358
359 return is_updated;
360}
361
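
The QUOTA_CHECK_MAX_BYTES_APPROACHING_OP case is what lets the client warn the MDS before the limit is actually hit: it fires once a single write would consume more than 1/16th of the space still free under the quota. A standalone sketch of that arithmetic, with illustrative numbers:

#include <stdbool.h>
#include <stdint.h>

/* Mirrors the max_bytes-approaching test above; values in bytes. */
static bool quota_approaching(uint64_t max, uint64_t rvalue, uint64_t delta)
{
	if (!max)
		return false;		/* no quota set */
	if (rvalue >= max)
		return true;		/* already at or over the limit */
	/* true when delta exceeds 1/16th of the remaining space */
	return ((max - rvalue) >> 4) < delta;
}

For example, with max = 160 MiB and rvalue = 100 MiB, 60 MiB remain, so any write larger than 60 MiB / 16 = 3.75 MiB makes ceph_quota_is_max_bytes_approaching() return true and an update is sent to the MDS.
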
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 07cf95e6413d..041c27ea8de1 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -931,6 +931,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
931 list_add(&ci->i_snap_realm_item, 931 list_add(&ci->i_snap_realm_item,
932 &realm->inodes_with_caps); 932 &realm->inodes_with_caps);
933 ci->i_snap_realm = realm; 933 ci->i_snap_realm = realm;
934 if (realm->ino == ci->i_vino.ino)
935 realm->inode = inode;
934 spin_unlock(&realm->inodes_with_caps_lock); 936 spin_unlock(&realm->inodes_with_caps_lock);
935 937
936 spin_unlock(&ci->i_ceph_lock); 938 spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fb2bc9c15a23..b33082e6878f 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -76,9 +76,18 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
76 */ 76 */
77 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; 77 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
78 buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; 78 buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
79 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); 79
80 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 80 /*
81 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 81 * By default use root quota for stats; fall back to overall filesystem
82 * usage if using 'noquotadf' mount option or if the root dir doesn't
83 * have max_bytes quota set.
84 */
85 if (ceph_test_mount_opt(fsc, NOQUOTADF) ||
86 !ceph_quota_update_statfs(fsc, buf)) {
87 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
88 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
89 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
90 }
82 91
83 buf->f_files = le64_to_cpu(st.num_objects); 92 buf->f_files = le64_to_cpu(st.num_objects);
84 buf->f_ffree = -1; 93 buf->f_ffree = -1;
@@ -151,6 +160,8 @@ enum {
151 Opt_acl, 160 Opt_acl,
152#endif 161#endif
153 Opt_noacl, 162 Opt_noacl,
163 Opt_quotadf,
164 Opt_noquotadf,
154}; 165};
155 166
156static match_table_t fsopt_tokens = { 167static match_table_t fsopt_tokens = {
@@ -187,6 +198,8 @@ static match_table_t fsopt_tokens = {
187 {Opt_acl, "acl"}, 198 {Opt_acl, "acl"},
188#endif 199#endif
189 {Opt_noacl, "noacl"}, 200 {Opt_noacl, "noacl"},
201 {Opt_quotadf, "quotadf"},
202 {Opt_noquotadf, "noquotadf"},
190 {-1, NULL} 203 {-1, NULL}
191}; 204};
192 205
@@ -314,13 +327,16 @@ static int parse_fsopt_token(char *c, void *private)
314 break; 327 break;
315 case Opt_fscache: 328 case Opt_fscache:
316 fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; 329 fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
330 kfree(fsopt->fscache_uniq);
331 fsopt->fscache_uniq = NULL;
317 break; 332 break;
318 case Opt_nofscache: 333 case Opt_nofscache:
319 fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; 334 fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
335 kfree(fsopt->fscache_uniq);
336 fsopt->fscache_uniq = NULL;
320 break; 337 break;
321 case Opt_poolperm: 338 case Opt_poolperm:
322 fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; 339 fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
323 printk ("pool perm");
324 break; 340 break;
325 case Opt_nopoolperm: 341 case Opt_nopoolperm:
326 fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; 342 fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
@@ -331,6 +347,12 @@ static int parse_fsopt_token(char *c, void *private)
331 case Opt_norequire_active_mds: 347 case Opt_norequire_active_mds:
332 fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; 348 fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT;
333 break; 349 break;
350 case Opt_quotadf:
351 fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF;
352 break;
353 case Opt_noquotadf:
354 fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF;
355 break;
334#ifdef CONFIG_CEPH_FS_POSIX_ACL 356#ifdef CONFIG_CEPH_FS_POSIX_ACL
335 case Opt_acl: 357 case Opt_acl:
336 fsopt->sb_flags |= SB_POSIXACL; 358 fsopt->sb_flags |= SB_POSIXACL;
@@ -513,13 +535,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
513 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) 535 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
514 seq_puts(m, ",nodcache"); 536 seq_puts(m, ",nodcache");
515 if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) { 537 if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) {
516 if (fsopt->fscache_uniq) 538 seq_show_option(m, "fsc", fsopt->fscache_uniq);
517 seq_printf(m, ",fsc=%s", fsopt->fscache_uniq);
518 else
519 seq_puts(m, ",fsc");
520 } 539 }
521 if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) 540 if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
522 seq_puts(m, ",nopoolperm"); 541 seq_puts(m, ",nopoolperm");
542 if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF)
543 seq_puts(m, ",noquotadf");
523 544
524#ifdef CONFIG_CEPH_FS_POSIX_ACL 545#ifdef CONFIG_CEPH_FS_POSIX_ACL
525 if (fsopt->sb_flags & SB_POSIXACL) 546 if (fsopt->sb_flags & SB_POSIXACL)
@@ -529,7 +550,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
529#endif 550#endif
530 551
531 if (fsopt->mds_namespace) 552 if (fsopt->mds_namespace)
532 seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace); 553 seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
533 if (fsopt->wsize) 554 if (fsopt->wsize)
534 seq_printf(m, ",wsize=%d", fsopt->wsize); 555 seq_printf(m, ",wsize=%d", fsopt->wsize);
535 if (fsopt->rsize != CEPH_MAX_READ_SIZE) 556 if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -679,6 +700,7 @@ struct kmem_cache *ceph_cap_cachep;
679struct kmem_cache *ceph_cap_flush_cachep; 700struct kmem_cache *ceph_cap_flush_cachep;
680struct kmem_cache *ceph_dentry_cachep; 701struct kmem_cache *ceph_dentry_cachep;
681struct kmem_cache *ceph_file_cachep; 702struct kmem_cache *ceph_file_cachep;
703struct kmem_cache *ceph_dir_file_cachep;
682 704
683static void ceph_inode_init_once(void *foo) 705static void ceph_inode_init_once(void *foo)
684{ 706{
@@ -698,8 +720,7 @@ static int __init init_caches(void)
698 if (!ceph_inode_cachep) 720 if (!ceph_inode_cachep)
699 return -ENOMEM; 721 return -ENOMEM;
700 722
701 ceph_cap_cachep = KMEM_CACHE(ceph_cap, 723 ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
702 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
703 if (!ceph_cap_cachep) 724 if (!ceph_cap_cachep)
704 goto bad_cap; 725 goto bad_cap;
705 ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, 726 ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
@@ -716,6 +737,10 @@ static int __init init_caches(void)
716 if (!ceph_file_cachep) 737 if (!ceph_file_cachep)
717 goto bad_file; 738 goto bad_file;
718 739
740 ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD);
741 if (!ceph_dir_file_cachep)
742 goto bad_dir_file;
743
719 error = ceph_fscache_register(); 744 error = ceph_fscache_register();
720 if (error) 745 if (error)
721 goto bad_fscache; 746 goto bad_fscache;
@@ -723,6 +748,8 @@ static int __init init_caches(void)
723 return 0; 748 return 0;
724 749
725bad_fscache: 750bad_fscache:
751 kmem_cache_destroy(ceph_dir_file_cachep);
752bad_dir_file:
726 kmem_cache_destroy(ceph_file_cachep); 753 kmem_cache_destroy(ceph_file_cachep);
727bad_file: 754bad_file:
728 kmem_cache_destroy(ceph_dentry_cachep); 755 kmem_cache_destroy(ceph_dentry_cachep);
@@ -748,6 +775,7 @@ static void destroy_caches(void)
748 kmem_cache_destroy(ceph_cap_flush_cachep); 775 kmem_cache_destroy(ceph_cap_flush_cachep);
749 kmem_cache_destroy(ceph_dentry_cachep); 776 kmem_cache_destroy(ceph_dentry_cachep);
750 kmem_cache_destroy(ceph_file_cachep); 777 kmem_cache_destroy(ceph_file_cachep);
778 kmem_cache_destroy(ceph_dir_file_cachep);
751 779
752 ceph_fscache_unregister(); 780 ceph_fscache_unregister();
753} 781}
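
statfs reports space in CEPH_BLOCK-sized units (4 MiB, i.e. CEPH_BLOCK_SHIFT == 22 in fs/ceph/super.h), while the monitor reports kilobytes, hence the shift by CEPH_BLOCK_SHIFT - 10 in the fallback path above. A small sketch of the conversion:

#include <stdint.h>

#define CEPH_BLOCK_SHIFT 22	/* 4 MiB statfs blocks, per fs/ceph/super.h */

/* Convert the monitor's KiB counters to 4 MiB statfs blocks. */
static uint64_t kb_to_statfs_blocks(uint64_t kb)
{
	return kb >> (CEPH_BLOCK_SHIFT - 10);
}

/* e.g. 1 GiB = 1048576 KiB -> 256 blocks of 4 MiB */

The quota path (ceph_quota_update_statfs()) already works in bytes and shifts by the full CEPH_BLOCK_SHIFT instead.
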
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1c2086e0fec2..a7077a0c989f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -39,6 +39,7 @@
39#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ 39#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
40#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ 40#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
41#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ 41#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
42#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
42 43
43#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE 44#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
44 45
@@ -310,6 +311,9 @@ struct ceph_inode_info {
310 u64 i_rbytes, i_rfiles, i_rsubdirs; 311 u64 i_rbytes, i_rfiles, i_rsubdirs;
311 u64 i_files, i_subdirs; 312 u64 i_files, i_subdirs;
312 313
314 /* quotas */
315 u64 i_max_bytes, i_max_files;
316
313 struct rb_root i_fragtree; 317 struct rb_root i_fragtree;
314 int i_fragtree_nsplits; 318 int i_fragtree_nsplits;
315 struct mutex i_fragtree_mutex; 319 struct mutex i_fragtree_mutex;
@@ -671,6 +675,10 @@ struct ceph_file_info {
671 675
672 spinlock_t rw_contexts_lock; 676 spinlock_t rw_contexts_lock;
673 struct list_head rw_contexts; 677 struct list_head rw_contexts;
678};
679
680struct ceph_dir_file_info {
681 struct ceph_file_info file_info;
674 682
675 /* readdir: position within the dir */ 683 /* readdir: position within the dir */
676 u32 frag; 684 u32 frag;
@@ -748,6 +756,7 @@ struct ceph_readdir_cache_control {
748 */ 756 */
749struct ceph_snap_realm { 757struct ceph_snap_realm {
750 u64 ino; 758 u64 ino;
759 struct inode *inode;
751 atomic_t nref; 760 atomic_t nref;
752 struct rb_node node; 761 struct rb_node node;
753 762
@@ -1066,4 +1075,37 @@ extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
1066extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); 1075extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
1067extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); 1076extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
1068 1077
1078/* quota.c */
1079static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci)
1080{
1081 return ci->i_max_files || ci->i_max_bytes;
1082}
1083
1084extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc);
1085
1086static inline void __ceph_update_quota(struct ceph_inode_info *ci,
1087 u64 max_bytes, u64 max_files)
1088{
1089 bool had_quota, has_quota;
1090 had_quota = __ceph_has_any_quota(ci);
1091 ci->i_max_bytes = max_bytes;
1092 ci->i_max_files = max_files;
1093 has_quota = __ceph_has_any_quota(ci);
1094
1095 if (had_quota != has_quota)
1096 ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota);
1097}
1098
1099extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
1100 struct ceph_mds_session *session,
1101 struct ceph_msg *msg);
1102extern bool ceph_quota_is_max_files_exceeded(struct inode *inode);
1103extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new);
1104extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode,
1105 loff_t newlen);
1106extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode,
1107 loff_t newlen);
1108extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
1109 struct kstatfs *buf);
1110
1069#endif /* _FS_CEPH_SUPER_H */ 1111#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index e1c4e0b12b4c..7e72348639e4 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -224,6 +224,31 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
224 (long)ci->i_rctime.tv_nsec); 224 (long)ci->i_rctime.tv_nsec);
225} 225}
226 226
227/* quotas */
228
229static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
230{
231 return (ci->i_max_files || ci->i_max_bytes);
232}
233
234static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val,
235 size_t size)
236{
237 return snprintf(val, size, "max_bytes=%llu max_files=%llu",
238 ci->i_max_bytes, ci->i_max_files);
239}
240
241static size_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci,
242 char *val, size_t size)
243{
244 return snprintf(val, size, "%llu", ci->i_max_bytes);
245}
246
247static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
248 char *val, size_t size)
249{
250 return snprintf(val, size, "%llu", ci->i_max_files);
251}
227 252
228#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name 253#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
229#define CEPH_XATTR_NAME2(_type, _name, _name2) \ 254#define CEPH_XATTR_NAME2(_type, _name, _name2) \
@@ -247,6 +272,15 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
247 .hidden = true, \ 272 .hidden = true, \
248 .exists_cb = ceph_vxattrcb_layout_exists, \ 273 .exists_cb = ceph_vxattrcb_layout_exists, \
249 } 274 }
275#define XATTR_QUOTA_FIELD(_type, _name) \
276 { \
277 .name = CEPH_XATTR_NAME(_type, _name), \
278 .name_size = sizeof(CEPH_XATTR_NAME(_type, _name)), \
279 .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
280 .readonly = false, \
281 .hidden = true, \
282 .exists_cb = ceph_vxattrcb_quota_exists, \
283 }
250 284
251static struct ceph_vxattr ceph_dir_vxattrs[] = { 285static struct ceph_vxattr ceph_dir_vxattrs[] = {
252 { 286 {
@@ -270,6 +304,16 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
270 XATTR_NAME_CEPH(dir, rsubdirs), 304 XATTR_NAME_CEPH(dir, rsubdirs),
271 XATTR_NAME_CEPH(dir, rbytes), 305 XATTR_NAME_CEPH(dir, rbytes),
272 XATTR_NAME_CEPH(dir, rctime), 306 XATTR_NAME_CEPH(dir, rctime),
307 {
308 .name = "ceph.quota",
309 .name_size = sizeof("ceph.quota"),
310 .getxattr_cb = ceph_vxattrcb_quota,
311 .readonly = false,
312 .hidden = true,
313 .exists_cb = ceph_vxattrcb_quota_exists,
314 },
315 XATTR_QUOTA_FIELD(quota, max_bytes),
316 XATTR_QUOTA_FIELD(quota, max_files),
273 { .name = NULL, 0 } /* Required table terminator */ 317 { .name = NULL, 0 } /* Required table terminator */
274}; 318};
275static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ 319static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
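
These vxattrs are marked hidden, so they do not appear in listxattr() output but can still be read and set through the normal xattr syscalls (setting a quota goes through the MDS, per the quota commits in this pull). A minimal userspace sketch, with a hypothetical mount point:

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[64];
	ssize_t n = getxattr("/mnt/cephfs/dir",		/* hypothetical path */
			     "ceph.quota.max_bytes", buf, sizeof(buf) - 1);

	if (n < 0) {
		perror("getxattr");
		return 1;
	}
	buf[n] = '\0';
	printf("max_bytes=%s\n", buf);
	return 0;
}
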
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 59042d5ac520..3901927cf6a0 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -204,6 +204,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
204 CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ 204 CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
205 CEPH_FEATURE_MSGR_KEEPALIVE2 | \ 205 CEPH_FEATURE_MSGR_KEEPALIVE2 | \
206 CEPH_FEATURE_OSD_POOLRESEND | \ 206 CEPH_FEATURE_OSD_POOLRESEND | \
207 CEPH_FEATURE_MDS_QUOTA | \
207 CEPH_FEATURE_CRUSH_V4 | \ 208 CEPH_FEATURE_CRUSH_V4 | \
208 CEPH_FEATURE_NEW_OSDOP_ENCODING | \ 209 CEPH_FEATURE_NEW_OSDOP_ENCODING | \
209 CEPH_FEATURE_SERVER_JEWEL | \ 210 CEPH_FEATURE_SERVER_JEWEL | \
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 88dd51381aaf..7ecfc88314d8 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -134,6 +134,7 @@ struct ceph_dir_layout {
134#define CEPH_MSG_CLIENT_LEASE 0x311 134#define CEPH_MSG_CLIENT_LEASE 0x311
135#define CEPH_MSG_CLIENT_SNAP 0x312 135#define CEPH_MSG_CLIENT_SNAP 0x312
136#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 136#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
137#define CEPH_MSG_CLIENT_QUOTA 0x314
137 138
138/* pool ops */ 139/* pool ops */
139#define CEPH_MSG_POOLOP_REPLY 48 140#define CEPH_MSG_POOLOP_REPLY 48
@@ -807,4 +808,20 @@ struct ceph_mds_snap_realm {
807} __attribute__ ((packed)); 808} __attribute__ ((packed));
808/* followed by my snap list, then prior parent snap list */ 809/* followed by my snap list, then prior parent snap list */
809 810
811/*
812 * quotas
813 */
814struct ceph_mds_quota {
815 __le64 ino; /* ino */
816 struct ceph_timespec rctime;
817 __le64 rbytes; /* dir stats */
818 __le64 rfiles;
819 __le64 rsubdirs;
820 __u8 struct_v; /* compat */
821 __u8 struct_compat;
822 __le32 struct_len;
823 __le64 max_bytes; /* quota max. bytes */
824 __le64 max_files; /* quota max. files */
825} __attribute__ ((packed));
826
810#endif 827#endif
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index c2ec44cf5098..49c93b9308d7 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -262,6 +262,7 @@ extern struct kmem_cache *ceph_cap_cachep;
262extern struct kmem_cache *ceph_cap_flush_cachep; 262extern struct kmem_cache *ceph_cap_flush_cachep;
263extern struct kmem_cache *ceph_dentry_cachep; 263extern struct kmem_cache *ceph_dentry_cachep;
264extern struct kmem_cache *ceph_file_cachep; 264extern struct kmem_cache *ceph_file_cachep;
265extern struct kmem_cache *ceph_dir_file_cachep;
265 266
266/* ceph_common.c */ 267/* ceph_common.c */
267extern bool libceph_compatible(void *data); 268extern bool libceph_compatible(void *data);
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index ead9d85f1c11..c7dfcb8a1fb2 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -76,6 +76,7 @@ enum ceph_msg_data_type {
76#ifdef CONFIG_BLOCK 76#ifdef CONFIG_BLOCK
77 CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */ 77 CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */
78#endif /* CONFIG_BLOCK */ 78#endif /* CONFIG_BLOCK */
79 CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */
79}; 80};
80 81
81static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) 82static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
@@ -87,22 +88,106 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
87#ifdef CONFIG_BLOCK 88#ifdef CONFIG_BLOCK
88 case CEPH_MSG_DATA_BIO: 89 case CEPH_MSG_DATA_BIO:
89#endif /* CONFIG_BLOCK */ 90#endif /* CONFIG_BLOCK */
91 case CEPH_MSG_DATA_BVECS:
90 return true; 92 return true;
91 default: 93 default:
92 return false; 94 return false;
93 } 95 }
94} 96}
95 97
98#ifdef CONFIG_BLOCK
99
100struct ceph_bio_iter {
101 struct bio *bio;
102 struct bvec_iter iter;
103};
104
105#define __ceph_bio_iter_advance_step(it, n, STEP) do { \
106 unsigned int __n = (n), __cur_n; \
107 \
108 while (__n) { \
109 BUG_ON(!(it)->iter.bi_size); \
110 __cur_n = min((it)->iter.bi_size, __n); \
111 (void)(STEP); \
112 bio_advance_iter((it)->bio, &(it)->iter, __cur_n); \
113 if (!(it)->iter.bi_size && (it)->bio->bi_next) { \
114 dout("__ceph_bio_iter_advance_step next bio\n"); \
115 (it)->bio = (it)->bio->bi_next; \
116 (it)->iter = (it)->bio->bi_iter; \
117 } \
118 __n -= __cur_n; \
119 } \
120} while (0)
121
122/*
123 * Advance @it by @n bytes.
124 */
125#define ceph_bio_iter_advance(it, n) \
126 __ceph_bio_iter_advance_step(it, n, 0)
127
128/*
129 * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec.
130 */
131#define ceph_bio_iter_advance_step(it, n, BVEC_STEP) \
132 __ceph_bio_iter_advance_step(it, n, ({ \
133 struct bio_vec bv; \
134 struct bvec_iter __cur_iter; \
135 \
136 __cur_iter = (it)->iter; \
137 __cur_iter.bi_size = __cur_n; \
138 __bio_for_each_segment(bv, (it)->bio, __cur_iter, __cur_iter) \
139 (void)(BVEC_STEP); \
140 }))
141
142#endif /* CONFIG_BLOCK */
143
144struct ceph_bvec_iter {
145 struct bio_vec *bvecs;
146 struct bvec_iter iter;
147};
148
149#define __ceph_bvec_iter_advance_step(it, n, STEP) do { \
150 BUG_ON((n) > (it)->iter.bi_size); \
151 (void)(STEP); \
152 bvec_iter_advance((it)->bvecs, &(it)->iter, (n)); \
153} while (0)
154
155/*
156 * Advance @it by @n bytes.
157 */
158#define ceph_bvec_iter_advance(it, n) \
159 __ceph_bvec_iter_advance_step(it, n, 0)
160
161/*
162 * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec.
163 */
164#define ceph_bvec_iter_advance_step(it, n, BVEC_STEP) \
165 __ceph_bvec_iter_advance_step(it, n, ({ \
166 struct bio_vec bv; \
167 struct bvec_iter __cur_iter; \
168 \
169 __cur_iter = (it)->iter; \
170 __cur_iter.bi_size = (n); \
171 for_each_bvec(bv, (it)->bvecs, __cur_iter, __cur_iter) \
172 (void)(BVEC_STEP); \
173 }))
174
175#define ceph_bvec_iter_shorten(it, n) do { \
176 BUG_ON((n) > (it)->iter.bi_size); \
177 (it)->iter.bi_size = (n); \
178} while (0)
179
96struct ceph_msg_data { 180struct ceph_msg_data {
97 struct list_head links; /* ceph_msg->data */ 181 struct list_head links; /* ceph_msg->data */
98 enum ceph_msg_data_type type; 182 enum ceph_msg_data_type type;
99 union { 183 union {
100#ifdef CONFIG_BLOCK 184#ifdef CONFIG_BLOCK
101 struct { 185 struct {
102 struct bio *bio; 186 struct ceph_bio_iter bio_pos;
103 size_t bio_length; 187 u32 bio_length;
104 }; 188 };
105#endif /* CONFIG_BLOCK */ 189#endif /* CONFIG_BLOCK */
190 struct ceph_bvec_iter bvec_pos;
106 struct { 191 struct {
107 struct page **pages; /* NOT OWNER. */ 192 struct page **pages; /* NOT OWNER. */
108 size_t length; /* total # bytes */ 193 size_t length; /* total # bytes */
@@ -122,11 +207,9 @@ struct ceph_msg_data_cursor {
122 bool need_crc; /* crc update needed */ 207 bool need_crc; /* crc update needed */
123 union { 208 union {
124#ifdef CONFIG_BLOCK 209#ifdef CONFIG_BLOCK
125 struct { /* bio */ 210 struct ceph_bio_iter bio_iter;
126 struct bio *bio; /* bio from list */
127 struct bvec_iter bvec_iter;
128 };
129#endif /* CONFIG_BLOCK */ 211#endif /* CONFIG_BLOCK */
212 struct bvec_iter bvec_iter;
130 struct { /* pages */ 213 struct { /* pages */
131 unsigned int page_offset; /* offset in page */ 214 unsigned int page_offset; /* offset in page */
132 unsigned short page_index; /* index in array */ 215 unsigned short page_index; /* index in array */
@@ -290,9 +373,11 @@ extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
290extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg, 373extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
291 struct ceph_pagelist *pagelist); 374 struct ceph_pagelist *pagelist);
292#ifdef CONFIG_BLOCK 375#ifdef CONFIG_BLOCK
293extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, 376void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
294 size_t length); 377 u32 length);
295#endif /* CONFIG_BLOCK */ 378#endif /* CONFIG_BLOCK */
379void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
380 struct ceph_bvec_iter *bvec_pos);
296 381
297extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, 382extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
298 bool can_fail); 383 bool can_fail);
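
Unlike the old (bio, length) pair, the new iterator types carry an explicit position, so a message data cursor can resume mid-bio or mid-bvec after a partial send. A hedged kernel-context sketch of walking a span with the same for_each_bvec pattern the ceph_bvec_iter_advance_step() macro uses:

#include <linux/bvec.h>
#include <linux/printk.h>

/* Walk 'bytes' bytes of a bvec array from 'iter', segment by segment. */
static void walk_bvecs(struct bio_vec *bvecs, struct bvec_iter iter,
		       unsigned int bytes)
{
	struct bio_vec bv;
	struct bvec_iter it = iter;

	it.bi_size = bytes;	/* clamp the walk to the span we want */
	for_each_bvec(bv, bvecs, it, it)
		pr_debug("page %p off %u len %u\n",
			 bv.bv_page, bv.bv_offset, bv.bv_len);
}
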
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 52fb37d1c2a5..528ccc943cee 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -57,6 +57,7 @@ enum ceph_osd_data_type {
57#ifdef CONFIG_BLOCK 57#ifdef CONFIG_BLOCK
58 CEPH_OSD_DATA_TYPE_BIO, 58 CEPH_OSD_DATA_TYPE_BIO,
59#endif /* CONFIG_BLOCK */ 59#endif /* CONFIG_BLOCK */
60 CEPH_OSD_DATA_TYPE_BVECS,
60}; 61};
61 62
62struct ceph_osd_data { 63struct ceph_osd_data {
@@ -72,10 +73,11 @@ struct ceph_osd_data {
72 struct ceph_pagelist *pagelist; 73 struct ceph_pagelist *pagelist;
73#ifdef CONFIG_BLOCK 74#ifdef CONFIG_BLOCK
74 struct { 75 struct {
75 struct bio *bio; /* list of bios */ 76 struct ceph_bio_iter bio_pos;
76 size_t bio_length; /* total in list */ 77 u32 bio_length;
77 }; 78 };
78#endif /* CONFIG_BLOCK */ 79#endif /* CONFIG_BLOCK */
80 struct ceph_bvec_iter bvec_pos;
79 }; 81 };
80}; 82};
81 83
@@ -405,10 +407,14 @@ extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *,
405 unsigned int which, 407 unsigned int which,
406 struct ceph_pagelist *pagelist); 408 struct ceph_pagelist *pagelist);
407#ifdef CONFIG_BLOCK 409#ifdef CONFIG_BLOCK
408extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, 410void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
409 unsigned int which, 411 unsigned int which,
410 struct bio *bio, size_t bio_length); 412 struct ceph_bio_iter *bio_pos,
413 u32 bio_length);
411#endif /* CONFIG_BLOCK */ 414#endif /* CONFIG_BLOCK */
415void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
416 unsigned int which,
417 struct ceph_bvec_iter *bvec_pos);
412 418
413extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, 419extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
414 unsigned int which, 420 unsigned int which,
@@ -418,6 +424,9 @@ extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *,
418 struct page **pages, u64 length, 424 struct page **pages, u64 length,
419 u32 alignment, bool pages_from_pool, 425 u32 alignment, bool pages_from_pool,
420 bool own_pages); 426 bool own_pages);
427void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
428 unsigned int which,
429 struct bio_vec *bvecs, u32 bytes);
421extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, 430extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *,
422 unsigned int which, 431 unsigned int which,
423 struct page **pages, u64 length, 432 struct page **pages, u64 length,
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index d41fad99c0fa..e71fb222c7c3 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -5,7 +5,6 @@
5#include <linux/rbtree.h> 5#include <linux/rbtree.h>
6#include <linux/ceph/types.h> 6#include <linux/ceph/types.h>
7#include <linux/ceph/decode.h> 7#include <linux/ceph/decode.h>
8#include <linux/ceph/ceph_fs.h>
9#include <linux/crush/crush.h> 8#include <linux/crush/crush.h>
10 9
11/* 10/*
@@ -280,11 +279,6 @@ bool ceph_osds_changed(const struct ceph_osds *old_acting,
280 const struct ceph_osds *new_acting, 279 const struct ceph_osds *new_acting,
281 bool any_change); 280 bool any_change);
282 281
283/* calculate mapping of a file extent to an object */
284extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
285 u64 off, u64 len,
286 u64 *bno, u64 *oxoff, u64 *oxlen);
287
288int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, 282int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
289 const struct ceph_object_id *oid, 283 const struct ceph_object_id *oid,
290 const struct ceph_object_locator *oloc, 284 const struct ceph_object_locator *oloc,
diff --git a/include/linux/ceph/striper.h b/include/linux/ceph/striper.h
new file mode 100644
index 000000000000..cbd0d24b7148
--- /dev/null
+++ b/include/linux/ceph/striper.h
@@ -0,0 +1,69 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_CEPH_STRIPER_H
3#define _LINUX_CEPH_STRIPER_H
4
5#include <linux/list.h>
6#include <linux/types.h>
7
8struct ceph_file_layout;
9
10void ceph_calc_file_object_mapping(struct ceph_file_layout *l,
11 u64 off, u64 len,
12 u64 *objno, u64 *objoff, u32 *xlen);
13
14struct ceph_object_extent {
15 struct list_head oe_item;
16 u64 oe_objno;
17 u64 oe_off;
18 u64 oe_len;
19};
20
21static inline void ceph_object_extent_init(struct ceph_object_extent *ex)
22{
23 INIT_LIST_HEAD(&ex->oe_item);
24}
25
26/*
27 * Called for each mapped stripe unit.
28 *
29 * @bytes: number of bytes mapped, i.e. the minimum of the remaining
30 * file extent length and the remainder of the stripe unit
31 * within an object
32 */
33typedef void (*ceph_object_extent_fn_t)(struct ceph_object_extent *ex,
34 u32 bytes, void *arg);
35
36int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len,
37 struct list_head *object_extents,
38 struct ceph_object_extent *alloc_fn(void *arg),
39 void *alloc_arg,
40 ceph_object_extent_fn_t action_fn,
41 void *action_arg);
42int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len,
43 struct list_head *object_extents,
44 ceph_object_extent_fn_t action_fn,
45 void *action_arg);
46
47struct ceph_file_extent {
48 u64 fe_off;
49 u64 fe_len;
50};
51
52static inline u64 ceph_file_extents_bytes(struct ceph_file_extent *file_extents,
53 u32 num_file_extents)
54{
55 u64 bytes = 0;
56 u32 i;
57
58 for (i = 0; i < num_file_extents; i++)
59 bytes += file_extents[i].fe_len;
60
61 return bytes;
62}
63
64int ceph_extent_to_file(struct ceph_file_layout *l,
65 u64 objno, u64 objoff, u64 objlen,
66 struct ceph_file_extent **file_extents,
67 u32 *num_file_extents);
68
69#endif
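
A hedged usage sketch for the new API (kernel context; error handling and freeing of the extent list omitted): the caller supplies an allocator, so the extent can be embedded in a larger per-request structure as rbd's fancy-striping code does, plus a callback invoked once per mapped stripe unit.

#include <linux/ceph/striper.h>
#include <linux/slab.h>

static struct ceph_object_extent *alloc_ex(void *arg)
{
	struct ceph_object_extent *ex = kmalloc(sizeof(*ex), GFP_NOIO);

	if (ex)
		ceph_object_extent_init(ex);
	return ex;
}

static void count_bytes(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	*(u64 *)arg += bytes;		/* once per mapped stripe unit */
}

static int map_extents_sketch(struct ceph_file_layout *l, u64 off, u64 len,
			      struct list_head *extents, u64 *total)
{
	*total = 0;
	return ceph_file_to_extents(l, off, len, extents,
				    alloc_ex, NULL, count_bytes, total);
}
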
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index b4bded4b5396..12bf49772d24 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -8,6 +8,7 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
8 mon_client.o \ 8 mon_client.o \
9 cls_lock_client.o \ 9 cls_lock_client.o \
10 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ 10 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
11 striper.o \
11 debugfs.o \ 12 debugfs.o \
12 auth.o auth_none.o \ 13 auth.o auth_none.o \
13 crypto.o armor.o \ 14 crypto.o armor.o \
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4adf07826f4a..584fdbef2088 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -72,6 +72,7 @@ const char *ceph_msg_type_name(int type)
72 case CEPH_MSG_MON_GET_VERSION: return "mon_get_version"; 72 case CEPH_MSG_MON_GET_VERSION: return "mon_get_version";
73 case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply"; 73 case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply";
74 case CEPH_MSG_MDS_MAP: return "mds_map"; 74 case CEPH_MSG_MDS_MAP: return "mds_map";
75 case CEPH_MSG_FS_MAP_USER: return "fs_map_user";
75 case CEPH_MSG_CLIENT_SESSION: return "client_session"; 76 case CEPH_MSG_CLIENT_SESSION: return "client_session";
76 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; 77 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
77 case CEPH_MSG_CLIENT_REQUEST: return "client_request"; 78 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
@@ -79,8 +80,13 @@ const char *ceph_msg_type_name(int type)
79 case CEPH_MSG_CLIENT_REPLY: return "client_reply"; 80 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
80 case CEPH_MSG_CLIENT_CAPS: return "client_caps"; 81 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
81 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; 82 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
83 case CEPH_MSG_CLIENT_QUOTA: return "client_quota";
82 case CEPH_MSG_CLIENT_SNAP: return "client_snap"; 84 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
83 case CEPH_MSG_CLIENT_LEASE: return "client_lease"; 85 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
86 case CEPH_MSG_POOLOP_REPLY: return "poolop_reply";
87 case CEPH_MSG_POOLOP: return "poolop";
88 case CEPH_MSG_MON_COMMAND: return "mon_command";
89 case CEPH_MSG_MON_COMMAND_ACK: return "mon_command_ack";
84 case CEPH_MSG_OSD_MAP: return "osd_map"; 90 case CEPH_MSG_OSD_MAP: return "osd_map";
85 case CEPH_MSG_OSD_OP: return "osd_op"; 91 case CEPH_MSG_OSD_OP: return "osd_op";
86 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; 92 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
@@ -217,7 +223,7 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid)
217 223
218 if (i == 16) 224 if (i == 16)
219 err = 0; 225 err = 0;
220 dout("parse_fsid ret %d got fsid %pU", err, fsid); 226 dout("parse_fsid ret %d got fsid %pU\n", err, fsid);
221 return err; 227 return err;
222} 228}
223 229
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index bf9d079cbafd..02172c408ff2 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -347,10 +347,12 @@ struct key_type key_type_ceph = {
347 .destroy = ceph_key_destroy, 347 .destroy = ceph_key_destroy,
348}; 348};
349 349
350int ceph_crypto_init(void) { 350int __init ceph_crypto_init(void)
351{
351 return register_key_type(&key_type_ceph); 352 return register_key_type(&key_type_ceph);
352} 353}
353 354
354void ceph_crypto_shutdown(void) { 355void ceph_crypto_shutdown(void)
356{
355 unregister_key_type(&key_type_ceph); 357 unregister_key_type(&key_type_ceph);
356} 358}
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 1eef6806aa1a..02952605d121 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -389,7 +389,7 @@ CEPH_DEFINE_SHOW_FUNC(monc_show)
389CEPH_DEFINE_SHOW_FUNC(osdc_show) 389CEPH_DEFINE_SHOW_FUNC(osdc_show)
390CEPH_DEFINE_SHOW_FUNC(client_options_show) 390CEPH_DEFINE_SHOW_FUNC(client_options_show)
391 391
392int ceph_debugfs_init(void) 392int __init ceph_debugfs_init(void)
393{ 393{
394 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); 394 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
395 if (!ceph_debugfs_dir) 395 if (!ceph_debugfs_dir)
@@ -418,7 +418,7 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 		goto out;

 	client->monc.debugfs_file = debugfs_create_file("monc",
-						      0600,
+						      0400,
 						      client->debugfs_dir,
 						      client,
 						      &monc_show_fops);
@@ -426,7 +426,7 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 		goto out;

 	client->osdc.debugfs_file = debugfs_create_file("osdc",
-						      0600,
+						      0400,
 						      client->debugfs_dir,
 						      client,
 						      &osdc_show_fops);
@@ -434,7 +434,7 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 		goto out;

 	client->debugfs_monmap = debugfs_create_file("monmap",
-					0600,
+					0400,
 					client->debugfs_dir,
 					client,
 					&monmap_show_fops);
@@ -442,7 +442,7 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 		goto out;

 	client->debugfs_osdmap = debugfs_create_file("osdmap",
-					0600,
+					0400,
 					client->debugfs_dir,
 					client,
 					&osdmap_show_fops);
@@ -450,7 +450,7 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 		goto out;

 	client->debugfs_options = debugfs_create_file("client_options",
-					0600,
+					0400,
 					client->debugfs_dir,
 					client,
 					&client_options_show_fops);
@@ -477,7 +477,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)

 #else  /* CONFIG_DEBUG_FS */

-int ceph_debugfs_init(void)
+int __init ceph_debugfs_init(void)
 {
 	return 0;
 }
@@ -496,6 +496,3 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
 }

 #endif  /* CONFIG_DEBUG_FS */
-
-EXPORT_SYMBOL(ceph_debugfs_init);
-EXPORT_SYMBOL(ceph_debugfs_cleanup);
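
The 0600 -> 0400 mode changes above make the debugfs dumps read-only: the seq_file show callbacks behind them implement no write method, so advertising write permission was misleading. A small, self-contained sketch of the same shape (hypothetical "example" names, not the libceph code; assumes the DEFINE_SHOW_ATTRIBUTE() helper):

#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static struct dentry *example_dir;

/* Hypothetical read-only dump, for illustration only. */
static int example_show(struct seq_file *s, void *p)
{
	seq_puts(s, "state: ok\n");
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(example);	/* generates example_fops */

static int __init example_debugfs_init(void)
{
	example_dir = debugfs_create_dir("example", NULL);
	if (!example_dir)
		return -ENOMEM;

	/* 0400: read-only, matching the absence of a write handler */
	debugfs_create_file("state", 0400, example_dir, NULL, &example_fops);
	return 0;
}

static void __exit example_debugfs_exit(void)
{
	debugfs_remove_recursive(example_dir);
}

module_init(example_debugfs_init);
module_exit(example_debugfs_exit);
MODULE_LICENSE("GPL");
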
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 8a4d3758030b..fcb40c12b1f8 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -277,7 +277,7 @@ static void _ceph_msgr_exit(void)
 	ceph_msgr_slab_exit();
 }

-int ceph_msgr_init(void)
+int __init ceph_msgr_init(void)
 {
 	if (ceph_msgr_slab_init())
 		return -ENOMEM;
@@ -299,7 +299,6 @@ int ceph_msgr_init(void)

 	return -ENOMEM;
 }
-EXPORT_SYMBOL(ceph_msgr_init);

 void ceph_msgr_exit(void)
 {
@@ -307,7 +306,6 @@ void ceph_msgr_exit(void)

 	_ceph_msgr_exit();
 }
-EXPORT_SYMBOL(ceph_msgr_exit);

 void ceph_msgr_flush(void)
 {
@@ -839,93 +837,112 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
 				  size_t length)
 {
 	struct ceph_msg_data *data = cursor->data;
-	struct bio *bio;
+	struct ceph_bio_iter *it = &cursor->bio_iter;

-	BUG_ON(data->type != CEPH_MSG_DATA_BIO);
+	cursor->resid = min_t(size_t, length, data->bio_length);
+	*it = data->bio_pos;
+	if (cursor->resid < it->iter.bi_size)
+		it->iter.bi_size = cursor->resid;

-	bio = data->bio;
-	BUG_ON(!bio);
-
-	cursor->resid = min(length, data->bio_length);
-	cursor->bio = bio;
-	cursor->bvec_iter = bio->bi_iter;
-	cursor->last_piece =
-		cursor->resid <= bio_iter_len(bio, cursor->bvec_iter);
+	BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
+	cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
 }

 static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
 					   size_t *page_offset,
 					   size_t *length)
 {
-	struct ceph_msg_data *data = cursor->data;
-	struct bio *bio;
-	struct bio_vec bio_vec;
-
-	BUG_ON(data->type != CEPH_MSG_DATA_BIO);
-
-	bio = cursor->bio;
-	BUG_ON(!bio);
-
-	bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
-
-	*page_offset = (size_t) bio_vec.bv_offset;
-	BUG_ON(*page_offset >= PAGE_SIZE);
-	if (cursor->last_piece) /* pagelist offset is always 0 */
-		*length = cursor->resid;
-	else
-		*length = (size_t) bio_vec.bv_len;
-	BUG_ON(*length > cursor->resid);
-	BUG_ON(*page_offset + *length > PAGE_SIZE);
+	struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio,
+					   cursor->bio_iter.iter);

-	return bio_vec.bv_page;
+	*page_offset = bv.bv_offset;
+	*length = bv.bv_len;
+	return bv.bv_page;
 }

 static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
 				      size_t bytes)
 {
-	struct bio *bio;
-	struct bio_vec bio_vec;
+	struct ceph_bio_iter *it = &cursor->bio_iter;

-	BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
+	BUG_ON(bytes > cursor->resid);
+	BUG_ON(bytes > bio_iter_len(it->bio, it->iter));
+	cursor->resid -= bytes;
+	bio_advance_iter(it->bio, &it->iter, bytes);

-	bio = cursor->bio;
-	BUG_ON(!bio);
+	if (!cursor->resid) {
+		BUG_ON(!cursor->last_piece);
+		return false;   /* no more data */
+	}

-	bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
+	if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done))
+		return false;	/* more bytes to process in this segment */

-	/* Advance the cursor offset */
+	if (!it->iter.bi_size) {
+		it->bio = it->bio->bi_next;
+		it->iter = it->bio->bi_iter;
+		if (cursor->resid < it->iter.bi_size)
+			it->iter.bi_size = cursor->resid;
+	}

-	BUG_ON(cursor->resid < bytes);
-	cursor->resid -= bytes;
+	BUG_ON(cursor->last_piece);
+	BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
+	cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
+	return true;
+}
+#endif /* CONFIG_BLOCK */

-	bio_advance_iter(bio, &cursor->bvec_iter, bytes);
+static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor,
+					    size_t length)
+{
+	struct ceph_msg_data *data = cursor->data;
+	struct bio_vec *bvecs = data->bvec_pos.bvecs;

-	if (bytes < bio_vec.bv_len)
-		return false;	/* more bytes to process in this segment */
+	cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size);
+	cursor->bvec_iter = data->bvec_pos.iter;
+	cursor->bvec_iter.bi_size = cursor->resid;

-	/* Move on to the next segment, and possibly the next bio */
+	BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
+	cursor->last_piece =
+	    cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter);
+}

-	if (!cursor->bvec_iter.bi_size) {
-		bio = bio->bi_next;
-		cursor->bio = bio;
-		if (bio)
-			cursor->bvec_iter = bio->bi_iter;
-		else
-			memset(&cursor->bvec_iter, 0,
-			       sizeof(cursor->bvec_iter));
-	}
+static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor,
+					     size_t *page_offset,
+					     size_t *length)
+{
+	struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs,
+					   cursor->bvec_iter);
+
+	*page_offset = bv.bv_offset;
+	*length = bv.bv_len;
+	return bv.bv_page;
+}
+
+static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor,
+					size_t bytes)
+{
+	struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs;
+
+	BUG_ON(bytes > cursor->resid);
+	BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter));
+	cursor->resid -= bytes;
+	bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes);

-	if (!cursor->last_piece) {
-		BUG_ON(!cursor->resid);
-		BUG_ON(!bio);
-		/* A short read is OK, so use <= rather than == */
-		if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter))
-			cursor->last_piece = true;
+	if (!cursor->resid) {
+		BUG_ON(!cursor->last_piece);
+		return false;   /* no more data */
 	}

+	if (!bytes || cursor->bvec_iter.bi_bvec_done)
+		return false;	/* more bytes to process in this segment */
+
+	BUG_ON(cursor->last_piece);
+	BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
+	cursor->last_piece =
+	    cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter);
 	return true;
 }
-#endif /* CONFIG_BLOCK */

 /*
  * For a page array, a piece comes from the first page in the array
@@ -1110,6 +1127,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
 		ceph_msg_data_bio_cursor_init(cursor, length);
 		break;
 #endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_BVECS:
+		ceph_msg_data_bvecs_cursor_init(cursor, length);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		/* BUG(); */
@@ -1158,14 +1178,19 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
 		page = ceph_msg_data_bio_next(cursor, page_offset, length);
 		break;
 #endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_BVECS:
+		page = ceph_msg_data_bvecs_next(cursor, page_offset, length);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		page = NULL;
 		break;
 	}
+
 	BUG_ON(!page);
 	BUG_ON(*page_offset + *length > PAGE_SIZE);
 	BUG_ON(!*length);
+	BUG_ON(*length > cursor->resid);
 	if (last_piece)
 		*last_piece = cursor->last_piece;

@@ -1194,6 +1219,9 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
 		new_piece = ceph_msg_data_bio_advance(cursor, bytes);
 		break;
 #endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_BVECS:
+		new_piece = ceph_msg_data_bvecs_advance(cursor, bytes);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		BUG();
@@ -1575,13 +1603,18 @@ static int write_partial_message_data(struct ceph_connection *con)
 	 * been revoked, so use the zero page.
 	 */
 	crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
-	while (cursor->resid) {
+	while (cursor->total_resid) {
 		struct page *page;
 		size_t page_offset;
 		size_t length;
 		bool last_piece;
 		int ret;

+		if (!cursor->resid) {
+			ceph_msg_data_advance(cursor, 0);
+			continue;
+		}
+
 		page = ceph_msg_data_next(cursor, &page_offset, &length,
 					  &last_piece);
 		ret = ceph_tcp_sendpage(con->sock, page, page_offset,
@@ -2297,7 +2330,12 @@ static int read_partial_msg_data(struct ceph_connection *con)

 	if (do_datacrc)
 		crc = con->in_data_crc;
-	while (cursor->resid) {
+	while (cursor->total_resid) {
+		if (!cursor->resid) {
+			ceph_msg_data_advance(cursor, 0);
+			continue;
+		}
+
 		page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
 		ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
 		if (ret <= 0) {
@@ -3262,16 +3300,14 @@ void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
 EXPORT_SYMBOL(ceph_msg_data_add_pagelist);

 #ifdef CONFIG_BLOCK
-void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
-			   size_t length)
+void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
+			   u32 length)
 {
 	struct ceph_msg_data *data;

-	BUG_ON(!bio);
-
 	data = ceph_msg_data_create(CEPH_MSG_DATA_BIO);
 	BUG_ON(!data);
-	data->bio = bio;
+	data->bio_pos = *bio_pos;
 	data->bio_length = length;

 	list_add_tail(&data->links, &msg->data);
@@ -3280,6 +3316,20 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
 EXPORT_SYMBOL(ceph_msg_data_add_bio);
 #endif /* CONFIG_BLOCK */

+void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
+			     struct ceph_bvec_iter *bvec_pos)
+{
+	struct ceph_msg_data *data;
+
+	data = ceph_msg_data_create(CEPH_MSG_DATA_BVECS);
+	BUG_ON(!data);
+	data->bvec_pos = *bvec_pos;
+
+	list_add_tail(&data->links, &msg->data);
+	msg->data_length += bvec_pos->iter.bi_size;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
+
 /*
  * construct a new message with given type, size
  * the new msg has a ref count of 1.
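
The messenger rework above replaces the hand-rolled bio walking with the generic iterators: a cursor is now just a (bio chain or bio_vec array, iterator) pair plus resid, and each step yields one page fragment via bio_iter_iovec()/bvec_iter_bvec() and consumes it via bio_advance_iter()/bvec_iter_advance(). A stand-alone sketch of the bvec flavour of that loop (hypothetical walk_bvecs()/process(), outside the messenger):

#include <linux/bvec.h>

/*
 * Walk a bio_vec array one fragment at a time, the way the new
 * ceph_msg_data_bvecs_* cursor does.  @bytes plays the role of
 * cursor->resid; process() stands in for sending or CRCing a fragment.
 */
static void walk_bvecs(struct bio_vec *bvecs, unsigned int bytes,
		       void (*process)(struct page *page,
				       unsigned int off, unsigned int len))
{
	struct bvec_iter iter = { .bi_size = bytes };

	while (iter.bi_size) {
		struct bio_vec bv = bvec_iter_bvec(bvecs, iter);

		process(bv.bv_page, bv.bv_offset, bv.bv_len);
		bvec_iter_advance(bvecs, &iter, bv.bv_len);
	}
}

The while (cursor->total_resid) change in write_partial_message_data() and read_partial_msg_data() pairs with this: when the current data item's resid reaches zero, the loop advances the cursor by zero bytes to move on to the next item instead of terminating early.
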
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 1547107f4854..b3dac24412d3 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -60,7 +60,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	num_mon = ceph_decode_32(&p);
 	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);

-	if (num_mon >= CEPH_MAX_MON)
+	if (num_mon > CEPH_MAX_MON)
 		goto bad;
 	m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
 	if (m == NULL)
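
The ceph_monmap_decode() fix is a plain off-by-one: a monmap carrying exactly CEPH_MAX_MON monitors is valid, so only counts strictly greater than the limit should be rejected. A hedged sketch of the decode shape (simplified, with an illustrative struct and limit, not the real wire format):

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

#define EXAMPLE_MAX_MON 31	/* illustrative limit, not CEPH_MAX_MON */

struct example_map {
	u32 num;
	u64 inst[];		/* flexible array, one entry per monitor */
};

/* Reject only num > max: a map with exactly max entries is legal. */
static struct example_map *example_map_decode(const void *p, u32 num)
{
	struct example_map *m;

	if (num > EXAMPLE_MAX_MON)
		return NULL;

	m = kmalloc(sizeof(*m) + num * sizeof(m->inst[0]), GFP_NOFS);
	if (!m)
		return NULL;

	m->num = num;
	memcpy(m->inst, p, num * sizeof(m->inst[0]));
	return m;
}
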
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2814dba5902d..ea2a6c9fb7ce 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -20,6 +20,7 @@
 #include <linux/ceph/decode.h>
 #include <linux/ceph/auth.h>
 #include <linux/ceph/pagelist.h>
+#include <linux/ceph/striper.h>

 #define OSD_OPREPLY_FRONT_LEN	512

@@ -103,13 +104,12 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
 		       u64 *objnum, u64 *objoff, u64 *objlen)
 {
 	u64 orig_len = *plen;
-	int r;
+	u32 xlen;

 	/* object extent? */
-	r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
-					  objoff, objlen);
-	if (r < 0)
-		return r;
+	ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
+				      objoff, &xlen);
+	*objlen = xlen;
 	if (*objlen < orig_len) {
 		*plen = *objlen;
 		dout(" skipping last %llu, final file extent %llu~%llu\n",
@@ -117,7 +117,6 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
 	}

 	dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);
-
 	return 0;
 }

@@ -148,14 +147,22 @@ static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,

 #ifdef CONFIG_BLOCK
 static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
-			struct bio *bio, size_t bio_length)
+				   struct ceph_bio_iter *bio_pos,
+				   u32 bio_length)
 {
 	osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
-	osd_data->bio = bio;
+	osd_data->bio_pos = *bio_pos;
 	osd_data->bio_length = bio_length;
 }
 #endif /* CONFIG_BLOCK */

+static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
+				     struct ceph_bvec_iter *bvec_pos)
+{
+	osd_data->type = CEPH_OSD_DATA_TYPE_BVECS;
+	osd_data->bvec_pos = *bvec_pos;
+}
+
 #define osd_req_op_data(oreq, whch, typ, fld)				\
 ({									\
 	struct ceph_osd_request *__oreq = (oreq);			\
@@ -218,16 +225,29 @@ EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);

 #ifdef CONFIG_BLOCK
 void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
-		unsigned int which, struct bio *bio, size_t bio_length)
+				    unsigned int which,
+				    struct ceph_bio_iter *bio_pos,
+				    u32 bio_length)
 {
 	struct ceph_osd_data *osd_data;

 	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
-	ceph_osd_data_bio_init(osd_data, bio, bio_length);
+	ceph_osd_data_bio_init(osd_data, bio_pos, bio_length);
 }
 EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
 #endif /* CONFIG_BLOCK */

+void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
+					 unsigned int which,
+					 struct ceph_bvec_iter *bvec_pos)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+	ceph_osd_data_bvecs_init(osd_data, bvec_pos);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
+
 static void osd_req_op_cls_request_info_pagelist(
 			struct ceph_osd_request *osd_req,
 			unsigned int which, struct ceph_pagelist *pagelist)
@@ -265,6 +285,23 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
 }
 EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);

+void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
+				       unsigned int which,
+				       struct bio_vec *bvecs, u32 bytes)
+{
+	struct ceph_osd_data *osd_data;
+	struct ceph_bvec_iter it = {
+		.bvecs = bvecs,
+		.iter = { .bi_size = bytes },
+	};
+
+	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
+	ceph_osd_data_bvecs_init(osd_data, &it);
+	osd_req->r_ops[which].cls.indata_len += bytes;
+	osd_req->r_ops[which].indata_len += bytes;
+}
+EXPORT_SYMBOL(osd_req_op_cls_request_data_bvecs);
+
 void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
 			unsigned int which, struct page **pages, u64 length,
 			u32 alignment, bool pages_from_pool, bool own_pages)
@@ -290,6 +327,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
 	case CEPH_OSD_DATA_TYPE_BIO:
 		return (u64)osd_data->bio_length;
 #endif /* CONFIG_BLOCK */
+	case CEPH_OSD_DATA_TYPE_BVECS:
+		return osd_data->bvec_pos.iter.bi_size;
 	default:
 		WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
 		return 0;
@@ -828,8 +867,10 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
 		ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
 #ifdef CONFIG_BLOCK
 	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
-		ceph_msg_data_add_bio(msg, osd_data->bio, length);
+		ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length);
 #endif
+	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
+		ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
 	} else {
 		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
 	}
@@ -5065,7 +5106,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 }
 EXPORT_SYMBOL(ceph_osdc_writepages);

-int ceph_osdc_setup(void)
+int __init ceph_osdc_setup(void)
 {
 	size_t size = sizeof(struct ceph_osd_request) +
 	    CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
@@ -5076,7 +5117,6 @@ int ceph_osdc_setup(void)

 	return ceph_osd_request_cache ? 0 : -ENOMEM;
 }
-EXPORT_SYMBOL(ceph_osdc_setup);

 void ceph_osdc_cleanup(void)
 {
@@ -5084,7 +5124,6 @@ void ceph_osdc_cleanup(void)
 	kmem_cache_destroy(ceph_osd_request_cache);
 	ceph_osd_request_cache = NULL;
 }
-EXPORT_SYMBOL(ceph_osdc_cleanup);

 /*
  * handle incoming message
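
Together, ceph_osd_data_bvecs_init(), osd_req_op_extent_osd_data_bvec_pos() and osd_req_op_cls_request_data_bvecs() give OSD request callers a third data descriptor alongside page arrays and bios: a bio_vec array plus a bvec_iter recording position and length. This is what rbd's fancy-striping code feeds in. A hedged sketch of the caller side (assumes @req was already built with an extent op at index 0; my_bvecs/bytes are illustrative names):

#include <linux/bvec.h>
#include <linux/ceph/osd_client.h>

/*
 * Attach a positioned bio_vec array to extent op 0 of @req.  The
 * iterator's bi_size caps how many bytes the messenger will walk,
 * so the array may describe more data than the I/O itself.
 */
static void example_attach_bvecs(struct ceph_osd_request *req,
				 struct bio_vec *my_bvecs, u32 bytes)
{
	struct ceph_bvec_iter it = {
		.bvecs = my_bvecs,
		.iter = { .bi_size = bytes },
	};

	osd_req_op_extent_osd_data_bvec_pos(req, 0, &it);
}
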
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 0da27c66349a..9645ffd6acfb 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -4,7 +4,6 @@

 #include <linux/module.h>
 #include <linux/slab.h>
-#include <asm/div64.h>

 #include <linux/ceph/libceph.h>
 #include <linux/ceph/osdmap.h>
@@ -2141,76 +2140,6 @@ bool ceph_osds_changed(const struct ceph_osds *old_acting,
 }

 /*
- * calculate file layout from given offset, length.
- * fill in correct oid, logical length, and object extent
- * offset, length.
- *
- * for now, we write only a single su, until we can
- * pass a stride back to the caller.
- */
-int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-				   u64 off, u64 len,
-				   u64 *ono,
-				   u64 *oxoff, u64 *oxlen)
-{
-	u32 osize = layout->object_size;
-	u32 su = layout->stripe_unit;
-	u32 sc = layout->stripe_count;
-	u32 bl, stripeno, stripepos, objsetno;
-	u32 su_per_object;
-	u64 t, su_offset;
-
-	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, len,
-	     osize, su);
-	if (su == 0 || sc == 0)
-		goto invalid;
-	su_per_object = osize / su;
-	if (su_per_object == 0)
-		goto invalid;
-	dout("osize %u / su %u = su_per_object %u\n", osize, su,
-	     su_per_object);
-
-	if ((su & ~PAGE_MASK) != 0)
-		goto invalid;
-
-	/* bl = *off / su; */
-	t = off;
-	do_div(t, su);
-	bl = t;
-	dout("off %llu / su %u = bl %u\n", off, su, bl);
-
-	stripeno = bl / sc;
-	stripepos = bl % sc;
-	objsetno = stripeno / su_per_object;
-
-	*ono = objsetno * sc + stripepos;
-	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono);
-
-	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
-	t = off;
-	su_offset = do_div(t, su);
-	*oxoff = su_offset + (stripeno % su_per_object) * su;
-
-	/*
-	 * Calculate the length of the extent being written to the selected
-	 * object. This is the minimum of the full length requested (len) or
-	 * the remainder of the current stripe being written to.
-	 */
-	*oxlen = min_t(u64, len, su - su_offset);
-
-	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
-	return 0;
-
-invalid:
-	dout(" invalid layout\n");
-	*ono = 0;
-	*oxoff = 0;
-	*oxlen = 0;
-	return -EINVAL;
-}
-EXPORT_SYMBOL(ceph_calc_file_object_mapping);
-
-/*
  * Map an object into a PG.
  *
  * Should only be called with target_oid and target_oloc (as opposed to
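
With the mapping function gone from osdmap.c, its open-coded do_div() arithmetic is replaced in the new net/ceph/striper.c by div_u64_rem(), which returns the quotient and stores the remainder rather than dividing its argument in place, hence the dropped <asm/div64.h> include. A small equivalence sketch (illustrative only):

#include <asm/div64.h>
#include <linux/math64.h>

/* Old style: do_div() divides @t in place and returns the remainder. */
static u64 quotient_old(u64 off, u32 su, u32 *rem)
{
	u64 t = off;

	*rem = do_div(t, su);	/* t is now off / su */
	return t;
}

/* New style: div_u64_rem() returns the quotient directly. */
static u64 quotient_new(u64 off, u32 su, u32 *rem)
{
	return div_u64_rem(off, su, rem);
}
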
diff --git a/net/ceph/striper.c b/net/ceph/striper.c
new file mode 100644
index 000000000000..c36462dc86b7
--- /dev/null
+++ b/net/ceph/striper.c
@@ -0,0 +1,261 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/math64.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/striper.h>
+#include <linux/ceph/types.h>
+
+/*
+ * Map a file extent to a stripe unit within an object.
+ * Fill in objno, offset into object, and object extent length (i.e. the
+ * number of bytes mapped, less than or equal to @l->stripe_unit).
+ *
+ * Example for stripe_count = 3, stripes_per_object = 4:
+ *
+ * blockno   |  0  3  6  9 |  1  4  7 10 |  2  5  8 11 | 12 15 18 21 | 13 16 19
+ * stripeno  |  0  1  2  3 |  0  1  2  3 |  0  1  2  3 |  4  5  6  7 |  4  5  6
+ * stripepos |      0      |      1      |      2      |      0      |      1
+ * objno     |      0      |      1      |      2      |      3      |      4
+ * objsetno  |                    0                    |                    1
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *l,
+				   u64 off, u64 len,
+				   u64 *objno, u64 *objoff, u32 *xlen)
+{
+	u32 stripes_per_object = l->object_size / l->stripe_unit;
+	u64 blockno;	/* which su in the file (i.e. globally) */
+	u32 blockoff;	/* offset into su */
+	u64 stripeno;	/* which stripe */
+	u32 stripepos;	/* which su in the stripe,
+			   which object in the object set */
+	u64 objsetno;	/* which object set */
+	u32 objsetpos;	/* which stripe in the object set */
+
+	blockno = div_u64_rem(off, l->stripe_unit, &blockoff);
+	stripeno = div_u64_rem(blockno, l->stripe_count, &stripepos);
+	objsetno = div_u64_rem(stripeno, stripes_per_object, &objsetpos);
+
+	*objno = objsetno * l->stripe_count + stripepos;
+	*objoff = objsetpos * l->stripe_unit + blockoff;
+	*xlen = min_t(u64, len, l->stripe_unit - blockoff);
+}
+EXPORT_SYMBOL(ceph_calc_file_object_mapping);
+
+/*
+ * Return the last extent with given objno (@object_extents is sorted
+ * by objno).  If not found, return NULL and set @add_pos so that the
+ * new extent can be added with list_add(add_pos, new_ex).
+ */
+static struct ceph_object_extent *
+lookup_last(struct list_head *object_extents, u64 objno,
+	    struct list_head **add_pos)
+{
+	struct list_head *pos;
+
+	list_for_each_prev(pos, object_extents) {
+		struct ceph_object_extent *ex =
+		    list_entry(pos, typeof(*ex), oe_item);
+
+		if (ex->oe_objno == objno)
+			return ex;
+
+		if (ex->oe_objno < objno)
+			break;
+	}
+
+	*add_pos = pos;
+	return NULL;
+}
+
+static struct ceph_object_extent *
+lookup_containing(struct list_head *object_extents, u64 objno,
+		  u64 objoff, u32 xlen)
+{
+	struct ceph_object_extent *ex;
+
+	list_for_each_entry(ex, object_extents, oe_item) {
+		if (ex->oe_objno == objno &&
+		    ex->oe_off <= objoff &&
+		    ex->oe_off + ex->oe_len >= objoff + xlen) /* paranoia */
+			return ex;
+
+		if (ex->oe_objno > objno)
+			break;
+	}
+
+	return NULL;
+}
+
+/*
+ * Map a file extent to a sorted list of object extents.
+ *
+ * We want only one (or as few as possible) object extents per object.
+ * Adjacent object extents will be merged together, each returned object
+ * extent may reverse map to multiple different file extents.
+ *
+ * Call @alloc_fn for each new object extent and @action_fn for each
+ * mapped stripe unit, whether it was merged into an already allocated
+ * object extent or started a new object extent.
+ *
+ * Newly allocated object extents are added to @object_extents.
+ * To keep @object_extents sorted, successive calls to this function
+ * must map successive file extents (i.e. the list of file extents that
+ * are mapped using the same @object_extents must be sorted).
+ *
+ * The caller is responsible for @object_extents.
+ */
+int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len,
+			 struct list_head *object_extents,
+			 struct ceph_object_extent *alloc_fn(void *arg),
+			 void *alloc_arg,
+			 ceph_object_extent_fn_t action_fn,
+			 void *action_arg)
+{
+	struct ceph_object_extent *last_ex, *ex;
+
+	while (len) {
+		struct list_head *add_pos = NULL;
+		u64 objno, objoff;
+		u32 xlen;
+
+		ceph_calc_file_object_mapping(l, off, len, &objno, &objoff,
+					      &xlen);
+
+		last_ex = lookup_last(object_extents, objno, &add_pos);
+		if (!last_ex || last_ex->oe_off + last_ex->oe_len != objoff) {
+			ex = alloc_fn(alloc_arg);
+			if (!ex)
+				return -ENOMEM;
+
+			ex->oe_objno = objno;
+			ex->oe_off = objoff;
+			ex->oe_len = xlen;
+			if (action_fn)
+				action_fn(ex, xlen, action_arg);
+
+			if (!last_ex)
+				list_add(&ex->oe_item, add_pos);
+			else
+				list_add(&ex->oe_item, &last_ex->oe_item);
+		} else {
+			last_ex->oe_len += xlen;
+			if (action_fn)
+				action_fn(last_ex, xlen, action_arg);
+		}
+
+		off += xlen;
+		len -= xlen;
+	}
+
+	for (last_ex = list_first_entry(object_extents, typeof(*ex), oe_item),
+	     ex = list_next_entry(last_ex, oe_item);
+	     &ex->oe_item != object_extents;
+	     last_ex = ex, ex = list_next_entry(ex, oe_item)) {
+		if (last_ex->oe_objno > ex->oe_objno ||
+		    (last_ex->oe_objno == ex->oe_objno &&
+		     last_ex->oe_off + last_ex->oe_len >= ex->oe_off)) {
+			WARN(1, "%s: object_extents list not sorted!\n",
+			     __func__);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ceph_file_to_extents);
+
+/*
+ * A stripped down, non-allocating version of ceph_file_to_extents(),
+ * for when @object_extents is already populated.
+ */
+int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len,
+			 struct list_head *object_extents,
+			 ceph_object_extent_fn_t action_fn,
+			 void *action_arg)
+{
+	while (len) {
+		struct ceph_object_extent *ex;
+		u64 objno, objoff;
+		u32 xlen;
+
+		ceph_calc_file_object_mapping(l, off, len, &objno, &objoff,
+					      &xlen);
+
+		ex = lookup_containing(object_extents, objno, objoff, xlen);
+		if (!ex) {
+			WARN(1, "%s: objno %llu %llu~%u not found!\n",
+			     __func__, objno, objoff, xlen);
+			return -EINVAL;
+		}
+
+		action_fn(ex, xlen, action_arg);
+
+		off += xlen;
+		len -= xlen;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ceph_iterate_extents);
+
+/*
+ * Reverse map an object extent to a sorted list of file extents.
+ *
+ * On success, the caller is responsible for:
+ *
+ *     kfree(file_extents)
+ */
+int ceph_extent_to_file(struct ceph_file_layout *l,
+			u64 objno, u64 objoff, u64 objlen,
+			struct ceph_file_extent **file_extents,
+			u32 *num_file_extents)
+{
+	u32 stripes_per_object = l->object_size / l->stripe_unit;
+	u64 blockno;	/* which su */
+	u32 blockoff;	/* offset into su */
+	u64 stripeno;	/* which stripe */
+	u32 stripepos;	/* which su in the stripe,
+			   which object in the object set */
+	u64 objsetno;	/* which object set */
+	u32 i = 0;
+
+	if (!objlen) {
+		*file_extents = NULL;
+		*num_file_extents = 0;
+		return 0;
+	}
+
+	*num_file_extents = DIV_ROUND_UP_ULL(objoff + objlen, l->stripe_unit) -
+			    DIV_ROUND_DOWN_ULL(objoff, l->stripe_unit);
+	*file_extents = kmalloc_array(*num_file_extents, sizeof(**file_extents),
+				      GFP_NOIO);
+	if (!*file_extents)
+		return -ENOMEM;
+
+	div_u64_rem(objoff, l->stripe_unit, &blockoff);
+	while (objlen) {
+		u64 off, len;
+
+		objsetno = div_u64_rem(objno, l->stripe_count, &stripepos);
+		stripeno = div_u64(objoff, l->stripe_unit) +
+			   objsetno * stripes_per_object;
+		blockno = stripeno * l->stripe_count + stripepos;
+		off = blockno * l->stripe_unit + blockoff;
+		len = min_t(u64, objlen, l->stripe_unit - blockoff);
+
+		(*file_extents)[i].fe_off = off;
+		(*file_extents)[i].fe_len = len;
+
+		blockoff = 0;
+		objoff += len;
+		objlen -= len;
+		i++;
+	}
+
+	BUG_ON(i != *num_file_extents);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_extent_to_file);
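
To make the striping arithmetic concrete: with stripe_unit = 64 KiB, stripe_count = 3 and object_size = 256 KiB (so stripes_per_object = 4), a file offset of 9 su + 100 bytes gives blockno 9 and blockoff 100; blockno 9 yields stripeno 3 and stripepos 0, hence objsetno 0 and objsetpos 3, and finally objno 0 with objoff 3 su + 100 — blockno 9 does sit in the objno 0 column of the table above. A user-space restatement of the same computation (a study sketch with illustrative numbers, not the kernel code):

#include <stdint.h>
#include <stdio.h>

/* User-space mirror of ceph_calc_file_object_mapping(), for study. */
static void map_extent(uint32_t su, uint32_t sc, uint32_t object_size,
		       uint64_t off, uint64_t len,
		       uint64_t *objno, uint64_t *objoff, uint32_t *xlen)
{
	uint32_t stripes_per_object = object_size / su;
	uint64_t blockno = off / su;		/* which su globally */
	uint32_t blockoff = off % su;		/* offset into that su */
	uint64_t stripeno = blockno / sc;	/* which stripe */
	uint32_t stripepos = blockno % sc;	/* object within object set */
	uint64_t objsetno = stripeno / stripes_per_object;
	uint32_t objsetpos = stripeno % stripes_per_object;

	*objno = objsetno * sc + stripepos;
	*objoff = (uint64_t)objsetpos * su + blockoff;
	*xlen = len < su - blockoff ? len : su - blockoff;
}

int main(void)
{
	uint64_t objno, objoff;
	uint32_t xlen;

	/* su = 64K, sc = 3, object_size = 256K; offset 9 su + 100 bytes */
	map_extent(65536, 3, 262144, 9ULL * 65536 + 100, 200000,
		   &objno, &objoff, &xlen);
	printf("objno %llu objoff %llu xlen %u\n",
	       (unsigned long long)objno, (unsigned long long)objoff, xlen);
	/* prints: objno 0 objoff 196708 xlen 65436 */
	return 0;
}
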