author		Linus Torvalds <torvalds@linux-foundation.org>	2019-07-18 14:05:25 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-07-18 14:05:25 -0400
commit		d9b9c893048e9d308a833619f0866f1f52778cf5 (patch)
tree		29090d6871a39fdf35b6e5b22fe49750e9cf7bb3
parent		0fe49f70a08d7d25acee3b066a88c654fea26121 (diff)
parent		d31d07b97a5e76f41e00eb81dcca740e84aa7782 (diff)
Merge tag 'ceph-for-5.3-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
 "Lots of exciting things this time!

  - support for rbd object-map and fast-diff features (myself). This
    will speed up reads, discards and things like snap diffs on sparse
    images.

  - ceph.snap.btime vxattr to expose snapshot creation time (David
    Disseldorp). This will be used to integrate with "Restore Previous
    Versions" feature added in Windows 7 for folks who reexport ceph
    through SMB.

  - security xattrs for ceph (Zheng Yan). Only selinux is supported for
    now due to the limitations of ->dentry_init_security().

  - support for MSG_ADDR2, FS_BTIME and FS_CHANGE_ATTR features (Jeff
    Layton). This is actually a single feature bit which was missing
    because of the filesystem pieces. With this in, the kernel client
    will finally be reported as "luminous" by "ceph features" -- it is
    still being reported as "jewel" even though all required Luminous
    features were implemented in 4.13.

  - stop NULL-terminating ceph vxattrs (Jeff Layton). The convention
    with xattrs is to not terminate and this was causing
    inconsistencies with ceph-fuse.

  - change filesystem time granularity from 1 us to 1 ns, again fixing
    an inconsistency with ceph-fuse (Luis Henriques).

  On top of this there are some additional dentry name handling and cap
  flushing fixes from Zheng. Finally, Jeff is formally taking over for
  Zheng as the filesystem maintainer"

* tag 'ceph-for-5.3-rc1' of git://github.com/ceph/ceph-client: (71 commits)
  ceph: fix end offset in truncate_inode_pages_range call
  ceph: use generic_delete_inode() for ->drop_inode
  ceph: use ceph_evict_inode to cleanup inode's resource
  ceph: initialize superblock s_time_gran to 1
  MAINTAINERS: take over for Zheng as CephFS kernel client maintainer
  rbd: setallochint only if object doesn't exist
  rbd: support for object-map and fast-diff
  rbd: call rbd_dev_mapping_set() from rbd_dev_image_probe()
  libceph: export osd_req_op_data() macro
  libceph: change ceph_osdc_call() to take page vector for response
  libceph: bump CEPH_MSG_MAX_DATA_LEN (again)
  rbd: new exclusive lock wait/wake code
  rbd: quiescing lock should wait for image requests
  rbd: lock should be quiesced on reacquire
  rbd: introduce copyup state machine
  rbd: rename rbd_obj_setup_*() to rbd_obj_init_*()
  rbd: move OSD request allocation into object request state machines
  rbd: factor out __rbd_osd_setup_discard_ops()
  rbd: factor out rbd_osd_setup_copyup()
  rbd: introduce obj_req->osd_reqs list
  ...
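The ceph.snap.btime vxattr mentioned above is read with the ordinary xattr
syscalls. A minimal user-space sketch (not part of this merge; the mount
point and snapshot name are assumptions for illustration) that also respects
the new no-NUL-termination convention:

    /* read ceph.snap.btime from an assumed snapshot directory */
    #include <stdio.h>
    #include <string.h>
    #include <sys/xattr.h>

    int main(void)
    {
            const char *path = "/mnt/cephfs/.snap/mysnap";  /* assumed */
            char buf[64];
            char str[65];
            ssize_t len;

            len = getxattr(path, "ceph.snap.btime", buf, sizeof(buf));
            if (len < 0) {
                    perror("getxattr");
                    return 1;
            }
            /* ceph vxattrs are no longer NULL-terminated -- copy and
             * terminate before printing */
            memcpy(str, buf, len);
            str[len] = '\0';
            printf("snapshot btime: %s\n", str);
            return 0;
    }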
-rw-r--r--  MAINTAINERS                            |    4
-rw-r--r--  drivers/block/rbd.c                    | 2188
-rw-r--r--  drivers/block/rbd_types.h              |   10
-rw-r--r--  fs/ceph/Kconfig                        |   12
-rw-r--r--  fs/ceph/acl.c                          |   22
-rw-r--r--  fs/ceph/addr.c                         |    2
-rw-r--r--  fs/ceph/caps.c                         |  120
-rw-r--r--  fs/ceph/debugfs.c                      |    2
-rw-r--r--  fs/ceph/dir.c                          |   73
-rw-r--r--  fs/ceph/export.c                       |    2
-rw-r--r--  fs/ceph/file.c                         |   34
-rw-r--r--  fs/ceph/inode.c                        |  208
-rw-r--r--  fs/ceph/mds_client.c                   |  120
-rw-r--r--  fs/ceph/mds_client.h                   |    4
-rw-r--r--  fs/ceph/mdsmap.c                       |   12
-rw-r--r--  fs/ceph/quota.c                        |   15
-rw-r--r--  fs/ceph/snap.c                         |    3
-rw-r--r--  fs/ceph/super.c                        |   13
-rw-r--r--  fs/ceph/super.h                        |   67
-rw-r--r--  fs/ceph/xattr.c                        |  456
-rw-r--r--  include/linux/ceph/ceph_features.h     |    1
-rw-r--r--  include/linux/ceph/ceph_fs.h           |    2
-rw-r--r--  include/linux/ceph/cls_lock_client.h   |    3
-rw-r--r--  include/linux/ceph/decode.h            |   13
-rw-r--r--  include/linux/ceph/libceph.h           |   10
-rw-r--r--  include/linux/ceph/mon_client.h        |    1
-rw-r--r--  include/linux/ceph/osd_client.h        |   12
-rw-r--r--  include/linux/ceph/striper.h           |    2
-rw-r--r--  include/linux/iversion.h               |   24
-rw-r--r--  net/ceph/Makefile                      |    2
-rw-r--r--  net/ceph/cls_lock_client.c             |   54
-rw-r--r--  net/ceph/decode.c                      |   84
-rw-r--r--  net/ceph/messenger.c                   |   14
-rw-r--r--  net/ceph/mon_client.c                  |   21
-rw-r--r--  net/ceph/osd_client.c                  |   42
-rw-r--r--  net/ceph/osdmap.c                      |   31
-rw-r--r--  net/ceph/striper.c                     |   17
38 files changed, 2606 insertions, 1127 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index d51808468713..f63e9d1468f6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3765,7 +3765,7 @@ F: arch/powerpc/platforms/cell/
 
 CEPH COMMON CODE (LIBCEPH)
 M:	Ilya Dryomov <idryomov@gmail.com>
-M:	"Yan, Zheng" <zyan@redhat.com>
+M:	Jeff Layton <jlayton@kernel.org>
 M:	Sage Weil <sage@redhat.com>
 L:	ceph-devel@vger.kernel.org
 W:	http://ceph.com/
@@ -3777,7 +3777,7 @@ F: include/linux/ceph/
 F:	include/linux/crush/
 
 CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH)
-M:	"Yan, Zheng" <zyan@redhat.com>
+M:	Jeff Layton <jlayton@kernel.org>
 M:	Sage Weil <sage@redhat.com>
 M:	Ilya Dryomov <idryomov@gmail.com>
 L:	ceph-devel@vger.kernel.org
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e5009a34f9c2..3327192bb71f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -115,6 +115,8 @@ static int atomic_dec_return_safe(atomic_t *v)
 #define RBD_FEATURE_LAYERING		(1ULL<<0)
 #define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
 #define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
+#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
+#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
 #define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
 #define RBD_FEATURE_DATA_POOL		(1ULL<<7)
 #define RBD_FEATURE_OPERATIONS		(1ULL<<8)
@@ -122,6 +124,8 @@ static int atomic_dec_return_safe(atomic_t *v)
 #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
 				 RBD_FEATURE_STRIPINGV2 |	\
 				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
+				 RBD_FEATURE_OBJECT_MAP |	\
+				 RBD_FEATURE_FAST_DIFF |	\
 				 RBD_FEATURE_DEEP_FLATTEN |	\
 				 RBD_FEATURE_DATA_POOL |	\
 				 RBD_FEATURE_OPERATIONS)
@@ -203,6 +207,11 @@ struct rbd_client {
 	struct list_head	node;
 };
 
+struct pending_result {
+	int			result;		/* first nonzero result */
+	int			num_pending;
+};
+
 struct rbd_img_request;
 
 enum obj_request_type {
@@ -219,6 +228,18 @@ enum obj_operation_type {
 	OBJ_OP_ZEROOUT,
 };
 
+#define RBD_OBJ_FLAG_DELETION			(1U << 0)
+#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
+#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
+#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
+#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
+
+enum rbd_obj_read_state {
+	RBD_OBJ_READ_START = 1,
+	RBD_OBJ_READ_OBJECT,
+	RBD_OBJ_READ_PARENT,
+};
+
 /*
  * Writes go through the following state machine to deal with
  * layering:
@@ -245,17 +266,28 @@ enum obj_operation_type {
  * even if there is a parent).
  */
 enum rbd_obj_write_state {
-	RBD_OBJ_WRITE_FLAT = 1,
-	RBD_OBJ_WRITE_GUARD,
-	RBD_OBJ_WRITE_READ_FROM_PARENT,
-	RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC,
-	RBD_OBJ_WRITE_COPYUP_OPS,
+	RBD_OBJ_WRITE_START = 1,
+	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
+	RBD_OBJ_WRITE_OBJECT,
+	__RBD_OBJ_WRITE_COPYUP,
+	RBD_OBJ_WRITE_COPYUP,
+	RBD_OBJ_WRITE_POST_OBJECT_MAP,
+};
+
+enum rbd_obj_copyup_state {
+	RBD_OBJ_COPYUP_START = 1,
+	RBD_OBJ_COPYUP_READ_PARENT,
+	__RBD_OBJ_COPYUP_OBJECT_MAPS,
+	RBD_OBJ_COPYUP_OBJECT_MAPS,
+	__RBD_OBJ_COPYUP_WRITE_OBJECT,
+	RBD_OBJ_COPYUP_WRITE_OBJECT,
 };
 
 struct rbd_obj_request {
 	struct ceph_object_extent ex;
+	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
 	union {
-		bool			tried_parent;	/* for reads */
+		enum rbd_obj_read_state	 read_state;	/* for reads */
 		enum rbd_obj_write_state write_state;	/* for writes */
 	};
 
@@ -271,14 +303,15 @@ struct rbd_obj_request {
 			u32	bvec_idx;
 		};
 	};
+
+	enum rbd_obj_copyup_state copyup_state;
 	struct bio_vec		*copyup_bvecs;
 	u32			copyup_bvec_count;
 
-	struct ceph_osd_request	*osd_req;
-
-	u64			xferred;	/* bytes transferred */
-	int			result;
+	struct list_head	osd_reqs;	/* w/ r_private_item */
 
+	struct mutex		state_mutex;
+	struct pending_result	pending;
 	struct kref		kref;
 };
 
@@ -287,11 +320,19 @@ enum img_req_flags {
 	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
 };
 
+enum rbd_img_state {
+	RBD_IMG_START = 1,
+	RBD_IMG_EXCLUSIVE_LOCK,
+	__RBD_IMG_OBJECT_REQUESTS,
+	RBD_IMG_OBJECT_REQUESTS,
+};
+
 struct rbd_img_request {
 	struct rbd_device	*rbd_dev;
 	enum obj_operation_type	op_type;
 	enum obj_request_type	data_type;
 	unsigned long		flags;
+	enum rbd_img_state	state;
 	union {
 		u64			snap_id;		/* for reads */
 		struct ceph_snap_context *snapc;	/* for writes */
@@ -300,13 +341,14 @@ struct rbd_img_request {
 		struct request		*rq;		/* block request */
 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
 	};
-	spinlock_t		completion_lock;
-	u64			xferred;/* aggregate bytes transferred */
-	int			result;	/* first nonzero obj_request result */
 
+	struct list_head	lock_item;
 	struct list_head	object_extents;	/* obj_req.ex structs */
-	u32			pending_count;
 
+	struct mutex		state_mutex;
+	struct pending_result	pending;
+	struct work_struct	work;
+	int			work_result;
 	struct kref		kref;
 };
 
@@ -380,7 +422,17 @@ struct rbd_device {
 	struct work_struct	released_lock_work;
 	struct delayed_work	lock_dwork;
 	struct work_struct	unlock_work;
-	wait_queue_head_t	lock_waitq;
+	spinlock_t		lock_lists_lock;
+	struct list_head	acquiring_list;
+	struct list_head	running_list;
+	struct completion	acquire_wait;
+	int			acquire_err;
+	struct completion	releasing_wait;
+
+	spinlock_t		object_map_lock;
+	u8			*object_map;
+	u64			object_map_size;	/* in objects */
+	u64			object_map_flags;
 
 	struct workqueue_struct	*task_wq;
 
@@ -408,12 +460,10 @@ struct rbd_device {
  * Flag bits for rbd_dev->flags:
  * - REMOVING (which is coupled with rbd_dev->open_count) is protected
  *   by rbd_dev->lock
- * - BLACKLISTED is protected by rbd_dev->lock_rwsem
  */
 enum rbd_dev_flags {
 	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
 	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
-	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
 };
 
 static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
@@ -466,6 +516,8 @@ static int minor_to_rbd_dev_id(int minor)
 
 static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
 {
+	lockdep_assert_held(&rbd_dev->lock_rwsem);
+
 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
 }
@@ -583,6 +635,26 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 				u8 *order, u64 *snap_size);
 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 				u64 *snap_features);
+static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
+
+static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
+static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
+
+/*
+ * Return true if nothing else is pending.
+ */
+static bool pending_result_dec(struct pending_result *pending, int *result)
+{
+	rbd_assert(pending->num_pending > 0);
+
+	if (*result && !pending->result)
+		pending->result = *result;
+	if (--pending->num_pending)
+		return false;
+
+	*result = pending->result;
+	return true;
+}
 
 static int rbd_open(struct block_device *bdev, fmode_t mode)
 {
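pending_result_dec() added above is the aggregation primitive shared by both
new state machines: each completing sub-request decrements num_pending, the
first nonzero result is latched, and only the last caller sees the aggregate.
A minimal single-threaded user-space model of the same logic (in the kernel
it runs under state_mutex; assert() stands in for rbd_assert()):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct pending_result {
            int result;             /* first nonzero result */
            int num_pending;
    };

    static bool pending_result_dec(struct pending_result *pending, int *result)
    {
            assert(pending->num_pending > 0);

            if (*result && !pending->result)
                    pending->result = *result;      /* latch first error */
            if (--pending->num_pending)
                    return false;                   /* others outstanding */

            *result = pending->result;
            return true;    /* last one; *result is now the aggregate */
    }

    int main(void)
    {
            struct pending_result pending = { .num_pending = 3 };
            int results[] = { 0, -5, 0 };   /* second sub-request fails */

            for (int i = 0; i < 3; i++) {
                    int result = results[i];

                    if (pending_result_dec(&pending, &result))
                            printf("all done, aggregate %d\n", result); /* -5 */
            }
            return 0;
    }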
@@ -1317,6 +1389,8 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
 static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
 			       u32 bytes)
 {
+	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
+
 	switch (obj_req->img_request->data_type) {
 	case OBJ_REQUEST_BIO:
 		zero_bios(&obj_req->bio_pos, off, bytes);
@@ -1339,13 +1413,6 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
 }
 
-static void rbd_img_request_get(struct rbd_img_request *img_request)
-{
-	dout("%s: img %p (was %d)\n", __func__, img_request,
-	     kref_read(&img_request->kref));
-	kref_get(&img_request->kref);
-}
-
 static void rbd_img_request_destroy(struct kref *kref);
 static void rbd_img_request_put(struct rbd_img_request *img_request)
 {
@@ -1362,7 +1429,6 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
 
 	/* Image request now owns object's original reference */
 	obj_request->img_request = img_request;
-	img_request->pending_count++;
 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
 }
 
@@ -1375,13 +1441,13 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
 	rbd_obj_request_put(obj_request);
 }
 
-static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
+static void rbd_osd_submit(struct ceph_osd_request *osd_req)
 {
-	struct ceph_osd_request *osd_req = obj_request->osd_req;
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
 
-	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
-	     obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
-	     obj_request->ex.oe_len, osd_req);
+	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
+	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
+	     obj_req->ex.oe_off, obj_req->ex.oe_len);
 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
 }
 
@@ -1457,41 +1523,38 @@ static bool rbd_img_is_write(struct rbd_img_request *img_req)
 	}
 }
 
-static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
-
 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
 	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	int result;
 
 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
 	     osd_req->r_result, obj_req);
-	rbd_assert(osd_req == obj_req->osd_req);
 
-	obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
-	if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
-		obj_req->xferred = osd_req->r_result;
+	/*
+	 * Writes aren't allowed to return a data payload. In some
+	 * guarded write cases (e.g. stat + zero on an empty object)
+	 * a stat response makes it through, but we don't care.
+	 */
+	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
+		result = 0;
 	else
-		/*
-		 * Writes aren't allowed to return a data payload. In some
-		 * guarded write cases (e.g. stat + zero on an empty object)
-		 * a stat response makes it through, but we don't care.
-		 */
-		obj_req->xferred = 0;
+		result = osd_req->r_result;
 
-	rbd_obj_handle_request(obj_req);
+	rbd_obj_handle_request(obj_req, result);
 }
 
-static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
+static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
 {
-	struct ceph_osd_request *osd_req = obj_request->osd_req;
+	struct rbd_obj_request *obj_request = osd_req->r_priv;
 
 	osd_req->r_flags = CEPH_OSD_FLAG_READ;
 	osd_req->r_snapid = obj_request->img_request->snap_id;
 }
 
-static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
+static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
 {
-	struct ceph_osd_request *osd_req = obj_request->osd_req;
+	struct rbd_obj_request *obj_request = osd_req->r_priv;
 
 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
 	ktime_get_real_ts64(&osd_req->r_mtime);
@@ -1499,19 +1562,21 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 }
 
 static struct ceph_osd_request *
-__rbd_osd_req_create(struct rbd_obj_request *obj_req,
-		     struct ceph_snap_context *snapc, unsigned int num_ops)
+__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
+			  struct ceph_snap_context *snapc, int num_ops)
 {
 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	struct ceph_osd_request *req;
 	const char *name_format = rbd_dev->image_format == 1 ?
 				  RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
+	int ret;
 
 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
 	if (!req)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
+	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
 	req->r_callback = rbd_osd_req_callback;
 	req->r_priv = obj_req;
 
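__rbd_obj_add_osd_request() now reports failure via ERR_PTR() instead of
NULL, which is how rbd_obj_read_object() later in this diff consumes it.
A user-space model of the kernel's ERR_PTR/IS_ERR/PTR_ERR encoding (the
real definitions live in include/linux/err.h; this sketch only illustrates
the convention of stuffing a negative errno into a pointer):

    #include <stdio.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            /* errno values occupy the last page of the address space */
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static void *alloc_thing(int fail)
    {
            static int thing;

            if (fail)
                    return ERR_PTR(-12);    /* ENOMEM is 12 on Linux */
            return &thing;
    }

    int main(void)
    {
            void *p = alloc_thing(1);

            if (IS_ERR(p))
                    printf("allocation failed: %ld\n", PTR_ERR(p)); /* -12 */
            return 0;
    }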
@@ -1522,27 +1587,20 @@ __rbd_osd_req_create(struct rbd_obj_request *obj_req,
 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
 
-	if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
-			rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
-		goto err_req;
+	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
+			       rbd_dev->header.object_prefix,
+			       obj_req->ex.oe_objno);
+	if (ret)
+		return ERR_PTR(ret);
 
 	return req;
-
-err_req:
-	ceph_osdc_put_request(req);
-	return NULL;
 }
 
 static struct ceph_osd_request *
-rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
+rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
 {
-	return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc,
-			num_ops);
-}
-
-static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
-{
-	ceph_osdc_put_request(osd_req);
+	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
+					 num_ops);
 }
 
 static struct rbd_obj_request *rbd_obj_request_create(void)
@@ -1554,6 +1612,8 @@ static struct rbd_obj_request *rbd_obj_request_create(void)
 		return NULL;
 
 	ceph_object_extent_init(&obj_request->ex);
+	INIT_LIST_HEAD(&obj_request->osd_reqs);
+	mutex_init(&obj_request->state_mutex);
 	kref_init(&obj_request->kref);
 
 	dout("%s %p\n", __func__, obj_request);
@@ -1563,14 +1623,19 @@ static struct rbd_obj_request *rbd_obj_request_create(void)
 static void rbd_obj_request_destroy(struct kref *kref)
 {
 	struct rbd_obj_request *obj_request;
+	struct ceph_osd_request *osd_req;
 	u32 i;
 
 	obj_request = container_of(kref, struct rbd_obj_request, kref);
 
 	dout("%s: obj %p\n", __func__, obj_request);
 
-	if (obj_request->osd_req)
-		rbd_osd_req_destroy(obj_request->osd_req);
+	while (!list_empty(&obj_request->osd_reqs)) {
+		osd_req = list_first_entry(&obj_request->osd_reqs,
+		    struct ceph_osd_request, r_private_item);
+		list_del_init(&osd_req->r_private_item);
+		ceph_osdc_put_request(osd_req);
+	}
 
 	switch (obj_request->img_request->data_type) {
 	case OBJ_REQUEST_NODATA:
@@ -1684,8 +1749,9 @@ static struct rbd_img_request *rbd_img_request_create(
 	if (rbd_dev_parent_get(rbd_dev))
 		img_request_layered_set(img_request);
 
-	spin_lock_init(&img_request->completion_lock);
+	INIT_LIST_HEAD(&img_request->lock_item);
 	INIT_LIST_HEAD(&img_request->object_extents);
+	mutex_init(&img_request->state_mutex);
 	kref_init(&img_request->kref);
 
 	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
@@ -1703,6 +1769,7 @@ static void rbd_img_request_destroy(struct kref *kref)
 
 	dout("%s: img %p\n", __func__, img_request);
 
+	WARN_ON(!list_empty(&img_request->lock_item));
 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
 		rbd_img_obj_request_del(img_request, obj_request);
 
@@ -1717,6 +1784,466 @@ static void rbd_img_request_destroy(struct kref *kref)
 	kmem_cache_free(rbd_img_request_cache, img_request);
 }
 
+#define BITS_PER_OBJ	2
+#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
+#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
+
+static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
+				   u64 *index, u8 *shift)
+{
+	u32 off;
+
+	rbd_assert(objno < rbd_dev->object_map_size);
+	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
+	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
+}
+
+static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+	u64 index;
+	u8 shift;
+
+	lockdep_assert_held(&rbd_dev->object_map_lock);
+	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
+	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
+}
+
+static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
+{
+	u64 index;
+	u8 shift;
+	u8 *p;
+
+	lockdep_assert_held(&rbd_dev->object_map_lock);
+	rbd_assert(!(val & ~OBJ_MASK));
+
+	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
+	p = &rbd_dev->object_map[index];
+	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
+}
+
+static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+	u8 state;
+
+	spin_lock(&rbd_dev->object_map_lock);
+	state = __rbd_object_map_get(rbd_dev, objno);
+	spin_unlock(&rbd_dev->object_map_lock);
+	return state;
+}
+
+static bool use_object_map(struct rbd_device *rbd_dev)
+{
+	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
+		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
+}
+
+static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
+{
+	u8 state;
+
+	/* fall back to default logic if object map is disabled or invalid */
+	if (!use_object_map(rbd_dev))
+		return true;
+
+	state = rbd_object_map_get(rbd_dev, objno);
+	return state != OBJECT_NONEXISTENT;
+}
+
+static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
+				struct ceph_object_id *oid)
+{
+	if (snap_id == CEPH_NOSNAP)
+		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
+				rbd_dev->spec->image_id);
+	else
+		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
+				rbd_dev->spec->image_id, snap_id);
+}
+
+static int rbd_object_map_lock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	u8 lock_type;
+	char *lock_tag;
+	struct ceph_locker *lockers;
+	u32 num_lockers;
+	bool broke_lock = false;
+	int ret;
+
+	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+again:
+	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
+	if (ret != -EBUSY || broke_lock) {
+		if (ret == -EEXIST)
+			ret = 0; /* already locked by myself */
+		if (ret)
+			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
+		return ret;
+	}
+
+	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
+				 RBD_LOCK_NAME, &lock_type, &lock_tag,
+				 &lockers, &num_lockers);
+	if (ret) {
+		if (ret == -ENOENT)
+			goto again;
+
+		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
+		return ret;
+	}
+
+	kfree(lock_tag);
+	if (num_lockers == 0)
+		goto again;
+
+	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
+		 ENTITY_NAME(lockers[0].id.name));
+
+	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
+				  RBD_LOCK_NAME, lockers[0].id.cookie,
+				  &lockers[0].id.name);
+	ceph_free_lockers(lockers, num_lockers);
+	if (ret) {
+		if (ret == -ENOENT)
+			goto again;
+
+		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
+		return ret;
+	}
+
+	broke_lock = true;
+	goto again;
+}
+
+static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	int ret;
+
+	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+			      "");
+	if (ret && ret != -ENOENT)
+		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
+}
+
+static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
+{
+	u8 struct_v;
+	u32 struct_len;
+	u32 header_len;
+	void *header_end;
+	int ret;
+
+	ceph_decode_32_safe(p, end, header_len, e_inval);
+	header_end = *p + header_len;
+
+	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
+				  &struct_len);
+	if (ret)
+		return ret;
+
+	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
+
+	*p = header_end;
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+static int __rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	struct page **pages;
+	void *p, *end;
+	size_t reply_len;
+	u64 num_objects;
+	u64 object_map_bytes;
+	u64 object_map_size;
+	int num_pages;
+	int ret;
+
+	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
+
+	num_objects = ceph_get_num_objects(&rbd_dev->layout,
+					   rbd_dev->mapping.size);
+	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
+					    BITS_PER_BYTE);
+	num_pages = calc_pages_for(0, object_map_bytes) + 1;
+	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	reply_len = num_pages * PAGE_SIZE;
+	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
+	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
+			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
+			     NULL, 0, pages, &reply_len);
+	if (ret)
+		goto out;
+
+	p = page_address(pages[0]);
+	end = p + min(reply_len, (size_t)PAGE_SIZE);
+	ret = decode_object_map_header(&p, end, &object_map_size);
+	if (ret)
+		goto out;
+
+	if (object_map_size != num_objects) {
+		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
+			 object_map_size, num_objects);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (offset_in_page(p) + object_map_bytes > reply_len) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
+	if (!rbd_dev->object_map) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rbd_dev->object_map_size = object_map_size;
+	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
+				   offset_in_page(p), object_map_bytes);
+
+out:
+	ceph_release_page_vector(pages, num_pages);
+	return ret;
+}
+
+static void rbd_object_map_free(struct rbd_device *rbd_dev)
+{
+	kvfree(rbd_dev->object_map);
+	rbd_dev->object_map = NULL;
+	rbd_dev->object_map_size = 0;
+}
+
+static int rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	ret = __rbd_object_map_load(rbd_dev);
+	if (ret)
+		return ret;
+
+	ret = rbd_dev_v2_get_flags(rbd_dev);
+	if (ret) {
+		rbd_object_map_free(rbd_dev);
+		return ret;
+	}
+
+	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
+		rbd_warn(rbd_dev, "object map is invalid");
+
+	return 0;
+}
+
+static int rbd_object_map_open(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	ret = rbd_object_map_lock(rbd_dev);
+	if (ret)
+		return ret;
+
+	ret = rbd_object_map_load(rbd_dev);
+	if (ret) {
+		rbd_object_map_unlock(rbd_dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void rbd_object_map_close(struct rbd_device *rbd_dev)
+{
+	rbd_object_map_free(rbd_dev);
+	rbd_object_map_unlock(rbd_dev);
+}
+
+/*
+ * This function needs snap_id (or more precisely just something to
+ * distinguish between HEAD and snapshot object maps), new_state and
+ * current_state that were passed to rbd_object_map_update().
+ *
+ * To avoid allocating and stashing a context we piggyback on the OSD
+ * request.  A HEAD update has two ops (assert_locked).  For new_state
+ * and current_state we decode our own object_map_update op, encoded in
+ * rbd_cls_object_map_update().
+ */
+static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
+					struct ceph_osd_request *osd_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_osd_data *osd_data;
+	u64 objno;
+	u8 state, new_state, current_state;
+	bool has_current_state;
+	void *p;
+
+	if (osd_req->r_result)
+		return osd_req->r_result;
+
+	/*
+	 * Nothing to do for a snapshot object map.
+	 */
+	if (osd_req->r_num_ops == 1)
+		return 0;
+
+	/*
+	 * Update in-memory HEAD object map.
+	 */
+	rbd_assert(osd_req->r_num_ops == 2);
+	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
+	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
+
+	p = page_address(osd_data->pages[0]);
+	objno = ceph_decode_64(&p);
+	rbd_assert(objno == obj_req->ex.oe_objno);
+	rbd_assert(ceph_decode_64(&p) == objno + 1);
+	new_state = ceph_decode_8(&p);
+	has_current_state = ceph_decode_8(&p);
+	if (has_current_state)
+		current_state = ceph_decode_8(&p);
+
+	spin_lock(&rbd_dev->object_map_lock);
+	state = __rbd_object_map_get(rbd_dev, objno);
+	if (!has_current_state || current_state == state ||
+	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
+		__rbd_object_map_set(rbd_dev, objno, new_state);
+	spin_unlock(&rbd_dev->object_map_lock);
+
+	return 0;
+}
+
+static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	int result;
+
+	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
+	     osd_req->r_result, obj_req);
+
+	result = rbd_object_map_update_finish(obj_req, osd_req);
+	rbd_obj_handle_request(obj_req, result);
+}
+
+static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
+{
+	u8 state = rbd_object_map_get(rbd_dev, objno);
+
+	if (state == new_state ||
+	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
+	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
+		return false;
+
+	return true;
+}
+
+static int rbd_cls_object_map_update(struct ceph_osd_request *req,
+				     int which, u64 objno, u8 new_state,
+				     const u8 *current_state)
+{
+	struct page **pages;
+	void *p, *start;
+	int ret;
+
+	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
+	if (ret)
+		return ret;
+
+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	p = start = page_address(pages[0]);
+	ceph_encode_64(&p, objno);
+	ceph_encode_64(&p, objno + 1);
+	ceph_encode_8(&p, new_state);
+	if (current_state) {
+		ceph_encode_8(&p, 1);
+		ceph_encode_8(&p, *current_state);
+	} else {
+		ceph_encode_8(&p, 0);
+	}
+
+	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
+					  false, true);
+	return 0;
+}
+
+/*
+ * Return:
+ *   0 - object map update sent
+ *   1 - object map update isn't needed
+ *  <0 - error
+ */
+static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
+				 u8 new_state, const u8 *current_state)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_osd_request *req;
+	int num_ops = 1;
+	int which = 0;
+	int ret;
+
+	if (snap_id == CEPH_NOSNAP) {
+		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
+			return 1;
+
+		num_ops++; /* assert_locked */
+	}
+
+	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
+
+	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
+	req->r_callback = rbd_object_map_callback;
+	req->r_priv = obj_req;
+
+	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
+	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
+	req->r_flags = CEPH_OSD_FLAG_WRITE;
+	ktime_get_real_ts64(&req->r_mtime);
+
+	if (snap_id == CEPH_NOSNAP) {
+		/*
+		 * Protect against possible race conditions during lock
+		 * ownership transitions.
+		 */
+		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
+					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
+		if (ret)
+			return ret;
+	}
+
+	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
+					new_state, current_state);
+	if (ret)
+		return ret;
+
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	if (ret)
+		return ret;
+
+	ceph_osdc_start_request(osdc, req, false);
+	return 0;
+}
+
 static void prune_extents(struct ceph_file_extent *img_extents,
 			  u32 *num_img_extents, u64 overlap)
 {
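The object map added above packs one 2-bit state per object, four states per
byte, most significant bit-pair first. A stand-alone sketch of the same
index/shift arithmetic (plain division replaces the kernel's div_u64_rem(),
and the state value 2 is arbitrary for the demo):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BITS_PER_BYTE   8
    #define BITS_PER_OBJ    2
    #define OBJS_PER_BYTE   (BITS_PER_BYTE / BITS_PER_OBJ)  /* 4 */
    #define OBJ_MASK        ((1 << BITS_PER_OBJ) - 1)       /* 0x3 */

    /* same arithmetic as __rbd_object_map_index() above */
    static void object_map_index(uint64_t objno, uint64_t *index,
                                 uint8_t *shift)
    {
            uint32_t off = objno % OBJS_PER_BYTE;

            *index = objno / OBJS_PER_BYTE;
            *shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
    }

    int main(void)
    {
            uint8_t map[2] = { 0 };         /* room for 8 object states */
            uint64_t index;
            uint8_t shift;

            /* set object 5 to state 2: byte 1, bit-pair shift 4 */
            object_map_index(5, &index, &shift);
            map[index] = (map[index] & ~(OBJ_MASK << shift)) | (2 << shift);

            /* read it back, as __rbd_object_map_get() would */
            object_map_index(5, &index, &shift);
            assert(((map[index] >> shift) & OBJ_MASK) == 2);
            printf("object 5: byte %llu, shift %d, state %d\n",
                   (unsigned long long)index, shift,
                   (map[index] >> shift) & OBJ_MASK);
            return 0;
    }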
@@ -1764,11 +2291,13 @@ static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
 	return 0;
 }
 
-static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
+static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
 {
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+
 	switch (obj_req->img_request->data_type) {
 	case OBJ_REQUEST_BIO:
-		osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
+		osd_req_op_extent_osd_data_bio(osd_req, which,
 					       &obj_req->bio_pos,
 					       obj_req->ex.oe_len);
 		break;
@@ -1777,7 +2306,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
 		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
 			   obj_req->ex.oe_len);
 		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
-		osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
+		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
 						    &obj_req->bvec_pos);
 		break;
 	default:
@@ -1785,22 +2314,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
 	}
 }
 
-static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
-{
-	obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1);
-	if (!obj_req->osd_req)
-		return -ENOMEM;
-
-	osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
-			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
-	rbd_osd_req_setup_data(obj_req, 0);
-
-	rbd_osd_req_format_read(obj_req);
-	return 0;
-}
-
-static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
-				unsigned int which)
+static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
 {
 	struct page **pages;
 
@@ -1816,45 +2330,60 @@ static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
-	osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
-	osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
+	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
+	osd_req_op_raw_data_in_pages(osd_req, which, pages,
 				     8 + sizeof(struct ceph_timespec),
 				     0, false, true);
 	return 0;
 }
 
-static int count_write_ops(struct rbd_obj_request *obj_req)
+static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
+				u32 bytes)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	int ret;
+
+	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
+	if (ret)
+		return ret;
+
+	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
+					  obj_req->copyup_bvec_count, bytes);
+	return 0;
+}
+
+static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
 {
-	return 2; /* setallochint + write/writefull */
+	obj_req->read_state = RBD_OBJ_READ_START;
+	return 0;
 }
 
-static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
-				  unsigned int which)
+static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
+				      int which)
 {
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
 	u16 opcode;
 
-	osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
-				   rbd_dev->layout.object_size,
-				   rbd_dev->layout.object_size);
+	if (!use_object_map(rbd_dev) ||
+	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
+		osd_req_op_alloc_hint_init(osd_req, which++,
+					   rbd_dev->layout.object_size,
+					   rbd_dev->layout.object_size);
+	}
 
 	if (rbd_obj_is_entire(obj_req))
 		opcode = CEPH_OSD_OP_WRITEFULL;
 	else
 		opcode = CEPH_OSD_OP_WRITE;
 
-	osd_req_op_extent_init(obj_req->osd_req, which, opcode,
+	osd_req_op_extent_init(osd_req, which, opcode,
 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
-	rbd_osd_req_setup_data(obj_req, which++);
-
-	rbd_assert(which == obj_req->osd_req->r_num_ops);
-	rbd_osd_req_format_write(obj_req);
+	rbd_osd_setup_data(osd_req, which);
 }
 
-static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
+static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
 {
-	unsigned int num_osd_ops, which = 0;
-	bool need_guard;
 	int ret;
 
 	/* reverse map the entire object onto the parent */
@@ -1862,24 +2391,10 @@ static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
 	if (ret)
 		return ret;
 
-	need_guard = rbd_obj_copyup_enabled(obj_req);
-	num_osd_ops = need_guard + count_write_ops(obj_req);
-
-	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
-	if (!obj_req->osd_req)
-		return -ENOMEM;
-
-	if (need_guard) {
-		ret = __rbd_obj_setup_stat(obj_req, which++);
-		if (ret)
-			return ret;
+	if (rbd_obj_copyup_enabled(obj_req))
+		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
 
-		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
-	} else {
-		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
-	}
-
-	__rbd_obj_setup_write(obj_req, which);
+	obj_req->write_state = RBD_OBJ_WRITE_START;
 	return 0;
 }
 
@@ -1889,11 +2404,26 @@ static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
 					  CEPH_OSD_OP_ZERO;
 }
 
-static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
+static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
+					int which)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+
+	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
+		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
+		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
+	} else {
+		osd_req_op_extent_init(osd_req, which,
+				       truncate_or_zero_opcode(obj_req),
+				       obj_req->ex.oe_off, obj_req->ex.oe_len,
+				       0, 0);
+	}
+}
+
+static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
 {
 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
-	u64 off = obj_req->ex.oe_off;
-	u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len;
+	u64 off, next_off;
 	int ret;
 
 	/*
@@ -1906,10 +2436,17 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
 	 */
 	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
 	    !rbd_obj_is_tail(obj_req)) {
-		off = round_up(off, rbd_dev->opts->alloc_size);
-		next_off = round_down(next_off, rbd_dev->opts->alloc_size);
+		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
+		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
+				      rbd_dev->opts->alloc_size);
 		if (off >= next_off)
 			return 1;
+
+		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
+		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
+		     off, next_off - off);
+		obj_req->ex.oe_off = off;
+		obj_req->ex.oe_len = next_off - off;
 	}
 
 	/* reverse map the entire object onto the parent */
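rbd_obj_init_discard() above trims a discard down to whole alloc_size units
and returns 1 (request dropped as a no-op) when nothing aligned remains. A
quick user-space check of the rounding arithmetic (the 64K alloc_size and
the sample extent are assumed values; round_up/round_down are the usual
power-of-two macros):

    #include <stdint.h>
    #include <stdio.h>

    /* power-of-two alignment, as with rbd alloc_size */
    #define round_down(x, y)  ((x) & ~((uint64_t)(y) - 1))
    #define round_up(x, y)    round_down((x) + (y) - 1, (y))

    int main(void)
    {
            uint64_t alloc_size = 65536;            /* assumed 64K */
            uint64_t off = 100000, len = 200000;    /* sample extent */
            uint64_t new_off = round_up(off, alloc_size);
            uint64_t new_next = round_down(off + len, alloc_size);

            if (new_off >= new_next) {
                    printf("extent too small, discard skipped\n");
                    return 0;
            }
            /* prints: 100000~200000 -> 131072~131072 */
            printf("%llu~%llu -> %llu~%llu\n",
                   (unsigned long long)off, (unsigned long long)len,
                   (unsigned long long)new_off,
                   (unsigned long long)(new_next - new_off));
            return 0;
    }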
@@ -1917,52 +2454,29 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
 	if (ret)
 		return ret;
 
-	obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
-	if (!obj_req->osd_req)
-		return -ENOMEM;
-
-	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
-		osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0);
-	} else {
-		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
-		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
-		     off, next_off - off);
-		osd_req_op_extent_init(obj_req->osd_req, 0,
-				       truncate_or_zero_opcode(obj_req),
-				       off, next_off - off, 0, 0);
-	}
+	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
+	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
+		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
 
-	obj_req->write_state = RBD_OBJ_WRITE_FLAT;
-	rbd_osd_req_format_write(obj_req);
+	obj_req->write_state = RBD_OBJ_WRITE_START;
 	return 0;
 }
 
-static int count_zeroout_ops(struct rbd_obj_request *obj_req)
-{
-	int num_osd_ops;
-
-	if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
-	    !rbd_obj_copyup_enabled(obj_req))
-		num_osd_ops = 2; /* create + truncate */
-	else
-		num_osd_ops = 1; /* delete/truncate/zero */
-
-	return num_osd_ops;
-}
-
-static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req,
-				    unsigned int which)
+static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
+					int which)
 {
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
 	u16 opcode;
 
 	if (rbd_obj_is_entire(obj_req)) {
 		if (obj_req->num_img_extents) {
-			if (!rbd_obj_copyup_enabled(obj_req))
-				osd_req_op_init(obj_req->osd_req, which++,
+			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
+				osd_req_op_init(osd_req, which++,
 						CEPH_OSD_OP_CREATE, 0);
 			opcode = CEPH_OSD_OP_TRUNCATE;
 		} else {
-			osd_req_op_init(obj_req->osd_req, which++,
+			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
+			osd_req_op_init(osd_req, which++,
 					CEPH_OSD_OP_DELETE, 0);
 			opcode = 0;
 		}
@@ -1971,18 +2485,13 @@ static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req,
 	}
 
 	if (opcode)
-		osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
+		osd_req_op_extent_init(osd_req, which, opcode,
 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
 				       0, 0);
-
-	rbd_assert(which == obj_req->osd_req->r_num_ops);
-	rbd_osd_req_format_write(obj_req);
 }
 
-static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
+static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
 {
-	unsigned int num_osd_ops, which = 0;
-	bool need_guard;
 	int ret;
 
 	/* reverse map the entire object onto the parent */
@@ -1990,31 +2499,66 @@ static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
 	if (ret)
 		return ret;
 
-	need_guard = rbd_obj_copyup_enabled(obj_req);
-	num_osd_ops = need_guard + count_zeroout_ops(obj_req);
-
-	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
-	if (!obj_req->osd_req)
-		return -ENOMEM;
+	if (rbd_obj_copyup_enabled(obj_req))
+		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
+	if (!obj_req->num_img_extents) {
+		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
+		if (rbd_obj_is_entire(obj_req))
+			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
+	}
 
-	if (need_guard) {
-		ret = __rbd_obj_setup_stat(obj_req, which++);
-		if (ret)
-			return ret;
+	obj_req->write_state = RBD_OBJ_WRITE_START;
+	return 0;
+}
 
-		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
-	} else {
-		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
+static int count_write_ops(struct rbd_obj_request *obj_req)
+{
+	struct rbd_img_request *img_req = obj_req->img_request;
+
+	switch (img_req->op_type) {
+	case OBJ_OP_WRITE:
+		if (!use_object_map(img_req->rbd_dev) ||
+		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
+			return 2; /* setallochint + write/writefull */
+
+		return 1; /* write/writefull */
+	case OBJ_OP_DISCARD:
+		return 1; /* delete/truncate/zero */
+	case OBJ_OP_ZEROOUT:
+		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
+		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
+			return 2; /* create + truncate */
+
+		return 1; /* delete/truncate/zero */
+	default:
+		BUG();
 	}
+}
 
-	__rbd_obj_setup_zeroout(obj_req, which);
-	return 0;
+static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
+				    int which)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+
+	switch (obj_req->img_request->op_type) {
+	case OBJ_OP_WRITE:
+		__rbd_osd_setup_write_ops(osd_req, which);
+		break;
+	case OBJ_OP_DISCARD:
+		__rbd_osd_setup_discard_ops(osd_req, which);
+		break;
+	case OBJ_OP_ZEROOUT:
+		__rbd_osd_setup_zeroout_ops(osd_req, which);
+		break;
+	default:
+		BUG();
+	}
 }
 
 /*
- * For each object request in @img_req, allocate an OSD request, add
- * individual OSD ops and prepare them for submission. The number of
- * OSD ops depends on op_type and the overlap point (if any).
+ * Prune the list of object requests (adjust offset and/or length, drop
+ * redundant requests).  Prepare object request state machines and image
+ * request state machine for execution.
  */
 static int __rbd_img_fill_request(struct rbd_img_request *img_req)
 {
@@ -2024,16 +2568,16 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req)
 	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
 		switch (img_req->op_type) {
 		case OBJ_OP_READ:
-			ret = rbd_obj_setup_read(obj_req);
+			ret = rbd_obj_init_read(obj_req);
 			break;
 		case OBJ_OP_WRITE:
-			ret = rbd_obj_setup_write(obj_req);
+			ret = rbd_obj_init_write(obj_req);
 			break;
 		case OBJ_OP_DISCARD:
-			ret = rbd_obj_setup_discard(obj_req);
+			ret = rbd_obj_init_discard(obj_req);
 			break;
 		case OBJ_OP_ZEROOUT:
-			ret = rbd_obj_setup_zeroout(obj_req);
+			ret = rbd_obj_init_zeroout(obj_req);
 			break;
 		default:
 			BUG();
@@ -2041,17 +2585,12 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req)
 		if (ret < 0)
 			return ret;
 		if (ret > 0) {
-			img_req->xferred += obj_req->ex.oe_len;
-			img_req->pending_count--;
 			rbd_img_obj_request_del(img_req, obj_req);
 			continue;
 		}
-
-		ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
-		if (ret)
-			return ret;
 	}
 
+	img_req->state = RBD_IMG_START;
 	return 0;
 }
 
@@ -2340,17 +2879,55 @@ static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
 					&it);
 }
 
-static void rbd_img_request_submit(struct rbd_img_request *img_request)
+static void rbd_img_handle_request_work(struct work_struct *work)
 {
-	struct rbd_obj_request *obj_request;
+	struct rbd_img_request *img_req =
+	    container_of(work, struct rbd_img_request, work);
 
-	dout("%s: img %p\n", __func__, img_request);
+	rbd_img_handle_request(img_req, img_req->work_result);
+}
 
-	rbd_img_request_get(img_request);
-	for_each_obj_request(img_request, obj_request)
-		rbd_obj_request_submit(obj_request);
+static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
+{
+	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
+	img_req->work_result = result;
+	queue_work(rbd_wq, &img_req->work);
+}
 
-	rbd_img_request_put(img_request);
+static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+
+	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
+		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
+		return true;
+	}
+
+	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
+	     obj_req->ex.oe_objno);
+	return false;
+}
+
+static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
+{
+	struct ceph_osd_request *osd_req;
+	int ret;
+
+	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
+	if (IS_ERR(osd_req))
+		return PTR_ERR(osd_req);
+
+	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
+			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
+	rbd_osd_setup_data(osd_req, 0);
+	rbd_osd_format_read(osd_req);
+
+	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
+	if (ret)
+		return ret;
+
+	rbd_osd_submit(osd_req);
+	return 0;
 }
 
 static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
@@ -2396,51 +2973,144 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2396 return ret; 2973 return ret;
2397 } 2974 }
2398 2975
2399 rbd_img_request_submit(child_img_req); 2976 /* avoid parent chain recursion */
2977 rbd_img_schedule(child_img_req, 0);
2400 return 0; 2978 return 0;
2401} 2979}
2402 2980
2403static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req) 2981static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
2404{ 2982{
2405 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 2983 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2406 int ret; 2984 int ret;
2407 2985
2408 if (obj_req->result == -ENOENT && 2986again:
2409 rbd_dev->parent_overlap && !obj_req->tried_parent) { 2987 switch (obj_req->read_state) {
2410 /* reverse map this object extent onto the parent */ 2988 case RBD_OBJ_READ_START:
2411 ret = rbd_obj_calc_img_extents(obj_req, false); 2989 rbd_assert(!*result);
2990
2991 if (!rbd_obj_may_exist(obj_req)) {
2992 *result = -ENOENT;
2993 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2994 goto again;
2995 }
2996
2997 ret = rbd_obj_read_object(obj_req);
2412 if (ret) { 2998 if (ret) {
2413 obj_req->result = ret; 2999 *result = ret;
2414 return true; 3000 return true;
2415 } 3001 }
2416 3002 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2417 if (obj_req->num_img_extents) { 3003 return false;
2418 obj_req->tried_parent = true; 3004 case RBD_OBJ_READ_OBJECT:
2419 ret = rbd_obj_read_from_parent(obj_req); 3005 if (*result == -ENOENT && rbd_dev->parent_overlap) {
3006 /* reverse map this object extent onto the parent */
3007 ret = rbd_obj_calc_img_extents(obj_req, false);
2420 if (ret) { 3008 if (ret) {
2421 obj_req->result = ret; 3009 *result = ret;
2422 return true; 3010 return true;
2423 } 3011 }
2424 return false; 3012 if (obj_req->num_img_extents) {
3013 ret = rbd_obj_read_from_parent(obj_req);
3014 if (ret) {
3015 *result = ret;
3016 return true;
3017 }
3018 obj_req->read_state = RBD_OBJ_READ_PARENT;
3019 return false;
3020 }
3021 }
3022
3023 /*
3024 * -ENOENT means a hole in the image -- zero-fill the entire
3025 * length of the request. A short read also implies zero-fill
3026 * to the end of the request.
3027 */
3028 if (*result == -ENOENT) {
3029 rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
3030 *result = 0;
3031 } else if (*result >= 0) {
3032 if (*result < obj_req->ex.oe_len)
3033 rbd_obj_zero_range(obj_req, *result,
3034 obj_req->ex.oe_len - *result);
3035 else
3036 rbd_assert(*result == obj_req->ex.oe_len);
3037 *result = 0;
2425 } 3038 }
3039 return true;
3040 case RBD_OBJ_READ_PARENT:
3041 return true;
3042 default:
3043 BUG();
2426 } 3044 }
3045}
2427 3046
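All of the advance_*() helpers added by this series share the shape seen above: *result carries the outcome of the event that woke the state machine, the again: label lets purely synchronous transitions run without going back to sleep, false means "waiting for I/O", and true means the request reached a terminal state. A condensed standalone model of the pattern (states and values invented for illustration):

#include <stdio.h>

enum state { ST_START, ST_OBJECT, ST_DONE };

/* Condensed model of the advance pattern: consume one event via
 * *result, maybe chain synchronous transitions through again:,
 * return 1 only on reaching a terminal state. */
static int advance(enum state *st, int *result)
{
again:
	switch (*st) {
	case ST_START:
		if (*result == -2) {	/* object known to be absent */
			*st = ST_OBJECT;
			goto again;	/* synchronous transition */
		}
		*st = ST_OBJECT;
		return 0;		/* submitted, wait for the reply */
	case ST_OBJECT:
		if (*result == -2)
			*result = 0;	/* a hole reads as zeroes */
		*st = ST_DONE;
		return 1;
	default:
		return 1;
	}
}

int main(void)
{
	enum state st = ST_START;
	int result = 0;		/* first event: kick-off, no error yet */

	while (!advance(&st, &result))
		result = 0;	/* next event: the read came back clean */
	printf("done, result %d\n", result);
	return 0;
}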
2428 /* 3047static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
2429 * -ENOENT means a hole in the image -- zero-fill the entire 3048{
2430 * length of the request. A short read also implies zero-fill 3049 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2431 * to the end of the request. In both cases we update xferred 3050
2432 * count to indicate the whole request was satisfied. 3051 if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
2433 */ 3052 obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2434 if (obj_req->result == -ENOENT || 3053
2435 (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) { 3054 if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
2436 rbd_assert(!obj_req->xferred || !obj_req->result); 3055 (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
2437 rbd_obj_zero_range(obj_req, obj_req->xferred, 3056 dout("%s %p noop for nonexistent\n", __func__, obj_req);
2438 obj_req->ex.oe_len - obj_req->xferred); 3057 return true;
2439 obj_req->result = 0;
2440 obj_req->xferred = obj_req->ex.oe_len;
2441 } 3058 }
2442 3059
2443 return true; 3060 return false;
3061}
3062
3063/*
3064 * Return:
3065 * 0 - object map update sent
3066 * 1 - object map update isn't needed
3067 * <0 - error
3068 */
3069static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
3070{
3071 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3072 u8 new_state;
3073
3074 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3075 return 1;
3076
3077 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3078 new_state = OBJECT_PENDING;
3079 else
3080 new_state = OBJECT_EXISTS;
3081
3082 return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
3083}
3084
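The pre-update above marks the object EXISTS (or PENDING for a delete) before the OSD op goes out; rbd_obj_write_post_object_map() further down completes the pair by moving PENDING to NONEXISTENT once the delete succeeds, so a crash in between leaves the map conservatively marked. A small sketch of that pairing (helper names here are made up):

#include <stdio.h>

enum { NONEXISTENT, EXISTS, PENDING };

/* Made-up helpers sketching the pre/post object-map bracket around a
 * write: deletes go PENDING -> (OSD op) -> NONEXISTENT, everything
 * else only needs the pre-update to EXISTS. */
static int pre_state(int is_delete)
{
	return is_delete ? PENDING : EXISTS;
}

static int post_update_needed(int is_delete)
{
	return is_delete;	/* only deletes have a post step */
}

int main(void)
{
	for (int is_delete = 0; is_delete <= 1; is_delete++)
		printf("%s: pre=%d post_needed=%d\n",
		       is_delete ? "delete" : "write",
		       pre_state(is_delete), post_update_needed(is_delete));
	return 0;
}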
3085static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
3086{
3087 struct ceph_osd_request *osd_req;
3088 int num_ops = count_write_ops(obj_req);
3089 int which = 0;
3090 int ret;
3091
3092 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
3093 num_ops++; /* stat */
3094
3095 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3096 if (IS_ERR(osd_req))
3097 return PTR_ERR(osd_req);
3098
3099 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3100 ret = rbd_osd_setup_stat(osd_req, which++);
3101 if (ret)
3102 return ret;
3103 }
3104
3105 rbd_osd_setup_write_ops(osd_req, which);
3106 rbd_osd_format_write(osd_req);
3107
3108 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3109 if (ret)
3110 return ret;
3111
3112 rbd_osd_submit(osd_req);
3113 return 0;
2444} 3114}
2445 3115
2446/* 3116/*
@@ -2463,123 +3133,67 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2463 3133
2464#define MODS_ONLY U32_MAX 3134#define MODS_ONLY U32_MAX
2465 3135
2466static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req, 3136static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
2467 u32 bytes) 3137 u32 bytes)
2468{ 3138{
3139 struct ceph_osd_request *osd_req;
2469 int ret; 3140 int ret;
2470 3141
2471 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 3142 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2472 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2473 rbd_assert(bytes > 0 && bytes != MODS_ONLY); 3143 rbd_assert(bytes > 0 && bytes != MODS_ONLY);
2474 rbd_osd_req_destroy(obj_req->osd_req);
2475 3144
2476 obj_req->osd_req = __rbd_osd_req_create(obj_req, &rbd_empty_snapc, 1); 3145 osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
2477 if (!obj_req->osd_req) 3146 if (IS_ERR(osd_req))
2478 return -ENOMEM; 3147 return PTR_ERR(osd_req);
2479 3148
2480 ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup"); 3149 ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
2481 if (ret) 3150 if (ret)
2482 return ret; 3151 return ret;
2483 3152
2484 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, 3153 rbd_osd_format_write(osd_req);
2485 obj_req->copyup_bvecs,
2486 obj_req->copyup_bvec_count,
2487 bytes);
2488 rbd_osd_req_format_write(obj_req);
2489 3154
2490 ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); 3155 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2491 if (ret) 3156 if (ret)
2492 return ret; 3157 return ret;
2493 3158
2494 rbd_obj_request_submit(obj_req); 3159 rbd_osd_submit(osd_req);
2495 return 0; 3160 return 0;
2496} 3161}
2497 3162
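rbd_obj_copyup_write_object() further down decides between one and two requests: with snapshots present and non-zero parent data, an empty-snapc deep-copyup (the helper above) goes out first and the actual modification follows with bytes == MODS_ONLY. A standalone sketch of that decision, with MODS_ONLY mirrored from the #define above:

#include <stdint.h>
#include <stdio.h>

#define MODS_ONLY UINT32_MAX	/* mirrors the U32_MAX sentinel above */

/* Sketch: how many copyup requests a write to a non-existent object
 * needs, and what each one carries. */
static int plan_copyup(uint32_t num_snaps, uint32_t bytes, uint32_t out[2])
{
	if (num_snaps && bytes > 0) {
		out[0] = bytes;		/* empty-snapc deep-copyup */
		out[1] = MODS_ONLY;	/* current-snapc modification */
		return 2;
	}
	out[0] = bytes;			/* single combined request */
	return 1;
}

int main(void)
{
	uint32_t reqs[2];
	int n = plan_copyup(2, 4096, reqs);

	printf("%d request(s); first carries %u byte(s) of copyup data\n",
	       n, reqs[0]);
	return 0;
}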
2498static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) 3163static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3164 u32 bytes)
2499{ 3165{
2500 struct rbd_img_request *img_req = obj_req->img_request; 3166 struct ceph_osd_request *osd_req;
2501 unsigned int num_osd_ops = (bytes != MODS_ONLY); 3167 int num_ops = count_write_ops(obj_req);
2502 unsigned int which = 0; 3168 int which = 0;
2503 int ret; 3169 int ret;
2504 3170
2505 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 3171 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2506 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT ||
2507 obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL);
2508 rbd_osd_req_destroy(obj_req->osd_req);
2509 3172
2510 switch (img_req->op_type) { 3173 if (bytes != MODS_ONLY)
2511 case OBJ_OP_WRITE: 3174 num_ops++; /* copyup */
2512 num_osd_ops += count_write_ops(obj_req);
2513 break;
2514 case OBJ_OP_ZEROOUT:
2515 num_osd_ops += count_zeroout_ops(obj_req);
2516 break;
2517 default:
2518 BUG();
2519 }
2520 3175
2521 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 3176 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
2522 if (!obj_req->osd_req) 3177 if (IS_ERR(osd_req))
2523 return -ENOMEM; 3178 return PTR_ERR(osd_req);
2524 3179
2525 if (bytes != MODS_ONLY) { 3180 if (bytes != MODS_ONLY) {
2526 ret = osd_req_op_cls_init(obj_req->osd_req, which, "rbd", 3181 ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
2527 "copyup");
2528 if (ret) 3182 if (ret)
2529 return ret; 3183 return ret;
2530
2531 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, which++,
2532 obj_req->copyup_bvecs,
2533 obj_req->copyup_bvec_count,
2534 bytes);
2535 } 3184 }
2536 3185
2537 switch (img_req->op_type) { 3186 rbd_osd_setup_write_ops(osd_req, which);
2538 case OBJ_OP_WRITE: 3187 rbd_osd_format_write(osd_req);
2539 __rbd_obj_setup_write(obj_req, which);
2540 break;
2541 case OBJ_OP_ZEROOUT:
2542 __rbd_obj_setup_zeroout(obj_req, which);
2543 break;
2544 default:
2545 BUG();
2546 }
2547 3188
2548 ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); 3189 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2549 if (ret) 3190 if (ret)
2550 return ret; 3191 return ret;
2551 3192
2552 rbd_obj_request_submit(obj_req); 3193 rbd_osd_submit(osd_req);
2553 return 0; 3194 return 0;
2554} 3195}
2555 3196
2556static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2557{
2558 /*
2559 * Only send non-zero copyup data to save some I/O and network
2560 * bandwidth -- zero copyup data is equivalent to the object not
2561 * existing.
2562 */
2563 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2564 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2565 bytes = 0;
2566 }
2567
2568 if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
2569 /*
2570 * Send a copyup request with an empty snapshot context to
2571 * deep-copyup the object through all existing snapshots.
2572 * A second request with the current snapshot context will be
2573 * sent for the actual modification.
2574 */
2575 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC;
2576 return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes);
2577 }
2578
2579 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
2580 return rbd_obj_issue_copyup_ops(obj_req, bytes);
2581}
2582
2583static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) 3197static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2584{ 3198{
2585 u32 i; 3199 u32 i;
@@ -2608,7 +3222,12 @@ static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2608 return 0; 3222 return 0;
2609} 3223}
2610 3224
2611static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) 3225/*
3226 * The target object doesn't exist. Read the data for the entire
3227 * target object up to the overlap point (if any) from the parent,
3228 * so we can use it for a copyup.
3229 */
3230static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
2612{ 3231{
2613 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 3232 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2614 int ret; 3233 int ret;
@@ -2623,178 +3242,492 @@ static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2623 * request -- pass MODS_ONLY since the copyup isn't needed 3242 * request -- pass MODS_ONLY since the copyup isn't needed
2624 * anymore. 3243 * anymore.
2625 */ 3244 */
2626 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; 3245 return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
2627 return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
2628 } 3246 }
2629 3247
2630 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); 3248 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
2631 if (ret) 3249 if (ret)
2632 return ret; 3250 return ret;
2633 3251
2634 obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT;
2635 return rbd_obj_read_from_parent(obj_req); 3252 return rbd_obj_read_from_parent(obj_req);
2636} 3253}
2637 3254
2638static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req) 3255static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
2639{ 3256{
3257 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3258 struct ceph_snap_context *snapc = obj_req->img_request->snapc;
3259 u8 new_state;
3260 u32 i;
2640 int ret; 3261 int ret;
2641 3262
2642 switch (obj_req->write_state) { 3263 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
2643 case RBD_OBJ_WRITE_GUARD: 3264
2644 rbd_assert(!obj_req->xferred); 3265 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
2645 if (obj_req->result == -ENOENT) { 3266 return;
2646 /* 3267
2647 * The target object doesn't exist. Read the data for 3268 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
2648 * the entire target object up to the overlap point (if 3269 return;
2649 * any) from the parent, so we can use it for a copyup. 3270
2650 */ 3271 for (i = 0; i < snapc->num_snaps; i++) {
2651 ret = rbd_obj_handle_write_guard(obj_req); 3272 if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
2652 if (ret) { 3273 i + 1 < snapc->num_snaps)
2653 obj_req->result = ret; 3274 new_state = OBJECT_EXISTS_CLEAN;
2654 return true; 3275 else
2655 } 3276 new_state = OBJECT_EXISTS;
2656 return false; 3277
3278 ret = rbd_object_map_update(obj_req, snapc->snaps[i],
3279 new_state, NULL);
3280 if (ret < 0) {
3281 obj_req->pending.result = ret;
3282 return;
2657 } 3283 }
2658 /* fall through */
2659 case RBD_OBJ_WRITE_FLAT:
2660 case RBD_OBJ_WRITE_COPYUP_OPS:
2661 if (!obj_req->result)
2662 /*
2663 * There is no such thing as a successful short
2664 * write -- indicate the whole request was satisfied.
2665 */
2666 obj_req->xferred = obj_req->ex.oe_len;
2667 return true;
2668 case RBD_OBJ_WRITE_READ_FROM_PARENT:
2669 if (obj_req->result)
2670 return true;
2671 3284
2672 rbd_assert(obj_req->xferred); 3285 rbd_assert(!ret);
2673 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred); 3286 obj_req->pending.num_pending++;
3287 }
3288}
3289
3290static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
3291{
3292 u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3293 int ret;
3294
3295 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3296
3297 /*
3298 * Only send non-zero copyup data to save some I/O and network
3299 * bandwidth -- zero copyup data is equivalent to the object not
3300 * existing.
3301 */
3302 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3303 bytes = 0;
3304
3305 if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3306 /*
3307 * Send a copyup request with an empty snapshot context to
3308 * deep-copyup the object through all existing snapshots.
3309 * A second request with the current snapshot context will be
3310 * sent for the actual modification.
3311 */
3312 ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3313 if (ret) {
3314 obj_req->pending.result = ret;
3315 return;
3316 }
3317
3318 obj_req->pending.num_pending++;
3319 bytes = MODS_ONLY;
3320 }
3321
3322 ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3323 if (ret) {
3324 obj_req->pending.result = ret;
3325 return;
3326 }
3327
3328 obj_req->pending.num_pending++;
3329}
3330
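Both helpers above fan out sub-requests while counting them in pending.num_pending and latching the first error in pending.result; the __RBD_OBJ_COPYUP_* states then drain the counter through pending_result_dec(), whose body does not appear in this hunk. A model of what it is assumed to do:

#include <stdbool.h>
#include <stdio.h>

struct pending_result {
	int result;		/* first nonzero sub-request result */
	int num_pending;	/* outstanding sub-requests */
};

/* Assumed shape of pending_result_dec(): record the first error,
 * return true only when the last completion arrives, at which point
 * *result carries the overall verdict. */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
	if (*result && !pending->result)
		pending->result = *result;
	if (--pending->num_pending)
		return false;

	*result = pending->result;
	return true;
}

int main(void)
{
	struct pending_result p = { .result = 0, .num_pending = 3 };
	int sub[3] = { 0, -5, 0 };	/* second sub-request fails */

	for (int i = 0; i < 3; i++) {
		int r = sub[i];

		if (pending_result_dec(&p, &r))
			printf("all done, result %d\n", r);	/* -5 */
	}
	return 0;
}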
3331static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
3332{
3333 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3334 int ret;
3335
3336again:
3337 switch (obj_req->copyup_state) {
3338 case RBD_OBJ_COPYUP_START:
3339 rbd_assert(!*result);
3340
3341 ret = rbd_obj_copyup_read_parent(obj_req);
2674 if (ret) { 3342 if (ret) {
2675 obj_req->result = ret; 3343 *result = ret;
2676 obj_req->xferred = 0;
2677 return true; 3344 return true;
2678 } 3345 }
3346 if (obj_req->num_img_extents)
3347 obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3348 else
3349 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
2679 return false; 3350 return false;
2680 case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC: 3351 case RBD_OBJ_COPYUP_READ_PARENT:
2681 if (obj_req->result) 3352 if (*result)
2682 return true; 3353 return true;
2683 3354
2684 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; 3355 if (is_zero_bvecs(obj_req->copyup_bvecs,
2685 ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY); 3356 rbd_obj_img_extents_bytes(obj_req))) {
2686 if (ret) { 3357 dout("%s %p detected zeros\n", __func__, obj_req);
2687 obj_req->result = ret; 3358 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
3359 }
3360
3361 rbd_obj_copyup_object_maps(obj_req);
3362 if (!obj_req->pending.num_pending) {
3363 *result = obj_req->pending.result;
3364 obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
3365 goto again;
3366 }
3367 obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
3368 return false;
3369 case __RBD_OBJ_COPYUP_OBJECT_MAPS:
3370 if (!pending_result_dec(&obj_req->pending, result))
3371 return false;
3372 /* fall through */
3373 case RBD_OBJ_COPYUP_OBJECT_MAPS:
3374 if (*result) {
3375 rbd_warn(rbd_dev, "snap object map update failed: %d",
3376 *result);
2688 return true; 3377 return true;
2689 } 3378 }
3379
3380 rbd_obj_copyup_write_object(obj_req);
3381 if (!obj_req->pending.num_pending) {
3382 *result = obj_req->pending.result;
3383 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3384 goto again;
3385 }
3386 obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
2690 return false; 3387 return false;
3388 case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3389 if (!pending_result_dec(&obj_req->pending, result))
3390 return false;
3391 /* fall through */
3392 case RBD_OBJ_COPYUP_WRITE_OBJECT:
3393 return true;
2691 default: 3394 default:
2692 BUG(); 3395 BUG();
2693 } 3396 }
2694} 3397}
2695 3398
2696/* 3399/*
2697 * Returns true if @obj_req is completed, or false otherwise. 3400 * Return:
3401 * 0 - object map update sent
3402 * 1 - object map update isn't needed
3403 * <0 - error
2698 */ 3404 */
2699static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req) 3405static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
2700{ 3406{
2701 switch (obj_req->img_request->op_type) { 3407 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2702 case OBJ_OP_READ: 3408 u8 current_state = OBJECT_PENDING;
2703 return rbd_obj_handle_read(obj_req); 3409
2704 case OBJ_OP_WRITE: 3410 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
2705 return rbd_obj_handle_write(obj_req); 3411 return 1;
2706 case OBJ_OP_DISCARD: 3412
2707 case OBJ_OP_ZEROOUT: 3413 if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
2708 if (rbd_obj_handle_write(obj_req)) { 3414 return 1;
3415
3416 return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
3417 &current_state);
3418}
3419
3420static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
3421{
3422 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3423 int ret;
3424
3425again:
3426 switch (obj_req->write_state) {
3427 case RBD_OBJ_WRITE_START:
3428 rbd_assert(!*result);
3429
3430 if (rbd_obj_write_is_noop(obj_req))
3431 return true;
3432
3433 ret = rbd_obj_write_pre_object_map(obj_req);
3434 if (ret < 0) {
3435 *result = ret;
3436 return true;
3437 }
3438 obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3439 if (ret > 0)
3440 goto again;
3441 return false;
3442 case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3443 if (*result) {
3444 rbd_warn(rbd_dev, "pre object map update failed: %d",
3445 *result);
3446 return true;
3447 }
3448 ret = rbd_obj_write_object(obj_req);
3449 if (ret) {
3450 *result = ret;
3451 return true;
3452 }
3453 obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3454 return false;
3455 case RBD_OBJ_WRITE_OBJECT:
3456 if (*result == -ENOENT) {
3457 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3458 *result = 0;
3459 obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3460 obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3461 goto again;
3462 }
2709 /* 3463 /*
2710 * Hide -ENOENT from delete/truncate/zero -- discarding 3464 * On a non-existent object:
2711 * a non-existent object is not a problem. 3465 * delete - -ENOENT, truncate/zero - 0
2712 */ 3466 */
2713 if (obj_req->result == -ENOENT) { 3467 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
2714 obj_req->result = 0; 3468 *result = 0;
2715 obj_req->xferred = obj_req->ex.oe_len; 3469 }
2716 } 3470 if (*result)
3471 return true;
3472
3473 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3474 goto again;
3475 case __RBD_OBJ_WRITE_COPYUP:
3476 if (!rbd_obj_advance_copyup(obj_req, result))
3477 return false;
3478 /* fall through */
3479 case RBD_OBJ_WRITE_COPYUP:
3480 if (*result) {
3481 rbd_warn(rbd_dev, "copyup failed: %d", *result);
3482 return true;
3483 }
3484 ret = rbd_obj_write_post_object_map(obj_req);
3485 if (ret < 0) {
3486 *result = ret;
2717 return true; 3487 return true;
2718 } 3488 }
3489 obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3490 if (ret > 0)
3491 goto again;
2719 return false; 3492 return false;
3493 case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3494 if (*result)
3495 rbd_warn(rbd_dev, "post object map update failed: %d",
3496 *result);
3497 return true;
2720 default: 3498 default:
2721 BUG(); 3499 BUG();
2722 } 3500 }
2723} 3501}
2724 3502
2725static void rbd_obj_end_request(struct rbd_obj_request *obj_req) 3503/*
3504 * Return true if @obj_req is completed.
3505 */
3506static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3507 int *result)
2726{ 3508{
2727 struct rbd_img_request *img_req = obj_req->img_request; 3509 struct rbd_img_request *img_req = obj_req->img_request;
3510 struct rbd_device *rbd_dev = img_req->rbd_dev;
3511 bool done;
2728 3512
2729 rbd_assert((!obj_req->result && 3513 mutex_lock(&obj_req->state_mutex);
2730 obj_req->xferred == obj_req->ex.oe_len) || 3514 if (!rbd_img_is_write(img_req))
2731 (obj_req->result < 0 && !obj_req->xferred)); 3515 done = rbd_obj_advance_read(obj_req, result);
2732 if (!obj_req->result) { 3516 else
2733 img_req->xferred += obj_req->xferred; 3517 done = rbd_obj_advance_write(obj_req, result);
2734 return; 3518 mutex_unlock(&obj_req->state_mutex);
2735 }
2736 3519
2737 rbd_warn(img_req->rbd_dev, 3520 if (done && *result) {
2738 "%s at objno %llu %llu~%llu result %d xferred %llu", 3521 rbd_assert(*result < 0);
2739 obj_op_name(img_req->op_type), obj_req->ex.oe_objno, 3522 rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
2740 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result, 3523 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
2741 obj_req->xferred); 3524 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
2742 if (!img_req->result) {
2743 img_req->result = obj_req->result;
2744 img_req->xferred = 0;
2745 } 3525 }
3526 return done;
2746} 3527}
2747 3528
2748static void rbd_img_end_child_request(struct rbd_img_request *img_req) 3529/*
3530 * This is open-coded in rbd_img_handle_request() to avoid parent chain
3531 * recursion.
3532 */
3533static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
2749{ 3534{
2750 struct rbd_obj_request *obj_req = img_req->obj_request; 3535 if (__rbd_obj_handle_request(obj_req, &result))
3536 rbd_img_handle_request(obj_req->img_request, result);
3537}
2751 3538
2752 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags)); 3539static bool need_exclusive_lock(struct rbd_img_request *img_req)
2753 rbd_assert((!img_req->result && 3540{
2754 img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) || 3541 struct rbd_device *rbd_dev = img_req->rbd_dev;
2755 (img_req->result < 0 && !img_req->xferred));
2756 3542
2757 obj_req->result = img_req->result; 3543 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
2758 obj_req->xferred = img_req->xferred; 3544 return false;
2759 rbd_img_request_put(img_req); 3545
3546 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
3547 return false;
3548
3549 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
3550 if (rbd_dev->opts->lock_on_read ||
3551 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3552 return true;
3553
3554 return rbd_img_is_write(img_req);
2760} 3555}
2761 3556
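The predicate above condenses to: never for images without the exclusive-lock feature or for snapshot mappings; always for writes; and for reads only when lock_on_read is set or the object map (which must be loaded under the lock) is in play. The same logic as a standalone truth-table check, with struct and field names invented:

#include <stdbool.h>
#include <stdio.h>

struct opts {				/* invented names */
	bool feature_exclusive_lock;
	bool is_snapshot;
	bool lock_on_read;
	bool feature_object_map;
};

static bool needs_lock(const struct opts *o, bool is_write)
{
	if (!o->feature_exclusive_lock || o->is_snapshot)
		return false;
	if (o->lock_on_read || o->feature_object_map)
		return true;		/* even reads take the lock */
	return is_write;
}

int main(void)
{
	struct opts o = { .feature_exclusive_lock = true,
			  .feature_object_map = true };

	printf("read with object map: %d\n", needs_lock(&o, false));
	o.feature_object_map = false;
	printf("read without object map: %d\n", needs_lock(&o, false));
	printf("write: %d\n", needs_lock(&o, true));
	return 0;
}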
2762static void rbd_img_end_request(struct rbd_img_request *img_req) 3557static bool rbd_lock_add_request(struct rbd_img_request *img_req)
2763{ 3558{
2764 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); 3559 struct rbd_device *rbd_dev = img_req->rbd_dev;
2765 rbd_assert((!img_req->result && 3560 bool locked;
2766 img_req->xferred == blk_rq_bytes(img_req->rq)) || 3561
2767 (img_req->result < 0 && !img_req->xferred)); 3562 lockdep_assert_held(&rbd_dev->lock_rwsem);
3563 locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3564 spin_lock(&rbd_dev->lock_lists_lock);
3565 rbd_assert(list_empty(&img_req->lock_item));
3566 if (!locked)
3567 list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3568 else
3569 list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
3570 spin_unlock(&rbd_dev->lock_lists_lock);
3571 return locked;
3572}
3573
3574static void rbd_lock_del_request(struct rbd_img_request *img_req)
3575{
3576 struct rbd_device *rbd_dev = img_req->rbd_dev;
3577 bool need_wakeup;
2768 3578
2769 blk_mq_end_request(img_req->rq, 3579 lockdep_assert_held(&rbd_dev->lock_rwsem);
2770 errno_to_blk_status(img_req->result)); 3580 spin_lock(&rbd_dev->lock_lists_lock);
2771 rbd_img_request_put(img_req); 3581 rbd_assert(!list_empty(&img_req->lock_item));
3582 list_del_init(&img_req->lock_item);
3583 need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3584 list_empty(&rbd_dev->running_list));
3585 spin_unlock(&rbd_dev->lock_lists_lock);
3586 if (need_wakeup)
3587 complete(&rbd_dev->releasing_wait);
2772} 3588}
2773 3589
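rbd_lock_del_request() above is one half of the quiesce handshake completed by rbd_quiesce_lock() further down: the releaser flips lock_state to RELEASING and waits on releasing_wait, and the completion fires when the last request leaves running_list. A toy single-threaded model of the counting (no real locking or lists here):

#include <stdio.h>

enum { LOCKED, RELEASING };

static int lock_state = LOCKED;
static int running = 2;		/* requests on running_list */

/* Toy model of rbd_lock_del_request(): wake the releaser when the
 * last running request retires during a release. */
static void del_request(void)
{
	if (--running == 0 && lock_state == RELEASING)
		printf("running_list empty -- complete(releasing_wait)\n");
}

int main(void)
{
	lock_state = RELEASING;	/* rbd_quiesce_lock() */
	del_request();
	del_request();		/* this one fires the wakeup */
	return 0;
}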
2774static void rbd_obj_handle_request(struct rbd_obj_request *obj_req) 3590static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
2775{ 3591{
2776 struct rbd_img_request *img_req; 3592 struct rbd_device *rbd_dev = img_req->rbd_dev;
3593
3594 if (!need_exclusive_lock(img_req))
3595 return 1;
3596
3597 if (rbd_lock_add_request(img_req))
3598 return 1;
3599
3600 if (rbd_dev->opts->exclusive) {
3601 WARN_ON(1); /* lock got released? */
3602 return -EROFS;
3603 }
3604
3605 /*
3606 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3607 * and cancel_delayed_work() in wake_lock_waiters().
3608 */
3609 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3610 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3611 return 0;
3612}
3613
3614static void rbd_img_object_requests(struct rbd_img_request *img_req)
3615{
3616 struct rbd_obj_request *obj_req;
3617
3618 rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3619
3620 for_each_obj_request(img_req, obj_req) {
3621 int result = 0;
3622
3623 if (__rbd_obj_handle_request(obj_req, &result)) {
3624 if (result) {
3625 img_req->pending.result = result;
3626 return;
3627 }
3628 } else {
3629 img_req->pending.num_pending++;
3630 }
3631 }
3632}
3633
3634static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
3635{
3636 struct rbd_device *rbd_dev = img_req->rbd_dev;
3637 int ret;
2777 3638
2778again: 3639again:
2779 if (!__rbd_obj_handle_request(obj_req)) 3640 switch (img_req->state) {
2780 return; 3641 case RBD_IMG_START:
3642 rbd_assert(!*result);
2781 3643
2782 img_req = obj_req->img_request; 3644 ret = rbd_img_exclusive_lock(img_req);
2783 spin_lock(&img_req->completion_lock); 3645 if (ret < 0) {
2784 rbd_obj_end_request(obj_req); 3646 *result = ret;
2785 rbd_assert(img_req->pending_count); 3647 return true;
2786 if (--img_req->pending_count) { 3648 }
2787 spin_unlock(&img_req->completion_lock); 3649 img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
2788 return; 3650 if (ret > 0)
3651 goto again;
3652 return false;
3653 case RBD_IMG_EXCLUSIVE_LOCK:
3654 if (*result)
3655 return true;
3656
3657 rbd_assert(!need_exclusive_lock(img_req) ||
3658 __rbd_is_lock_owner(rbd_dev));
3659
3660 rbd_img_object_requests(img_req);
3661 if (!img_req->pending.num_pending) {
3662 *result = img_req->pending.result;
3663 img_req->state = RBD_IMG_OBJECT_REQUESTS;
3664 goto again;
3665 }
3666 img_req->state = __RBD_IMG_OBJECT_REQUESTS;
3667 return false;
3668 case __RBD_IMG_OBJECT_REQUESTS:
3669 if (!pending_result_dec(&img_req->pending, result))
3670 return false;
3671 /* fall through */
3672 case RBD_IMG_OBJECT_REQUESTS:
3673 return true;
3674 default:
3675 BUG();
3676 }
3677}
3678
3679/*
3680 * Return true if @img_req is completed.
3681 */
3682static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
3683 int *result)
3684{
3685 struct rbd_device *rbd_dev = img_req->rbd_dev;
3686 bool done;
3687
3688 if (need_exclusive_lock(img_req)) {
3689 down_read(&rbd_dev->lock_rwsem);
3690 mutex_lock(&img_req->state_mutex);
3691 done = rbd_img_advance(img_req, result);
3692 if (done)
3693 rbd_lock_del_request(img_req);
3694 mutex_unlock(&img_req->state_mutex);
3695 up_read(&rbd_dev->lock_rwsem);
3696 } else {
3697 mutex_lock(&img_req->state_mutex);
3698 done = rbd_img_advance(img_req, result);
3699 mutex_unlock(&img_req->state_mutex);
3700 }
3701
3702 if (done && *result) {
3703 rbd_assert(*result < 0);
3704 rbd_warn(rbd_dev, "%s%s result %d",
3705 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
3706 obj_op_name(img_req->op_type), *result);
2789 } 3707 }
3708 return done;
3709}
3710
3711static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3712{
3713again:
3714 if (!__rbd_img_handle_request(img_req, &result))
3715 return;
2790 3716
2791 spin_unlock(&img_req->completion_lock);
2792 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) { 3717 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
2793 obj_req = img_req->obj_request; 3718 struct rbd_obj_request *obj_req = img_req->obj_request;
2794 rbd_img_end_child_request(img_req); 3719
2795 goto again; 3720 rbd_img_request_put(img_req);
3721 if (__rbd_obj_handle_request(obj_req, &result)) {
3722 img_req = obj_req->img_request;
3723 goto again;
3724 }
3725 } else {
3726 struct request *rq = img_req->rq;
3727
3728 rbd_img_request_put(img_req);
3729 blk_mq_end_request(rq, errno_to_blk_status(result));
2796 } 3730 }
2797 rbd_img_end_request(img_req);
2798} 3731}
2799 3732
2800static const struct rbd_client_id rbd_empty_cid; 3733static const struct rbd_client_id rbd_empty_cid;
@@ -2839,6 +3772,7 @@ static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2839{ 3772{
2840 struct rbd_client_id cid = rbd_get_cid(rbd_dev); 3773 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2841 3774
3775 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
2842 strcpy(rbd_dev->lock_cookie, cookie); 3776 strcpy(rbd_dev->lock_cookie, cookie);
2843 rbd_set_owner_cid(rbd_dev, &cid); 3777 rbd_set_owner_cid(rbd_dev, &cid);
2844 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); 3778 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
@@ -2863,7 +3797,6 @@ static int rbd_lock(struct rbd_device *rbd_dev)
2863 if (ret) 3797 if (ret)
2864 return ret; 3798 return ret;
2865 3799
2866 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
2867 __rbd_lock(rbd_dev, cookie); 3800 __rbd_lock(rbd_dev, cookie);
2868 return 0; 3801 return 0;
2869} 3802}
@@ -2882,7 +3815,7 @@ static void rbd_unlock(struct rbd_device *rbd_dev)
2882 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3815 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2883 RBD_LOCK_NAME, rbd_dev->lock_cookie); 3816 RBD_LOCK_NAME, rbd_dev->lock_cookie);
2884 if (ret && ret != -ENOENT) 3817 if (ret && ret != -ENOENT)
2885 rbd_warn(rbd_dev, "failed to unlock: %d", ret); 3818 rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
2886 3819
2887 /* treat errors as the image is unlocked */ 3820 /* treat errors as the image is unlocked */
2888 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 3821 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
@@ -3009,15 +3942,34 @@ e_inval:
3009 goto out; 3942 goto out;
3010} 3943}
3011 3944
3012static void wake_requests(struct rbd_device *rbd_dev, bool wake_all) 3945/*
3946 * Either image request state machine(s) or rbd_add_acquire_lock()
3947 * (i.e. "rbd map").
3948 */
3949static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
3013{ 3950{
3014 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all); 3951 struct rbd_img_request *img_req;
3952
3953 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3954 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
3015 3955
3016 cancel_delayed_work(&rbd_dev->lock_dwork); 3956 cancel_delayed_work(&rbd_dev->lock_dwork);
3017 if (wake_all) 3957 if (!completion_done(&rbd_dev->acquire_wait)) {
3018 wake_up_all(&rbd_dev->lock_waitq); 3958 rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3019 else 3959 list_empty(&rbd_dev->running_list));
3020 wake_up(&rbd_dev->lock_waitq); 3960 rbd_dev->acquire_err = result;
3961 complete_all(&rbd_dev->acquire_wait);
3962 return;
3963 }
3964
3965 list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3966 mutex_lock(&img_req->state_mutex);
3967 rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3968 rbd_img_schedule(img_req, result);
3969 mutex_unlock(&img_req->state_mutex);
3970 }
3971
3972 list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
3021} 3973}
3022 3974
3023static int get_lock_owner_info(struct rbd_device *rbd_dev, 3975static int get_lock_owner_info(struct rbd_device *rbd_dev,
@@ -3132,13 +4084,10 @@ static int rbd_try_lock(struct rbd_device *rbd_dev)
3132 goto again; 4084 goto again;
3133 4085
3134 ret = find_watcher(rbd_dev, lockers); 4086 ret = find_watcher(rbd_dev, lockers);
3135 if (ret) { 4087 if (ret)
3136 if (ret > 0) 4088 goto out; /* request lock or error */
3137 ret = 0; /* have to request lock */
3138 goto out;
3139 }
3140 4089
3141 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock", 4090 rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
3142 ENTITY_NAME(lockers[0].id.name)); 4091 ENTITY_NAME(lockers[0].id.name));
3143 4092
3144 ret = ceph_monc_blacklist_add(&client->monc, 4093 ret = ceph_monc_blacklist_add(&client->monc,
@@ -3165,53 +4114,90 @@ out:
3165 return ret; 4114 return ret;
3166} 4115}
3167 4116
4117static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4118{
4119 int ret;
4120
4121 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
4122 ret = rbd_object_map_open(rbd_dev);
4123 if (ret)
4124 return ret;
4125 }
4126
4127 return 0;
4128}
4129
3168/* 4130/*
3169 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED 4131 * Return:
4132 * 0 - lock acquired
4133 * 1 - caller should call rbd_request_lock()
4134 * <0 - error
3170 */ 4135 */
3171static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev, 4136static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
3172 int *pret)
3173{ 4137{
3174 enum rbd_lock_state lock_state; 4138 int ret;
3175 4139
3176 down_read(&rbd_dev->lock_rwsem); 4140 down_read(&rbd_dev->lock_rwsem);
3177 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 4141 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3178 rbd_dev->lock_state); 4142 rbd_dev->lock_state);
3179 if (__rbd_is_lock_owner(rbd_dev)) { 4143 if (__rbd_is_lock_owner(rbd_dev)) {
3180 lock_state = rbd_dev->lock_state;
3181 up_read(&rbd_dev->lock_rwsem); 4144 up_read(&rbd_dev->lock_rwsem);
3182 return lock_state; 4145 return 0;
3183 } 4146 }
3184 4147
3185 up_read(&rbd_dev->lock_rwsem); 4148 up_read(&rbd_dev->lock_rwsem);
3186 down_write(&rbd_dev->lock_rwsem); 4149 down_write(&rbd_dev->lock_rwsem);
3187 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 4150 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3188 rbd_dev->lock_state); 4151 rbd_dev->lock_state);
3189 if (!__rbd_is_lock_owner(rbd_dev)) { 4152 if (__rbd_is_lock_owner(rbd_dev)) {
3190 *pret = rbd_try_lock(rbd_dev); 4153 up_write(&rbd_dev->lock_rwsem);
3191 if (*pret) 4154 return 0;
3192 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret); 4155 }
4156
4157 ret = rbd_try_lock(rbd_dev);
4158 if (ret < 0) {
4159 rbd_warn(rbd_dev, "failed to lock header: %d", ret);
4160 if (ret == -EBLACKLISTED)
4161 goto out;
4162
4163 ret = 1; /* request lock anyway */
4164 }
4165 if (ret > 0) {
4166 up_write(&rbd_dev->lock_rwsem);
4167 return ret;
4168 }
4169
4170 rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4171 rbd_assert(list_empty(&rbd_dev->running_list));
4172
4173 ret = rbd_post_acquire_action(rbd_dev);
4174 if (ret) {
4175 rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
4176 /*
4177 * Can't stay in RBD_LOCK_STATE_LOCKED because
4178 * rbd_lock_add_request() would let the request through,
4179 * assuming that e.g. object map is locked and loaded.
4180 */
4181 rbd_unlock(rbd_dev);
3193 } 4182 }
3194 4183
3195 lock_state = rbd_dev->lock_state; 4184out:
4185 wake_lock_waiters(rbd_dev, ret);
3196 up_write(&rbd_dev->lock_rwsem); 4186 up_write(&rbd_dev->lock_rwsem);
3197 return lock_state; 4187 return ret;
3198} 4188}
3199 4189
3200static void rbd_acquire_lock(struct work_struct *work) 4190static void rbd_acquire_lock(struct work_struct *work)
3201{ 4191{
3202 struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 4192 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3203 struct rbd_device, lock_dwork); 4193 struct rbd_device, lock_dwork);
3204 enum rbd_lock_state lock_state; 4194 int ret;
3205 int ret = 0;
3206 4195
3207 dout("%s rbd_dev %p\n", __func__, rbd_dev); 4196 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3208again: 4197again:
3209 lock_state = rbd_try_acquire_lock(rbd_dev, &ret); 4198 ret = rbd_try_acquire_lock(rbd_dev);
3210 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) { 4199 if (ret <= 0) {
3211 if (lock_state == RBD_LOCK_STATE_LOCKED) 4200 dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
3212 wake_requests(rbd_dev, true);
3213 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3214 rbd_dev, lock_state, ret);
3215 return; 4201 return;
3216 } 4202 }
3217 4203
@@ -3220,16 +4206,9 @@ again:
3220 goto again; /* treat this as a dead client */ 4206 goto again; /* treat this as a dead client */
3221 } else if (ret == -EROFS) { 4207 } else if (ret == -EROFS) {
3222 rbd_warn(rbd_dev, "peer will not release lock"); 4208 rbd_warn(rbd_dev, "peer will not release lock");
3223 /* 4209 down_write(&rbd_dev->lock_rwsem);
3224 * If this is rbd_add_acquire_lock(), we want to fail 4210 wake_lock_waiters(rbd_dev, ret);
3225 * immediately -- reuse BLACKLISTED flag. Otherwise we 4211 up_write(&rbd_dev->lock_rwsem);
3226 * want to block.
3227 */
3228 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3229 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3230 /* wake "rbd map --exclusive" process */
3231 wake_requests(rbd_dev, false);
3232 }
3233 } else if (ret < 0) { 4212 } else if (ret < 0) {
3234 rbd_warn(rbd_dev, "error requesting lock: %d", ret); 4213 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3235 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 4214 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
@@ -3246,43 +4225,67 @@ again:
3246 } 4225 }
3247} 4226}
3248 4227
3249/* 4228static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
3250 * lock_rwsem must be held for write
3251 */
3252static bool rbd_release_lock(struct rbd_device *rbd_dev)
3253{ 4229{
3254 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 4230 bool need_wait;
3255 rbd_dev->lock_state); 4231
4232 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4233 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
4234
3256 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 4235 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3257 return false; 4236 return false;
3258 4237
3259 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3260 downgrade_write(&rbd_dev->lock_rwsem);
3261 /* 4238 /*
3262 * Ensure that all in-flight IO is flushed. 4239 * Ensure that all in-flight IO is flushed.
3263 *
3264 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3265 * may be shared with other devices.
3266 */ 4240 */
3267 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc); 4241 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4242 rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4243 need_wait = !list_empty(&rbd_dev->running_list);
4244 downgrade_write(&rbd_dev->lock_rwsem);
4245 if (need_wait)
4246 wait_for_completion(&rbd_dev->releasing_wait);
3268 up_read(&rbd_dev->lock_rwsem); 4247 up_read(&rbd_dev->lock_rwsem);
3269 4248
3270 down_write(&rbd_dev->lock_rwsem); 4249 down_write(&rbd_dev->lock_rwsem);
3271 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3272 rbd_dev->lock_state);
3273 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 4250 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3274 return false; 4251 return false;
3275 4252
4253 rbd_assert(list_empty(&rbd_dev->running_list));
4254 return true;
4255}
4256
4257static void rbd_pre_release_action(struct rbd_device *rbd_dev)
4258{
4259 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
4260 rbd_object_map_close(rbd_dev);
4261}
4262
4263static void __rbd_release_lock(struct rbd_device *rbd_dev)
4264{
4265 rbd_assert(list_empty(&rbd_dev->running_list));
4266
4267 rbd_pre_release_action(rbd_dev);
3276 rbd_unlock(rbd_dev); 4268 rbd_unlock(rbd_dev);
4269}
4270
4271/*
4272 * lock_rwsem must be held for write
4273 */
4274static void rbd_release_lock(struct rbd_device *rbd_dev)
4275{
4276 if (!rbd_quiesce_lock(rbd_dev))
4277 return;
4278
4279 __rbd_release_lock(rbd_dev);
4280
3277 /* 4281 /*
3278 * Give others a chance to grab the lock - we would re-acquire 4282 * Give others a chance to grab the lock - we would re-acquire
3279 * almost immediately if we got new IO during ceph_osdc_sync() 4283 * almost immediately if we got new IO while draining the running
3280 * otherwise. We need to ack our own notifications, so this 4284 * list otherwise. We need to ack our own notifications, so this
3281 * lock_dwork will be requeued from rbd_wait_state_locked() 4285 * lock_dwork will be requeued from rbd_handle_released_lock() by
3282 * after wake_requests() in rbd_handle_released_lock(). 4286 * way of maybe_kick_acquire().
3283 */ 4287 */
3284 cancel_delayed_work(&rbd_dev->lock_dwork); 4288 cancel_delayed_work(&rbd_dev->lock_dwork);
3285 return true;
3286} 4289}
3287 4290
3288static void rbd_release_lock_work(struct work_struct *work) 4291static void rbd_release_lock_work(struct work_struct *work)
@@ -3295,6 +4298,23 @@ static void rbd_release_lock_work(struct work_struct *work)
3295 up_write(&rbd_dev->lock_rwsem); 4298 up_write(&rbd_dev->lock_rwsem);
3296} 4299}
3297 4300
4301static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4302{
4303 bool have_requests;
4304
4305 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4306 if (__rbd_is_lock_owner(rbd_dev))
4307 return;
4308
4309 spin_lock(&rbd_dev->lock_lists_lock);
4310 have_requests = !list_empty(&rbd_dev->acquiring_list);
4311 spin_unlock(&rbd_dev->lock_lists_lock);
4312 if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4313 dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4314 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4315 }
4316}
4317
3298static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, 4318static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3299 void **p) 4319 void **p)
3300{ 4320{
@@ -3324,8 +4344,7 @@ static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3324 down_read(&rbd_dev->lock_rwsem); 4344 down_read(&rbd_dev->lock_rwsem);
3325 } 4345 }
3326 4346
3327 if (!__rbd_is_lock_owner(rbd_dev)) 4347 maybe_kick_acquire(rbd_dev);
3328 wake_requests(rbd_dev, false);
3329 up_read(&rbd_dev->lock_rwsem); 4348 up_read(&rbd_dev->lock_rwsem);
3330} 4349}
3331 4350
@@ -3357,8 +4376,7 @@ static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3357 down_read(&rbd_dev->lock_rwsem); 4376 down_read(&rbd_dev->lock_rwsem);
3358 } 4377 }
3359 4378
3360 if (!__rbd_is_lock_owner(rbd_dev)) 4379 maybe_kick_acquire(rbd_dev);
3361 wake_requests(rbd_dev, false);
3362 up_read(&rbd_dev->lock_rwsem); 4380 up_read(&rbd_dev->lock_rwsem);
3363} 4381}
3364 4382
@@ -3608,7 +4626,6 @@ static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3608 4626
3609static void rbd_unregister_watch(struct rbd_device *rbd_dev) 4627static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3610{ 4628{
3611 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
3612 cancel_tasks_sync(rbd_dev); 4629 cancel_tasks_sync(rbd_dev);
3613 4630
3614 mutex_lock(&rbd_dev->watch_mutex); 4631 mutex_lock(&rbd_dev->watch_mutex);
@@ -3630,7 +4647,8 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3630 char cookie[32]; 4647 char cookie[32];
3631 int ret; 4648 int ret;
3632 4649
3633 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); 4650 if (!rbd_quiesce_lock(rbd_dev))
4651 return;
3634 4652
3635 format_lock_cookie(rbd_dev, cookie); 4653 format_lock_cookie(rbd_dev, cookie);
3636 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid, 4654 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
@@ -3646,11 +4664,11 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3646 * Lock cookie cannot be updated on older OSDs, so do 4664 * Lock cookie cannot be updated on older OSDs, so do
3647 * a manual release and queue an acquire. 4665 * a manual release and queue an acquire.
3648 */ 4666 */
3649 if (rbd_release_lock(rbd_dev)) 4667 __rbd_release_lock(rbd_dev);
3650 queue_delayed_work(rbd_dev->task_wq, 4668 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3651 &rbd_dev->lock_dwork, 0);
3652 } else { 4669 } else {
3653 __rbd_lock(rbd_dev, cookie); 4670 __rbd_lock(rbd_dev, cookie);
4671 wake_lock_waiters(rbd_dev, 0);
3654 } 4672 }
3655} 4673}
3656 4674
@@ -3671,15 +4689,18 @@ static void rbd_reregister_watch(struct work_struct *work)
3671 ret = __rbd_register_watch(rbd_dev); 4689 ret = __rbd_register_watch(rbd_dev);
3672 if (ret) { 4690 if (ret) {
3673 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); 4691 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
3674 if (ret == -EBLACKLISTED || ret == -ENOENT) { 4692 if (ret != -EBLACKLISTED && ret != -ENOENT) {
3675 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3676 wake_requests(rbd_dev, true);
3677 } else {
3678 queue_delayed_work(rbd_dev->task_wq, 4693 queue_delayed_work(rbd_dev->task_wq,
3679 &rbd_dev->watch_dwork, 4694 &rbd_dev->watch_dwork,
3680 RBD_RETRY_DELAY); 4695 RBD_RETRY_DELAY);
4696 mutex_unlock(&rbd_dev->watch_mutex);
4697 return;
3681 } 4698 }
4699
3682 mutex_unlock(&rbd_dev->watch_mutex); 4700 mutex_unlock(&rbd_dev->watch_mutex);
4701 down_write(&rbd_dev->lock_rwsem);
4702 wake_lock_waiters(rbd_dev, ret);
4703 up_write(&rbd_dev->lock_rwsem);
3683 return; 4704 return;
3684 } 4705 }
3685 4706
@@ -3742,7 +4763,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
3742 4763
3743 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, 4764 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3744 CEPH_OSD_FLAG_READ, req_page, outbound_size, 4765 CEPH_OSD_FLAG_READ, req_page, outbound_size,
3745 reply_page, &inbound_size); 4766 &reply_page, &inbound_size);
3746 if (!ret) { 4767 if (!ret) {
3747 memcpy(inbound, page_address(reply_page), inbound_size); 4768 memcpy(inbound, page_address(reply_page), inbound_size);
3748 ret = inbound_size; 4769 ret = inbound_size;
@@ -3754,54 +4775,6 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
3754 return ret; 4775 return ret;
3755} 4776}
3756 4777
3757/*
3758 * lock_rwsem must be held for read
3759 */
3760static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
3761{
3762 DEFINE_WAIT(wait);
3763 unsigned long timeout;
3764 int ret = 0;
3765
3766 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
3767 return -EBLACKLISTED;
3768
3769 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3770 return 0;
3771
3772 if (!may_acquire) {
3773 rbd_warn(rbd_dev, "exclusive lock required");
3774 return -EROFS;
3775 }
3776
3777 do {
3778 /*
3779 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3780 * and cancel_delayed_work() in wake_requests().
3781 */
3782 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3783 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3784 prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3785 TASK_UNINTERRUPTIBLE);
3786 up_read(&rbd_dev->lock_rwsem);
3787 timeout = schedule_timeout(ceph_timeout_jiffies(
3788 rbd_dev->opts->lock_timeout));
3789 down_read(&rbd_dev->lock_rwsem);
3790 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3791 ret = -EBLACKLISTED;
3792 break;
3793 }
3794 if (!timeout) {
3795 rbd_warn(rbd_dev, "timed out waiting for lock");
3796 ret = -ETIMEDOUT;
3797 break;
3798 }
3799 } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
3800
3801 finish_wait(&rbd_dev->lock_waitq, &wait);
3802 return ret;
3803}
3804
3805static void rbd_queue_workfn(struct work_struct *work) 4778static void rbd_queue_workfn(struct work_struct *work)
3806{ 4779{
3807 struct request *rq = blk_mq_rq_from_pdu(work); 4780 struct request *rq = blk_mq_rq_from_pdu(work);
@@ -3812,7 +4785,6 @@ static void rbd_queue_workfn(struct work_struct *work)
3812 u64 length = blk_rq_bytes(rq); 4785 u64 length = blk_rq_bytes(rq);
3813 enum obj_operation_type op_type; 4786 enum obj_operation_type op_type;
3814 u64 mapping_size; 4787 u64 mapping_size;
3815 bool must_be_locked;
3816 int result; 4788 int result;
3817 4789
3818 switch (req_op(rq)) { 4790 switch (req_op(rq)) {
@@ -3886,21 +4858,10 @@ static void rbd_queue_workfn(struct work_struct *work)
3886 goto err_rq; 4858 goto err_rq;
3887 } 4859 }
3888 4860
3889 must_be_locked =
3890 (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
3891 (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
3892 if (must_be_locked) {
3893 down_read(&rbd_dev->lock_rwsem);
3894 result = rbd_wait_state_locked(rbd_dev,
3895 !rbd_dev->opts->exclusive);
3896 if (result)
3897 goto err_unlock;
3898 }
3899
3900 img_request = rbd_img_request_create(rbd_dev, op_type, snapc); 4861 img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
3901 if (!img_request) { 4862 if (!img_request) {
3902 result = -ENOMEM; 4863 result = -ENOMEM;
3903 goto err_unlock; 4864 goto err_rq;
3904 } 4865 }
3905 img_request->rq = rq; 4866 img_request->rq = rq;
3906 snapc = NULL; /* img_request consumes a ref */ 4867 snapc = NULL; /* img_request consumes a ref */
@@ -3910,19 +4871,14 @@ static void rbd_queue_workfn(struct work_struct *work)
3910 else 4871 else
3911 result = rbd_img_fill_from_bio(img_request, offset, length, 4872 result = rbd_img_fill_from_bio(img_request, offset, length,
3912 rq->bio); 4873 rq->bio);
3913 if (result || !img_request->pending_count) 4874 if (result)
3914 goto err_img_request; 4875 goto err_img_request;
3915 4876
3916 rbd_img_request_submit(img_request); 4877 rbd_img_handle_request(img_request, 0);
3917 if (must_be_locked)
3918 up_read(&rbd_dev->lock_rwsem);
3919 return; 4878 return;
3920 4879
3921err_img_request: 4880err_img_request:
3922 rbd_img_request_put(img_request); 4881 rbd_img_request_put(img_request);
3923err_unlock:
3924 if (must_be_locked)
3925 up_read(&rbd_dev->lock_rwsem);
3926err_rq: 4882err_rq:
3927 if (result) 4883 if (result)
3928 rbd_warn(rbd_dev, "%s %llx at %llx result %d", 4884 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
@@ -4589,7 +5545,13 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4589 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); 5545 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4590 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); 5546 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4591 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); 5547 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4592 init_waitqueue_head(&rbd_dev->lock_waitq); 5548 spin_lock_init(&rbd_dev->lock_lists_lock);
5549 INIT_LIST_HEAD(&rbd_dev->acquiring_list);
5550 INIT_LIST_HEAD(&rbd_dev->running_list);
5551 init_completion(&rbd_dev->acquire_wait);
5552 init_completion(&rbd_dev->releasing_wait);
5553
5554 spin_lock_init(&rbd_dev->object_map_lock);
4593 5555
4594 rbd_dev->dev.bus = &rbd_bus_type; 5556 rbd_dev->dev.bus = &rbd_bus_type;
4595 rbd_dev->dev.type = &rbd_device_type; 5557 rbd_dev->dev.type = &rbd_device_type;
@@ -4772,6 +5734,32 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4772 &rbd_dev->header.features); 5734 &rbd_dev->header.features);
4773} 5735}
4774 5736
5737/*
5738 * These are generic image flags, but since they are used only for
5739 * object map, store them in rbd_dev->object_map_flags.
5740 *
5741 * For the same reason, this function is called only on object map
5742 * (re)load and not on header refresh.
5743 */
5744static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5745{
5746 __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5747 __le64 flags;
5748 int ret;
5749
5750 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5751 &rbd_dev->header_oloc, "get_flags",
5752 &snapid, sizeof(snapid),
5753 &flags, sizeof(flags));
5754 if (ret < 0)
5755 return ret;
5756 if (ret < sizeof(flags))
5757 return -EBADMSG;
5758
5759 rbd_dev->object_map_flags = le64_to_cpu(flags);
5760 return 0;
5761}
5762
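get_flags returns a little-endian u64 on the wire; the function above rejects short replies with -EBADMSG and byte-swaps with le64_to_cpu(). A portable userspace equivalent of that decode step (buffer contents invented):

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for le64_to_cpu(): assemble the value byte by
 * byte so it works on hosts of either endianness. */
static uint64_t le64_to_host(const uint8_t *p)
{
	uint64_t v = 0;

	for (int i = 7; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}

int main(void)
{
	uint8_t reply[8] = { 0x01, 0, 0, 0, 0, 0, 0, 0 };  /* flags = 1 */
	size_t reply_len = sizeof(reply);

	if (reply_len < sizeof(uint64_t))
		return 1;	/* the driver would return -EBADMSG here */

	printf("object_map_flags = %llu\n",
	       (unsigned long long)le64_to_host(reply));
	return 0;
}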
4775struct parent_image_info { 5763struct parent_image_info {
4776 u64 pool_id; 5764 u64 pool_id;
4777 const char *pool_ns; 5765 const char *pool_ns;
@@ -4829,7 +5817,7 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
4829 5817
4830 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 5818 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4831 "rbd", "parent_get", CEPH_OSD_FLAG_READ, 5819 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
4832 req_page, sizeof(u64), reply_page, &reply_len); 5820 req_page, sizeof(u64), &reply_page, &reply_len);
4833 if (ret) 5821 if (ret)
4834 return ret == -EOPNOTSUPP ? 1 : ret; 5822 return ret == -EOPNOTSUPP ? 1 : ret;
4835 5823
@@ -4841,7 +5829,7 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
4841 5829
4842 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 5830 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4843 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ, 5831 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
4844 req_page, sizeof(u64), reply_page, &reply_len); 5832 req_page, sizeof(u64), &reply_page, &reply_len);
4845 if (ret) 5833 if (ret)
4846 return ret; 5834 return ret;
4847 5835
@@ -4872,7 +5860,7 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
4872 5860
4873 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 5861 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4874 "rbd", "get_parent", CEPH_OSD_FLAG_READ, 5862 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
4875 req_page, sizeof(u64), reply_page, &reply_len); 5863 req_page, sizeof(u64), &reply_page, &reply_len);
4876 if (ret) 5864 if (ret)
4877 return ret; 5865 return ret;
4878 5866
@@ -5605,28 +6593,49 @@ static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5605{ 6593{
5606 down_write(&rbd_dev->lock_rwsem); 6594 down_write(&rbd_dev->lock_rwsem);
5607 if (__rbd_is_lock_owner(rbd_dev)) 6595 if (__rbd_is_lock_owner(rbd_dev))
5608 rbd_unlock(rbd_dev); 6596 __rbd_release_lock(rbd_dev);
5609 up_write(&rbd_dev->lock_rwsem); 6597 up_write(&rbd_dev->lock_rwsem);
5610} 6598}
5611 6599
6600/*
6601 * If the wait is interrupted, an error is returned even if the lock
6602 * was successfully acquired. rbd_dev_image_unlock() will release it
6603 * if needed.
6604 */
5612static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) 6605static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5613{ 6606{
5614 int ret; 6607 long ret;
5615 6608
5616 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) { 6609 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6610 if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6611 return 0;
6612
5617 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled"); 6613 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5618 return -EINVAL; 6614 return -EINVAL;
5619 } 6615 }
5620 6616
5621 /* FIXME: "rbd map --exclusive" should be in interruptible */ 6617 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
5622 down_read(&rbd_dev->lock_rwsem); 6618 return 0;
5623 ret = rbd_wait_state_locked(rbd_dev, true); 6619
5624 up_read(&rbd_dev->lock_rwsem); 6620 rbd_assert(!rbd_is_lock_owner(rbd_dev));
6621 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6622 ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6623 ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
6624 if (ret > 0)
6625 ret = rbd_dev->acquire_err;
6626 else if (!ret)
6627 ret = -ETIMEDOUT;
6628
5625 if (ret) { 6629 if (ret) {
5626 rbd_warn(rbd_dev, "failed to acquire exclusive lock"); 6630 rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
5627 return -EROFS; 6631 return ret;
5628 } 6632 }
5629 6633
6634 /*
6635 * The lock may have been released by now, unless automatic lock
6636 * transitions are disabled.
6637 */
6638 rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
5630 return 0; 6639 return 0;
5631} 6640}
5632 6641
@@ -5724,6 +6733,8 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5724 struct rbd_image_header *header; 6733 struct rbd_image_header *header;
5725 6734
5726 rbd_dev_parent_put(rbd_dev); 6735 rbd_dev_parent_put(rbd_dev);
6736 rbd_object_map_free(rbd_dev);
6737 rbd_dev_mapping_clear(rbd_dev);
5727 6738
5728 /* Free dynamic fields from the header, then zero it out */ 6739 /* Free dynamic fields from the header, then zero it out */
5729 6740
@@ -5824,7 +6835,6 @@ out_err:
5824static void rbd_dev_device_release(struct rbd_device *rbd_dev) 6835static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5825{ 6836{
5826 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 6837 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5827 rbd_dev_mapping_clear(rbd_dev);
5828 rbd_free_disk(rbd_dev); 6838 rbd_free_disk(rbd_dev);
5829 if (!single_major) 6839 if (!single_major)
5830 unregister_blkdev(rbd_dev->major, rbd_dev->name); 6840 unregister_blkdev(rbd_dev->major, rbd_dev->name);
@@ -5858,23 +6868,17 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5858 if (ret) 6868 if (ret)
5859 goto err_out_blkdev; 6869 goto err_out_blkdev;
5860 6870
5861 ret = rbd_dev_mapping_set(rbd_dev);
5862 if (ret)
5863 goto err_out_disk;
5864
5865 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 6871 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
5866 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only); 6872 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
5867 6873
5868 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 6874 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5869 if (ret) 6875 if (ret)
5870 goto err_out_mapping; 6876 goto err_out_disk;
5871 6877
5872 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 6878 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5873 up_write(&rbd_dev->header_rwsem); 6879 up_write(&rbd_dev->header_rwsem);
5874 return 0; 6880 return 0;
5875 6881
5876err_out_mapping:
5877 rbd_dev_mapping_clear(rbd_dev);
5878err_out_disk: 6882err_out_disk:
5879 rbd_free_disk(rbd_dev); 6883 rbd_free_disk(rbd_dev);
5880err_out_blkdev: 6884err_out_blkdev:
@@ -5975,6 +6979,17 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
5975 goto err_out_probe; 6979 goto err_out_probe;
5976 } 6980 }
5977 6981
6982 ret = rbd_dev_mapping_set(rbd_dev);
6983 if (ret)
6984 goto err_out_probe;
6985
6986 if (rbd_dev->spec->snap_id != CEPH_NOSNAP &&
6987 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
6988 ret = rbd_object_map_load(rbd_dev);
6989 if (ret)
6990 goto err_out_probe;
6991 }
6992
5978 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 6993 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5979 ret = rbd_dev_v2_parent_info(rbd_dev); 6994 ret = rbd_dev_v2_parent_info(rbd_dev);
5980 if (ret) 6995 if (ret)
@@ -6071,11 +7086,9 @@ static ssize_t do_rbd_add(struct bus_type *bus,
6071 if (rc) 7086 if (rc)
6072 goto err_out_image_probe; 7087 goto err_out_image_probe;
6073 7088
6074 if (rbd_dev->opts->exclusive) { 7089 rc = rbd_add_acquire_lock(rbd_dev);
6075 rc = rbd_add_acquire_lock(rbd_dev); 7090 if (rc)
6076 if (rc) 7091 goto err_out_image_lock;
6077 goto err_out_device_setup;
6078 }
6079 7092
6080 /* Everything's ready. Announce the disk to the world. */ 7093 /* Everything's ready. Announce the disk to the world. */
6081 7094
@@ -6101,7 +7114,6 @@ out:
6101 7114
6102err_out_image_lock: 7115err_out_image_lock:
6103 rbd_dev_image_unlock(rbd_dev); 7116 rbd_dev_image_unlock(rbd_dev);
6104err_out_device_setup:
6105 rbd_dev_device_release(rbd_dev); 7117 rbd_dev_device_release(rbd_dev);
6106err_out_image_probe: 7118err_out_image_probe:
6107 rbd_dev_image_release(rbd_dev); 7119 rbd_dev_image_release(rbd_dev);
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index 62ff50d3e7a6..ac98ab6ccd3b 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -18,6 +18,7 @@
18/* For format version 2, rbd image 'foo' consists of objects 18/* For format version 2, rbd image 'foo' consists of objects
19 * rbd_id.foo - id of image 19 * rbd_id.foo - id of image
20 * rbd_header.<id> - image metadata 20 * rbd_header.<id> - image metadata
21 * rbd_object_map.<id> - optional image object map
21 * rbd_data.<id>.0000000000000000 22 * rbd_data.<id>.0000000000000000
22 * rbd_data.<id>.0000000000000001 23 * rbd_data.<id>.0000000000000001
23 * ... - data 24 * ... - data
@@ -25,6 +26,7 @@
25 */ 26 */
26 27
27#define RBD_HEADER_PREFIX "rbd_header." 28#define RBD_HEADER_PREFIX "rbd_header."
29#define RBD_OBJECT_MAP_PREFIX "rbd_object_map."
28#define RBD_ID_PREFIX "rbd_id." 30#define RBD_ID_PREFIX "rbd_id."
29#define RBD_V2_DATA_FORMAT "%s.%016llx" 31#define RBD_V2_DATA_FORMAT "%s.%016llx"
30 32
@@ -39,6 +41,14 @@ enum rbd_notify_op {
39 RBD_NOTIFY_OP_HEADER_UPDATE = 3, 41 RBD_NOTIFY_OP_HEADER_UPDATE = 3,
40}; 42};
41 43
44#define OBJECT_NONEXISTENT 0
45#define OBJECT_EXISTS 1
46#define OBJECT_PENDING 2
47#define OBJECT_EXISTS_CLEAN 3
48
49#define RBD_FLAG_OBJECT_MAP_INVALID (1ULL << 0)
50#define RBD_FLAG_FAST_DIFF_INVALID (1ULL << 1)
51
42/* 52/*
43 * For format version 1, rbd image 'foo' consists of objects 53 * For format version 1, rbd image 'foo' consists of objects
44 * foo.rbd - image metadata 54 * foo.rbd - image metadata
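
The four OBJECT_* values above are per-object map states (plain values, not
a bitmask), while the RBD_FLAG_* values are individual bits in the per-image
flags word fetched by rbd_dev_v2_get_flags(). A short hedged sketch of how a
consumer might test them, with object_map_flags standing in for
rbd_dev->object_map_flags:

    #include <linux/types.h>

    /* Fast-diff results are only trustworthy while neither invalidity
     * bit is set in the image flags. */
    static bool fast_diff_usable(u64 object_map_flags)
    {
            return !(object_map_flags & (RBD_FLAG_OBJECT_MAP_INVALID |
                                         RBD_FLAG_FAST_DIFF_INVALID));
    }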
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 7f7d92d6b024..cf235f6eacf9 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -36,3 +36,15 @@ config CEPH_FS_POSIX_ACL
36 groups beyond the owner/group/world scheme. 36 groups beyond the owner/group/world scheme.
37 37
38 If you don't know what Access Control Lists are, say N 38 If you don't know what Access Control Lists are, say N
39
40config CEPH_FS_SECURITY_LABEL
41 bool "CephFS Security Labels"
42 depends on CEPH_FS && SECURITY
43 help
44 Security labels support alternative access control models
45 implemented by security modules like SELinux. This option
46 enables an extended attribute handler for file security
47 labels in the Ceph filesystem.
48
49 If you are not using a security module that requires using
50 extended attributes for file security labels, say N.
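
As a usage example, a build that wants SELinux labels on CephFS files would
enable the new option alongside its dependencies in .config (a minimal
fragment, assuming the rest of the configuration is already in place):

    CONFIG_SECURITY=y
    CONFIG_SECURITY_SELINUX=y
    CONFIG_CEPH_FS=m
    CONFIG_CEPH_FS_SECURITY_LABEL=y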
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 8a19c249036c..aa55f412a6e3 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -159,7 +159,7 @@ out:
159} 159}
160 160
161int ceph_pre_init_acls(struct inode *dir, umode_t *mode, 161int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
162 struct ceph_acls_info *info) 162 struct ceph_acl_sec_ctx *as_ctx)
163{ 163{
164 struct posix_acl *acl, *default_acl; 164 struct posix_acl *acl, *default_acl;
165 size_t val_size1 = 0, val_size2 = 0; 165 size_t val_size1 = 0, val_size2 = 0;
@@ -234,9 +234,9 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
234 234
235 kfree(tmp_buf); 235 kfree(tmp_buf);
236 236
237 info->acl = acl; 237 as_ctx->acl = acl;
238 info->default_acl = default_acl; 238 as_ctx->default_acl = default_acl;
239 info->pagelist = pagelist; 239 as_ctx->pagelist = pagelist;
240 return 0; 240 return 0;
241 241
242out_err: 242out_err:
@@ -248,18 +248,10 @@ out_err:
248 return err; 248 return err;
249} 249}
250 250
251void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info) 251void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx)
252{ 252{
253 if (!inode) 253 if (!inode)
254 return; 254 return;
255 ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl); 255 ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, as_ctx->acl);
256 ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl); 256 ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, as_ctx->default_acl);
257}
258
259void ceph_release_acls_info(struct ceph_acls_info *info)
260{
261 posix_acl_release(info->acl);
262 posix_acl_release(info->default_acl);
263 if (info->pagelist)
264 ceph_pagelist_release(info->pagelist);
265} 257}
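
With ACL and security-context state merged into ceph_acl_sec_ctx, the
separate ceph_release_acls_info() disappears and a combined release helper
takes over. A hedged sketch of what such a helper looks like; the
sec_ctx/sec_ctxlen fields are assumptions about the new structure, not
quoted from this series:

    /* Sketch: release everything a ceph_acl_sec_ctx may hold. */
    static void release_acl_sec_ctx_sketch(struct ceph_acl_sec_ctx *as_ctx)
    {
            posix_acl_release(as_ctx->acl);
            posix_acl_release(as_ctx->default_acl);
            if (as_ctx->sec_ctx)
                    security_release_secctx(as_ctx->sec_ctx,
                                            as_ctx->sec_ctxlen);
            if (as_ctx->pagelist)
                    ceph_pagelist_release(as_ctx->pagelist);
    }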
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a47c541f8006..e078cc55b989 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -10,6 +10,7 @@
10#include <linux/pagevec.h> 10#include <linux/pagevec.h>
11#include <linux/task_io_accounting_ops.h> 11#include <linux/task_io_accounting_ops.h>
12#include <linux/signal.h> 12#include <linux/signal.h>
13#include <linux/iversion.h>
13 14
14#include "super.h" 15#include "super.h"
15#include "mds_client.h" 16#include "mds_client.h"
@@ -1576,6 +1577,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
1576 1577
1577 /* Update time before taking page lock */ 1578 /* Update time before taking page lock */
1578 file_update_time(vma->vm_file); 1579 file_update_time(vma->vm_file);
1580 inode_inc_iversion_raw(inode);
1579 1581
1580 do { 1582 do {
1581 lock_page(page); 1583 lock_page(page);
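
The inode_inc_iversion_raw() call bumps the NFS-style change attribute on
each mmap write fault. The "raw" i_version helpers used throughout this
series manipulate the counter directly, without the queried-bit bookkeeping
local filesystems use; all of them come from <linux/iversion.h>. A minimal
sketch of the three operations in play:

    #include <linux/iversion.h>

    static void change_attr_sketch(struct inode *inode, u64 mds_value)
    {
            inode_set_iversion_raw(inode, 0);       /* fresh inode */
            inode_inc_iversion_raw(inode);          /* local change */
            /* adopt the MDS's view, but never move backwards: */
            inode_set_max_iversion_raw(inode, mds_value);
    }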
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0176241eaea7..d98dcd976c80 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -8,6 +8,7 @@
8#include <linux/vmalloc.h> 8#include <linux/vmalloc.h>
9#include <linux/wait.h> 9#include <linux/wait.h>
10#include <linux/writeback.h> 10#include <linux/writeback.h>
11#include <linux/iversion.h>
11 12
12#include "super.h" 13#include "super.h"
13#include "mds_client.h" 14#include "mds_client.h"
@@ -1138,8 +1139,9 @@ struct cap_msg_args {
1138 u64 ino, cid, follows; 1139 u64 ino, cid, follows;
1139 u64 flush_tid, oldest_flush_tid, size, max_size; 1140 u64 flush_tid, oldest_flush_tid, size, max_size;
1140 u64 xattr_version; 1141 u64 xattr_version;
1142 u64 change_attr;
1141 struct ceph_buffer *xattr_buf; 1143 struct ceph_buffer *xattr_buf;
1142 struct timespec64 atime, mtime, ctime; 1144 struct timespec64 atime, mtime, ctime, btime;
1143 int op, caps, wanted, dirty; 1145 int op, caps, wanted, dirty;
1144 u32 seq, issue_seq, mseq, time_warp_seq; 1146 u32 seq, issue_seq, mseq, time_warp_seq;
1145 u32 flags; 1147 u32 flags;
@@ -1160,7 +1162,6 @@ static int send_cap_msg(struct cap_msg_args *arg)
1160 struct ceph_msg *msg; 1162 struct ceph_msg *msg;
1161 void *p; 1163 void *p;
1162 size_t extra_len; 1164 size_t extra_len;
1163 struct timespec64 zerotime = {0};
1164 struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; 1165 struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
1165 1166
1166 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" 1167 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
@@ -1245,15 +1246,10 @@ static int send_cap_msg(struct cap_msg_args *arg)
1245 /* pool namespace (version 8) (mds always ignores this) */ 1246 /* pool namespace (version 8) (mds always ignores this) */
1246 ceph_encode_32(&p, 0); 1247 ceph_encode_32(&p, 0);
1247 1248
1248 /* 1249 /* btime and change_attr (version 9) */
1249 * btime and change_attr (version 9) 1250 ceph_encode_timespec64(p, &arg->btime);
1250 *
1251 * We just zero these out for now, as the MDS ignores them unless
1252 * the requisite feature flags are set (which we don't do yet).
1253 */
1254 ceph_encode_timespec64(p, &zerotime);
1255 p += sizeof(struct ceph_timespec); 1251 p += sizeof(struct ceph_timespec);
1256 ceph_encode_64(&p, 0); 1252 ceph_encode_64(&p, arg->change_attr);
1257 1253
1258 /* Advisory flags (version 10) */ 1254 /* Advisory flags (version 10) */
1259 ceph_encode_32(&p, arg->flags); 1255 ceph_encode_32(&p, arg->flags);
@@ -1263,20 +1259,22 @@ static int send_cap_msg(struct cap_msg_args *arg)
1263} 1259}
1264 1260
1265/* 1261/*
1266 * Queue cap releases when an inode is dropped from our cache. Since 1262 * Queue cap releases when an inode is dropped from our cache.
1267 * inode is about to be destroyed, there is no need for i_ceph_lock.
1268 */ 1263 */
1269void __ceph_remove_caps(struct inode *inode) 1264void __ceph_remove_caps(struct ceph_inode_info *ci)
1270{ 1265{
1271 struct ceph_inode_info *ci = ceph_inode(inode);
1272 struct rb_node *p; 1266 struct rb_node *p;
1273 1267
1268 /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
1269 * may call __ceph_caps_issued_mask() on a freeing inode. */
1270 spin_lock(&ci->i_ceph_lock);
1274 p = rb_first(&ci->i_caps); 1271 p = rb_first(&ci->i_caps);
1275 while (p) { 1272 while (p) {
1276 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); 1273 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
1277 p = rb_next(p); 1274 p = rb_next(p);
1278 __ceph_remove_cap(cap, true); 1275 __ceph_remove_cap(cap, true);
1279 } 1276 }
1277 spin_unlock(&ci->i_ceph_lock);
1280} 1278}
1281 1279
1282/* 1280/*
@@ -1297,7 +1295,7 @@ void __ceph_remove_caps(struct inode *inode)
1297 * caller should hold snap_rwsem (read), s_mutex. 1295 * caller should hold snap_rwsem (read), s_mutex.
1298 */ 1296 */
1299static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, 1297static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1300 int op, bool sync, int used, int want, int retain, 1298 int op, int flags, int used, int want, int retain,
1301 int flushing, u64 flush_tid, u64 oldest_flush_tid) 1299 int flushing, u64 flush_tid, u64 oldest_flush_tid)
1302 __releases(cap->ci->i_ceph_lock) 1300 __releases(cap->ci->i_ceph_lock)
1303{ 1301{
@@ -1377,6 +1375,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1377 arg.mtime = inode->i_mtime; 1375 arg.mtime = inode->i_mtime;
1378 arg.atime = inode->i_atime; 1376 arg.atime = inode->i_atime;
1379 arg.ctime = inode->i_ctime; 1377 arg.ctime = inode->i_ctime;
1378 arg.btime = ci->i_btime;
1379 arg.change_attr = inode_peek_iversion_raw(inode);
1380 1380
1381 arg.op = op; 1381 arg.op = op;
1382 arg.caps = cap->implemented; 1382 arg.caps = cap->implemented;
@@ -1393,12 +1393,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1393 arg.mode = inode->i_mode; 1393 arg.mode = inode->i_mode;
1394 1394
1395 arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; 1395 arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
1396 if (list_empty(&ci->i_cap_snaps)) 1396 if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
1397 arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP; 1397 !list_empty(&ci->i_cap_snaps)) {
1398 else 1398 struct ceph_cap_snap *capsnap;
1399 arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP; 1399 list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
1400 if (sync) 1400 if (capsnap->cap_flush.tid)
1401 arg.flags |= CEPH_CLIENT_CAPS_SYNC; 1401 break;
1402 if (capsnap->need_flush) {
1403 flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
1404 break;
1405 }
1406 }
1407 }
1408 arg.flags = flags;
1402 1409
1403 spin_unlock(&ci->i_ceph_lock); 1410 spin_unlock(&ci->i_ceph_lock);
1404 1411
@@ -1436,6 +1443,8 @@ static inline int __send_flush_snap(struct inode *inode,
1436 arg.atime = capsnap->atime; 1443 arg.atime = capsnap->atime;
1437 arg.mtime = capsnap->mtime; 1444 arg.mtime = capsnap->mtime;
1438 arg.ctime = capsnap->ctime; 1445 arg.ctime = capsnap->ctime;
1446 arg.btime = capsnap->btime;
1447 arg.change_attr = capsnap->change_attr;
1439 1448
1440 arg.op = CEPH_CAP_OP_FLUSHSNAP; 1449 arg.op = CEPH_CAP_OP_FLUSHSNAP;
1441 arg.caps = capsnap->issued; 1450 arg.caps = capsnap->issued;
@@ -1603,10 +1612,8 @@ retry:
1603 } 1612 }
1604 1613
1605 // make sure flushsnap messages are sent in proper order. 1614 // make sure flushsnap messages are sent in proper order.
1606 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { 1615 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
1607 __kick_flushing_caps(mdsc, session, ci, 0); 1616 __kick_flushing_caps(mdsc, session, ci, 0);
1608 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
1609 }
1610 1617
1611 __ceph_flush_snaps(ci, session); 1618 __ceph_flush_snaps(ci, session);
1612out: 1619out:
@@ -2048,10 +2055,8 @@ ack:
2048 if (cap == ci->i_auth_cap && 2055 if (cap == ci->i_auth_cap &&
2049 (ci->i_ceph_flags & 2056 (ci->i_ceph_flags &
2050 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { 2057 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
2051 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { 2058 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2052 __kick_flushing_caps(mdsc, session, ci, 0); 2059 __kick_flushing_caps(mdsc, session, ci, 0);
2053 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2054 }
2055 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) 2060 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2056 __ceph_flush_snaps(ci, session); 2061 __ceph_flush_snaps(ci, session);
2057 2062
@@ -2087,7 +2092,7 @@ ack:
2087 sent++; 2092 sent++;
2088 2093
2089 /* __send_cap drops i_ceph_lock */ 2094 /* __send_cap drops i_ceph_lock */
2090 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false, 2095 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
2091 cap_used, want, retain, flushing, 2096 cap_used, want, retain, flushing,
2092 flush_tid, oldest_flush_tid); 2097 flush_tid, oldest_flush_tid);
2093 goto retry; /* retake i_ceph_lock and restart our cap scan. */ 2098 goto retry; /* retake i_ceph_lock and restart our cap scan. */
@@ -2121,6 +2126,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
2121 2126
2122retry: 2127retry:
2123 spin_lock(&ci->i_ceph_lock); 2128 spin_lock(&ci->i_ceph_lock);
2129retry_locked:
2124 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { 2130 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
2125 spin_unlock(&ci->i_ceph_lock); 2131 spin_unlock(&ci->i_ceph_lock);
2126 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); 2132 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
@@ -2128,8 +2134,6 @@ retry:
2128 } 2134 }
2129 if (ci->i_dirty_caps && ci->i_auth_cap) { 2135 if (ci->i_dirty_caps && ci->i_auth_cap) {
2130 struct ceph_cap *cap = ci->i_auth_cap; 2136 struct ceph_cap *cap = ci->i_auth_cap;
2131 int used = __ceph_caps_used(ci);
2132 int want = __ceph_caps_wanted(ci);
2133 int delayed; 2137 int delayed;
2134 2138
2135 if (!session || session != cap->session) { 2139 if (!session || session != cap->session) {
@@ -2145,13 +2149,25 @@ retry:
2145 goto out; 2149 goto out;
2146 } 2150 }
2147 2151
2152 if (ci->i_ceph_flags &
2153 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) {
2154 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2155 __kick_flushing_caps(mdsc, session, ci, 0);
2156 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2157 __ceph_flush_snaps(ci, session);
2158 goto retry_locked;
2159 }
2160
2148 flushing = __mark_caps_flushing(inode, session, true, 2161 flushing = __mark_caps_flushing(inode, session, true,
2149 &flush_tid, &oldest_flush_tid); 2162 &flush_tid, &oldest_flush_tid);
2150 2163
2151 /* __send_cap drops i_ceph_lock */ 2164 /* __send_cap drops i_ceph_lock */
2152 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true, 2165 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
2153 used, want, (cap->issued | cap->implemented), 2166 CEPH_CLIENT_CAPS_SYNC,
2154 flushing, flush_tid, oldest_flush_tid); 2167 __ceph_caps_used(ci),
2168 __ceph_caps_wanted(ci),
2169 (cap->issued | cap->implemented),
2170 flushing, flush_tid, oldest_flush_tid);
2155 2171
2156 if (delayed) { 2172 if (delayed) {
2157 spin_lock(&ci->i_ceph_lock); 2173 spin_lock(&ci->i_ceph_lock);
@@ -2320,6 +2336,16 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
2320 struct ceph_cap_flush *cf; 2336 struct ceph_cap_flush *cf;
2321 int ret; 2337 int ret;
2322 u64 first_tid = 0; 2338 u64 first_tid = 0;
2339 u64 last_snap_flush = 0;
2340
2341 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2342
2343 list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
2344 if (!cf->caps) {
2345 last_snap_flush = cf->tid;
2346 break;
2347 }
2348 }
2323 2349
2324 list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { 2350 list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
2325 if (cf->tid < first_tid) 2351 if (cf->tid < first_tid)
@@ -2338,10 +2364,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
2338 dout("kick_flushing_caps %p cap %p tid %llu %s\n", 2364 dout("kick_flushing_caps %p cap %p tid %llu %s\n",
2339 inode, cap, cf->tid, ceph_cap_string(cf->caps)); 2365 inode, cap, cf->tid, ceph_cap_string(cf->caps));
2340 ci->i_ceph_flags |= CEPH_I_NODELAY; 2366 ci->i_ceph_flags |= CEPH_I_NODELAY;
2367
2341 ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, 2368 ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
2342 false, __ceph_caps_used(ci), 2369 (cf->tid < last_snap_flush ?
2370 CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
2371 __ceph_caps_used(ci),
2343 __ceph_caps_wanted(ci), 2372 __ceph_caps_wanted(ci),
2344 cap->issued | cap->implemented, 2373 (cap->issued | cap->implemented),
2345 cf->caps, cf->tid, oldest_flush_tid); 2374 cf->caps, cf->tid, oldest_flush_tid);
2346 if (ret) { 2375 if (ret) {
2347 pr_err("kick_flushing_caps: error sending " 2376 pr_err("kick_flushing_caps: error sending "
@@ -2410,7 +2439,6 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
2410 */ 2439 */
2411 if ((cap->issued & ci->i_flushing_caps) != 2440 if ((cap->issued & ci->i_flushing_caps) !=
2412 ci->i_flushing_caps) { 2441 ci->i_flushing_caps) {
2413 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2414 /* encode_caps_cb() also will reset these sequence 2442 /* encode_caps_cb() also will reset these sequence
2415 * numbers. make sure sequence numbers in cap flush 2443 * numbers. make sure sequence numbers in cap flush
2416 * message match later reconnect message */ 2444 * message match later reconnect message */
@@ -2450,7 +2478,6 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
2450 continue; 2478 continue;
2451 } 2479 }
2452 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { 2480 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
2453 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2454 __kick_flushing_caps(mdsc, session, ci, 2481 __kick_flushing_caps(mdsc, session, ci,
2455 oldest_flush_tid); 2482 oldest_flush_tid);
2456 } 2483 }
@@ -2478,7 +2505,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
2478 oldest_flush_tid = __get_oldest_flush_tid(mdsc); 2505 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2479 spin_unlock(&mdsc->cap_dirty_lock); 2506 spin_unlock(&mdsc->cap_dirty_lock);
2480 2507
2481 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2482 __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); 2508 __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
2483 spin_unlock(&ci->i_ceph_lock); 2509 spin_unlock(&ci->i_ceph_lock);
2484 } else { 2510 } else {
@@ -3040,8 +3066,10 @@ struct cap_extra_info {
3040 bool dirstat_valid; 3066 bool dirstat_valid;
3041 u64 nfiles; 3067 u64 nfiles;
3042 u64 nsubdirs; 3068 u64 nsubdirs;
3069 u64 change_attr;
3043 /* currently issued */ 3070 /* currently issued */
3044 int issued; 3071 int issued;
3072 struct timespec64 btime;
3045}; 3073};
3046 3074
3047/* 3075/*
@@ -3123,11 +3151,14 @@ static void handle_cap_grant(struct inode *inode,
3123 3151
3124 __check_cap_issue(ci, cap, newcaps); 3152 __check_cap_issue(ci, cap, newcaps);
3125 3153
3154 inode_set_max_iversion_raw(inode, extra_info->change_attr);
3155
3126 if ((newcaps & CEPH_CAP_AUTH_SHARED) && 3156 if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
3127 (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { 3157 (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
3128 inode->i_mode = le32_to_cpu(grant->mode); 3158 inode->i_mode = le32_to_cpu(grant->mode);
3129 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); 3159 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
3130 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); 3160 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
3161 ci->i_btime = extra_info->btime;
3131 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, 3162 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
3132 from_kuid(&init_user_ns, inode->i_uid), 3163 from_kuid(&init_user_ns, inode->i_uid),
3133 from_kgid(&init_user_ns, inode->i_gid)); 3164 from_kgid(&init_user_ns, inode->i_gid));
@@ -3154,6 +3185,7 @@ static void handle_cap_grant(struct inode *inode,
3154 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); 3185 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
3155 ci->i_xattrs.version = version; 3186 ci->i_xattrs.version = version;
3156 ceph_forget_all_cached_acls(inode); 3187 ceph_forget_all_cached_acls(inode);
3188 ceph_security_invalidate_secctx(inode);
3157 } 3189 }
3158 } 3190 }
3159 3191
@@ -3848,17 +3880,19 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3848 } 3880 }
3849 } 3881 }
3850 3882
3851 if (msg_version >= 11) { 3883 if (msg_version >= 9) {
3852 struct ceph_timespec *btime; 3884 struct ceph_timespec *btime;
3853 u64 change_attr;
3854 u32 flags;
3855 3885
3856 /* version >= 9 */
3857 if (p + sizeof(*btime) > end) 3886 if (p + sizeof(*btime) > end)
3858 goto bad; 3887 goto bad;
3859 btime = p; 3888 btime = p;
3889 ceph_decode_timespec64(&extra_info.btime, btime);
3860 p += sizeof(*btime); 3890 p += sizeof(*btime);
3861 ceph_decode_64_safe(&p, end, change_attr, bad); 3891 ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
3892 }
3893
3894 if (msg_version >= 11) {
3895 u32 flags;
3862 /* version >= 10 */ 3896 /* version >= 10 */
3863 ceph_decode_32_safe(&p, end, flags, bad); 3897 ceph_decode_32_safe(&p, end, flags, bad);
3864 /* version >= 11 */ 3898 /* version >= 11 */
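
The decode fix above matters because btime and change_attr are version 9
fields: the old code only consumed them when msg_version was 11 or later, so
values from v9 and v10 peers were silently dropped. The corrected shape
gates each field at the version that introduced it, cumulatively. A generic
sketch of the pattern, where decode_ts()/decode_u64()/decode_u32() stand in
for the ceph_decode_* helpers:

    if (msg_version >= 9) {
            decode_ts(&p, end, &btime);             /* added in v9 */
            decode_u64(&p, end, &change_attr);      /* added in v9 */
    }
    if (msg_version >= 11) {
            decode_u32(&p, end, &flags);            /* v10 field */
            /* ... v11 fields follow ... */
    }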
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 83cd41fa2b01..2eb88ed22993 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -52,7 +52,7 @@ static int mdsc_show(struct seq_file *s, void *p)
52 struct ceph_mds_client *mdsc = fsc->mdsc; 52 struct ceph_mds_client *mdsc = fsc->mdsc;
53 struct ceph_mds_request *req; 53 struct ceph_mds_request *req;
54 struct rb_node *rp; 54 struct rb_node *rp;
55 int pathlen; 55 int pathlen = 0;
56 u64 pathbase; 56 u64 pathbase;
57 char *path; 57 char *path;
58 58
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0637149fb9f9..aab29f48c62d 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -825,7 +825,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
825 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 825 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
826 struct ceph_mds_client *mdsc = fsc->mdsc; 826 struct ceph_mds_client *mdsc = fsc->mdsc;
827 struct ceph_mds_request *req; 827 struct ceph_mds_request *req;
828 struct ceph_acls_info acls = {}; 828 struct ceph_acl_sec_ctx as_ctx = {};
829 int err; 829 int err;
830 830
831 if (ceph_snap(dir) != CEPH_NOSNAP) 831 if (ceph_snap(dir) != CEPH_NOSNAP)
@@ -836,7 +836,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
836 goto out; 836 goto out;
837 } 837 }
838 838
839 err = ceph_pre_init_acls(dir, &mode, &acls); 839 err = ceph_pre_init_acls(dir, &mode, &as_ctx);
840 if (err < 0)
841 goto out;
842 err = ceph_security_init_secctx(dentry, mode, &as_ctx);
840 if (err < 0) 843 if (err < 0)
841 goto out; 844 goto out;
842 845
@@ -855,9 +858,9 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
855 req->r_args.mknod.rdev = cpu_to_le32(rdev); 858 req->r_args.mknod.rdev = cpu_to_le32(rdev);
856 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; 859 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
857 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 860 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
858 if (acls.pagelist) { 861 if (as_ctx.pagelist) {
859 req->r_pagelist = acls.pagelist; 862 req->r_pagelist = as_ctx.pagelist;
860 acls.pagelist = NULL; 863 as_ctx.pagelist = NULL;
861 } 864 }
862 err = ceph_mdsc_do_request(mdsc, dir, req); 865 err = ceph_mdsc_do_request(mdsc, dir, req);
863 if (!err && !req->r_reply_info.head->is_dentry) 866 if (!err && !req->r_reply_info.head->is_dentry)
@@ -865,10 +868,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
865 ceph_mdsc_put_request(req); 868 ceph_mdsc_put_request(req);
866out: 869out:
867 if (!err) 870 if (!err)
868 ceph_init_inode_acls(d_inode(dentry), &acls); 871 ceph_init_inode_acls(d_inode(dentry), &as_ctx);
869 else 872 else
870 d_drop(dentry); 873 d_drop(dentry);
871 ceph_release_acls_info(&acls); 874 ceph_release_acl_sec_ctx(&as_ctx);
872 return err; 875 return err;
873} 876}
874 877
@@ -884,6 +887,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
884 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 887 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
885 struct ceph_mds_client *mdsc = fsc->mdsc; 888 struct ceph_mds_client *mdsc = fsc->mdsc;
886 struct ceph_mds_request *req; 889 struct ceph_mds_request *req;
890 struct ceph_acl_sec_ctx as_ctx = {};
887 int err; 891 int err;
888 892
889 if (ceph_snap(dir) != CEPH_NOSNAP) 893 if (ceph_snap(dir) != CEPH_NOSNAP)
@@ -894,6 +898,10 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
894 goto out; 898 goto out;
895 } 899 }
896 900
901 err = ceph_security_init_secctx(dentry, S_IFLNK | 0777, &as_ctx);
902 if (err < 0)
903 goto out;
904
897 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); 905 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
898 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); 906 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
899 if (IS_ERR(req)) { 907 if (IS_ERR(req)) {
@@ -919,6 +927,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
919out: 927out:
920 if (err) 928 if (err)
921 d_drop(dentry); 929 d_drop(dentry);
930 ceph_release_acl_sec_ctx(&as_ctx);
922 return err; 931 return err;
923} 932}
924 933
@@ -927,7 +936,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
927 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 936 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
928 struct ceph_mds_client *mdsc = fsc->mdsc; 937 struct ceph_mds_client *mdsc = fsc->mdsc;
929 struct ceph_mds_request *req; 938 struct ceph_mds_request *req;
930 struct ceph_acls_info acls = {}; 939 struct ceph_acl_sec_ctx as_ctx = {};
931 int err = -EROFS; 940 int err = -EROFS;
932 int op; 941 int op;
933 942
@@ -950,7 +959,10 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
950 } 959 }
951 960
952 mode |= S_IFDIR; 961 mode |= S_IFDIR;
953 err = ceph_pre_init_acls(dir, &mode, &acls); 962 err = ceph_pre_init_acls(dir, &mode, &as_ctx);
963 if (err < 0)
964 goto out;
965 err = ceph_security_init_secctx(dentry, mode, &as_ctx);
954 if (err < 0) 966 if (err < 0)
955 goto out; 967 goto out;
956 968
@@ -967,9 +979,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
967 req->r_args.mkdir.mode = cpu_to_le32(mode); 979 req->r_args.mkdir.mode = cpu_to_le32(mode);
968 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; 980 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
969 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 981 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
970 if (acls.pagelist) { 982 if (as_ctx.pagelist) {
971 req->r_pagelist = acls.pagelist; 983 req->r_pagelist = as_ctx.pagelist;
972 acls.pagelist = NULL; 984 as_ctx.pagelist = NULL;
973 } 985 }
974 err = ceph_mdsc_do_request(mdsc, dir, req); 986 err = ceph_mdsc_do_request(mdsc, dir, req);
975 if (!err && 987 if (!err &&
@@ -979,10 +991,10 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
979 ceph_mdsc_put_request(req); 991 ceph_mdsc_put_request(req);
980out: 992out:
981 if (!err) 993 if (!err)
982 ceph_init_inode_acls(d_inode(dentry), &acls); 994 ceph_init_inode_acls(d_inode(dentry), &as_ctx);
983 else 995 else
984 d_drop(dentry); 996 d_drop(dentry);
985 ceph_release_acls_info(&acls); 997 ceph_release_acl_sec_ctx(&as_ctx);
986 return err; 998 return err;
987} 999}
988 1000
@@ -1433,8 +1445,7 @@ static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
1433 return false; 1445 return false;
1434} 1446}
1435 1447
1436static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags, 1448static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
1437 struct inode *dir)
1438{ 1449{
1439 struct ceph_dentry_info *di; 1450 struct ceph_dentry_info *di;
1440 struct ceph_mds_session *session = NULL; 1451 struct ceph_mds_session *session = NULL;
@@ -1466,7 +1477,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
1466 spin_unlock(&dentry->d_lock); 1477 spin_unlock(&dentry->d_lock);
1467 1478
1468 if (session) { 1479 if (session) {
1469 ceph_mdsc_lease_send_msg(session, dir, dentry, 1480 ceph_mdsc_lease_send_msg(session, dentry,
1470 CEPH_MDS_LEASE_RENEW, seq); 1481 CEPH_MDS_LEASE_RENEW, seq);
1471 ceph_put_mds_session(session); 1482 ceph_put_mds_session(session);
1472 } 1483 }
@@ -1512,18 +1523,26 @@ static int __dir_lease_try_check(const struct dentry *dentry)
1512static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) 1523static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
1513{ 1524{
1514 struct ceph_inode_info *ci = ceph_inode(dir); 1525 struct ceph_inode_info *ci = ceph_inode(dir);
1515 struct ceph_dentry_info *di = ceph_dentry(dentry); 1526 int valid;
1516 int valid = 0; 1527 int shared_gen;
1517 1528
1518 spin_lock(&ci->i_ceph_lock); 1529 spin_lock(&ci->i_ceph_lock);
1519 if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen) 1530 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
1520 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); 1531 shared_gen = atomic_read(&ci->i_shared_gen);
1521 spin_unlock(&ci->i_ceph_lock); 1532 spin_unlock(&ci->i_ceph_lock);
1522 if (valid) 1533 if (valid) {
1523 __ceph_dentry_dir_lease_touch(di); 1534 struct ceph_dentry_info *di;
1524 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", 1535 spin_lock(&dentry->d_lock);
1525 dir, (unsigned)atomic_read(&ci->i_shared_gen), 1536 di = ceph_dentry(dentry);
1526 dentry, (unsigned)di->lease_shared_gen, valid); 1537 if (dir == d_inode(dentry->d_parent) &&
1538 di && di->lease_shared_gen == shared_gen)
1539 __ceph_dentry_dir_lease_touch(di);
1540 else
1541 valid = 0;
1542 spin_unlock(&dentry->d_lock);
1543 }
1544 dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n",
1545 dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid);
1527 return valid; 1546 return valid;
1528} 1547}
1529 1548
@@ -1558,7 +1577,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1558 ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) { 1577 ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
1559 valid = 1; 1578 valid = 1;
1560 } else { 1579 } else {
1561 valid = dentry_lease_is_valid(dentry, flags, dir); 1580 valid = dentry_lease_is_valid(dentry, flags);
1562 if (valid == -ECHILD) 1581 if (valid == -ECHILD)
1563 return valid; 1582 return valid;
1564 if (valid || dir_lease_is_valid(dir, dentry)) { 1583 if (valid || dir_lease_is_valid(dir, dentry)) {
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index d3ef7ee429ec..15ff1b09cfa2 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -368,7 +368,7 @@ static struct dentry *ceph_get_parent(struct dentry *child)
368 } 368 }
369out: 369out:
370 dout("get_parent %p ino %llx.%llx err=%ld\n", 370 dout("get_parent %p ino %llx.%llx err=%ld\n",
371 child, ceph_vinop(inode), (IS_ERR(dn) ? PTR_ERR(dn) : 0)); 371 child, ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn));
372 return dn; 372 return dn;
373} 373}
374 374
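
PTR_ERR_OR_ZERO() from <linux/err.h> is the idiomatic replacement for the
open-coded conditional: it yields the errno for an ERR_PTR-encoded pointer
and 0 for a valid one. A tiny sketch:

    #include <linux/err.h>

    /* Equivalent to: IS_ERR(dn) ? PTR_ERR(dn) : 0 */
    static int dentry_errno(struct dentry *dn)
    {
            return PTR_ERR_OR_ZERO(dn);
    }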
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c5517ffeb11c..685a03cc4b77 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -10,6 +10,7 @@
10#include <linux/namei.h> 10#include <linux/namei.h>
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/falloc.h> 12#include <linux/falloc.h>
13#include <linux/iversion.h>
13 14
14#include "super.h" 15#include "super.h"
15#include "mds_client.h" 16#include "mds_client.h"
@@ -437,7 +438,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
437 struct ceph_mds_client *mdsc = fsc->mdsc; 438 struct ceph_mds_client *mdsc = fsc->mdsc;
438 struct ceph_mds_request *req; 439 struct ceph_mds_request *req;
439 struct dentry *dn; 440 struct dentry *dn;
440 struct ceph_acls_info acls = {}; 441 struct ceph_acl_sec_ctx as_ctx = {};
441 int mask; 442 int mask;
442 int err; 443 int err;
443 444
@@ -451,25 +452,28 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
451 if (flags & O_CREAT) { 452 if (flags & O_CREAT) {
452 if (ceph_quota_is_max_files_exceeded(dir)) 453 if (ceph_quota_is_max_files_exceeded(dir))
453 return -EDQUOT; 454 return -EDQUOT;
454 err = ceph_pre_init_acls(dir, &mode, &acls); 455 err = ceph_pre_init_acls(dir, &mode, &as_ctx);
455 if (err < 0) 456 if (err < 0)
456 return err; 457 return err;
458 err = ceph_security_init_secctx(dentry, mode, &as_ctx);
459 if (err < 0)
460 goto out_ctx;
457 } 461 }
458 462
459 /* do the open */ 463 /* do the open */
460 req = prepare_open_request(dir->i_sb, flags, mode); 464 req = prepare_open_request(dir->i_sb, flags, mode);
461 if (IS_ERR(req)) { 465 if (IS_ERR(req)) {
462 err = PTR_ERR(req); 466 err = PTR_ERR(req);
463 goto out_acl; 467 goto out_ctx;
464 } 468 }
465 req->r_dentry = dget(dentry); 469 req->r_dentry = dget(dentry);
466 req->r_num_caps = 2; 470 req->r_num_caps = 2;
467 if (flags & O_CREAT) { 471 if (flags & O_CREAT) {
468 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; 472 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
469 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 473 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
470 if (acls.pagelist) { 474 if (as_ctx.pagelist) {
471 req->r_pagelist = acls.pagelist; 475 req->r_pagelist = as_ctx.pagelist;
472 acls.pagelist = NULL; 476 as_ctx.pagelist = NULL;
473 } 477 }
474 } 478 }
475 479
@@ -507,7 +511,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
507 } else { 511 } else {
508 dout("atomic_open finish_open on dn %p\n", dn); 512 dout("atomic_open finish_open on dn %p\n", dn);
509 if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { 513 if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
510 ceph_init_inode_acls(d_inode(dentry), &acls); 514 ceph_init_inode_acls(d_inode(dentry), &as_ctx);
511 file->f_mode |= FMODE_CREATED; 515 file->f_mode |= FMODE_CREATED;
512 } 516 }
513 err = finish_open(file, dentry, ceph_open); 517 err = finish_open(file, dentry, ceph_open);
@@ -516,8 +520,8 @@ out_req:
516 if (!req->r_err && req->r_target_inode) 520 if (!req->r_err && req->r_target_inode)
517 ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); 521 ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
518 ceph_mdsc_put_request(req); 522 ceph_mdsc_put_request(req);
519out_acl: 523out_ctx:
520 ceph_release_acls_info(&acls); 524 ceph_release_acl_sec_ctx(&as_ctx);
521 dout("atomic_open result=%d\n", err); 525 dout("atomic_open result=%d\n", err);
522 return err; 526 return err;
523} 527}
@@ -1007,7 +1011,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
1007 * may block. 1011 * may block.
1008 */ 1012 */
1009 truncate_inode_pages_range(inode->i_mapping, pos, 1013 truncate_inode_pages_range(inode->i_mapping, pos,
1010 (pos+len) | (PAGE_SIZE - 1)); 1014 PAGE_ALIGN(pos + len) - 1);
1011 1015
1012 req->r_mtime = mtime; 1016 req->r_mtime = mtime;
1013 } 1017 }
@@ -1022,7 +1026,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
1022 req->r_callback = ceph_aio_complete_req; 1026 req->r_callback = ceph_aio_complete_req;
1023 req->r_inode = inode; 1027 req->r_inode = inode;
1024 req->r_priv = aio_req; 1028 req->r_priv = aio_req;
1025 list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); 1029 list_add_tail(&req->r_private_item, &aio_req->osd_reqs);
1026 1030
1027 pos += len; 1031 pos += len;
1028 continue; 1032 continue;
@@ -1082,8 +1086,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
1082 while (!list_empty(&osd_reqs)) { 1086 while (!list_empty(&osd_reqs)) {
1083 req = list_first_entry(&osd_reqs, 1087 req = list_first_entry(&osd_reqs,
1084 struct ceph_osd_request, 1088 struct ceph_osd_request,
1085 r_unsafe_item); 1089 r_private_item);
1086 list_del_init(&req->r_unsafe_item); 1090 list_del_init(&req->r_private_item);
1087 if (ret >= 0) 1091 if (ret >= 0)
1088 ret = ceph_osdc_start_request(req->r_osdc, 1092 ret = ceph_osdc_start_request(req->r_osdc,
1089 req, false); 1093 req, false);
@@ -1432,6 +1436,8 @@ retry_snap:
1432 if (err) 1436 if (err)
1433 goto out; 1437 goto out;
1434 1438
1439 inode_inc_iversion_raw(inode);
1440
1435 if (ci->i_inline_version != CEPH_INLINE_NONE) { 1441 if (ci->i_inline_version != CEPH_INLINE_NONE) {
1436 err = ceph_uninline_data(file, NULL); 1442 err = ceph_uninline_data(file, NULL);
1437 if (err < 0) 1443 if (err < 0)
@@ -2063,6 +2069,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
2063 do_final_copy = true; 2069 do_final_copy = true;
2064 2070
2065 file_update_time(dst_file); 2071 file_update_time(dst_file);
2072 inode_inc_iversion_raw(dst_inode);
2073
2066 if (endoff > size) { 2074 if (endoff > size) {
2067 int caps_flags = 0; 2075 int caps_flags = 0;
2068 2076
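
The truncate_inode_pages_range() fix above is worth a worked example: when
pos + len is already page aligned, ORing in PAGE_SIZE - 1 lands in the next
page and invalidates one page too many, whereas PAGE_ALIGN(pos + len) - 1 is
the last byte of the page that actually contains the range. A self-contained
userspace check, with PAGE_SIZE fixed at 4096 for illustration:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SIZE  4096UL
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    int main(void)
    {
            uint64_t end = 8192;    /* pos + len, already page aligned */

            /* old expression: strays 4096 bytes past the range */
            assert((end | (PAGE_SIZE - 1)) == 12287);
            /* new expression: last byte of the final affected page */
            assert(PAGE_ALIGN(end) - 1 == 8191);
            return 0;
    }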
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 761451f36e2d..791f84a13bb8 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -13,6 +13,7 @@
13#include <linux/posix_acl.h> 13#include <linux/posix_acl.h>
14#include <linux/random.h> 14#include <linux/random.h>
15#include <linux/sort.h> 15#include <linux/sort.h>
16#include <linux/iversion.h>
16 17
17#include "super.h" 18#include "super.h"
18#include "mds_client.h" 19#include "mds_client.h"
@@ -42,6 +43,7 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
42{ 43{
43 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; 44 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
44 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); 45 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
46 inode_set_iversion_raw(inode, 0);
45 return 0; 47 return 0;
46} 48}
47 49
@@ -509,6 +511,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
509 511
510 INIT_WORK(&ci->i_work, ceph_inode_work); 512 INIT_WORK(&ci->i_work, ceph_inode_work);
511 ci->i_work_mask = 0; 513 ci->i_work_mask = 0;
514 memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
512 515
513 ceph_fscache_inode_init(ci); 516 ceph_fscache_inode_init(ci);
514 517
@@ -523,17 +526,20 @@ void ceph_free_inode(struct inode *inode)
523 kmem_cache_free(ceph_inode_cachep, ci); 526 kmem_cache_free(ceph_inode_cachep, ci);
524} 527}
525 528
526void ceph_destroy_inode(struct inode *inode) 529void ceph_evict_inode(struct inode *inode)
527{ 530{
528 struct ceph_inode_info *ci = ceph_inode(inode); 531 struct ceph_inode_info *ci = ceph_inode(inode);
529 struct ceph_inode_frag *frag; 532 struct ceph_inode_frag *frag;
530 struct rb_node *n; 533 struct rb_node *n;
531 534
532 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); 535 dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
536
537 truncate_inode_pages_final(&inode->i_data);
538 clear_inode(inode);
533 539
534 ceph_fscache_unregister_inode_cookie(ci); 540 ceph_fscache_unregister_inode_cookie(ci);
535 541
536 __ceph_remove_caps(inode); 542 __ceph_remove_caps(ci);
537 543
538 if (__ceph_has_any_quota(ci)) 544 if (__ceph_has_any_quota(ci))
539 ceph_adjust_quota_realms_count(inode, false); 545 ceph_adjust_quota_realms_count(inode, false);
@@ -578,16 +584,6 @@ void ceph_destroy_inode(struct inode *inode)
578 ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); 584 ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
579} 585}
580 586
581int ceph_drop_inode(struct inode *inode)
582{
583 /*
584 * Positve dentry and corresponding inode are always accompanied
585 * in MDS reply. So no need to keep inode in the cache after
586 * dropping all its aliases.
587 */
588 return 1;
589}
590
591static inline blkcnt_t calc_inode_blocks(u64 size) 587static inline blkcnt_t calc_inode_blocks(u64 size)
592{ 588{
593 return (size + (1<<9) - 1) >> 9; 589 return (size + (1<<9) - 1) >> 9;
@@ -795,6 +791,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
795 le64_to_cpu(info->version) > (ci->i_version & ~1))) 791 le64_to_cpu(info->version) > (ci->i_version & ~1)))
796 new_version = true; 792 new_version = true;
797 793
794 /* Update change_attribute */
795 inode_set_max_iversion_raw(inode, iinfo->change_attr);
796
798 __ceph_caps_issued(ci, &issued); 797 __ceph_caps_issued(ci, &issued);
799 issued |= __ceph_caps_dirty(ci); 798 issued |= __ceph_caps_dirty(ci);
800 new_issued = ~issued & info_caps; 799 new_issued = ~issued & info_caps;
@@ -813,6 +812,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
813 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, 812 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
814 from_kuid(&init_user_ns, inode->i_uid), 813 from_kuid(&init_user_ns, inode->i_uid),
815 from_kgid(&init_user_ns, inode->i_gid)); 814 from_kgid(&init_user_ns, inode->i_gid));
815 ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
816 ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
816 } 817 }
817 818
818 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && 819 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
@@ -887,6 +888,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
887 iinfo->xattr_data, iinfo->xattr_len); 888 iinfo->xattr_data, iinfo->xattr_len);
888 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 889 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
889 ceph_forget_all_cached_acls(inode); 890 ceph_forget_all_cached_acls(inode);
891 ceph_security_invalidate_secctx(inode);
890 xattr_blob = NULL; 892 xattr_blob = NULL;
891 } 893 }
892 894
@@ -1027,59 +1029,38 @@ out:
1027} 1029}
1028 1030
1029/* 1031/*
1030 * caller should hold session s_mutex. 1032 * caller should hold session s_mutex and dentry->d_lock.
1031 */ 1033 */
1032static void update_dentry_lease(struct dentry *dentry, 1034static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
1033 struct ceph_mds_reply_lease *lease, 1035 struct ceph_mds_reply_lease *lease,
1034 struct ceph_mds_session *session, 1036 struct ceph_mds_session *session,
1035 unsigned long from_time, 1037 unsigned long from_time,
1036 struct ceph_vino *tgt_vino, 1038 struct ceph_mds_session **old_lease_session)
1037 struct ceph_vino *dir_vino)
1038{ 1039{
1039 struct ceph_dentry_info *di = ceph_dentry(dentry); 1040 struct ceph_dentry_info *di = ceph_dentry(dentry);
1040 long unsigned duration = le32_to_cpu(lease->duration_ms); 1041 long unsigned duration = le32_to_cpu(lease->duration_ms);
1041 long unsigned ttl = from_time + (duration * HZ) / 1000; 1042 long unsigned ttl = from_time + (duration * HZ) / 1000;
1042 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; 1043 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
1043 struct inode *dir;
1044 struct ceph_mds_session *old_lease_session = NULL;
1045 1044
1046 /*
1047 * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
1048 * we expect a negative dentry.
1049 */
1050 if (!tgt_vino && d_really_is_positive(dentry))
1051 return;
1052
1053 if (tgt_vino && (d_really_is_negative(dentry) ||
1054 !ceph_ino_compare(d_inode(dentry), tgt_vino)))
1055 return;
1056
1057 spin_lock(&dentry->d_lock);
1058 dout("update_dentry_lease %p duration %lu ms ttl %lu\n", 1045 dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
1059 dentry, duration, ttl); 1046 dentry, duration, ttl);
1060 1047
1061 dir = d_inode(dentry->d_parent);
1062
1063 /* make sure parent matches dir_vino */
1064 if (!ceph_ino_compare(dir, dir_vino))
1065 goto out_unlock;
1066
1067 /* only track leases on regular dentries */ 1048 /* only track leases on regular dentries */
1068 if (ceph_snap(dir) != CEPH_NOSNAP) 1049 if (ceph_snap(dir) != CEPH_NOSNAP)
1069 goto out_unlock; 1050 return;
1070 1051
1071 di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen); 1052 di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
1072 if (duration == 0) { 1053 if (duration == 0) {
1073 __ceph_dentry_dir_lease_touch(di); 1054 __ceph_dentry_dir_lease_touch(di);
1074 goto out_unlock; 1055 return;
1075 } 1056 }
1076 1057
1077 if (di->lease_gen == session->s_cap_gen && 1058 if (di->lease_gen == session->s_cap_gen &&
1078 time_before(ttl, di->time)) 1059 time_before(ttl, di->time))
1079 goto out_unlock; /* we already have a newer lease. */ 1060 return; /* we already have a newer lease. */
1080 1061
1081 if (di->lease_session && di->lease_session != session) { 1062 if (di->lease_session && di->lease_session != session) {
1082 old_lease_session = di->lease_session; 1063 *old_lease_session = di->lease_session;
1083 di->lease_session = NULL; 1064 di->lease_session = NULL;
1084 } 1065 }
1085 1066
@@ -1092,6 +1073,62 @@ static void update_dentry_lease(struct dentry *dentry,
1092 di->time = ttl; 1073 di->time = ttl;
1093 1074
1094 __ceph_dentry_lease_touch(di); 1075 __ceph_dentry_lease_touch(di);
1076}
1077
1078static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
1079 struct ceph_mds_reply_lease *lease,
1080 struct ceph_mds_session *session,
1081 unsigned long from_time)
1082{
1083 struct ceph_mds_session *old_lease_session = NULL;
1084 spin_lock(&dentry->d_lock);
1085 __update_dentry_lease(dir, dentry, lease, session, from_time,
1086 &old_lease_session);
1087 spin_unlock(&dentry->d_lock);
1088 if (old_lease_session)
1089 ceph_put_mds_session(old_lease_session);
1090}
1091
1092/*
1093 * update dentry lease without having parent inode locked
1094 */
1095static void update_dentry_lease_careful(struct dentry *dentry,
1096 struct ceph_mds_reply_lease *lease,
1097 struct ceph_mds_session *session,
1098 unsigned long from_time,
1099 char *dname, u32 dname_len,
1100 struct ceph_vino *pdvino,
1101 struct ceph_vino *ptvino)
1102
1103{
1104 struct inode *dir;
1105 struct ceph_mds_session *old_lease_session = NULL;
1106
1107 spin_lock(&dentry->d_lock);
1108 /* make sure dentry's name matches target */
1109 if (dentry->d_name.len != dname_len ||
1110 memcmp(dentry->d_name.name, dname, dname_len))
1111 goto out_unlock;
1112
1113 dir = d_inode(dentry->d_parent);
1114 /* make sure parent matches dvino */
1115 if (!ceph_ino_compare(dir, pdvino))
1116 goto out_unlock;
1117
1118 /* make sure dentry's inode matches target. NULL ptvino means that
1119 * we expect a negative dentry */
1120 if (ptvino) {
1121 if (d_really_is_negative(dentry))
1122 goto out_unlock;
1123 if (!ceph_ino_compare(d_inode(dentry), ptvino))
1124 goto out_unlock;
1125 } else {
1126 if (d_really_is_positive(dentry))
1127 goto out_unlock;
1128 }
1129
1130 __update_dentry_lease(dir, dentry, lease, session,
1131 from_time, &old_lease_session);
1095out_unlock: 1132out_unlock:
1096 spin_unlock(&dentry->d_lock); 1133 spin_unlock(&dentry->d_lock);
1097 if (old_lease_session) 1134 if (old_lease_session)
@@ -1156,19 +1193,6 @@ static int splice_dentry(struct dentry **pdn, struct inode *in)
1156 return 0; 1193 return 0;
1157} 1194}
1158 1195
1159static int d_name_cmp(struct dentry *dentry, const char *name, size_t len)
1160{
1161 int ret;
1162
1163 /* take d_lock to ensure dentry->d_name stability */
1164 spin_lock(&dentry->d_lock);
1165 ret = dentry->d_name.len - len;
1166 if (!ret)
1167 ret = memcmp(dentry->d_name.name, name, len);
1168 spin_unlock(&dentry->d_lock);
1169 return ret;
1170}
1171
1172/* 1196/*
1173 * Incorporate results into the local cache. This is either just 1197 * Incorporate results into the local cache. This is either just
1174 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 1198 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
@@ -1371,10 +1395,9 @@ retry_lookup:
1371 } else if (have_lease) { 1395 } else if (have_lease) {
1372 if (d_unhashed(dn)) 1396 if (d_unhashed(dn))
1373 d_add(dn, NULL); 1397 d_add(dn, NULL);
1374 update_dentry_lease(dn, rinfo->dlease, 1398 update_dentry_lease(dir, dn,
1375 session, 1399 rinfo->dlease, session,
1376 req->r_request_started, 1400 req->r_request_started);
1377 NULL, &dvino);
1378 } 1401 }
1379 goto done; 1402 goto done;
1380 } 1403 }
@@ -1396,11 +1419,9 @@ retry_lookup:
1396 } 1419 }
1397 1420
1398 if (have_lease) { 1421 if (have_lease) {
1399 tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); 1422 update_dentry_lease(dir, dn,
1400 tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); 1423 rinfo->dlease, session,
1401 update_dentry_lease(dn, rinfo->dlease, session, 1424 req->r_request_started);
1402 req->r_request_started,
1403 &tvino, &dvino);
1404 } 1425 }
1405 dout(" final dn %p\n", dn); 1426 dout(" final dn %p\n", dn);
1406 } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || 1427 } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
@@ -1418,27 +1439,20 @@ retry_lookup:
1418 err = splice_dentry(&req->r_dentry, in); 1439 err = splice_dentry(&req->r_dentry, in);
1419 if (err < 0) 1440 if (err < 0)
1420 goto done; 1441 goto done;
1421 } else if (rinfo->head->is_dentry && 1442 } else if (rinfo->head->is_dentry && req->r_dentry) {
1422 !d_name_cmp(req->r_dentry, rinfo->dname, rinfo->dname_len)) { 1443 /* parent inode is not locked, be careful */
1423 struct ceph_vino *ptvino = NULL; 1444 struct ceph_vino *ptvino = NULL;
1424 1445 dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1425 if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) || 1446 dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1426 le32_to_cpu(rinfo->dlease->duration_ms)) { 1447 if (rinfo->head->is_target) {
1427 dvino.ino = le64_to_cpu(rinfo->diri.in->ino); 1448 tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1428 dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); 1449 tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1429 1450 ptvino = &tvino;
1430 if (rinfo->head->is_target) {
1431 tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1432 tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1433 ptvino = &tvino;
1434 }
1435
1436 update_dentry_lease(req->r_dentry, rinfo->dlease,
1437 session, req->r_request_started, ptvino,
1438 &dvino);
1439 } else {
1440 dout("%s: no dentry lease or dir cap\n", __func__);
1441 } 1451 }
1452 update_dentry_lease_careful(req->r_dentry, rinfo->dlease,
1453 session, req->r_request_started,
1454 rinfo->dname, rinfo->dname_len,
1455 &dvino, ptvino);
1442 } 1456 }
1443done: 1457done:
1444 dout("fill_trace done err=%d\n", err); 1458 dout("fill_trace done err=%d\n", err);
@@ -1600,7 +1614,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1600 /* FIXME: release caps/leases if error occurs */ 1614 /* FIXME: release caps/leases if error occurs */
1601 for (i = 0; i < rinfo->dir_nr; i++) { 1615 for (i = 0; i < rinfo->dir_nr; i++) {
1602 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; 1616 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1603 struct ceph_vino tvino, dvino; 1617 struct ceph_vino tvino;
1604 1618
1605 dname.name = rde->name; 1619 dname.name = rde->name;
1606 dname.len = rde->name_len; 1620 dname.len = rde->name_len;
@@ -1701,9 +1715,9 @@ retry_lookup:
1701 1715
1702 ceph_dentry(dn)->offset = rde->offset; 1716 ceph_dentry(dn)->offset = rde->offset;
1703 1717
1704 dvino = ceph_vino(d_inode(parent)); 1718 update_dentry_lease(d_inode(parent), dn,
1705 update_dentry_lease(dn, rde->lease, req->r_session, 1719 rde->lease, req->r_session,
1706 req->r_request_started, &tvino, &dvino); 1720 req->r_request_started);
1707 1721
1708 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { 1722 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
1709 ret = fill_readdir_cache(d_inode(parent), dn, 1723 ret = fill_readdir_cache(d_inode(parent), dn,
@@ -2282,7 +2296,7 @@ static int statx_to_caps(u32 want)
2282{ 2296{
2283 int mask = 0; 2297 int mask = 0;
2284 2298
2285 if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME)) 2299 if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME))
2286 mask |= CEPH_CAP_AUTH_SHARED; 2300 mask |= CEPH_CAP_AUTH_SHARED;
2287 2301
2288 if (want & (STATX_NLINK|STATX_CTIME)) 2302 if (want & (STATX_NLINK|STATX_CTIME))
@@ -2307,6 +2321,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
2307{ 2321{
2308 struct inode *inode = d_inode(path->dentry); 2322 struct inode *inode = d_inode(path->dentry);
2309 struct ceph_inode_info *ci = ceph_inode(inode); 2323 struct ceph_inode_info *ci = ceph_inode(inode);
2324 u32 valid_mask = STATX_BASIC_STATS;
2310 int err = 0; 2325 int err = 0;
2311 2326
2312 /* Skip the getattr altogether if we're asked not to sync */ 2327 /* Skip the getattr altogether if we're asked not to sync */
@@ -2319,6 +2334,16 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
2319 2334
2320 generic_fillattr(inode, stat); 2335 generic_fillattr(inode, stat);
2321 stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); 2336 stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
2337
2338 /*
2339 * btime on newly-allocated inodes is 0, so if it is still 0 here,
2340 * assume that it is not valid.
2341 */
2342 if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) {
2343 stat->btime = ci->i_btime;
2344 valid_mask |= STATX_BTIME;
2345 }
2346
2322 if (ceph_snap(inode) == CEPH_NOSNAP) 2347 if (ceph_snap(inode) == CEPH_NOSNAP)
2323 stat->dev = inode->i_sb->s_dev; 2348 stat->dev = inode->i_sb->s_dev;
2324 else 2349 else
@@ -2342,7 +2367,6 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
2342 stat->nlink = 1 + 1 + ci->i_subdirs; 2367 stat->nlink = 1 + 1 + ci->i_subdirs;
2343 } 2368 }
2344 2369
2345 /* Mask off any higher bits (e.g. btime) until we have support */ 2370 stat->result_mask = request_mask & valid_mask;
2346 stat->result_mask = request_mask & STATX_BASIC_STATS;
2347 return err; 2371 return err;
2348} 2372}
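With the result mask no longer clamped to STATX_BASIC_STATS, userspace can now ask cephfs for the birth time explicitly. A minimal sketch of plain statx(2) usage (not part of this patch; needs glibc >= 2.28) that checks whether the kernel actually reported btime:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <fcntl.h>     /* AT_FDCWD */
	#include <sys/stat.h>  /* statx(), struct statx */

	int main(int argc, char **argv)
	{
		struct statx stx;

		if (argc < 2)
			return 1;
		/* request the basic stats plus the birth time */
		if (statx(AT_FDCWD, argv[1], 0, STATX_BASIC_STATS | STATX_BTIME, &stx))
			return 1;
		/* stx_mask reports what the filesystem could actually provide */
		if (stx.stx_mask & STATX_BTIME)
			printf("btime: %lld.%09u\n",
			       (long long)stx.stx_btime.tv_sec, stx.stx_btime.tv_nsec);
		else
			printf("btime not reported (still zero on this inode)\n");
		return 0;
	}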
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c8a9b89b922d..920e9f048bd8 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -150,14 +150,13 @@ static int parse_reply_info_in(void **p, void *end,
150 info->pool_ns_data = *p; 150 info->pool_ns_data = *p;
151 *p += info->pool_ns_len; 151 *p += info->pool_ns_len;
152 } 152 }
153 /* btime, change_attr */ 153
154 { 154 /* btime */
155 struct ceph_timespec btime; 155 ceph_decode_need(p, end, sizeof(info->btime), bad);
156 u64 change_attr; 156 ceph_decode_copy(p, &info->btime, sizeof(info->btime));
157 ceph_decode_need(p, end, sizeof(btime), bad); 157
158 ceph_decode_copy(p, &btime, sizeof(btime)); 158 /* change attribute */
159 ceph_decode_64_safe(p, end, change_attr, bad); 159 ceph_decode_64_safe(p, end, info->change_attr, bad);
160 }
161 160
162 /* dir pin */ 161 /* dir pin */
163 if (struct_v >= 2) { 162 if (struct_v >= 2) {
@@ -166,6 +165,15 @@ static int parse_reply_info_in(void **p, void *end,
166 info->dir_pin = -ENODATA; 165 info->dir_pin = -ENODATA;
167 } 166 }
168 167
168 /* snapshot birth time, remains zero for v<=2 */
169 if (struct_v >= 3) {
170 ceph_decode_need(p, end, sizeof(info->snap_btime), bad);
171 ceph_decode_copy(p, &info->snap_btime,
172 sizeof(info->snap_btime));
173 } else {
174 memset(&info->snap_btime, 0, sizeof(info->snap_btime));
175 }
176
169 *p = end; 177 *p = end;
170 } else { 178 } else {
171 if (features & CEPH_FEATURE_MDS_INLINE_DATA) { 179 if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
@@ -197,7 +205,14 @@ static int parse_reply_info_in(void **p, void *end,
197 } 205 }
198 } 206 }
199 207
208 if (features & CEPH_FEATURE_FS_BTIME) {
209 ceph_decode_need(p, end, sizeof(info->btime), bad);
210 ceph_decode_copy(p, &info->btime, sizeof(info->btime));
211 ceph_decode_64_safe(p, end, info->change_attr, bad);
212 }
213
200 info->dir_pin = -ENODATA; 214 info->dir_pin = -ENODATA;
215 /* info->snap_btime remains zero */
201 } 216 }
202 return 0; 217 return 0;
203bad: 218bad:
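For readers unfamiliar with the libceph decode helpers used above: ceph_decode_need() jumps to the given label unless at least n bytes remain between *p and end, ceph_decode_copy() copies n bytes and advances the cursor, and the *_safe variants fold the bounds check into the fetch. A condensed sketch of the idiom, mirroring the btime/change_attr decoding (variable names are illustrative):

	struct ceph_timespec btime;
	u64 change_attr;

	/* bail out to the 'bad' label if the buffer is too short */
	ceph_decode_need(p, end, sizeof(btime), bad);
	ceph_decode_copy(p, &btime, sizeof(btime));	/* copy and advance *p */

	/* bounds-checked little-endian u64 fetch in a single step */
	ceph_decode_64_safe(p, end, change_attr, bad);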
@@ -717,6 +732,7 @@ void ceph_mdsc_release_request(struct kref *kref)
717 ceph_pagelist_release(req->r_pagelist); 732 ceph_pagelist_release(req->r_pagelist);
718 put_request_session(req); 733 put_request_session(req);
719 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); 734 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
735 WARN_ON_ONCE(!list_empty(&req->r_wait));
720 kfree(req); 736 kfree(req);
721} 737}
722 738
@@ -903,7 +919,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
903 struct inode *dir; 919 struct inode *dir;
904 920
905 rcu_read_lock(); 921 rcu_read_lock();
906 parent = req->r_dentry->d_parent; 922 parent = READ_ONCE(req->r_dentry->d_parent);
907 dir = req->r_parent ? : d_inode_rcu(parent); 923 dir = req->r_parent ? : d_inode_rcu(parent);
908 924
909 if (!dir || dir->i_sb != mdsc->fsc->sb) { 925 if (!dir || dir->i_sb != mdsc->fsc->sb) {
@@ -2135,7 +2151,7 @@ retry:
2135 memcpy(path + pos, temp->d_name.name, temp->d_name.len); 2151 memcpy(path + pos, temp->d_name.name, temp->d_name.len);
2136 } 2152 }
2137 spin_unlock(&temp->d_lock); 2153 spin_unlock(&temp->d_lock);
2138 temp = temp->d_parent; 2154 temp = READ_ONCE(temp->d_parent);
2139 2155
2140 /* Are we at the root? */ 2156 /* Are we at the root? */
2141 if (IS_ROOT(temp)) 2157 if (IS_ROOT(temp))
@@ -3727,42 +3743,35 @@ static void check_new_map(struct ceph_mds_client *mdsc,
3727 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", 3743 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
3728 ceph_session_state_name(s->s_state)); 3744 ceph_session_state_name(s->s_state));
3729 3745
3730 if (i >= newmap->m_num_mds || 3746 if (i >= newmap->m_num_mds) {
3731 memcmp(ceph_mdsmap_get_addr(oldmap, i), 3747 /* force close session for stopped mds */
3732 ceph_mdsmap_get_addr(newmap, i), 3748 get_session(s);
3733 sizeof(struct ceph_entity_addr))) { 3749 __unregister_session(mdsc, s);
3734 if (s->s_state == CEPH_MDS_SESSION_OPENING) { 3750 __wake_requests(mdsc, &s->s_waiting);
3735 /* the session never opened, just close it 3751 mutex_unlock(&mdsc->mutex);
3736 * out now */
3737 get_session(s);
3738 __unregister_session(mdsc, s);
3739 __wake_requests(mdsc, &s->s_waiting);
3740 ceph_put_mds_session(s);
3741 } else if (i >= newmap->m_num_mds) {
3742 /* force close session for stopped mds */
3743 get_session(s);
3744 __unregister_session(mdsc, s);
3745 __wake_requests(mdsc, &s->s_waiting);
3746 kick_requests(mdsc, i);
3747 mutex_unlock(&mdsc->mutex);
3748 3752
3749 mutex_lock(&s->s_mutex); 3753 mutex_lock(&s->s_mutex);
3750 cleanup_session_requests(mdsc, s); 3754 cleanup_session_requests(mdsc, s);
3751 remove_session_caps(s); 3755 remove_session_caps(s);
3752 mutex_unlock(&s->s_mutex); 3756 mutex_unlock(&s->s_mutex);
3753 3757
3754 ceph_put_mds_session(s); 3758 ceph_put_mds_session(s);
3755 3759
3756 mutex_lock(&mdsc->mutex); 3760 mutex_lock(&mdsc->mutex);
3757 } else { 3761 kick_requests(mdsc, i);
3758 /* just close it */ 3762 continue;
3759 mutex_unlock(&mdsc->mutex); 3763 }
3760 mutex_lock(&s->s_mutex); 3764
3761 mutex_lock(&mdsc->mutex); 3765 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
3762 ceph_con_close(&s->s_con); 3766 ceph_mdsmap_get_addr(newmap, i),
3763 mutex_unlock(&s->s_mutex); 3767 sizeof(struct ceph_entity_addr))) {
3764 s->s_state = CEPH_MDS_SESSION_RESTARTING; 3768 /* just close it */
3765 } 3769 mutex_unlock(&mdsc->mutex);
3770 mutex_lock(&s->s_mutex);
3771 mutex_lock(&mdsc->mutex);
3772 ceph_con_close(&s->s_con);
3773 mutex_unlock(&s->s_mutex);
3774 s->s_state = CEPH_MDS_SESSION_RESTARTING;
3766 } else if (oldstate == newstate) { 3775 } else if (oldstate == newstate) {
3767 continue; /* nothing new with this mds */ 3776 continue; /* nothing new with this mds */
3768 } 3777 }
@@ -3931,31 +3940,33 @@ bad:
3931} 3940}
3932 3941
3933void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, 3942void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
3934 struct inode *inode,
3935 struct dentry *dentry, char action, 3943 struct dentry *dentry, char action,
3936 u32 seq) 3944 u32 seq)
3937{ 3945{
3938 struct ceph_msg *msg; 3946 struct ceph_msg *msg;
3939 struct ceph_mds_lease *lease; 3947 struct ceph_mds_lease *lease;
3940 int len = sizeof(*lease) + sizeof(u32); 3948 struct inode *dir;
3941 int dnamelen = 0; 3949 int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;
3942 3950
3943 dout("lease_send_msg inode %p dentry %p %s to mds%d\n", 3951 dout("lease_send_msg dentry %p %s to mds%d\n",
3944 inode, dentry, ceph_lease_op_name(action), session->s_mds); 3952 dentry, ceph_lease_op_name(action), session->s_mds);
3945 dnamelen = dentry->d_name.len;
3946 len += dnamelen;
3947 3953
3948 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); 3954 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
3949 if (!msg) 3955 if (!msg)
3950 return; 3956 return;
3951 lease = msg->front.iov_base; 3957 lease = msg->front.iov_base;
3952 lease->action = action; 3958 lease->action = action;
3953 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
3954 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
3955 lease->seq = cpu_to_le32(seq); 3959 lease->seq = cpu_to_le32(seq);
3956 put_unaligned_le32(dnamelen, lease + 1);
3957 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
3958 3960
3961 spin_lock(&dentry->d_lock);
3962 dir = d_inode(dentry->d_parent);
3963 lease->ino = cpu_to_le64(ceph_ino(dir));
3964 lease->first = lease->last = cpu_to_le64(ceph_snap(dir));
3965
3966 put_unaligned_le32(dentry->d_name.len, lease + 1);
3967 memcpy((void *)(lease + 1) + 4,
3968 dentry->d_name.name, dentry->d_name.len);
3969 spin_unlock(&dentry->d_lock);
3959 /* 3970 /*
3960 * if this is a preemptive lease RELEASE, no need to 3971 * if this is a preemptive lease RELEASE, no need to
3961 * flush request stream, since the actual request will 3972 * flush request stream, since the actual request will
@@ -4157,6 +4168,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
4157 while ((req = __get_oldest_req(mdsc))) { 4168 while ((req = __get_oldest_req(mdsc))) {
4158 dout("wait_requests timed out on tid %llu\n", 4169 dout("wait_requests timed out on tid %llu\n",
4159 req->r_tid); 4170 req->r_tid);
4171 list_del_init(&req->r_wait);
4160 __unregister_request(mdsc, req); 4172 __unregister_request(mdsc, req);
4161 } 4173 }
4162 } 4174 }
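The READ_ONCE(...->d_parent) conversions above follow the usual VFS rule: d_parent can be changed by a concurrent rename, so a lockless walker must snapshot the pointer once and only use the result under rcu_read_lock() (or with d_lock held). Condensed from the __choose_mds() hunk above:

	struct dentry *parent;
	struct inode *dir;

	rcu_read_lock();
	/* snapshot d_parent once; a concurrent rename may change it */
	parent = READ_ONCE(req->r_dentry->d_parent);
	dir = req->r_parent ? : d_inode_rcu(parent);
	if (dir) {
		/* dir remains valid for the duration of the RCU read side */
	}
	rcu_read_unlock();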
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index a83f28bc2387..f7c8603484fe 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -69,6 +69,9 @@ struct ceph_mds_reply_info_in {
69 u64 max_bytes; 69 u64 max_bytes;
70 u64 max_files; 70 u64 max_files;
71 s32 dir_pin; 71 s32 dir_pin;
72 struct ceph_timespec btime;
73 struct ceph_timespec snap_btime;
74 u64 change_attr;
72}; 75};
73 76
74struct ceph_mds_reply_dir_entry { 77struct ceph_mds_reply_dir_entry {
@@ -504,7 +507,6 @@ extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
504 507
505extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); 508extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
506extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, 509extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
507 struct inode *inode,
508 struct dentry *dentry, char action, 510 struct dentry *dentry, char action,
509 u32 seq); 511 u32 seq);
510 512
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 701b4fb0fb5a..ce2d00da5096 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -107,7 +107,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
107 struct ceph_mdsmap *m; 107 struct ceph_mdsmap *m;
108 const void *start = *p; 108 const void *start = *p;
109 int i, j, n; 109 int i, j, n;
110 int err = -EINVAL; 110 int err;
111 u8 mdsmap_v, mdsmap_cv; 111 u8 mdsmap_v, mdsmap_cv;
112 u16 mdsmap_ev; 112 u16 mdsmap_ev;
113 113
@@ -183,8 +183,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
183 inc = ceph_decode_32(p); 183 inc = ceph_decode_32(p);
184 state = ceph_decode_32(p); 184 state = ceph_decode_32(p);
185 state_seq = ceph_decode_64(p); 185 state_seq = ceph_decode_64(p);
186 ceph_decode_copy(p, &addr, sizeof(addr)); 186 err = ceph_decode_entity_addr(p, end, &addr);
187 ceph_decode_addr(&addr); 187 if (err)
188 goto corrupt;
188 ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); 189 ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
189 *p += sizeof(u32); 190 *p += sizeof(u32);
190 ceph_decode_32_safe(p, end, namelen, bad); 191 ceph_decode_32_safe(p, end, namelen, bad);
@@ -357,7 +358,7 @@ bad_ext:
357nomem: 358nomem:
358 err = -ENOMEM; 359 err = -ENOMEM;
359 goto out_err; 360 goto out_err;
360bad: 361corrupt:
361 pr_err("corrupt mdsmap\n"); 362 pr_err("corrupt mdsmap\n");
362 print_hex_dump(KERN_DEBUG, "mdsmap: ", 363 print_hex_dump(KERN_DEBUG, "mdsmap: ",
363 DUMP_PREFIX_OFFSET, 16, 1, 364 DUMP_PREFIX_OFFSET, 16, 1,
@@ -365,6 +366,9 @@ bad:
365out_err: 366out_err:
366 ceph_mdsmap_destroy(m); 367 ceph_mdsmap_destroy(m);
367 return ERR_PTR(err); 368 return ERR_PTR(err);
369bad:
370 err = -EINVAL;
371 goto corrupt;
368} 372}
369 373
370void ceph_mdsmap_destroy(struct ceph_mdsmap *m) 374void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index d629fc857450..de56dee60540 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -135,7 +135,7 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
135 return NULL; 135 return NULL;
136 136
137 mutex_lock(&qri->mutex); 137 mutex_lock(&qri->mutex);
138 if (qri->inode) { 138 if (qri->inode && ceph_is_any_caps(qri->inode)) {
139 /* A request has already returned the inode */ 139 /* A request has already returned the inode */
140 mutex_unlock(&qri->mutex); 140 mutex_unlock(&qri->mutex);
141 return qri->inode; 141 return qri->inode;
@@ -146,7 +146,18 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
146 mutex_unlock(&qri->mutex); 146 mutex_unlock(&qri->mutex);
147 return NULL; 147 return NULL;
148 } 148 }
149 in = ceph_lookup_inode(sb, realm->ino); 149 if (qri->inode) {
150 /* get caps */
151 int ret = __ceph_do_getattr(qri->inode, NULL,
152 CEPH_STAT_CAP_INODE, true);
153 if (ret >= 0)
154 in = qri->inode;
155 else
156 in = ERR_PTR(ret);
157 } else {
158 in = ceph_lookup_inode(sb, realm->ino);
159 }
160
150 if (IS_ERR(in)) { 161 if (IS_ERR(in)) {
151 pr_warn("Can't lookup inode %llx (err: %ld)\n", 162 pr_warn("Can't lookup inode %llx (err: %ld)\n",
152 realm->ino, PTR_ERR(in)); 163 realm->ino, PTR_ERR(in));
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 72c6c022f02b..4c6494eb02b5 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -3,6 +3,7 @@
3 3
4#include <linux/sort.h> 4#include <linux/sort.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/iversion.h>
6#include "super.h" 7#include "super.h"
7#include "mds_client.h" 8#include "mds_client.h"
8#include <linux/ceph/decode.h> 9#include <linux/ceph/decode.h>
@@ -606,6 +607,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
606 capsnap->mtime = inode->i_mtime; 607 capsnap->mtime = inode->i_mtime;
607 capsnap->atime = inode->i_atime; 608 capsnap->atime = inode->i_atime;
608 capsnap->ctime = inode->i_ctime; 609 capsnap->ctime = inode->i_ctime;
610 capsnap->btime = ci->i_btime;
611 capsnap->change_attr = inode_peek_iversion_raw(inode);
609 capsnap->time_warp_seq = ci->i_time_warp_seq; 612 capsnap->time_warp_seq = ci->i_time_warp_seq;
610 capsnap->truncate_size = ci->i_truncate_size; 613 capsnap->truncate_size = ci->i_truncate_size;
611 capsnap->truncate_seq = ci->i_truncate_seq; 614 capsnap->truncate_seq = ci->i_truncate_seq;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index ed1b65a6c2c3..ab4868c7308e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -840,10 +840,10 @@ static int ceph_remount(struct super_block *sb, int *flags, char *data)
840 840
841static const struct super_operations ceph_super_ops = { 841static const struct super_operations ceph_super_ops = {
842 .alloc_inode = ceph_alloc_inode, 842 .alloc_inode = ceph_alloc_inode,
843 .destroy_inode = ceph_destroy_inode,
844 .free_inode = ceph_free_inode, 843 .free_inode = ceph_free_inode,
845 .write_inode = ceph_write_inode, 844 .write_inode = ceph_write_inode,
846 .drop_inode = ceph_drop_inode, 845 .drop_inode = generic_delete_inode,
846 .evict_inode = ceph_evict_inode,
847 .sync_fs = ceph_sync_fs, 847 .sync_fs = ceph_sync_fs,
848 .put_super = ceph_put_super, 848 .put_super = ceph_put_super,
849 .remount_fs = ceph_remount, 849 .remount_fs = ceph_remount,
@@ -978,7 +978,7 @@ static int ceph_set_super(struct super_block *s, void *data)
978 s->s_d_op = &ceph_dentry_ops; 978 s->s_d_op = &ceph_dentry_ops;
979 s->s_export_op = &ceph_export_ops; 979 s->s_export_op = &ceph_export_ops;
980 980
981 s->s_time_gran = 1000; /* 1000 ns == 1 us */ 981 s->s_time_gran = 1;
982 982
983 ret = set_anon_super(s, NULL); /* what is that second arg for? */ 983 ret = set_anon_super(s, NULL); /* what is that second arg for? */
984 if (ret != 0) 984 if (ret != 0)
@@ -1159,17 +1159,15 @@ static int __init init_ceph(void)
1159 goto out; 1159 goto out;
1160 1160
1161 ceph_flock_init(); 1161 ceph_flock_init();
1162 ceph_xattr_init();
1163 ret = register_filesystem(&ceph_fs_type); 1162 ret = register_filesystem(&ceph_fs_type);
1164 if (ret) 1163 if (ret)
1165 goto out_xattr; 1164 goto out_caches;
1166 1165
1167 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); 1166 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1168 1167
1169 return 0; 1168 return 0;
1170 1169
1171out_xattr: 1170out_caches:
1172 ceph_xattr_exit();
1173 destroy_caches(); 1171 destroy_caches();
1174out: 1172out:
1175 return ret; 1173 return ret;
@@ -1179,7 +1177,6 @@ static void __exit exit_ceph(void)
1179{ 1177{
1180 dout("exit_ceph\n"); 1178 dout("exit_ceph\n");
1181 unregister_filesystem(&ceph_fs_type); 1179 unregister_filesystem(&ceph_fs_type);
1182 ceph_xattr_exit();
1183 destroy_caches(); 1180 destroy_caches();
1184} 1181}
1185 1182
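s_time_gran is the granularity, in nanoseconds, to which the VFS truncates inode timestamps: 1000 rounded everything down to microseconds, while 1 preserves full nanosecond resolution and matches what ceph-fuse reports. A sketch of the mechanism, assuming the standard current_time() helper:

	/*
	 * current_time() truncates the clock reading to the superblock's
	 * s_time_gran before it is stored in the inode.  With
	 * s_time_gran == 1 the truncation is a no-op, so nanoseconds
	 * survive a round trip through the MDS.
	 */
	inode->i_mtime = inode->i_ctime = current_time(inode);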
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fbe6869a3f95..d2352fd95dbc 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -197,7 +197,8 @@ struct ceph_cap_snap {
197 u64 xattr_version; 197 u64 xattr_version;
198 198
199 u64 size; 199 u64 size;
200 struct timespec64 mtime, atime, ctime; 200 u64 change_attr;
201 struct timespec64 mtime, atime, ctime, btime;
201 u64 time_warp_seq; 202 u64 time_warp_seq;
202 u64 truncate_size; 203 u64 truncate_size;
203 u32 truncate_seq; 204 u32 truncate_seq;
@@ -384,6 +385,8 @@ struct ceph_inode_info {
384 int i_snap_realm_counter; /* snap realm (if caps) */ 385 int i_snap_realm_counter; /* snap realm (if caps) */
385 struct list_head i_snap_realm_item; 386 struct list_head i_snap_realm_item;
386 struct list_head i_snap_flush_item; 387 struct list_head i_snap_flush_item;
388 struct timespec64 i_btime;
389 struct timespec64 i_snap_btime;
387 390
388 struct work_struct i_work; 391 struct work_struct i_work;
389 unsigned long i_work_mask; 392 unsigned long i_work_mask;
@@ -544,7 +547,12 @@ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
544 long long release_count, 547 long long release_count,
545 long long ordered_count) 548 long long ordered_count)
546{ 549{
547 smp_mb__before_atomic(); 550 /*
551 * Makes sure operations that setup readdir cache (update page
552 * cache and i_size) are strongly ordered w.r.t. the following
553 * atomic64_set() operations.
554 */
555 smp_mb();
548 atomic64_set(&ci->i_complete_seq[0], release_count); 556 atomic64_set(&ci->i_complete_seq[0], release_count);
549 atomic64_set(&ci->i_complete_seq[1], ordered_count); 557 atomic64_set(&ci->i_complete_seq[1], ordered_count);
550} 558}
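The replacement of smp_mb__before_atomic() is the substance of this hunk: atomic64_set() is a plain store, and smp_mb__before_atomic() only orders against atomic read-modify-write operations, so the old barrier was too weak here. A generic sketch of the publish/consume pairing this relies on (the reader side illustrates the usual barrier-pairing rule and is not code from this patch; the helper names are hypothetical):

	/* writer: publish the data, then the sequence number */
	update_readdir_cache(ci);	/* hypothetical data update */
	smp_mb();			/* order the data before the seq store */
	atomic64_set(&ci->i_complete_seq[0], release_count);

	/* reader: check the sequence number, then consume the data */
	if (atomic64_read(&ci->i_complete_seq[0]) == expected) {
		smp_rmb();			/* pairs with the writer's smp_mb() */
		consume_readdir_cache(ci);	/* hypothetical */
	}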
@@ -876,9 +884,8 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
876extern const struct inode_operations ceph_file_iops; 884extern const struct inode_operations ceph_file_iops;
877 885
878extern struct inode *ceph_alloc_inode(struct super_block *sb); 886extern struct inode *ceph_alloc_inode(struct super_block *sb);
879extern void ceph_destroy_inode(struct inode *inode); 887extern void ceph_evict_inode(struct inode *inode);
880extern void ceph_free_inode(struct inode *inode); 888extern void ceph_free_inode(struct inode *inode);
881extern int ceph_drop_inode(struct inode *inode);
882 889
883extern struct inode *ceph_get_inode(struct super_block *sb, 890extern struct inode *ceph_get_inode(struct super_block *sb,
884 struct ceph_vino vino); 891 struct ceph_vino vino);
@@ -921,10 +928,20 @@ ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
921extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); 928extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
922extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); 929extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
923extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); 930extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
924extern void __init ceph_xattr_init(void);
925extern void ceph_xattr_exit(void);
926extern const struct xattr_handler *ceph_xattr_handlers[]; 931extern const struct xattr_handler *ceph_xattr_handlers[];
927 932
933struct ceph_acl_sec_ctx {
934#ifdef CONFIG_CEPH_FS_POSIX_ACL
935 void *default_acl;
936 void *acl;
937#endif
938#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
939 void *sec_ctx;
940 u32 sec_ctxlen;
941#endif
942 struct ceph_pagelist *pagelist;
943};
944
928#ifdef CONFIG_SECURITY 945#ifdef CONFIG_SECURITY
929extern bool ceph_security_xattr_deadlock(struct inode *in); 946extern bool ceph_security_xattr_deadlock(struct inode *in);
930extern bool ceph_security_xattr_wanted(struct inode *in); 947extern bool ceph_security_xattr_wanted(struct inode *in);
@@ -939,21 +956,32 @@ static inline bool ceph_security_xattr_wanted(struct inode *in)
939} 956}
940#endif 957#endif
941 958
942/* acl.c */ 959#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
943struct ceph_acls_info { 960extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
944 void *default_acl; 961 struct ceph_acl_sec_ctx *ctx);
945 void *acl; 962extern void ceph_security_invalidate_secctx(struct inode *inode);
946 struct ceph_pagelist *pagelist; 963#else
947}; 964static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
965 struct ceph_acl_sec_ctx *ctx)
966{
967 return 0;
968}
969static inline void ceph_security_invalidate_secctx(struct inode *inode)
970{
971}
972#endif
973
974void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx);
948 975
976/* acl.c */
949#ifdef CONFIG_CEPH_FS_POSIX_ACL 977#ifdef CONFIG_CEPH_FS_POSIX_ACL
950 978
951struct posix_acl *ceph_get_acl(struct inode *, int); 979struct posix_acl *ceph_get_acl(struct inode *, int);
952int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); 980int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
953int ceph_pre_init_acls(struct inode *dir, umode_t *mode, 981int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
954 struct ceph_acls_info *info); 982 struct ceph_acl_sec_ctx *as_ctx);
955void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info); 983void ceph_init_inode_acls(struct inode *inode,
956void ceph_release_acls_info(struct ceph_acls_info *info); 984 struct ceph_acl_sec_ctx *as_ctx);
957 985
958static inline void ceph_forget_all_cached_acls(struct inode *inode) 986static inline void ceph_forget_all_cached_acls(struct inode *inode)
959{ 987{
@@ -966,15 +994,12 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode)
966#define ceph_set_acl NULL 994#define ceph_set_acl NULL
967 995
968static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode, 996static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
969 struct ceph_acls_info *info) 997 struct ceph_acl_sec_ctx *as_ctx)
970{ 998{
971 return 0; 999 return 0;
972} 1000}
973static inline void ceph_init_inode_acls(struct inode *inode, 1001static inline void ceph_init_inode_acls(struct inode *inode,
974 struct ceph_acls_info *info) 1002 struct ceph_acl_sec_ctx *as_ctx)
975{
976}
977static inline void ceph_release_acls_info(struct ceph_acls_info *info)
978{ 1003{
979} 1004}
980static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) 1005static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
@@ -1000,7 +1025,7 @@ extern void ceph_add_cap(struct inode *inode,
1000 unsigned cap, unsigned seq, u64 realmino, int flags, 1025 unsigned cap, unsigned seq, u64 realmino, int flags,
1001 struct ceph_cap **new_cap); 1026 struct ceph_cap **new_cap);
1002extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); 1027extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
1003extern void __ceph_remove_caps(struct inode* inode); 1028extern void __ceph_remove_caps(struct ceph_inode_info *ci);
1004extern void ceph_put_cap(struct ceph_mds_client *mdsc, 1029extern void ceph_put_cap(struct ceph_mds_client *mdsc,
1005 struct ceph_cap *cap); 1030 struct ceph_cap *cap);
1006extern int ceph_is_any_caps(struct inode *inode); 1031extern int ceph_is_any_caps(struct inode *inode);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0cc42c8879e9..37b458a9af3a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -8,6 +8,7 @@
8#include <linux/ceph/decode.h> 8#include <linux/ceph/decode.h>
9 9
10#include <linux/xattr.h> 10#include <linux/xattr.h>
11#include <linux/security.h>
11#include <linux/posix_acl_xattr.h> 12#include <linux/posix_acl_xattr.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13 14
@@ -17,26 +18,9 @@
17static int __remove_xattr(struct ceph_inode_info *ci, 18static int __remove_xattr(struct ceph_inode_info *ci,
18 struct ceph_inode_xattr *xattr); 19 struct ceph_inode_xattr *xattr);
19 20
20static const struct xattr_handler ceph_other_xattr_handler;
21
22/*
23 * List of handlers for synthetic system.* attributes. Other
24 * attributes are handled directly.
25 */
26const struct xattr_handler *ceph_xattr_handlers[] = {
27#ifdef CONFIG_CEPH_FS_POSIX_ACL
28 &posix_acl_access_xattr_handler,
29 &posix_acl_default_xattr_handler,
30#endif
31 &ceph_other_xattr_handler,
32 NULL,
33};
34
35static bool ceph_is_valid_xattr(const char *name) 21static bool ceph_is_valid_xattr(const char *name)
36{ 22{
37 return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || 23 return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
38 !strncmp(name, XATTR_SECURITY_PREFIX,
39 XATTR_SECURITY_PREFIX_LEN) ||
40 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 24 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
41 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 25 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
42} 26}
@@ -48,8 +32,8 @@ static bool ceph_is_valid_xattr(const char *name)
48struct ceph_vxattr { 32struct ceph_vxattr {
49 char *name; 33 char *name;
50 size_t name_size; /* strlen(name) + 1 (for '\0') */ 34 size_t name_size; /* strlen(name) + 1 (for '\0') */
51 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, 35 ssize_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
52 size_t size); 36 size_t size);
53 bool (*exists_cb)(struct ceph_inode_info *ci); 37 bool (*exists_cb)(struct ceph_inode_info *ci);
54 unsigned int flags; 38 unsigned int flags;
55}; 39};
@@ -68,8 +52,8 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
68 rcu_dereference_raw(fl->pool_ns) != NULL); 52 rcu_dereference_raw(fl->pool_ns) != NULL);
69} 53}
70 54
71static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, 55static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
72 size_t size) 56 size_t size)
73{ 57{
74 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); 58 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
75 struct ceph_osd_client *osdc = &fsc->client->osdc; 59 struct ceph_osd_client *osdc = &fsc->client->osdc;
@@ -79,7 +63,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
79 const char *ns_field = " pool_namespace="; 63 const char *ns_field = " pool_namespace=";
80 char buf[128]; 64 char buf[128];
81 size_t len, total_len = 0; 65 size_t len, total_len = 0;
82 int ret; 66 ssize_t ret;
83 67
84 pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); 68 pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
85 69
@@ -96,18 +80,15 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
96 len = snprintf(buf, sizeof(buf), 80 len = snprintf(buf, sizeof(buf),
97 "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld", 81 "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
98 ci->i_layout.stripe_unit, ci->i_layout.stripe_count, 82 ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
99 ci->i_layout.object_size, (unsigned long long)pool); 83 ci->i_layout.object_size, pool);
100 total_len = len; 84 total_len = len;
101 } 85 }
102 86
103 if (pool_ns) 87 if (pool_ns)
104 total_len += strlen(ns_field) + pool_ns->len; 88 total_len += strlen(ns_field) + pool_ns->len;
105 89
106 if (!size) { 90 ret = total_len;
107 ret = total_len; 91 if (size >= total_len) {
108 } else if (total_len > size) {
109 ret = -ERANGE;
110 } else {
111 memcpy(val, buf, len); 92 memcpy(val, buf, len);
112 ret = len; 93 ret = len;
113 if (pool_name) { 94 if (pool_name) {
@@ -128,28 +109,55 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
128 return ret; 109 return ret;
129} 110}
130 111
131static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, 112/*
132 char *val, size_t size) 113 * The convention with strings in xattrs is that they should not be NULL
114 * terminated, since we're returning the length with them. snprintf always
115 * NULL terminates however, so call it on a temporary buffer and then memcpy
116 * the result into place.
117 */
118static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
133{ 119{
134 return snprintf(val, size, "%u", ci->i_layout.stripe_unit); 120 int ret;
121 va_list args;
122 char buf[96]; /* NB: reevaluate size if new vxattrs are added */
123
124 va_start(args, fmt);
125 ret = vsnprintf(buf, size ? sizeof(buf) : 0, fmt, args);
126 va_end(args);
127
128 /* Sanity check */
129 if (size && ret + 1 > sizeof(buf)) {
130 WARN_ONCE(true, "Returned length too big (%d)", ret);
131 return -E2BIG;
132 }
133
134 if (ret <= size)
135 memcpy(val, buf, ret);
136 return ret;
135} 137}
136 138
137static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, 139static ssize_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
138 char *val, size_t size) 140 char *val, size_t size)
139{ 141{
140 return snprintf(val, size, "%u", ci->i_layout.stripe_count); 142 return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_unit);
143}
144
145static ssize_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
146 char *val, size_t size)
147{
148 return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_count);
141} 149}
142 150
143static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, 151static ssize_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
144 char *val, size_t size) 152 char *val, size_t size)
145{ 153{
146 return snprintf(val, size, "%u", ci->i_layout.object_size); 154 return ceph_fmt_xattr(val, size, "%u", ci->i_layout.object_size);
147} 155}
148 156
149static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, 157static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
150 char *val, size_t size) 158 char *val, size_t size)
151{ 159{
152 int ret; 160 ssize_t ret;
153 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); 161 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
154 struct ceph_osd_client *osdc = &fsc->client->osdc; 162 struct ceph_osd_client *osdc = &fsc->client->osdc;
155 s64 pool = ci->i_layout.pool_id; 163 s64 pool = ci->i_layout.pool_id;
@@ -157,21 +165,27 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
157 165
158 down_read(&osdc->lock); 166 down_read(&osdc->lock);
159 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 167 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
160 if (pool_name) 168 if (pool_name) {
161 ret = snprintf(val, size, "%s", pool_name); 169 ret = strlen(pool_name);
162 else 170 if (ret <= size)
163 ret = snprintf(val, size, "%lld", (unsigned long long)pool); 171 memcpy(val, pool_name, ret);
172 } else {
173 ret = ceph_fmt_xattr(val, size, "%lld", pool);
174 }
164 up_read(&osdc->lock); 175 up_read(&osdc->lock);
165 return ret; 176 return ret;
166} 177}
167 178
168static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci, 179static ssize_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
169 char *val, size_t size) 180 char *val, size_t size)
170{ 181{
171 int ret = 0; 182 ssize_t ret = 0;
172 struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns); 183 struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
184
173 if (ns) { 185 if (ns) {
174 ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str); 186 ret = ns->len;
187 if (ret <= size)
188 memcpy(val, ns->str, ret);
175 ceph_put_string(ns); 189 ceph_put_string(ns);
176 } 190 }
177 return ret; 191 return ret;
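The new convention in these callbacks: always return the length the value needs, copy only when the caller's buffer is large enough, and let __ceph_getxattr() map a short buffer to -ERANGE. That is exactly the getxattr(2) contract, so the usual probe-then-fetch pattern works from userspace (a generic sketch, not part of the patch):

	#include <stdlib.h>
	#include <sys/xattr.h>	/* getxattr(2) */

	/* probe with size 0 to learn the value length, then fetch it */
	static char *read_xattr(const char *path, const char *name, ssize_t *lenp)
	{
		ssize_t len = getxattr(path, name, NULL, 0);
		char *buf;

		if (len < 0)
			return NULL;
		buf = malloc(len + 1);
		if (!buf)
			return NULL;
		len = getxattr(path, name, buf, len);	/* -ERANGE if it grew */
		if (len < 0) {
			free(buf);
			return NULL;
		}
		buf[len] = '\0';	/* ceph vxattrs are not NUL-terminated */
		*lenp = len;
		return buf;
	}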
@@ -179,53 +193,54 @@ static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
179 193
180/* directories */ 194/* directories */
181 195
182static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, 196static ssize_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
183 size_t size) 197 size_t size)
184{ 198{
185 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); 199 return ceph_fmt_xattr(val, size, "%lld", ci->i_files + ci->i_subdirs);
186} 200}
187 201
188static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, 202static ssize_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
189 size_t size) 203 size_t size)
190{ 204{
191 return snprintf(val, size, "%lld", ci->i_files); 205 return ceph_fmt_xattr(val, size, "%lld", ci->i_files);
192} 206}
193 207
194static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val, 208static ssize_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
195 size_t size) 209 size_t size)
196{ 210{
197 return snprintf(val, size, "%lld", ci->i_subdirs); 211 return ceph_fmt_xattr(val, size, "%lld", ci->i_subdirs);
198} 212}
199 213
200static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, 214static ssize_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
201 size_t size) 215 size_t size)
202{ 216{
203 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); 217 return ceph_fmt_xattr(val, size, "%lld",
218 ci->i_rfiles + ci->i_rsubdirs);
204} 219}
205 220
206static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, 221static ssize_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
207 size_t size) 222 size_t size)
208{ 223{
209 return snprintf(val, size, "%lld", ci->i_rfiles); 224 return ceph_fmt_xattr(val, size, "%lld", ci->i_rfiles);
210} 225}
211 226
212static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, 227static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
213 size_t size) 228 size_t size)
214{ 229{
215 return snprintf(val, size, "%lld", ci->i_rsubdirs); 230 return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs);
216} 231}
217 232
218static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, 233static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
219 size_t size) 234 size_t size)
220{ 235{
221 return snprintf(val, size, "%lld", ci->i_rbytes); 236 return ceph_fmt_xattr(val, size, "%lld", ci->i_rbytes);
222} 237}
223 238
224static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, 239static ssize_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
225 size_t size) 240 size_t size)
226{ 241{
227 return snprintf(val, size, "%lld.09%ld", ci->i_rctime.tv_sec, 242 return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_rctime.tv_sec,
228 ci->i_rctime.tv_nsec); 243 ci->i_rctime.tv_nsec);
229} 244}
230 245
231/* dir pin */ 246/* dir pin */
@@ -234,10 +249,10 @@ static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci)
234 return ci->i_dir_pin != -ENODATA; 249 return ci->i_dir_pin != -ENODATA;
235} 250}
236 251
237static size_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val, 252static ssize_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val,
238 size_t size) 253 size_t size)
239{ 254{
240 return snprintf(val, size, "%d", (int)ci->i_dir_pin); 255 return ceph_fmt_xattr(val, size, "%d", (int)ci->i_dir_pin);
241} 256}
242 257
243/* quotas */ 258/* quotas */
@@ -254,23 +269,36 @@ static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
254 return ret; 269 return ret;
255} 270}
256 271
257static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val, 272static ssize_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val,
258 size_t size) 273 size_t size)
274{
275 return ceph_fmt_xattr(val, size, "max_bytes=%llu max_files=%llu",
276 ci->i_max_bytes, ci->i_max_files);
277}
278
279static ssize_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci,
280 char *val, size_t size)
259{ 281{
260 return snprintf(val, size, "max_bytes=%llu max_files=%llu", 282 return ceph_fmt_xattr(val, size, "%llu", ci->i_max_bytes);
261 ci->i_max_bytes, ci->i_max_files);
262} 283}
263 284
264static size_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci, 285static ssize_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
265 char *val, size_t size) 286 char *val, size_t size)
266{ 287{
267 return snprintf(val, size, "%llu", ci->i_max_bytes); 288 return ceph_fmt_xattr(val, size, "%llu", ci->i_max_files);
268} 289}
269 290
270static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci, 291/* snapshots */
271 char *val, size_t size) 292static bool ceph_vxattrcb_snap_btime_exists(struct ceph_inode_info *ci)
272{ 293{
273 return snprintf(val, size, "%llu", ci->i_max_files); 294 return (ci->i_snap_btime.tv_sec != 0 || ci->i_snap_btime.tv_nsec != 0);
295}
296
297static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
298 size_t size)
299{
300 return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_snap_btime.tv_sec,
301 ci->i_snap_btime.tv_nsec);
274} 302}
275 303
276#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name 304#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
@@ -327,7 +355,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
327 XATTR_RSTAT_FIELD(dir, rctime), 355 XATTR_RSTAT_FIELD(dir, rctime),
328 { 356 {
329 .name = "ceph.dir.pin", 357 .name = "ceph.dir.pin",
330 .name_size = sizeof("ceph.dir_pin"), 358 .name_size = sizeof("ceph.dir.pin"),
331 .getxattr_cb = ceph_vxattrcb_dir_pin, 359 .getxattr_cb = ceph_vxattrcb_dir_pin,
332 .exists_cb = ceph_vxattrcb_dir_pin_exists, 360 .exists_cb = ceph_vxattrcb_dir_pin_exists,
333 .flags = VXATTR_FLAG_HIDDEN, 361 .flags = VXATTR_FLAG_HIDDEN,
@@ -341,9 +369,15 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
341 }, 369 },
342 XATTR_QUOTA_FIELD(quota, max_bytes), 370 XATTR_QUOTA_FIELD(quota, max_bytes),
343 XATTR_QUOTA_FIELD(quota, max_files), 371 XATTR_QUOTA_FIELD(quota, max_files),
372 {
373 .name = "ceph.snap.btime",
374 .name_size = sizeof("ceph.snap.btime"),
375 .getxattr_cb = ceph_vxattrcb_snap_btime,
376 .exists_cb = ceph_vxattrcb_snap_btime_exists,
377 .flags = VXATTR_FLAG_READONLY,
378 },
344 { .name = NULL, 0 } /* Required table terminator */ 379 { .name = NULL, 0 } /* Required table terminator */
345}; 380};
346static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
347 381
348/* files */ 382/* files */
349 383
@@ -360,9 +394,15 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
360 XATTR_LAYOUT_FIELD(file, layout, object_size), 394 XATTR_LAYOUT_FIELD(file, layout, object_size),
361 XATTR_LAYOUT_FIELD(file, layout, pool), 395 XATTR_LAYOUT_FIELD(file, layout, pool),
362 XATTR_LAYOUT_FIELD(file, layout, pool_namespace), 396 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
397 {
398 .name = "ceph.snap.btime",
399 .name_size = sizeof("ceph.snap.btime"),
400 .getxattr_cb = ceph_vxattrcb_snap_btime,
401 .exists_cb = ceph_vxattrcb_snap_btime_exists,
402 .flags = VXATTR_FLAG_READONLY,
403 },
363 { .name = NULL, 0 } /* Required table terminator */ 404 { .name = NULL, 0 } /* Required table terminator */
364}; 405};
365static size_t ceph_file_vxattrs_name_size; /* total size of all names */
366 406
367static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) 407static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
368{ 408{
@@ -373,47 +413,6 @@ static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
373 return NULL; 413 return NULL;
374} 414}
375 415
376static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
377{
378 if (vxattrs == ceph_dir_vxattrs)
379 return ceph_dir_vxattrs_name_size;
380 if (vxattrs == ceph_file_vxattrs)
381 return ceph_file_vxattrs_name_size;
382 BUG_ON(vxattrs);
383 return 0;
384}
385
386/*
387 * Compute the aggregate size (including terminating '\0') of all
388 * virtual extended attribute names in the given vxattr table.
389 */
390static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
391{
392 struct ceph_vxattr *vxattr;
393 size_t size = 0;
394
395 for (vxattr = vxattrs; vxattr->name; vxattr++) {
396 if (!(vxattr->flags & VXATTR_FLAG_HIDDEN))
397 size += vxattr->name_size;
398 }
399
400 return size;
401}
402
403/* Routines called at initialization and exit time */
404
405void __init ceph_xattr_init(void)
406{
407 ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
408 ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
409}
410
411void ceph_xattr_exit(void)
412{
413 ceph_dir_vxattrs_name_size = 0;
414 ceph_file_vxattrs_name_size = 0;
415}
416
417static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, 416static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
418 const char *name) 417 const char *name)
419{ 418{
@@ -523,8 +522,8 @@ static int __set_xattr(struct ceph_inode_info *ci,
523 dout("__set_xattr_val p=%p\n", p); 522 dout("__set_xattr_val p=%p\n", p);
524 } 523 }
525 524
526 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n", 525 dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n",
527 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val); 526 ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val);
528 527
529 return 0; 528 return 0;
530} 529}
@@ -823,7 +822,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
823 struct ceph_inode_xattr *xattr; 822 struct ceph_inode_xattr *xattr;
824 struct ceph_vxattr *vxattr = NULL; 823 struct ceph_vxattr *vxattr = NULL;
825 int req_mask; 824 int req_mask;
826 int err; 825 ssize_t err;
827 826
828 /* let's see if a virtual xattr was requested */ 827 /* let's see if a virtual xattr was requested */
829 vxattr = ceph_match_vxattr(inode, name); 828 vxattr = ceph_match_vxattr(inode, name);
@@ -835,8 +834,11 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
835 if (err) 834 if (err)
836 return err; 835 return err;
837 err = -ENODATA; 836 err = -ENODATA;
838 if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) 837 if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
839 err = vxattr->getxattr_cb(ci, value, size); 838 err = vxattr->getxattr_cb(ci, value, size);
839 if (size && size < err)
840 err = -ERANGE;
841 }
840 return err; 842 return err;
841 } 843 }
842 844
@@ -897,10 +899,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
897 struct inode *inode = d_inode(dentry); 899 struct inode *inode = d_inode(dentry);
898 struct ceph_inode_info *ci = ceph_inode(inode); 900 struct ceph_inode_info *ci = ceph_inode(inode);
899 struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); 901 struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
900 u32 vir_namelen = 0; 902 bool len_only = (size == 0);
901 u32 namelen; 903 u32 namelen;
902 int err; 904 int err;
903 u32 len;
904 int i; 905 int i;
905 906
906 spin_lock(&ci->i_ceph_lock); 907 spin_lock(&ci->i_ceph_lock);
@@ -919,38 +920,45 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
919 err = __build_xattrs(inode); 920 err = __build_xattrs(inode);
920 if (err < 0) 921 if (err < 0)
921 goto out; 922 goto out;
922 /*
923 * Start with virtual dir xattr names (if any) (including
924 * terminating '\0' characters for each).
925 */
926 vir_namelen = ceph_vxattrs_name_size(vxattrs);
927 923
928 /* adding 1 byte per each variable due to the null termination */ 924 /* add 1 byte for each xattr due to the null termination */
929 namelen = ci->i_xattrs.names_size + ci->i_xattrs.count; 925 namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
930 err = -ERANGE; 926 if (!len_only) {
931 if (size && vir_namelen + namelen > size) 927 if (namelen > size) {
932 goto out; 928 err = -ERANGE;
933 929 goto out;
934 err = namelen + vir_namelen; 930 }
935 if (size == 0) 931 names = __copy_xattr_names(ci, names);
936 goto out; 932 size -= namelen;
933 }
937 934
938 names = __copy_xattr_names(ci, names);
939 935
940 /* virtual xattr names, too */ 936 /* virtual xattr names, too */
941 err = namelen;
942 if (vxattrs) { 937 if (vxattrs) {
943 for (i = 0; vxattrs[i].name; i++) { 938 for (i = 0; vxattrs[i].name; i++) {
944 if (!(vxattrs[i].flags & VXATTR_FLAG_HIDDEN) && 939 size_t this_len;
945 !(vxattrs[i].exists_cb && 940
946 !vxattrs[i].exists_cb(ci))) { 941 if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN)
947 len = sprintf(names, "%s", vxattrs[i].name); 942 continue;
948 names += len + 1; 943 if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci))
949 err += len + 1; 944 continue;
945
946 this_len = strlen(vxattrs[i].name) + 1;
947 namelen += this_len;
948 if (len_only)
949 continue;
950
951 if (this_len > size) {
952 err = -ERANGE;
953 goto out;
950 } 954 }
955
956 memcpy(names, vxattrs[i].name, this_len);
957 names += this_len;
958 size -= this_len;
951 } 959 }
952 } 960 }
953 961 err = namelen;
954out: 962out:
955 spin_unlock(&ci->i_ceph_lock); 963 spin_unlock(&ci->i_ceph_lock);
956 return err; 964 return err;
@@ -1206,4 +1214,138 @@ bool ceph_security_xattr_deadlock(struct inode *in)
1206 spin_unlock(&ci->i_ceph_lock); 1214 spin_unlock(&ci->i_ceph_lock);
1207 return ret; 1215 return ret;
1208} 1216}
1217
1218#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
1219int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
1220 struct ceph_acl_sec_ctx *as_ctx)
1221{
1222 struct ceph_pagelist *pagelist = as_ctx->pagelist;
1223 const char *name;
1224 size_t name_len;
1225 int err;
1226
1227 err = security_dentry_init_security(dentry, mode, &dentry->d_name,
1228 &as_ctx->sec_ctx,
1229 &as_ctx->sec_ctxlen);
1230 if (err < 0) {
1231 WARN_ON_ONCE(err != -EOPNOTSUPP);
1232 err = 0; /* do nothing */
1233 goto out;
1234 }
1235
1236 err = -ENOMEM;
1237 if (!pagelist) {
1238 pagelist = ceph_pagelist_alloc(GFP_KERNEL);
1239 if (!pagelist)
1240 goto out;
1241 err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
1242 if (err)
1243 goto out;
1244 ceph_pagelist_encode_32(pagelist, 1);
1245 }
1246
1247 /*
1248 * FIXME: Make security_dentry_init_security() generic. Currently
1249 * it only supports a single security module, and only SELinux
1250 * provides a dentry_init_security hook.
1251 */
1252 name = XATTR_NAME_SELINUX;
1253 name_len = strlen(name);
1254 err = ceph_pagelist_reserve(pagelist,
1255 4 * 2 + name_len + as_ctx->sec_ctxlen);
1256 if (err)
1257 goto out;
1258
1259 if (as_ctx->pagelist) {
1260 /* update count of KV pairs */
1261 BUG_ON(pagelist->length <= sizeof(__le32));
1262 if (list_is_singular(&pagelist->head)) {
1263 le32_add_cpu((__le32*)pagelist->mapped_tail, 1);
1264 } else {
1265 struct page *page = list_first_entry(&pagelist->head,
1266 struct page, lru);
1267 void *addr = kmap_atomic(page);
1268 le32_add_cpu((__le32*)addr, 1);
1269 kunmap_atomic(addr);
1270 }
1271 } else {
1272 as_ctx->pagelist = pagelist;
1273 }
1274
1275 ceph_pagelist_encode_32(pagelist, name_len);
1276 ceph_pagelist_append(pagelist, name, name_len);
1277
1278 ceph_pagelist_encode_32(pagelist, as_ctx->sec_ctxlen);
1279 ceph_pagelist_append(pagelist, as_ctx->sec_ctx, as_ctx->sec_ctxlen);
1280
1281 err = 0;
1282out:
1283 if (pagelist && !as_ctx->pagelist)
1284 ceph_pagelist_release(pagelist);
1285 return err;
1286}
1287
1288void ceph_security_invalidate_secctx(struct inode *inode)
1289{
1290 security_inode_invalidate_secctx(inode);
1291}
1292
1293static int ceph_xattr_set_security_label(const struct xattr_handler *handler,
1294 struct dentry *unused, struct inode *inode,
1295 const char *key, const void *buf,
1296 size_t buflen, int flags)
1297{
1298 if (security_ismaclabel(key)) {
1299 const char *name = xattr_full_name(handler, key);
1300 return __ceph_setxattr(inode, name, buf, buflen, flags);
1301 }
1302 return -EOPNOTSUPP;
1303}
1304
1305static int ceph_xattr_get_security_label(const struct xattr_handler *handler,
1306 struct dentry *unused, struct inode *inode,
1307 const char *key, void *buf, size_t buflen)
1308{
1309 if (security_ismaclabel(key)) {
1310 const char *name = xattr_full_name(handler, key);
1311 return __ceph_getxattr(inode, name, buf, buflen);
1312 }
1313 return -EOPNOTSUPP;
1314}
1315
1316static const struct xattr_handler ceph_security_label_handler = {
1317 .prefix = XATTR_SECURITY_PREFIX,
1318 .get = ceph_xattr_get_security_label,
1319 .set = ceph_xattr_set_security_label,
1320};
1321#endif
1209#endif 1322#endif
1323
1324void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
1325{
1326#ifdef CONFIG_CEPH_FS_POSIX_ACL
1327 posix_acl_release(as_ctx->acl);
1328 posix_acl_release(as_ctx->default_acl);
1329#endif
1330#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
1331 security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen);
1332#endif
1333 if (as_ctx->pagelist)
1334 ceph_pagelist_release(as_ctx->pagelist);
1335}
1336
1337/*
1338 * List of handlers for synthetic system.* attributes. Other
1339 * attributes are handled directly.
1340 */
1341const struct xattr_handler *ceph_xattr_handlers[] = {
1342#ifdef CONFIG_CEPH_FS_POSIX_ACL
1343 &posix_acl_access_xattr_handler,
1344 &posix_acl_default_xattr_handler,
1345#endif
1346#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
1347 &ceph_security_label_handler,
1348#endif
1349 &ceph_other_xattr_handler,
1350 NULL,
1351};
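Putting the new vxattr together: ceph.snap.btime is read-only, is listed only once the MDS has reported a nonzero snapshot birth time, and is formatted as sec.nsec like ceph.dir.rctime. Using the read_xattr() helper sketched earlier (the mount path and snapshot name are hypothetical):

	ssize_t len;
	char *btime = read_xattr("/mnt/cephfs/.snap/mysnap",
				 "ceph.snap.btime", &len);

	if (btime) {
		printf("snapshot created at %s\n", btime);	/* e.g. "1563464400.000000000" */
		free(btime);
	}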
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 65a38c4a02a1..39e6f4c57580 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -211,6 +211,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
211 CEPH_FEATURE_MON_STATEFUL_SUB | \ 211 CEPH_FEATURE_MON_STATEFUL_SUB | \
212 CEPH_FEATURE_CRUSH_TUNABLES5 | \ 212 CEPH_FEATURE_CRUSH_TUNABLES5 | \
213 CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \ 213 CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \
214 CEPH_FEATURE_MSG_ADDR2 | \
214 CEPH_FEATURE_CEPHX_V2) 215 CEPH_FEATURE_CEPHX_V2)
215 216
216#define CEPH_FEATURES_REQUIRED_DEFAULT 0 217#define CEPH_FEATURES_REQUIRED_DEFAULT 0
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 3ac0feaf2b5e..cb21c5cf12c3 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -682,7 +682,7 @@ extern const char *ceph_cap_op_name(int op);
682/* flags field in client cap messages (version >= 10) */ 682/* flags field in client cap messages (version >= 10) */
683#define CEPH_CLIENT_CAPS_SYNC (1<<0) 683#define CEPH_CLIENT_CAPS_SYNC (1<<0)
684#define CEPH_CLIENT_CAPS_NO_CAPSNAP (1<<1) 684#define CEPH_CLIENT_CAPS_NO_CAPSNAP (1<<1)
685#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP (1<<2); 685#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP (1<<2)
686 686
687/* 687/*
688 * caps message, used for capability callbacks, acks, requests, etc. 688 * caps message, used for capability callbacks, acks, requests, etc.
diff --git a/include/linux/ceph/cls_lock_client.h b/include/linux/ceph/cls_lock_client.h
index bea6c77d2093..17bc7584d1fe 100644
--- a/include/linux/ceph/cls_lock_client.h
+++ b/include/linux/ceph/cls_lock_client.h
@@ -52,4 +52,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
52 char *lock_name, u8 *type, char **tag, 52 char *lock_name, u8 *type, char **tag,
53 struct ceph_locker **lockers, u32 *num_lockers); 53 struct ceph_locker **lockers, u32 *num_lockers);
54 54
55int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
56 char *lock_name, u8 type, char *cookie, char *tag);
57
55#endif 58#endif
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index a6c2a48d42e0..450384fe487c 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -218,18 +218,27 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv,
218/* 218/*
219 * sockaddr_storage <-> ceph_sockaddr 219 * sockaddr_storage <-> ceph_sockaddr
220 */ 220 */
221static inline void ceph_encode_addr(struct ceph_entity_addr *a) 221#define CEPH_ENTITY_ADDR_TYPE_NONE 0
222#define CEPH_ENTITY_ADDR_TYPE_LEGACY __cpu_to_le32(1)
223
224static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a)
222{ 225{
223 __be16 ss_family = htons(a->in_addr.ss_family); 226 __be16 ss_family = htons(a->in_addr.ss_family);
224 a->in_addr.ss_family = *(__u16 *)&ss_family; 227 a->in_addr.ss_family = *(__u16 *)&ss_family;
228
229 /* Banner addresses require TYPE_NONE */
230 a->type = CEPH_ENTITY_ADDR_TYPE_NONE;
225} 231}
226static inline void ceph_decode_addr(struct ceph_entity_addr *a) 232static inline void ceph_decode_banner_addr(struct ceph_entity_addr *a)
227{ 233{
228 __be16 ss_family = *(__be16 *)&a->in_addr.ss_family; 234 __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
229 a->in_addr.ss_family = ntohs(ss_family); 235 a->in_addr.ss_family = ntohs(ss_family);
230 WARN_ON(a->in_addr.ss_family == 512); 236 WARN_ON(a->in_addr.ss_family == 512);
237 a->type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
231} 238}
232 239
240extern int ceph_decode_entity_addr(void **p, void *end,
241 struct ceph_entity_addr *addr);
233/* 242/*
234 * encoders 243 * encoders
235 */ 244 */
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 337d5049ff93..82156da3c650 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -84,11 +84,13 @@ struct ceph_options {
84#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) 84#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
85 85
86/* 86/*
87 * Handle the largest possible rbd object in one message. 87 * The largest possible rbd data object is 32M.
88 * The largest possible rbd object map object is 64M.
89 *
88 * There is no limit on the size of cephfs objects, but it has to obey 90 * There is no limit on the size of cephfs objects, but it has to obey
89 * rsize and wsize mount options anyway. 91 * rsize and wsize mount options anyway.
90 */ 92 */
91#define CEPH_MSG_MAX_DATA_LEN (32*1024*1024) 93#define CEPH_MSG_MAX_DATA_LEN (64*1024*1024)
92 94
93#define CEPH_AUTH_NAME_DEFAULT "guest" 95#define CEPH_AUTH_NAME_DEFAULT "guest"
94 96
@@ -299,10 +301,6 @@ int ceph_wait_for_latest_osdmap(struct ceph_client *client,
299 301
300/* pagevec.c */ 302/* pagevec.c */
301extern void ceph_release_page_vector(struct page **pages, int num_pages); 303extern void ceph_release_page_vector(struct page **pages, int num_pages);
302
303extern struct page **ceph_get_direct_page_vector(const void __user *data,
304 int num_pages,
305 bool write_page);
306extern void ceph_put_page_vector(struct page **pages, int num_pages, 304extern void ceph_put_page_vector(struct page **pages, int num_pages,
307 bool dirty); 305 bool dirty);
308extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); 306extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index 3a4688af7455..b4d134d3312a 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -104,7 +104,6 @@ struct ceph_mon_client {
104#endif 104#endif
105}; 105};
106 106
107extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
108extern int ceph_monmap_contains(struct ceph_monmap *m, 107extern int ceph_monmap_contains(struct ceph_monmap *m,
109 struct ceph_entity_addr *addr); 108 struct ceph_entity_addr *addr);
110 109
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 2294f963dab7..ad7fe5d10dcd 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -198,9 +198,9 @@ struct ceph_osd_request {
 	bool r_mempool;
 	struct completion r_completion;       /* private to osd_client.c */
 	ceph_osdc_callback_t r_callback;
-	struct list_head r_unsafe_item;
 
 	struct inode *r_inode;	              /* for use by callbacks */
+	struct list_head r_private_item;      /* ditto */
 	void *r_priv;			      /* ditto */
 
 	/* set by submitter */
@@ -389,6 +389,14 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
 void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err);
 
+#define osd_req_op_data(oreq, whch, typ, fld)				\
+({									\
+	struct ceph_osd_request *__oreq = (oreq);			\
+	unsigned int __whch = (whch);					\
+	BUG_ON(__whch >= __oreq->r_num_ops);				\
+	&__oreq->r_ops[__whch].typ.fld;					\
+})
+
 extern void osd_req_op_init(struct ceph_osd_request *osd_req,
 			    unsigned int which, u16 opcode, u32 flags);
 
@@ -497,7 +505,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 		   const char *class, const char *method,
 		   unsigned int flags,
 		   struct page *req_page, size_t req_len,
-		   struct page *resp_page, size_t *resp_len);
+		   struct page **resp_pages, size_t *resp_len);
 
 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 			       struct ceph_vino vino,
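
Moving osd_req_op_data() into the header lets rbd, which now allocates OSD requests from its own object-request state machines, reach into an individual op of a request; the BUG_ON rejects an out-of-range op index. A minimal usage sketch (the extent/osd_data fields follow the existing r_ops union; the surrounding code is illustrative):

	/* borrow the data descriptor of the extent op at index `which` */
	struct ceph_osd_data *osd_data =
			osd_req_op_data(osd_req, which, extent, osd_data);
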
diff --git a/include/linux/ceph/striper.h b/include/linux/ceph/striper.h
index cbd0d24b7148..3486636c0e6e 100644
--- a/include/linux/ceph/striper.h
+++ b/include/linux/ceph/striper.h
@@ -66,4 +66,6 @@ int ceph_extent_to_file(struct ceph_file_layout *l,
 			       struct ceph_file_extent **file_extents,
 			       u32 *num_file_extents);
 
+u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size);
+
 #endif
diff --git a/include/linux/iversion.h b/include/linux/iversion.h
index be50ef7cedab..2917ef990d43 100644
--- a/include/linux/iversion.h
+++ b/include/linux/iversion.h
@@ -113,6 +113,30 @@ inode_peek_iversion_raw(const struct inode *inode)
 }
 
 /**
+ * inode_set_max_iversion_raw - update i_version new value is larger
+ * @inode: inode to set
+ * @val: new i_version to set
+ *
+ * Some self-managed filesystems (e.g Ceph) will only update the i_version
+ * value if the new value is larger than the one we already have.
+ */
+static inline void
+inode_set_max_iversion_raw(struct inode *inode, u64 val)
+{
+	u64 cur, old;
+
+	cur = inode_peek_iversion_raw(inode);
+	for (;;) {
+		if (cur > val)
+			break;
+		old = atomic64_cmpxchg(&inode->i_version, cur, val);
+		if (likely(old == cur))
+			break;
+		cur = old;
+	}
+}
+
+/**
  * inode_set_iversion - set i_version to a particular value
  * @inode: inode to set
  * @val: new i_version value to set
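
The loop is a lock-free monotonic max: cmpxchg only installs val if i_version still holds the value we read, and on a lost race it retries with the winner's value, giving up as soon as the stored version is already larger, so concurrent callers can never move it backwards. A hedged sketch of the intended cephfs call site (the change_attr field name is an assumption):

	/* refreshing an inode from an MDS reply (sketch; field name assumed) */
	inode_set_max_iversion_raw(inode, le64_to_cpu(info->change_attr));
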
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index db09defe27d0..59d0ba2072de 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_CEPH_LIB) += libceph.o
 
 libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
-	mon_client.o \
+	mon_client.o decode.o \
 	cls_lock_client.o \
 	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
 	striper.o \
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index 4cc28541281b..17447c19d937 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -6,6 +6,7 @@
 
 #include <linux/ceph/cls_lock_client.h>
 #include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
 
 /**
  * ceph_cls_lock - grab rados lock for object
@@ -264,8 +265,11 @@ static int decode_locker(void **p, void *end, struct ceph_locker *locker)
 		return ret;
 
 	*p += sizeof(struct ceph_timespec); /* skip expiration */
-	ceph_decode_copy(p, &locker->info.addr, sizeof(locker->info.addr));
-	ceph_decode_addr(&locker->info.addr);
+
+	ret = ceph_decode_entity_addr(p, end, &locker->info.addr);
+	if (ret)
+		return ret;
+
 	len = ceph_decode_32(p);
 	*p += len; /* skip description */
 
@@ -360,7 +364,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
 	dout("%s lock_name %s\n", __func__, lock_name);
 	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info",
 			     CEPH_OSD_FLAG_READ, get_info_op_page,
-			     get_info_op_buf_size, reply_page, &reply_len);
+			     get_info_op_buf_size, &reply_page, &reply_len);
 
 	dout("%s: status %d\n", __func__, ret);
 	if (ret >= 0) {
@@ -375,3 +379,47 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
 	return ret;
 }
 EXPORT_SYMBOL(ceph_cls_lock_info);
+
+int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
+			   char *lock_name, u8 type, char *cookie, char *tag)
+{
+	int assert_op_buf_size;
+	int name_len = strlen(lock_name);
+	int cookie_len = strlen(cookie);
+	int tag_len = strlen(tag);
+	struct page **pages;
+	void *p, *end;
+	int ret;
+
+	assert_op_buf_size = name_len + sizeof(__le32) +
+			     cookie_len + sizeof(__le32) +
+			     tag_len + sizeof(__le32) +
+			     sizeof(u8) + CEPH_ENCODING_START_BLK_LEN;
+	if (assert_op_buf_size > PAGE_SIZE)
+		return -E2BIG;
+
+	ret = osd_req_op_cls_init(req, which, "lock", "assert_locked");
+	if (ret)
+		return ret;
+
+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	p = page_address(pages[0]);
+	end = p + assert_op_buf_size;
+
+	/* encode cls_lock_assert_op struct */
+	ceph_start_encoding(&p, 1, 1,
+			    assert_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+	ceph_encode_string(&p, end, lock_name, name_len);
+	ceph_encode_8(&p, type);
+	ceph_encode_string(&p, end, cookie, cookie_len);
+	ceph_encode_string(&p, end, tag, tag_len);
+	WARN_ON(p != end);
+
+	osd_req_op_cls_request_data_pages(req, which, pages, assert_op_buf_size,
+					  0, false, true);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_cls_assert_locked);
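
ceph_cls_assert_locked() stacks a "lock"/"assert_locked" class op into an already-allocated multi-op request, so the OSD fails the whole request server-side if the client no longer holds the lock. A hedged sketch of how rbd might front a write with it (the lock name and cookie handling are illustrative):

	/* op 0: server-side check that we still own the exclusive lock */
	ret = ceph_cls_assert_locked(req, 0, "rbd_lock",
				     CEPH_CLS_LOCK_EXCLUSIVE, cookie, "");
	if (ret)
		return ret;

	/* op 1: the write itself, executed only if the assert passes */
	osd_req_op_extent_init(req, 1, CEPH_OSD_OP_WRITE, off, len, 0, 0);
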
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
new file mode 100644
index 000000000000..eea529595a7a
--- /dev/null
+++ b/net/ceph/decode.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ceph/decode.h>
+
+static int
+ceph_decode_entity_addr_versioned(void **p, void *end,
+				  struct ceph_entity_addr *addr)
+{
+	int ret;
+	u8 struct_v;
+	u32 struct_len, addr_len;
+	void *struct_end;
+
+	ret = ceph_start_decoding(p, end, 1, "entity_addr_t", &struct_v,
+				  &struct_len);
+	if (ret)
+		goto bad;
+
+	ret = -EINVAL;
+	struct_end = *p + struct_len;
+
+	ceph_decode_copy_safe(p, end, &addr->type, sizeof(addr->type), bad);
+
+	ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+
+	ceph_decode_32_safe(p, end, addr_len, bad);
+	if (addr_len > sizeof(addr->in_addr))
+		goto bad;
+
+	memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+	if (addr_len) {
+		ceph_decode_copy_safe(p, end, &addr->in_addr, addr_len, bad);
+
+		addr->in_addr.ss_family =
+			le16_to_cpu((__force __le16)addr->in_addr.ss_family);
+	}
+
+	/* Advance past anything the client doesn't yet understand */
+	*p = struct_end;
+	ret = 0;
+bad:
+	return ret;
+}
+
+static int
+ceph_decode_entity_addr_legacy(void **p, void *end,
+			       struct ceph_entity_addr *addr)
+{
+	int ret = -EINVAL;
+
+	/* Skip rest of type field */
+	ceph_decode_skip_n(p, end, 3, bad);
+
+	/*
+	 * Clients that don't support ADDR2 always send TYPE_NONE, change it
+	 * to TYPE_LEGACY for forward compatibility.
+	 */
+	addr->type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
+	ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+	memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+	ceph_decode_copy_safe(p, end, &addr->in_addr,
+			      sizeof(addr->in_addr), bad);
+	addr->in_addr.ss_family =
+			be16_to_cpu((__force __be16)addr->in_addr.ss_family);
+	ret = 0;
+bad:
+	return ret;
+}
+
+int
+ceph_decode_entity_addr(void **p, void *end, struct ceph_entity_addr *addr)
+{
+	u8 marker;
+
+	ceph_decode_8_safe(p, end, marker, bad);
+	if (marker == 1)
+		return ceph_decode_entity_addr_versioned(p, end, addr);
+	else if (marker == 0)
+		return ceph_decode_entity_addr_legacy(p, end, addr);
+bad:
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addr);
+
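
ceph_decode_entity_addr() keys off the first byte: pre-ADDR2 encoders emit the old __le32 type field first, and since they always send TYPE_NONE (0) that byte is 0, while ADDR2 encoders emit a literal marker of 1 followed by a versioned struct. A hedged sketch of the two layouts this file accepts, reconstructed from the decoders above rather than from a spec:

	/*
	 * legacy (first byte 0):
	 *   __le32 type (always TYPE_NONE)
	 *   __le32 nonce
	 *   sockaddr_storage in_addr, fixed size, ss_family big-endian
	 *
	 * versioned / ADDR2 (first byte 1):
	 *   u8 struct_v, u8 struct_compat, __le32 struct_len
	 *   __le32 type, __le32 nonce
	 *   __le32 addr_len, then addr_len bytes of sockaddr,
	 *   ss_family little-endian
	 */
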
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a33402c99321..962f521c863e 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -199,12 +199,14 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
 
 	switch (ss.ss_family) {
 	case AF_INET:
-		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr,
+		snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu",
+			 le32_to_cpu(addr->type), &in4->sin_addr,
 			 ntohs(in4->sin_port));
 		break;
 
 	case AF_INET6:
-		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr,
+		snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu",
+			 le32_to_cpu(addr->type), &in6->sin6_addr,
 			 ntohs(in6->sin6_port));
 		break;
 
@@ -220,7 +222,7 @@ EXPORT_SYMBOL(ceph_pr_addr);
 static void encode_my_addr(struct ceph_messenger *msgr)
 {
 	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
-	ceph_encode_addr(&msgr->my_enc_addr);
+	ceph_encode_banner_addr(&msgr->my_enc_addr);
 }
 
 /*
@@ -1732,12 +1734,14 @@ static int read_partial_banner(struct ceph_connection *con)
 	ret = read_partial(con, end, size, &con->actual_peer_addr);
 	if (ret <= 0)
 		goto out;
+	ceph_decode_banner_addr(&con->actual_peer_addr);
 
 	size = sizeof (con->peer_addr_for_me);
 	end += size;
 	ret = read_partial(con, end, size, &con->peer_addr_for_me);
 	if (ret <= 0)
 		goto out;
+	ceph_decode_banner_addr(&con->peer_addr_for_me);
 
 out:
 	return ret;
@@ -1981,6 +1985,7 @@ int ceph_parse_ips(const char *c, const char *end,
 		}
 
 		addr_set_port(&addr[i], port);
+		addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
 
 		dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
 
@@ -2011,9 +2016,6 @@ static int process_banner(struct ceph_connection *con)
 	if (verify_hello(con) < 0)
 		return -1;
 
-	ceph_decode_addr(&con->actual_peer_addr);
-	ceph_decode_addr(&con->peer_addr_for_me);
-
 	/*
 	 * Make sure the other end is who we wanted.  note that the other
 	 * end may not yet know their ip address, so if it's 0.0.0.0, give
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 895679d3529b..0520bf9825aa 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -39,7 +39,7 @@ static int __validate_auth(struct ceph_mon_client *monc);
 /*
  * Decode a monmap blob (e.g., during mount).
  */
-struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+static struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 {
 	struct ceph_monmap *m = NULL;
 	int i, err = -EINVAL;
@@ -50,7 +50,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	ceph_decode_32_safe(&p, end, len, bad);
 	ceph_decode_need(&p, end, len, bad);
 
-	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+	dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p));
 	p += sizeof(u16); /* skip version */
 
 	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
@@ -58,7 +58,6 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	epoch = ceph_decode_32(&p);
 
 	num_mon = ceph_decode_32(&p);
-	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
 
 	if (num_mon > CEPH_MAX_MON)
 		goto bad;
@@ -68,17 +67,22 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	m->fsid = fsid;
 	m->epoch = epoch;
 	m->num_mon = num_mon;
-	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
-	for (i = 0; i < num_mon; i++)
-		ceph_decode_addr(&m->mon_inst[i].addr);
-
+	for (i = 0; i < num_mon; ++i) {
+		struct ceph_entity_inst *inst = &m->mon_inst[i];
+
+		/* copy name portion */
+		ceph_decode_copy_safe(&p, end, &inst->name,
+				      sizeof(inst->name), bad);
+		err = ceph_decode_entity_addr(&p, end, &inst->addr);
+		if (err)
+			goto bad;
+	}
 	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
 	     m->num_mon);
 	for (i = 0; i < m->num_mon; i++)
 		dout("monmap_decode mon%d is %s\n", i,
 		     ceph_pr_addr(&m->mon_inst[i].addr));
 	return m;
-
 bad:
 	dout("monmap_decode failed with %d\n", err);
 	kfree(m);
@@ -469,6 +473,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 	if (IS_ERR(monmap)) {
 		pr_err("problem decoding monmap, %d\n",
 		       (int)PTR_ERR(monmap));
+		ceph_msg_dump(msg);
 		goto out;
 	}
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 9a8eca5eda65..0b2df09b2554 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -171,14 +171,6 @@ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
 	osd_data->num_bvecs = num_bvecs;
 }
 
-#define osd_req_op_data(oreq, whch, typ, fld)				\
-({									\
-	struct ceph_osd_request *__oreq = (oreq);			\
-	unsigned int __whch = (whch);					\
-	BUG_ON(__whch >= __oreq->r_num_ops);				\
-	&__oreq->r_ops[__whch].typ.fld;					\
-})
-
 static struct ceph_osd_data *
 osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
 {
@@ -478,7 +470,7 @@ static void request_release_checks(struct ceph_osd_request *req)
 {
 	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
 	WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
-	WARN_ON(!list_empty(&req->r_unsafe_item));
+	WARN_ON(!list_empty(&req->r_private_item));
 	WARN_ON(req->r_osd);
 }
 
@@ -538,7 +530,7 @@ static void request_init(struct ceph_osd_request *req)
 	init_completion(&req->r_completion);
 	RB_CLEAR_NODE(&req->r_node);
 	RB_CLEAR_NODE(&req->r_mc_node);
-	INIT_LIST_HEAD(&req->r_unsafe_item);
+	INIT_LIST_HEAD(&req->r_private_item);
 
 	target_init(&req->r_t);
 }
@@ -4914,20 +4906,26 @@ static int decode_watcher(void **p, void *end, struct ceph_watch_item *item)
 	ret = ceph_start_decoding(p, end, 2, "watch_item_t",
 				  &struct_v, &struct_len);
 	if (ret)
-		return ret;
+		goto bad;
+
+	ret = -EINVAL;
+	ceph_decode_copy_safe(p, end, &item->name, sizeof(item->name), bad);
+	ceph_decode_64_safe(p, end, item->cookie, bad);
+	ceph_decode_skip_32(p, end, bad); /* skip timeout seconds */
 
-	ceph_decode_copy(p, &item->name, sizeof(item->name));
-	item->cookie = ceph_decode_64(p);
-	*p += 4; /* skip timeout_seconds */
 	if (struct_v >= 2) {
-		ceph_decode_copy(p, &item->addr, sizeof(item->addr));
-		ceph_decode_addr(&item->addr);
+		ret = ceph_decode_entity_addr(p, end, &item->addr);
+		if (ret)
+			goto bad;
+	} else {
+		ret = 0;
 	}
 
 	dout("%s %s%llu cookie %llu addr %s\n", __func__,
 	     ENTITY_NAME(item->name), item->cookie,
 	     ceph_pr_addr(&item->addr));
-	return 0;
+bad:
+	return ret;
 }
 
 static int decode_watchers(void **p, void *end,
@@ -5044,12 +5042,12 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 		   const char *class, const char *method,
 		   unsigned int flags,
 		   struct page *req_page, size_t req_len,
-		   struct page *resp_page, size_t *resp_len)
+		   struct page **resp_pages, size_t *resp_len)
 {
 	struct ceph_osd_request *req;
 	int ret;
 
-	if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
+	if (req_len > PAGE_SIZE)
 		return -E2BIG;
 
 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
@@ -5067,8 +5065,8 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 	if (req_page)
 		osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
 						  0, false, false);
-	if (resp_page)
-		osd_req_op_cls_response_data_pages(req, 0, &resp_page,
+	if (resp_pages)
+		osd_req_op_cls_response_data_pages(req, 0, resp_pages,
 						   *resp_len, 0, false, false);
 
 	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
@@ -5079,7 +5077,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 	ret = ceph_osdc_wait_request(osdc, req);
 	if (ret >= 0) {
 		ret = req->r_ops[0].rval;
-		if (resp_page)
+		if (resp_pages)
 			*resp_len = req->r_ops[0].outdata_len;
 	}
 
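
With the response buffer now a page vector, a class-method reply is no longer capped at a single page, which is what allows rbd to fetch a whole object map in one call. A hedged usage sketch (the "rbd"/"object_map_load" class method is an assumed example):

	size_t reply_len = num_pages * PAGE_SIZE;
	struct page **pages;

	pages = ceph_alloc_page_vector(num_pages, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* read a multi-page reply in one class-method call */
	ret = ceph_osdc_call(osdc, oid, oloc, "rbd", "object_map_load",
			     CEPH_OSD_FLAG_READ, NULL, 0,
			     pages, &reply_len);
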
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 48a31dc9161c..90437906b7bc 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1489,11 +1489,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 
 	/* osd_state, osd_weight, osd_addrs->client_addr */
 	ceph_decode_need(p, end, 3*sizeof(u32) +
-			 map->max_osd*((struct_v >= 5 ? sizeof(u32) :
+			 map->max_osd*(struct_v >= 5 ? sizeof(u32) :
 				       sizeof(u8)) +
-				       sizeof(*map->osd_weight) +
-				       sizeof(*map->osd_addr)), e_inval);
-
+				       sizeof(*map->osd_weight), e_inval);
 	if (ceph_decode_32(p) != map->max_osd)
 		goto e_inval;
 
@@ -1514,9 +1512,11 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 	if (ceph_decode_32(p) != map->max_osd)
 		goto e_inval;
 
-	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
-	for (i = 0; i < map->max_osd; i++)
-		ceph_decode_addr(&map->osd_addr[i]);
+	for (i = 0; i < map->max_osd; i++) {
+		err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]);
+		if (err)
+			goto bad;
+	}
 
 	/* pg_temp */
 	err = decode_pg_temp(p, end, map);
@@ -1618,12 +1618,17 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
 	void *new_state;
 	void *new_weight_end;
 	u32 len;
+	int i;
 
 	new_up_client = *p;
 	ceph_decode_32_safe(p, end, len, e_inval);
-	len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
-	ceph_decode_need(p, end, len, e_inval);
-	*p += len;
+	for (i = 0; i < len; ++i) {
+		struct ceph_entity_addr addr;
+
+		ceph_decode_skip_32(p, end, e_inval);
+		if (ceph_decode_entity_addr(p, end, &addr))
+			goto e_inval;
+	}
 
 	new_state = *p;
 	ceph_decode_32_safe(p, end, len, e_inval);
@@ -1699,9 +1704,9 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
 			struct ceph_entity_addr addr;
 
 			osd = ceph_decode_32(p);
-			ceph_decode_copy(p, &addr, sizeof(addr));
-			ceph_decode_addr(&addr);
 			BUG_ON(osd >= map->max_osd);
+			if (ceph_decode_entity_addr(p, end, &addr))
+				goto e_inval;
 			pr_info("osd%d up\n", osd);
 			map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
 			map->osd_addr[osd] = addr;
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 74cafc0142ea..64305e7056a1 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -10,39 +10,6 @@
 
 #include <linux/ceph/libceph.h>
 
-/*
- * build a vector of user pages
- */
-struct page **ceph_get_direct_page_vector(const void __user *data,
-					  int num_pages, bool write_page)
-{
-	struct page **pages;
-	int got = 0;
-	int rc = 0;
-
-	pages = kmalloc_array(num_pages, sizeof(*pages), GFP_NOFS);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	while (got < num_pages) {
-		rc = get_user_pages_fast(
-		    (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
-		    num_pages - got, write_page ? FOLL_WRITE : 0, pages + got);
-		if (rc < 0)
-			break;
-		BUG_ON(rc == 0);
-		got += rc;
-	}
-	if (rc < 0)
-		goto fail;
-	return pages;
-
-fail:
-	ceph_put_page_vector(pages, got, false);
-	return ERR_PTR(rc);
-}
-EXPORT_SYMBOL(ceph_get_direct_page_vector);
-
 void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
 {
 	int i;
diff --git a/net/ceph/striper.c b/net/ceph/striper.c
index c36462dc86b7..3b3fa75d1189 100644
--- a/net/ceph/striper.c
+++ b/net/ceph/striper.c
@@ -259,3 +259,20 @@ int ceph_extent_to_file(struct ceph_file_layout *l,
 	return 0;
 }
 EXPORT_SYMBOL(ceph_extent_to_file);
+
+u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size)
+{
+	u64 period = (u64)l->stripe_count * l->object_size;
+	u64 num_periods = DIV64_U64_ROUND_UP(size, period);
+	u64 remainder_bytes;
+	u64 remainder_objs = 0;
+
+	div64_u64_rem(size, period, &remainder_bytes);
+	if (remainder_bytes > 0 &&
+	    remainder_bytes < (u64)l->stripe_count * l->stripe_unit)
+		remainder_objs = l->stripe_count -
+			    DIV_ROUND_UP_ULL(remainder_bytes, l->stripe_unit);
+
+	return num_periods * l->stripe_count - remainder_objs;
+}
+EXPORT_SYMBOL(ceph_get_num_objects);
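
ceph_get_num_objects() rounds the file size up to whole stripe periods, then subtracts the objects of the last period that a short tail never reaches. A worked example with assumed layout values: stripe_unit = 1M, stripe_count = 4 and object_size = 4M give a 16M period; for size = 18M, num_periods = 2 and remainder_bytes = 2M, which is less than stripe_count * stripe_unit = 4M (one full stripe), so remainder_objs = 4 - ceil(2M / 1M) = 2 and the result is 2 * 4 - 2 = 6 objects:

	struct ceph_file_layout l = {
		.stripe_unit  = 1 << 20,	/* 1M */
		.stripe_count = 4,
		.object_size  = 4 << 20,	/* 4M */
	};

	/* four full objects from period 0 plus the two that hold the 2M tail */
	WARN_ON(ceph_get_num_objects(&l, 18 << 20) != 6);
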