about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Ilya Dryomov <ilya.dryomov@inktank.com> 2014-02-25 09:22:27 -0500
committer: Yan, Zheng <zheng.z.yan@intel.com> 2014-04-02 22:33:51 -0400
commit: c647b8a8c6366f849c2a237bfe525cb1d316d5f4 (patch)
tree: 8f3385992b44ba6bb5bcb1d745bb6df4dd6b5bbf
parent: 7b25bf5f02c5c80adf96120e031dc3a1756ce54d (diff)
libceph: add support for CEPH_OSD_OP_SETALLOCHINT osd op
This is primarily for rbd's benefit and is supposed to combat fragmentation:

"... knowing that rbd images have a 4m size, librbd can pass a hint that will let the osd do the xfs allocation size ioctl on new files so that they are allocated in 1m or 4m chunks. We've seen cases where users with rbd workloads have very high levels of fragmentation in xfs and this would mitigate that and probably have a pretty nice performance benefit."

SETALLOCHINT is considered advisory, so our backwards compatibility mechanism here is to set FAILOK flag for all SETALLOCHINT ops.

Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@linaro.org>
-rw-r--r-- include/linux/ceph/osd_client.h 8
-rw-r--r-- include/linux/ceph/rados.h 7
-rw-r--r-- net/ceph/osd_client.c 27
3 files changed, 42 insertions, 0 deletions
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index e94f5da251d6..c42d1ada685f 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -103,6 +103,10 @@ struct ceph_osd_req_op {
103 u32 timeout; 103 u32 timeout;
104 __u8 flag; 104 __u8 flag;
105 } watch; 105 } watch;
106 struct {
107 u64 expected_object_size;
108 u64 expected_write_size;
109 } alloc_hint;
106 }; 110 };
107}; 111};
108 112
@@ -294,6 +298,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
294extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 298extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
295 unsigned int which, u16 opcode, 299 unsigned int which, u16 opcode,
296 u64 cookie, u64 version, int flag); 300 u64 cookie, u64 version, int flag);
301extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
302 unsigned int which,
303 u64 expected_object_size,
304 u64 expected_write_size);
297 305
298extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 306extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
299 struct ceph_snap_context *snapc, 307 struct ceph_snap_context *snapc,
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 8f9bf4570215..2caabef8d369 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -227,6 +227,9 @@ enum {
227 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24, 227 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
228 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25, 228 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
229 229
230 /* hints */
231 CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35,
232
230 /** multi **/ 233 /** multi **/
231 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1, 234 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
232 CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2, 235 CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
@@ -416,6 +419,10 @@ struct ceph_osd_op {
416 __le64 offset, length; 419 __le64 offset, length;
417 __le64 src_offset; 420 __le64 src_offset;
418 } __attribute__ ((packed)) clonerange; 421 } __attribute__ ((packed)) clonerange;
422 struct {
423 __le64 expected_object_size;
424 __le64 expected_write_size;
425 } __attribute__ ((packed)) alloc_hint;
419 }; 426 };
420 __le32 payload_len; 427 __le32 payload_len;
421} __attribute__ ((packed)); 428} __attribute__ ((packed));
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 5d7fd0b8c1c8..71830d79b0f4 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
436 case CEPH_OSD_OP_OMAPCLEAR: 436 case CEPH_OSD_OP_OMAPCLEAR:
437 case CEPH_OSD_OP_OMAPRMKEYS: 437 case CEPH_OSD_OP_OMAPRMKEYS:
438 case CEPH_OSD_OP_OMAP_CMP: 438 case CEPH_OSD_OP_OMAP_CMP:
439 case CEPH_OSD_OP_SETALLOCHINT:
439 case CEPH_OSD_OP_CLONERANGE: 440 case CEPH_OSD_OP_CLONERANGE:
440 case CEPH_OSD_OP_ASSERT_SRC_VERSION: 441 case CEPH_OSD_OP_ASSERT_SRC_VERSION:
441 case CEPH_OSD_OP_SRC_CMPXATTR: 442 case CEPH_OSD_OP_SRC_CMPXATTR:
@@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
591} 592}
592EXPORT_SYMBOL(osd_req_op_watch_init); 593EXPORT_SYMBOL(osd_req_op_watch_init);
593 594
595void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
596 unsigned int which,
597 u64 expected_object_size,
598 u64 expected_write_size)
599{
600 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
601 CEPH_OSD_OP_SETALLOCHINT);
602
603 op->alloc_hint.expected_object_size = expected_object_size;
604 op->alloc_hint.expected_write_size = expected_write_size;
605
606 /*
607 * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
608 * not worth a feature bit. Set FAILOK per-op flag to make
609 * sure older osds don't trip over an unsupported opcode.
610 */
611 op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
612}
613EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
614
594static void ceph_osdc_msg_data_add(struct ceph_msg *msg, 615static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
595 struct ceph_osd_data *osd_data) 616 struct ceph_osd_data *osd_data)
596{ 617{
@@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
681 dst->watch.ver = cpu_to_le64(src->watch.ver); 702 dst->watch.ver = cpu_to_le64(src->watch.ver);
682 dst->watch.flag = src->watch.flag; 703 dst->watch.flag = src->watch.flag;
683 break; 704 break;
705 case CEPH_OSD_OP_SETALLOCHINT:
706 dst->alloc_hint.expected_object_size =
707 cpu_to_le64(src->alloc_hint.expected_object_size);
708 dst->alloc_hint.expected_write_size =
709 cpu_to_le64(src->alloc_hint.expected_write_size);
710 break;
684 default: 711 default:
685 pr_err("unsupported osd opcode %s\n", 712 pr_err("unsupported osd opcode %s\n",
686 ceph_osd_op_name(src->op)); 713 ceph_osd_op_name(src->op));