libceph: add support for CEPH_OSD_OP_SETALLOCHINT osd op

This is primarily for rbd's benefit and is supposed to combat
fragmentation:

"... knowing that rbd images have a 4m size, librbd can pass a hint
that will let the osd do the xfs allocation size ioctl on new files so
that they are allocated in 1m or 4m chunks. We've seen cases where
users with rbd workloads have very high levels of fragmentation in xfs
and this would mitigate that and probably have a pretty nice
performance benefit."

SETALLOCHINT is considered advisory, so our backwards compatibility
mechanism here is to set FAILOK flag for all SETALLOCHINT ops.

Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@linaro.org>
author: Ilya Dryomov <ilya.dryomov@inktank.com> 2014-02-25 09:22:27 -0500
committer: Yan, Zheng <zheng.z.yan@intel.com> 2014-04-02 22:33:51 -0400
commit: c647b8a8c6366f849c2a237bfe525cb1d316d5f4 (patch)
tree: 8f3385992b44ba6bb5bcb1d745bb6df4dd6b5bbf /net/ceph/osd_client.c
parent: 7b25bf5f02c5c80adf96120e031dc3a1756ce54d (diff)
1 files changed, 27 insertions, 0 deletions
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 5d7fd0b8c1c8..71830d79b0f4 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
        case CEPH_OSD_OP_OMAPCLEAR:
        case CEPH_OSD_OP_OMAPRMKEYS:
        case CEPH_OSD_OP_OMAP_CMP:
+        case CEPH_OSD_OP_SETALLOCHINT:
        case CEPH_OSD_OP_CLONERANGE:
        case CEPH_OSD_OP_ASSERT_SRC_VERSION:
        case CEPH_OSD_OP_SRC_CMPXATTR:
@@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
 }
 EXPORT_SYMBOL(osd_req_op_watch_init);
+void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
+                                unsigned int which,
+                                u64 expected_object_size,
+                                u64 expected_write_size)
+{
+        struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+                                                      CEPH_OSD_OP_SETALLOCHINT);
+        op->alloc_hint.expected_object_size = expected_object_size;
+        op->alloc_hint.expected_write_size = expected_write_size;
+        /*
+         * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
+         * not worth a feature bit.  Set FAILOK per-op flag to make
+         * sure older osds don't trip over an unsupported opcode.
+         */
+        op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
+}
+EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
 static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
                                struct ceph_osd_data *osd_data)
 {
@@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
                dst->watch.ver = cpu_to_le64(src->watch.ver);
                dst->watch.flag = src->watch.flag;
                break;
+        case CEPH_OSD_OP_SETALLOCHINT:
+                dst->alloc_hint.expected_object_size =
+                    cpu_to_le64(src->alloc_hint.expected_object_size);
+                dst->alloc_hint.expected_write_size =
+                    cpu_to_le64(src->alloc_hint.expected_write_size);
+                break;
        default:
                pr_err("unsupported osd opcode %s\n",
                        ceph_osd_op_name(src->op));
author	Ilya Dryomov <ilya.dryomov@inktank.com>	2014-02-25 09:22:27 -0500
committer	Yan, Zheng <zheng.z.yan@intel.com>	2014-04-02 22:33:51 -0400
commit	c647b8a8c6366f849c2a237bfe525cb1d316d5f4 (patch)
tree	8f3385992b44ba6bb5bcb1d745bb6df4dd6b5bbf /net/ceph/osd_client.c
parent	7b25bf5f02c5c80adf96120e031dc3a1756ce54d (diff)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 5d7fd0b8c1c8..71830d79b0f4 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
436	case CEPH_OSD_OP_OMAPCLEAR:	436	case CEPH_OSD_OP_OMAPCLEAR:
437	case CEPH_OSD_OP_OMAPRMKEYS:	437	case CEPH_OSD_OP_OMAPRMKEYS:
438	case CEPH_OSD_OP_OMAP_CMP:	438	case CEPH_OSD_OP_OMAP_CMP:
		439	case CEPH_OSD_OP_SETALLOCHINT:
439	case CEPH_OSD_OP_CLONERANGE:	440	case CEPH_OSD_OP_CLONERANGE:
440	case CEPH_OSD_OP_ASSERT_SRC_VERSION:	441	case CEPH_OSD_OP_ASSERT_SRC_VERSION:
441	case CEPH_OSD_OP_SRC_CMPXATTR:	442	case CEPH_OSD_OP_SRC_CMPXATTR:
@@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
591	}	592	}
592	EXPORT_SYMBOL(osd_req_op_watch_init);	593	EXPORT_SYMBOL(osd_req_op_watch_init);
593		594
		595	void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
		596	unsigned int which,
		597	u64 expected_object_size,
		598	u64 expected_write_size)
		599	{
		600	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
		601	CEPH_OSD_OP_SETALLOCHINT);
		602
		603	op->alloc_hint.expected_object_size = expected_object_size;
		604	op->alloc_hint.expected_write_size = expected_write_size;
		605
		606	/*
		607	* CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
		608	* not worth a feature bit. Set FAILOK per-op flag to make
		609	* sure older osds don't trip over an unsupported opcode.
		610	*/
		611	op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
		612	}
		613	EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
		614
594	static void ceph_osdc_msg_data_add(struct ceph_msg *msg,	615	static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
595	struct ceph_osd_data *osd_data)	616	struct ceph_osd_data *osd_data)
596	{	617	{
@@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
681	dst->watch.ver = cpu_to_le64(src->watch.ver);	702	dst->watch.ver = cpu_to_le64(src->watch.ver);
682	dst->watch.flag = src->watch.flag;	703	dst->watch.flag = src->watch.flag;
683	break;	704	break;
		705	case CEPH_OSD_OP_SETALLOCHINT:
		706	dst->alloc_hint.expected_object_size =
		707	cpu_to_le64(src->alloc_hint.expected_object_size);
		708	dst->alloc_hint.expected_write_size =
		709	cpu_to_le64(src->alloc_hint.expected_write_size);
		710	break;
684	default:	711	default:
685	pr_err("unsupported osd opcode %s\n",	712	pr_err("unsupported osd opcode %s\n",
686	ceph_osd_op_name(src->op));	713	ceph_osd_op_name(src->op));