aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-04-07 14:09:13 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-04-07 14:09:13 -0400
commit240cd6a817bd855e3f1e615ed9ae16407f8cfce6 (patch)
treeda7d6267d549cd0fbdff3f30032720b416d1ff3d /include/linux
parent3021112598d2b722eee54d8a662fea2089abbdbc (diff)
parenta30be7cb2ccb995ad5e67fd4b548f11fe37fc8b1 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil: "The biggest chunk is a series of patches from Ilya that add support for new Ceph osd and crush map features, including some new tunables, primary affinity, and the new encoding that is needed for erasure coding support. This brings things into parity with the server side and the looming firefly release. There is also support for allocation hints in RBD that help limit fragmentation on the server side. There is also a series of patches from Zheng fixing NFS reexport, directory fragmentation support, flock vs fcntl behavior, and some issues with clustered MDS. Finally, there are some miscellaneous fixes from Yunchuan Wen for fscache, Fabian Frederick for ACLs, and from me for fsync(dirfd) behavior" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (79 commits) ceph: skip invalid dentry during dcache readdir libceph: dump pool {read,write}_tier to debugfs libceph: output primary affinity values on osdmap updates ceph: flush cap release queue when trimming session caps ceph: don't grab open file reference for aborted request ceph: drop extra open file reference in ceph_atomic_open() ceph: preallocate buffer for readdir reply libceph: enable PRIMARY_AFFINITY feature bit libceph: redo ceph_calc_pg_primary() in terms of ceph_calc_pg_acting() libceph: add support for osd primary affinity libceph: add support for primary_temp mappings libceph: return primary from ceph_calc_pg_acting() libceph: switch ceph_calc_pg_acting() to new helpers libceph: introduce apply_temps() helper libceph: introduce pg_to_raw_osds() and raw_to_up_osds() helpers libceph: ceph_can_shift_osds(pool) and pool type defines libceph: ceph_osd_{exists,is_up,is_down}(osd) definitions libceph: enable OSDMAP_ENC feature bit libceph: primary_affinity decode bits libceph: primary_affinity infrastructure ...
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/ceph/ceph_features.h12
-rw-r--r--include/linux/ceph/ceph_fs.h5
-rw-r--r--include/linux/ceph/osd_client.h11
-rw-r--r--include/linux/ceph/osdmap.h50
-rw-r--r--include/linux/ceph/rados.h18
-rw-r--r--include/linux/crush/crush.h7
6 files changed, 91 insertions(+), 12 deletions(-)
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 138448f766b4..d12659ce550d 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -43,6 +43,13 @@
43#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ 43#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */
44#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) 44#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37)
45#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) 45#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38)
46#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */
47/* The process supports new-style OSDMap encoding. Monitors also use
48 this bit to determine if peers support NAK messages. */
49#define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39)
50#define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40)
51#define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41)
52#define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */
46 53
47/* 54/*
48 * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature 55 * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
@@ -82,7 +89,10 @@ static inline u64 ceph_sanitize_features(u64 features)
82 CEPH_FEATURE_OSDHASHPSPOOL | \ 89 CEPH_FEATURE_OSDHASHPSPOOL | \
83 CEPH_FEATURE_OSD_CACHEPOOL | \ 90 CEPH_FEATURE_OSD_CACHEPOOL | \
84 CEPH_FEATURE_CRUSH_V2 | \ 91 CEPH_FEATURE_CRUSH_V2 | \
85 CEPH_FEATURE_EXPORT_PEER) 92 CEPH_FEATURE_EXPORT_PEER | \
93 CEPH_FEATURE_OSDMAP_ENC | \
94 CEPH_FEATURE_CRUSH_TUNABLES3 | \
95 CEPH_FEATURE_OSD_PRIMARY_AFFINITY)
86 96
87#define CEPH_FEATURES_REQUIRED_DEFAULT \ 97#define CEPH_FEATURES_REQUIRED_DEFAULT \
88 (CEPH_FEATURE_NOSRCADDR | \ 98 (CEPH_FEATURE_NOSRCADDR | \
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 25bfb0eff772..5f6db18d72e8 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -332,6 +332,7 @@ enum {
332 CEPH_MDS_OP_LOOKUPHASH = 0x00102, 332 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
333 CEPH_MDS_OP_LOOKUPPARENT = 0x00103, 333 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
334 CEPH_MDS_OP_LOOKUPINO = 0x00104, 334 CEPH_MDS_OP_LOOKUPINO = 0x00104,
335 CEPH_MDS_OP_LOOKUPNAME = 0x00105,
335 336
336 CEPH_MDS_OP_SETXATTR = 0x01105, 337 CEPH_MDS_OP_SETXATTR = 0x01105,
337 CEPH_MDS_OP_RMXATTR = 0x01106, 338 CEPH_MDS_OP_RMXATTR = 0x01106,
@@ -420,8 +421,8 @@ union ceph_mds_request_args {
420 struct { 421 struct {
421 __u8 rule; /* currently fcntl or flock */ 422 __u8 rule; /* currently fcntl or flock */
422 __u8 type; /* shared, exclusive, remove*/ 423 __u8 type; /* shared, exclusive, remove*/
424 __le64 owner; /* owner of the lock */
423 __le64 pid; /* process id requesting the lock */ 425 __le64 pid; /* process id requesting the lock */
424 __le64 pid_namespace;
425 __le64 start; /* initial location to lock */ 426 __le64 start; /* initial location to lock */
426 __le64 length; /* num bytes to lock from start */ 427 __le64 length; /* num bytes to lock from start */
427 __u8 wait; /* will caller wait for lock to become available? */ 428 __u8 wait; /* will caller wait for lock to become available? */
@@ -532,8 +533,8 @@ struct ceph_filelock {
532 __le64 start;/* file offset to start lock at */ 533 __le64 start;/* file offset to start lock at */
533 __le64 length; /* num bytes to lock; 0 for all following start */ 534 __le64 length; /* num bytes to lock; 0 for all following start */
534 __le64 client; /* which client holds the lock */ 535 __le64 client; /* which client holds the lock */
536 __le64 owner; /* owner the lock */
535 __le64 pid; /* process id holding the lock on the client */ 537 __le64 pid; /* process id holding the lock on the client */
536 __le64 pid_namespace;
537 __u8 type; /* shared lock, exclusive lock, or unlock */ 538 __u8 type; /* shared lock, exclusive lock, or unlock */
538} __attribute__ ((packed)); 539} __attribute__ ((packed));
539 540
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index fd47e872ebcc..94ec69672164 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -43,7 +43,7 @@ struct ceph_osd {
43}; 43};
44 44
45 45
46#define CEPH_OSD_MAX_OP 2 46#define CEPH_OSD_MAX_OP 3
47 47
48enum ceph_osd_data_type { 48enum ceph_osd_data_type {
49 CEPH_OSD_DATA_TYPE_NONE = 0, 49 CEPH_OSD_DATA_TYPE_NONE = 0,
@@ -76,6 +76,7 @@ struct ceph_osd_data {
76 76
77struct ceph_osd_req_op { 77struct ceph_osd_req_op {
78 u16 op; /* CEPH_OSD_OP_* */ 78 u16 op; /* CEPH_OSD_OP_* */
79 u32 flags; /* CEPH_OSD_OP_FLAG_* */
79 u32 payload_len; 80 u32 payload_len;
80 union { 81 union {
81 struct ceph_osd_data raw_data_in; 82 struct ceph_osd_data raw_data_in;
@@ -102,6 +103,10 @@ struct ceph_osd_req_op {
102 u32 timeout; 103 u32 timeout;
103 __u8 flag; 104 __u8 flag;
104 } watch; 105 } watch;
106 struct {
107 u64 expected_object_size;
108 u64 expected_write_size;
109 } alloc_hint;
105 }; 110 };
106}; 111};
107 112
@@ -293,6 +298,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
293extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 298extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
294 unsigned int which, u16 opcode, 299 unsigned int which, u16 opcode,
295 u64 cookie, u64 version, int flag); 300 u64 cookie, u64 version, int flag);
301extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
302 unsigned int which,
303 u64 expected_object_size,
304 u64 expected_write_size);
296 305
297extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 306extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
298 struct ceph_snap_context *snapc, 307 struct ceph_snap_context *snapc,
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 49ff69f0746b..561ea896c657 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -41,6 +41,18 @@ struct ceph_pg_pool_info {
41 char *name; 41 char *name;
42}; 42};
43 43
44static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
45{
46 switch (pool->type) {
47 case CEPH_POOL_TYPE_REP:
48 return true;
49 case CEPH_POOL_TYPE_EC:
50 return false;
51 default:
52 BUG_ON(1);
53 }
54}
55
44struct ceph_object_locator { 56struct ceph_object_locator {
45 s64 pool; 57 s64 pool;
46}; 58};
@@ -60,8 +72,16 @@ struct ceph_object_id {
60struct ceph_pg_mapping { 72struct ceph_pg_mapping {
61 struct rb_node node; 73 struct rb_node node;
62 struct ceph_pg pgid; 74 struct ceph_pg pgid;
63 int len; 75
64 int osds[]; 76 union {
77 struct {
78 int len;
79 int osds[];
80 } pg_temp;
81 struct {
82 int osd;
83 } primary_temp;
84 };
65}; 85};
66 86
67struct ceph_osdmap { 87struct ceph_osdmap {
@@ -78,12 +98,19 @@ struct ceph_osdmap {
78 struct ceph_entity_addr *osd_addr; 98 struct ceph_entity_addr *osd_addr;
79 99
80 struct rb_root pg_temp; 100 struct rb_root pg_temp;
101 struct rb_root primary_temp;
102
103 u32 *osd_primary_affinity;
104
81 struct rb_root pg_pools; 105 struct rb_root pg_pools;
82 u32 pool_max; 106 u32 pool_max;
83 107
84 /* the CRUSH map specifies the mapping of placement groups to 108 /* the CRUSH map specifies the mapping of placement groups to
85 * the list of osds that store+replicate them. */ 109 * the list of osds that store+replicate them. */
86 struct crush_map *crush; 110 struct crush_map *crush;
111
112 struct mutex crush_scratch_mutex;
113 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
87}; 114};
88 115
89static inline void ceph_oid_set_name(struct ceph_object_id *oid, 116static inline void ceph_oid_set_name(struct ceph_object_id *oid,
@@ -110,9 +137,21 @@ static inline void ceph_oid_copy(struct ceph_object_id *dest,
110 dest->name_len = src->name_len; 137 dest->name_len = src->name_len;
111} 138}
112 139
140static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
141{
142 return osd >= 0 && osd < map->max_osd &&
143 (map->osd_state[osd] & CEPH_OSD_EXISTS);
144}
145
113static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) 146static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
114{ 147{
115 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); 148 return ceph_osd_exists(map, osd) &&
149 (map->osd_state[osd] & CEPH_OSD_UP);
150}
151
152static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd)
153{
154 return !ceph_osd_is_up(map, osd);
116} 155}
117 156
118static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) 157static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
@@ -121,6 +160,7 @@ static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
121} 160}
122 161
123extern char *ceph_osdmap_state_str(char *str, int len, int state); 162extern char *ceph_osdmap_state_str(char *str, int len, int state);
163extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
124 164
125static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, 165static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
126 int osd) 166 int osd)
@@ -153,7 +193,7 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
153 return 0; 193 return 0;
154} 194}
155 195
156extern struct ceph_osdmap *osdmap_decode(void **p, void *end); 196extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
157extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 197extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
158 struct ceph_osdmap *map, 198 struct ceph_osdmap *map,
159 struct ceph_messenger *msgr); 199 struct ceph_messenger *msgr);
@@ -172,7 +212,7 @@ extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
172 212
173extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, 213extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
174 struct ceph_pg pgid, 214 struct ceph_pg pgid,
175 int *acting); 215 int *osds, int *primary);
176extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 216extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
177 struct ceph_pg pgid); 217 struct ceph_pg pgid);
178 218
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 96292df4041b..f20e0d8a2155 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -81,8 +81,9 @@ struct ceph_pg_v1 {
81 */ 81 */
82#define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */ 82#define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */
83 83
84#define CEPH_PG_TYPE_REP 1 84#define CEPH_POOL_TYPE_REP 1
85#define CEPH_PG_TYPE_RAID4 2 85#define CEPH_POOL_TYPE_RAID4 2 /* never implemented */
86#define CEPH_POOL_TYPE_EC 3
86 87
87/* 88/*
88 * stable_mod func is used to control number of placement groups. 89 * stable_mod func is used to control number of placement groups.
@@ -133,6 +134,10 @@ extern const char *ceph_osd_state_name(int s);
133#define CEPH_OSD_IN 0x10000 134#define CEPH_OSD_IN 0x10000
134#define CEPH_OSD_OUT 0 135#define CEPH_OSD_OUT 0
135 136
137/* osd primary-affinity. fixed point value: 0x10000 == baseline */
138#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000
139#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000
140
136 141
137/* 142/*
138 * osd map flag bits 143 * osd map flag bits
@@ -227,6 +232,9 @@ enum {
227 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24, 232 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
228 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25, 233 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
229 234
235 /* hints */
236 CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35,
237
230 /** multi **/ 238 /** multi **/
231 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1, 239 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
232 CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2, 240 CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
@@ -382,7 +390,7 @@ enum {
382 */ 390 */
383struct ceph_osd_op { 391struct ceph_osd_op {
384 __le16 op; /* CEPH_OSD_OP_* */ 392 __le16 op; /* CEPH_OSD_OP_* */
385 __le32 flags; /* CEPH_OSD_FLAG_* */ 393 __le32 flags; /* CEPH_OSD_OP_FLAG_* */
386 union { 394 union {
387 struct { 395 struct {
388 __le64 offset, length; 396 __le64 offset, length;
@@ -416,6 +424,10 @@ struct ceph_osd_op {
416 __le64 offset, length; 424 __le64 offset, length;
417 __le64 src_offset; 425 __le64 src_offset;
418 } __attribute__ ((packed)) clonerange; 426 } __attribute__ ((packed)) clonerange;
427 struct {
428 __le64 expected_object_size;
429 __le64 expected_write_size;
430 } __attribute__ ((packed)) alloc_hint;
419 }; 431 };
420 __le32 payload_len; 432 __le32 payload_len;
421} __attribute__ ((packed)); 433} __attribute__ ((packed));
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index acaa5615d634..4fad5f8ee01d 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -51,6 +51,7 @@ enum {
51 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ 51 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
52 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, 52 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
53 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, 53 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
54 CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12
54}; 55};
55 56
56/* 57/*
@@ -173,6 +174,12 @@ struct crush_map {
173 * apply to a collision: in that case we will retry as we used 174 * apply to a collision: in that case we will retry as we used
174 * to. */ 175 * to. */
175 __u32 chooseleaf_descend_once; 176 __u32 chooseleaf_descend_once;
177
178 /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
179 * bits. a value of 1 is best for new clusters. for legacy clusters
180 * that want to limit reshuffling, a value of 3 or 4 will make the
181 * mappings line up a bit better with previous mappings. */
182 __u8 chooseleaf_vary_r;
176}; 183};
177 184
178 185