aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-04-07 14:09:13 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-04-07 14:09:13 -0400
commit240cd6a817bd855e3f1e615ed9ae16407f8cfce6 (patch)
treeda7d6267d549cd0fbdff3f30032720b416d1ff3d /include/linux
parent3021112598d2b722eee54d8a662fea2089abbdbc (diff)
parenta30be7cb2ccb995ad5e67fd4b548f11fe37fc8b1 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil: "The biggest chunk is a series of patches from Ilya that add support for new Ceph osd and crush map features, including some new tunables, primary affinity, and the new encoding that is needed for erasure coding support. This brings things into parity with the server side and the looming firefly release. There is also support for allocation hints in RBD that help limit fragmentation on the server side. There is also a series of patches from Zheng fixing NFS reexport, directory fragmentation support, flock vs fcntl behavior, and some issues with clustered MDS. Finally, there are some miscellaneous fixes from Yunchuan Wen for fscache, Fabian Frederick for ACLs, and from me for fsync(dirfd) behavior" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (79 commits) ceph: skip invalid dentry during dcache readdir libceph: dump pool {read,write}_tier to debugfs libceph: output primary affinity values on osdmap updates ceph: flush cap release queue when trimming session caps ceph: don't grab open file reference for aborted request ceph: drop extra open file reference in ceph_atomic_open() ceph: preallocate buffer for readdir reply libceph: enable PRIMARY_AFFINITY feature bit libceph: redo ceph_calc_pg_primary() in terms of ceph_calc_pg_acting() libceph: add support for osd primary affinity libceph: add support for primary_temp mappings libceph: return primary from ceph_calc_pg_acting() libceph: switch ceph_calc_pg_acting() to new helpers libceph: introduce apply_temps() helper libceph: introduce pg_to_raw_osds() and raw_to_up_osds() helpers libceph: ceph_can_shift_osds(pool) and pool type defines libceph: ceph_osd_{exists,is_up,is_down}(osd) definitions libceph: enable OSDMAP_ENC feature bit libceph: primary_affinity decode bits libceph: primary_affinity infrastructure ...
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/ceph/ceph_features.h12
-rw-r--r--include/linux/ceph/ceph_fs.h5
-rw-r--r--include/linux/ceph/osd_client.h11
-rw-r--r--include/linux/ceph/osdmap.h50
-rw-r--r--include/linux/ceph/rados.h18
-rw-r--r--include/linux/crush/crush.h7
6 files changed, 91 insertions(+), 12 deletions(-)
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 138448f766b4..d12659ce550d 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -43,6 +43,13 @@
43#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ 43#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */
44#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) 44#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37)
45#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) 45#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38)
46#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */
47/* The process supports new-style OSDMap encoding. Monitors also use
48 this bit to determine if peers support NAK messages. */
49#define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39)
50#define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40)
51#define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41)
52#define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */
46 53
47/* 54/*
48 * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature 55 * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
@@ -82,7 +89,10 @@ static inline u64 ceph_sanitize_features(u64 features)
82 CEPH_FEATURE_OSDHASHPSPOOL | \ 89 CEPH_FEATURE_OSDHASHPSPOOL | \
83 CEPH_FEATURE_OSD_CACHEPOOL | \ 90 CEPH_FEATURE_OSD_CACHEPOOL | \
84 CEPH_FEATURE_CRUSH_V2 | \ 91 CEPH_FEATURE_CRUSH_V2 | \
85 CEPH_FEATURE_EXPORT_PEER) 92 CEPH_FEATURE_EXPORT_PEER | \
93 CEPH_FEATURE_OSDMAP_ENC | \
94 CEPH_FEATURE_CRUSH_TUNABLES3 | \
95 CEPH_FEATURE_OSD_PRIMARY_AFFINITY)
86 96
87#define CEPH_FEATURES_REQUIRED_DEFAULT \ 97#define CEPH_FEATURES_REQUIRED_DEFAULT \
88 (CEPH_FEATURE_NOSRCADDR | \ 98 (CEPH_FEATURE_NOSRCADDR | \
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 25bfb0eff772..5f6db18d72e8 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -332,6 +332,7 @@ enum {
332 CEPH_MDS_OP_LOOKUPHASH = 0x00102, 332 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
333 CEPH_MDS_OP_LOOKUPPARENT = 0x00103, 333 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
334 CEPH_MDS_OP_LOOKUPINO = 0x00104, 334 CEPH_MDS_OP_LOOKUPINO = 0x00104,
335 CEPH_MDS_OP_LOOKUPNAME = 0x00105,
335 336
336 CEPH_MDS_OP_SETXATTR = 0x01105, 337 CEPH_MDS_OP_SETXATTR = 0x01105,
337 CEPH_MDS_OP_RMXATTR = 0x01106, 338 CEPH_MDS_OP_RMXATTR = 0x01106,
@@ -420,8 +421,8 @@ union ceph_mds_request_args {
420 struct { 421 struct {
421 __u8 rule; /* currently fcntl or flock */ 422 __u8 rule; /* currently fcntl or flock */
422 __u8 type; /* shared, exclusive, remove*/ 423 __u8 type; /* shared, exclusive, remove*/
424 __le64 owner; /* owner of the lock */
423 __le64 pid; /* process id requesting the lock */ 425 __le64 pid; /* process id requesting the lock */
424 __le64 pid_namespace;
425 __le64 start; /* initial location to lock */ 426 __le64 start; /* initial location to lock */
426 __le64 length; /* num bytes to lock from start */ 427 __le64 length; /* num bytes to lock from start */
427 __u8 wait; /* will caller wait for lock to become available? */ 428 __u8 wait; /* will caller wait for lock to become available? */
@@ -532,8 +533,8 @@ struct ceph_filelock {
532 __le64 start;/* file offset to start lock at */ 533 __le64 start;/* file offset to start lock at */
533 __le64 length; /* num bytes to lock; 0 for all following start */ 534 __le64 length; /* num bytes to lock; 0 for all following start */
534 __le64 client; /* which client holds the lock */ 535 __le64 client; /* which client holds the lock */
536 __le64 owner; /* owner the lock */
535 __le64 pid; /* process id holding the lock on the client */ 537 __le64 pid; /* process id holding the lock on the client */
536 __le64 pid_namespace;
537 __u8 type; /* shared lock, exclusive lock, or unlock */ 538 __u8 type; /* shared lock, exclusive lock, or unlock */
538} __attribute__ ((packed)); 539} __attribute__ ((packed));
539 540
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index fd47e872ebcc..94ec69672164 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -43,7 +43,7 @@ struct ceph_osd {
43}; 43};
44 44
45 45
46#define CEPH_OSD_MAX_OP 2 46#define CEPH_OSD_MAX_OP 3
47 47
48enum ceph_osd_data_type { 48enum ceph_osd_data_type {
49 CEPH_OSD_DATA_TYPE_NONE = 0, 49 CEPH_OSD_DATA_TYPE_NONE = 0,
@@ -76,6 +76,7 @@ struct ceph_osd_data {
76 76
77struct ceph_osd_req_op { 77struct ceph_osd_req_op {
78 u16 op; /* CEPH_OSD_OP_* */ 78 u16 op; /* CEPH_OSD_OP_* */
79 u32 flags; /* CEPH_OSD_OP_FLAG_* */
79 u32 payload_len; 80 u32 payload_len;
80 union { 81 union {
81 struct ceph_osd_data raw_data_in; 82 struct ceph_osd_data raw_data_in;
@@ -102,6 +103,10 @@ struct ceph_osd_req_op {
102 u32 timeout; 103 u32 timeout;
103 __u8 flag; 104 __u8 flag;
104 } watch; 105 } watch;
106 struct {
107 u64 expected_object_size;
108 u64 expected_write_size;
109 } alloc_hint;
105 }; 110 };
106}; 111};
107 112
@@ -293,6 +298,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
293extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 298extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
294 unsigned int which, u16 opcode, 299 unsigned int which, u16 opcode,
295 u64 cookie, u64 version, int flag); 300 u64 cookie, u64 version, int flag);
301extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
302 unsigned int which,
303 u64 expected_object_size,
304 u64 expected_write_size);
296 305
297extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 306extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
298 struct ceph_snap_context *snapc, 307 struct ceph_snap_context *snapc,
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 49ff69f0746b..561ea896c657 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -41,6 +41,18 @@ struct ceph_pg_pool_info {
41 char *name; 41 char *name;
42}; 42};
43 43
44static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
45{
46 switch (pool->type) {
47 case CEPH_POOL_TYPE_REP:
48 return true;
49 case CEPH_POOL_TYPE_EC:
50 return false;
51 default:
52 BUG_ON(1);
53 }
54}
55
44struct ceph_object_locator { 56struct ceph_object_locator {
45 s64 pool; 57 s64 pool;
46}; 58};
@@ -60,8 +72,16 @@ struct ceph_object_id {
60struct ceph_pg_mapping { 72struct ceph_pg_mapping {
61 struct rb_node node; 73 struct rb_node node;
62 struct ceph_pg pgid; 74 struct ceph_pg pgid;
63 int len; 75
64 int osds[]; 76 union {
77 struct {
78 int len;
79 int osds[];
80 } pg_temp;
81 struct {
82 int osd;
83 } primary_temp;
84 };
65}; 85};
66 86
67struct ceph_osdmap { 87struct ceph_osdmap {
@@ -78,12 +98,19 @@ struct ceph_osdmap {
78 struct ceph_entity_addr *osd_addr; 98 struct ceph_entity_addr *osd_addr;
79 99
80 struct rb_root pg_temp; 100 struct rb_root pg_temp;
101 struct rb_root primary_temp;
102
103 u32 *osd_primary_affinity;
104
81 struct rb_root pg_pools; 105 struct rb_root pg_pools;
82 u32 pool_max; 106 u32 pool_max;
83 107
84 /* the CRUSH map specifies the mapping of placement groups to 108 /* the CRUSH map specifies the mapping of placement groups to
85 * the list of osds that store+replicate them. */ 109 * the list of osds that store+replicate them. */
86 struct crush_map *crush; 110 struct crush_map *crush;
111
112 struct mutex crush_scratch_mutex;
113 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
87}; 114};
88 115
89static inline void ceph_oid_set_name(struct ceph_object_id *oid, 116static inline void ceph_oid_set_name(struct ceph_object_id *oid,
@@ -110,9 +137,21 @@ static inline void ceph_oid_copy(struct ceph_object_id *dest,
110 dest->name_len = src->name_len; 137 dest->name_len = src->name_len;
111} 138}
112 139
140static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
141{
142 return osd >= 0 && osd < map->max_osd &&
143 (map->osd_state[osd] & CEPH_OSD_EXISTS);
144}
145
113static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) 146static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
114{ 147{
115 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); 148 return ceph_osd_exists(map, osd) &&
149 (map->osd_state[osd] & CEPH_OSD_UP);
150}
151
152static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd)
153{
154 return !ceph_osd_is_up(map, osd);
116} 155}
117 156
118static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) 157static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
@@ -121,6 +160,7 @@ static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
121} 160}
122 161
123extern char *ceph_osdmap_state_str(char *str, int len, int state); 162extern char *ceph_osdmap_state_str(char *str, int len, int state);
163extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
124 164
125static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, 165static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
126 int osd) 166 int osd)
@@ -153,7 +193,7 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
153 return 0; 193 return 0;
154} 194}
155 195
156extern struct ceph_osdmap *osdmap_decode(void **p, void *end); 196extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
157extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 197extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
158 struct ceph_osdmap *map, 198 struct ceph_osdmap *map,
159 struct ceph_messenger *msgr); 199 struct ceph_messenger *msgr);
@@ -172,7 +212,7 @@ extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
172 212
173extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, 213extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
174 struct ceph_pg pgid, 214 struct ceph_pg pgid,
175 int *acting); 215 int *osds, int *primary);
176extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 216extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
177 struct ceph_pg pgid); 217 struct ceph_pg pgid);
178 218
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 96292df4041b..f20e0d8a2155 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -81,8 +81,9 @@ struct ceph_pg_v1 {
81 */ 81 */
82#define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */ 82#define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */
83 83
84#define CEPH_PG_TYPE_REP 1 84#define CEPH_POOL_TYPE_REP 1
85#define CEPH_PG_TYPE_RAID4 2 85#define CEPH_POOL_TYPE_RAID4 2 /* never implemented */
86#define CEPH_POOL_TYPE_EC 3
86 87
87/* 88/*
88 * stable_mod func is used to control number of placement groups. 89 * stable_mod func is used to control number of placement groups.
@@ -133,6 +134,10 @@ extern const char *ceph_osd_state_name(int s);
133#define CEPH_OSD_IN 0x10000 134#define CEPH_OSD_IN 0x10000
134#define CEPH_OSD_OUT 0 135#define CEPH_OSD_OUT 0
135 136
137/* osd primary-affinity. fixed point value: 0x10000 == baseline */
138#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000
139#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000
140
136 141
137/* 142/*
138 * osd map flag bits 143 * osd map flag bits
@@ -227,6 +232,9 @@ enum {
227 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24, 232 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
228 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25, 233 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
229 234
235 /* hints */
236 CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35,
237
230 /** multi **/ 238 /** multi **/
231 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1, 239 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
232 CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2, 240 CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
@@ -382,7 +390,7 @@ enum {
382 */ 390 */
383struct ceph_osd_op { 391struct ceph_osd_op {
384 __le16 op; /* CEPH_OSD_OP_* */ 392 __le16 op; /* CEPH_OSD_OP_* */
385 __le32 flags; /* CEPH_OSD_FLAG_* */ 393 __le32 flags; /* CEPH_OSD_OP_FLAG_* */
386 union { 394 union {
387 struct { 395 struct {
388 __le64 offset, length; 396 __le64 offset, length;
@@ -416,6 +424,10 @@ struct ceph_osd_op {
416 __le64 offset, length; 424 __le64 offset, length;
417 __le64 src_offset; 425 __le64 src_offset;
418 } __attribute__ ((packed)) clonerange; 426 } __attribute__ ((packed)) clonerange;
427 struct {
428 __le64 expected_object_size;
429 __le64 expected_write_size;
430 } __attribute__ ((packed)) alloc_hint;
419 }; 431 };
420 __le32 payload_len; 432 __le32 payload_len;
421} __attribute__ ((packed)); 433} __attribute__ ((packed));
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index acaa5615d634..4fad5f8ee01d 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -51,6 +51,7 @@ enum {
51 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ 51 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
52 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, 52 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
53 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, 53 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
54 CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12
54}; 55};
55 56
56/* 57/*
@@ -173,6 +174,12 @@ struct crush_map {
173 * apply to a collision: in that case we will retry as we used 174 * apply to a collision: in that case we will retry as we used
174 * to. */ 175 * to. */
175 __u32 chooseleaf_descend_once; 176 __u32 chooseleaf_descend_once;
177
178 /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
179 * bits. a value of 1 is best for new clusters. for legacy clusters
180 * that want to limit reshuffling, a value of 3 or 4 will make the
181 * mappings line up a bit better with previous mappings. */
182 __u8 chooseleaf_vary_r;
176}; 183};
177 184
178 185