about summary refs log tree commit diff stats
path: root/include/linux
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/ceph/ceph_features.h12
-rw-r--r--include/linux/ceph/ceph_fs.h5
-rw-r--r--include/linux/ceph/osd_client.h11
-rw-r--r--include/linux/ceph/osdmap.h50
-rw-r--r--include/linux/ceph/rados.h18
-rw-r--r--include/linux/crush/crush.h7
6 files changed, 91 insertions, 12 deletions
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 138448f766b4..d12659ce550d 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -43,6 +43,13 @@
43#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ 43#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */
44#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) 44#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37)
45#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) 45#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38)
46#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */
47/* The process supports new-style OSDMap encoding. Monitors also use
48 this bit to determine if peers support NAK messages. */
49#define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39)
50#define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40)
51#define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41)
52#define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */
46 53
47/* 54/*
48 * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature 55 * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
@@ -82,7 +89,10 @@ static inline u64 ceph_sanitize_features(u64 features)
82 CEPH_FEATURE_OSDHASHPSPOOL | \ 89 CEPH_FEATURE_OSDHASHPSPOOL | \
83 CEPH_FEATURE_OSD_CACHEPOOL | \ 90 CEPH_FEATURE_OSD_CACHEPOOL | \
84 CEPH_FEATURE_CRUSH_V2 | \ 91 CEPH_FEATURE_CRUSH_V2 | \
85 CEPH_FEATURE_EXPORT_PEER) 92 CEPH_FEATURE_EXPORT_PEER | \
93 CEPH_FEATURE_OSDMAP_ENC | \
94 CEPH_FEATURE_CRUSH_TUNABLES3 | \
95 CEPH_FEATURE_OSD_PRIMARY_AFFINITY)
86 96
87#define CEPH_FEATURES_REQUIRED_DEFAULT \ 97#define CEPH_FEATURES_REQUIRED_DEFAULT \
88 (CEPH_FEATURE_NOSRCADDR | \ 98 (CEPH_FEATURE_NOSRCADDR | \
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 25bfb0eff772..5f6db18d72e8 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -332,6 +332,7 @@ enum {
332 CEPH_MDS_OP_LOOKUPHASH = 0x00102, 332 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
333 CEPH_MDS_OP_LOOKUPPARENT = 0x00103, 333 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
334 CEPH_MDS_OP_LOOKUPINO = 0x00104, 334 CEPH_MDS_OP_LOOKUPINO = 0x00104,
335 CEPH_MDS_OP_LOOKUPNAME = 0x00105,
335 336
336 CEPH_MDS_OP_SETXATTR = 0x01105, 337 CEPH_MDS_OP_SETXATTR = 0x01105,
337 CEPH_MDS_OP_RMXATTR = 0x01106, 338 CEPH_MDS_OP_RMXATTR = 0x01106,
@@ -420,8 +421,8 @@ union ceph_mds_request_args {
420 struct { 421 struct {
421 __u8 rule; /* currently fcntl or flock */ 422 __u8 rule; /* currently fcntl or flock */
422 __u8 type; /* shared, exclusive, remove*/ 423 __u8 type; /* shared, exclusive, remove*/
424 __le64 owner; /* owner of the lock */
423 __le64 pid; /* process id requesting the lock */ 425 __le64 pid; /* process id requesting the lock */
424 __le64 pid_namespace;
425 __le64 start; /* initial location to lock */ 426 __le64 start; /* initial location to lock */
426 __le64 length; /* num bytes to lock from start */ 427 __le64 length; /* num bytes to lock from start */
427 __u8 wait; /* will caller wait for lock to become available? */ 428 __u8 wait; /* will caller wait for lock to become available? */
@@ -532,8 +533,8 @@ struct ceph_filelock {
532 __le64 start;/* file offset to start lock at */ 533 __le64 start;/* file offset to start lock at */
533 __le64 length; /* num bytes to lock; 0 for all following start */ 534 __le64 length; /* num bytes to lock; 0 for all following start */
534 __le64 client; /* which client holds the lock */ 535 __le64 client; /* which client holds the lock */
536 __le64 owner; /* owner of the lock */
535 __le64 pid; /* process id holding the lock on the client */ 537 __le64 pid; /* process id holding the lock on the client */
536 __le64 pid_namespace;
537 __u8 type; /* shared lock, exclusive lock, or unlock */ 538 __u8 type; /* shared lock, exclusive lock, or unlock */
538} __attribute__ ((packed)); 539} __attribute__ ((packed));
539 540
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index fd47e872ebcc..94ec69672164 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -43,7 +43,7 @@ struct ceph_osd {
43}; 43};
44 44
45 45
46#define CEPH_OSD_MAX_OP 2 46#define CEPH_OSD_MAX_OP 3
47 47
48enum ceph_osd_data_type { 48enum ceph_osd_data_type {
49 CEPH_OSD_DATA_TYPE_NONE = 0, 49 CEPH_OSD_DATA_TYPE_NONE = 0,
@@ -76,6 +76,7 @@ struct ceph_osd_data {
76 76
77struct ceph_osd_req_op { 77struct ceph_osd_req_op {
78 u16 op; /* CEPH_OSD_OP_* */ 78 u16 op; /* CEPH_OSD_OP_* */
79 u32 flags; /* CEPH_OSD_OP_FLAG_* */
79 u32 payload_len; 80 u32 payload_len;
80 union { 81 union {
81 struct ceph_osd_data raw_data_in; 82 struct ceph_osd_data raw_data_in;
@@ -102,6 +103,10 @@ struct ceph_osd_req_op {
102 u32 timeout; 103 u32 timeout;
103 __u8 flag; 104 __u8 flag;
104 } watch; 105 } watch;
106 struct {
107 u64 expected_object_size;
108 u64 expected_write_size;
109 } alloc_hint;
105 }; 110 };
106}; 111};
107 112
@@ -293,6 +298,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
293extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 298extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
294 unsigned int which, u16 opcode, 299 unsigned int which, u16 opcode,
295 u64 cookie, u64 version, int flag); 300 u64 cookie, u64 version, int flag);
301extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
302 unsigned int which,
303 u64 expected_object_size,
304 u64 expected_write_size);
296 305
297extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 306extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
298 struct ceph_snap_context *snapc, 307 struct ceph_snap_context *snapc,
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 49ff69f0746b..561ea896c657 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -41,6 +41,18 @@ struct ceph_pg_pool_info {
41 char *name; 41 char *name;
42}; 42};
43 43
44static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
45{
46 switch (pool->type) {
47 case CEPH_POOL_TYPE_REP:
48 return true;
49 case CEPH_POOL_TYPE_EC:
50 return false;
51 default:
52 BUG_ON(1);
53 }
54}
55
44struct ceph_object_locator { 56struct ceph_object_locator {
45 s64 pool; 57 s64 pool;
46}; 58};
@@ -60,8 +72,16 @@ struct ceph_object_id {
60struct ceph_pg_mapping { 72struct ceph_pg_mapping {
61 struct rb_node node; 73 struct rb_node node;
62 struct ceph_pg pgid; 74 struct ceph_pg pgid;
63 int len; 75
64 int osds[]; 76 union {
77 struct {
78 int len;
79 int osds[];
80 } pg_temp;
81 struct {
82 int osd;
83 } primary_temp;
84 };
65}; 85};
66 86
67struct ceph_osdmap { 87struct ceph_osdmap {
@@ -78,12 +98,19 @@ struct ceph_osdmap {
78 struct ceph_entity_addr *osd_addr; 98 struct ceph_entity_addr *osd_addr;
79 99
80 struct rb_root pg_temp; 100 struct rb_root pg_temp;
101 struct rb_root primary_temp;
102
103 u32 *osd_primary_affinity;
104
81 struct rb_root pg_pools; 105 struct rb_root pg_pools;
82 u32 pool_max; 106 u32 pool_max;
83 107
84 /* the CRUSH map specifies the mapping of placement groups to 108 /* the CRUSH map specifies the mapping of placement groups to
85 * the list of osds that store+replicate them. */ 109 * the list of osds that store+replicate them. */
86 struct crush_map *crush; 110 struct crush_map *crush;
111
112 struct mutex crush_scratch_mutex;
113 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
87}; 114};
88 115
89static inline void ceph_oid_set_name(struct ceph_object_id *oid, 116static inline void ceph_oid_set_name(struct ceph_object_id *oid,
@@ -110,9 +137,21 @@ static inline void ceph_oid_copy(struct ceph_object_id *dest,
110 dest->name_len = src->name_len; 137 dest->name_len = src->name_len;
111} 138}
112 139
140static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
141{
142 return osd >= 0 && osd < map->max_osd &&
143 (map->osd_state[osd] & CEPH_OSD_EXISTS);
144}
145
113static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) 146static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
114{ 147{
115 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); 148 return ceph_osd_exists(map, osd) &&
149 (map->osd_state[osd] & CEPH_OSD_UP);
150}
151
152static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd)
153{
154 return !ceph_osd_is_up(map, osd);
116} 155}
117 156
118static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) 157static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
@@ -121,6 +160,7 @@ static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
121} 160}
122 161
123extern char *ceph_osdmap_state_str(char *str, int len, int state); 162extern char *ceph_osdmap_state_str(char *str, int len, int state);
163extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
124 164
125static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, 165static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
126 int osd) 166 int osd)
@@ -153,7 +193,7 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
153 return 0; 193 return 0;
154} 194}
155 195
156extern struct ceph_osdmap *osdmap_decode(void **p, void *end); 196extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
157extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 197extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
158 struct ceph_osdmap *map, 198 struct ceph_osdmap *map,
159 struct ceph_messenger *msgr); 199 struct ceph_messenger *msgr);
@@ -172,7 +212,7 @@ extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
172 212
173extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, 213extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
174 struct ceph_pg pgid, 214 struct ceph_pg pgid,
175 int *acting); 215 int *osds, int *primary);
176extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 216extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
177 struct ceph_pg pgid); 217 struct ceph_pg pgid);
178 218
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 96292df4041b..f20e0d8a2155 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -81,8 +81,9 @@ struct ceph_pg_v1 {
81 */ 81 */
82#define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */ 82#define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */
83 83
84#define CEPH_PG_TYPE_REP 1 84#define CEPH_POOL_TYPE_REP 1
85#define CEPH_PG_TYPE_RAID4 2 85#define CEPH_POOL_TYPE_RAID4 2 /* never implemented */
86#define CEPH_POOL_TYPE_EC 3
86 87
87/* 88/*
88 * stable_mod func is used to control number of placement groups. 89 * stable_mod func is used to control number of placement groups.
@@ -133,6 +134,10 @@ extern const char *ceph_osd_state_name(int s);
133#define CEPH_OSD_IN 0x10000 134#define CEPH_OSD_IN 0x10000
134#define CEPH_OSD_OUT 0 135#define CEPH_OSD_OUT 0
135 136
137/* osd primary-affinity. fixed point value: 0x10000 == baseline */
138#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000
139#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000
140
136 141
137/* 142/*
138 * osd map flag bits 143 * osd map flag bits
@@ -227,6 +232,9 @@ enum {
227 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24, 232 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
228 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25, 233 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
229 234
235 /* hints */
236 CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35,
237
230 /** multi **/ 238 /** multi **/
231 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1, 239 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
232 CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2, 240 CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
@@ -382,7 +390,7 @@ enum {
382 */ 390 */
383struct ceph_osd_op { 391struct ceph_osd_op {
384 __le16 op; /* CEPH_OSD_OP_* */ 392 __le16 op; /* CEPH_OSD_OP_* */
385 __le32 flags; /* CEPH_OSD_FLAG_* */ 393 __le32 flags; /* CEPH_OSD_OP_FLAG_* */
386 union { 394 union {
387 struct { 395 struct {
388 __le64 offset, length; 396 __le64 offset, length;
@@ -416,6 +424,10 @@ struct ceph_osd_op {
416 __le64 offset, length; 424 __le64 offset, length;
417 __le64 src_offset; 425 __le64 src_offset;
418 } __attribute__ ((packed)) clonerange; 426 } __attribute__ ((packed)) clonerange;
427 struct {
428 __le64 expected_object_size;
429 __le64 expected_write_size;
430 } __attribute__ ((packed)) alloc_hint;
419 }; 431 };
420 __le32 payload_len; 432 __le32 payload_len;
421} __attribute__ ((packed)); 433} __attribute__ ((packed));
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index acaa5615d634..4fad5f8ee01d 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -51,6 +51,7 @@ enum {
51 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ 51 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
52 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, 52 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
53 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, 53 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
54 CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12
54}; 55};
55 56
56/* 57/*
@@ -173,6 +174,12 @@ struct crush_map {
173 * apply to a collision: in that case we will retry as we used 174 * apply to a collision: in that case we will retry as we used
174 * to. */ 175 * to. */
175 __u32 chooseleaf_descend_once; 176 __u32 chooseleaf_descend_once;
177
178 /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
179 * bits. a value of 1 is best for new clusters. for legacy clusters
180 * that want to limit reshuffling, a value of 3 or 4 will make the
181 * mappings line up a bit better with previous mappings. */
182 __u8 chooseleaf_vary_r;
176}; 183};
177 184
178 185