aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-02-28 20:43:09 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2013-02-28 20:43:09 -0500
commit1cf0209c431fa7790253c532039d53b0773193aa (patch)
tree24310eaaf4c9583988d9098f6c85a4a34970b5b9 /include
parentde1a2262b006220dae2561a299a6ea128c46f4fe (diff)
parent83ca14fdd35821554058e5fd4fa7b118ee504a33 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil: "A few groups of patches here. Alex has been hard at work improving the RBD code, layout groundwork for understanding the new formats and doing layering. Most of the infrastructure is now in place for the final bits that will come with the next window. There are a few changes to the data layout. Jim Schutt's patch fixes some non-ideal CRUSH behavior, and a set of patches from me updates the client to speak a newer version of the protocol and implement an improved hashing strategy across storage nodes (when the server side supports it too). A pair of patches from Sam Lang fix the atomicity of open+create operations. Several patches from Yan, Zheng fix various mds/client issues that turned up during multi-mds torture tests. A final set of patches expose file layouts via virtual xattrs, and allow the policies to be set on directories via xattrs as well (avoiding the awkward ioctl interface and providing a consistent interface for both kernel mount and ceph-fuse users)." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (143 commits) libceph: add support for HASHPSPOOL pool flag libceph: update osd request/reply encoding libceph: calculate placement based on the internal data types ceph: update support for PGID64, PGPOOL3, OSDENC protocol features ceph: update "ceph_features.h" libceph: decode into cpu-native ceph_pg type libceph: rename ceph_pg -> ceph_pg_v1 rbd: pass length, not op for osd completions rbd: move rbd_osd_trivial_callback() libceph: use a do..while loop in con_work() libceph: use a flag to indicate a fault has occurred libceph: separate non-locked fault handling libceph: encapsulate connection backoff libceph: eliminate sparse warnings ceph: eliminate sparse warnings in fs code rbd: eliminate sparse warnings libceph: define connection flag helpers rbd: normalize dout() calls rbd: barriers are hard rbd: ignore zero-length requests ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/ceph/ceph_features.h38
-rw-r--r--include/linux/ceph/ceph_fs.h32
-rw-r--r--include/linux/ceph/decode.h29
-rw-r--r--include/linux/ceph/libceph.h16
-rw-r--r--include/linux/ceph/mdsmap.h4
-rw-r--r--include/linux/ceph/messenger.h2
-rw-r--r--include/linux/ceph/osd_client.h74
-rw-r--r--include/linux/ceph/osdmap.h30
-rw-r--r--include/linux/ceph/rados.h158
-rw-r--r--include/linux/crush/crush.h2
10 files changed, 227 insertions, 158 deletions
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index dad579b0c0e6..76554cecaab2 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -12,16 +12,46 @@
12#define CEPH_FEATURE_MONNAMES (1<<5) 12#define CEPH_FEATURE_MONNAMES (1<<5)
13#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) 13#define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
14#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) 14#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
15/* bits 8-17 defined by user-space; not supported yet here */ 15#define CEPH_FEATURE_OBJECTLOCATOR (1<<8)
16#define CEPH_FEATURE_PGID64 (1<<9)
17#define CEPH_FEATURE_INCSUBOSDMAP (1<<10)
18#define CEPH_FEATURE_PGPOOL3 (1<<11)
19#define CEPH_FEATURE_OSDREPLYMUX (1<<12)
20#define CEPH_FEATURE_OSDENC (1<<13)
21#define CEPH_FEATURE_OMAP (1<<14)
22#define CEPH_FEATURE_MONENC (1<<15)
23#define CEPH_FEATURE_QUERY_T (1<<16)
24#define CEPH_FEATURE_INDEP_PG_MAP (1<<17)
16#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) 25#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
26#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19)
27#define CEPH_FEATURE_MON_NULLROUTE (1<<20)
28#define CEPH_FEATURE_MON_GV (1<<21)
29#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22)
30#define CEPH_FEATURE_MSG_AUTH (1<<23)
31#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24)
32#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25)
33#define CEPH_FEATURE_CREATEPOOLID (1<<26)
34#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
35#define CEPH_FEATURE_OSD_HBMSGS (1<<28)
36#define CEPH_FEATURE_MDSENC (1<<29)
37#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30)
17 38
18/* 39/*
19 * Features supported. 40 * Features supported.
20 */ 41 */
21#define CEPH_FEATURES_SUPPORTED_DEFAULT \ 42#define CEPH_FEATURES_SUPPORTED_DEFAULT \
22 (CEPH_FEATURE_NOSRCADDR | \ 43 (CEPH_FEATURE_NOSRCADDR | \
23 CEPH_FEATURE_CRUSH_TUNABLES) 44 CEPH_FEATURE_PGID64 | \
45 CEPH_FEATURE_PGPOOL3 | \
46 CEPH_FEATURE_OSDENC | \
47 CEPH_FEATURE_CRUSH_TUNABLES | \
48 CEPH_FEATURE_CRUSH_TUNABLES2 | \
49 CEPH_FEATURE_REPLY_CREATE_INODE | \
50 CEPH_FEATURE_OSDHASHPSPOOL)
24 51
25#define CEPH_FEATURES_REQUIRED_DEFAULT \ 52#define CEPH_FEATURES_REQUIRED_DEFAULT \
26 (CEPH_FEATURE_NOSRCADDR) 53 (CEPH_FEATURE_NOSRCADDR | \
54 CEPH_FEATURE_PGID64 | \
55 CEPH_FEATURE_PGPOOL3 | \
56 CEPH_FEATURE_OSDENC)
27#endif 57#endif
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index cf6f4d998a76..2ad7b860f062 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -21,16 +21,14 @@
21 * internal cluster protocols separately from the public, 21 * internal cluster protocols separately from the public,
22 * client-facing protocol. 22 * client-facing protocol.
23 */ 23 */
24#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
25#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
26#define CEPH_MON_PROTOCOL 5 /* cluster internal */
27#define CEPH_OSDC_PROTOCOL 24 /* server/client */ 24#define CEPH_OSDC_PROTOCOL 24 /* server/client */
28#define CEPH_MDSC_PROTOCOL 32 /* server/client */ 25#define CEPH_MDSC_PROTOCOL 32 /* server/client */
29#define CEPH_MONC_PROTOCOL 15 /* server/client */ 26#define CEPH_MONC_PROTOCOL 15 /* server/client */
30 27
31 28
32#define CEPH_INO_ROOT 1 29#define CEPH_INO_ROOT 1
33#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ 30#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
31#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */
34 32
35/* arbitrary limit on max # of monitors (cluster of 3 is typical) */ 33/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
36#define CEPH_MAX_MON 31 34#define CEPH_MAX_MON 31
@@ -51,7 +49,7 @@ struct ceph_file_layout {
51 __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */ 49 __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */
52 50
53 /* object -> pg layout */ 51 /* object -> pg layout */
54 __le32 fl_unused; /* unused; used to be preferred primary (-1) */ 52 __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */
55 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ 53 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
56} __attribute__ ((packed)); 54} __attribute__ ((packed));
57 55
@@ -101,6 +99,8 @@ struct ceph_dir_layout {
101#define CEPH_MSG_MON_SUBSCRIBE_ACK 16 99#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
102#define CEPH_MSG_AUTH 17 100#define CEPH_MSG_AUTH 17
103#define CEPH_MSG_AUTH_REPLY 18 101#define CEPH_MSG_AUTH_REPLY 18
102#define CEPH_MSG_MON_GET_VERSION 19
103#define CEPH_MSG_MON_GET_VERSION_REPLY 20
104 104
105/* client <-> mds */ 105/* client <-> mds */
106#define CEPH_MSG_MDS_MAP 21 106#define CEPH_MSG_MDS_MAP 21
@@ -221,6 +221,11 @@ struct ceph_mon_subscribe_ack {
221} __attribute__ ((packed)); 221} __attribute__ ((packed));
222 222
223/* 223/*
224 * mdsmap flags
225 */
226#define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */
227
228/*
224 * mds states 229 * mds states
225 * > 0 -> in 230 * > 0 -> in
226 * <= 0 -> out 231 * <= 0 -> out
@@ -233,6 +238,7 @@ struct ceph_mon_subscribe_ack {
233#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ 238#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
234#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ 239#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
235#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ 240#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
241#define CEPH_MDS_STATE_REPLAYONCE -9 /* up, replaying an active node's journal */
236 242
237#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ 243#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
238#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed 244#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
@@ -264,6 +270,7 @@ extern const char *ceph_mds_state_name(int s);
264#define CEPH_LOCK_IXATTR 2048 270#define CEPH_LOCK_IXATTR 2048
265#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ 271#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
266#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ 272#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
273#define CEPH_LOCK_IPOLICY 16384 /* policy lock on dirs. MDS internal */
267 274
268/* client_session ops */ 275/* client_session ops */
269enum { 276enum {
@@ -338,6 +345,12 @@ extern const char *ceph_mds_op_name(int op);
338#define CEPH_SETATTR_SIZE 32 345#define CEPH_SETATTR_SIZE 32
339#define CEPH_SETATTR_CTIME 64 346#define CEPH_SETATTR_CTIME 64
340 347
348/*
349 * Ceph setxattr request flags.
350 */
351#define CEPH_XATTR_CREATE 1
352#define CEPH_XATTR_REPLACE 2
353
341union ceph_mds_request_args { 354union ceph_mds_request_args {
342 struct { 355 struct {
343 __le32 mask; /* CEPH_CAP_* */ 356 __le32 mask; /* CEPH_CAP_* */
@@ -522,14 +535,17 @@ int ceph_flags_to_mode(int flags);
522#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ 535#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
523#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ 536#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
524 537
538#define CEPH_CAP_SIMPLE_BITS 2
539#define CEPH_CAP_FILE_BITS 8
540
525/* per-lock shift */ 541/* per-lock shift */
526#define CEPH_CAP_SAUTH 2 542#define CEPH_CAP_SAUTH 2
527#define CEPH_CAP_SLINK 4 543#define CEPH_CAP_SLINK 4
528#define CEPH_CAP_SXATTR 6 544#define CEPH_CAP_SXATTR 6
529#define CEPH_CAP_SFILE 8 545#define CEPH_CAP_SFILE 8
530#define CEPH_CAP_SFLOCK 20 546#define CEPH_CAP_SFLOCK 20
531 547
532#define CEPH_CAP_BITS 22 548#define CEPH_CAP_BITS 22
533 549
534/* composed values */ 550/* composed values */
535#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) 551#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 63d092822bad..360d9d08ca9e 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
52 return end >= *p && n <= end - *p; 52 return end >= *p && n <= end - *p;
53} 53}
54 54
55#define ceph_decode_need(p, end, n, bad) \ 55#define ceph_decode_need(p, end, n, bad) \
56 do { \ 56 do { \
57 if (!likely(ceph_has_room(p, end, n))) \ 57 if (!likely(ceph_has_room(p, end, n))) \
58 goto bad; \ 58 goto bad; \
59 } while (0) 59 } while (0)
60 60
61#define ceph_decode_64_safe(p, end, v, bad) \ 61#define ceph_decode_64_safe(p, end, v, bad) \
@@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
99 * 99 *
100 * There are two possible failures: 100 * There are two possible failures:
101 * - converting the string would require accessing memory at or 101 * - converting the string would require accessing memory at or
102 * beyond the "end" pointer provided (-E 102 * beyond the "end" pointer provided (-ERANGE)
103 * - memory could not be allocated for the result 103 * - memory could not be allocated for the result (-ENOMEM)
104 */ 104 */
105static inline char *ceph_extract_encoded_string(void **p, void *end, 105static inline char *ceph_extract_encoded_string(void **p, void *end,
106 size_t *lenp, gfp_t gfp) 106 size_t *lenp, gfp_t gfp)
@@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p, void *end,
217 *p += len; 217 *p += len;
218} 218}
219 219
220#define ceph_encode_need(p, end, n, bad) \ 220#define ceph_encode_need(p, end, n, bad) \
221 do { \ 221 do { \
222 if (!likely(ceph_has_room(p, end, n))) \ 222 if (!likely(ceph_has_room(p, end, n))) \
223 goto bad; \ 223 goto bad; \
224 } while (0) 224 } while (0)
225 225
226#define ceph_encode_64_safe(p, end, v, bad) \ 226#define ceph_encode_64_safe(p, end, v, bad) \
@@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p, void *end,
231#define ceph_encode_32_safe(p, end, v, bad) \ 231#define ceph_encode_32_safe(p, end, v, bad) \
232 do { \ 232 do { \
233 ceph_encode_need(p, end, sizeof(u32), bad); \ 233 ceph_encode_need(p, end, sizeof(u32), bad); \
234 ceph_encode_32(p, v); \ 234 ceph_encode_32(p, v); \
235 } while (0) 235 } while (0)
236#define ceph_encode_16_safe(p, end, v, bad) \ 236#define ceph_encode_16_safe(p, end, v, bad) \
237 do { \ 237 do { \
238 ceph_encode_need(p, end, sizeof(u16), bad); \ 238 ceph_encode_need(p, end, sizeof(u16), bad); \
239 ceph_encode_16(p, v); \ 239 ceph_encode_16(p, v); \
240 } while (0)
241#define ceph_encode_8_safe(p, end, v, bad) \
242 do { \
243 ceph_encode_need(p, end, sizeof(u8), bad); \
244 ceph_encode_8(p, v); \
240 } while (0) 245 } while (0)
241 246
242#define ceph_encode_copy_safe(p, end, pv, n, bad) \ 247#define ceph_encode_copy_safe(p, end, pv, n, bad) \
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 084d3c622b12..29818fc3fa49 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -193,6 +193,8 @@ static inline int calc_pages_for(u64 off, u64 len)
193} 193}
194 194
195/* ceph_common.c */ 195/* ceph_common.c */
196extern bool libceph_compatible(void *data);
197
196extern const char *ceph_msg_type_name(int type); 198extern const char *ceph_msg_type_name(int type);
197extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); 199extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
198extern struct kmem_cache *ceph_inode_cachep; 200extern struct kmem_cache *ceph_inode_cachep;
@@ -220,7 +222,7 @@ extern int ceph_open_session(struct ceph_client *client);
220/* pagevec.c */ 222/* pagevec.c */
221extern void ceph_release_page_vector(struct page **pages, int num_pages); 223extern void ceph_release_page_vector(struct page **pages, int num_pages);
222 224
223extern struct page **ceph_get_direct_page_vector(const char __user *data, 225extern struct page **ceph_get_direct_page_vector(const void __user *data,
224 int num_pages, 226 int num_pages,
225 bool write_page); 227 bool write_page);
226extern void ceph_put_page_vector(struct page **pages, int num_pages, 228extern void ceph_put_page_vector(struct page **pages, int num_pages,
@@ -228,15 +230,15 @@ extern void ceph_put_page_vector(struct page **pages, int num_pages,
228extern void ceph_release_page_vector(struct page **pages, int num_pages); 230extern void ceph_release_page_vector(struct page **pages, int num_pages);
229extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); 231extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
230extern int ceph_copy_user_to_page_vector(struct page **pages, 232extern int ceph_copy_user_to_page_vector(struct page **pages,
231 const char __user *data, 233 const void __user *data,
232 loff_t off, size_t len); 234 loff_t off, size_t len);
233extern int ceph_copy_to_page_vector(struct page **pages, 235extern void ceph_copy_to_page_vector(struct page **pages,
234 const char *data, 236 const void *data,
235 loff_t off, size_t len); 237 loff_t off, size_t len);
236extern int ceph_copy_from_page_vector(struct page **pages, 238extern void ceph_copy_from_page_vector(struct page **pages,
237 char *data, 239 void *data,
238 loff_t off, size_t len); 240 loff_t off, size_t len);
239extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data, 241extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data,
240 loff_t off, size_t len); 242 loff_t off, size_t len);
241extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); 243extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
242 244
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index cb15b5d867c7..87ed09f54800 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -29,8 +29,8 @@ struct ceph_mdsmap {
29 29
30 /* which object pools file data can be stored in */ 30 /* which object pools file data can be stored in */
31 int m_num_data_pg_pools; 31 int m_num_data_pg_pools;
32 u32 *m_data_pg_pools; 32 u64 *m_data_pg_pools;
33 u32 m_cas_pg_pool; 33 u64 m_cas_pg_pool;
34}; 34};
35 35
36static inline struct ceph_entity_addr * 36static inline struct ceph_entity_addr *
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 14ba5ee738a9..60903e0f665c 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -83,9 +83,11 @@ struct ceph_msg {
83 struct list_head list_head; 83 struct list_head list_head;
84 84
85 struct kref kref; 85 struct kref kref;
86#ifdef CONFIG_BLOCK
86 struct bio *bio; /* instead of pages/pagelist */ 87 struct bio *bio; /* instead of pages/pagelist */
87 struct bio *bio_iter; /* bio iterator */ 88 struct bio *bio_iter; /* bio iterator */
88 int bio_seg; /* current bio segment */ 89 int bio_seg; /* current bio segment */
90#endif /* CONFIG_BLOCK */
89 struct ceph_pagelist *trail; /* the trailing part of the data */ 91 struct ceph_pagelist *trail; /* the trailing part of the data */
90 bool front_is_vmalloc; 92 bool front_is_vmalloc;
91 bool more_to_follow; 93 bool more_to_follow;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index d9b880e977e6..1dd5d466b6f9 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -10,6 +10,7 @@
10#include <linux/ceph/osdmap.h> 10#include <linux/ceph/osdmap.h>
11#include <linux/ceph/messenger.h> 11#include <linux/ceph/messenger.h>
12#include <linux/ceph/auth.h> 12#include <linux/ceph/auth.h>
13#include <linux/ceph/pagelist.h>
13 14
14/* 15/*
15 * Maximum object name size 16 * Maximum object name size
@@ -22,7 +23,6 @@ struct ceph_snap_context;
22struct ceph_osd_request; 23struct ceph_osd_request;
23struct ceph_osd_client; 24struct ceph_osd_client;
24struct ceph_authorizer; 25struct ceph_authorizer;
25struct ceph_pagelist;
26 26
27/* 27/*
28 * completion callback for async writepages 28 * completion callback for async writepages
@@ -47,6 +47,9 @@ struct ceph_osd {
47 struct list_head o_keepalive_item; 47 struct list_head o_keepalive_item;
48}; 48};
49 49
50
51#define CEPH_OSD_MAX_OP 10
52
50/* an in-flight request */ 53/* an in-flight request */
51struct ceph_osd_request { 54struct ceph_osd_request {
52 u64 r_tid; /* unique for this client */ 55 u64 r_tid; /* unique for this client */
@@ -63,9 +66,23 @@ struct ceph_osd_request {
63 struct ceph_connection *r_con_filling_msg; 66 struct ceph_connection *r_con_filling_msg;
64 67
65 struct ceph_msg *r_request, *r_reply; 68 struct ceph_msg *r_request, *r_reply;
66 int r_result;
67 int r_flags; /* any additional flags for the osd */ 69 int r_flags; /* any additional flags for the osd */
68 u32 r_sent; /* >0 if r_request is sending/sent */ 70 u32 r_sent; /* >0 if r_request is sending/sent */
71 int r_num_ops;
72
73 /* encoded message content */
74 struct ceph_osd_op *r_request_ops;
75 /* these are updated on each send */
76 __le32 *r_request_osdmap_epoch;
77 __le32 *r_request_flags;
78 __le64 *r_request_pool;
79 void *r_request_pgid;
80 __le32 *r_request_attempts;
81 struct ceph_eversion *r_request_reassert_version;
82
83 int r_result;
84 int r_reply_op_len[CEPH_OSD_MAX_OP];
85 s32 r_reply_op_result[CEPH_OSD_MAX_OP];
69 int r_got_reply; 86 int r_got_reply;
70 int r_linger; 87 int r_linger;
71 88
@@ -82,6 +99,7 @@ struct ceph_osd_request {
82 99
83 char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ 100 char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */
84 int r_oid_len; 101 int r_oid_len;
102 u64 r_snapid;
85 unsigned long r_stamp; /* send OR check time */ 103 unsigned long r_stamp; /* send OR check time */
86 104
87 struct ceph_file_layout r_file_layout; 105 struct ceph_file_layout r_file_layout;
@@ -95,7 +113,7 @@ struct ceph_osd_request {
95 struct bio *r_bio; /* instead of pages */ 113 struct bio *r_bio; /* instead of pages */
96#endif 114#endif
97 115
98 struct ceph_pagelist *r_trail; /* trailing part of the data */ 116 struct ceph_pagelist r_trail; /* trailing part of the data */
99}; 117};
100 118
101struct ceph_osd_event { 119struct ceph_osd_event {
@@ -107,7 +125,6 @@ struct ceph_osd_event {
107 struct rb_node node; 125 struct rb_node node;
108 struct list_head osd_node; 126 struct list_head osd_node;
109 struct kref kref; 127 struct kref kref;
110 struct completion completion;
111}; 128};
112 129
113struct ceph_osd_event_work { 130struct ceph_osd_event_work {
@@ -157,7 +174,7 @@ struct ceph_osd_client {
157 174
158struct ceph_osd_req_op { 175struct ceph_osd_req_op {
159 u16 op; /* CEPH_OSD_OP_* */ 176 u16 op; /* CEPH_OSD_OP_* */
160 u32 flags; /* CEPH_OSD_FLAG_* */ 177 u32 payload_len;
161 union { 178 union {
162 struct { 179 struct {
163 u64 offset, length; 180 u64 offset, length;
@@ -166,23 +183,24 @@ struct ceph_osd_req_op {
166 } extent; 183 } extent;
167 struct { 184 struct {
168 const char *name; 185 const char *name;
169 u32 name_len;
170 const char *val; 186 const char *val;
187 u32 name_len;
171 u32 value_len; 188 u32 value_len;
172 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ 189 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
173 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ 190 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
174 } xattr; 191 } xattr;
175 struct { 192 struct {
176 const char *class_name; 193 const char *class_name;
177 __u8 class_len;
178 const char *method_name; 194 const char *method_name;
179 __u8 method_len;
180 __u8 argc;
181 const char *indata; 195 const char *indata;
182 u32 indata_len; 196 u32 indata_len;
197 __u8 class_len;
198 __u8 method_len;
199 __u8 argc;
183 } cls; 200 } cls;
184 struct { 201 struct {
185 u64 cookie, count; 202 u64 cookie;
203 u64 count;
186 } pgls; 204 } pgls;
187 struct { 205 struct {
188 u64 snapid; 206 u64 snapid;
@@ -190,12 +208,11 @@ struct ceph_osd_req_op {
190 struct { 208 struct {
191 u64 cookie; 209 u64 cookie;
192 u64 ver; 210 u64 ver;
193 __u8 flag;
194 u32 prot_ver; 211 u32 prot_ver;
195 u32 timeout; 212 u32 timeout;
213 __u8 flag;
196 } watch; 214 } watch;
197 }; 215 };
198 u32 payload_len;
199}; 216};
200 217
201extern int ceph_osdc_init(struct ceph_osd_client *osdc, 218extern int ceph_osdc_init(struct ceph_osd_client *osdc,
@@ -207,29 +224,19 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
207extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 224extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
208 struct ceph_msg *msg); 225 struct ceph_msg *msg);
209 226
210extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
211 struct ceph_file_layout *layout,
212 u64 snapid,
213 u64 off, u64 *plen, u64 *bno,
214 struct ceph_osd_request *req,
215 struct ceph_osd_req_op *op);
216
217extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 227extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
218 int flags,
219 struct ceph_snap_context *snapc, 228 struct ceph_snap_context *snapc,
220 struct ceph_osd_req_op *ops, 229 unsigned int num_op,
221 bool use_mempool, 230 bool use_mempool,
222 gfp_t gfp_flags, 231 gfp_t gfp_flags);
223 struct page **pages,
224 struct bio *bio);
225 232
226extern void ceph_osdc_build_request(struct ceph_osd_request *req, 233extern void ceph_osdc_build_request(struct ceph_osd_request *req,
227 u64 off, u64 *plen, 234 u64 off, u64 len,
235 unsigned int num_op,
228 struct ceph_osd_req_op *src_ops, 236 struct ceph_osd_req_op *src_ops,
229 struct ceph_snap_context *snapc, 237 struct ceph_snap_context *snapc,
230 struct timespec *mtime, 238 u64 snap_id,
231 const char *oid, 239 struct timespec *mtime);
232 int oid_len);
233 240
234extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 241extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
235 struct ceph_file_layout *layout, 242 struct ceph_file_layout *layout,
@@ -239,8 +246,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
239 int do_sync, u32 truncate_seq, 246 int do_sync, u32 truncate_seq,
240 u64 truncate_size, 247 u64 truncate_size,
241 struct timespec *mtime, 248 struct timespec *mtime,
242 bool use_mempool, int num_reply, 249 bool use_mempool, int page_align);
243 int page_align);
244 250
245extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 251extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
246 struct ceph_osd_request *req); 252 struct ceph_osd_request *req);
@@ -279,17 +285,13 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
279 u64 off, u64 len, 285 u64 off, u64 len,
280 u32 truncate_seq, u64 truncate_size, 286 u32 truncate_seq, u64 truncate_size,
281 struct timespec *mtime, 287 struct timespec *mtime,
282 struct page **pages, int nr_pages, 288 struct page **pages, int nr_pages);
283 int flags, int do_sync, bool nofail);
284 289
285/* watch/notify events */ 290/* watch/notify events */
286extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, 291extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
287 void (*event_cb)(u64, u64, u8, void *), 292 void (*event_cb)(u64, u64, u8, void *),
288 int one_shot, void *data, 293 void *data, struct ceph_osd_event **pevent);
289 struct ceph_osd_event **pevent);
290extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); 294extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
291extern int ceph_osdc_wait_event(struct ceph_osd_event *event,
292 unsigned long timeout);
293extern void ceph_osdc_put_event(struct ceph_osd_event *event); 295extern void ceph_osdc_put_event(struct ceph_osd_event *event);
294#endif 296#endif
295 297
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 10a417f9f76f..c819190d1642 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -18,14 +18,31 @@
18 * The map can be updated either via an incremental map (diff) describing 18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map. 19 * the change between two successive epochs, or as a fully encoded map.
20 */ 20 */
21struct ceph_pg {
22 uint64_t pool;
23 uint32_t seed;
24};
25
26#define CEPH_POOL_FLAG_HASHPSPOOL 1
27
21struct ceph_pg_pool_info { 28struct ceph_pg_pool_info {
22 struct rb_node node; 29 struct rb_node node;
23 int id; 30 s64 id;
24 struct ceph_pg_pool v; 31 u8 type;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; 32 u8 size;
33 u8 crush_ruleset;
34 u8 object_hash;
35 u32 pg_num, pgp_num;
36 int pg_num_mask, pgp_num_mask;
37 u64 flags;
26 char *name; 38 char *name;
27}; 39};
28 40
41struct ceph_object_locator {
42 uint64_t pool;
43 char *key;
44};
45
29struct ceph_pg_mapping { 46struct ceph_pg_mapping {
30 struct rb_node node; 47 struct rb_node node;
31 struct ceph_pg pgid; 48 struct ceph_pg pgid;
@@ -110,15 +127,16 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
110 127
111/* calculate mapping of a file extent to an object */ 128/* calculate mapping of a file extent to an object */
112extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 129extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
113 u64 off, u64 *plen, 130 u64 off, u64 len,
114 u64 *bno, u64 *oxoff, u64 *oxlen); 131 u64 *bno, u64 *oxoff, u64 *oxlen);
115 132
116/* calculate mapping of object to a placement group */ 133/* calculate mapping of object to a placement group */
117extern int ceph_calc_object_layout(struct ceph_object_layout *ol, 134extern int ceph_calc_object_layout(struct ceph_pg *pg,
118 const char *oid, 135 const char *oid,
119 struct ceph_file_layout *fl, 136 struct ceph_file_layout *fl,
120 struct ceph_osdmap *osdmap); 137 struct ceph_osdmap *osdmap);
121extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 138extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
139 struct ceph_pg pgid,
122 int *acting); 140 int *acting);
123extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 141extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
124 struct ceph_pg pgid); 142 struct ceph_pg pgid);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 2c04afeead1c..68c96a508ac2 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -9,14 +9,6 @@
9#include <linux/ceph/msgr.h> 9#include <linux/ceph/msgr.h>
10 10
11/* 11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 6
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 6
18
19/*
20 * fs id 12 * fs id
21 */ 13 */
22struct ceph_fsid { 14struct ceph_fsid {
@@ -64,7 +56,7 @@ struct ceph_timespec {
64 * placement group. 56 * placement group.
65 * we encode this into one __le64. 57 * we encode this into one __le64.
66 */ 58 */
67struct ceph_pg { 59struct ceph_pg_v1 {
68 __le16 preferred; /* preferred primary osd */ 60 __le16 preferred; /* preferred primary osd */
69 __le16 ps; /* placement seed */ 61 __le16 ps; /* placement seed */
70 __le32 pool; /* object pool */ 62 __le32 pool; /* object pool */
@@ -91,21 +83,6 @@ struct ceph_pg {
91 83
92#define CEPH_PG_TYPE_REP 1 84#define CEPH_PG_TYPE_REP 1
93#define CEPH_PG_TYPE_RAID4 2 85#define CEPH_PG_TYPE_RAID4 2
94#define CEPH_PG_POOL_VERSION 2
95struct ceph_pg_pool {
96 __u8 type; /* CEPH_PG_TYPE_* */
97 __u8 size; /* number of osds in each pg */
98 __u8 crush_ruleset; /* crush placement rule */
99 __u8 object_hash; /* hash mapping object name to ps */
100 __le32 pg_num, pgp_num; /* number of pg's */
101 __le32 lpg_num, lpgp_num; /* number of localized pg's */
102 __le32 last_change; /* most recent epoch changed */
103 __le64 snap_seq; /* seq for per-pool snapshot */
104 __le32 snap_epoch; /* epoch of last snap */
105 __le32 num_snaps;
106 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
107 __le64 auid; /* who owns the pg */
108} __attribute__ ((packed));
109 86
110/* 87/*
111 * stable_mod func is used to control number of placement groups. 88 * stable_mod func is used to control number of placement groups.
@@ -128,7 +105,7 @@ static inline int ceph_stable_mod(int x, int b, int bmask)
128 * object layout - how a given object should be stored. 105 * object layout - how a given object should be stored.
129 */ 106 */
130struct ceph_object_layout { 107struct ceph_object_layout {
131 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ 108 struct ceph_pg_v1 ol_pgid; /* raw pg, with _full_ ps precision. */
132 __le32 ol_stripe_unit; /* for per-object parity, if any */ 109 __le32 ol_stripe_unit; /* for per-object parity, if any */
133} __attribute__ ((packed)); 110} __attribute__ ((packed));
134 111
@@ -145,8 +122,12 @@ struct ceph_eversion {
145 */ 122 */
146 123
147/* status bits */ 124/* status bits */
148#define CEPH_OSD_EXISTS 1 125#define CEPH_OSD_EXISTS (1<<0)
149#define CEPH_OSD_UP 2 126#define CEPH_OSD_UP (1<<1)
127#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
128#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
129
130extern const char *ceph_osd_state_name(int s);
150 131
151/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ 132/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
152#define CEPH_OSD_IN 0x10000 133#define CEPH_OSD_IN 0x10000
@@ -161,9 +142,25 @@ struct ceph_eversion {
161#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ 142#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
162#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ 143#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
163#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ 144#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
145#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
146#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
147#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
148#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
149#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
150#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
151
152/*
153 * The error code to return when an OSD can't handle a write
154 * because it is too large.
155 */
156#define OSD_WRITETOOBIG EMSGSIZE
164 157
165/* 158/*
166 * osd ops 159 * osd ops
160 *
161 * WARNING: do not use these op codes directly. Use the helpers
162 * defined below instead. In certain cases, op code behavior was
163 * redefined, resulting in special-cases in the helpers.
167 */ 164 */
168#define CEPH_OSD_OP_MODE 0xf000 165#define CEPH_OSD_OP_MODE 0xf000
169#define CEPH_OSD_OP_MODE_RD 0x1000 166#define CEPH_OSD_OP_MODE_RD 0x1000
@@ -177,6 +174,7 @@ struct ceph_eversion {
177#define CEPH_OSD_OP_TYPE_ATTR 0x0300 174#define CEPH_OSD_OP_TYPE_ATTR 0x0300
178#define CEPH_OSD_OP_TYPE_EXEC 0x0400 175#define CEPH_OSD_OP_TYPE_EXEC 0x0400
179#define CEPH_OSD_OP_TYPE_PG 0x0500 176#define CEPH_OSD_OP_TYPE_PG 0x0500
177#define CEPH_OSD_OP_TYPE_MULTI 0x0600 /* multiobject */
180 178
181enum { 179enum {
182 /** data **/ 180 /** data **/
@@ -217,6 +215,23 @@ enum {
217 215
218 CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15, 216 CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15,
219 217
218 /* omap */
219 CEPH_OSD_OP_OMAPGETKEYS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 17,
220 CEPH_OSD_OP_OMAPGETVALS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 18,
221 CEPH_OSD_OP_OMAPGETHEADER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 19,
222 CEPH_OSD_OP_OMAPGETVALSBYKEYS =
223 CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 20,
224 CEPH_OSD_OP_OMAPSETVALS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 21,
225 CEPH_OSD_OP_OMAPSETHEADER = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 22,
226 CEPH_OSD_OP_OMAPCLEAR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 23,
227 CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
228 CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
229
230 /** multi **/
231 CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
232 CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
233 CEPH_OSD_OP_SRC_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 3,
234
220 /** attrs **/ 235 /** attrs **/
221 /* read */ 236 /* read */
222 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 237 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -238,6 +253,7 @@ enum {
238 CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6, 253 CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6,
239 CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7, 254 CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7,
240 CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8, 255 CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8,
256 CEPH_OSD_OP_SCRUB_MAP = CEPH_OSD_OP_MODE_SUB | 9,
241 257
242 /** lock **/ 258 /** lock **/
243 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, 259 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
@@ -248,10 +264,12 @@ enum {
248 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, 264 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
249 265
250 /** exec **/ 266 /** exec **/
267 /* note: the RD bit here is wrong; see special-case below in helper */
251 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, 268 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
252 269
253 /** pg **/ 270 /** pg **/
254 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, 271 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
272 CEPH_OSD_OP_PGLS_FILTER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 2,
255}; 273};
256 274
257static inline int ceph_osd_op_type_lock(int op) 275static inline int ceph_osd_op_type_lock(int op)
@@ -274,6 +292,10 @@ static inline int ceph_osd_op_type_pg(int op)
274{ 292{
275 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; 293 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
276} 294}
295static inline int ceph_osd_op_type_multi(int op)
296{
297 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_MULTI;
298}
277 299
278static inline int ceph_osd_op_mode_subop(int op) 300static inline int ceph_osd_op_mode_subop(int op)
279{ 301{
@@ -281,11 +303,12 @@ static inline int ceph_osd_op_mode_subop(int op)
281} 303}
282static inline int ceph_osd_op_mode_read(int op) 304static inline int ceph_osd_op_mode_read(int op)
283{ 305{
284 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; 306 return (op & CEPH_OSD_OP_MODE_RD) &&
307 op != CEPH_OSD_OP_CALL;
285} 308}
286static inline int ceph_osd_op_mode_modify(int op) 309static inline int ceph_osd_op_mode_modify(int op)
287{ 310{
288 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; 311 return op & CEPH_OSD_OP_MODE_WR;
289} 312}
290 313
291/* 314/*
@@ -294,34 +317,38 @@ static inline int ceph_osd_op_mode_modify(int op)
294 */ 317 */
295#define CEPH_OSD_TMAP_HDR 'h' 318#define CEPH_OSD_TMAP_HDR 'h'
296#define CEPH_OSD_TMAP_SET 's' 319#define CEPH_OSD_TMAP_SET 's'
320#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
297#define CEPH_OSD_TMAP_RM 'r' 321#define CEPH_OSD_TMAP_RM 'r'
322#define CEPH_OSD_TMAP_RMSLOPPY 'R'
298 323
299extern const char *ceph_osd_op_name(int op); 324extern const char *ceph_osd_op_name(int op);
300 325
301
302/* 326/*
303 * osd op flags 327 * osd op flags
304 * 328 *
305 * An op may be READ, WRITE, or READ|WRITE. 329 * An op may be READ, WRITE, or READ|WRITE.
306 */ 330 */
307enum { 331enum {
308 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ 332 CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */
309 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ 333 CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */
310 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ 334 CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */
311 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ 335 CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */
312 CEPH_OSD_FLAG_READ = 16, /* op may read */ 336 CEPH_OSD_FLAG_READ = 0x0010, /* op may read */
313 CEPH_OSD_FLAG_WRITE = 32, /* op may write */ 337 CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */
314 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ 338 CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */
315 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ 339 CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */
316 CEPH_OSD_FLAG_BALANCE_READS = 256, 340 CEPH_OSD_FLAG_BALANCE_READS = 0x0100,
317 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ 341 CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */
318 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ 342 CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */
319 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ 343 CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */
320 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ 344 CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
345 CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
346 CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
321}; 347};
322 348
323enum { 349enum {
324 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ 350 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
351 CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */
325}; 352};
326 353
327#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 354#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
@@ -381,48 +408,13 @@ struct ceph_osd_op {
381 __le64 ver; 408 __le64 ver;
382 __u8 flag; /* 0 = unwatch, 1 = watch */ 409 __u8 flag; /* 0 = unwatch, 1 = watch */
383 } __attribute__ ((packed)) watch; 410 } __attribute__ ((packed)) watch;
384}; 411 struct {
412 __le64 offset, length;
413 __le64 src_offset;
414 } __attribute__ ((packed)) clonerange;
415 };
385 __le32 payload_len; 416 __le32 payload_len;
386} __attribute__ ((packed)); 417} __attribute__ ((packed));
387 418
388/*
389 * osd request message header. each request may include multiple
390 * ceph_osd_op object operations.
391 */
392struct ceph_osd_request_head {
393 __le32 client_inc; /* client incarnation */
394 struct ceph_object_layout layout; /* pgid */
395 __le32 osdmap_epoch; /* client's osdmap epoch */
396
397 __le32 flags;
398
399 struct ceph_timespec mtime; /* for mutations only */
400 struct ceph_eversion reassert_version; /* if we are replaying op */
401
402 __le32 object_len; /* length of object name */
403
404 __le64 snapid; /* snapid to read */
405 __le64 snap_seq; /* writer's snap context */
406 __le32 num_snaps;
407
408 __le16 num_ops;
409 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
410} __attribute__ ((packed));
411
412struct ceph_osd_reply_head {
413 __le32 client_inc; /* client incarnation */
414 __le32 flags;
415 struct ceph_object_layout layout;
416 __le32 osdmap_epoch;
417 struct ceph_eversion reassert_version; /* for replaying uncommitted */
418
419 __le32 result; /* result code */
420
421 __le32 object_len; /* length of object name */
422 __le32 num_ops;
423 struct ceph_osd_op ops[0]; /* ops[], object */
424} __attribute__ ((packed));
425
426
427 419
428#endif 420#endif
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 25baa287cff7..6a1101f24cfb 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -162,6 +162,8 @@ struct crush_map {
162 __u32 choose_local_fallback_tries; 162 __u32 choose_local_fallback_tries;
163 /* choose attempts before giving up */ 163 /* choose attempts before giving up */
164 __u32 choose_total_tries; 164 __u32 choose_total_tries;
165 /* attempt chooseleaf inner descent once; on failure retry outer descent */
166 __u32 chooseleaf_descend_once;
165}; 167};
166 168
167 169