aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSage Weil <sage@newdream.net>2009-10-06 14:31:06 -0400
committerSage Weil <sage@newdream.net>2009-10-06 14:31:06 -0400
commit0dee3c28af2fbe22ca62739a7f57da5435d35793 (patch)
treedd5992a4abc86c5931ce36258b972dbf48ab355d
parent7ad920b504a980adcab4d3f6b85695526e6fd7bb (diff)
ceph: on-wire types
These headers describe the types used to exchange messages between the Ceph client and various servers. All types are little-endian and packed. These headers are shared between the kernel and userspace, so all types are in terms of e.g. __u32. Additionally, we define a few magic values to identify the current version of the protocol(s) in use, so that discrepancies can be detected on mount. Signed-off-by: Sage Weil <sage@newdream.net>
-rw-r--r--fs/ceph/ceph_fs.c80
-rw-r--r--fs/ceph/ceph_fs.h629
-rw-r--r--fs/ceph/ceph_strings.c163
-rw-r--r--fs/ceph/msgr.h157
-rw-r--r--fs/ceph/rados.h372
5 files changed, 1401 insertions, 0 deletions
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..9371ff1c0002
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,80 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6int ceph_flags_to_mode(int flags)
7{
8#ifdef O_DIRECTORY /* fixme */
9 if ((flags & O_DIRECTORY) == O_DIRECTORY)
10 return CEPH_FILE_MODE_PIN;
11#endif
12#ifdef O_LAZY
13 if (flags & O_LAZY)
14 return CEPH_FILE_MODE_LAZY;
15#endif
16 if ((flags & O_APPEND) == O_APPEND)
17 flags |= O_WRONLY;
18
19 flags &= O_ACCMODE;
20 if ((flags & O_RDWR) == O_RDWR)
21 return CEPH_FILE_MODE_RDWR;
22 if ((flags & O_WRONLY) == O_WRONLY)
23 return CEPH_FILE_MODE_WR;
24 return CEPH_FILE_MODE_RD;
25}
26
27int ceph_caps_for_mode(int mode)
28{
29 switch (mode) {
30 case CEPH_FILE_MODE_PIN:
31 return CEPH_CAP_PIN;
32 case CEPH_FILE_MODE_RD:
33 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
34 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
35 case CEPH_FILE_MODE_RDWR:
36 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
37 CEPH_CAP_FILE_EXCL |
38 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
39 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
40 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
41 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
42 case CEPH_FILE_MODE_WR:
43 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
44 CEPH_CAP_FILE_EXCL |
45 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
46 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
47 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
48 }
49 return 0;
50}
51
/*
 * Name hashing routines. Initial hash value.
 * Hash courtesy of the R5 hash in reiserfs modulo sign bits.
 */
#define ceph_init_name_hash()		0

/*
 * Partial hash update function. Assume roughly 4 bits per character.
 *
 * @c is one byte of the name and must be in the range 0..255; a
 * sign-extended char would make the result depend on the platform's
 * char signedness and on the width of unsigned long.
 */
static unsigned long ceph_partial_name_hash(unsigned long c,
					    unsigned long prevhash)
{
	return (prevhash + (c << 4) + (c >> 4)) * 11;
}

/*
 * Finally: cut the hash down to 32 bits.
 */
static unsigned long ceph_end_name_hash(unsigned long hash)
{
	return hash & 0xffffffff;
}

/*
 * Compute the hash for a name string.
 *
 * @name: bytes to hash (need not be NUL-terminated)
 * @len:  number of bytes of @name to hash
 *
 * Returns the 32-bit hash; an empty name hashes to 0.
 *
 * The bytes are read as unsigned char so that names containing bytes
 * >= 0x80 hash to the same value whether plain char is signed or
 * unsigned and whether unsigned long is 32 or 64 bits wide.
 */
unsigned int ceph_full_name_hash(const char *name, unsigned int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash = ceph_init_name_hash();

	while (len--)
		hash = ceph_partial_name_hash(*p++, hash);
	return ceph_end_name_hash(hash);
}
80
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..21ed51b127f2
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,629 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 16
23#define CEPH_VERSION_PATCH 1
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific messages types or high-level
34 * protocols change, bump the affected components. we keep revs for
35 * internal cluster protocols separate from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 7 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 4 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 20 /* server/client */
42#define CEPH_MDSC_PROTOCOL 29 /* server/client */
43#define CEPH_MONC_PROTOCOL 14 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47
48/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
49#define CEPH_MAX_MON 31
50
51
52unsigned int ceph_full_name_hash(const char *name, unsigned int len);
53
54
55/*
56 * ceph_file_layout - describe data layout for a file/inode
57 */
58struct ceph_file_layout {
59 /* file -> object mapping */
60 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
61 of page size. */
62 __le32 fl_stripe_count; /* over this many objects */
63 __le32 fl_object_size; /* until objects are this big, then move to
64 new objects */
65 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
66
67 /* pg -> disk layout */
68 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
69
70 /* object -> pg layout */
71 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
72 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
73} __attribute__ ((packed));
74
75
76
77
78/*********************************************
79 * message layer
80 */
81
82/*
83 * message types
84 */
85
86/* misc */
87#define CEPH_MSG_SHUTDOWN 1
88#define CEPH_MSG_PING 2
89
90/* client <-> monitor */
91#define CEPH_MSG_MON_MAP 4
92#define CEPH_MSG_MON_GET_MAP 5
93#define CEPH_MSG_CLIENT_MOUNT 10
94#define CEPH_MSG_CLIENT_MOUNT_ACK 11
95#define CEPH_MSG_STATFS 13
96#define CEPH_MSG_STATFS_REPLY 14
97#define CEPH_MSG_MON_SUBSCRIBE 15
98#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
99
100/* client <-> mds */
101#define CEPH_MSG_MDS_GETMAP 20
102#define CEPH_MSG_MDS_MAP 21
103
104#define CEPH_MSG_CLIENT_SESSION 22
105#define CEPH_MSG_CLIENT_RECONNECT 23
106
107#define CEPH_MSG_CLIENT_REQUEST 24
108#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
109#define CEPH_MSG_CLIENT_REPLY 26
110#define CEPH_MSG_CLIENT_CAPS 0x310
111#define CEPH_MSG_CLIENT_LEASE 0x311
112#define CEPH_MSG_CLIENT_SNAP 0x312
113#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
114
115/* osd */
116#define CEPH_MSG_OSD_GETMAP 40
117#define CEPH_MSG_OSD_MAP 41
118#define CEPH_MSG_OSD_OP 42
119#define CEPH_MSG_OSD_OPREPLY 43
120
121
122struct ceph_mon_statfs {
123 __le64 have_version;
124 struct ceph_fsid fsid;
125 __le64 tid;
126} __attribute__ ((packed));
127
128struct ceph_statfs {
129 __le64 kb, kb_used, kb_avail;
130 __le64 num_objects;
131} __attribute__ ((packed));
132
133struct ceph_mon_statfs_reply {
134 struct ceph_fsid fsid;
135 __le64 tid;
136 __le64 version;
137 struct ceph_statfs st;
138} __attribute__ ((packed));
139
140struct ceph_osd_getmap {
141 __le64 have_version;
142 struct ceph_fsid fsid;
143 __le32 start;
144} __attribute__ ((packed));
145
146struct ceph_mds_getmap {
147 __le64 have_version;
148 struct ceph_fsid fsid;
149} __attribute__ ((packed));
150
151struct ceph_client_mount {
152 __le64 have_version;
153} __attribute__ ((packed));
154
155struct ceph_mon_subscribe_item {
156 __le64 have;
157 __u8 onetime;
158} __attribute__ ((packed));
159
160/*
161 * mds states
162 * > 0 -> in
163 * <= 0 -> out
164 */
165#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
166#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
167 empty log. */
168#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
169#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
170#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
171#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
172#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
173
174#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
175#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
176 operations (import, rename, etc.) */
177#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
178#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
179#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
180#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
181#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
182
183extern const char *ceph_mds_state_name(int s);
184
185
186/*
187 * metadata lock types.
188 * - these are bitmasks.. we can compose them
189 * - they also define the lock ordering by the MDS
190 * - a few of these are internal to the mds
191 */
192#define CEPH_LOCK_DN 1
193#define CEPH_LOCK_ISNAP 2
194#define CEPH_LOCK_IVERSION 4 /* mds internal */
195#define CEPH_LOCK_IFILE 8 /* mds internal */
196#define CEPH_LOCK_IAUTH 32
197#define CEPH_LOCK_ILINK 64
198#define CEPH_LOCK_IDFT 128 /* dir frag tree */
199#define CEPH_LOCK_INEST 256 /* mds internal */
200#define CEPH_LOCK_IXATTR 512
201#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
202
203/* client_session ops */
204enum {
205 CEPH_SESSION_REQUEST_OPEN,
206 CEPH_SESSION_OPEN,
207 CEPH_SESSION_REQUEST_CLOSE,
208 CEPH_SESSION_CLOSE,
209 CEPH_SESSION_REQUEST_RENEWCAPS,
210 CEPH_SESSION_RENEWCAPS,
211 CEPH_SESSION_STALE,
212 CEPH_SESSION_RECALL_STATE,
213};
214
215extern const char *ceph_session_op_name(int op);
216
217struct ceph_mds_session_head {
218 __le32 op;
219 __le64 seq;
220 struct ceph_timespec stamp;
221 __le32 max_caps, max_leases;
222} __attribute__ ((packed));
223
224/* client_request */
225/*
226 * metadata ops.
227 * & 0x001000 -> write op
228 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
229 * & 0x100000 -> use weird ino/path trace
230 */
231#define CEPH_MDS_OP_WRITE 0x001000
232enum {
233 CEPH_MDS_OP_LOOKUP = 0x00100,
234 CEPH_MDS_OP_GETATTR = 0x00101,
235 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
236 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
237
238 CEPH_MDS_OP_SETXATTR = 0x01105,
239 CEPH_MDS_OP_RMXATTR = 0x01106,
240 CEPH_MDS_OP_SETLAYOUT = 0x01107,
241 CEPH_MDS_OP_SETATTR = 0x01108,
242
243 CEPH_MDS_OP_MKNOD = 0x01201,
244 CEPH_MDS_OP_LINK = 0x01202,
245 CEPH_MDS_OP_UNLINK = 0x01203,
246 CEPH_MDS_OP_RENAME = 0x01204,
247 CEPH_MDS_OP_MKDIR = 0x01220,
248 CEPH_MDS_OP_RMDIR = 0x01221,
249 CEPH_MDS_OP_SYMLINK = 0x01222,
250
251 CEPH_MDS_OP_CREATE = 0x00301,
252 CEPH_MDS_OP_OPEN = 0x00302,
253 CEPH_MDS_OP_READDIR = 0x00305,
254
255 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
256 CEPH_MDS_OP_MKSNAP = 0x01400,
257 CEPH_MDS_OP_RMSNAP = 0x01401,
258 CEPH_MDS_OP_LSSNAP = 0x00402,
259};
260
261extern const char *ceph_mds_op_name(int op);
262
263
264#define CEPH_SETATTR_MODE 1
265#define CEPH_SETATTR_UID 2
266#define CEPH_SETATTR_GID 4
267#define CEPH_SETATTR_MTIME 8
268#define CEPH_SETATTR_ATIME 16
269#define CEPH_SETATTR_SIZE 32
270#define CEPH_SETATTR_CTIME 64
271
272union ceph_mds_request_args {
273 struct {
274 __le32 mask; /* CEPH_CAP_* */
275 } __attribute__ ((packed)) getattr;
276 struct {
277 __le32 mode;
278 __le32 uid;
279 __le32 gid;
280 struct ceph_timespec mtime;
281 struct ceph_timespec atime;
282 __le64 size, old_size; /* old_size needed by truncate */
283 __le32 mask; /* CEPH_SETATTR_* */
284 } __attribute__ ((packed)) setattr;
285 struct {
286 __le32 frag; /* which dir fragment */
287 __le32 max_entries; /* how many dentries to grab */
288 } __attribute__ ((packed)) readdir;
289 struct {
290 __le32 mode;
291 __le32 rdev;
292 } __attribute__ ((packed)) mknod;
293 struct {
294 __le32 mode;
295 } __attribute__ ((packed)) mkdir;
296 struct {
297 __le32 flags;
298 __le32 mode;
299 __le32 stripe_unit; /* layout for newly created file */
300 __le32 stripe_count; /* ... */
301 __le32 object_size;
302 __le32 file_replication;
303 __le32 preferred;
304 } __attribute__ ((packed)) open;
305 struct {
306 __le32 flags;
307 } __attribute__ ((packed)) setxattr;
308 struct {
309 struct ceph_file_layout layout;
310 } __attribute__ ((packed)) setlayout;
311} __attribute__ ((packed));
312
313#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
314#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
315
316struct ceph_mds_request_head {
317 __le64 tid, oldest_client_tid;
318 __le32 mdsmap_epoch; /* on client */
319 __le32 flags; /* CEPH_MDS_FLAG_* */
320 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
321 __le16 num_releases; /* # include cap/lease release records */
322 __le32 op; /* mds op code */
323 __le32 caller_uid, caller_gid;
324 __le64 ino; /* use this ino for openc, mkdir, mknod,
325 etc. (if replaying) */
326 union ceph_mds_request_args args;
327} __attribute__ ((packed));
328
329/* cap/lease release record */
330struct ceph_mds_request_release {
331 __le64 ino, cap_id; /* ino and unique cap id */
332 __le32 caps, wanted; /* new issued, wanted */
333 __le32 seq, issue_seq, mseq;
334 __le32 dname_seq; /* if releasing a dentry lease, a */
335 __le32 dname_len; /* string follows. */
336} __attribute__ ((packed));
337
338/* client reply */
339struct ceph_mds_reply_head {
340 __le64 tid;
341 __le32 op;
342 __le32 result;
343 __le32 mdsmap_epoch;
344 __u8 safe; /* true if committed to disk */
345 __u8 is_dentry, is_target; /* true if dentry, target inode records
346 are included with reply */
347} __attribute__ ((packed));
348
349/* one for each node split */
350struct ceph_frag_tree_split {
351 __le32 frag; /* this frag splits... */
352 __le32 by; /* ...by this many bits */
353} __attribute__ ((packed));
354
355struct ceph_frag_tree_head {
356 __le32 nsplits; /* num ceph_frag_tree_split records */
357 struct ceph_frag_tree_split splits[];
358} __attribute__ ((packed));
359
360/* capability issue, for bundling with mds reply */
361struct ceph_mds_reply_cap {
362 __le32 caps, wanted; /* caps issued, wanted */
363 __le64 cap_id;
364 __le32 seq, mseq;
365 __le64 realm; /* snap realm */
366 __u8 flags; /* CEPH_CAP_FLAG_* */
367} __attribute__ ((packed));
368
369#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
370
371/* inode record, for bundling with mds reply */
372struct ceph_mds_reply_inode {
373 __le64 ino;
374 __le64 snapid;
375 __le32 rdev;
376 __le64 version; /* inode version */
377 __le64 xattr_version; /* version for xattr blob */
378 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
379 struct ceph_file_layout layout;
380 struct ceph_timespec ctime, mtime, atime;
381 __le32 time_warp_seq;
382 __le64 size, max_size, truncate_size;
383 __le32 truncate_seq;
384 __le32 mode, uid, gid;
385 __le32 nlink;
386 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
387 struct ceph_timespec rctime;
388 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
389} __attribute__ ((packed));
390/* followed by frag array, then symlink string, then xattr blob */
391
392/* reply_lease follows dname, and reply_inode */
393struct ceph_mds_reply_lease {
394 __le16 mask; /* lease type(s) */
395 __le32 duration_ms; /* lease duration */
396 __le32 seq;
397} __attribute__ ((packed));
398
399struct ceph_mds_reply_dirfrag {
400 __le32 frag; /* fragment */
401 __le32 auth; /* auth mds, if this is a delegation point */
402 __le32 ndist; /* number of mds' this is replicated on */
403 __le32 dist[];
404} __attribute__ ((packed));
405
406/* file access modes */
407#define CEPH_FILE_MODE_PIN 0
408#define CEPH_FILE_MODE_RD 1
409#define CEPH_FILE_MODE_WR 2
410#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
411#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
412#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
413
414int ceph_flags_to_mode(int flags);
415
416
/* capability bits */
#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */

/* generic cap bits */
#define CEPH_CAP_GSHARED     1  /* client can read */
#define CEPH_CAP_GEXCL       2  /* client can read and update */
#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
#define CEPH_CAP_GRD         8  /* (file) client can read */
#define CEPH_CAP_GWR        16  /* (file) client can write */
#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */

/* per-lock shift */
#define CEPH_CAP_SAUTH      2
#define CEPH_CAP_SLINK      4
#define CEPH_CAP_SXATTR     6
#define CEPH_CAP_SFILE      8   /* goes at the end (uses >2 cap bits) */

#define CEPH_CAP_BITS       16

/* composed values: a generic cap bit shifted into its lock's field */
#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
/*
 * Shift an arbitrary combination of generic bits into the file field.
 * The argument is parenthesised so that expressions such as
 * CEPH_CAP_FILE(CEPH_CAP_GRD | CEPH_CAP_GWR) shift the whole value.
 */
#define CEPH_CAP_FILE(x)    ((x) << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
454
455/* cap masks (for getattr) */
456#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
457#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
458#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
459#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
460#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
461#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
462#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
463#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
464#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
465#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
466#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
467#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
468#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
469 CEPH_CAP_AUTH_SHARED | \
470 CEPH_CAP_LINK_SHARED | \
471 CEPH_CAP_FILE_SHARED | \
472 CEPH_CAP_XATTR_SHARED)
473
474#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
475 CEPH_CAP_LINK_SHARED | \
476 CEPH_CAP_XATTR_SHARED | \
477 CEPH_CAP_FILE_SHARED)
478#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
479 CEPH_CAP_FILE_CACHE)
480
481#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
482 CEPH_CAP_LINK_EXCL | \
483 CEPH_CAP_XATTR_EXCL | \
484 CEPH_CAP_FILE_EXCL)
485#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
486 CEPH_CAP_FILE_EXCL)
487#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
488#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
489 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
490
491#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
492 CEPH_LOCK_IXATTR)
493
494int ceph_caps_for_mode(int mode);
495
496enum {
497 CEPH_CAP_OP_GRANT, /* mds->client grant */
498 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
499 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
500 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
501 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
502 CEPH_CAP_OP_UPDATE, /* client->mds update */
503 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
504 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
505 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
506 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
507 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
508 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
509 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
510};
511
512extern const char *ceph_cap_op_name(int op);
513
514/*
515 * caps message, used for capability callbacks, acks, requests, etc.
516 */
517struct ceph_mds_caps {
518 __le32 op; /* CEPH_CAP_OP_* */
519 __le64 ino, realm;
520 __le64 cap_id;
521 __le32 seq, issue_seq;
522 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
523 __le32 migrate_seq;
524 __le64 snap_follows;
525 __le32 snap_trace_len;
526 __le64 client_tid; /* for FLUSH(SNAP) -> FLUSH(SNAP)_ACK */
527
528 /* authlock */
529 __le32 uid, gid, mode;
530
531 /* linklock */
532 __le32 nlink;
533
534 /* xattrlock */
535 __le32 xattr_len;
536 __le64 xattr_version;
537
538 /* filelock */
539 __le64 size, max_size, truncate_size;
540 __le32 truncate_seq;
541 struct ceph_timespec mtime, atime, ctime;
542 struct ceph_file_layout layout;
543 __le32 time_warp_seq;
544} __attribute__ ((packed));
545
546/* cap release msg head */
547struct ceph_mds_cap_release {
548 __le32 num; /* number of cap_items that follow */
549} __attribute__ ((packed));
550
551struct ceph_mds_cap_item {
552 __le64 ino;
553 __le64 cap_id;
554 __le32 migrate_seq, seq;
555} __attribute__ ((packed));
556
557#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
558#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
559#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
560#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
561
562extern const char *ceph_lease_op_name(int o);
563
564/* lease msg header */
565struct ceph_mds_lease {
566 __u8 action; /* CEPH_MDS_LEASE_* */
567 __le16 mask; /* which lease */
568 __le64 ino;
569 __le64 first, last; /* snap range */
570 __le32 seq;
571 __le32 duration_ms; /* duration of renewal */
572} __attribute__ ((packed));
573/* followed by a __le32+string for dname */
574
575/* client reconnect */
576struct ceph_mds_cap_reconnect {
577 __le64 cap_id;
578 __le32 wanted;
579 __le32 issued;
580 __le64 size;
581 struct ceph_timespec mtime, atime;
582 __le64 snaprealm;
583 __le64 pathbase; /* base ino for our path to this ino */
584} __attribute__ ((packed));
585/* followed by encoded string */
586
587struct ceph_mds_snaprealm_reconnect {
588 __le64 ino; /* snap realm base */
589 __le64 seq; /* snap seq for this snap realm */
590 __le64 parent; /* parent realm */
591} __attribute__ ((packed));
592
593/*
594 * snaps
595 */
596enum {
597 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
598 CEPH_SNAP_OP_CREATE,
599 CEPH_SNAP_OP_DESTROY,
600 CEPH_SNAP_OP_SPLIT,
601};
602
603extern const char *ceph_snap_op_name(int o);
604
605/* snap msg header */
606struct ceph_mds_snap_head {
607 __le32 op; /* CEPH_SNAP_OP_* */
608 __le64 split; /* ino to split off, if any */
609 __le32 num_split_inos; /* # inos belonging to new child realm */
610 __le32 num_split_realms; /* # child realms under new child realm */
611 __le32 trace_len; /* size of snap trace blob */
612} __attribute__ ((packed));
613/* followed by split ino list, then split realms, then the trace blob */
614
615/*
616 * encode info about a snaprealm, as viewed by a client
617 */
618struct ceph_mds_snap_realm {
619 __le64 ino; /* ino */
620 __le64 created; /* snap: when created */
621 __le64 parent; /* ino: parent realm */
622 __le64 parent_since; /* snap: same parent since */
623 __le64 seq; /* snap: version */
624 __le32 num_snaps;
625 __le32 num_prior_parent_snaps;
626} __attribute__ ((packed));
627/* followed by my snap list, then prior parent snap list */
628
629#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..90d19d9d8d8f
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,163 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_osd_op_name(int op)
7{
8 switch (op) {
9 case CEPH_OSD_OP_READ: return "read";
10 case CEPH_OSD_OP_STAT: return "stat";
11
12 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
13
14 case CEPH_OSD_OP_WRITE: return "write";
15 case CEPH_OSD_OP_DELETE: return "delete";
16 case CEPH_OSD_OP_TRUNCATE: return "truncate";
17 case CEPH_OSD_OP_ZERO: return "zero";
18 case CEPH_OSD_OP_WRITEFULL: return "writefull";
19
20 case CEPH_OSD_OP_APPEND: return "append";
21 case CEPH_OSD_OP_STARTSYNC: return "startsync";
22 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
23 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
24
25 case CEPH_OSD_OP_TMAPUP: return "tmapup";
26 case CEPH_OSD_OP_TMAPGET: return "tmapget";
27 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
28
29 case CEPH_OSD_OP_GETXATTR: return "getxattr";
30 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
31 case CEPH_OSD_OP_SETXATTR: return "setxattr";
32 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
33 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
34 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
35
36 case CEPH_OSD_OP_PULL: return "pull";
37 case CEPH_OSD_OP_PUSH: return "push";
38 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
39 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
40 case CEPH_OSD_OP_SCRUB: return "scrub";
41
42 case CEPH_OSD_OP_WRLOCK: return "wrlock";
43 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
44 case CEPH_OSD_OP_RDLOCK: return "rdlock";
45 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
46 case CEPH_OSD_OP_UPLOCK: return "uplock";
47 case CEPH_OSD_OP_DNLOCK: return "dnlock";
48
49 case CEPH_OSD_OP_CALL: return "call";
50
51 case CEPH_OSD_OP_PGLS: return "pgls";
52 }
53 return "???";
54}
55
56const char *ceph_mds_state_name(int s)
57{
58 switch (s) {
59 /* down and out */
60 case CEPH_MDS_STATE_DNE: return "down:dne";
61 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
62 /* up and out */
63 case CEPH_MDS_STATE_BOOT: return "up:boot";
64 case CEPH_MDS_STATE_STANDBY: return "up:standby";
65 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
66 case CEPH_MDS_STATE_CREATING: return "up:creating";
67 case CEPH_MDS_STATE_STARTING: return "up:starting";
68 /* up and in */
69 case CEPH_MDS_STATE_REPLAY: return "up:replay";
70 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
71 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
72 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
73 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
74 case CEPH_MDS_STATE_ACTIVE: return "up:active";
75 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
76 }
77 return "???";
78}
79
80const char *ceph_session_op_name(int op)
81{
82 switch (op) {
83 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
84 case CEPH_SESSION_OPEN: return "open";
85 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
86 case CEPH_SESSION_CLOSE: return "close";
87 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
88 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
89 case CEPH_SESSION_STALE: return "stale";
90 case CEPH_SESSION_RECALL_STATE: return "recall_state";
91 }
92 return "???";
93}
94
95const char *ceph_mds_op_name(int op)
96{
97 switch (op) {
98 case CEPH_MDS_OP_LOOKUP: return "lookup";
99 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
100 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
101 case CEPH_MDS_OP_GETATTR: return "getattr";
102 case CEPH_MDS_OP_SETXATTR: return "setxattr";
103 case CEPH_MDS_OP_SETATTR: return "setattr";
104 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
105 case CEPH_MDS_OP_READDIR: return "readdir";
106 case CEPH_MDS_OP_MKNOD: return "mknod";
107 case CEPH_MDS_OP_LINK: return "link";
108 case CEPH_MDS_OP_UNLINK: return "unlink";
109 case CEPH_MDS_OP_RENAME: return "rename";
110 case CEPH_MDS_OP_MKDIR: return "mkdir";
111 case CEPH_MDS_OP_RMDIR: return "rmdir";
112 case CEPH_MDS_OP_SYMLINK: return "symlink";
113 case CEPH_MDS_OP_CREATE: return "create";
114 case CEPH_MDS_OP_OPEN: return "open";
115 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
116 case CEPH_MDS_OP_LSSNAP: return "lssnap";
117 case CEPH_MDS_OP_MKSNAP: return "mksnap";
118 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
119 }
120 return "???";
121}
122
123const char *ceph_cap_op_name(int op)
124{
125 switch (op) {
126 case CEPH_CAP_OP_GRANT: return "grant";
127 case CEPH_CAP_OP_REVOKE: return "revoke";
128 case CEPH_CAP_OP_TRUNC: return "trunc";
129 case CEPH_CAP_OP_EXPORT: return "export";
130 case CEPH_CAP_OP_IMPORT: return "import";
131 case CEPH_CAP_OP_UPDATE: return "update";
132 case CEPH_CAP_OP_DROP: return "drop";
133 case CEPH_CAP_OP_FLUSH: return "flush";
134 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
135 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
136 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
137 case CEPH_CAP_OP_RELEASE: return "release";
138 case CEPH_CAP_OP_RENEW: return "renew";
139 }
140 return "???";
141}
142
143const char *ceph_lease_op_name(int o)
144{
145 switch (o) {
146 case CEPH_MDS_LEASE_REVOKE: return "revoke";
147 case CEPH_MDS_LEASE_RELEASE: return "release";
148 case CEPH_MDS_LEASE_RENEW: return "renew";
149 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
150 }
151 return "???";
152}
153
154const char *ceph_snap_op_name(int o)
155{
156 switch (o) {
157 case CEPH_SNAP_OP_UPDATE: return "update";
158 case CEPH_SNAP_OP_CREATE: return "create";
159 case CEPH_SNAP_OP_DESTROY: return "destroy";
160 case CEPH_SNAP_OP_SPLIT: return "split";
161 }
162 return "???";
163}
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..73921ae43faa
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,157 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * tcp connection banner. include a protocol version. and adjust
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v021"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns -1, 0, or 1.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 1
50#define CEPH_ENTITY_TYPE_MDS 2
51#define CEPH_ENTITY_TYPE_OSD 3
52#define CEPH_ENTITY_TYPE_CLIENT 4
53#define CEPH_ENTITY_TYPE_ADMIN 5
54
55/*
56 * entity_addr -- network address
57 */
58struct ceph_entity_addr {
59 __le32 erank; /* entity's rank in process */
60 __le32 nonce; /* unique id for process (e.g. pid) */
61 struct sockaddr_storage in_addr;
62} __attribute__ ((packed));
63
64static inline bool ceph_entity_addr_is_local(const struct ceph_entity_addr *a,
65 const struct ceph_entity_addr *b)
66{
67 return a->nonce == b->nonce &&
68 memcmp(&a->in_addr, &b->in_addr, sizeof(a->in_addr)) == 0;
69}
70
71static inline bool ceph_entity_addr_equal(const struct ceph_entity_addr *a,
72 const struct ceph_entity_addr *b)
73{
74 return memcmp(a, b, sizeof(*a)) == 0;
75}
76
77struct ceph_entity_inst {
78 struct ceph_entity_name name;
79 struct ceph_entity_addr addr;
80} __attribute__ ((packed));
81
82
83/* used by message exchange protocol */
84#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
85#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
86#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
87 incoming connection */
88#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
89 with higher cseq */
90#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
91 with higher gseq */
92#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
93#define CEPH_MSGR_TAG_MSG 7 /* message */
94#define CEPH_MSGR_TAG_ACK 8 /* message ack */
95#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
96#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
97
98
99/*
100 * connection negotiation
101 */
102struct ceph_msg_connect {
103 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
104 __le32 global_seq; /* count connections initiated by this host */
105 __le32 connect_seq; /* count connections initiated in this session */
106 __le32 protocol_version;
107 __u8 flags; /* CEPH_MSG_CONNECT_* */
108} __attribute__ ((packed));
109
110struct ceph_msg_connect_reply {
111 __u8 tag;
112 __le32 global_seq;
113 __le32 connect_seq;
114 __le32 protocol_version;
115 __u8 flags;
116} __attribute__ ((packed));
117
118#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
119
120
121/*
122 * message header
123 */
124struct ceph_msg_header {
125 __le64 seq; /* message seq# for this session */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128
129 __le32 front_len; /* bytes in main payload */
130 __le32 middle_len;/* bytes in middle payload */
131 __le32 data_len; /* bytes of data payload */
132 __le16 data_off; /* sender: include full offset;
133 receiver: mask against ~PAGE_MASK */
134
135 struct ceph_entity_inst src, orig_src;
136 __le32 dst_erank;
137 __le32 crc; /* header crc32c */
138} __attribute__ ((packed));
139
140#define CEPH_MSG_PRIO_LOW 64
141#define CEPH_MSG_PRIO_DEFAULT 127
142#define CEPH_MSG_PRIO_HIGH 196
143#define CEPH_MSG_PRIO_HIGHEST 255
144
145/*
146 * follows data payload
147 */
148struct ceph_msg_footer {
149 __le32 front_crc, middle_crc, data_crc;
150 __u8 flags;
151} __attribute__ ((packed));
152
153#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
154#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
155
156
157#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..a48cf4ae391e
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,372 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
/*
 * fs id: 16 raw bytes
 */
struct ceph_fsid {
	unsigned char fsid[16];
};

/*
 * Compare two fsids byte by byte.  Returns the memcmp() result over the
 * whole struct: 0 when identical, otherwise <0 or >0.
 */
static inline int ceph_fsid_compare(const struct ceph_fsid *a,
				    const struct ceph_fsid *b)
{
	const void *lhs = a;
	const void *rhs = b;

	return memcmp(lhs, rhs, sizeof(struct ceph_fsid));
}
23
24/*
25 * ino, object, etc.
26 */
27typedef __le64 ceph_snapid_t;
28#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
29#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
30#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
31
32struct ceph_timespec {
33 __le32 tv_sec;
34 __le32 tv_nsec;
35} __attribute__ ((packed));
36
37
/*
 * object layout - how objects are mapped into PGs
 */
#define CEPH_OBJECT_LAYOUT_HASH     1
#define CEPH_OBJECT_LAYOUT_LINEAR   2
#define CEPH_OBJECT_LAYOUT_HASHINO  3
44
/*
 * pg layout -- how PGs are mapped onto (sets of) OSDs
 */
#define CEPH_PG_LAYOUT_CRUSH   0
#define CEPH_PG_LAYOUT_HASH    1
#define CEPH_PG_LAYOUT_LINEAR  2
#define CEPH_PG_LAYOUT_HYBRID  3
52
53
54/*
55 * placement group.
56 * we encode this into one __le64.
57 */
58union ceph_pg {
59 __u64 pg64;
60 struct {
61 __s16 preferred; /* preferred primary osd */
62 __u16 ps; /* placement seed */
63 __u32 pool; /* object pool */
64 } __attribute__ ((packed)) pg;
65} __attribute__ ((packed));
66
67/*
68 * pg_pool is a set of pgs storing a pool of objects
69 *
70 * pg_num -- base number of pseudorandomly placed pgs
71 *
72 * pgp_num -- effective number when calculating pg placement. this
73 * is used for pg_num increases. new pgs result in data being "split"
74 * into new pgs. for this to proceed smoothly, new pgs are initially
75 * colocated with their parents; that is, pgp_num doesn't increase
76 * until the new pgs have successfully split. only _then_ are the new
77 * pgs placed independently.
78 *
79 * lpg_num -- localized pg count (per device). replicas are randomly
80 * selected.
81 *
82 * lpgp_num -- as above.
83 */
84#define CEPH_PG_TYPE_REP 1
85#define CEPH_PG_TYPE_RAID4 2
86struct ceph_pg_pool {
87 __u8 type; /* CEPH_PG_TYPE_* */
88 __u8 size; /* number of osds in each pg */
89 __u8 crush_ruleset; /* crush placement rule */
90 __le32 pg_num, pgp_num; /* number of pg's */
91 __le32 lpg_num, lpgp_num; /* number of localized pg's */
92 __le32 last_change; /* most recent epoch changed */
93 __le64 snap_seq; /* seq for per-pool snapshot */
94 __le32 snap_epoch; /* epoch of last snap */
95 __le32 num_snaps;
96 __le32 num_removed_snap_intervals;
97} __attribute__ ((packed));
98
/*
 * stable_mod func is used to control number of placement groups.
 * similar to straight-up modulo, but produces a stable mapping as b
 * increases over time.  b is the number of bins, and bmask is the
 * containing power of 2 minus 1.
 *
 * b <= bmask and bmask=(2**n)-1
 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
 *
 * Masks x with bmask; if that lands on a bin below b it is the result,
 * otherwise x is masked with half the range (bmask >> 1) instead.
 */
static inline int ceph_stable_mod(int x, int b, int bmask)
{
	int bin = x & bmask;

	return bin < b ? bin : (x & (bmask >> 1));
}
115
116/*
117 * object layout - how a given object should be stored.
118 */
119struct ceph_object_layout {
120 __le64 ol_pgid; /* raw pg, with _full_ ps precision. */
121 __le32 ol_stripe_unit; /* for per-object parity, if any */
122} __attribute__ ((packed));
123
124/*
125 * compound epoch+version, used by storage layer to serialize mutations
126 */
127struct ceph_eversion {
128 __le32 epoch;
129 __le64 version;
130} __attribute__ ((packed));
131
132/*
133 * osd map bits
134 */
135
/* status bits */
#define CEPH_OSD_EXISTS  (1 << 0)
#define CEPH_OSD_UP      (1 << 1)

/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
#define CEPH_OSD_IN      0x10000
#define CEPH_OSD_OUT     0
143
144
/*
 * osd map flag bits
 */
#define CEPH_OSDMAP_NEARFULL  (1 << 0)  /* sync writes (near ENOSPC) */
#define CEPH_OSDMAP_FULL      (1 << 1)  /* no data writes (ENOSPC) */
#define CEPH_OSDMAP_PAUSERD   (1 << 2)  /* pause all reads */
#define CEPH_OSDMAP_PAUSEWR   (1 << 3)  /* pause all writes */
#define CEPH_OSDMAP_PAUSEREC  (1 << 4)  /* pause recovery */
153
/*
 * osd ops
 *
 * An opcode carries a mode in the bits covered by CEPH_OSD_OP_MODE and a
 * type in the bits covered by CEPH_OSD_OP_TYPE.
 */

/* mode field; note RMW == RD | WR */
#define CEPH_OSD_OP_MODE       0xf000
#define CEPH_OSD_OP_MODE_RD    0x1000
#define CEPH_OSD_OP_MODE_WR    0x2000
#define CEPH_OSD_OP_MODE_RMW   0x3000
#define CEPH_OSD_OP_MODE_SUB   0x4000
#define CEPH_OSD_OP_MODE_EXEC  0x8000

/* type field */
#define CEPH_OSD_OP_TYPE       0x0f00
#define CEPH_OSD_OP_TYPE_LOCK  0x0100
#define CEPH_OSD_OP_TYPE_DATA  0x0200
#define CEPH_OSD_OP_TYPE_ATTR  0x0300
#define CEPH_OSD_OP_TYPE_EXEC  0x0400
#define CEPH_OSD_OP_TYPE_PG    0x0500
170
171enum {
172 /** data **/
173 /* read */
174 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
175 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
176
177 /* fancy read */
178 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
179
180 /* write */
181 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
182 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
183 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
184 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
185 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
186
187 /* fancy write */
188 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
189 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
190 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
191 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
192
193 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
194 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
195 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
196
197 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
198
199 /** attrs **/
200 /* read */
201 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
202 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
203
204 /* write */
205 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
206 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
207 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
208 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
209
210 /** subop **/
211 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
212 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
213 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
214 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
215 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
216
217 /** lock **/
218 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
219 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
220 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
221 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
222 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
223 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
224
225 /** exec **/
226 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
227
228 /** pg **/
229 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
230};
231
232static inline int ceph_osd_op_type_lock(int op)
233{
234 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
235}
236static inline int ceph_osd_op_type_data(int op)
237{
238 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
239}
240static inline int ceph_osd_op_type_attr(int op)
241{
242 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
243}
244static inline int ceph_osd_op_type_exec(int op)
245{
246 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
247}
248static inline int ceph_osd_op_type_pg(int op)
249{
250 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
251}
252
253static inline int ceph_osd_op_mode_subop(int op)
254{
255 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
256}
257static inline int ceph_osd_op_mode_read(int op)
258{
259 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
260}
261static inline int ceph_osd_op_mode_modify(int op)
262{
263 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
264}
265
/* single-character tmap codes */
#define CEPH_OSD_TMAP_HDR  'h'
#define CEPH_OSD_TMAP_SET  's'
#define CEPH_OSD_TMAP_RM   'r'

/* name string for an osd op; defined outside this header */
const char *ceph_osd_op_name(int op);
271
272
/*
 * osd op flags
 *
 * An op may be READ, WRITE, or READ|WRITE.
 */
enum {
	CEPH_OSD_FLAG_ACK           = 1 << 0,  /* want (or is) "ack" ack */
	CEPH_OSD_FLAG_ONNVRAM       = 1 << 1,  /* want (or is) "onnvram" ack */
	CEPH_OSD_FLAG_ONDISK        = 1 << 2,  /* want (or is) "ondisk" ack */
	CEPH_OSD_FLAG_RETRY         = 1 << 3,  /* resend attempt */
	CEPH_OSD_FLAG_READ          = 1 << 4,  /* op may read */
	CEPH_OSD_FLAG_WRITE         = 1 << 5,  /* op may write */
	CEPH_OSD_FLAG_ORDERSNAP     = 1 << 6,  /* EOLDSNAPC if snapc is out of order */
	CEPH_OSD_FLAG_PEERSTAT      = 1 << 7,  /* msg includes osd_peer_stat */
	CEPH_OSD_FLAG_BALANCE_READS = 1 << 8,
	CEPH_OSD_FLAG_PARALLELEXEC  = 1 << 9,  /* execute op in parallel */
	CEPH_OSD_FLAG_PGOP          = 1 << 10, /* pg op, no object */
};
291
/* CEPH_OSD_OP_FLAG_* values */
enum {
	CEPH_OSD_OP_FLAG_EXCL = 1 << 0,  /* EXCL object create */
};
295
/* ceph-specific error names, aliased onto existing errno values */
#define EOLDSNAPC     ERESTART   /* ORDERSNAP flag set; writer has old snapc */
#define EBLACKLISTED  ESHUTDOWN  /* blacklisted */
298
299/*
300 * an individual object operation. each may be accompanied by some data
301 * payload
302 */
303struct ceph_osd_op {
304 __le16 op; /* CEPH_OSD_OP_* */
305 __le32 flags; /* CEPH_OSD_FLAG_* */
306 union {
307 struct {
308 __le64 offset, length;
309 } __attribute__ ((packed)) extent;
310 struct {
311 __le32 name_len;
312 __le32 value_len;
313 } __attribute__ ((packed)) xattr;
314 struct {
315 __le64 truncate_size;
316 __le32 truncate_seq;
317 } __attribute__ ((packed)) trunc;
318 struct {
319 __u8 class_len;
320 __u8 method_len;
321 __u8 argc;
322 __le32 indata_len;
323 } __attribute__ ((packed)) cls;
324 struct {
325 __le64 cookie, count;
326 } __attribute__ ((packed)) pgls;
327 };
328 __le32 payload_len;
329} __attribute__ ((packed));
330
331/*
332 * osd request message header. each request may include multiple
333 * ceph_osd_op object operations.
334 */
335struct ceph_osd_request_head {
336 __le64 tid; /* transaction id */
337 __le32 client_inc; /* client incarnation */
338 struct ceph_object_layout layout; /* pgid */
339 __le32 osdmap_epoch; /* client's osdmap epoch */
340
341 __le32 flags;
342
343 struct ceph_timespec mtime; /* for mutations only */
344 struct ceph_eversion reassert_version; /* if we are replaying op */
345
346 __le32 object_len; /* length of object name */
347
348 __le64 snapid; /* snapid to read */
349 __le64 snap_seq; /* writer's snap context */
350 __le32 num_snaps;
351
352 __le16 num_ops;
353 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
354} __attribute__ ((packed));
355
356struct ceph_osd_reply_head {
357 __le64 tid; /* transaction id */
358 __le32 client_inc; /* client incarnation */
359 __le32 flags;
360 struct ceph_object_layout layout;
361 __le32 osdmap_epoch;
362 struct ceph_eversion reassert_version; /* for replaying uncommitted */
363
364 __le32 result; /* result code */
365
366 __le32 object_len; /* length of object name */
367 __le32 num_ops;
368 struct ceph_osd_op ops[0]; /* ops[], object */
369} __attribute__ ((packed));
370
371
372#endif