diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-21 15:38:28 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-21 15:38:28 -0400 |
| commit | 2017bd19454ea7cdae19922d15b6930f6c8088a2 (patch) | |
| tree | 53974657ab3a2c98f2da7b3fcb050ff5b697f876 /include/linux | |
| parent | 9f1ad09493451c19d00c004da479acf699eeedd6 (diff) | |
| parent | efa4c1206eaff047c474af2136748a58eb8cc33b (diff) | |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (22 commits)
ceph: do not carry i_lock for readdir from dcache
fs/ceph/xattr.c: Use kmemdup
rbd: passing wrong variable to bvec_kunmap_irq()
rbd: null vs ERR_PTR
ceph: fix num_pages_free accounting in pagelist
ceph: add CEPH_MDS_OP_SETDIRLAYOUT and associated ioctl.
ceph: don't crash when passed bad mount options
ceph: fix debugfs warnings
block: rbd: removing unnecessary test
block: rbd: fixed may leaks
ceph: switch from BKL to lock_flocks()
ceph: preallocate flock state without locks held
ceph: add pagelist_reserve, pagelist_truncate, pagelist_set_cursor
ceph: use mapping->nrpages to determine if mapping is empty
ceph: only invalidate on check_caps if we actually have pages
ceph: do not hide .snap in root directory
rbd: introduce rados block device (rbd), based on libceph
ceph: factor out libceph from Ceph file system
ceph-rbd: osdc support for osd call and rollback operations
ceph: messenger and osdc changes for rbd
...
Diffstat (limited to 'include/linux')
| -rw-r--r-- | include/linux/ceph/auth.h | 92 | ||||
| -rw-r--r-- | include/linux/ceph/buffer.h | 39 | ||||
| -rw-r--r-- | include/linux/ceph/ceph_debug.h | 38 | ||||
| -rw-r--r-- | include/linux/ceph/ceph_frag.h | 109 | ||||
| -rw-r--r-- | include/linux/ceph/ceph_fs.h | 729 | ||||
| -rw-r--r-- | include/linux/ceph/ceph_hash.h | 13 | ||||
| -rw-r--r-- | include/linux/ceph/debugfs.h | 33 | ||||
| -rw-r--r-- | include/linux/ceph/decode.h | 201 | ||||
| -rw-r--r-- | include/linux/ceph/libceph.h | 249 | ||||
| -rw-r--r-- | include/linux/ceph/mdsmap.h | 62 | ||||
| -rw-r--r-- | include/linux/ceph/messenger.h | 261 | ||||
| -rw-r--r-- | include/linux/ceph/mon_client.h | 122 | ||||
| -rw-r--r-- | include/linux/ceph/msgpool.h | 25 | ||||
| -rw-r--r-- | include/linux/ceph/msgr.h | 175 | ||||
| -rw-r--r-- | include/linux/ceph/osd_client.h | 234 | ||||
| -rw-r--r-- | include/linux/ceph/osdmap.h | 130 | ||||
| -rw-r--r-- | include/linux/ceph/pagelist.h | 75 | ||||
| -rw-r--r-- | include/linux/ceph/rados.h | 405 | ||||
| -rw-r--r-- | include/linux/ceph/types.h | 29 | ||||
| -rw-r--r-- | include/linux/crush/crush.h | 180 | ||||
| -rw-r--r-- | include/linux/crush/hash.h | 17 | ||||
| -rw-r--r-- | include/linux/crush/mapper.h | 20 |
22 files changed, 3238 insertions, 0 deletions
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h new file mode 100644 index 000000000000..7fff521d7eb5 --- /dev/null +++ b/include/linux/ceph/auth.h | |||
| @@ -0,0 +1,92 @@ | |||
| 1 | #ifndef _FS_CEPH_AUTH_H | ||
| 2 | #define _FS_CEPH_AUTH_H | ||
| 3 | |||
| 4 | #include <linux/ceph/types.h> | ||
| 5 | #include <linux/ceph/buffer.h> | ||
| 6 | |||
| 7 | /* | ||
| 8 | * Abstract interface for communicating with the authentication module. | ||
| 9 | * There is some handshake that takes place between us and the monitor | ||
| 10 | * to acquire the necessary keys. These are used to generate an | ||
| 11 | * 'authorizer' that we use when connecting to a service (mds, osd). | ||
| 12 | */ | ||
| 13 | |||
| 14 | struct ceph_auth_client; | ||
| 15 | struct ceph_authorizer; | ||
| 16 | |||
| 17 | struct ceph_auth_client_ops { | ||
| 18 | const char *name; | ||
| 19 | |||
| 20 | /* | ||
| 21 | * true if we are authenticated and can connect to | ||
| 22 | * services. | ||
| 23 | */ | ||
| 24 | int (*is_authenticated)(struct ceph_auth_client *ac); | ||
| 25 | |||
| 26 | /* | ||
| 27 | * true if we should (re)authenticate, e.g., when our tickets | ||
| 28 | * are getting old and crusty. | ||
| 29 | */ | ||
| 30 | int (*should_authenticate)(struct ceph_auth_client *ac); | ||
| 31 | |||
| 32 | /* | ||
| 33 | * build requests and process replies during monitor | ||
| 34 | * handshake. if handle_reply returns -EAGAIN, we build | ||
| 35 | * another request. | ||
| 36 | */ | ||
| 37 | int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end); | ||
| 38 | int (*handle_reply)(struct ceph_auth_client *ac, int result, | ||
| 39 | void *buf, void *end); | ||
| 40 | |||
| 41 | /* | ||
| 42 | * Create authorizer for connecting to a service, and verify | ||
| 43 | * the response to authenticate the service. | ||
| 44 | */ | ||
| 45 | int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type, | ||
| 46 | struct ceph_authorizer **a, | ||
| 47 | void **buf, size_t *len, | ||
| 48 | void **reply_buf, size_t *reply_len); | ||
| 49 | int (*verify_authorizer_reply)(struct ceph_auth_client *ac, | ||
| 50 | struct ceph_authorizer *a, size_t len); | ||
| 51 | void (*destroy_authorizer)(struct ceph_auth_client *ac, | ||
| 52 | struct ceph_authorizer *a); | ||
| 53 | void (*invalidate_authorizer)(struct ceph_auth_client *ac, | ||
| 54 | int peer_type); | ||
| 55 | |||
| 56 | /* reset when we (re)connect to a monitor */ | ||
| 57 | void (*reset)(struct ceph_auth_client *ac); | ||
| 58 | |||
| 59 | void (*destroy)(struct ceph_auth_client *ac); | ||
| 60 | }; | ||
| 61 | |||
| 62 | struct ceph_auth_client { | ||
| 63 | u32 protocol; /* CEPH_AUTH_* */ | ||
| 64 | void *private; /* for use by protocol implementation */ | ||
| 65 | const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */ | ||
| 66 | |||
| 67 | bool negotiating; /* true if negotiating protocol */ | ||
| 68 | const char *name; /* entity name */ | ||
| 69 | u64 global_id; /* our unique id in system */ | ||
| 70 | const char *secret; /* our secret key */ | ||
| 71 | unsigned want_keys; /* which services we want */ | ||
| 72 | }; | ||
| 73 | |||
| 74 | extern struct ceph_auth_client *ceph_auth_init(const char *name, | ||
| 75 | const char *secret); | ||
| 76 | extern void ceph_auth_destroy(struct ceph_auth_client *ac); | ||
| 77 | |||
| 78 | extern void ceph_auth_reset(struct ceph_auth_client *ac); | ||
| 79 | |||
| 80 | extern int ceph_auth_build_hello(struct ceph_auth_client *ac, | ||
| 81 | void *buf, size_t len); | ||
| 82 | extern int ceph_handle_auth_reply(struct ceph_auth_client *ac, | ||
| 83 | void *buf, size_t len, | ||
| 84 | void *reply_buf, size_t reply_len); | ||
| 85 | extern int ceph_entity_name_encode(const char *name, void **p, void *end); | ||
| 86 | |||
| 87 | extern int ceph_build_auth(struct ceph_auth_client *ac, | ||
| 88 | void *msg_buf, size_t msg_len); | ||
| 89 | |||
| 90 | extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); | ||
| 91 | |||
| 92 | #endif | ||
diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h new file mode 100644 index 000000000000..58d19014068f --- /dev/null +++ b/include/linux/ceph/buffer.h | |||
| @@ -0,0 +1,39 @@ | |||
| 1 | #ifndef __FS_CEPH_BUFFER_H | ||
| 2 | #define __FS_CEPH_BUFFER_H | ||
| 3 | |||
| 4 | #include <linux/kref.h> | ||
| 5 | #include <linux/mm.h> | ||
| 6 | #include <linux/vmalloc.h> | ||
| 7 | #include <linux/types.h> | ||
| 8 | #include <linux/uio.h> | ||
| 9 | |||
| 10 | /* | ||
| 11 | * a simple reference counted buffer. | ||
| 12 | * | ||
| 13 | * use kmalloc for small sizes (<= one page), vmalloc for larger | ||
| 14 | * sizes. | ||
| 15 | */ | ||
| 16 | struct ceph_buffer { | ||
| 17 | struct kref kref; | ||
| 18 | struct kvec vec; | ||
| 19 | size_t alloc_len; | ||
| 20 | bool is_vmalloc; | ||
| 21 | }; | ||
| 22 | |||
| 23 | extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp); | ||
| 24 | extern void ceph_buffer_release(struct kref *kref); | ||
| 25 | |||
| 26 | static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b) | ||
| 27 | { | ||
| 28 | kref_get(&b->kref); | ||
| 29 | return b; | ||
| 30 | } | ||
| 31 | |||
| 32 | static inline void ceph_buffer_put(struct ceph_buffer *b) | ||
| 33 | { | ||
| 34 | kref_put(&b->kref, ceph_buffer_release); | ||
| 35 | } | ||
| 36 | |||
| 37 | extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end); | ||
| 38 | |||
| 39 | #endif | ||
diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h new file mode 100644 index 000000000000..aa2e19182d99 --- /dev/null +++ b/include/linux/ceph/ceph_debug.h | |||
| @@ -0,0 +1,38 @@ | |||
| 1 | #ifndef _FS_CEPH_DEBUG_H | ||
| 2 | #define _FS_CEPH_DEBUG_H | ||
| 3 | |||
| 4 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 5 | |||
| 6 | #ifdef CONFIG_CEPH_LIB_PRETTYDEBUG | ||
| 7 | |||
| 8 | /* | ||
| 9 | * wrap pr_debug to include a filename:lineno prefix on each line. | ||
| 10 | * this incurs some overhead (kernel size and execution time) due to | ||
| 11 | * the extra function call at each call site. | ||
| 12 | */ | ||
| 13 | |||
| 14 | # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) | ||
| 15 | extern const char *ceph_file_part(const char *s, int len); | ||
| 16 | # define dout(fmt, ...) \ | ||
| 17 | pr_debug("%.*s %12.12s:%-4d : " fmt, \ | ||
| 18 | 8 - (int)sizeof(KBUILD_MODNAME), " ", \ | ||
| 19 | ceph_file_part(__FILE__, sizeof(__FILE__)), \ | ||
| 20 | __LINE__, ##__VA_ARGS__) | ||
| 21 | # else | ||
| 22 | /* faux printk call just to see any compiler warnings. */ | ||
| 23 | # define dout(fmt, ...) do { \ | ||
| 24 | if (0) \ | ||
| 25 | printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ | ||
| 26 | } while (0) | ||
| 27 | # endif | ||
| 28 | |||
| 29 | #else | ||
| 30 | |||
| 31 | /* | ||
| 32 | * or, just wrap pr_debug | ||
| 33 | */ | ||
| 34 | # define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__) | ||
| 35 | |||
| 36 | #endif | ||
| 37 | |||
| 38 | #endif | ||
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h new file mode 100644 index 000000000000..5babb8e95352 --- /dev/null +++ b/include/linux/ceph/ceph_frag.h | |||
| @@ -0,0 +1,109 @@ | |||
| 1 | #ifndef FS_CEPH_FRAG_H | ||
| 2 | #define FS_CEPH_FRAG_H | ||
| 3 | |||
| 4 | /* | ||
| 5 | * "Frags" are a way to describe a subset of a 32-bit number space, | ||
| 6 | * using a mask and a value to match against that mask. Any given frag | ||
| 7 | * (subset of the number space) can be partitioned into 2^n sub-frags. | ||
| 8 | * | ||
| 9 | * Frags are encoded into a 32-bit word: | ||
| 10 | * 8 upper bits = "bits" | ||
| 11 | * 24 lower bits = "value" | ||
| 12 | * (We could go to 5+27 bits, but who cares.) | ||
| 13 | * | ||
| 14 | * We use the _most_ significant bits of the 24 bit value. This makes | ||
| 15 | * values logically sort. | ||
| 16 | * | ||
| 17 | * Unfortunately, because the "bits" field is still in the high bits, we | ||
| 18 | * can't sort encoded frags numerically. However, it does allow you | ||
| 19 | * to feed encoded frags as values into frag_contains_value. | ||
| 20 | */ | ||
| 21 | static inline __u32 ceph_frag_make(__u32 b, __u32 v) | ||
| 22 | { | ||
| 23 | return (b << 24) | | ||
| 24 | (v & (0xffffffu << (24-b)) & 0xffffffu); | ||
| 25 | } | ||
| 26 | static inline __u32 ceph_frag_bits(__u32 f) | ||
| 27 | { | ||
| 28 | return f >> 24; | ||
| 29 | } | ||
| 30 | static inline __u32 ceph_frag_value(__u32 f) | ||
| 31 | { | ||
| 32 | return f & 0xffffffu; | ||
| 33 | } | ||
| 34 | static inline __u32 ceph_frag_mask(__u32 f) | ||
| 35 | { | ||
| 36 | return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu; | ||
| 37 | } | ||
| 38 | static inline __u32 ceph_frag_mask_shift(__u32 f) | ||
| 39 | { | ||
| 40 | return 24 - ceph_frag_bits(f); | ||
| 41 | } | ||
| 42 | |||
| 43 | static inline int ceph_frag_contains_value(__u32 f, __u32 v) | ||
| 44 | { | ||
| 45 | return (v & ceph_frag_mask(f)) == ceph_frag_value(f); | ||
| 46 | } | ||
| 47 | static inline int ceph_frag_contains_frag(__u32 f, __u32 sub) | ||
| 48 | { | ||
| 49 | /* is sub as specific as us, and contained by us? */ | ||
| 50 | return ceph_frag_bits(sub) >= ceph_frag_bits(f) && | ||
| 51 | (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f); | ||
| 52 | } | ||
| 53 | |||
| 54 | static inline __u32 ceph_frag_parent(__u32 f) | ||
| 55 | { | ||
| 56 | return ceph_frag_make(ceph_frag_bits(f) - 1, | ||
| 57 | ceph_frag_value(f) & (ceph_frag_mask(f) << 1)); | ||
| 58 | } | ||
| 59 | static inline int ceph_frag_is_left_child(__u32 f) | ||
| 60 | { | ||
| 61 | return ceph_frag_bits(f) > 0 && | ||
| 62 | (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0; | ||
| 63 | } | ||
| 64 | static inline int ceph_frag_is_right_child(__u32 f) | ||
| 65 | { /* nonzero iff f is not the root frag (bits > 0) and its lowest in-mask value bit is set */ | ||
| 66 | return ceph_frag_bits(f) > 0 && | ||
| 67 | (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0; /* masked result is the bit itself (equals 1 only when bits == 24), so compare against 0 */ | ||
| 68 | } | ||
| 69 | static inline __u32 ceph_frag_sibling(__u32 f) | ||
| 70 | { | ||
| 71 | return ceph_frag_make(ceph_frag_bits(f), | ||
| 72 | ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f))); | ||
| 73 | } | ||
| 74 | static inline __u32 ceph_frag_left_child(__u32 f) | ||
| 75 | { | ||
| 76 | return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f)); | ||
| 77 | } | ||
| 78 | static inline __u32 ceph_frag_right_child(__u32 f) | ||
| 79 | { | ||
| 80 | return ceph_frag_make(ceph_frag_bits(f)+1, | ||
| 81 | ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f)))); | ||
| 82 | } | ||
| 83 | static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) | ||
| 84 | { | ||
| 85 | int newbits = ceph_frag_bits(f) + by; | ||
| 86 | return ceph_frag_make(newbits, | ||
| 87 | ceph_frag_value(f) | (i << (24 - newbits))); | ||
| 88 | } | ||
| 89 | static inline int ceph_frag_is_leftmost(__u32 f) | ||
| 90 | { | ||
| 91 | return ceph_frag_value(f) == 0; | ||
| 92 | } | ||
| 93 | static inline int ceph_frag_is_rightmost(__u32 f) | ||
| 94 | { | ||
| 95 | return ceph_frag_value(f) == ceph_frag_mask(f); | ||
| 96 | } | ||
| 97 | static inline __u32 ceph_frag_next(__u32 f) | ||
| 98 | { | ||
| 99 | return ceph_frag_make(ceph_frag_bits(f), | ||
| 100 | ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f))); | ||
| 101 | } | ||
| 102 | |||
| 103 | /* | ||
| 104 | * comparator to sort frags logically, as when traversing the | ||
| 105 | * number space in ascending order... | ||
| 106 | */ | ||
| 107 | int ceph_frag_compare(__u32 a, __u32 b); | ||
| 108 | |||
| 109 | #endif | ||
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h new file mode 100644 index 000000000000..c3c74aef289d --- /dev/null +++ b/include/linux/ceph/ceph_fs.h | |||
| @@ -0,0 +1,729 @@ | |||
| 1 | /* | ||
| 2 | * ceph_fs.h - Ceph constants and data types to share between kernel and | ||
| 3 | * user space. | ||
| 4 | * | ||
| 5 | * Most types in this file are defined as little-endian, and are | ||
| 6 | * primarily intended to describe data structures that pass over the | ||
| 7 | * wire or that are stored on disk. | ||
| 8 | * | ||
| 9 | * LGPL2 | ||
| 10 | */ | ||
| 11 | |||
| 12 | #ifndef CEPH_FS_H | ||
| 13 | #define CEPH_FS_H | ||
| 14 | |||
| 15 | #include "msgr.h" | ||
| 16 | #include "rados.h" | ||
| 17 | |||
| 18 | /* | ||
| 19 | * subprotocol versions. when specific message types or high-level | ||
| 20 | * protocols change, bump the affected components. we rev | ||
| 21 | * internal cluster protocols separately from the public, | ||
| 22 | * client-facing protocol. | ||
| 23 | */ | ||
| 24 | #define CEPH_OSD_PROTOCOL 8 /* cluster internal */ | ||
| 25 | #define CEPH_MDS_PROTOCOL 12 /* cluster internal */ | ||
| 26 | #define CEPH_MON_PROTOCOL 5 /* cluster internal */ | ||
| 27 | #define CEPH_OSDC_PROTOCOL 24 /* server/client */ | ||
| 28 | #define CEPH_MDSC_PROTOCOL 32 /* server/client */ | ||
| 29 | #define CEPH_MONC_PROTOCOL 15 /* server/client */ | ||
| 30 | |||
| 31 | |||
| 32 | #define CEPH_INO_ROOT 1 | ||
| 33 | #define CEPH_INO_CEPH 2 /* hidden .ceph dir */ | ||
| 34 | |||
| 35 | /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ | ||
| 36 | #define CEPH_MAX_MON 31 | ||
| 37 | |||
| 38 | |||
| 39 | /* | ||
| 40 | * feature bits | ||
| 41 | */ | ||
| 42 | #define CEPH_FEATURE_UID (1<<0) | ||
| 43 | #define CEPH_FEATURE_NOSRCADDR (1<<1) | ||
| 44 | #define CEPH_FEATURE_MONCLOCKCHECK (1<<2) | ||
| 45 | #define CEPH_FEATURE_FLOCK (1<<3) | ||
| 46 | |||
| 47 | |||
| 48 | /* | ||
| 49 | * ceph_file_layout - describe data layout for a file/inode | ||
| 50 | */ | ||
| 51 | struct ceph_file_layout { | ||
| 52 | /* file -> object mapping */ | ||
| 53 | __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple | ||
| 54 | of page size. */ | ||
| 55 | __le32 fl_stripe_count; /* over this many objects */ | ||
| 56 | __le32 fl_object_size; /* until objects are this big, then move to | ||
| 57 | new objects */ | ||
| 58 | __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ | ||
| 59 | |||
| 60 | /* pg -> disk layout */ | ||
| 61 | __le32 fl_object_stripe_unit; /* for per-object parity, if any */ | ||
| 62 | |||
| 63 | /* object -> pg layout */ | ||
| 64 | __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ | ||
| 65 | __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ | ||
| 66 | } __attribute__ ((packed)); | ||
| 67 | |||
| 68 | #define CEPH_MIN_STRIPE_UNIT 65536 | ||
| 69 | |||
| 70 | int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); | ||
| 71 | |||
| 72 | |||
| 73 | /* crypto algorithms */ | ||
| 74 | #define CEPH_CRYPTO_NONE 0x0 | ||
| 75 | #define CEPH_CRYPTO_AES 0x1 | ||
| 76 | |||
| 77 | #define CEPH_AES_IV "cephsageyudagreg" | ||
| 78 | |||
| 79 | /* security/authentication protocols */ | ||
| 80 | #define CEPH_AUTH_UNKNOWN 0x0 | ||
| 81 | #define CEPH_AUTH_NONE 0x1 | ||
| 82 | #define CEPH_AUTH_CEPHX 0x2 | ||
| 83 | |||
| 84 | #define CEPH_AUTH_UID_DEFAULT ((__u64) -1) | ||
| 85 | |||
| 86 | |||
| 87 | /********************************************* | ||
| 88 | * message layer | ||
| 89 | */ | ||
| 90 | |||
| 91 | /* | ||
| 92 | * message types | ||
| 93 | */ | ||
| 94 | |||
| 95 | /* misc */ | ||
| 96 | #define CEPH_MSG_SHUTDOWN 1 | ||
| 97 | #define CEPH_MSG_PING 2 | ||
| 98 | |||
| 99 | /* client <-> monitor */ | ||
| 100 | #define CEPH_MSG_MON_MAP 4 | ||
| 101 | #define CEPH_MSG_MON_GET_MAP 5 | ||
| 102 | #define CEPH_MSG_STATFS 13 | ||
| 103 | #define CEPH_MSG_STATFS_REPLY 14 | ||
| 104 | #define CEPH_MSG_MON_SUBSCRIBE 15 | ||
| 105 | #define CEPH_MSG_MON_SUBSCRIBE_ACK 16 | ||
| 106 | #define CEPH_MSG_AUTH 17 | ||
| 107 | #define CEPH_MSG_AUTH_REPLY 18 | ||
| 108 | |||
| 109 | /* client <-> mds */ | ||
| 110 | #define CEPH_MSG_MDS_MAP 21 | ||
| 111 | |||
| 112 | #define CEPH_MSG_CLIENT_SESSION 22 | ||
| 113 | #define CEPH_MSG_CLIENT_RECONNECT 23 | ||
| 114 | |||
| 115 | #define CEPH_MSG_CLIENT_REQUEST 24 | ||
| 116 | #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 | ||
| 117 | #define CEPH_MSG_CLIENT_REPLY 26 | ||
| 118 | #define CEPH_MSG_CLIENT_CAPS 0x310 | ||
| 119 | #define CEPH_MSG_CLIENT_LEASE 0x311 | ||
| 120 | #define CEPH_MSG_CLIENT_SNAP 0x312 | ||
| 121 | #define CEPH_MSG_CLIENT_CAPRELEASE 0x313 | ||
| 122 | |||
| 123 | /* pool ops */ | ||
| 124 | #define CEPH_MSG_POOLOP_REPLY 48 | ||
| 125 | #define CEPH_MSG_POOLOP 49 | ||
| 126 | |||
| 127 | |||
| 128 | /* osd */ | ||
| 129 | #define CEPH_MSG_OSD_MAP 41 | ||
| 130 | #define CEPH_MSG_OSD_OP 42 | ||
| 131 | #define CEPH_MSG_OSD_OPREPLY 43 | ||
| 132 | |||
| 133 | /* pool operations */ | ||
| 134 | enum { | ||
| 135 | POOL_OP_CREATE = 0x01, | ||
| 136 | POOL_OP_DELETE = 0x02, | ||
| 137 | POOL_OP_AUID_CHANGE = 0x03, | ||
| 138 | POOL_OP_CREATE_SNAP = 0x11, | ||
| 139 | POOL_OP_DELETE_SNAP = 0x12, | ||
| 140 | POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, | ||
| 141 | POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, | ||
| 142 | }; | ||
| 143 | |||
| 144 | struct ceph_mon_request_header { | ||
| 145 | __le64 have_version; | ||
| 146 | __le16 session_mon; | ||
| 147 | __le64 session_mon_tid; | ||
| 148 | } __attribute__ ((packed)); | ||
| 149 | |||
| 150 | struct ceph_mon_statfs { | ||
| 151 | struct ceph_mon_request_header monhdr; | ||
| 152 | struct ceph_fsid fsid; | ||
| 153 | } __attribute__ ((packed)); | ||
| 154 | |||
| 155 | struct ceph_statfs { | ||
| 156 | __le64 kb, kb_used, kb_avail; | ||
| 157 | __le64 num_objects; | ||
| 158 | } __attribute__ ((packed)); | ||
| 159 | |||
| 160 | struct ceph_mon_statfs_reply { | ||
| 161 | struct ceph_fsid fsid; | ||
| 162 | __le64 version; | ||
| 163 | struct ceph_statfs st; | ||
| 164 | } __attribute__ ((packed)); | ||
| 165 | |||
| 166 | const char *ceph_pool_op_name(int op); | ||
| 167 | |||
| 168 | struct ceph_mon_poolop { | ||
| 169 | struct ceph_mon_request_header monhdr; | ||
| 170 | struct ceph_fsid fsid; | ||
| 171 | __le32 pool; | ||
| 172 | __le32 op; | ||
| 173 | __le64 auid; | ||
| 174 | __le64 snapid; | ||
| 175 | __le32 name_len; | ||
| 176 | } __attribute__ ((packed)); | ||
| 177 | |||
| 178 | struct ceph_mon_poolop_reply { | ||
| 179 | struct ceph_mon_request_header monhdr; | ||
| 180 | struct ceph_fsid fsid; | ||
| 181 | __le32 reply_code; | ||
| 182 | __le32 epoch; | ||
| 183 | char has_data; | ||
| 184 | char data[0]; | ||
| 185 | } __attribute__ ((packed)); | ||
| 186 | |||
| 187 | struct ceph_mon_unmanaged_snap { | ||
| 188 | __le64 snapid; | ||
| 189 | } __attribute__ ((packed)); | ||
| 190 | |||
| 191 | struct ceph_osd_getmap { | ||
| 192 | struct ceph_mon_request_header monhdr; | ||
| 193 | struct ceph_fsid fsid; | ||
| 194 | __le32 start; | ||
| 195 | } __attribute__ ((packed)); | ||
| 196 | |||
| 197 | struct ceph_mds_getmap { | ||
| 198 | struct ceph_mon_request_header monhdr; | ||
| 199 | struct ceph_fsid fsid; | ||
| 200 | } __attribute__ ((packed)); | ||
| 201 | |||
| 202 | struct ceph_client_mount { | ||
| 203 | struct ceph_mon_request_header monhdr; | ||
| 204 | } __attribute__ ((packed)); | ||
| 205 | |||
| 206 | struct ceph_mon_subscribe_item { | ||
| 207 | __le64 have_version; __le64 have; | ||
| 208 | __u8 onetime; | ||
| 209 | } __attribute__ ((packed)); | ||
| 210 | |||
| 211 | struct ceph_mon_subscribe_ack { | ||
| 212 | __le32 duration; /* seconds */ | ||
| 213 | struct ceph_fsid fsid; | ||
| 214 | } __attribute__ ((packed)); | ||
| 215 | |||
| 216 | /* | ||
| 217 | * mds states | ||
| 218 | * > 0 -> in | ||
| 219 | * <= 0 -> out | ||
| 220 | */ | ||
| 221 | #define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */ | ||
| 222 | #define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. | ||
| 223 | empty log. */ | ||
| 224 | #define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */ | ||
| 225 | #define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */ | ||
| 226 | #define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ | ||
| 227 | #define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ | ||
| 228 | #define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ | ||
| 229 | |||
| 230 | #define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ | ||
| 231 | #define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed | ||
| 232 | operations (import, rename, etc.) */ | ||
| 233 | #define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */ | ||
| 234 | #define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */ | ||
| 235 | #define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */ | ||
| 236 | #define CEPH_MDS_STATE_ACTIVE 13 /* up, active */ | ||
| 237 | #define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */ | ||
| 238 | |||
| 239 | extern const char *ceph_mds_state_name(int s); | ||
| 240 | |||
| 241 | |||
| 242 | /* | ||
| 243 | * metadata lock types. | ||
| 244 | * - these are bitmasks.. we can compose them | ||
| 245 | * - they also define the lock ordering by the MDS | ||
| 246 | * - a few of these are internal to the mds | ||
| 247 | */ | ||
| 248 | #define CEPH_LOCK_DVERSION 1 | ||
| 249 | #define CEPH_LOCK_DN 2 | ||
| 250 | #define CEPH_LOCK_ISNAP 16 | ||
| 251 | #define CEPH_LOCK_IVERSION 32 /* mds internal */ | ||
| 252 | #define CEPH_LOCK_IFILE 64 | ||
| 253 | #define CEPH_LOCK_IAUTH 128 | ||
| 254 | #define CEPH_LOCK_ILINK 256 | ||
| 255 | #define CEPH_LOCK_IDFT 512 /* dir frag tree */ | ||
| 256 | #define CEPH_LOCK_INEST 1024 /* mds internal */ | ||
| 257 | #define CEPH_LOCK_IXATTR 2048 | ||
| 258 | #define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ | ||
| 259 | #define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ | ||
| 260 | |||
| 261 | /* client_session ops */ | ||
| 262 | enum { | ||
| 263 | CEPH_SESSION_REQUEST_OPEN, | ||
| 264 | CEPH_SESSION_OPEN, | ||
| 265 | CEPH_SESSION_REQUEST_CLOSE, | ||
| 266 | CEPH_SESSION_CLOSE, | ||
| 267 | CEPH_SESSION_REQUEST_RENEWCAPS, | ||
| 268 | CEPH_SESSION_RENEWCAPS, | ||
| 269 | CEPH_SESSION_STALE, | ||
| 270 | CEPH_SESSION_RECALL_STATE, | ||
| 271 | }; | ||
| 272 | |||
| 273 | extern const char *ceph_session_op_name(int op); | ||
| 274 | |||
| 275 | struct ceph_mds_session_head { | ||
| 276 | __le32 op; | ||
| 277 | __le64 seq; | ||
| 278 | struct ceph_timespec stamp; | ||
| 279 | __le32 max_caps, max_leases; | ||
| 280 | } __attribute__ ((packed)); | ||
| 281 | |||
| 282 | /* client_request */ | ||
| 283 | /* | ||
| 284 | * metadata ops. | ||
| 285 | * & 0x001000 -> write op | ||
| 286 | * & 0x010000 -> follow symlink (e.g. stat(), not lstat()). | ||
| 287 | * & 0x100000 -> use weird ino/path trace | ||
| 288 | */ | ||
| 289 | #define CEPH_MDS_OP_WRITE 0x001000 | ||
| 290 | enum { | ||
| 291 | CEPH_MDS_OP_LOOKUP = 0x00100, | ||
| 292 | CEPH_MDS_OP_GETATTR = 0x00101, | ||
| 293 | CEPH_MDS_OP_LOOKUPHASH = 0x00102, | ||
| 294 | CEPH_MDS_OP_LOOKUPPARENT = 0x00103, | ||
| 295 | |||
| 296 | CEPH_MDS_OP_SETXATTR = 0x01105, | ||
| 297 | CEPH_MDS_OP_RMXATTR = 0x01106, | ||
| 298 | CEPH_MDS_OP_SETLAYOUT = 0x01107, | ||
| 299 | CEPH_MDS_OP_SETATTR = 0x01108, | ||
| 300 | CEPH_MDS_OP_SETFILELOCK= 0x01109, | ||
| 301 | CEPH_MDS_OP_GETFILELOCK= 0x00110, | ||
| 302 | CEPH_MDS_OP_SETDIRLAYOUT=0x0110a, | ||
| 303 | |||
| 304 | CEPH_MDS_OP_MKNOD = 0x01201, | ||
| 305 | CEPH_MDS_OP_LINK = 0x01202, | ||
| 306 | CEPH_MDS_OP_UNLINK = 0x01203, | ||
| 307 | CEPH_MDS_OP_RENAME = 0x01204, | ||
| 308 | CEPH_MDS_OP_MKDIR = 0x01220, | ||
| 309 | CEPH_MDS_OP_RMDIR = 0x01221, | ||
| 310 | CEPH_MDS_OP_SYMLINK = 0x01222, | ||
| 311 | |||
| 312 | CEPH_MDS_OP_CREATE = 0x01301, | ||
| 313 | CEPH_MDS_OP_OPEN = 0x00302, | ||
| 314 | CEPH_MDS_OP_READDIR = 0x00305, | ||
| 315 | |||
| 316 | CEPH_MDS_OP_LOOKUPSNAP = 0x00400, | ||
| 317 | CEPH_MDS_OP_MKSNAP = 0x01400, | ||
| 318 | CEPH_MDS_OP_RMSNAP = 0x01401, | ||
| 319 | CEPH_MDS_OP_LSSNAP = 0x00402, | ||
| 320 | }; | ||
| 321 | |||
| 322 | extern const char *ceph_mds_op_name(int op); | ||
| 323 | |||
| 324 | |||
| 325 | #define CEPH_SETATTR_MODE 1 | ||
| 326 | #define CEPH_SETATTR_UID 2 | ||
| 327 | #define CEPH_SETATTR_GID 4 | ||
| 328 | #define CEPH_SETATTR_MTIME 8 | ||
| 329 | #define CEPH_SETATTR_ATIME 16 | ||
| 330 | #define CEPH_SETATTR_SIZE 32 | ||
| 331 | #define CEPH_SETATTR_CTIME 64 | ||
| 332 | |||
| 333 | union ceph_mds_request_args { | ||
| 334 | struct { | ||
| 335 | __le32 mask; /* CEPH_CAP_* */ | ||
| 336 | } __attribute__ ((packed)) getattr; | ||
| 337 | struct { | ||
| 338 | __le32 mode; | ||
| 339 | __le32 uid; | ||
| 340 | __le32 gid; | ||
| 341 | struct ceph_timespec mtime; | ||
| 342 | struct ceph_timespec atime; | ||
| 343 | __le64 size, old_size; /* old_size needed by truncate */ | ||
| 344 | __le32 mask; /* CEPH_SETATTR_* */ | ||
| 345 | } __attribute__ ((packed)) setattr; | ||
| 346 | struct { | ||
| 347 | __le32 frag; /* which dir fragment */ | ||
| 348 | __le32 max_entries; /* how many dentries to grab */ | ||
| 349 | __le32 max_bytes; | ||
| 350 | } __attribute__ ((packed)) readdir; | ||
| 351 | struct { | ||
| 352 | __le32 mode; | ||
| 353 | __le32 rdev; | ||
| 354 | } __attribute__ ((packed)) mknod; | ||
| 355 | struct { | ||
| 356 | __le32 mode; | ||
| 357 | } __attribute__ ((packed)) mkdir; | ||
| 358 | struct { | ||
| 359 | __le32 flags; | ||
| 360 | __le32 mode; | ||
| 361 | __le32 stripe_unit; /* layout for newly created file */ | ||
| 362 | __le32 stripe_count; /* ... */ | ||
| 363 | __le32 object_size; | ||
| 364 | __le32 file_replication; | ||
| 365 | __le32 preferred; | ||
| 366 | } __attribute__ ((packed)) open; | ||
| 367 | struct { | ||
| 368 | __le32 flags; | ||
| 369 | } __attribute__ ((packed)) setxattr; | ||
| 370 | struct { | ||
| 371 | struct ceph_file_layout layout; | ||
| 372 | } __attribute__ ((packed)) setlayout; | ||
| 373 | struct { | ||
| 374 | __u8 rule; /* currently fcntl or flock */ | ||
| 375 | __u8 type; /* shared, exclusive, remove*/ | ||
| 376 | __le64 pid; /* process id requesting the lock */ | ||
| 377 | __le64 pid_namespace; | ||
| 378 | __le64 start; /* initial location to lock */ | ||
| 379 | __le64 length; /* num bytes to lock from start */ | ||
| 380 | __u8 wait; /* will caller wait for lock to become available? */ | ||
| 381 | } __attribute__ ((packed)) filelock_change; | ||
| 382 | } __attribute__ ((packed)); | ||
| 383 | |||
| 384 | #define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ | ||
| 385 | #define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ | ||
| 386 | |||
| 387 | struct ceph_mds_request_head { | ||
| 388 | __le64 oldest_client_tid; | ||
| 389 | __le32 mdsmap_epoch; /* on client */ | ||
| 390 | __le32 flags; /* CEPH_MDS_FLAG_* */ | ||
| 391 | __u8 num_retry, num_fwd; /* count retry, fwd attempts */ | ||
| 392 | __le16 num_releases; /* # include cap/lease release records */ | ||
| 393 | __le32 op; /* mds op code */ | ||
| 394 | __le32 caller_uid, caller_gid; | ||
| 395 | __le64 ino; /* use this ino for openc, mkdir, mknod, | ||
| 396 | etc. (if replaying) */ | ||
| 397 | union ceph_mds_request_args args; | ||
| 398 | } __attribute__ ((packed)); | ||
| 399 | |||
| 400 | /* cap/lease release record */ | ||
| 401 | struct ceph_mds_request_release { | ||
| 402 | __le64 ino, cap_id; /* ino and unique cap id */ | ||
| 403 | __le32 caps, wanted; /* new issued, wanted */ | ||
| 404 | __le32 seq, issue_seq, mseq; | ||
| 405 | __le32 dname_seq; /* if releasing a dentry lease, a */ | ||
| 406 | __le32 dname_len; /* string follows. */ | ||
| 407 | } __attribute__ ((packed)); | ||
| 408 | |||
| 409 | /* client reply */ | ||
| 410 | struct ceph_mds_reply_head { | ||
| 411 | __le32 op; | ||
| 412 | __le32 result; | ||
| 413 | __le32 mdsmap_epoch; | ||
| 414 | __u8 safe; /* true if committed to disk */ | ||
| 415 | __u8 is_dentry, is_target; /* true if dentry, target inode records | ||
| 416 | are included with reply */ | ||
| 417 | } __attribute__ ((packed)); | ||
| 418 | |||
| 419 | /* one for each node split */ | ||
| 420 | struct ceph_frag_tree_split { | ||
| 421 | __le32 frag; /* this frag splits... */ | ||
| 422 | __le32 by; /* ...by this many bits */ | ||
| 423 | } __attribute__ ((packed)); | ||
| 424 | |||
| 425 | struct ceph_frag_tree_head { | ||
| 426 | __le32 nsplits; /* num ceph_frag_tree_split records */ | ||
| 427 | struct ceph_frag_tree_split splits[]; | ||
| 428 | } __attribute__ ((packed)); | ||
| 429 | |||
| 430 | /* capability issue, for bundling with mds reply */ | ||
| 431 | struct ceph_mds_reply_cap { | ||
| 432 | __le32 caps, wanted; /* caps issued, wanted */ | ||
| 433 | __le64 cap_id; | ||
| 434 | __le32 seq, mseq; | ||
| 435 | __le64 realm; /* snap realm */ | ||
| 436 | __u8 flags; /* CEPH_CAP_FLAG_* */ | ||
| 437 | } __attribute__ ((packed)); | ||
| 438 | |||
| 439 | #define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */ | ||
| 440 | |||
| 441 | /* inode record, for bundling with mds reply */ | ||
| 442 | struct ceph_mds_reply_inode { | ||
| 443 | __le64 ino; | ||
| 444 | __le64 snapid; | ||
| 445 | __le32 rdev; | ||
| 446 | __le64 version; /* inode version */ | ||
| 447 | __le64 xattr_version; /* version for xattr blob */ | ||
| 448 | struct ceph_mds_reply_cap cap; /* caps issued for this inode */ | ||
| 449 | struct ceph_file_layout layout; | ||
| 450 | struct ceph_timespec ctime, mtime, atime; | ||
| 451 | __le32 time_warp_seq; | ||
| 452 | __le64 size, max_size, truncate_size; | ||
| 453 | __le32 truncate_seq; | ||
| 454 | __le32 mode, uid, gid; | ||
| 455 | __le32 nlink; | ||
| 456 | __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */ | ||
| 457 | struct ceph_timespec rctime; | ||
| 458 | struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ | ||
| 459 | } __attribute__ ((packed)); | ||
| 460 | /* followed by frag array, then symlink string, then xattr blob */ | ||
| 461 | |||
| 462 | /* reply_lease follows dname, and reply_inode */ | ||
| 463 | struct ceph_mds_reply_lease { | ||
| 464 | __le16 mask; /* lease type(s) */ | ||
| 465 | __le32 duration_ms; /* lease duration */ | ||
| 466 | __le32 seq; | ||
| 467 | } __attribute__ ((packed)); | ||
| 468 | |||
| 469 | struct ceph_mds_reply_dirfrag { | ||
| 470 | __le32 frag; /* fragment */ | ||
| 471 | __le32 auth; /* auth mds, if this is a delegation point */ | ||
| 472 | __le32 ndist; /* number of mds' this is replicated on */ | ||
| 473 | __le32 dist[]; | ||
| 474 | } __attribute__ ((packed)); | ||
| 475 | |||
| 476 | #define CEPH_LOCK_FCNTL 1 | ||
| 477 | #define CEPH_LOCK_FLOCK 2 | ||
| 478 | |||
| 479 | #define CEPH_LOCK_SHARED 1 | ||
| 480 | #define CEPH_LOCK_EXCL 2 | ||
| 481 | #define CEPH_LOCK_UNLOCK 4 | ||
| 482 | |||
| 483 | struct ceph_filelock { | ||
| 484 | __le64 start;/* file offset to start lock at */ | ||
| 485 | __le64 length; /* num bytes to lock; 0 for all following start */ | ||
| 486 | __le64 client; /* which client holds the lock */ | ||
| 487 | __le64 pid; /* process id holding the lock on the client */ | ||
| 488 | __le64 pid_namespace; | ||
| 489 | __u8 type; /* shared lock, exclusive lock, or unlock */ | ||
| 490 | } __attribute__ ((packed)); | ||
| 491 | |||
| 492 | |||
| 493 | /* file access modes */ | ||
| 494 | #define CEPH_FILE_MODE_PIN 0 | ||
| 495 | #define CEPH_FILE_MODE_RD 1 | ||
| 496 | #define CEPH_FILE_MODE_WR 2 | ||
| 497 | #define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ | ||
| 498 | #define CEPH_FILE_MODE_LAZY 4 /* lazy io */ | ||
| 499 | #define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */ | ||
| 500 | |||
| 501 | int ceph_flags_to_mode(int flags); | ||
| 502 | |||
| 503 | |||
| 504 | /* capability bits */ | ||
| 505 | #define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */ | ||
| 506 | |||
| 507 | /* generic cap bits */ | ||
| 508 | #define CEPH_CAP_GSHARED 1 /* client can read */ | ||
| 509 | #define CEPH_CAP_GEXCL 2 /* client can read and update */ | ||
| 510 | #define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */ | ||
| 511 | #define CEPH_CAP_GRD 8 /* (file) client can read */ | ||
| 512 | #define CEPH_CAP_GWR 16 /* (file) client can write */ | ||
| 513 | #define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */ | ||
| 514 | #define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ | ||
| 515 | #define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ | ||
| 516 | |||
| 517 | /* per-lock shift */ | ||
| 518 | #define CEPH_CAP_SAUTH 2 | ||
| 519 | #define CEPH_CAP_SLINK 4 | ||
| 520 | #define CEPH_CAP_SXATTR 6 | ||
| 521 | #define CEPH_CAP_SFILE 8 | ||
| 522 | #define CEPH_CAP_SFLOCK 20 | ||
| 523 | |||
| 524 | #define CEPH_CAP_BITS 22 | ||
| 525 | |||
| 526 | /* composed values */ | ||
| 527 | #define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) | ||
| 528 | #define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH) | ||
| 529 | #define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK) | ||
| 530 | #define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK) | ||
| 531 | #define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR) | ||
| 532 | #define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR) | ||
| 533 | #define CEPH_CAP_FILE(x) ((x) << CEPH_CAP_SFILE) /* x: generic cap bits; parenthesized so a | b shifts as a whole */ | ||
| 534 | #define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE) | ||
| 535 | #define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE) | ||
| 536 | #define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE) | ||
| 537 | #define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE) | ||
| 538 | #define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE) | ||
| 539 | #define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) | ||
| 540 | #define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) | ||
| 541 | #define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) | ||
| 542 | #define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK) | ||
| 543 | #define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK) | ||
| 544 | |||
| 545 | |||
| 546 | /* cap masks (for getattr) */ | ||
| 547 | #define CEPH_STAT_CAP_INODE CEPH_CAP_PIN | ||
| 548 | #define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */ | ||
| 549 | #define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN | ||
| 550 | #define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED | ||
| 551 | #define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED | ||
| 552 | #define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED | ||
| 553 | #define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED | ||
| 554 | #define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED | ||
| 555 | #define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED | ||
| 556 | #define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED | ||
| 557 | #define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */ | ||
| 558 | #define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED | ||
| 559 | #define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \ | ||
| 560 | CEPH_CAP_AUTH_SHARED | \ | ||
| 561 | CEPH_CAP_LINK_SHARED | \ | ||
| 562 | CEPH_CAP_FILE_SHARED | \ | ||
| 563 | CEPH_CAP_XATTR_SHARED) | ||
| 564 | |||
| 565 | #define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \ | ||
| 566 | CEPH_CAP_LINK_SHARED | \ | ||
| 567 | CEPH_CAP_XATTR_SHARED | \ | ||
| 568 | CEPH_CAP_FILE_SHARED) | ||
| 569 | #define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \ | ||
| 570 | CEPH_CAP_FILE_CACHE) | ||
| 571 | |||
| 572 | #define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \ | ||
| 573 | CEPH_CAP_LINK_EXCL | \ | ||
| 574 | CEPH_CAP_XATTR_EXCL | \ | ||
| 575 | CEPH_CAP_FILE_EXCL) | ||
| 576 | #define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \ | ||
| 577 | CEPH_CAP_FILE_EXCL) | ||
| 578 | #define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) | ||
| 579 | #define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ | ||
| 580 | CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ | ||
| 581 | CEPH_CAP_PIN) | ||
| 582 | |||
| 583 | #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ | ||
| 584 | CEPH_LOCK_IXATTR) | ||
| 585 | |||
| 586 | int ceph_caps_for_mode(int mode); | ||
| 587 | |||
| 588 | enum { | ||
| 589 | CEPH_CAP_OP_GRANT, /* mds->client grant */ | ||
| 590 | CEPH_CAP_OP_REVOKE, /* mds->client revoke */ | ||
| 591 | CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */ | ||
| 592 | CEPH_CAP_OP_EXPORT, /* mds has exported the cap */ | ||
| 593 | CEPH_CAP_OP_IMPORT, /* mds has imported the cap */ | ||
| 594 | CEPH_CAP_OP_UPDATE, /* client->mds update */ | ||
| 595 | CEPH_CAP_OP_DROP, /* client->mds drop cap bits */ | ||
| 596 | CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */ | ||
| 597 | CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */ | ||
| 598 | CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */ | ||
| 599 | CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */ | ||
| 600 | CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */ | ||
| 601 | CEPH_CAP_OP_RENEW, /* client->mds renewal request */ | ||
| 602 | }; | ||
| 603 | |||
| 604 | extern const char *ceph_cap_op_name(int op); | ||
| 605 | |||
| 606 | /* | ||
| 607 | * caps message, used for capability callbacks, acks, requests, etc. | ||
| 608 | */ | ||
| 609 | struct ceph_mds_caps { | ||
| 610 | __le32 op; /* CEPH_CAP_OP_* */ | ||
| 611 | __le64 ino, realm; | ||
| 612 | __le64 cap_id; | ||
| 613 | __le32 seq, issue_seq; | ||
| 614 | __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */ | ||
| 615 | __le32 migrate_seq; | ||
| 616 | __le64 snap_follows; | ||
| 617 | __le32 snap_trace_len; | ||
| 618 | |||
| 619 | /* authlock */ | ||
| 620 | __le32 uid, gid, mode; | ||
| 621 | |||
| 622 | /* linklock */ | ||
| 623 | __le32 nlink; | ||
| 624 | |||
| 625 | /* xattrlock */ | ||
| 626 | __le32 xattr_len; | ||
| 627 | __le64 xattr_version; | ||
| 628 | |||
| 629 | /* filelock */ | ||
| 630 | __le64 size, max_size, truncate_size; | ||
| 631 | __le32 truncate_seq; | ||
| 632 | struct ceph_timespec mtime, atime, ctime; | ||
| 633 | struct ceph_file_layout layout; | ||
| 634 | __le32 time_warp_seq; | ||
| 635 | } __attribute__ ((packed)); | ||
| 636 | |||
| 637 | /* cap release msg head */ | ||
| 638 | struct ceph_mds_cap_release { | ||
| 639 | __le32 num; /* number of cap_items that follow */ | ||
| 640 | } __attribute__ ((packed)); | ||
| 641 | |||
| 642 | struct ceph_mds_cap_item { | ||
| 643 | __le64 ino; | ||
| 644 | __le64 cap_id; | ||
| 645 | __le32 migrate_seq, seq; | ||
| 646 | } __attribute__ ((packed)); | ||
| 647 | |||
| 648 | #define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */ | ||
| 649 | #define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */ | ||
| 650 | #define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */ | ||
| 651 | #define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */ | ||
| 652 | |||
| 653 | extern const char *ceph_lease_op_name(int o); | ||
| 654 | |||
| 655 | /* lease msg header */ | ||
| 656 | struct ceph_mds_lease { | ||
| 657 | __u8 action; /* CEPH_MDS_LEASE_* */ | ||
| 658 | __le16 mask; /* which lease */ | ||
| 659 | __le64 ino; | ||
| 660 | __le64 first, last; /* snap range */ | ||
| 661 | __le32 seq; | ||
| 662 | __le32 duration_ms; /* duration of renewal */ | ||
| 663 | } __attribute__ ((packed)); | ||
| 664 | /* followed by a __le32+string for dname */ | ||
| 665 | |||
| 666 | /* client reconnect */ | ||
| 667 | struct ceph_mds_cap_reconnect { | ||
| 668 | __le64 cap_id; | ||
| 669 | __le32 wanted; | ||
| 670 | __le32 issued; | ||
| 671 | __le64 snaprealm; | ||
| 672 | __le64 pathbase; /* base ino for our path to this ino */ | ||
| 673 | __le32 flock_len; /* size of flock state blob, if any */ | ||
| 674 | } __attribute__ ((packed)); | ||
| 675 | /* followed by flock blob */ | ||
| 676 | |||
| 677 | struct ceph_mds_cap_reconnect_v1 { | ||
| 678 | __le64 cap_id; | ||
| 679 | __le32 wanted; | ||
| 680 | __le32 issued; | ||
| 681 | __le64 size; | ||
| 682 | struct ceph_timespec mtime, atime; | ||
| 683 | __le64 snaprealm; | ||
| 684 | __le64 pathbase; /* base ino for our path to this ino */ | ||
| 685 | } __attribute__ ((packed)); | ||
| 686 | |||
| 687 | struct ceph_mds_snaprealm_reconnect { | ||
| 688 | __le64 ino; /* snap realm base */ | ||
| 689 | __le64 seq; /* snap seq for this snap realm */ | ||
| 690 | __le64 parent; /* parent realm */ | ||
| 691 | } __attribute__ ((packed)); | ||
| 692 | |||
| 693 | /* | ||
| 694 | * snaps | ||
| 695 | */ | ||
| 696 | enum { | ||
| 697 | CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */ | ||
| 698 | CEPH_SNAP_OP_CREATE, | ||
| 699 | CEPH_SNAP_OP_DESTROY, | ||
| 700 | CEPH_SNAP_OP_SPLIT, | ||
| 701 | }; | ||
| 702 | |||
| 703 | extern const char *ceph_snap_op_name(int o); | ||
| 704 | |||
| 705 | /* snap msg header */ | ||
| 706 | struct ceph_mds_snap_head { | ||
| 707 | __le32 op; /* CEPH_SNAP_OP_* */ | ||
| 708 | __le64 split; /* ino to split off, if any */ | ||
| 709 | __le32 num_split_inos; /* # inos belonging to new child realm */ | ||
| 710 | __le32 num_split_realms; /* # child realms under new child realm */ | ||
| 711 | __le32 trace_len; /* size of snap trace blob */ | ||
| 712 | } __attribute__ ((packed)); | ||
| 713 | /* followed by split ino list, then split realms, then the trace blob */ | ||
| 714 | |||
| 715 | /* | ||
| 716 | * encode info about a snaprealm, as viewed by a client | ||
| 717 | */ | ||
| 718 | struct ceph_mds_snap_realm { | ||
| 719 | __le64 ino; /* ino */ | ||
| 720 | __le64 created; /* snap: when created */ | ||
| 721 | __le64 parent; /* ino: parent realm */ | ||
| 722 | __le64 parent_since; /* snap: same parent since */ | ||
| 723 | __le64 seq; /* snap: version */ | ||
| 724 | __le32 num_snaps; | ||
| 725 | __le32 num_prior_parent_snaps; | ||
| 726 | } __attribute__ ((packed)); | ||
| 727 | /* followed by my snap list, then prior parent snap list */ | ||
| 728 | |||
| 729 | #endif | ||
diff --git a/include/linux/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h new file mode 100644 index 000000000000..d099c3f90236 --- /dev/null +++ b/include/linux/ceph/ceph_hash.h | |||
| @@ -0,0 +1,13 @@ | |||
| 1 | #ifndef FS_CEPH_HASH_H | ||
| 2 | #define FS_CEPH_HASH_H | ||
| 3 | |||
| 4 | #define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ | ||
| 5 | #define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ | ||
| 6 | |||
| 7 | extern unsigned ceph_str_hash_linux(const char *s, unsigned len); | ||
| 8 | extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len); | ||
| 9 | |||
| 10 | extern unsigned ceph_str_hash(int type, const char *s, unsigned len); | ||
| 11 | extern const char *ceph_str_hash_name(int type); | ||
| 12 | |||
| 13 | #endif | ||
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h new file mode 100644 index 000000000000..2a79702e092b --- /dev/null +++ b/include/linux/ceph/debugfs.h | |||
| @@ -0,0 +1,33 @@ | |||
| 1 | #ifndef _FS_CEPH_DEBUGFS_H | ||
| 2 | #define _FS_CEPH_DEBUGFS_H | ||
| 3 | |||
| 4 | #include "ceph_debug.h" | ||
| 5 | #include "types.h" | ||
| 6 | |||
| 7 | #define CEPH_DEFINE_SHOW_FUNC(name) \ | ||
| 8 | static int name##_open(struct inode *inode, struct file *file) \ | ||
| 9 | { \ | ||
| 10 | /* \ | ||
| 11 | * single_open() stores its third argument in seq_file->private \ | ||
| 12 | * on success, so hand it i_private directly. Reaching into \ | ||
| 13 | * file->private_data after the call would dereference an unset \ | ||
| 14 | * pointer whenever single_open() failed (e.g. -ENOMEM). \ | ||
| 15 | */ \ | ||
| 16 | return single_open(file, name, inode->i_private); \ | ||
| 17 | } \ | ||
| 18 | \ | ||
| 19 | static const struct file_operations name##_fops = { \ | ||
| 20 | .open = name##_open, \ | ||
| 21 | .read = seq_read, \ | ||
| 22 | .llseek = seq_lseek, \ | ||
| 23 | .release = single_release, \ | ||
| 24 | }; | ||
| 25 | |||
| 26 | /* debugfs.c */ | ||
| 27 | extern int ceph_debugfs_init(void); | ||
| 28 | extern void ceph_debugfs_cleanup(void); | ||
| 29 | extern int ceph_debugfs_client_init(struct ceph_client *client); | ||
| 30 | extern void ceph_debugfs_client_cleanup(struct ceph_client *client); | ||
| 31 | |||
| 32 | #endif | ||
| 33 | |||
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h new file mode 100644 index 000000000000..c5b6939fb32a --- /dev/null +++ b/include/linux/ceph/decode.h | |||
| @@ -0,0 +1,201 @@ | |||
| 1 | #ifndef __CEPH_DECODE_H | ||
| 2 | #define __CEPH_DECODE_H | ||
| 3 | |||
| 4 | #include <asm/unaligned.h> | ||
| 5 | #include <linux/time.h> | ||
| 6 | |||
| 7 | #include "types.h" | ||
| 8 | |||
| 9 | /* | ||
| 10 | * in all cases, | ||
| 11 | * void **p pointer to position pointer | ||
| 12 | * void *end pointer to end of buffer (last byte + 1) | ||
| 13 | */ | ||
| 14 | |||
| 15 | static inline u64 ceph_decode_64(void **p) | ||
| 16 | { | ||
| 17 | u64 v = get_unaligned_le64(*p); | ||
| 18 | *p += sizeof(u64); | ||
| 19 | return v; | ||
| 20 | } | ||
| 21 | static inline u32 ceph_decode_32(void **p) | ||
| 22 | { | ||
| 23 | u32 v = get_unaligned_le32(*p); | ||
| 24 | *p += sizeof(u32); | ||
| 25 | return v; | ||
| 26 | } | ||
| 27 | static inline u16 ceph_decode_16(void **p) | ||
| 28 | { | ||
| 29 | u16 v = get_unaligned_le16(*p); | ||
| 30 | *p += sizeof(u16); | ||
| 31 | return v; | ||
| 32 | } | ||
| 33 | static inline u8 ceph_decode_8(void **p) | ||
| 34 | { | ||
| 35 | u8 v = *(u8 *)*p; | ||
| 36 | (*p)++; | ||
| 37 | return v; | ||
| 38 | } | ||
| 39 | static inline void ceph_decode_copy(void **p, void *pv, size_t n) | ||
| 40 | { | ||
| 41 | memcpy(pv, *p, n); | ||
| 42 | *p += n; | ||
| 43 | } | ||
| 44 | |||
| 45 | /* bounds check input; n is compared to the room left so *p + n cannot wrap */ | ||
| 46 | #define ceph_decode_need(p, end, n, bad) \ | ||
| 47 | do { \ | ||
| 48 | const char *__need_cur = *(p), *__need_end = (end); \ | ||
| 49 | if (unlikely(__need_end < __need_cur || \ | ||
| 50 | (size_t)(n) > (size_t)(__need_end - __need_cur))) \ | ||
| 51 | goto bad; \ | ||
| 52 | } while (0) | ||
| 53 | |||
| 54 | #define ceph_decode_64_safe(p, end, v, bad) \ | ||
| 55 | do { \ | ||
| 56 | ceph_decode_need(p, end, sizeof(u64), bad); \ | ||
| 57 | v = ceph_decode_64(p); \ | ||
| 58 | } while (0) | ||
| 59 | #define ceph_decode_32_safe(p, end, v, bad) \ | ||
| 60 | do { \ | ||
| 61 | ceph_decode_need(p, end, sizeof(u32), bad); \ | ||
| 62 | v = ceph_decode_32(p); \ | ||
| 63 | } while (0) | ||
| 64 | #define ceph_decode_16_safe(p, end, v, bad) \ | ||
| 65 | do { \ | ||
| 66 | ceph_decode_need(p, end, sizeof(u16), bad); \ | ||
| 67 | v = ceph_decode_16(p); \ | ||
| 68 | } while (0) | ||
| 69 | #define ceph_decode_8_safe(p, end, v, bad) \ | ||
| 70 | do { \ | ||
| 71 | ceph_decode_need(p, end, sizeof(u8), bad); \ | ||
| 72 | v = ceph_decode_8(p); \ | ||
| 73 | } while (0) | ||
| 74 | |||
| 75 | #define ceph_decode_copy_safe(p, end, pv, n, bad) \ | ||
| 76 | do { \ | ||
| 77 | ceph_decode_need(p, end, n, bad); \ | ||
| 78 | ceph_decode_copy(p, pv, n); \ | ||
| 79 | } while (0) | ||
| 80 | |||
| 81 | /* | ||
| 82 | * struct ceph_timespec <-> struct timespec | ||
| 83 | */ | ||
| 84 | static inline void ceph_decode_timespec(struct timespec *ts, | ||
| 85 | const struct ceph_timespec *tv) | ||
| 86 | { | ||
| 87 | ts->tv_sec = le32_to_cpu(tv->tv_sec); | ||
| 88 | ts->tv_nsec = le32_to_cpu(tv->tv_nsec); | ||
| 89 | } | ||
| 90 | static inline void ceph_encode_timespec(struct ceph_timespec *tv, | ||
| 91 | const struct timespec *ts) | ||
| 92 | { | ||
| 93 | tv->tv_sec = cpu_to_le32(ts->tv_sec); | ||
| 94 | tv->tv_nsec = cpu_to_le32(ts->tv_nsec); | ||
| 95 | } | ||
| 96 | |||
| 97 | /* | ||
| 98 | * sockaddr_storage <-> ceph_sockaddr | ||
| 99 | */ | ||
| 100 | static inline void ceph_encode_addr(struct ceph_entity_addr *a) | ||
| 101 | { | ||
| 102 | __be16 ss_family = htons(a->in_addr.ss_family); | ||
| 103 | a->in_addr.ss_family = *(__u16 *)&ss_family; | ||
| 104 | } | ||
| 105 | static inline void ceph_decode_addr(struct ceph_entity_addr *a) | ||
| 106 | { | ||
| 107 | __be16 ss_family = *(__be16 *)&a->in_addr.ss_family; /* field is read as big-endian */ | ||
| 108 | a->in_addr.ss_family = ntohs(ss_family); /* convert in place to host order */ | ||
| 109 | WARN_ON(a->in_addr.ss_family == 512); /* NOTE(review): 512 == 0x0200, i.e. 2 byte-swapped; presumably flags an address that was already in host order - confirm */ | ||
| 110 | } | ||
| 111 | |||
| 112 | /* | ||
| 113 | * encoders | ||
| 114 | */ | ||
| 115 | static inline void ceph_encode_64(void **p, u64 v) | ||
| 116 | { | ||
| 117 | put_unaligned_le64(v, (__le64 *)*p); | ||
| 118 | *p += sizeof(u64); | ||
| 119 | } | ||
| 120 | static inline void ceph_encode_32(void **p, u32 v) | ||
| 121 | { | ||
| 122 | put_unaligned_le32(v, (__le32 *)*p); | ||
| 123 | *p += sizeof(u32); | ||
| 124 | } | ||
| 125 | static inline void ceph_encode_16(void **p, u16 v) | ||
| 126 | { | ||
| 127 | put_unaligned_le16(v, (__le16 *)*p); | ||
| 128 | *p += sizeof(u16); | ||
| 129 | } | ||
| 130 | static inline void ceph_encode_8(void **p, u8 v) | ||
| 131 | { | ||
| 132 | *(u8 *)*p = v; | ||
| 133 | (*p)++; | ||
| 134 | } | ||
| 135 | static inline void ceph_encode_copy(void **p, const void *s, int len) | ||
| 136 | { | ||
| 137 | memcpy(*p, s, len); | ||
| 138 | *p += len; | ||
| 139 | } | ||
| 140 | |||
| 141 | /* | ||
| 142 | * filepath, string encoders | ||
| 143 | */ | ||
| 144 | static inline void ceph_encode_filepath(void **p, void *end, | ||
| 145 | u64 ino, const char *path) | ||
| 146 | { | ||
| 147 | u32 len = path ? strlen(path) : 0; /* a NULL path is encoded with length 0 */ | ||
| 148 | BUG_ON(*p + 1 + sizeof(ino) + sizeof(len) + len > end); /* u8 prefix + ino + length word + path bytes */ | ||
| 149 | ceph_encode_8(p, 1); | ||
| 150 | ceph_encode_64(p, ino); | ||
| 151 | ceph_encode_32(p, len); | ||
| 152 | if (len) | ||
| 153 | memcpy(*p, path, len); | ||
| 154 | *p += len; /* path bytes are written without a terminating NUL */ | ||
| 155 | } | ||
| 156 | |||
| 157 | static inline void ceph_encode_string(void **p, void *end, | ||
| 158 | const char *s, u32 len) | ||
| 159 | { | ||
| 160 | BUG_ON(*p + sizeof(len) + len > end); | ||
| 161 | ceph_encode_32(p, len); | ||
| 162 | if (len) | ||
| 163 | memcpy(*p, s, len); | ||
| 164 | *p += len; | ||
| 165 | } | ||
| 166 | |||
| 167 | /* | ||
| 168 | * Encoders need exactly the same room test as decoders, so reuse | ||
| 169 | * ceph_decode_need() instead of carrying a second copy of it. | ||
| 170 | */ | ||
| 171 | #define ceph_encode_need(p, end, n, bad) ceph_decode_need(p, end, n, bad) | ||
| 172 | |||
| 173 | #define ceph_encode_64_safe(p, end, v, bad) \ | ||
| 174 | do { \ | ||
| 175 | ceph_encode_need(p, end, sizeof(u64), bad); \ | ||
| 176 | ceph_encode_64(p, v); \ | ||
| 177 | } while (0) | ||
| 178 | #define ceph_encode_32_safe(p, end, v, bad) \ | ||
| 179 | do { \ | ||
| 180 | ceph_encode_need(p, end, sizeof(u32), bad); \ | ||
| 181 | ceph_encode_32(p, v); \ | ||
| 182 | } while (0) | ||
| 183 | #define ceph_encode_16_safe(p, end, v, bad) \ | ||
| 184 | do { \ | ||
| 185 | ceph_encode_need(p, end, sizeof(u16), bad); \ | ||
| 186 | ceph_encode_16(p, v); \ | ||
| 187 | } while (0) | ||
| 188 | |||
| 189 | #define ceph_encode_copy_safe(p, end, pv, n, bad) \ | ||
| 190 | do { \ | ||
| 191 | ceph_encode_need(p, end, n, bad); \ | ||
| 192 | ceph_encode_copy(p, pv, n); \ | ||
| 193 | } while (0) | ||
| 194 | #define ceph_encode_string_safe(p, end, s, n, bad) \ | ||
| 195 | do { \ | ||
| 196 | ceph_encode_need(p, end, sizeof(u32) + (n), bad); /* length word + n bytes */ \ | ||
| 197 | ceph_encode_string(p, end, s, n); \ | ||
| 198 | } while (0) | ||
| 199 | |||
| 200 | |||
| 201 | #endif | ||
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h new file mode 100644 index 000000000000..f22b2e941686 --- /dev/null +++ b/include/linux/ceph/libceph.h | |||
| @@ -0,0 +1,249 @@ | |||
| 1 | #ifndef _FS_CEPH_LIBCEPH_H | ||
| 2 | #define _FS_CEPH_LIBCEPH_H | ||
| 3 | |||
| 4 | #include "ceph_debug.h" | ||
| 5 | |||
| 6 | #include <asm/unaligned.h> | ||
| 7 | #include <linux/backing-dev.h> | ||
| 8 | #include <linux/completion.h> | ||
| 9 | #include <linux/exportfs.h> | ||
| 10 | #include <linux/fs.h> | ||
| 11 | #include <linux/mempool.h> | ||
| 12 | #include <linux/pagemap.h> | ||
| 13 | #include <linux/wait.h> | ||
| 14 | #include <linux/writeback.h> | ||
| 15 | #include <linux/slab.h> | ||
| 16 | |||
| 17 | #include "types.h" | ||
| 18 | #include "messenger.h" | ||
| 19 | #include "msgpool.h" | ||
| 20 | #include "mon_client.h" | ||
| 21 | #include "osd_client.h" | ||
| 22 | #include "ceph_fs.h" | ||
| 23 | |||
| 24 | /* | ||
| 25 | * Supported features | ||
| 26 | */ | ||
| 27 | #define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR | ||
| 28 | #define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR | ||
| 29 | |||
| 30 | /* | ||
| 31 | * mount options | ||
| 32 | */ | ||
| 33 | #define CEPH_OPT_FSID (1<<0) | ||
| 34 | #define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ | ||
| 35 | #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ | ||
| 36 | #define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ | ||
| 37 | |||
| 38 | #define CEPH_OPT_DEFAULT (0) /* no CEPH_OPT_* flag set; no ';' so it is usable inside expressions */ | ||
| 39 | |||
| 40 | #define ceph_set_opt(client, opt) /* sets CEPH_OPT_<opt>; caller supplies the ';' */ \ | ||
| 41 | ((client)->options->flags |= CEPH_OPT_##opt) | ||
| 42 | #define ceph_test_opt(client, opt) \ | ||
| 43 | (!!((client)->options->flags & CEPH_OPT_##opt)) | ||
| 44 | |||
| 45 | struct ceph_options { | ||
| 46 | int flags; | ||
| 47 | struct ceph_fsid fsid; | ||
| 48 | struct ceph_entity_addr my_addr; | ||
| 49 | int mount_timeout; | ||
| 50 | int osd_idle_ttl; | ||
| 51 | int osd_timeout; | ||
| 52 | int osd_keepalive_timeout; | ||
| 53 | |||
| 54 | /* | ||
| 55 | * any type that can't be simply compared or doesn't need | ||
| 56 | * to be compared should go beyond this point, | ||
| 57 | * ceph_compare_options() should be updated accordingly | ||
| 58 | */ | ||
| 59 | |||
| 60 | struct ceph_entity_addr *mon_addr; /* should be the first | ||
| 61 | pointer type of args */ | ||
| 62 | int num_mon; | ||
| 63 | char *name; | ||
| 64 | char *secret; | ||
| 65 | }; | ||
| 66 | |||
| 67 | /* | ||
| 68 | * defaults | ||
| 69 | */ | ||
| 70 | #define CEPH_MOUNT_TIMEOUT_DEFAULT 60 | ||
| 71 | #define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ | ||
| 72 | #define CEPH_OSD_KEEPALIVE_DEFAULT 5 | ||
| 73 | #define CEPH_OSD_IDLE_TTL_DEFAULT 60 | ||
| 74 | #define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ | ||
| 75 | |||
| 76 | #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) | ||
| 77 | #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) | ||
| 78 | |||
| 79 | #define CEPH_AUTH_NAME_DEFAULT "guest" | ||
| 80 | |||
| 81 | /* | ||
| 82 | * Delay telling the MDS we no longer want caps, in case we reopen | ||
| 83 | * the file. Delay a minimum amount of time, even if we send a cap | ||
| 84 | * message for some other reason. Otherwise, take the opportunity to | ||
| 85 | * update the mds to avoid sending another message later. | ||
| 86 | */ | ||
| 87 | #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ | ||
| 88 | #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ | ||
| 89 | |||
| 90 | #define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) | ||
| 91 | |||
| 92 | /* mount state */ | ||
| 93 | enum { | ||
| 94 | CEPH_MOUNT_MOUNTING, | ||
| 95 | CEPH_MOUNT_MOUNTED, | ||
| 96 | CEPH_MOUNT_UNMOUNTING, | ||
| 97 | CEPH_MOUNT_UNMOUNTED, | ||
| 98 | CEPH_MOUNT_SHUTDOWN, | ||
| 99 | }; | ||
| 100 | |||
| 101 | /* | ||
| 102 | * subtract jiffies | ||
| 103 | */ | ||
| 104 | static inline unsigned long time_sub(unsigned long a, unsigned long b) | ||
| 105 | { | ||
| 106 | BUG_ON(time_after(b, a)); | ||
| 107 | return (long)a - (long)b; | ||
| 108 | } | ||
| 109 | |||
| 110 | struct ceph_mds_client; | ||
| 111 | |||
| 112 | /* | ||
| 113 | * per client state | ||
| 114 | * | ||
| 115 | * possibly shared by multiple mount points, if they are | ||
| 116 | * mounting the same ceph filesystem/cluster. | ||
| 117 | */ | ||
| 118 | struct ceph_client { | ||
| 119 | struct ceph_fsid fsid; | ||
| 120 | bool have_fsid; | ||
| 121 | |||
| 122 | void *private; | ||
| 123 | |||
| 124 | struct ceph_options *options; | ||
| 125 | |||
| 126 | struct mutex mount_mutex; /* serialize mount attempts */ | ||
| 127 | wait_queue_head_t auth_wq; | ||
| 128 | int auth_err; | ||
| 129 | |||
| 130 | int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); | ||
| 131 | |||
| 132 | u32 supported_features; | ||
| 133 | u32 required_features; | ||
| 134 | |||
| 135 | struct ceph_messenger *msgr; /* messenger instance */ | ||
| 136 | struct ceph_mon_client monc; | ||
| 137 | struct ceph_osd_client osdc; | ||
| 138 | |||
| 139 | #ifdef CONFIG_DEBUG_FS | ||
| 140 | struct dentry *debugfs_dir; | ||
| 141 | struct dentry *debugfs_monmap; | ||
| 142 | struct dentry *debugfs_osdmap; | ||
| 143 | #endif | ||
| 144 | }; | ||
| 145 | |||
| 146 | |||
| 147 | |||
| 148 | /* | ||
| 149 | * snapshots | ||
| 150 | */ | ||
| 151 | |||
| 152 | /* | ||
| 153 | * A "snap context" is the set of existing snapshots when we | ||
| 154 | * write data. It is used by the OSD to guide its COW behavior. | ||
| 155 | * | ||
| 156 | * The ceph_snap_context is refcounted, and attached to each dirty | ||
| 157 | * page, indicating which context the dirty data belonged when it was | ||
| 158 | * dirtied. | ||
| 159 | */ | ||
| 160 | struct ceph_snap_context { | ||
| 161 | atomic_t nref; | ||
| 162 | u64 seq; | ||
| 163 | int num_snaps; | ||
| 164 | u64 snaps[]; | ||
| 165 | }; | ||
| 166 | |||
| 167 | static inline struct ceph_snap_context * | ||
| 168 | ceph_get_snap_context(struct ceph_snap_context *sc) | ||
| 169 | { | ||
| 170 | /* | ||
| 171 | printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), | ||
| 172 | atomic_read(&sc->nref)+1); | ||
| 173 | */ | ||
| 174 | if (sc) | ||
| 175 | atomic_inc(&sc->nref); | ||
| 176 | return sc; | ||
| 177 | } | ||
| 178 | |||
| 179 | static inline void ceph_put_snap_context(struct ceph_snap_context *sc) | ||
| 180 | { | ||
| 181 | if (!sc) | ||
| 182 | return; | ||
| 183 | /* | ||
| 184 | printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), | ||
| 185 | atomic_read(&sc->nref)-1); | ||
| 186 | */ | ||
| 187 | if (atomic_dec_and_test(&sc->nref)) { | ||
| 188 | /*printk(" deleting snap_context %p\n", sc);*/ | ||
| 189 | kfree(sc); | ||
| 190 | } | ||
| 191 | } | ||
| 192 | |||
| 193 | /* | ||
| 194 | * calculate the number of pages a given length and offset map onto, | ||
| 195 | * if we align the data. | ||
| 196 | */ | ||
| 197 | static inline int calc_pages_for(u64 off, u64 len) | ||
| 198 | { | ||
| 199 | u64 end_pg = (off + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; /* page index just past the data, rounded up */ | ||
| 200 | return end_pg - (off >> PAGE_CACHE_SHIFT); /* minus the page holding off */ | ||
| 201 | } | ||
| 202 | |||
| 203 | /* ceph_common.c */ | ||
| 204 | extern const char *ceph_msg_type_name(int type); | ||
| 205 | extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); | ||
| 206 | extern struct kmem_cache *ceph_inode_cachep; | ||
| 207 | extern struct kmem_cache *ceph_cap_cachep; | ||
| 208 | extern struct kmem_cache *ceph_dentry_cachep; | ||
| 209 | extern struct kmem_cache *ceph_file_cachep; | ||
| 210 | |||
| 211 | extern int ceph_parse_options(struct ceph_options **popt, char *options, | ||
| 212 | const char *dev_name, const char *dev_name_end, | ||
| 213 | int (*parse_extra_token)(char *c, void *private), | ||
| 214 | void *private); | ||
| 215 | extern void ceph_destroy_options(struct ceph_options *opt); | ||
| 216 | extern int ceph_compare_options(struct ceph_options *new_opt, | ||
| 217 | struct ceph_client *client); | ||
| 218 | extern struct ceph_client *ceph_create_client(struct ceph_options *opt, | ||
| 219 | void *private); | ||
| 220 | extern u64 ceph_client_id(struct ceph_client *client); | ||
| 221 | extern void ceph_destroy_client(struct ceph_client *client); | ||
| 222 | extern int __ceph_open_session(struct ceph_client *client, | ||
| 223 | unsigned long started); | ||
| 224 | extern int ceph_open_session(struct ceph_client *client); | ||
| 225 | |||
| 226 | /* pagevec.c */ | ||
| 227 | extern void ceph_release_page_vector(struct page **pages, int num_pages); | ||
| 228 | |||
| 229 | extern struct page **ceph_get_direct_page_vector(const char __user *data, | ||
| 230 | int num_pages, | ||
| 231 | loff_t off, size_t len); | ||
| 232 | extern void ceph_put_page_vector(struct page **pages, int num_pages); | ||
| 233 | extern void ceph_release_page_vector(struct page **pages, int num_pages); | ||
| 234 | extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); | ||
| 235 | extern int ceph_copy_user_to_page_vector(struct page **pages, | ||
| 236 | const char __user *data, | ||
| 237 | loff_t off, size_t len); | ||
| 238 | extern int ceph_copy_to_page_vector(struct page **pages, | ||
| 239 | const char *data, | ||
| 240 | loff_t off, size_t len); | ||
| 241 | extern int ceph_copy_from_page_vector(struct page **pages, | ||
| 242 | char *data, | ||
| 243 | loff_t off, size_t len); | ||
| 244 | extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data, | ||
| 245 | loff_t off, size_t len); | ||
| 246 | extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); | ||
| 247 | |||
| 248 | |||
| 249 | #endif /* _FS_CEPH_LIBCEPH_H */ | ||
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h new file mode 100644 index 000000000000..4c5cb0880bba --- /dev/null +++ b/include/linux/ceph/mdsmap.h | |||
| @@ -0,0 +1,62 @@ | |||
| 1 | #ifndef _FS_CEPH_MDSMAP_H | ||
| 2 | #define _FS_CEPH_MDSMAP_H | ||
| 3 | |||
| 4 | #include "types.h" | ||
| 5 | |||
| 6 | /* | ||
| 7 | * mds map - describe servers in the mds cluster. | ||
| 8 | * | ||
| 9 | * we limit fields to those the client actually cares about | ||
| 10 | */ | ||
| 11 | struct ceph_mds_info { | ||
| 12 | u64 global_id; | ||
| 13 | struct ceph_entity_addr addr; | ||
| 14 | s32 state; | ||
| 15 | int num_export_targets; | ||
| 16 | bool laggy; | ||
| 17 | u32 *export_targets; | ||
| 18 | }; | ||
| 19 | |||
| 20 | struct ceph_mdsmap { | ||
| 21 | u32 m_epoch, m_client_epoch, m_last_failure; | ||
| 22 | u32 m_root; | ||
| 23 | u32 m_session_timeout; /* seconds */ | ||
| 24 | u32 m_session_autoclose; /* seconds */ | ||
| 25 | u64 m_max_file_size; | ||
| 26 | u32 m_max_mds; /* size of m_info array */ | ||
| 27 | struct ceph_mds_info *m_info; /* per-mds entries; the inline accessors bound their index by m_max_mds */ | ||
| 28 | |||
| 29 | /* which object pools file data can be stored in */ | ||
| 30 | int m_num_data_pg_pools; | ||
| 31 | u32 *m_data_pg_pools; | ||
| 32 | u32 m_cas_pg_pool; | ||
| 33 | }; | ||
| 34 | |||
| 35 | static inline struct ceph_entity_addr * | ||
| 36 | ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) | ||
| 37 | { | ||
| 38 | /* address of entry w, or NULL unless 0 <= w < m_max_mds; w < 0 is tested explicitly, not via int->u32 conversion */ | ||
| 39 | return (w < 0 || w >= m->m_max_mds) ? | ||
| 40 | NULL : &m->m_info[w].addr; | ||
| 41 | } | ||
| 42 | |||
| 43 | static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) | ||
| 44 | { | ||
| 45 | BUG_ON(w < 0); /* a negative index is treated as a caller bug */ | ||
| 46 | if (w < m->m_max_mds) /* in range: report the recorded state */ | ||
| 47 | return m->m_info[w].state; | ||
| 48 | return CEPH_MDS_STATE_DNE; /* index at or past m_max_mds */ | ||
| 49 | } | ||
| 50 | |||
| 51 | static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) | ||
| 52 | { | ||
| 53 | if (w < 0 || w >= m->m_max_mds) /* out-of-range index: never laggy */ | ||
| 54 | return false; | ||
| 55 | return m->m_info[w].laggy; /* laggy flag stored for entry w */ | ||
| 56 | } | ||
| 57 | |||
| 58 | extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); | ||
| 59 | extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); | ||
| 60 | extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); | ||
| 61 | |||
| 62 | #endif | ||
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h new file mode 100644 index 000000000000..5956d62c3057 --- /dev/null +++ b/include/linux/ceph/messenger.h | |||
| @@ -0,0 +1,261 @@ | |||
| 1 | #ifndef __FS_CEPH_MESSENGER_H | ||
| 2 | #define __FS_CEPH_MESSENGER_H | ||
| 3 | |||
| 4 | #include <linux/kref.h> | ||
| 5 | #include <linux/mutex.h> | ||
| 6 | #include <linux/net.h> | ||
| 7 | #include <linux/radix-tree.h> | ||
| 8 | #include <linux/uio.h> | ||
| 9 | #include <linux/version.h> | ||
| 10 | #include <linux/workqueue.h> | ||
| 11 | |||
| 12 | #include "types.h" | ||
| 13 | #include "buffer.h" | ||
| 14 | |||
| 15 | struct ceph_msg; | ||
| 16 | struct ceph_connection; | ||
| 17 | |||
| 18 | extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */ | ||
| 19 | |||
| 20 | /* | ||
| 21 | * Ceph defines these callbacks for handling connection events. | ||
| 22 | */ | ||
| 23 | struct ceph_connection_operations { | ||
| 24 | struct ceph_connection *(*get)(struct ceph_connection *); | ||
| 25 | void (*put)(struct ceph_connection *); | ||
| 26 | |||
| 27 | /* handle an incoming message. */ | ||
| 28 | void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m); | ||
| 29 | |||
| 30 | /* authorize an outgoing connection */ | ||
| 31 | int (*get_authorizer) (struct ceph_connection *con, | ||
| 32 | void **buf, int *len, int *proto, | ||
| 33 | void **reply_buf, int *reply_len, int force_new); | ||
| 34 | int (*verify_authorizer_reply) (struct ceph_connection *con, int len); | ||
| 35 | int (*invalidate_authorizer)(struct ceph_connection *con); | ||
| 36 | |||
| 37 | /* protocol version mismatch */ | ||
| 38 | void (*bad_proto) (struct ceph_connection *con); | ||
| 39 | |||
| 40 | /* there was some error on the socket (disconnect, whatever) */ | ||
| 41 | void (*fault) (struct ceph_connection *con); | ||
| 42 | |||
| 43 | /* a remote host has terminated a message exchange session, and messages | ||
| 44 | * we sent (or they tried to send us) may be lost. */ | ||
| 45 | void (*peer_reset) (struct ceph_connection *con); | ||
| 46 | |||
| 47 | struct ceph_msg * (*alloc_msg) (struct ceph_connection *con, | ||
| 48 | struct ceph_msg_header *hdr, | ||
| 49 | int *skip); | ||
| 50 | }; | ||
| 51 | |||
| 52 | /* use format string %s%d */ | ||
| 53 | #define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) | ||
| 54 | |||
| 55 | struct ceph_messenger { | ||
| 56 | struct ceph_entity_inst inst; /* my name+address */ | ||
| 57 | struct ceph_entity_addr my_enc_addr; | ||
| 58 | struct page *zero_page; /* used in certain error cases */ | ||
| 59 | |||
| 60 | bool nocrc; | ||
| 61 | |||
| 62 | /* | ||
| 63 | * the global_seq counts connections i (attempt to) initiate | ||
| 64 | * in order to disambiguate certain connect race conditions. | ||
| 65 | */ | ||
| 66 | u32 global_seq; | ||
| 67 | spinlock_t global_seq_lock; | ||
| 68 | |||
| 69 | u32 supported_features; | ||
| 70 | u32 required_features; | ||
| 71 | }; | ||
| 72 | |||
| 73 | /* | ||
| 74 | * a single message. it contains a header (src, dest, message type, etc.), | ||
| 75 | * footer (crc values, mainly), a "front" message body, and possibly a | ||
| 76 | * data payload (stored in some number of pages). | ||
| 77 | */ | ||
| 78 | struct ceph_msg { | ||
| 79 | struct ceph_msg_header hdr; /* header */ | ||
| 80 | struct ceph_msg_footer footer; /* footer */ | ||
| 81 | struct kvec front; /* unaligned blobs of message */ | ||
| 82 | struct ceph_buffer *middle; | ||
| 83 | struct page **pages; /* data payload. NOT OWNER. */ | ||
| 84 | unsigned nr_pages; /* size of page array */ | ||
| 85 | struct ceph_pagelist *pagelist; /* instead of pages */ | ||
| 86 | struct list_head list_head; | ||
| 87 | struct kref kref; | ||
| 88 | struct bio *bio; /* instead of pages/pagelist */ | ||
| 89 | struct bio *bio_iter; /* bio iterator */ | ||
| 90 | int bio_seg; /* current bio segment */ | ||
| 91 | struct ceph_pagelist *trail; /* the trailing part of the data */ | ||
| 92 | bool front_is_vmalloc; | ||
| 93 | bool more_to_follow; | ||
| 94 | bool needs_out_seq; | ||
| 95 | int front_max; | ||
| 96 | |||
| 97 | struct ceph_msgpool *pool; | ||
| 98 | }; | ||
| 99 | |||
| 100 | struct ceph_msg_pos { | ||
| 101 | int page, page_pos; /* which page; offset in page */ | ||
| 102 | int data_pos; /* offset in data payload */ | ||
| 103 | int did_page_crc; /* true if we've calculated crc for current page */ | ||
| 104 | }; | ||
| 105 | |||
| 106 | /* ceph connection fault delay defaults, for exponential backoff */ | ||
| 107 | #define BASE_DELAY_INTERVAL (HZ/2) | ||
| 108 | #define MAX_DELAY_INTERVAL (5 * 60 * HZ) | ||
| 109 | |||
| 110 | /* | ||
| 111 | * ceph_connection state bit flags | ||
| 112 | * | ||
| 113 | * QUEUED and BUSY are used together to ensure that only a single | ||
| 114 | * thread is currently opening, reading or writing data to the socket. | ||
| 115 | */ | ||
| 116 | #define LOSSYTX 0 /* we can close channel or drop messages on errors */ | ||
| 117 | #define CONNECTING 1 | ||
| 118 | #define NEGOTIATING 2 | ||
| 119 | #define KEEPALIVE_PENDING 3 | ||
| 120 | #define WRITE_PENDING 4 /* we have data ready to send */ | ||
| 121 | #define QUEUED 5 /* there is work queued on this connection */ | ||
| 122 | #define BUSY 6 /* work is being done */ | ||
| 123 | #define STANDBY 8 /* no outgoing messages, socket closed. we keep | ||
| 124 | * the ceph_connection around to maintain shared | ||
| 125 | * state with the peer. */ | ||
| 126 | #define CLOSED 10 /* we've closed the connection */ | ||
| 127 | #define SOCK_CLOSED 11 /* socket state changed to closed */ | ||
| 128 | #define OPENING 13 /* open connection w/ (possibly new) peer */ | ||
| 129 | #define DEAD 14 /* dead, about to kfree */ | ||
| 130 | |||
| 131 | /* | ||
| 132 | * A single connection with another host. | ||
| 133 | * | ||
| 134 | * We maintain a queue of outgoing messages, and some session state to | ||
| 135 | * ensure that we can preserve the lossless, ordered delivery of | ||
| 136 | * messages in the case of a TCP disconnect. | ||
| 137 | */ | ||
| 138 | struct ceph_connection { | ||
| 139 | void *private; | ||
| 140 | atomic_t nref; | ||
| 141 | |||
| 142 | const struct ceph_connection_operations *ops; | ||
| 143 | |||
| 144 | struct ceph_messenger *msgr; | ||
| 145 | struct socket *sock; | ||
| 146 | unsigned long state; /* connection state (see flags above) */ | ||
| 147 | const char *error_msg; /* error message, if any */ | ||
| 148 | |||
| 149 | struct ceph_entity_addr peer_addr; /* peer address */ | ||
| 150 | struct ceph_entity_name peer_name; /* peer name */ | ||
| 151 | struct ceph_entity_addr peer_addr_for_me; | ||
| 152 | unsigned peer_features; | ||
| 153 | u32 connect_seq; /* identify the most recent connection | ||
| 154 | attempt for this connection, client */ | ||
| 155 | u32 peer_global_seq; /* peer's global seq for this connection */ | ||
| 156 | |||
| 157 | int auth_retry; /* true if we need a newer authorizer */ | ||
| 158 | void *auth_reply_buf; /* where to put the authorizer reply */ | ||
| 159 | int auth_reply_buf_len; | ||
| 160 | |||
| 161 | struct mutex mutex; | ||
| 162 | |||
| 163 | /* out queue */ | ||
| 164 | struct list_head out_queue; | ||
| 165 | struct list_head out_sent; /* sending or sent but unacked */ | ||
| 166 | u64 out_seq; /* last message queued for send */ | ||
| 167 | bool out_keepalive_pending; | ||
| 168 | |||
| 169 | u64 in_seq, in_seq_acked; /* last message received, acked */ | ||
| 170 | |||
| 171 | /* connection negotiation temps */ | ||
| 172 | char in_banner[CEPH_BANNER_MAX_LEN]; | ||
| 173 | union { | ||
| 174 | struct { /* outgoing connection */ | ||
| 175 | struct ceph_msg_connect out_connect; | ||
| 176 | struct ceph_msg_connect_reply in_reply; | ||
| 177 | }; | ||
| 178 | struct { /* incoming */ | ||
| 179 | struct ceph_msg_connect in_connect; | ||
| 180 | struct ceph_msg_connect_reply out_reply; | ||
| 181 | }; | ||
| 182 | }; | ||
| 183 | struct ceph_entity_addr actual_peer_addr; | ||
| 184 | |||
| 185 | /* message out temps */ | ||
| 186 | struct ceph_msg *out_msg; /* sending message (== tail of | ||
| 187 | out_sent) */ | ||
| 188 | bool out_msg_done; | ||
| 189 | struct ceph_msg_pos out_msg_pos; | ||
| 190 | |||
| 191 | struct kvec out_kvec[8], /* sending header/footer data */ | ||
| 192 | *out_kvec_cur; | ||
| 193 | int out_kvec_left; /* kvec's left in out_kvec */ | ||
| 194 | int out_skip; /* skip this many bytes */ | ||
| 195 | int out_kvec_bytes; /* total bytes left */ | ||
| 196 | bool out_kvec_is_msg; /* kvec refers to out_msg */ | ||
| 197 | int out_more; /* there is more data after the kvecs */ | ||
| 198 | __le64 out_temp_ack; /* for writing an ack */ | ||
| 199 | |||
| 200 | /* message in temps */ | ||
| 201 | struct ceph_msg_header in_hdr; | ||
| 202 | struct ceph_msg *in_msg; | ||
| 203 | struct ceph_msg_pos in_msg_pos; | ||
| 204 | u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ | ||
| 205 | |||
| 206 | char in_tag; /* protocol control byte */ | ||
| 207 | int in_base_pos; /* bytes read */ | ||
| 208 | __le64 in_temp_ack; /* for reading an ack */ | ||
| 209 | |||
| 210 | struct delayed_work work; /* send|recv work */ | ||
| 211 | unsigned long delay; /* current delay interval */ | ||
| 212 | }; | ||
| 213 | |||
| 214 | |||
| 215 | extern const char *ceph_pr_addr(const struct sockaddr_storage *ss); | ||
| 216 | extern int ceph_parse_ips(const char *c, const char *end, | ||
| 217 | struct ceph_entity_addr *addr, | ||
| 218 | int max_count, int *count); | ||
| 219 | |||
| 220 | |||
| 221 | extern int ceph_msgr_init(void); | ||
| 222 | extern void ceph_msgr_exit(void); | ||
| 223 | extern void ceph_msgr_flush(void); | ||
| 224 | |||
| 225 | extern struct ceph_messenger *ceph_messenger_create( | ||
| 226 | struct ceph_entity_addr *myaddr, | ||
| 227 | u32 features, u32 required); | ||
| 228 | extern void ceph_messenger_destroy(struct ceph_messenger *); | ||
| 229 | |||
| 230 | extern void ceph_con_init(struct ceph_messenger *msgr, | ||
| 231 | struct ceph_connection *con); | ||
| 232 | extern void ceph_con_open(struct ceph_connection *con, | ||
| 233 | struct ceph_entity_addr *addr); | ||
| 234 | extern bool ceph_con_opened(struct ceph_connection *con); | ||
| 235 | extern void ceph_con_close(struct ceph_connection *con); | ||
| 236 | extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); | ||
| 237 | extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); | ||
| 238 | extern void ceph_con_revoke_message(struct ceph_connection *con, | ||
| 239 | struct ceph_msg *msg); | ||
| 240 | extern void ceph_con_keepalive(struct ceph_connection *con); | ||
| 241 | extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); | ||
| 242 | extern void ceph_con_put(struct ceph_connection *con); | ||
| 243 | |||
| 244 | extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags); | ||
| 245 | extern void ceph_msg_kfree(struct ceph_msg *m); | ||
| 246 | |||
| 247 | |||
| 248 | static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) | ||
| 249 | { | ||
| 250 | kref_get(&msg->kref); /* take one more reference on the message */ | ||
| 251 | return msg; /* returns the same pointer it was given */ | ||
| 252 | } | ||
| 253 | extern void ceph_msg_last_put(struct kref *kref); /* release function handed to kref_put() in ceph_msg_put() */ | ||
| 254 | static inline void ceph_msg_put(struct ceph_msg *msg) | ||
| 255 | { | ||
| 256 | kref_put(&msg->kref, ceph_msg_last_put); /* drop one reference; kref_put() calls ceph_msg_last_put when the count hits zero */ | ||
| 257 | } | ||
| 258 | |||
| 259 | extern void ceph_msg_dump(struct ceph_msg *msg); | ||
| 260 | |||
| 261 | #endif | ||
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h new file mode 100644 index 000000000000..545f85917780 --- /dev/null +++ b/include/linux/ceph/mon_client.h | |||
| @@ -0,0 +1,122 @@ | |||
| 1 | #ifndef _FS_CEPH_MON_CLIENT_H | ||
| 2 | #define _FS_CEPH_MON_CLIENT_H | ||
| 3 | |||
| 4 | #include <linux/completion.h> | ||
| 5 | #include <linux/kref.h> | ||
| 6 | #include <linux/rbtree.h> | ||
| 7 | |||
| 8 | #include "messenger.h" | ||
| 9 | |||
| 10 | struct ceph_client; | ||
| 11 | struct ceph_mount_args; | ||
| 12 | struct ceph_auth_client; | ||
| 13 | |||
| 14 | /* | ||
| 15 | * The monitor map enumerates the set of all monitors. | ||
| 16 | */ | ||
| 17 | struct ceph_monmap { | ||
| 18 | struct ceph_fsid fsid; | ||
| 19 | u32 epoch; | ||
| 20 | u32 num_mon; | ||
| 21 | struct ceph_entity_inst mon_inst[]; /* C99 flexible array member; presumably num_mon entries -- confirm against ceph_monmap_decode() */ | ||
| 22 | }; | ||
| 23 | |||
| 24 | struct ceph_mon_client; | ||
| 25 | struct ceph_mon_generic_request; | ||
| 26 | |||
| 27 | |||
| 28 | /* | ||
| 29 | * Generic mechanism for resending monitor requests. | ||
| 30 | */ | ||
| 31 | typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc, | ||
| 32 | int newmon); | ||
| 33 | |||
| 34 | /* a pending monitor request */ | ||
| 35 | struct ceph_mon_request { | ||
| 36 | struct ceph_mon_client *monc; | ||
| 37 | struct delayed_work delayed_work; | ||
| 38 | unsigned long delay; | ||
| 39 | ceph_monc_request_func_t do_request; | ||
| 40 | }; | ||
| 41 | |||
| 42 | /* | ||
| 43 | * ceph_mon_generic_request is being used for the statfs and poolop requests | ||
| 44 | * which are being done a bit differently because we need to get data back | ||
| 45 | * to the caller | ||
| 46 | */ | ||
| 47 | struct ceph_mon_generic_request { | ||
| 48 | struct kref kref; | ||
| 49 | u64 tid; | ||
| 50 | struct rb_node node; | ||
| 51 | int result; | ||
| 52 | void *buf; | ||
| 53 | int buf_len; | ||
| 54 | struct completion completion; | ||
| 55 | struct ceph_msg *request; /* original request */ | ||
| 56 | struct ceph_msg *reply; /* and reply */ | ||
| 57 | }; | ||
| 58 | |||
| 59 | struct ceph_mon_client { | ||
| 60 | struct ceph_client *client; | ||
| 61 | struct ceph_monmap *monmap; | ||
| 62 | |||
| 63 | struct mutex mutex; | ||
| 64 | struct delayed_work delayed_work; | ||
| 65 | |||
| 66 | struct ceph_auth_client *auth; | ||
| 67 | struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack; | ||
| 68 | int pending_auth; | ||
| 69 | |||
| 70 | bool hunting; | ||
| 71 | int cur_mon; /* last monitor i contacted */ | ||
| 72 | unsigned long sub_sent, sub_renew_after; | ||
| 73 | struct ceph_connection *con; | ||
| 74 | bool have_fsid; | ||
| 75 | |||
| 76 | /* pending generic requests */ | ||
| 77 | struct rb_root generic_request_tree; | ||
| 78 | int num_generic_requests; | ||
| 79 | u64 last_tid; | ||
| 80 | |||
| 81 | /* mds/osd map */ | ||
| 82 | int want_mdsmap; | ||
| 83 | int want_next_osdmap; /* 1 = want, 2 = want+asked */ | ||
| 84 | u32 have_osdmap, have_mdsmap; | ||
| 85 | |||
| 86 | #ifdef CONFIG_DEBUG_FS | ||
| 87 | struct dentry *debugfs_file; | ||
| 88 | #endif | ||
| 89 | }; | ||
| 90 | |||
| 91 | extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end); | ||
| 92 | extern int ceph_monmap_contains(struct ceph_monmap *m, | ||
| 93 | struct ceph_entity_addr *addr); | ||
| 94 | |||
| 95 | extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); | ||
| 96 | extern void ceph_monc_stop(struct ceph_mon_client *monc); | ||
| 97 | |||
| 98 | /* | ||
| 99 | * The model here is to indicate that we need a new map of at least | ||
| 100 | * epoch @want, and also call in when we receive a map. We will | ||
| 101 | * periodically rerequest the map from the monitor cluster until we | ||
| 102 | * get what we want. | ||
| 103 | */ | ||
| 104 | extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); | ||
| 105 | extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); | ||
| 106 | |||
| 107 | extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); | ||
| 108 | |||
| 109 | extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, | ||
| 110 | struct ceph_statfs *buf); | ||
| 111 | |||
| 112 | extern int ceph_monc_open_session(struct ceph_mon_client *monc); | ||
| 113 | |||
| 114 | extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); | ||
| 115 | |||
| 116 | extern int ceph_monc_create_snapid(struct ceph_mon_client *monc, | ||
| 117 | u32 pool, u64 *snapid); | ||
| 118 | |||
| 119 | extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc, | ||
| 120 | u32 pool, u64 snapid); | ||
| 121 | |||
| 122 | #endif | ||
diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h new file mode 100644 index 000000000000..a362605f9368 --- /dev/null +++ b/include/linux/ceph/msgpool.h | |||
| @@ -0,0 +1,25 @@ | |||
| 1 | #ifndef _FS_CEPH_MSGPOOL | ||
| 2 | #define _FS_CEPH_MSGPOOL | ||
| 3 | |||
| 4 | #include <linux/mempool.h> | ||
| 5 | #include "messenger.h" | ||
| 6 | |||
| 7 | /* | ||
| 8 | * we use memory pools for preallocating messages we may receive, to | ||
| 9 | * avoid unexpected OOM conditions. | ||
| 10 | */ | ||
| 11 | struct ceph_msgpool { | ||
| 12 | const char *name; | ||
| 13 | mempool_t *pool; | ||
| 14 | int front_len; /* preallocated payload size */ | ||
| 15 | }; | ||
| 16 | |||
| 17 | extern int ceph_msgpool_init(struct ceph_msgpool *pool, | ||
| 18 | int front_len, int size, bool blocking, | ||
| 19 | const char *name); | ||
| 20 | extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); | ||
| 21 | extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, | ||
| 22 | int front_len); | ||
| 23 | extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); | ||
| 24 | |||
| 25 | #endif | ||
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h new file mode 100644 index 000000000000..680d3d648cac --- /dev/null +++ b/include/linux/ceph/msgr.h | |||
| @@ -0,0 +1,175 @@ | |||
| 1 | #ifndef CEPH_MSGR_H | ||
| 2 | #define CEPH_MSGR_H | ||
| 3 | |||
| 4 | /* | ||
| 5 | * Data types for message passing layer used by Ceph. | ||
| 6 | */ | ||
| 7 | |||
| 8 | #define CEPH_MON_PORT 6789 /* default monitor port */ | ||
| 9 | |||
| 10 | /* | ||
| 11 | * client-side processes will try to bind to ports in this | ||
| 12 | * range, simply for the benefit of tools like nmap or wireshark | ||
| 13 | * that would like to identify the protocol. | ||
| 14 | */ | ||
| 15 | #define CEPH_PORT_FIRST 6789 | ||
| 16 | #define CEPH_PORT_START 6800 /* non-monitors start here */ | ||
| 17 | #define CEPH_PORT_LAST 6900 | ||
| 18 | |||
| 19 | /* | ||
| 20 | * tcp connection banner. include a protocol version. and adjust | ||
| 21 | * whenever the wire protocol changes. try to keep this string length | ||
| 22 | * constant. | ||
| 23 | */ | ||
| 24 | #define CEPH_BANNER "ceph v027" | ||
| 25 | #define CEPH_BANNER_MAX_LEN 30 | ||
| 26 | |||
| 27 | |||
| 28 | /* | ||
| 29 | * Rollover-safe type and comparator for 32-bit sequence numbers. | ||
| 30 | * Comparator returns the wrapped difference a - b: negative, zero, or positive (not strictly -1, 0, or 1). | ||
| 31 | */ | ||
| 32 | typedef __u32 ceph_seq_t; | ||
| 33 | |||
| 34 | static inline __s32 ceph_seq_cmp(__u32 a, __u32 b) | ||
| 35 | { | ||
| 36 | return (__s32)(a - b); /* subtract as unsigned so wraparound is well defined, then reinterpret as signed */ | ||
| 37 | } | ||
| 38 | |||
| 39 | |||
| 40 | /* | ||
| 41 | * entity_name -- logical name for a process participating in the | ||
| 42 | * network, e.g. 'mds0' or 'osd3'. | ||
| 43 | */ | ||
| 44 | struct ceph_entity_name { | ||
| 45 | __u8 type; /* CEPH_ENTITY_TYPE_* */ | ||
| 46 | __le64 num; | ||
| 47 | } __attribute__ ((packed)); | ||
| 48 | |||
| 49 | #define CEPH_ENTITY_TYPE_MON 0x01 | ||
| 50 | #define CEPH_ENTITY_TYPE_MDS 0x02 | ||
| 51 | #define CEPH_ENTITY_TYPE_OSD 0x04 | ||
| 52 | #define CEPH_ENTITY_TYPE_CLIENT 0x08 | ||
| 53 | #define CEPH_ENTITY_TYPE_AUTH 0x20 | ||
| 54 | |||
| 55 | #define CEPH_ENTITY_TYPE_ANY 0xFF | ||
| 56 | |||
| 57 | extern const char *ceph_entity_type_name(int type); | ||
| 58 | |||
| 59 | /* | ||
| 60 | * entity_addr -- network address | ||
| 61 | */ | ||
| 62 | struct ceph_entity_addr { | ||
| 63 | __le32 type; | ||
| 64 | __le32 nonce; /* unique id for process (e.g. pid) */ | ||
| 65 | struct sockaddr_storage in_addr; | ||
| 66 | } __attribute__ ((packed)); | ||
| 67 | |||
| 68 | struct ceph_entity_inst { | ||
| 69 | struct ceph_entity_name name; | ||
| 70 | struct ceph_entity_addr addr; | ||
| 71 | } __attribute__ ((packed)); | ||
| 72 | |||
| 73 | |||
| 74 | /* used by message exchange protocol */ | ||
| 75 | #define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */ | ||
| 76 | #define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */ | ||
| 77 | #define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing | ||
| 78 | incoming connection */ | ||
| 79 | #define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again | ||
| 80 | with higher cseq */ | ||
| 81 | #define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again | ||
| 82 | with higher gseq */ | ||
| 83 | #define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */ | ||
| 84 | #define CEPH_MSGR_TAG_MSG 7 /* message */ | ||
| 85 | #define CEPH_MSGR_TAG_ACK 8 /* message ack */ | ||
| 86 | #define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */ | ||
| 87 | #define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ | ||
| 88 | #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ | ||
| 89 | #define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ | ||
| 90 | |||
| 91 | |||
| 92 | /* | ||
| 93 | * connection negotiation | ||
| 94 | */ | ||
| 95 | struct ceph_msg_connect { | ||
| 96 | __le64 features; /* supported feature bits */ | ||
| 97 | __le32 host_type; /* CEPH_ENTITY_TYPE_* */ | ||
| 98 | __le32 global_seq; /* count connections initiated by this host */ | ||
| 99 | __le32 connect_seq; /* count connections initiated in this session */ | ||
| 100 | __le32 protocol_version; | ||
| 101 | __le32 authorizer_protocol; | ||
| 102 | __le32 authorizer_len; | ||
| 103 | __u8 flags; /* CEPH_MSG_CONNECT_* */ | ||
| 104 | } __attribute__ ((packed)); | ||
| 105 | |||
| 106 | struct ceph_msg_connect_reply { | ||
| 107 | __u8 tag; | ||
| 108 | __le64 features; /* feature bits for this session */ | ||
| 109 | __le32 global_seq; | ||
| 110 | __le32 connect_seq; | ||
| 111 | __le32 protocol_version; | ||
| 112 | __le32 authorizer_len; | ||
| 113 | __u8 flags; | ||
| 114 | } __attribute__ ((packed)); | ||
| 115 | |||
| 116 | #define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */ | ||
| 117 | |||
| 118 | |||
| 119 | /* | ||
| 120 | * message header | ||
| 121 | */ | ||
| 122 | struct ceph_msg_header_old { | ||
| 123 | __le64 seq; /* message seq# for this session */ | ||
| 124 | __le64 tid; /* transaction id */ | ||
| 125 | __le16 type; /* message type */ | ||
| 126 | __le16 priority; /* priority. higher value == higher priority */ | ||
| 127 | __le16 version; /* version of message encoding */ | ||
| 128 | |||
| 129 | __le32 front_len; /* bytes in main payload */ | ||
| 130 | __le32 middle_len;/* bytes in middle payload */ | ||
| 131 | __le32 data_len; /* bytes of data payload */ | ||
| 132 | __le16 data_off; /* sender: include full offset; | ||
| 133 | receiver: mask against ~PAGE_MASK */ | ||
| 134 | |||
| 135 | struct ceph_entity_inst src, orig_src; | ||
| 136 | __le32 reserved; | ||
| 137 | __le32 crc; /* header crc32c */ | ||
| 138 | } __attribute__ ((packed)); | ||
| 139 | |||
| 140 | struct ceph_msg_header { | ||
| 141 | __le64 seq; /* message seq# for this session */ | ||
| 142 | __le64 tid; /* transaction id */ | ||
| 143 | __le16 type; /* message type */ | ||
| 144 | __le16 priority; /* priority. higher value == higher priority */ | ||
| 145 | __le16 version; /* version of message encoding */ | ||
| 146 | |||
| 147 | __le32 front_len; /* bytes in main payload */ | ||
| 148 | __le32 middle_len;/* bytes in middle payload */ | ||
| 149 | __le32 data_len; /* bytes of data payload */ | ||
| 150 | __le16 data_off; /* sender: include full offset; | ||
| 151 | receiver: mask against ~PAGE_MASK */ | ||
| 152 | |||
| 153 | struct ceph_entity_name src; | ||
| 154 | __le32 reserved; | ||
| 155 | __le32 crc; /* header crc32c */ | ||
| 156 | } __attribute__ ((packed)); | ||
| 157 | |||
| 158 | #define CEPH_MSG_PRIO_LOW 64 | ||
| 159 | #define CEPH_MSG_PRIO_DEFAULT 127 | ||
| 160 | #define CEPH_MSG_PRIO_HIGH 196 | ||
| 161 | #define CEPH_MSG_PRIO_HIGHEST 255 | ||
| 162 | |||
| 163 | /* | ||
| 164 | * follows data payload | ||
| 165 | */ | ||
| 166 | struct ceph_msg_footer { | ||
| 167 | __le32 front_crc, middle_crc, data_crc; | ||
| 168 | __u8 flags; | ||
| 169 | } __attribute__ ((packed)); | ||
| 170 | |||
| 171 | #define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */ | ||
| 172 | #define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ | ||
| 173 | |||
| 174 | |||
| 175 | #endif | ||
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h new file mode 100644 index 000000000000..6c91fb032c39 --- /dev/null +++ b/include/linux/ceph/osd_client.h | |||
| @@ -0,0 +1,234 @@ | |||
| 1 | #ifndef _FS_CEPH_OSD_CLIENT_H | ||
| 2 | #define _FS_CEPH_OSD_CLIENT_H | ||
| 3 | |||
| 4 | #include <linux/completion.h> | ||
| 5 | #include <linux/kref.h> | ||
| 6 | #include <linux/mempool.h> | ||
| 7 | #include <linux/rbtree.h> | ||
| 8 | |||
| 9 | #include "types.h" | ||
| 10 | #include "osdmap.h" | ||
| 11 | #include "messenger.h" | ||
| 12 | |||
| 13 | struct ceph_msg; | ||
| 14 | struct ceph_snap_context; | ||
| 15 | struct ceph_osd_request; | ||
| 16 | struct ceph_osd_client; | ||
| 17 | struct ceph_authorizer; | ||
| 18 | struct ceph_pagelist; | ||
| 19 | |||
| 20 | /* | ||
| 21 | * completion callback for async writepages | ||
| 22 | */ | ||
| 23 | typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, | ||
| 24 | struct ceph_msg *); | ||
| 25 | |||
| 26 | /* a given osd we're communicating with */ | ||
| 27 | struct ceph_osd { | ||
| 28 | atomic_t o_ref; | ||
| 29 | struct ceph_osd_client *o_osdc; | ||
| 30 | int o_osd; | ||
| 31 | int o_incarnation; | ||
| 32 | struct rb_node o_node; | ||
| 33 | struct ceph_connection o_con; | ||
| 34 | struct list_head o_requests; | ||
| 35 | struct list_head o_osd_lru; | ||
| 36 | struct ceph_authorizer *o_authorizer; | ||
| 37 | void *o_authorizer_buf, *o_authorizer_reply_buf; | ||
| 38 | size_t o_authorizer_buf_len, o_authorizer_reply_buf_len; | ||
| 39 | unsigned long lru_ttl; | ||
| 40 | int o_marked_for_keepalive; | ||
| 41 | struct list_head o_keepalive_item; | ||
| 42 | }; | ||
| 43 | |||
| 44 | /* an in-flight request */ | ||
| 45 | struct ceph_osd_request { | ||
| 46 | u64 r_tid; /* unique for this client */ | ||
| 47 | struct rb_node r_node; | ||
| 48 | struct list_head r_req_lru_item; | ||
| 49 | struct list_head r_osd_item; | ||
| 50 | struct ceph_osd *r_osd; | ||
| 51 | struct ceph_pg r_pgid; | ||
| 52 | int r_pg_osds[CEPH_PG_MAX_SIZE]; | ||
| 53 | int r_num_pg_osds; | ||
| 54 | |||
| 55 | struct ceph_connection *r_con_filling_msg; | ||
| 56 | |||
| 57 | struct ceph_msg *r_request, *r_reply; | ||
| 58 | int r_result; | ||
| 59 | int r_flags; /* any additional flags for the osd */ | ||
| 60 | u32 r_sent; /* >0 if r_request is sending/sent */ | ||
| 61 | int r_got_reply; | ||
| 62 | |||
| 63 | struct ceph_osd_client *r_osdc; | ||
| 64 | struct kref r_kref; | ||
| 65 | bool r_mempool; | ||
| 66 | struct completion r_completion, r_safe_completion; | ||
| 67 | ceph_osdc_callback_t r_callback, r_safe_callback; | ||
| 68 | struct ceph_eversion r_reassert_version; | ||
| 69 | struct list_head r_unsafe_item; | ||
| 70 | |||
| 71 | struct inode *r_inode; /* for use by callbacks */ | ||
| 72 | void *r_priv; /* ditto */ | ||
| 73 | |||
| 74 | char r_oid[40]; /* object name */ | ||
| 75 | int r_oid_len; | ||
| 76 | unsigned long r_stamp; /* send OR check time */ | ||
| 77 | bool r_resend; /* msg send failed, needs retry */ | ||
| 78 | |||
| 79 | struct ceph_file_layout r_file_layout; | ||
| 80 | struct ceph_snap_context *r_snapc; /* snap context for writes */ | ||
| 81 | unsigned r_num_pages; /* size of page array (follows) */ | ||
| 82 | struct page **r_pages; /* pages for data payload */ | ||
| 83 | int r_pages_from_pool; | ||
| 84 | int r_own_pages; /* if true, i own page list */ | ||
| 85 | #ifdef CONFIG_BLOCK | ||
| 86 | struct bio *r_bio; /* instead of pages */ | ||
| 87 | #endif | ||
| 88 | |||
| 89 | struct ceph_pagelist *r_trail; /* trailing part of the data */ | ||
| 90 | }; | ||
| 91 | |||
| 92 | struct ceph_osd_client { | ||
| 93 | struct ceph_client *client; | ||
| 94 | |||
| 95 | struct ceph_osdmap *osdmap; /* current map */ | ||
| 96 | struct rw_semaphore map_sem; | ||
| 97 | struct completion map_waiters; | ||
| 98 | u64 last_requested_map; | ||
| 99 | |||
| 100 | struct mutex request_mutex; | ||
| 101 | struct rb_root osds; /* osds */ | ||
| 102 | struct list_head osd_lru; /* idle osds */ | ||
| 103 | u64 timeout_tid; /* tid of timeout triggering rq */ | ||
| 104 | u64 last_tid; /* tid of last request */ | ||
| 105 | struct rb_root requests; /* pending requests */ | ||
| 106 | struct list_head req_lru; /* pending requests lru */ | ||
| 107 | int num_requests; | ||
| 108 | struct delayed_work timeout_work; | ||
| 109 | struct delayed_work osds_timeout_work; | ||
| 110 | #ifdef CONFIG_DEBUG_FS | ||
| 111 | struct dentry *debugfs_file; | ||
| 112 | #endif | ||
| 113 | |||
| 114 | mempool_t *req_mempool; | ||
| 115 | |||
| 116 | struct ceph_msgpool msgpool_op; | ||
| 117 | struct ceph_msgpool msgpool_op_reply; | ||
| 118 | }; | ||
| 119 | |||
| 120 | struct ceph_osd_req_op { | ||
| 121 | u16 op; /* CEPH_OSD_OP_* */ | ||
| 122 | u32 flags; /* CEPH_OSD_FLAG_* */ | ||
| 123 | union { | ||
| 124 | struct { | ||
| 125 | u64 offset, length; | ||
| 126 | u64 truncate_size; | ||
| 127 | u32 truncate_seq; | ||
| 128 | } extent; | ||
| 129 | struct { | ||
| 130 | const char *name; | ||
| 131 | u32 name_len; | ||
| 132 | const char *val; | ||
| 133 | u32 value_len; | ||
| 134 | __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ | ||
| 135 | __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ | ||
| 136 | } xattr; | ||
| 137 | struct { | ||
| 138 | const char *class_name; | ||
| 139 | __u8 class_len; | ||
| 140 | const char *method_name; | ||
| 141 | __u8 method_len; | ||
| 142 | __u8 argc; | ||
| 143 | const char *indata; | ||
| 144 | u32 indata_len; | ||
| 145 | } cls; | ||
| 146 | struct { | ||
| 147 | u64 cookie, count; | ||
| 148 | } pgls; | ||
| 149 | struct { | ||
| 150 | u64 snapid; | ||
| 151 | } snap; | ||
| 152 | }; | ||
| 153 | u32 payload_len; | ||
| 154 | }; | ||
| 155 | |||
| 156 | extern int ceph_osdc_init(struct ceph_osd_client *osdc, | ||
| 157 | struct ceph_client *client); | ||
| 158 | extern void ceph_osdc_stop(struct ceph_osd_client *osdc); | ||
| 159 | |||
| 160 | extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, | ||
| 161 | struct ceph_msg *msg); | ||
| 162 | extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, | ||
| 163 | struct ceph_msg *msg); | ||
| 164 | |||
| 165 | extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc, | ||
| 166 | struct ceph_file_layout *layout, | ||
| 167 | u64 snapid, | ||
| 168 | u64 off, u64 *plen, u64 *bno, | ||
| 169 | struct ceph_osd_request *req, | ||
| 170 | struct ceph_osd_req_op *op); | ||
| 171 | |||
| 172 | extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | ||
| 173 | int flags, | ||
| 174 | struct ceph_snap_context *snapc, | ||
| 175 | struct ceph_osd_req_op *ops, | ||
| 176 | bool use_mempool, | ||
| 177 | gfp_t gfp_flags, | ||
| 178 | struct page **pages, | ||
| 179 | struct bio *bio); | ||
| 180 | |||
| 181 | extern void ceph_osdc_build_request(struct ceph_osd_request *req, | ||
| 182 | u64 off, u64 *plen, | ||
| 183 | struct ceph_osd_req_op *src_ops, | ||
| 184 | struct ceph_snap_context *snapc, | ||
| 185 | struct timespec *mtime, | ||
| 186 | const char *oid, | ||
| 187 | int oid_len); | ||
| 188 | |||
| 189 | extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, | ||
| 190 | struct ceph_file_layout *layout, | ||
| 191 | struct ceph_vino vino, | ||
| 192 | u64 offset, u64 *len, int op, int flags, | ||
| 193 | struct ceph_snap_context *snapc, | ||
| 194 | int do_sync, u32 truncate_seq, | ||
| 195 | u64 truncate_size, | ||
| 196 | struct timespec *mtime, | ||
| 197 | bool use_mempool, int num_reply); | ||
| 198 | |||
| 199 | static inline void ceph_osdc_get_request(struct ceph_osd_request *req) /* take one more reference on req via its embedded r_kref */ | ||
| 200 | { | ||
| 201 | kref_get(&req->r_kref); | ||
| 202 | } | ||
| 203 | extern void ceph_osdc_release_request(struct kref *kref); /* release callback passed to kref_put() below */ | ||
| 204 | static inline void ceph_osdc_put_request(struct ceph_osd_request *req) /* drop one reference on req; kref_put() calls ceph_osdc_release_request when the count reaches zero */ | ||
| 205 | { | ||
| 206 | kref_put(&req->r_kref, ceph_osdc_release_request); | ||
| 207 | } | ||
| 208 | |||
| 209 | extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, | ||
| 210 | struct ceph_osd_request *req, | ||
| 211 | bool nofail); | ||
| 212 | extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, | ||
| 213 | struct ceph_osd_request *req); | ||
| 214 | extern void ceph_osdc_sync(struct ceph_osd_client *osdc); | ||
| 215 | |||
| 216 | extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, | ||
| 217 | struct ceph_vino vino, | ||
| 218 | struct ceph_file_layout *layout, | ||
| 219 | u64 off, u64 *plen, | ||
| 220 | u32 truncate_seq, u64 truncate_size, | ||
| 221 | struct page **pages, int nr_pages); | ||
| 222 | |||
| 223 | extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, | ||
| 224 | struct ceph_vino vino, | ||
| 225 | struct ceph_file_layout *layout, | ||
| 226 | struct ceph_snap_context *sc, | ||
| 227 | u64 off, u64 len, | ||
| 228 | u32 truncate_seq, u64 truncate_size, | ||
| 229 | struct timespec *mtime, | ||
| 230 | struct page **pages, int nr_pages, | ||
| 231 | int flags, int do_sync, bool nofail); | ||
| 232 | |||
| 233 | #endif | ||
| 234 | |||
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h new file mode 100644 index 000000000000..ba4c205cbb01 --- /dev/null +++ b/include/linux/ceph/osdmap.h | |||
| @@ -0,0 +1,130 @@ | |||
| 1 | #ifndef _FS_CEPH_OSDMAP_H | ||
| 2 | #define _FS_CEPH_OSDMAP_H | ||
| 3 | |||
| 4 | #include <linux/rbtree.h> | ||
| 5 | #include "types.h" | ||
| 6 | #include "ceph_fs.h" | ||
| 7 | #include <linux/crush/crush.h> | ||
| 8 | |||
| 9 | /* | ||
| 10 | * The osd map describes the current membership of the osd cluster and | ||
| 11 | * specifies the mapping of objects to placement groups and placement | ||
| 12 | * groups to (sets of) osds. That is, it completely specifies the | ||
| 13 | * (desired) distribution of all data objects in the system at some | ||
| 14 | * point in time. | ||
| 15 | * | ||
| 16 | * Each map version is identified by an epoch, which increases monotonically. | ||
| 17 | * | ||
| 18 | * The map can be updated either via an incremental map (diff) describing | ||
| 19 | * the change between two successive epochs, or as a fully encoded map. | ||
| 20 | */ | ||
| 21 | struct ceph_pg_pool_info { /* in-memory record for one pool: the encoded struct ceph_pg_pool plus derived fields */ | ||
| 22 | struct rb_node node; /* rbtree linkage; presumably for ceph_osdmap.pg_pools -- TODO confirm */ | ||
| 23 | int id; | ||
| 24 | struct ceph_pg_pool v; /* pool parameters in their little-endian, packed form */ | ||
| 25 | int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; /* one mask per *_num field of v; presumably the bmask argument of ceph_stable_mod() -- TODO confirm */ | ||
| 26 | char *name; | ||
| 27 | }; | ||
| 28 | |||
| 29 | struct ceph_pg_mapping { /* a pgid together with a variable-length list of osd ids */ | ||
| 30 | struct rb_node node; /* rbtree linkage; presumably for ceph_osdmap.pg_temp -- TODO confirm */ | ||
| 31 | struct ceph_pg pgid; | ||
| 32 | int len; /* presumably the number of entries in osds[] -- TODO confirm */ | ||
| 33 | int osds[]; /* flexible array member: storage must be allocated past the end of the struct */ | ||
| 34 | }; | ||
| 35 | |||
| 36 | struct ceph_osdmap { /* one decoded osd map; see the comment at the top of this header for what a map describes */ | ||
| 37 | struct ceph_fsid fsid; | ||
| 38 | u32 epoch; | ||
| 39 | u32 mkfs_epoch; | ||
| 40 | struct ceph_timespec created, modified; | ||
| 41 | |||
| 42 | u32 flags; /* CEPH_OSDMAP_* */ | ||
| 43 | |||
| 44 | u32 max_osd; /* size of osd_state, osd_weight, osd_addr arrays */ | ||
| 45 | u8 *osd_state; /* CEPH_OSD_* */ | ||
| 46 | u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */ | ||
| 47 | struct ceph_entity_addr *osd_addr; | ||
| 48 | |||
| 49 | struct rb_root pg_temp; /* presumably a tree of struct ceph_pg_mapping -- TODO confirm */ | ||
| 50 | struct rb_root pg_pools; /* presumably a tree of struct ceph_pg_pool_info -- TODO confirm */ | ||
| 51 | u32 pool_max; | ||
| 52 | |||
| 53 | /* the CRUSH map specifies the mapping of placement groups to | ||
| 54 | * the list of osds that store+replicate them. */ | ||
| 55 | struct crush_map *crush; | ||
| 56 | }; | ||
| 57 | |||
| 58 | /* | ||
| 59 | * file layout helpers: each macro takes the layout struct itself (not a pointer; note the `(l).` access), converts one little-endian 32-bit field to CPU byte order and casts the result to __s32 | ||
| 60 | */ | ||
| 61 | #define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) | ||
| 62 | #define ceph_file_layout_stripe_count(l) \ | ||
| 63 | ((__s32)le32_to_cpu((l).fl_stripe_count)) | ||
| 64 | #define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) | ||
| 65 | #define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) | ||
| 66 | #define ceph_file_layout_object_su(l) \ | ||
| 67 | ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) | ||
| 68 | #define ceph_file_layout_pg_preferred(l) \ | ||
| 69 | ((__s32)le32_to_cpu((l).fl_pg_preferred)) | ||
| 70 | #define ceph_file_layout_pg_pool(l) \ | ||
| 71 | ((__s32)le32_to_cpu((l).fl_pg_pool)) | ||
| 72 | |||
| 73 | static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) /* returns stripe unit * stripe count, both read from the layout in CPU byte order; unlike the macros above this takes a pointer */ | ||
| 74 | { | ||
| 75 | return le32_to_cpu(l->fl_stripe_unit) * /* NOTE(review): product of two 32-bit values returned as unsigned; it wraps if it does not fit */ | ||
| 76 | le32_to_cpu(l->fl_stripe_count); | ||
| 77 | } | ||
| 78 | |||
| 79 | /* "period" == bytes before i start on a new set of objects; computed as object size * stripe count */ | ||
| 80 | static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) | ||
| 81 | { | ||
| 82 | return le32_to_cpu(l->fl_object_size) * /* NOTE(review): product of two 32-bit values returned as unsigned; it wraps if it does not fit */ | ||
| 83 | le32_to_cpu(l->fl_stripe_count); | ||
| 84 | } | ||
| 85 | |||
| 86 | |||
| 87 | static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) /* returns 1 when osd is below map->max_osd and its state byte has CEPH_OSD_UP set, else 0; map must be non-NULL */ | ||
| 88 | { | ||
| 89 | return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); /* max_osd is u32, so osd is compared as unsigned: a negative osd becomes a huge value, fails the bound check, and the array is not indexed */ | ||
| 90 | } | ||
| 91 | |||
| 92 | static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) /* true when map is non-NULL and at least one bit of flag is set in map->flags */ | ||
| 93 | { | ||
| 94 | return map && (map->flags & flag); /* a NULL map is tolerated and yields false */ | ||
| 95 | } | ||
| 96 | |||
| 97 | extern char *ceph_osdmap_state_str(char *str, int len, int state); | ||
| 98 | |||
| 99 | static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, /* returns a pointer to entry osd of map->osd_addr, or NULL when osd is out of range; map is dereferenced unconditionally */ | ||
| 100 | int osd) | ||
| 101 | { | ||
| 102 | if (osd >= map->max_osd) /* unsigned comparison against u32 max_osd: a negative osd is also rejected here */ | ||
| 103 | return NULL; | ||
| 104 | return &map->osd_addr[osd]; | ||
| 105 | } | ||
| 106 | |||
| 107 | extern struct ceph_osdmap *osdmap_decode(void **p, void *end); | ||
| 108 | extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | ||
| 109 | struct ceph_osdmap *map, | ||
| 110 | struct ceph_messenger *msgr); | ||
| 111 | extern void ceph_osdmap_destroy(struct ceph_osdmap *map); | ||
| 112 | |||
| 113 | /* calculate mapping of a file extent to an object */ | ||
| 114 | extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | ||
| 115 | u64 off, u64 *plen, | ||
| 116 | u64 *bno, u64 *oxoff, u64 *oxlen); | ||
| 117 | |||
| 118 | /* calculate mapping of object to a placement group */ | ||
| 119 | extern int ceph_calc_object_layout(struct ceph_object_layout *ol, | ||
| 120 | const char *oid, | ||
| 121 | struct ceph_file_layout *fl, | ||
| 122 | struct ceph_osdmap *osdmap); | ||
| 123 | extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | ||
| 124 | int *acting); | ||
| 125 | extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, | ||
| 126 | struct ceph_pg pgid); | ||
| 127 | |||
| 128 | extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); | ||
| 129 | |||
| 130 | #endif | ||
diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h new file mode 100644 index 000000000000..9660d6b0a35d --- /dev/null +++ b/include/linux/ceph/pagelist.h | |||
| @@ -0,0 +1,75 @@ | |||
| 1 | #ifndef __FS_CEPH_PAGELIST_H | ||
| 2 | #define __FS_CEPH_PAGELIST_H | ||
| 3 | |||
| 4 | #include <linux/list.h> | ||
| 5 | |||
| 6 | struct ceph_pagelist { /* append-only buffer state; all fields are reset by ceph_pagelist_init() */ | ||
| 7 | struct list_head head; /* presumably the list of pages holding data -- TODO confirm against pagelist.c */ | ||
| 8 | void *mapped_tail; /* NULL after init; presumably the mapped address of the last page -- TODO confirm */ | ||
| 9 | size_t length; /* presumably total bytes appended so far -- TODO confirm */ | ||
| 10 | size_t room; /* presumably bytes still free in the tail page -- TODO confirm */ | ||
| 11 | struct list_head free_list; /* presumably pages set aside by ceph_pagelist_reserve() -- TODO confirm */ | ||
| 12 | size_t num_pages_free; /* presumably the number of entries on free_list -- TODO confirm */ | ||
| 13 | }; | ||
| 14 | |||
| 15 | struct ceph_pagelist_cursor { /* saved position in a pagelist; filled by ceph_pagelist_set_cursor() and consumed by ceph_pagelist_truncate(), going by their prototypes below */ | ||
| 16 | struct ceph_pagelist *pl; /* pagelist, for error checking */ | ||
| 17 | struct list_head *page_lru; /* page in list */ | ||
| 18 | size_t room; /* room remaining to reset to */ | ||
| 19 | }; | ||
| 20 | |||
| 21 | static inline void ceph_pagelist_init(struct ceph_pagelist *pl) /* put pl into the empty state: both lists empty, no mapped tail, all counters zero; allocates nothing */ | ||
| 22 | { | ||
| 23 | INIT_LIST_HEAD(&pl->head); | ||
| 24 | pl->mapped_tail = NULL; | ||
| 25 | pl->length = 0; | ||
| 26 | pl->room = 0; | ||
| 27 | INIT_LIST_HEAD(&pl->free_list); | ||
| 28 | pl->num_pages_free = 0; | ||
| 29 | } | ||
| 30 | |||
| 31 | extern int ceph_pagelist_release(struct ceph_pagelist *pl); | ||
| 32 | |||
| 33 | extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l); | ||
| 34 | |||
| 35 | extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space); | ||
| 36 | |||
| 37 | extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl); | ||
| 38 | |||
| 39 | extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, | ||
| 40 | struct ceph_pagelist_cursor *c); | ||
| 41 | |||
| 42 | extern int ceph_pagelist_truncate(struct ceph_pagelist *pl, | ||
| 43 | struct ceph_pagelist_cursor *c); | ||
| 44 | |||
| 45 | static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) /* append v as 8 little-endian bytes; returns whatever ceph_pagelist_append() returns */ | ||
| 46 | { | ||
| 47 | __le64 ev = cpu_to_le64(v); | ||
| 48 | return ceph_pagelist_append(pl, &ev, sizeof(ev)); | ||
| 49 | } | ||
| 50 | static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v) /* append v as 4 little-endian bytes; returns whatever ceph_pagelist_append() returns */ | ||
| 51 | { | ||
| 52 | __le32 ev = cpu_to_le32(v); | ||
| 53 | return ceph_pagelist_append(pl, &ev, sizeof(ev)); | ||
| 54 | } | ||
| 55 | static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v) /* append v as 2 little-endian bytes; returns whatever ceph_pagelist_append() returns */ | ||
| 56 | { | ||
| 57 | __le16 ev = cpu_to_le16(v); | ||
| 58 | return ceph_pagelist_append(pl, &ev, sizeof(ev)); | ||
| 59 | } | ||
| 60 | static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v) /* append the single byte v; no byte-order conversion is needed */ | ||
| 61 | { | ||
| 62 | return ceph_pagelist_append(pl, &v, 1); | ||
| 63 | } | ||
| 63 | } | ||
| 64 | static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl, /* append a length-prefixed string: a 32-bit little-endian length, then len bytes of s; no terminating NUL is written */ | ||
| 65 | const char *s, size_t len) /* s is only read and is forwarded to a const void * parameter, hence const */ | ||
| 66 | { | ||
| 67 | int ret = ceph_pagelist_encode_32(pl, len); /* NOTE(review): len is narrowed from size_t to u32 by this call */ | ||
| 68 | if (ret) /* non-zero result from the length append is returned unchanged */ | ||
| 69 | return ret; | ||
| 70 | if (len) /* an empty string is encoded as the length word alone; s is not touched */ | ||
| 71 | return ceph_pagelist_append(pl, s, len); | ||
| 72 | return 0; | ||
| 73 | } | ||
| 74 | |||
| 75 | #endif | ||
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h new file mode 100644 index 000000000000..6d5247f2e81b --- /dev/null +++ b/include/linux/ceph/rados.h | |||
| @@ -0,0 +1,405 @@ | |||
| 1 | #ifndef CEPH_RADOS_H | ||
| 2 | #define CEPH_RADOS_H | ||
| 3 | |||
| 4 | /* | ||
| 5 | * Data types for the Ceph distributed object storage layer RADOS | ||
| 6 | * (Reliable Autonomic Distributed Object Store). | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include "msgr.h" | ||
| 10 | |||
| 11 | /* | ||
| 12 | * osdmap encoding versions | ||
| 13 | */ | ||
| 14 | #define CEPH_OSDMAP_INC_VERSION 5 | ||
| 15 | #define CEPH_OSDMAP_INC_VERSION_EXT 5 | ||
| 16 | #define CEPH_OSDMAP_VERSION 5 | ||
| 17 | #define CEPH_OSDMAP_VERSION_EXT 5 | ||
| 18 | |||
| 19 | /* | ||
| 20 | * fs id | ||
| 21 | */ | ||
| 22 | struct ceph_fsid { /* 16 opaque bytes identifying a file system / cluster */ | ||
| 23 | unsigned char fsid[16]; | ||
| 24 | }; | ||
| 25 | |||
| 26 | static inline int ceph_fsid_compare(const struct ceph_fsid *a, /* bytewise comparison of two fsids: returns 0 when equal, otherwise the memcmp() result */ | ||
| 27 | const struct ceph_fsid *b) | ||
| 28 | { | ||
| 29 | return memcmp(a, b, sizeof(*a)); /* sizeof(*a) covers exactly the 16-byte array, the struct's only member */ | ||
| 30 | } | ||
| 31 | |||
| 32 | /* | ||
| 33 | * ino, object, etc. | ||
| 34 | */ | ||
| 35 | typedef __le64 ceph_snapid_t; | ||
| 36 | #define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */ | ||
| 37 | #define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */ | ||
| 38 | #define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */ | ||
| 39 | |||
| 40 | struct ceph_timespec { /* packed 8-byte timestamp: two little-endian 32-bit fields */ | ||
| 41 | __le32 tv_sec; | ||
| 42 | __le32 tv_nsec; | ||
| 43 | } __attribute__ ((packed)); | ||
| 44 | |||
| 45 | |||
| 46 | /* | ||
| 47 | * object layout - how objects are mapped into PGs | ||
| 48 | */ | ||
| 49 | #define CEPH_OBJECT_LAYOUT_HASH 1 | ||
| 50 | #define CEPH_OBJECT_LAYOUT_LINEAR 2 | ||
| 51 | #define CEPH_OBJECT_LAYOUT_HASHINO 3 | ||
| 52 | |||
| 53 | /* | ||
| 54 | * pg layout -- how PGs are mapped onto (sets of) OSDs | ||
| 55 | */ | ||
| 56 | #define CEPH_PG_LAYOUT_CRUSH 0 | ||
| 57 | #define CEPH_PG_LAYOUT_HASH 1 | ||
| 58 | #define CEPH_PG_LAYOUT_LINEAR 2 | ||
| 59 | #define CEPH_PG_LAYOUT_HYBRID 3 | ||
| 60 | |||
| 61 | #define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ | ||
| 62 | |||
| 63 | /* | ||
| 64 | * placement group. | ||
| 65 | * we encode this into one __le64. | ||
| 66 | */ | ||
| 67 | struct ceph_pg { /* packed: 2 + 2 + 4 bytes, which is why it fits in one 64-bit value */ | ||
| 68 | __le16 preferred; /* preferred primary osd */ | ||
| 69 | __le16 ps; /* placement seed */ | ||
| 70 | __le32 pool; /* object pool */ | ||
| 71 | } __attribute__ ((packed)); | ||
| 72 | |||
| 73 | /* | ||
| 74 | * pg_pool is a set of pgs storing a pool of objects | ||
| 75 | * | ||
| 76 | * pg_num -- base number of pseudorandomly placed pgs | ||
| 77 | * | ||
| 78 | * pgp_num -- effective number when calculating pg placement. this | ||
| 79 | * is used for pg_num increases. new pgs result in data being "split" | ||
| 80 | * into new pgs. for this to proceed smoothly, new pgs are initially | ||
| 81 | * colocated with their parents; that is, pgp_num doesn't increase | ||
| 82 | * until the new pgs have successfully split. only _then_ are the new | ||
| 83 | * pgs placed independently. | ||
| 84 | * | ||
| 85 | * lpg_num -- localized pg count (per device). replicas are randomly | ||
| 86 | * selected. | ||
| 87 | * | ||
| 88 | * lpgp_num -- as above. | ||
| 89 | */ | ||
| 90 | #define CEPH_PG_TYPE_REP 1 | ||
| 91 | #define CEPH_PG_TYPE_RAID4 2 | ||
| 92 | #define CEPH_PG_POOL_VERSION 2 | ||
| 93 | struct ceph_pg_pool { /* packed, little-endian pool description; field order and sizes define the encoded layout, so do not reorder */ | ||
| 94 | __u8 type; /* CEPH_PG_TYPE_* */ | ||
| 95 | __u8 size; /* number of osds in each pg */ | ||
| 96 | __u8 crush_ruleset; /* crush placement rule */ | ||
| 97 | __u8 object_hash; /* hash mapping object name to ps */ | ||
| 98 | __le32 pg_num, pgp_num; /* number of pg's */ | ||
| 99 | __le32 lpg_num, lpgp_num; /* number of localized pg's */ | ||
| 100 | __le32 last_change; /* most recent epoch changed */ | ||
| 101 | __le64 snap_seq; /* seq for per-pool snapshot */ | ||
| 102 | __le32 snap_epoch; /* epoch of last snap */ | ||
| 103 | __le32 num_snaps; | ||
| 104 | __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ | ||
| 105 | __le64 auid; /* who owns the pg */ | ||
| 106 | } __attribute__ ((packed)); | ||
| 107 | |||
| 108 | /* | ||
| 109 | * stable_mod func is used to control number of placement groups. | ||
| 110 | * similar to straight-up modulo, but produces a stable mapping as b | ||
| 111 | * increases over time. b is the number of bins, and bmask is the | ||
| 112 | * containing power of 2 minus 1. | ||
| 113 | * | ||
| 114 | * b <= bmask and bmask=(2**n)-1 | ||
| 115 | * e.g., b=12 -> bmask=15, b=123 -> bmask=127 | ||
| 116 | */ | ||
| 117 | static inline int ceph_stable_mod(int x, int b, int bmask) /* map x to a bin: x & bmask when that is below b, otherwise x & (bmask >> 1) */ | ||
| 118 | { | ||
| 119 | if ((x & bmask) < b) /* the masked value already names an existing bin */ | ||
| 120 | return x & bmask; | ||
| 121 | else /* otherwise drop the top mask bit and fold into the lower half */ | ||
| 122 | return x & (bmask >> 1); | ||
| 123 | } | ||
| 124 | |||
| 125 | /* | ||
| 126 | * object layout - how a given object should be stored. | ||
| 127 | */ | ||
| 128 | struct ceph_object_layout { /* packed: an 8-byte struct ceph_pg followed by one 32-bit little-endian field */ | ||
| 129 | struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ | ||
| 130 | __le32 ol_stripe_unit; /* for per-object parity, if any */ | ||
| 131 | } __attribute__ ((packed)); | ||
| 132 | |||
| 133 | /* | ||
| 134 | * compound epoch+version, used by storage layer to serialize mutations | ||
| 135 | */ | ||
| 136 | struct ceph_eversion { /* 12 bytes; the packed attribute keeps the 64-bit field directly after the 32-bit one, with no padding */ | ||
| 137 | __le32 epoch; | ||
| 138 | __le64 version; | ||
| 139 | } __attribute__ ((packed)); | ||
| 140 | |||
| 141 | /* | ||
| 142 | * osd map bits | ||
| 143 | */ | ||
| 144 | |||
| 145 | /* status bits */ | ||
| 146 | #define CEPH_OSD_EXISTS 1 | ||
| 147 | #define CEPH_OSD_UP 2 | ||
| 148 | |||
| 149 | /* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ | ||
| 150 | #define CEPH_OSD_IN 0x10000 | ||
| 151 | #define CEPH_OSD_OUT 0 | ||
| 152 | |||
| 153 | |||
| 154 | /* | ||
| 155 | * osd map flag bits | ||
| 156 | */ | ||
| 157 | #define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ | ||
| 158 | #define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ | ||
| 159 | #define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ | ||
| 160 | #define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ | ||
| 161 | #define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ | ||
| 162 | |||
| 163 | /* | ||
| 164 | * osd ops | ||
| 165 | */ | ||
| 166 | #define CEPH_OSD_OP_MODE 0xf000 | ||
| 167 | #define CEPH_OSD_OP_MODE_RD 0x1000 | ||
| 168 | #define CEPH_OSD_OP_MODE_WR 0x2000 | ||
| 169 | #define CEPH_OSD_OP_MODE_RMW 0x3000 | ||
| 170 | #define CEPH_OSD_OP_MODE_SUB 0x4000 | ||
| 171 | |||
| 172 | #define CEPH_OSD_OP_TYPE 0x0f00 | ||
| 173 | #define CEPH_OSD_OP_TYPE_LOCK 0x0100 | ||
| 174 | #define CEPH_OSD_OP_TYPE_DATA 0x0200 | ||
| 175 | #define CEPH_OSD_OP_TYPE_ATTR 0x0300 | ||
| 176 | #define CEPH_OSD_OP_TYPE_EXEC 0x0400 | ||
| 177 | #define CEPH_OSD_OP_TYPE_PG 0x0500 | ||
| 178 | |||
| 179 | enum { /* each op code ORs a CEPH_OSD_OP_MODE_* value, a CEPH_OSD_OP_TYPE_* value (absent for sub-ops) and a small index; the numbers are protocol values and must not be renumbered */ | ||
| 180 | /** data **/ | ||
| 181 | /* read */ | ||
| 182 | CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, | ||
| 183 | CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, | ||
| 184 | |||
| 185 | /* fancy read */ | ||
| 186 | CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, | ||
| 187 | |||
| 188 | /* write */ | ||
| 189 | CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, | ||
| 190 | CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, | ||
| 191 | CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3, | ||
| 192 | CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4, | ||
| 193 | CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5, | ||
| 194 | |||
| 195 | /* fancy write */ | ||
| 196 | CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6, | ||
| 197 | CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7, | ||
| 198 | CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8, | ||
| 199 | CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9, | ||
| 200 | |||
| 201 | CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10, /* the only op here using MODE_RMW */ | ||
| 202 | CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11, | ||
| 203 | CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, | ||
| 204 | |||
| 205 | CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, | ||
| 206 | CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, | ||
| 207 | |||
| 208 | /** attrs **/ | ||
| 209 | /* read */ | ||
| 210 | CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, | ||
| 211 | CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, | ||
| 212 | CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3, | ||
| 213 | |||
| 214 | /* write */ | ||
| 215 | CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, | ||
| 216 | CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2, | ||
| 217 | CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3, | ||
| 218 | CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, | ||
| 219 | |||
| 220 | /** subop **/ | ||
| 221 | CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, | ||
| 222 | CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, | ||
| 223 | CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, | ||
| 224 | CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, | ||
| 225 | CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, | ||
| 226 | |||
| 227 | /** lock **/ | ||
| 228 | CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, | ||
| 229 | CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2, | ||
| 230 | CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3, /* NOTE(review): RDLOCK/RDUNLOCK carry MODE_WR, so ceph_osd_op_mode_read() is false for them -- presumably intentional; confirm before changing, these are wire values */ | ||
| 231 | CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4, | ||
| 232 | CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5, | ||
| 233 | CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, | ||
| 234 | |||
| 235 | /** exec **/ | ||
| 236 | CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, | ||
| 237 | |||
| 238 | /** pg **/ | ||
| 239 | CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, | ||
| 240 | }; | ||
| 241 | |||
| 242 | static inline int ceph_osd_op_type_lock(int op) /* type predicates: each returns 1 when the CEPH_OSD_OP_TYPE nibble (mask 0x0f00) of op equals the named type, else 0 */ | ||
| 243 | { | ||
| 244 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK; | ||
| 245 | } | ||
| 246 | static inline int ceph_osd_op_type_data(int op) /* 1 when op is a data op */ | ||
| 247 | { | ||
| 248 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; | ||
| 249 | } | ||
| 250 | static inline int ceph_osd_op_type_attr(int op) /* 1 when op is an xattr op */ | ||
| 251 | { | ||
| 252 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; | ||
| 253 | } | ||
| 254 | static inline int ceph_osd_op_type_exec(int op) /* 1 when op is an exec op */ | ||
| 255 | { | ||
| 256 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC; | ||
| 257 | } | ||
| 258 | static inline int ceph_osd_op_type_pg(int op) /* 1 when op is a pg op */ | ||
| 259 | { | ||
| 260 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; | ||
| 261 | } | ||
| 262 | |||
| 263 | static inline int ceph_osd_op_mode_subop(int op) /* mode predicates: each returns 1 when the CEPH_OSD_OP_MODE nibble (mask 0xf000) of op equals the named mode, else 0 */ | ||
| 264 | { | ||
| 265 | return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; | ||
| 266 | } | ||
| 267 | static inline int ceph_osd_op_mode_read(int op) /* exact match on MODE_RD: false for MODE_RMW ops even though they read */ | ||
| 268 | { | ||
| 269 | return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; | ||
| 270 | } | ||
| 271 | static inline int ceph_osd_op_mode_modify(int op) /* exact match on MODE_WR: false for MODE_RMW ops such as CEPH_OSD_OP_TMAPUP -- NOTE(review): confirm callers expect that */ | ||
| 272 | { | ||
| 273 | return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; | ||
| 274 | } | ||
| 275 | |||
| 276 | /* | ||
| 277 | * note that the following tmap stuff is also defined in the ceph librados.h | ||
| 278 | * any modification here needs to be updated there | ||
| 279 | */ | ||
| 280 | #define CEPH_OSD_TMAP_HDR 'h' | ||
| 281 | #define CEPH_OSD_TMAP_SET 's' | ||
| 282 | #define CEPH_OSD_TMAP_RM 'r' | ||
| 283 | |||
| 284 | extern const char *ceph_osd_op_name(int op); | ||
| 285 | |||
| 286 | |||
| 287 | /* | ||
| 288 | * osd op flags | ||
| 289 | * | ||
| 290 | * An op may be READ, WRITE, or READ|WRITE. | ||
| 291 | */ | ||
| 292 | enum { /* bit flags: every value is a distinct power of two, so they can be ORed together */ | ||
| 293 | CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ | ||
| 294 | CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ | ||
| 295 | CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ | ||
| 296 | CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ | ||
| 297 | CEPH_OSD_FLAG_READ = 16, /* op may read */ | ||
| 298 | CEPH_OSD_FLAG_WRITE = 32, /* op may write */ | ||
| 299 | CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ | ||
| 300 | CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ | ||
| 301 | CEPH_OSD_FLAG_BALANCE_READS = 256, | ||
| 302 | CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ | ||
| 303 | CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ | ||
| 304 | CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ | ||
| 305 | CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ | ||
| 306 | }; | ||
| 307 | |||
| 308 | enum { | ||
| 309 | CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ | ||
| 310 | }; | ||
| 311 | |||
| 312 | #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ | ||
| 313 | #define EBLACKLISTED ESHUTDOWN /* blacklisted */ | ||
| 314 | |||
| 315 | /* xattr comparison */ | ||
| 316 | enum { | ||
| 317 | CEPH_OSD_CMPXATTR_OP_NOP = 0, | ||
| 318 | CEPH_OSD_CMPXATTR_OP_EQ = 1, | ||
| 319 | CEPH_OSD_CMPXATTR_OP_NE = 2, | ||
| 320 | CEPH_OSD_CMPXATTR_OP_GT = 3, | ||
| 321 | CEPH_OSD_CMPXATTR_OP_GTE = 4, | ||
| 322 | CEPH_OSD_CMPXATTR_OP_LT = 5, | ||
| 323 | CEPH_OSD_CMPXATTR_OP_LTE = 6 | ||
| 324 | }; | ||
| 325 | |||
| 326 | enum { | ||
| 327 | CEPH_OSD_CMPXATTR_MODE_STRING = 1, | ||
| 328 | CEPH_OSD_CMPXATTR_MODE_U64 = 2 | ||
| 329 | }; | ||
| 330 | |||
| 331 | /* | ||
| 332 | * an individual object operation. each may be accompanied by some data | ||
| 333 | * payload | ||
| 334 | */ | ||
| 335 | struct ceph_osd_op { /* packed little-endian form of one operation; the struct and every union arm are packed, so there is no padding */ | ||
| 336 | __le16 op; /* CEPH_OSD_OP_* */ | ||
| 337 | __le32 flags; /* CEPH_OSD_FLAG_* */ | ||
| 338 | union { /* anonymous union; its size is that of the largest arm (extent: 8+8+8+4 bytes) */ | ||
| 339 | struct { | ||
| 340 | __le64 offset, length; | ||
| 341 | __le64 truncate_size; | ||
| 342 | __le32 truncate_seq; | ||
| 343 | } __attribute__ ((packed)) extent; | ||
| 344 | struct { /* only lengths are carried here; the name and value bytes are not part of this struct */ | ||
| 345 | __le32 name_len; | ||
| 346 | __le32 value_len; | ||
| 347 | __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ | ||
| 348 | __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ | ||
| 349 | } __attribute__ ((packed)) xattr; | ||
| 350 | struct { /* only lengths are carried here; class/method/input bytes are not part of this struct */ | ||
| 351 | __u8 class_len; | ||
| 352 | __u8 method_len; | ||
| 353 | __u8 argc; | ||
| 354 | __le32 indata_len; | ||
| 355 | } __attribute__ ((packed)) cls; | ||
| 356 | struct { | ||
| 357 | __le64 cookie, count; | ||
| 358 | } __attribute__ ((packed)) pgls; | ||
| 359 | struct { | ||
| 360 | __le64 snapid; | ||
| 361 | } __attribute__ ((packed)) snap; | ||
| 362 | }; | ||
| 363 | __le32 payload_len; /* presumably the byte count of data accompanying this op -- TODO confirm */ | ||
| 364 | } __attribute__ ((packed)); | ||
| 365 | |||
| 366 | /* | ||
| 367 | * osd request message header. each request may include multiple | ||
| 368 | * ceph_osd_op object operations. | ||
| 369 | */ | ||
| 370 | struct ceph_osd_request_head { /* packed little-endian header; field order defines the encoded layout, so do not reorder */ | ||
| 371 | __le32 client_inc; /* client incarnation */ | ||
| 372 | struct ceph_object_layout layout; /* pgid */ | ||
| 373 | __le32 osdmap_epoch; /* client's osdmap epoch */ | ||
| 374 | |||
| 375 | __le32 flags; | ||
| 376 | |||
| 377 | struct ceph_timespec mtime; /* for mutations only */ | ||
| 378 | struct ceph_eversion reassert_version; /* if we are replaying op */ | ||
| 379 | |||
| 380 | __le32 object_len; /* length of object name */ | ||
| 381 | |||
| 382 | __le64 snapid; /* snapid to read */ | ||
| 383 | __le64 snap_seq; /* writer's snap context */ | ||
| 384 | __le32 num_snaps; | ||
| 385 | |||
| 386 | __le16 num_ops; /* 16-bit here, whereas ceph_osd_reply_head.num_ops is 32-bit */ | ||
| 387 | struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ | ||
| 388 | } __attribute__ ((packed)); | ||
| 389 | |||
| 390 | struct ceph_osd_reply_head { /* packed little-endian reply header; field order defines the encoded layout, so do not reorder */ | ||
| 391 | __le32 client_inc; /* client incarnation */ | ||
| 392 | __le32 flags; | ||
| 393 | struct ceph_object_layout layout; | ||
| 394 | __le32 osdmap_epoch; | ||
| 395 | struct ceph_eversion reassert_version; /* for replaying uncommitted */ | ||
| 396 | |||
| 397 | __le32 result; /* result code */ | ||
| 398 | |||
| 399 | __le32 object_len; /* length of object name */ | ||
| 400 | __le32 num_ops; /* 32-bit here, whereas ceph_osd_request_head.num_ops is 16-bit */ | ||
| 401 | struct ceph_osd_op ops[]; /* ops[], object; C99 flexible array member, same form as ceph_osd_request_head.ops, adds no bytes to sizeof */ | ||
| 402 | } __attribute__ ((packed)); | ||
| 403 | |||
| 404 | |||
| 405 | #endif | ||
diff --git a/include/linux/ceph/types.h b/include/linux/ceph/types.h new file mode 100644 index 000000000000..28b35a005ec2 --- /dev/null +++ b/include/linux/ceph/types.h | |||
| @@ -0,0 +1,29 @@ | |||
| 1 | #ifndef _FS_CEPH_TYPES_H | ||
| 2 | #define _FS_CEPH_TYPES_H | ||
| 3 | |||
| 4 | /* needed before including ceph_fs.h */ | ||
| 5 | #include <linux/in.h> | ||
| 6 | #include <linux/types.h> | ||
| 7 | #include <linux/fcntl.h> | ||
| 8 | #include <linux/string.h> | ||
| 9 | |||
| 10 | #include "ceph_fs.h" | ||
| 11 | #include "ceph_frag.h" | ||
| 12 | #include "ceph_hash.h" | ||
| 13 | |||
| 14 | /* | ||
| 15 | * Identify inodes by both their ino AND snapshot id (a u64). | ||
| 16 | */ | ||
| 17 | struct ceph_vino { /* (inode number, snapshot id) pair, both plain CPU-order u64 */ | ||
| 18 | u64 ino; | ||
| 19 | u64 snap; | ||
| 20 | }; | ||
| 21 | |||
| 22 | |||
| 23 | /* context for the caps reservation mechanism */ | ||
| 24 | struct ceph_cap_reservation { | ||
| 25 | int count; /* presumably the number of caps currently reserved -- TODO confirm against the caps code */ | ||
| 26 | }; | ||
| 27 | |||
| 28 | |||
| 29 | #endif | ||
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h new file mode 100644 index 000000000000..97e435b191f4 --- /dev/null +++ b/include/linux/crush/crush.h | |||
| @@ -0,0 +1,180 @@ | |||
| 1 | #ifndef CEPH_CRUSH_CRUSH_H | ||
| 2 | #define CEPH_CRUSH_CRUSH_H | ||
| 3 | |||
| 4 | #include <linux/types.h> | ||
| 5 | |||
| 6 | /* | ||
| 7 | * CRUSH is a pseudo-random data distribution algorithm that | ||
| 8 | * efficiently distributes input values (typically, data objects) | ||
| 9 | * across a heterogeneous, structured storage cluster. | ||
| 10 | * | ||
| 11 | * The algorithm was originally described in detail in this paper | ||
| 12 | * (although the algorithm has evolved somewhat since then): | ||
| 13 | * | ||
| 14 | * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf | ||
| 15 | * | ||
| 16 | * LGPL2 | ||
| 17 | */ | ||
| 18 | |||
| 19 | |||
| 20 | #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ | ||
| 21 | |||
| 22 | |||
| 23 | #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ | ||
| 24 | #define CRUSH_MAX_SET 10 /* max size of a mapping result */ | ||
| 25 | |||
| 26 | |||
| 27 | /* | ||
| 28 | * CRUSH uses user-defined "rules" to describe how inputs should be | ||
| 29 | * mapped to devices. A rule consists of sequence of steps to perform | ||
| 30 | * to generate the set of output devices. | ||
| 31 | */ | ||
| 32 | struct crush_rule_step { /* one step of a rule: an op code and two signed arguments */ | ||
| 33 | __u32 op; /* a CRUSH_RULE_* step op code (see the enum that follows) */ | ||
| 34 | __s32 arg1; /* meaning depends on op, as described beside each op code */ | ||
| 35 | __s32 arg2; /* meaning depends on op, as described beside each op code */ | ||
| 36 | }; | ||
| 37 | |||
| 38 | /* step op codes; note that the value 5 is not assigned */ | ||
| 39 | enum { | ||
| 40 | CRUSH_RULE_NOOP = 0, | ||
| 41 | CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */ | ||
| 42 | CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */ | ||
| 43 | /* arg2 = type */ | ||
| 44 | CRUSH_RULE_CHOOSE_INDEP = 3, /* same */ | ||
| 45 | CRUSH_RULE_EMIT = 4, /* no args */ | ||
| 46 | CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, | ||
| 47 | CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, | ||
| 48 | }; | ||
| 49 | |||
| 50 | /* | ||
| 51 | * for specifying choose num (arg1) relative to the max parameter | ||
| 52 | * passed to do_rule | ||
| 53 | */ | ||
| 54 | #define CRUSH_CHOOSE_N 0 | ||
| 55 | #define CRUSH_CHOOSE_N_MINUS(x) (-(x)) | ||
| 56 | |||
| 57 | /* | ||
| 58 | * The rule mask is used to describe what the rule is intended for. | ||
| 59 | * Given a ruleset and size of output set, we search through the | ||
| 60 | * rule list for a matching rule_mask. | ||
| 61 | */ | ||
| 62 | struct crush_rule_mask { /* four 8-bit fields used to match a rule, per the comment preceding this struct */ | ||
| 63 | __u8 ruleset; | ||
| 64 | __u8 type; | ||
| 65 | __u8 min_size; /* presumably the smallest output-set size this rule applies to -- TODO confirm */ | ||
| 66 | __u8 max_size; /* presumably the largest output-set size this rule applies to -- TODO confirm */ | ||
| 67 | }; | ||
| 68 | |||
| 69 | struct crush_rule { | ||
| 70 | __u32 len; | ||
| 71 | struct crush_rule_mask mask; | ||
| 72 | struct crush_rule_step steps[0]; | ||
| 73 | }; | ||
| 74 | |||
| 75 | #define crush_rule_size(len) (sizeof(struct crush_rule) + \ | ||
| 76 | (len)*sizeof(struct crush_rule_step)) | ||
| 77 | |||
| 78 | |||
| 79 | |||
| 80 | /* | ||
| 81 | * A bucket is a named container of other items (either devices or | ||
| 82 | * other buckets). Items within a bucket are chosen using one of a | ||
| 83 | * few different algorithms. The table summarizes how the speed of | ||
| 84 | * each option measures up against mapping stability when items are | ||
| 85 | * added or removed. | ||
| 86 | * | ||
| 87 | * Bucket Alg Speed Additions Removals | ||
| 88 | * ------------------------------------------------ | ||
| 89 | * uniform O(1) poor poor | ||
| 90 | * list O(n) optimal poor | ||
| 91 | * tree O(log n) good good | ||
| 92 | * straw O(n) optimal optimal | ||
| 93 | */ | ||
| 94 | enum { | ||
| 95 | CRUSH_BUCKET_UNIFORM = 1, | ||
| 96 | CRUSH_BUCKET_LIST = 2, | ||
| 97 | CRUSH_BUCKET_TREE = 3, | ||
| 98 | CRUSH_BUCKET_STRAW = 4 | ||
| 99 | }; | ||
| 100 | extern const char *crush_bucket_alg_name(int alg); | ||
| 101 | |||
| 102 | struct crush_bucket { | ||
| 103 | __s32 id; /* this'll be negative */ | ||
| 104 | __u16 type; /* non-zero; type=0 is reserved for devices */ | ||
| 105 | __u8 alg; /* one of CRUSH_BUCKET_* */ | ||
| 106 | __u8 hash; /* which hash function to use, CRUSH_HASH_* */ | ||
| 107 | __u32 weight; /* 16-bit fixed point */ | ||
| 108 | __u32 size; /* num items */ | ||
| 109 | __s32 *items; | ||
| 110 | |||
| 111 | /* | ||
| 112 | * cached random permutation: used for uniform bucket and for | ||
| 113 | * the linear search fallback for the other bucket types. | ||
| 114 | */ | ||
| 115 | __u32 perm_x; /* @x for which *perm is defined */ | ||
| 116 | __u32 perm_n; /* num elements of *perm that are permuted/defined */ | ||
| 117 | __u32 *perm; | ||
| 118 | }; | ||
| 119 | |||
| 120 | struct crush_bucket_uniform { | ||
| 121 | struct crush_bucket h; | ||
| 122 | __u32 item_weight; /* 16-bit fixed point; all items equally weighted */ | ||
| 123 | }; | ||
| 124 | |||
| 125 | struct crush_bucket_list { | ||
| 126 | struct crush_bucket h; | ||
| 127 | __u32 *item_weights; /* 16-bit fixed point */ | ||
| 128 | __u32 *sum_weights; /* 16-bit fixed point. element i is sum | ||
| 129 | of weights 0..i, inclusive */ | ||
| 130 | }; | ||
| 131 | |||
| 132 | struct crush_bucket_tree { | ||
| 133 | struct crush_bucket h; /* note: h.size is _tree_ size, not number of | ||
| 134 | actual items */ | ||
| 135 | __u8 num_nodes; | ||
| 136 | __u32 *node_weights; | ||
| 137 | }; | ||
| 138 | |||
| 139 | struct crush_bucket_straw { | ||
| 140 | struct crush_bucket h; | ||
| 141 | __u32 *item_weights; /* 16-bit fixed point */ | ||
| 142 | __u32 *straws; /* 16-bit fixed point */ | ||
| 143 | }; | ||
| 144 | |||
| 145 | |||
| 146 | |||
| 147 | /* | ||
| 148 | * CRUSH map includes all buckets, rules, etc. | ||
| 149 | */ | ||
| 150 | struct crush_map { | ||
| 151 | struct crush_bucket **buckets; | ||
| 152 | struct crush_rule **rules; | ||
| 153 | |||
| 154 | /* | ||
| 155 | * Parent pointers to identify the parent bucket of a device or | ||
| 156 | * bucket in the hierarchy. If an item appears more than | ||
| 157 | * once, this is the _last_ time it appeared (where buckets | ||
| 158 | * are processed in bucket id order, from -1 on down to | ||
| 159 | * -max_buckets). | ||
| 160 | */ | ||
| 161 | __u32 *bucket_parents; | ||
| 162 | __u32 *device_parents; | ||
| 163 | |||
| 164 | __s32 max_buckets; | ||
| 165 | __u32 max_rules; | ||
| 166 | __s32 max_devices; | ||
| 167 | }; | ||
| 168 | |||
| 169 | |||
| 170 | /* crush.c */ | ||
| 171 | extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos); | ||
| 172 | extern void crush_calc_parents(struct crush_map *map); | ||
| 173 | extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); | ||
| 174 | extern void crush_destroy_bucket_list(struct crush_bucket_list *b); | ||
| 175 | extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); | ||
| 176 | extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); | ||
| 177 | extern void crush_destroy_bucket(struct crush_bucket *b); | ||
| 178 | extern void crush_destroy(struct crush_map *map); | ||
| 179 | |||
| 180 | #endif | ||
diff --git a/include/linux/crush/hash.h b/include/linux/crush/hash.h new file mode 100644 index 000000000000..91e884230d5d --- /dev/null +++ b/include/linux/crush/hash.h | |||
| @@ -0,0 +1,17 @@ | |||
| 1 | #ifndef CEPH_CRUSH_HASH_H | ||
| 2 | #define CEPH_CRUSH_HASH_H | ||
| 3 | |||
| 4 | #define CRUSH_HASH_RJENKINS1 0 | ||
| 5 | |||
| 6 | #define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1 | ||
| 7 | |||
| 8 | extern const char *crush_hash_name(int type); | ||
| 9 | |||
| 10 | extern __u32 crush_hash32(int type, __u32 a); | ||
| 11 | extern __u32 crush_hash32_2(int type, __u32 a, __u32 b); | ||
| 12 | extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c); | ||
| 13 | extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d); | ||
| 14 | extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, | ||
| 15 | __u32 e); | ||
| 16 | |||
| 17 | #endif | ||
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h new file mode 100644 index 000000000000..c46b99c18bb0 --- /dev/null +++ b/include/linux/crush/mapper.h | |||
| @@ -0,0 +1,20 @@ | |||
| 1 | #ifndef CEPH_CRUSH_MAPPER_H | ||
| 2 | #define CEPH_CRUSH_MAPPER_H | ||
| 3 | |||
| 4 | /* | ||
| 5 | * CRUSH functions for finding rules and then mapping an input to an | ||
| 6 | * output set. | ||
| 7 | * | ||
| 8 | * LGPL2 | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include "crush.h" | ||
| 12 | |||
| 13 | extern int crush_find_rule(struct crush_map *map, int pool, int type, int size); | ||
| 14 | extern int crush_do_rule(struct crush_map *map, | ||
| 15 | int ruleno, | ||
| 16 | int x, int *result, int result_max, | ||
| 17 | int forcefeed, /* -1 for none */ | ||
| 18 | __u32 *weights); | ||
| 19 | |||
| 20 | #endif | ||
