aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-10-21 15:38:28 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-10-21 15:38:28 -0400
commit2017bd19454ea7cdae19922d15b6930f6c8088a2 (patch)
tree53974657ab3a2c98f2da7b3fcb050ff5b697f876 /net
parent9f1ad09493451c19d00c004da479acf699eeedd6 (diff)
parentefa4c1206eaff047c474af2136748a58eb8cc33b (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (22 commits) ceph: do not carry i_lock for readdir from dcache fs/ceph/xattr.c: Use kmemdup rbd: passing wrong variable to bvec_kunmap_irq() rbd: null vs ERR_PTR ceph: fix num_pages_free accounting in pagelist ceph: add CEPH_MDS_OP_SETDIRLAYOUT and associated ioctl. ceph: don't crash when passed bad mount options ceph: fix debugfs warnings block: rbd: removing unnecessary test block: rbd: fixed may leaks ceph: switch from BKL to lock_flocks() ceph: preallocate flock state without locks held ceph: add pagelist_reserve, pagelist_truncate, pagelist_set_cursor ceph: use mapping->nrpages to determine if mapping is empty ceph: only invalidate on check_caps if we actually have pages ceph: do not hide .snap in root directory rbd: introduce rados block device (rbd), based on libceph ceph: factor out libceph from Ceph file system ceph-rbd: osdc support for osd call and rollback operations ceph: messenger and osdc changes for rbd ...
Diffstat (limited to 'net')
-rw-r--r--net/Kconfig1
-rw-r--r--net/Makefile1
-rw-r--r--net/ceph/Kconfig28
-rw-r--r--net/ceph/Makefile37
-rw-r--r--net/ceph/armor.c103
-rw-r--r--net/ceph/auth.c259
-rw-r--r--net/ceph/auth_none.c132
-rw-r--r--net/ceph/auth_none.h29
-rw-r--r--net/ceph/auth_x.c688
-rw-r--r--net/ceph/auth_x.h50
-rw-r--r--net/ceph/auth_x_protocol.h90
-rw-r--r--net/ceph/buffer.c68
-rw-r--r--net/ceph/ceph_common.c529
-rw-r--r--net/ceph/ceph_fs.c75
-rw-r--r--net/ceph/ceph_hash.c118
-rw-r--r--net/ceph/ceph_strings.c84
-rw-r--r--net/ceph/crush/crush.c151
-rw-r--r--net/ceph/crush/hash.c149
-rw-r--r--net/ceph/crush/mapper.c609
-rw-r--r--net/ceph/crypto.c412
-rw-r--r--net/ceph/crypto.h48
-rw-r--r--net/ceph/debugfs.c267
-rw-r--r--net/ceph/messenger.c2453
-rw-r--r--net/ceph/mon_client.c1027
-rw-r--r--net/ceph/msgpool.c64
-rw-r--r--net/ceph/osd_client.c1773
-rw-r--r--net/ceph/osdmap.c1128
-rw-r--r--net/ceph/pagelist.c154
-rw-r--r--net/ceph/pagevec.c223
29 files changed, 10750 insertions, 0 deletions
diff --git a/net/Kconfig b/net/Kconfig
index e926884c1675..55fd82e9ffd9 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -293,6 +293,7 @@ source "net/wimax/Kconfig"
293source "net/rfkill/Kconfig" 293source "net/rfkill/Kconfig"
294source "net/9p/Kconfig" 294source "net/9p/Kconfig"
295source "net/caif/Kconfig" 295source "net/caif/Kconfig"
296source "net/ceph/Kconfig"
296 297
297 298
298endif # if NET 299endif # if NET
diff --git a/net/Makefile b/net/Makefile
index ea60fbce9b1b..6b7bfd7f1416 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL) += sysctl_net.o
68endif 68endif
69obj-$(CONFIG_WIMAX) += wimax/ 69obj-$(CONFIG_WIMAX) += wimax/
70obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ 70obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
71obj-$(CONFIG_CEPH_LIB) += ceph/
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 000000000000..ad424049b0cf
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,28 @@
1config CEPH_LIB
2 tristate "Ceph core library (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
5 select CRYPTO_AES
6 select CRYPTO
7 default n
8 help
9 Choose Y or M here to include cephlib, which provides the
10 common functionality to both the Ceph filesystem and
11 to the rados block device (rbd).
12
13 More information at http://ceph.newdream.net/.
14
15 If unsure, say N.
16
17config CEPH_LIB_PRETTYDEBUG
18 bool "Include file:line in ceph debug output"
19 depends on CEPH_LIB
20 default n
21 help
22 If you say Y here, debug output will include a filename and
23 line to aid debugging. This increases kernel size and slows
24 execution slightly when debug call sites are enabled (e.g.,
25 via CONFIG_DYNAMIC_DEBUG).
26
27 If unsure, say N.
28
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 000000000000..aab1cabb8035
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,37 @@
1#
2# Makefile for the Ceph core library (libceph).
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_LIB) += libceph.o
8
9libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
10 mon_client.o \
11 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
12 debugfs.o \
13 auth.o auth_none.o \
14 crypto.o armor.o \
15 auth_x.o \
16 ceph_fs.o ceph_strings.o ceph_hash.o \
17 pagevec.o
18
19else
20#Otherwise we were called directly from the command
21# line; invoke the kernel build system.
22
23KERNELDIR ?= /lib/modules/$(shell uname -r)/build
24PWD := $(shell pwd)
25
26default: all
27
28all:
29 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
30
31modules_install:
32 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
33
34clean:
35 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
36
37endif
diff --git a/net/ceph/armor.c b/net/ceph/armor.c
new file mode 100644
index 000000000000..eb2a666b0be7
--- /dev/null
+++ b/net/ceph/armor.c
@@ -0,0 +1,103 @@
1
2#include <linux/errno.h>
3
int ceph_armor(char *dst, const char *src, const char *end);
int ceph_unarmor(char *dst, const char *src, const char *end);

/*
 * base64 encode/decode.
 */

static const char *pem_key =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Map a 6-bit value (0..63) to its base64 alphabet character. */
static int encode_bits(int c)
{
	return pem_key[c];
}

/*
 * Map a base64 character back to its 6-bit value.
 *
 * The padding character '=' decodes to 0 so that callers only have to
 * test for a negative result; any character outside the alphabet
 * yields -EINVAL.
 */
static int decode_bits(char c)
{
	if (c >= 'A' && c <= 'Z')
		return c - 'A';
	if (c >= 'a' && c <= 'z')
		return c - 'a' + 26;
	if (c >= '0' && c <= '9')
		return c - '0' + 52;
	if (c == '+')
		return 62;
	if (c == '/')
		return 63;
	if (c == '=')
		return 0; /* just non-negative, please */
	return -EINVAL;
}

/*
 * Base64-encode the bytes [src, end) into dst.
 *
 * A '\n' is emitted after every 64 output characters (including when
 * the input ends exactly on that boundary).  No NUL terminator is
 * written.  The caller must provide a dst large enough for the result.
 *
 * Returns the number of bytes written to dst.
 */
int ceph_armor(char *dst, const char *src, const char *end)
{
	int olen = 0;
	int line = 0;

	while (src < end) {
		unsigned char a, b, c;

		a = *src++;
		*dst++ = encode_bits(a >> 2);
		if (src < end) {
			b = *src++;
			*dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
			if (src < end) {
				c = *src++;
				*dst++ = encode_bits(((b & 15) << 2) |
						     (c >> 6));
				*dst++ = encode_bits(c & 63);
			} else {
				/* two input bytes left: one pad char */
				*dst++ = encode_bits((b & 15) << 2);
				*dst++ = '=';
			}
		} else {
			/* one input byte left: two pad chars */
			*dst++ = encode_bits(((a & 3) << 4));
			*dst++ = '=';
			*dst++ = '=';
		}
		olen += 4;
		line += 4;
		if (line == 64) {
			line = 0;
			*(dst++) = '\n';
			olen++;
		}
	}
	return olen;
}

/*
 * Decode the base64 text [src, end) into dst.
 *
 * Newlines between 4-character groups are skipped, including a
 * trailing newline such as the one ceph_armor() emits when its output
 * ends on a 64-character boundary.  Decoding stops at the first group
 * that carries '=' padding.
 *
 * Returns the number of decoded bytes, or -EINVAL if the input is
 * truncated or contains a character outside the base64 alphabet.
 */
int ceph_unarmor(char *dst, const char *src, const char *end)
{
	int olen = 0;

	while (src < end) {
		int a, b, c, d;

		if (src[0] == '\n') {
			/*
			 * Re-test the loop condition: the newline may have
			 * been the last input byte, which is not an error.
			 */
			src++;
			continue;
		}
		if (src + 4 > end)
			return -EINVAL;
		a = decode_bits(src[0]);
		b = decode_bits(src[1]);
		c = decode_bits(src[2]);
		d = decode_bits(src[3]);
		if (a < 0 || b < 0 || c < 0 || d < 0)
			return -EINVAL;

		*dst++ = (a << 2) | (b >> 4);
		if (src[2] == '=')
			return olen + 1;
		*dst++ = ((b & 15) << 4) | (c >> 2);
		if (src[3] == '=')
			return olen + 2;
		*dst++ = ((c & 3) << 6) | d;
		olen += 3;
		src += 4;
	}
	return olen;
}
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
new file mode 100644
index 000000000000..549c1f43e1d5
--- /dev/null
+++ b/net/ceph/auth.c
@@ -0,0 +1,259 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/err.h>
5#include <linux/slab.h>
6
7#include <linux/ceph/types.h>
8#include <linux/ceph/decode.h>
9#include <linux/ceph/libceph.h>
10#include <linux/ceph/messenger.h>
11#include "auth_none.h"
12#include "auth_x.h"
13
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
135
136static int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret;
155 }
156 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf;
159}
160
161/*
162 * Handle auth message from monitor.
163 */
164int ceph_handle_auth_reply(struct ceph_auth_client *ac,
165 void *buf, size_t len,
166 void *reply_buf, size_t reply_len)
167{
168 void *p = buf;
169 void *end = buf + len;
170 int protocol;
171 s32 result;
172 u64 global_id;
173 void *payload, *payload_end;
174 int payload_len;
175 char *result_msg;
176 int result_msg_len;
177 int ret = -EINVAL;
178
179 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p);
182 result = ceph_decode_32(&p);
183 global_id = ceph_decode_64(&p);
184 payload_len = ceph_decode_32(&p);
185 payload = p;
186 p += payload_len;
187 ceph_decode_need(&p, end, sizeof(u32), bad);
188 result_msg_len = ceph_decode_32(&p);
189 result_msg = p;
190 p += result_msg_len;
191 if (p != end)
192 goto bad;
193
194 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
195 result_msg, global_id, payload_len);
196
197 payload_end = payload + payload_len;
198
199 if (global_id && ac->global_id != global_id) {
200 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
201 ac->global_id = global_id;
202 }
203
204 if (ac->negotiating) {
205 /* server does not support our protocols? */
206 if (!protocol && result < 0) {
207 ret = result;
208 goto out;
209 }
210 /* set up (new) protocol handler? */
211 if (ac->protocol && ac->protocol != protocol) {
212 ac->ops->destroy(ac);
213 ac->protocol = 0;
214 ac->ops = NULL;
215 }
216 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) {
219 pr_err("error %d on auth protocol %d init\n",
220 ret, protocol);
221 goto out;
222 }
223 }
224
225 ac->negotiating = false;
226 }
227
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) {
232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret;
234 }
235 return 0;
236
237bad:
238 pr_err("failed to decode auth msg\n");
239out:
240 return ret;
241}
242
243int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len)
245{
246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops);
249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0;
252}
253
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{
256 if (!ac->ops)
257 return 0;
258 return ac->ops->is_authenticated(ac);
259}
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
new file mode 100644
index 000000000000..214c2bb43d62
--- /dev/null
+++ b/net/ceph/auth_none.c
@@ -0,0 +1,132 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include <linux/ceph/decode.h>
10#include <linux/ceph/auth.h>
11
12#include "auth_none.h"
13
14static void reset(struct ceph_auth_client *ac)
15{
16 struct ceph_auth_none_info *xi = ac->private;
17
18 xi->starting = true;
19 xi->built_authorizer = false;
20}
21
22static void destroy(struct ceph_auth_client *ac)
23{
24 kfree(ac->private);
25 ac->private = NULL;
26}
27
28static int is_authenticated(struct ceph_auth_client *ac)
29{
30 struct ceph_auth_none_info *xi = ac->private;
31
32 return !xi->starting;
33}
34
35static int should_authenticate(struct ceph_auth_client *ac)
36{
37 struct ceph_auth_none_info *xi = ac->private;
38
39 return xi->starting;
40}
41
42/*
43 * the generic auth code decode the global_id, and we carry no actual
44 * authenticate state, so nothing happens here.
45 */
46static int handle_reply(struct ceph_auth_client *ac, int result,
47 void *buf, void *end)
48{
49 struct ceph_auth_none_info *xi = ac->private;
50
51 xi->starting = false;
52 return result;
53}
54
55/*
56 * build an 'authorizer' with our entity_name and global_id. we can
57 * reuse a single static copy since it is identical for all services
58 * we connect to.
59 */
60static int ceph_auth_none_create_authorizer(
61 struct ceph_auth_client *ac, int peer_type,
62 struct ceph_authorizer **a,
63 void **buf, size_t *len,
64 void **reply_buf, size_t *reply_len)
65{
66 struct ceph_auth_none_info *ai = ac->private;
67 struct ceph_none_authorizer *au = &ai->au;
68 void *p, *end;
69 int ret;
70
71 if (!ai->built_authorizer) {
72 p = au->buf;
73 end = p + sizeof(au->buf);
74 ceph_encode_8(&p, 1);
75 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
76 if (ret < 0)
77 goto bad;
78 ceph_decode_need(&p, end, sizeof(u64), bad2);
79 ceph_encode_64(&p, ac->global_id);
80 au->buf_len = p - (void *)au->buf;
81 ai->built_authorizer = true;
82 dout("built authorizer len %d\n", au->buf_len);
83 }
84
85 *a = (struct ceph_authorizer *)au;
86 *buf = au->buf;
87 *len = au->buf_len;
88 *reply_buf = au->reply_buf;
89 *reply_len = sizeof(au->reply_buf);
90 return 0;
91
92bad2:
93 ret = -ERANGE;
94bad:
95 return ret;
96}
97
/*
 * The single authorizer lives inside the per-client state, so there is
 * nothing to release here.
 */
static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
					      struct ceph_authorizer *a)
{
}
103
104static const struct ceph_auth_client_ops ceph_auth_none_ops = {
105 .name = "none",
106 .reset = reset,
107 .destroy = destroy,
108 .is_authenticated = is_authenticated,
109 .should_authenticate = should_authenticate,
110 .handle_reply = handle_reply,
111 .create_authorizer = ceph_auth_none_create_authorizer,
112 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
113};
114
115int ceph_auth_none_init(struct ceph_auth_client *ac)
116{
117 struct ceph_auth_none_info *xi;
118
119 dout("ceph_auth_none_init %p\n", ac);
120 xi = kzalloc(sizeof(*xi), GFP_NOFS);
121 if (!xi)
122 return -ENOMEM;
123
124 xi->starting = true;
125 xi->built_authorizer = false;
126
127 ac->protocol = CEPH_AUTH_NONE;
128 ac->private = xi;
129 ac->ops = &ceph_auth_none_ops;
130 return 0;
131}
132
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
new file mode 100644
index 000000000000..ed7d088b1bc9
--- /dev/null
+++ b/net/ceph/auth_none.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include <linux/slab.h>
5#include <linux/ceph/auth.h>
6
7/*
8 * null security mode.
9 *
10 * we use a single static authorizer that simply encodes our entity name
11 * and global id.
12 */
13
/* Authorizer for the "none" protocol; one instance is embedded in ceph_auth_none_info. */
14struct ceph_none_authorizer {
15 char buf[128]; /* encoded authorizer: version byte, entity name, global_id */
16 int buf_len; /* number of bytes of buf actually used */
17 char reply_buf[0]; /* zero-length, so the reply length handed out is 0 */
18};
19
/* Per-client state of the "none" protocol, hung off ceph_auth_client.private. */
20struct ceph_auth_none_info {
21 bool starting; /* true until the first monitor reply has been handled */
22 bool built_authorizer; /* true once au.buf holds a valid encoding */
23 struct ceph_none_authorizer au; /* we only need one; it's static */
24};
25
26extern int ceph_auth_none_init(struct ceph_auth_client *ac);
27
28#endif
29
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
new file mode 100644
index 000000000000..7fd5dfcf6e18
--- /dev/null
+++ b/net/ceph/auth_x.c
@@ -0,0 +1,688 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include <linux/ceph/decode.h>
10#include <linux/ceph/auth.h>
11
12#include "crypto.h"
13#include "auth_x.h"
14#include "auth_x_protocol.h"
15
16#define TEMP_TICKET_BUF_LEN 256
17
18static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
19
20static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
21{
22 struct ceph_x_info *xi = ac->private;
23 int need;
24
25 ceph_x_validate_tickets(ac, &need);
26 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
27 ac->want_keys, need, xi->have_keys);
28 return (ac->want_keys & xi->have_keys) == ac->want_keys;
29}
30
31static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
32{
33 struct ceph_x_info *xi = ac->private;
34 int need;
35
36 ceph_x_validate_tickets(ac, &need);
37 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
38 ac->want_keys, need, xi->have_keys);
39 return need != 0;
40}
41
42static int ceph_x_encrypt_buflen(int ilen)
43{
44 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
45 sizeof(u32);
46}
47
48static int ceph_x_encrypt(struct ceph_crypto_key *secret,
49 void *ibuf, int ilen, void *obuf, size_t olen)
50{
51 struct ceph_x_encrypt_header head = {
52 .struct_v = 1,
53 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
54 };
55 size_t len = olen - sizeof(u32);
56 int ret;
57
58 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
59 &head, sizeof(head), ibuf, ilen);
60 if (ret)
61 return ret;
62 ceph_encode_32(&obuf, len);
63 return len + sizeof(u32);
64}
65
66static int ceph_x_decrypt(struct ceph_crypto_key *secret,
67 void **p, void *end, void *obuf, size_t olen)
68{
69 struct ceph_x_encrypt_header head;
70 size_t head_len = sizeof(head);
71 int len, ret;
72
73 len = ceph_decode_32(p);
74 if (*p + len > end)
75 return -EINVAL;
76
77 dout("ceph_x_decrypt len %d\n", len);
78 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
79 *p, len);
80 if (ret)
81 return ret;
82 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
83 return -EPERM;
84 *p += len;
85 return olen;
86}
87
88/*
89 * get existing (or insert new) ticket handler
90 */
91static struct ceph_x_ticket_handler *
92get_ticket_handler(struct ceph_auth_client *ac, int service)
93{
94 struct ceph_x_ticket_handler *th;
95 struct ceph_x_info *xi = ac->private;
96 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
97
98 while (*p) {
99 parent = *p;
100 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
101 if (service < th->service)
102 p = &(*p)->rb_left;
103 else if (service > th->service)
104 p = &(*p)->rb_right;
105 else
106 return th;
107 }
108
109 /* add it */
110 th = kzalloc(sizeof(*th), GFP_NOFS);
111 if (!th)
112 return ERR_PTR(-ENOMEM);
113 th->service = service;
114 rb_link_node(&th->node, parent, p);
115 rb_insert_color(&th->node, &xi->ticket_handlers);
116 return th;
117}
118
119static void remove_ticket_handler(struct ceph_auth_client *ac,
120 struct ceph_x_ticket_handler *th)
121{
122 struct ceph_x_info *xi = ac->private;
123
124 dout("remove_ticket_handler %p %d\n", th, th->service);
125 rb_erase(&th->node, &xi->ticket_handlers);
126 ceph_crypto_key_destroy(&th->session_key);
127 if (th->ticket_blob)
128 ceph_buffer_put(th->ticket_blob);
129 kfree(th);
130}
131
132static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
133 struct ceph_crypto_key *secret,
134 void *buf, void *end)
135{
136 struct ceph_x_info *xi = ac->private;
137 int num;
138 void *p = buf;
139 int ret;
140 char *dbuf;
141 char *ticket_buf;
142 u8 reply_struct_v;
143
144 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
145 if (!dbuf)
146 return -ENOMEM;
147
148 ret = -ENOMEM;
149 ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
150 if (!ticket_buf)
151 goto out_dbuf;
152
153 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
154 reply_struct_v = ceph_decode_8(&p);
155 if (reply_struct_v != 1)
156 goto bad;
157 num = ceph_decode_32(&p);
158 dout("%d tickets\n", num);
159 while (num--) {
160 int type;
161 u8 tkt_struct_v, blob_struct_v;
162 struct ceph_x_ticket_handler *th;
163 void *dp, *dend;
164 int dlen;
165 char is_enc;
166 struct timespec validity;
167 struct ceph_crypto_key old_key;
168 void *tp, *tpend;
169 struct ceph_timespec new_validity;
170 struct ceph_crypto_key new_session_key;
171 struct ceph_buffer *new_ticket_blob;
172 unsigned long new_expires, new_renew_after;
173 u64 new_secret_id;
174
175 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
176
177 type = ceph_decode_32(&p);
178 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
179
180 tkt_struct_v = ceph_decode_8(&p);
181 if (tkt_struct_v != 1)
182 goto bad;
183
184 th = get_ticket_handler(ac, type);
185 if (IS_ERR(th)) {
186 ret = PTR_ERR(th);
187 goto out;
188 }
189
190 /* blob for me */
191 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
192 TEMP_TICKET_BUF_LEN);
193 if (dlen <= 0) {
194 ret = dlen;
195 goto out;
196 }
197 dout(" decrypted %d bytes\n", dlen);
198 dend = dbuf + dlen;
199 dp = dbuf;
200
201 tkt_struct_v = ceph_decode_8(&dp);
202 if (tkt_struct_v != 1)
203 goto bad;
204
205 memcpy(&old_key, &th->session_key, sizeof(old_key));
206 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
207 if (ret)
208 goto out;
209
210 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
211 ceph_decode_timespec(&validity, &new_validity);
212 new_expires = get_seconds() + validity.tv_sec;
213 new_renew_after = new_expires - (validity.tv_sec / 4);
214 dout(" expires=%lu renew_after=%lu\n", new_expires,
215 new_renew_after);
216
217 /* ticket blob for service */
218 ceph_decode_8_safe(&p, end, is_enc, bad);
219 tp = ticket_buf;
220 if (is_enc) {
221 /* encrypted */
222 dout(" encrypted ticket\n");
223 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
224 TEMP_TICKET_BUF_LEN);
225 if (dlen < 0) {
226 ret = dlen;
227 goto out;
228 }
229 dlen = ceph_decode_32(&tp);
230 } else {
231 /* unencrypted */
232 ceph_decode_32_safe(&p, end, dlen, bad);
233 ceph_decode_need(&p, end, dlen, bad);
234 ceph_decode_copy(&p, ticket_buf, dlen);
235 }
236 tpend = tp + dlen;
237 dout(" ticket blob is %d bytes\n", dlen);
238 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
239 blob_struct_v = ceph_decode_8(&tp);
240 new_secret_id = ceph_decode_64(&tp);
241 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
242 if (ret)
243 goto out;
244
245 /* all is well, update our ticket */
246 ceph_crypto_key_destroy(&th->session_key);
247 if (th->ticket_blob)
248 ceph_buffer_put(th->ticket_blob);
249 th->session_key = new_session_key;
250 th->ticket_blob = new_ticket_blob;
251 th->validity = new_validity;
252 th->secret_id = new_secret_id;
253 th->expires = new_expires;
254 th->renew_after = new_renew_after;
255 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
256 type, ceph_entity_type_name(type), th->secret_id,
257 (int)th->ticket_blob->vec.iov_len);
258 xi->have_keys |= th->service;
259 }
260
261 ret = 0;
262out:
263 kfree(ticket_buf);
264out_dbuf:
265 kfree(dbuf);
266 return ret;
267
268bad:
269 ret = -EINVAL;
270 goto out;
271}
272
273static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
274 struct ceph_x_ticket_handler *th,
275 struct ceph_x_authorizer *au)
276{
277 int maxlen;
278 struct ceph_x_authorize_a *msg_a;
279 struct ceph_x_authorize_b msg_b;
280 void *p, *end;
281 int ret;
282 int ticket_blob_len =
283 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
284
285 dout("build_authorizer for %s %p\n",
286 ceph_entity_type_name(th->service), au);
287
288 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
289 ceph_x_encrypt_buflen(ticket_blob_len);
290 dout(" need len %d\n", maxlen);
291 if (au->buf && au->buf->alloc_len < maxlen) {
292 ceph_buffer_put(au->buf);
293 au->buf = NULL;
294 }
295 if (!au->buf) {
296 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
297 if (!au->buf)
298 return -ENOMEM;
299 }
300 au->service = th->service;
301
302 msg_a = au->buf->vec.iov_base;
303 msg_a->struct_v = 1;
304 msg_a->global_id = cpu_to_le64(ac->global_id);
305 msg_a->service_id = cpu_to_le32(th->service);
306 msg_a->ticket_blob.struct_v = 1;
307 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
308 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
309 if (ticket_blob_len) {
310 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
311 th->ticket_blob->vec.iov_len);
312 }
313 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
314 le64_to_cpu(msg_a->ticket_blob.secret_id));
315
316 p = msg_a + 1;
317 p += ticket_blob_len;
318 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
319
320 get_random_bytes(&au->nonce, sizeof(au->nonce));
321 msg_b.struct_v = 1;
322 msg_b.nonce = cpu_to_le64(au->nonce);
323 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
324 p, end - p);
325 if (ret < 0)
326 goto out_buf;
327 p += ret;
328 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
329 dout(" built authorizer nonce %llx len %d\n", au->nonce,
330 (int)au->buf->vec.iov_len);
331 BUG_ON(au->buf->vec.iov_len > maxlen);
332 return 0;
333
334out_buf:
335 ceph_buffer_put(au->buf);
336 au->buf = NULL;
337 return ret;
338}
339
340static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
341 void **p, void *end)
342{
343 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
344 ceph_encode_8(p, 1);
345 ceph_encode_64(p, th->secret_id);
346 if (th->ticket_blob) {
347 const char *buf = th->ticket_blob->vec.iov_base;
348 u32 len = th->ticket_blob->vec.iov_len;
349
350 ceph_encode_32_safe(p, end, len, bad);
351 ceph_encode_copy_safe(p, end, buf, len, bad);
352 } else {
353 ceph_encode_32_safe(p, end, 0, bad);
354 }
355
356 return 0;
357bad:
358 return -ERANGE;
359}
360
361static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
362{
363 int want = ac->want_keys;
364 struct ceph_x_info *xi = ac->private;
365 int service;
366
367 *pneed = ac->want_keys & ~(xi->have_keys);
368
369 for (service = 1; service <= want; service <<= 1) {
370 struct ceph_x_ticket_handler *th;
371
372 if (!(ac->want_keys & service))
373 continue;
374
375 if (*pneed & service)
376 continue;
377
378 th = get_ticket_handler(ac, service);
379
380 if (IS_ERR(th)) {
381 *pneed |= service;
382 continue;
383 }
384
385 if (get_seconds() >= th->renew_after)
386 *pneed |= service;
387 if (get_seconds() >= th->expires)
388 xi->have_keys &= ~service;
389 }
390}
391
392
393static int ceph_x_build_request(struct ceph_auth_client *ac,
394 void *buf, void *end)
395{
396 struct ceph_x_info *xi = ac->private;
397 int need;
398 struct ceph_x_request_header *head = buf;
399 int ret;
400 struct ceph_x_ticket_handler *th =
401 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
402
403 if (IS_ERR(th))
404 return PTR_ERR(th);
405
406 ceph_x_validate_tickets(ac, &need);
407
408 dout("build_request want %x have %x need %x\n",
409 ac->want_keys, xi->have_keys, need);
410
411 if (need & CEPH_ENTITY_TYPE_AUTH) {
412 struct ceph_x_authenticate *auth = (void *)(head + 1);
413 void *p = auth + 1;
414 struct ceph_x_challenge_blob tmp;
415 char tmp_enc[40];
416 u64 *u;
417
418 if (p > end)
419 return -ERANGE;
420
421 dout(" get_auth_session_key\n");
422 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
423
424 /* encrypt and hash */
425 get_random_bytes(&auth->client_challenge, sizeof(u64));
426 tmp.client_challenge = auth->client_challenge;
427 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
428 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
429 tmp_enc, sizeof(tmp_enc));
430 if (ret < 0)
431 return ret;
432
433 auth->struct_v = 1;
434 auth->key = 0;
435 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
436 auth->key ^= *(__le64 *)u;
437 dout(" server_challenge %llx client_challenge %llx key %llx\n",
438 xi->server_challenge, le64_to_cpu(auth->client_challenge),
439 le64_to_cpu(auth->key));
440
441 /* now encode the old ticket if exists */
442 ret = ceph_x_encode_ticket(th, &p, end);
443 if (ret < 0)
444 return ret;
445
446 return p - buf;
447 }
448
449 if (need) {
450 void *p = head + 1;
451 struct ceph_x_service_ticket_request *req;
452
453 if (p > end)
454 return -ERANGE;
455 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
456
457 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
458 if (ret)
459 return ret;
460 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
461 xi->auth_authorizer.buf->vec.iov_len);
462
463 req = p;
464 req->keys = cpu_to_le32(need);
465 p += sizeof(*req);
466 return p - buf;
467 }
468
469 return 0;
470}
471
472static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
473 void *buf, void *end)
474{
475 struct ceph_x_info *xi = ac->private;
476 struct ceph_x_reply_header *head = buf;
477 struct ceph_x_ticket_handler *th;
478 int len = end - buf;
479 int op;
480 int ret;
481
482 if (result)
483 return result; /* XXX hmm? */
484
485 if (xi->starting) {
486 /* it's a hello */
487 struct ceph_x_server_challenge *sc = buf;
488
489 if (len != sizeof(*sc))
490 return -EINVAL;
491 xi->server_challenge = le64_to_cpu(sc->server_challenge);
492 dout("handle_reply got server challenge %llx\n",
493 xi->server_challenge);
494 xi->starting = false;
495 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
496 return -EAGAIN;
497 }
498
499 op = le16_to_cpu(head->op);
500 result = le32_to_cpu(head->result);
501 dout("handle_reply op %d result %d\n", op, result);
502 switch (op) {
503 case CEPHX_GET_AUTH_SESSION_KEY:
504 /* verify auth key */
505 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
506 buf + sizeof(*head), end);
507 break;
508
509 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
510 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
511 if (IS_ERR(th))
512 return PTR_ERR(th);
513 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
514 buf + sizeof(*head), end);
515 break;
516
517 default:
518 return -EINVAL;
519 }
520 if (ret)
521 return ret;
522 if (ac->want_keys == xi->have_keys)
523 return 0;
524 return -EAGAIN;
525}
526
527static int ceph_x_create_authorizer(
528 struct ceph_auth_client *ac, int peer_type,
529 struct ceph_authorizer **a,
530 void **buf, size_t *len,
531 void **reply_buf, size_t *reply_len)
532{
533 struct ceph_x_authorizer *au;
534 struct ceph_x_ticket_handler *th;
535 int ret;
536
537 th = get_ticket_handler(ac, peer_type);
538 if (IS_ERR(th))
539 return PTR_ERR(th);
540
541 au = kzalloc(sizeof(*au), GFP_NOFS);
542 if (!au)
543 return -ENOMEM;
544
545 ret = ceph_x_build_authorizer(ac, th, au);
546 if (ret) {
547 kfree(au);
548 return ret;
549 }
550
551 *a = (struct ceph_authorizer *)au;
552 *buf = au->buf->vec.iov_base;
553 *len = au->buf->vec.iov_len;
554 *reply_buf = au->reply_buf;
555 *reply_len = sizeof(au->reply_buf);
556 return 0;
557}
558
559static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
560 struct ceph_authorizer *a, size_t len)
561{
562 struct ceph_x_authorizer *au = (void *)a;
563 struct ceph_x_ticket_handler *th;
564 int ret = 0;
565 struct ceph_x_authorize_reply reply;
566 void *p = au->reply_buf;
567 void *end = p + sizeof(au->reply_buf);
568
569 th = get_ticket_handler(ac, au->service);
570 if (IS_ERR(th))
571 return PTR_ERR(th);
572 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
573 if (ret < 0)
574 return ret;
575 if (ret != sizeof(reply))
576 return -EPERM;
577
578 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
579 ret = -EPERM;
580 else
581 ret = 0;
582 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
583 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
584 return ret;
585}
586
587static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
588 struct ceph_authorizer *a)
589{
590 struct ceph_x_authorizer *au = (void *)a;
591
592 ceph_buffer_put(au->buf);
593 kfree(au);
594}
595
596
597static void ceph_x_reset(struct ceph_auth_client *ac)
598{
599 struct ceph_x_info *xi = ac->private;
600
601 dout("reset\n");
602 xi->starting = true;
603 xi->server_challenge = 0;
604}
605
606static void ceph_x_destroy(struct ceph_auth_client *ac)
607{
608 struct ceph_x_info *xi = ac->private;
609 struct rb_node *p;
610
611 dout("ceph_x_destroy %p\n", ac);
612 ceph_crypto_key_destroy(&xi->secret);
613
614 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
615 struct ceph_x_ticket_handler *th =
616 rb_entry(p, struct ceph_x_ticket_handler, node);
617 remove_ticket_handler(ac, th);
618 }
619
620 if (xi->auth_authorizer.buf)
621 ceph_buffer_put(xi->auth_authorizer.buf);
622
623 kfree(ac->private);
624 ac->private = NULL;
625}
626
/*
 * Drop the ticket handler for @peer_type, if one can be obtained.
 */
static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
					 int peer_type)
{
	struct ceph_x_ticket_handler *handler;

	handler = get_ticket_handler(ac, peer_type);
	if (IS_ERR(handler))
		return;
	remove_ticket_handler(ac, handler);
}
636
637
638static const struct ceph_auth_client_ops ceph_x_ops = {
639 .name = "x",
640 .is_authenticated = ceph_x_is_authenticated,
641 .should_authenticate = ceph_x_should_authenticate,
642 .build_request = ceph_x_build_request,
643 .handle_reply = ceph_x_handle_reply,
644 .create_authorizer = ceph_x_create_authorizer,
645 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
646 .destroy_authorizer = ceph_x_destroy_authorizer,
647 .invalidate_authorizer = ceph_x_invalidate_authorizer,
648 .reset = ceph_x_reset,
649 .destroy = ceph_x_destroy,
650};
651
652
653int ceph_x_init(struct ceph_auth_client *ac)
654{
655 struct ceph_x_info *xi;
656 int ret;
657
658 dout("ceph_x_init %p\n", ac);
659 ret = -ENOMEM;
660 xi = kzalloc(sizeof(*xi), GFP_NOFS);
661 if (!xi)
662 goto out;
663
664 ret = -EINVAL;
665 if (!ac->secret) {
666 pr_err("no secret set (for auth_x protocol)\n");
667 goto out_nomem;
668 }
669
670 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
671 if (ret)
672 goto out_nomem;
673
674 xi->starting = true;
675 xi->ticket_handlers = RB_ROOT;
676
677 ac->protocol = CEPH_AUTH_CEPHX;
678 ac->private = xi;
679 ac->ops = &ceph_x_ops;
680 return 0;
681
682out_nomem:
683 kfree(xi);
684out:
685 return ret;
686}
687
688
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
new file mode 100644
index 000000000000..e02da7a5c5a1
--- /dev/null
+++ b/net/ceph/auth_x.h
@@ -0,0 +1,50 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include <linux/ceph/auth.h>
7
8#include "crypto.h"
9#include "auth_x_protocol.h"
10
11/*
12 * Handle ticket for a single service.
13 */
14struct ceph_x_ticket_handler {
/* links the handler into ceph_x_info.ticket_handlers */
15 struct rb_node node;
/* presumably the entity type get_ticket_handler() looks handlers up by -- TODO confirm */
16 unsigned service;
17
/* key used to decrypt replies for this service (e.g. the authorizer reply) */
18 struct ceph_crypto_key session_key;
19 struct ceph_timespec validity;
20
21 u64 secret_id;
22 struct ceph_buffer *ticket_blob;
23
24 unsigned long renew_after, expires;
25};
26
27
/*
 * One authorizer.  The pointer to this struct is what callers see as a
 * struct ceph_authorizer (it is cast back and forth in auth_x.c).
 */
28struct ceph_x_authorizer {
/* encoded authorizer; its iov_base/iov_len are handed to the caller */
29 struct ceph_buffer *buf;
/* selects the ticket handler whose session key decrypts the reply */
30 unsigned service;
/* the peer's reply must carry nonce + 1 */
31 u64 nonce;
32 char reply_buf[128]; /* big enough for encrypted blob */
33};
34
/* Per auth-client cephx state, stored in ceph_auth_client.private. */
35struct ceph_x_info {
/* our key, unarmored from ac->secret in ceph_x_init() */
36 struct ceph_crypto_key secret;
37
/* true until the server's hello (challenge) has been received */
38 bool starting;
/* challenge taken from the server's hello; zeroed on reset */
39 u64 server_challenge;
40
/* compared against ac->want_keys to decide whether we are done */
41 unsigned have_keys;
/* tree of struct ceph_x_ticket_handler, linked through their node field */
42 struct rb_root ticket_handlers;
43
/* authorizer built when requesting service tickets; buffer freed in destroy */
44 struct ceph_x_authorizer auth_authorizer;
45};
46
47extern int ceph_x_init(struct ceph_auth_client *ac);
48
49#endif
50
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/net/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
/* Leads every request; op is one of the CEPHX_GET_* codes, little-endian. */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
/*
 * Leads every reply except the initial hello.  op echoes a CEPHX_GET_*
 * code and selects how the rest of the reply is processed.
 */
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
/* The hello payload must be exactly this struct or the reply is rejected. */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
/*
 * Client's answer to the server challenge.  On the wire the struct is
 * followed by the encoded (old) ticket, if any.
 */
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
/* Follows the authorizer in a CEPHX_GET_PRINCIPAL_SESSION_KEY request. */
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
/* set from the "need" mask of the request builder, little-endian */
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
/* Decrypted form of the peer's reply to an authorizer. */
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
/* must equal the authorizer's nonce + 1 for the reply to be accepted */
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
new file mode 100644
index 000000000000..53d8abfa25d5
--- /dev/null
+++ b/net/ceph/buffer.c
@@ -0,0 +1,68 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/module.h>
5#include <linux/slab.h>
6
7#include <linux/ceph/buffer.h>
8#include <linux/ceph/decode.h>
9
10struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
11{
12 struct ceph_buffer *b;
13
14 b = kmalloc(sizeof(*b), gfp);
15 if (!b)
16 return NULL;
17
18 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
19 if (b->vec.iov_base) {
20 b->is_vmalloc = false;
21 } else {
22 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
23 if (!b->vec.iov_base) {
24 kfree(b);
25 return NULL;
26 }
27 b->is_vmalloc = true;
28 }
29
30 kref_init(&b->kref);
31 b->alloc_len = len;
32 b->vec.iov_len = len;
33 dout("buffer_new %p\n", b);
34 return b;
35}
36EXPORT_SYMBOL(ceph_buffer_new);
37
38void ceph_buffer_release(struct kref *kref)
39{
40 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
41
42 dout("buffer_release %p\n", b);
43 if (b->vec.iov_base) {
44 if (b->is_vmalloc)
45 vfree(b->vec.iov_base);
46 else
47 kfree(b->vec.iov_base);
48 }
49 kfree(b);
50}
51EXPORT_SYMBOL(ceph_buffer_release);
52
53int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
54{
55 size_t len;
56
57 ceph_decode_need(p, end, sizeof(u32), bad);
58 len = ceph_decode_32(p);
59 dout("decode_buffer len %d\n", (int)len);
60 ceph_decode_need(p, end, len, bad);
61 *b = ceph_buffer_new(len, GFP_NOFS);
62 if (!*b)
63 return -ENOMEM;
64 ceph_decode_copy(p, (*b)->vec.iov_base, len);
65 return 0;
66bad:
67 return -EINVAL;
68}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 000000000000..f3e4a13fea0c
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
1
2#include <linux/ceph/ceph_debug.h>
3#include <linux/backing-dev.h>
4#include <linux/ctype.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/sched.h>
12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/statfs.h>
15#include <linux/string.h>
16
17
18#include <linux/ceph/libceph.h>
19#include <linux/ceph/debugfs.h>
20#include <linux/ceph/decode.h>
21#include <linux/ceph/mon_client.h>
22#include <linux/ceph/auth.h>
23
24
25
/*
 * Return a pointer to the last path component of the @len bytes at @s,
 * i.e. to the byte following the final '/' (/foo/bar/baz -> baz).
 * Without any '/' the result is @s; with a trailing '/' it is s + len.
 */
const char *ceph_file_part(const char *s, int len)
{
	int pos = len;

	/* walk back from the end until just past a '/' or at the start */
	while (pos != 0 && s[pos - 1] != '/')
		pos--;
	return s + pos;
}
37EXPORT_SYMBOL(ceph_file_part);
38
39const char *ceph_msg_type_name(int type)
40{
41 switch (type) {
42 case CEPH_MSG_SHUTDOWN: return "shutdown";
43 case CEPH_MSG_PING: return "ping";
44 case CEPH_MSG_AUTH: return "auth";
45 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
46 case CEPH_MSG_MON_MAP: return "mon_map";
47 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
48 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
49 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
50 case CEPH_MSG_STATFS: return "statfs";
51 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
52 case CEPH_MSG_MDS_MAP: return "mds_map";
53 case CEPH_MSG_CLIENT_SESSION: return "client_session";
54 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
55 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
56 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
57 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
58 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
59 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
60 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
61 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
62 case CEPH_MSG_OSD_MAP: return "osd_map";
63 case CEPH_MSG_OSD_OP: return "osd_op";
64 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
65 default: return "unknown";
66 }
67}
68EXPORT_SYMBOL(ceph_msg_type_name);
69
70/*
71 * Initially learn our fsid, or verify an fsid matches.
72 */
73int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
74{
75 if (client->have_fsid) {
76 if (ceph_fsid_compare(&client->fsid, fsid)) {
77 pr_err("bad fsid, had %pU got %pU",
78 &client->fsid, fsid);
79 return -1;
80 }
81 } else {
82 pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
83 memcpy(&client->fsid, fsid, sizeof(*fsid));
84 ceph_debugfs_client_init(client);
85 client->have_fsid = true;
86 }
87 return 0;
88}
89EXPORT_SYMBOL(ceph_check_fsid);
90
/*
 * strcmp() that tolerates NULL: two NULLs compare equal, a non-NULL
 * @s1 against a NULL @s2 yields -1, and the reverse yields 1.
 */
static int strcmp_null(const char *s1, const char *s2)
{
	if (s1 == NULL)
		return s2 == NULL ? 0 : 1;
	if (s2 == NULL)
		return -1;
	return strcmp(s1, s2);
}
101
102int ceph_compare_options(struct ceph_options *new_opt,
103 struct ceph_client *client)
104{
105 struct ceph_options *opt1 = new_opt;
106 struct ceph_options *opt2 = client->options;
107 int ofs = offsetof(struct ceph_options, mon_addr);
108 int i;
109 int ret;
110
111 ret = memcmp(opt1, opt2, ofs);
112 if (ret)
113 return ret;
114
115 ret = strcmp_null(opt1->name, opt2->name);
116 if (ret)
117 return ret;
118
119 ret = strcmp_null(opt1->secret, opt2->secret);
120 if (ret)
121 return ret;
122
123 /* any matching mon ip implies a match */
124 for (i = 0; i < opt1->num_mon; i++) {
125 if (ceph_monmap_contains(client->monc.monmap,
126 &opt1->mon_addr[i]))
127 return 0;
128 }
129 return -1;
130}
131EXPORT_SYMBOL(ceph_compare_options);
132
133
134static int parse_fsid(const char *str, struct ceph_fsid *fsid)
135{
136 int i = 0;
137 char tmp[3];
138 int err = -EINVAL;
139 int d;
140
141 dout("parse_fsid '%s'\n", str);
142 tmp[2] = 0;
143 while (*str && i < 16) {
144 if (ispunct(*str)) {
145 str++;
146 continue;
147 }
148 if (!isxdigit(str[0]) || !isxdigit(str[1]))
149 break;
150 tmp[0] = str[0];
151 tmp[1] = str[1];
152 if (sscanf(tmp, "%x", &d) < 1)
153 break;
154 fsid->fsid[i] = d & 0xff;
155 i++;
156 str += 2;
157 }
158
159 if (i == 16)
160 err = 0;
161 dout("parse_fsid ret %d got fsid %pU", err, fsid);
162 return err;
163}
164
165/*
166 * ceph options
167 */
/*
 * Token ids for opt_tokens.  The order matters: the option parser
 * treats ids below Opt_last_int as taking an integer argument and ids
 * between Opt_last_int and Opt_last_string as taking a string.
 */
168enum {
169 Opt_osdtimeout,
170 Opt_osdkeepalivetimeout,
171 Opt_mount_timeout,
172 Opt_osd_idle_ttl,
173 Opt_last_int,
174 /* int args above */
175 Opt_fsid,
176 Opt_name,
177 Opt_secret,
178 Opt_ip,
179 Opt_last_string,
180 /* string args above */
181 Opt_noshare,
182 Opt_nocrc,
183};
184
/*
 * Option patterns handed to match_token() by the option parser; the
 * {-1, NULL} entry terminates the table.
 */
185static match_table_t opt_tokens = {
186 {Opt_osdtimeout, "osdtimeout=%d"},
187 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
188 {Opt_mount_timeout, "mount_timeout=%d"},
189 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
190 /* int args above */
191 {Opt_fsid, "fsid=%s"},
192 {Opt_name, "name=%s"},
193 {Opt_secret, "secret=%s"},
194 {Opt_ip, "ip=%s"},
195 /* string args above */
196 {Opt_noshare, "noshare"},
197 {Opt_nocrc, "nocrc"},
198 {-1, NULL}
199};
200
201void ceph_destroy_options(struct ceph_options *opt)
202{
203 dout("destroy_options %p\n", opt);
204 kfree(opt->name);
205 kfree(opt->secret);
206 kfree(opt);
207}
208EXPORT_SYMBOL(ceph_destroy_options);
209
210int ceph_parse_options(struct ceph_options **popt, char *options,
211 const char *dev_name, const char *dev_name_end,
212 int (*parse_extra_token)(char *c, void *private),
213 void *private)
214{
215 struct ceph_options *opt;
216 const char *c;
217 int err = -ENOMEM;
218 substring_t argstr[MAX_OPT_ARGS];
219
220 opt = kzalloc(sizeof(*opt), GFP_KERNEL);
221 if (!opt)
222 return err;
223 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
224 GFP_KERNEL);
225 if (!opt->mon_addr)
226 goto out;
227
228 dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
229 dev_name);
230
231 /* start with defaults */
232 opt->flags = CEPH_OPT_DEFAULT;
233 opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
234 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
235 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
236 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
237
238 /* get mon ip(s) */
239 /* ip1[:port1][,ip2[:port2]...] */
240 err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
241 CEPH_MAX_MON, &opt->num_mon);
242 if (err < 0)
243 goto out;
244
245 /* parse mount options */
246 while ((c = strsep(&options, ",")) != NULL) {
247 int token, intval, ret;
248 if (!*c)
249 continue;
250 err = -EINVAL;
251 token = match_token((char *)c, opt_tokens, argstr);
252 if (token < 0 && parse_extra_token) {
253 /* extra? */
254 err = parse_extra_token((char *)c, private);
255 if (err < 0) {
256 pr_err("bad option at '%s'\n", c);
257 goto out;
258 }
259 continue;
260 }
261 if (token < Opt_last_int) {
262 ret = match_int(&argstr[0], &intval);
263 if (ret < 0) {
264 pr_err("bad mount option arg (not int) "
265 "at '%s'\n", c);
266 continue;
267 }
268 dout("got int token %d val %d\n", token, intval);
269 } else if (token > Opt_last_int && token < Opt_last_string) {
270 dout("got string token %d val %s\n", token,
271 argstr[0].from);
272 } else {
273 dout("got token %d\n", token);
274 }
275 switch (token) {
276 case Opt_ip:
277 err = ceph_parse_ips(argstr[0].from,
278 argstr[0].to,
279 &opt->my_addr,
280 1, NULL);
281 if (err < 0)
282 goto out;
283 opt->flags |= CEPH_OPT_MYIP;
284 break;
285
286 case Opt_fsid:
287 err = parse_fsid(argstr[0].from, &opt->fsid);
288 if (err == 0)
289 opt->flags |= CEPH_OPT_FSID;
290 break;
291 case Opt_name:
292 opt->name = kstrndup(argstr[0].from,
293 argstr[0].to-argstr[0].from,
294 GFP_KERNEL);
295 break;
296 case Opt_secret:
297 opt->secret = kstrndup(argstr[0].from,
298 argstr[0].to-argstr[0].from,
299 GFP_KERNEL);
300 break;
301
302 /* misc */
303 case Opt_osdtimeout:
304 opt->osd_timeout = intval;
305 break;
306 case Opt_osdkeepalivetimeout:
307 opt->osd_keepalive_timeout = intval;
308 break;
309 case Opt_osd_idle_ttl:
310 opt->osd_idle_ttl = intval;
311 break;
312 case Opt_mount_timeout:
313 opt->mount_timeout = intval;
314 break;
315
316 case Opt_noshare:
317 opt->flags |= CEPH_OPT_NOSHARE;
318 break;
319
320 case Opt_nocrc:
321 opt->flags |= CEPH_OPT_NOCRC;
322 break;
323
324 default:
325 BUG_ON(token);
326 }
327 }
328
329 /* success */
330 *popt = opt;
331 return 0;
332
333out:
334 ceph_destroy_options(opt);
335 return err;
336}
337EXPORT_SYMBOL(ceph_parse_options);
338
339u64 ceph_client_id(struct ceph_client *client)
340{
341 return client->monc.auth->global_id;
342}
343EXPORT_SYMBOL(ceph_client_id);
344
345/*
346 * create a fresh client instance
347 */
348struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
349{
350 struct ceph_client *client;
351 int err = -ENOMEM;
352
353 client = kzalloc(sizeof(*client), GFP_KERNEL);
354 if (client == NULL)
355 return ERR_PTR(-ENOMEM);
356
357 client->private = private;
358 client->options = opt;
359
360 mutex_init(&client->mount_mutex);
361 init_waitqueue_head(&client->auth_wq);
362 client->auth_err = 0;
363
364 client->extra_mon_dispatch = NULL;
365 client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
366 client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
367
368 client->msgr = NULL;
369
370 /* subsystems */
371 err = ceph_monc_init(&client->monc, client);
372 if (err < 0)
373 goto fail;
374 err = ceph_osdc_init(&client->osdc, client);
375 if (err < 0)
376 goto fail_monc;
377
378 return client;
379
380fail_monc:
381 ceph_monc_stop(&client->monc);
382fail:
383 kfree(client);
384 return ERR_PTR(err);
385}
386EXPORT_SYMBOL(ceph_create_client);
387
388void ceph_destroy_client(struct ceph_client *client)
389{
390 dout("destroy_client %p\n", client);
391
392 /* unmount */
393 ceph_osdc_stop(&client->osdc);
394
395 /*
396 * make sure mds and osd connections close out before destroying
397 * the auth module, which is needed to free those connections'
398 * ceph_authorizers.
399 */
400 ceph_msgr_flush();
401
402 ceph_monc_stop(&client->monc);
403
404 ceph_debugfs_client_cleanup(client);
405
406 if (client->msgr)
407 ceph_messenger_destroy(client->msgr);
408
409 ceph_destroy_options(client->options);
410
411 kfree(client);
412 dout("destroy_client %p done\n", client);
413}
414EXPORT_SYMBOL(ceph_destroy_client);
415
416/*
417 * true if we have the mon map (and have thus joined the cluster)
418 */
419static int have_mon_and_osd_map(struct ceph_client *client)
420{
421 return client->monc.monmap && client->monc.monmap->epoch &&
422 client->osdc.osdmap && client->osdc.osdmap->epoch;
423}
424
425/*
426 * mount: join the ceph cluster, and open root directory.
427 */
428int __ceph_open_session(struct ceph_client *client, unsigned long started)
429{
430 struct ceph_entity_addr *myaddr = NULL;
431 int err;
432 unsigned long timeout = client->options->mount_timeout * HZ;
433
434 /* initialize the messenger */
435 if (client->msgr == NULL) {
436 if (ceph_test_opt(client, MYIP))
437 myaddr = &client->options->my_addr;
438 client->msgr = ceph_messenger_create(myaddr,
439 client->supported_features,
440 client->required_features);
441 if (IS_ERR(client->msgr)) {
442 client->msgr = NULL;
443 return PTR_ERR(client->msgr);
444 }
445 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
446 }
447
448 /* open session, and wait for mon and osd maps */
449 err = ceph_monc_open_session(&client->monc);
450 if (err < 0)
451 return err;
452
453 while (!have_mon_and_osd_map(client)) {
454 err = -EIO;
455 if (timeout && time_after_eq(jiffies, started + timeout))
456 return err;
457
458 /* wait */
459 dout("mount waiting for mon_map\n");
460 err = wait_event_interruptible_timeout(client->auth_wq,
461 have_mon_and_osd_map(client) || (client->auth_err < 0),
462 timeout);
463 if (err == -EINTR || err == -ERESTARTSYS)
464 return err;
465 if (client->auth_err < 0)
466 return client->auth_err;
467 }
468
469 return 0;
470}
471EXPORT_SYMBOL(__ceph_open_session);
472
473
474int ceph_open_session(struct ceph_client *client)
475{
476 int ret;
477 unsigned long started = jiffies; /* note the start time */
478
479 dout("open_session start\n");
480 mutex_lock(&client->mount_mutex);
481
482 ret = __ceph_open_session(client, started);
483
484 mutex_unlock(&client->mount_mutex);
485 return ret;
486}
487EXPORT_SYMBOL(ceph_open_session);
488
489
490static int __init init_ceph_lib(void)
491{
492 int ret = 0;
493
494 ret = ceph_debugfs_init();
495 if (ret < 0)
496 goto out;
497
498 ret = ceph_msgr_init();
499 if (ret < 0)
500 goto out_debugfs;
501
502 pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
503 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
504 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
505 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
506
507 return 0;
508
509out_debugfs:
510 ceph_debugfs_cleanup();
511out:
512 return ret;
513}
514
515static void __exit exit_ceph_lib(void)
516{
517 dout("exit_ceph_lib\n");
518 ceph_msgr_exit();
519 ceph_debugfs_cleanup();
520}
521
522module_init(init_ceph_lib);
523module_exit(exit_ceph_lib);
524
525MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
526MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
527MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
528MODULE_DESCRIPTION("Ceph filesystem for Linux");
529MODULE_LICENSE("GPL");
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
new file mode 100644
index 000000000000..a3a3a31d3c37
--- /dev/null
+++ b/net/ceph/ceph_fs.c
@@ -0,0 +1,75 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include <linux/module.h>
5#include <linux/ceph/types.h>
6
7/*
8 * return true if @layout appears to be valid
9 */
10int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
11{
12 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
13 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
14 __u32 os = le32_to_cpu(layout->fl_object_size);
15
16 /* stripe unit, object size must be non-zero, 64k increment */
17 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
18 return 0;
19 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
20 return 0;
21 /* object size must be a multiple of stripe unit */
22 if (os < su || os % su)
23 return 0;
24 /* stripe count must be non-zero */
25 if (!sc)
26 return 0;
27 return 1;
28}
29
30
31int ceph_flags_to_mode(int flags)
32{
33 int mode;
34
35#ifdef O_DIRECTORY /* fixme */
36 if ((flags & O_DIRECTORY) == O_DIRECTORY)
37 return CEPH_FILE_MODE_PIN;
38#endif
39 if ((flags & O_APPEND) == O_APPEND)
40 flags |= O_WRONLY;
41
42 if ((flags & O_ACCMODE) == O_RDWR)
43 mode = CEPH_FILE_MODE_RDWR;
44 else if ((flags & O_ACCMODE) == O_WRONLY)
45 mode = CEPH_FILE_MODE_WR;
46 else
47 mode = CEPH_FILE_MODE_RD;
48
49#ifdef O_LAZY
50 if (flags & O_LAZY)
51 mode |= CEPH_FILE_MODE_LAZY;
52#endif
53
54 return mode;
55}
56EXPORT_SYMBOL(ceph_flags_to_mode);
57
58int ceph_caps_for_mode(int mode)
59{
60 int caps = CEPH_CAP_PIN;
61
62 if (mode & CEPH_FILE_MODE_RD)
63 caps |= CEPH_CAP_FILE_SHARED |
64 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
65 if (mode & CEPH_FILE_MODE_WR)
66 caps |= CEPH_CAP_FILE_EXCL |
67 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
68 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
69 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
70 if (mode & CEPH_FILE_MODE_LAZY)
71 caps |= CEPH_CAP_FILE_LAZYIO;
72
73 return caps;
74}
75EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
new file mode 100644
index 000000000000..815ef8826796
--- /dev/null
+++ b/net/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include <linux/ceph/types.h>
3
4/*
5 * Robert Jenkin's hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
/*
 * mix(a, b, c): scramble the three state words in place with nine
 * subtract/subtract/xor-shift steps.  Every argument is both read and
 * assigned many times, so pass plain variables only (no expressions
 * with side effects).
 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
/*
 * linux dcache hash: for each byte c of the @length bytes at @str,
 * hash = (hash + (c << 4) + (c >> 4)) * 11, starting from 0.
 */
unsigned ceph_str_hash_linux(const char *str, unsigned length)
{
	unsigned long hash = 0;
	unsigned i;

	for (i = 0; i < length; i++) {
		unsigned char c = str[i];

		hash = (hash + (c << 4) + (c >> 4)) * 11;
	}
	return hash;
}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 000000000000..3fbda04de29c
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
1/*
2 * Ceph string constants
3 */
4#include <linux/module.h>
5#include <linux/ceph/types.h>
6
7const char *ceph_entity_type_name(int type)
8{
9 switch (type) {
10 case CEPH_ENTITY_TYPE_MDS: return "mds";
11 case CEPH_ENTITY_TYPE_OSD: return "osd";
12 case CEPH_ENTITY_TYPE_MON: return "mon";
13 case CEPH_ENTITY_TYPE_CLIENT: return "client";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32 case CEPH_OSD_OP_ROLLBACK: return "rollback";
33
34 case CEPH_OSD_OP_APPEND: return "append";
35 case CEPH_OSD_OP_STARTSYNC: return "startsync";
36 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
37 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
38
39 case CEPH_OSD_OP_TMAPUP: return "tmapup";
40 case CEPH_OSD_OP_TMAPGET: return "tmapget";
41 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
42
43 case CEPH_OSD_OP_GETXATTR: return "getxattr";
44 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
45 case CEPH_OSD_OP_SETXATTR: return "setxattr";
46 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
47 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
48 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
49 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
50
51 case CEPH_OSD_OP_PULL: return "pull";
52 case CEPH_OSD_OP_PUSH: return "push";
53 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
54 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
55 case CEPH_OSD_OP_SCRUB: return "scrub";
56
57 case CEPH_OSD_OP_WRLOCK: return "wrlock";
58 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
59 case CEPH_OSD_OP_RDLOCK: return "rdlock";
60 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
61 case CEPH_OSD_OP_UPLOCK: return "uplock";
62 case CEPH_OSD_OP_DNLOCK: return "dnlock";
63
64 case CEPH_OSD_OP_CALL: return "call";
65
66 case CEPH_OSD_OP_PGLS: return "pgls";
67 }
68 return "???";
69}
70
71
72const char *ceph_pool_op_name(int op)
73{
74 switch (op) {
75 case POOL_OP_CREATE: return "create";
76 case POOL_OP_DELETE: return "delete";
77 case POOL_OP_AUID_CHANGE: return "auid change";
78 case POOL_OP_CREATE_SNAP: return "create snap";
79 case POOL_OP_DELETE_SNAP: return "delete snap";
80 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
81 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
82 }
83 return "???";
84}
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
new file mode 100644
index 000000000000..d6ebb13a18a4
--- /dev/null
+++ b/net/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include <linux/crush/crush.h>
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
new file mode 100644
index 000000000000..5bb63e37a8a1
--- /dev/null
+++ b/net/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include <linux/crush/hash.h>
4
/*
 * Robert Jenkins' function for mixing 32-bit values
 * http://burtleburtle.net/bob/hash/evahash.html
 * a, b = random bits, c = input and output
 *
 * All three arguments are modified in place, so each must be a plain
 * lvalue.
 */
#define crush_hashmix(a, b, c) do {			\
		a -= b; a -= c; a ^= (c >> 13);		\
		b -= c; b -= a; b ^= (a << 8);		\
		c -= a; c -= b; c ^= (b >> 13);		\
		a -= b; a -= c; a ^= (c >> 12);		\
		b -= c; b -= a; b ^= (a << 16);		\
		c -= a; c -= b; c ^= (b >> 5);		\
		a -= b; a -= c; a ^= (c >> 3);		\
		b -= c; b -= a; b ^= (a << 10);		\
		c -= a; c -= b; c ^= (b >> 15);		\
	} while (0)

/* initial value xor-ed with the inputs by every rjenkins1 variant */
#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
new file mode 100644
index 000000000000..42599e31dcad
--- /dev/null
+++ b/net/ceph/crush/mapper.c
@@ -0,0 +1,609 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include <linux/crush/crush.h>
22#include <linux/crush/hash.h>
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
/*
 * Height of tree node @n: the number of trailing zero bits in its
 * label (leaves carry odd labels and have height 0).  @n must be
 * non-zero, otherwise the loop never terminates.
 */
static int height(int n)
{
	int h;

	for (h = 0; (n & 1) == 0; h++)
		n >>= 1;
	return h;
}
170
/* Label of the left child of interior node @x (height must be >= 1). */
static int left(int x)
{
	int step = 1 << (height(x) - 1);

	return x - step;
}
176
/* Label of the right child of interior node @x (height must be >= 1). */
static int right(int x)
{
	int step = 1 << (height(x) - 1);

	return x + step;
}
182
/* Non-zero when @x is a leaf, i.e. its label is odd. */
static int terminal(int x)
{
	return (x & 0x1);
}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" (failed, fully offloaded)
263 * of the cluster
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choose an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308
309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
310 bucket->id, x, outpos, numrep);
311
312 for (rep = outpos; rep < numrep; rep++) {
313 /* keep trying until we get a non-out, non-colliding item */
314 ftotal = 0;
315 skip_rep = 0;
316 do {
317 retry_descent = 0;
318 in = bucket; /* initial bucket */
319
320 /* choose through intervening buckets */
321 flocal = 0;
322 do {
323 collide = 0;
324 retry_bucket = 0;
325 r = rep;
326 if (in->alg == CRUSH_BUCKET_UNIFORM) {
327 /* be careful */
328 if (firstn || numrep >= in->size)
329 /* r' = r + f_total */
330 r += ftotal;
331 else if (in->size % numrep == 0)
332 /* r'=r+(n+1)*f_local */
333 r += (numrep+1) *
334 (flocal+ftotal);
335 else
336 /* r' = r + n*f_local */
337 r += numrep * (flocal+ftotal);
338 } else {
339 if (firstn)
340 /* r' = r + f_total */
341 r += ftotal;
342 else
343 /* r' = r + n*f_local */
344 r += numrep * (flocal+ftotal);
345 }
346
347 /* bucket choose */
348 if (in->size == 0) {
349 reject = 1;
350 goto reject;
351 }
352 if (flocal >= (in->size>>1) &&
353 flocal > orig_tries)
354 item = bucket_perm_choose(in, x, r);
355 else
356 item = crush_bucket_choose(in, x, r);
357 BUG_ON(item >= map->max_devices);
358
359 /* desired type? */
360 if (item < 0)
361 itemtype = map->buckets[-1-item]->type;
362 else
363 itemtype = 0;
364 dprintk(" item %d type %d\n", item, itemtype);
365
366 /* keep going? */
367 if (itemtype != type) {
368 BUG_ON(item >= 0 ||
369 (-1-item) >= map->max_buckets);
370 in = map->buckets[-1-item];
371 retry_bucket = 1;
372 continue;
373 }
374
375 /* collision? */
376 for (i = 0; i < outpos; i++) {
377 if (out[i] == item) {
378 collide = 1;
379 break;
380 }
381 }
382
383 reject = 0;
384 if (recurse_to_leaf) {
385 if (item < 0) {
386 if (crush_choose(map,
387 map->buckets[-1-item],
388 weight,
389 x, outpos+1, 0,
390 out2, outpos,
391 firstn, 0,
392 NULL) <= outpos)
393 /* didn't get leaf */
394 reject = 1;
395 } else {
396 /* we already have a leaf! */
397 out2[outpos] = item;
398 }
399 }
400
401 if (!reject) {
402 /* out? */
403 if (itemtype == 0)
404 reject = is_out(map, weight,
405 item, x);
406 else
407 reject = 0;
408 }
409
410reject:
411 if (reject || collide) {
412 ftotal++;
413 flocal++;
414
415 if (collide && flocal < 3)
416 /* retry locally a few times */
417 retry_bucket = 1;
418 else if (flocal < in->size + orig_tries)
419 /* exhaustive bucket search */
420 retry_bucket = 1;
421 else if (ftotal < 20)
422 /* then retry descent */
423 retry_descent = 1;
424 else
425 /* else give up */
426 skip_rep = 1;
427 dprintk(" reject %d collide %d "
428 "ftotal %d flocal %d\n",
429 reject, collide, ftotal,
430 flocal);
431 }
432 } while (retry_bucket);
433 } while (retry_descent);
434
435 if (skip_rep) {
436 dprintk("skip rep\n");
437 continue;
438 }
439
440 dprintk("CHOOSE got %d\n", item);
441 out[outpos] = item;
442 outpos++;
443 }
444
445 dprintk("CHOOSE returns %d\n", outpos);
446 return outpos;
447}
448
449
450/**
451 * crush_do_rule - calculate a mapping with the given input and rule
452 * @map: the crush_map
453 * @ruleno: the rule id
454 * @x: hash input
455 * @result: pointer to result vector
456 * @result_max: maximum result size
457 * @force: force initial replica choice; -1 for none
458 */
459int crush_do_rule(struct crush_map *map,
460 int ruleno, int x, int *result, int result_max,
461 int force, __u32 *weight)
462{
463 int result_len;
464 int force_context[CRUSH_MAX_DEPTH];
465 int force_pos = -1;
466 int a[CRUSH_MAX_SET];
467 int b[CRUSH_MAX_SET];
468 int c[CRUSH_MAX_SET];
469 int recurse_to_leaf;
470 int *w;
471 int wsize = 0;
472 int *o;
473 int osize;
474 int *tmp;
475 struct crush_rule *rule;
476 int step;
477 int i, j;
478 int numrep;
479 int firstn;
480 int rc = -1;
481
482 BUG_ON(ruleno >= map->max_rules);
483
484 rule = map->rules[ruleno];
485 result_len = 0;
486 w = a;
487 o = b;
488
489 /*
490 * determine hierarchical context of force, if any. note
491 * that this may or may not correspond to the specific types
492 * referenced by the crush rule.
493 */
494 if (force >= 0) {
495 if (force >= map->max_devices ||
496 map->device_parents[force] == 0) {
497 /*dprintk("CRUSH: forcefed device dne\n");*/
498 rc = -1; /* force fed device dne */
499 goto out;
500 }
501 if (!is_out(map, weight, force, x)) {
502 while (1) {
503 force_context[++force_pos] = force;
504 if (force >= 0)
505 force = map->device_parents[force];
506 else
507 force = map->bucket_parents[-1-force];
508 if (force == 0)
509 break;
510 }
511 }
512 }
513
514 for (step = 0; step < rule->len; step++) {
515 firstn = 0;
516 switch (rule->steps[step].op) {
517 case CRUSH_RULE_TAKE:
518 w[0] = rule->steps[step].arg1;
519 if (force_pos >= 0) {
520 BUG_ON(force_context[force_pos] != w[0]);
521 force_pos--;
522 }
523 wsize = 1;
524 break;
525
526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
527 case CRUSH_RULE_CHOOSE_FIRSTN:
528 firstn = 1;
529 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
530 case CRUSH_RULE_CHOOSE_INDEP:
531 BUG_ON(wsize == 0);
532
533 recurse_to_leaf =
534 rule->steps[step].op ==
535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
536 rule->steps[step].op ==
537 CRUSH_RULE_CHOOSE_LEAF_INDEP;
538
539 /* reset output */
540 osize = 0;
541
542 for (i = 0; i < wsize; i++) {
543 /*
544 * see CRUSH_N, CRUSH_N_MINUS macros.
545 * basically, numrep <= 0 means relative to
546 * the provided result_max
547 */
548 numrep = rule->steps[step].arg1;
549 if (numrep <= 0) {
550 numrep += result_max;
551 if (numrep <= 0)
552 continue;
553 }
554 j = 0;
555 if (osize == 0 && force_pos >= 0) {
556 /* skip any intermediate types */
557 while (force_pos &&
558 force_context[force_pos] < 0 &&
559 rule->steps[step].arg2 !=
560 map->buckets[-1 -
561 force_context[force_pos]]->type)
562 force_pos--;
563 o[osize] = force_context[force_pos];
564 if (recurse_to_leaf)
565 c[osize] = force_context[0];
566 j++;
567 force_pos--;
568 }
569 osize += crush_choose(map,
570 map->buckets[-1-w[i]],
571 weight,
572 x, numrep,
573 rule->steps[step].arg2,
574 o+osize, j,
575 firstn,
576 recurse_to_leaf, c+osize);
577 }
578
579 if (recurse_to_leaf)
580 /* copy final _leaf_ values to output set */
581 memcpy(o, c, osize*sizeof(*o));
582
583 /* swap t and w arrays */
584 tmp = o;
585 o = w;
586 w = tmp;
587 wsize = osize;
588 break;
589
590
591 case CRUSH_RULE_EMIT:
592 for (i = 0; i < wsize && result_len < result_max; i++) {
593 result[result_len] = w[i];
594 result_len++;
595 }
596 wsize = 0;
597 break;
598
599 default:
600 BUG_ON(1);
601 }
602 }
603 rc = result_len;
604
605out:
606 return rc;
607}
608
609
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
new file mode 100644
index 000000000000..7b505b0c983f
--- /dev/null
+++ b/net/ceph/crypto.c
@@ -0,0 +1,412 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include <linux/ceph/decode.h>
10#include "crypto.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
79
80static int ceph_aes_encrypt(const void *key, int key_len,
81 void *dst, size_t *dst_len,
82 const void *src, size_t src_len)
83{
84 struct scatterlist sg_in[2], sg_out[1];
85 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
86 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
87 int ret;
88 void *iv;
89 int ivsize;
90 size_t zero_padding = (0x10 - (src_len & 0x0f));
91 char pad[16];
92
93 if (IS_ERR(tfm))
94 return PTR_ERR(tfm);
95
96 memset(pad, zero_padding, zero_padding);
97
98 *dst_len = src_len + zero_padding;
99
100 crypto_blkcipher_setkey((void *)tfm, key, key_len);
101 sg_init_table(sg_in, 2);
102 sg_set_buf(&sg_in[0], src, src_len);
103 sg_set_buf(&sg_in[1], pad, zero_padding);
104 sg_init_table(sg_out, 1);
105 sg_set_buf(sg_out, dst, *dst_len);
106 iv = crypto_blkcipher_crt(tfm)->iv;
107 ivsize = crypto_blkcipher_ivsize(tfm);
108
109 memcpy(iv, aes_iv, ivsize);
110 /*
111 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
112 key, key_len, 1);
113 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
114 src, src_len, 1);
115 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
116 pad, zero_padding, 1);
117 */
118 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
119 src_len + zero_padding);
120 crypto_free_blkcipher(tfm);
121 if (ret < 0)
122 pr_err("ceph_aes_crypt failed %d\n", ret);
123 /*
124 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
125 dst, *dst_len, 1);
126 */
127 return 0;
128}
129
130static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
131 size_t *dst_len,
132 const void *src1, size_t src1_len,
133 const void *src2, size_t src2_len)
134{
135 struct scatterlist sg_in[3], sg_out[1];
136 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
137 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
138 int ret;
139 void *iv;
140 int ivsize;
141 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
142 char pad[16];
143
144 if (IS_ERR(tfm))
145 return PTR_ERR(tfm);
146
147 memset(pad, zero_padding, zero_padding);
148
149 *dst_len = src1_len + src2_len + zero_padding;
150
151 crypto_blkcipher_setkey((void *)tfm, key, key_len);
152 sg_init_table(sg_in, 3);
153 sg_set_buf(&sg_in[0], src1, src1_len);
154 sg_set_buf(&sg_in[1], src2, src2_len);
155 sg_set_buf(&sg_in[2], pad, zero_padding);
156 sg_init_table(sg_out, 1);
157 sg_set_buf(sg_out, dst, *dst_len);
158 iv = crypto_blkcipher_crt(tfm)->iv;
159 ivsize = crypto_blkcipher_ivsize(tfm);
160
161 memcpy(iv, aes_iv, ivsize);
162 /*
163 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
164 key, key_len, 1);
165 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
166 src1, src1_len, 1);
167 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
168 src2, src2_len, 1);
169 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
170 pad, zero_padding, 1);
171 */
172 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
173 src1_len + src2_len + zero_padding);
174 crypto_free_blkcipher(tfm);
175 if (ret < 0)
176 pr_err("ceph_aes_crypt2 failed %d\n", ret);
177 /*
178 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
179 dst, *dst_len, 1);
180 */
181 return 0;
182}
183
184static int ceph_aes_decrypt(const void *key, int key_len,
185 void *dst, size_t *dst_len,
186 const void *src, size_t src_len)
187{
188 struct scatterlist sg_in[1], sg_out[2];
189 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
190 struct blkcipher_desc desc = { .tfm = tfm };
191 char pad[16];
192 void *iv;
193 int ivsize;
194 int ret;
195 int last_byte;
196
197 if (IS_ERR(tfm))
198 return PTR_ERR(tfm);
199
200 crypto_blkcipher_setkey((void *)tfm, key, key_len);
201 sg_init_table(sg_in, 1);
202 sg_init_table(sg_out, 2);
203 sg_set_buf(sg_in, src, src_len);
204 sg_set_buf(&sg_out[0], dst, *dst_len);
205 sg_set_buf(&sg_out[1], pad, sizeof(pad));
206
207 iv = crypto_blkcipher_crt(tfm)->iv;
208 ivsize = crypto_blkcipher_ivsize(tfm);
209
210 memcpy(iv, aes_iv, ivsize);
211
212 /*
213 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
214 key, key_len, 1);
215 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
216 src, src_len, 1);
217 */
218
219 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
220 crypto_free_blkcipher(tfm);
221 if (ret < 0) {
222 pr_err("ceph_aes_decrypt failed %d\n", ret);
223 return ret;
224 }
225
226 if (src_len <= *dst_len)
227 last_byte = ((char *)dst)[src_len - 1];
228 else
229 last_byte = pad[src_len - *dst_len - 1];
230 if (last_byte <= 16 && src_len >= last_byte) {
231 *dst_len = src_len - last_byte;
232 } else {
233 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
234 last_byte, (int)src_len);
235 return -EPERM; /* bad padding */
236 }
237 /*
238 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
239 dst, *dst_len, 1);
240 */
241 return 0;
242}
243
244static int ceph_aes_decrypt2(const void *key, int key_len,
245 void *dst1, size_t *dst1_len,
246 void *dst2, size_t *dst2_len,
247 const void *src, size_t src_len)
248{
249 struct scatterlist sg_in[1], sg_out[3];
250 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
251 struct blkcipher_desc desc = { .tfm = tfm };
252 char pad[16];
253 void *iv;
254 int ivsize;
255 int ret;
256 int last_byte;
257
258 if (IS_ERR(tfm))
259 return PTR_ERR(tfm);
260
261 sg_init_table(sg_in, 1);
262 sg_set_buf(sg_in, src, src_len);
263 sg_init_table(sg_out, 3);
264 sg_set_buf(&sg_out[0], dst1, *dst1_len);
265 sg_set_buf(&sg_out[1], dst2, *dst2_len);
266 sg_set_buf(&sg_out[2], pad, sizeof(pad));
267
268 crypto_blkcipher_setkey((void *)tfm, key, key_len);
269 iv = crypto_blkcipher_crt(tfm)->iv;
270 ivsize = crypto_blkcipher_ivsize(tfm);
271
272 memcpy(iv, aes_iv, ivsize);
273
274 /*
275 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
276 key, key_len, 1);
277 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
278 src, src_len, 1);
279 */
280
281 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
282 crypto_free_blkcipher(tfm);
283 if (ret < 0) {
284 pr_err("ceph_aes_decrypt failed %d\n", ret);
285 return ret;
286 }
287
288 if (src_len <= *dst1_len)
289 last_byte = ((char *)dst1)[src_len - 1];
290 else if (src_len <= *dst1_len + *dst2_len)
291 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
292 else
293 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
294 if (last_byte <= 16 && src_len >= last_byte) {
295 src_len -= last_byte;
296 } else {
297 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
298 last_byte, (int)src_len);
299 return -EPERM; /* bad padding */
300 }
301
302 if (src_len < *dst1_len) {
303 *dst1_len = src_len;
304 *dst2_len = 0;
305 } else {
306 *dst2_len = src_len - *dst1_len;
307 }
308 /*
309 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
310 dst1, *dst1_len, 1);
311 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
312 dst2, *dst2_len, 1);
313 */
314
315 return 0;
316}
317
318
319int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
320 const void *src, size_t src_len)
321{
322 switch (secret->type) {
323 case CEPH_CRYPTO_NONE:
324 if (*dst_len < src_len)
325 return -ERANGE;
326 memcpy(dst, src, src_len);
327 *dst_len = src_len;
328 return 0;
329
330 case CEPH_CRYPTO_AES:
331 return ceph_aes_decrypt(secret->key, secret->len, dst,
332 dst_len, src, src_len);
333
334 default:
335 return -EINVAL;
336 }
337}
338
339int ceph_decrypt2(struct ceph_crypto_key *secret,
340 void *dst1, size_t *dst1_len,
341 void *dst2, size_t *dst2_len,
342 const void *src, size_t src_len)
343{
344 size_t t;
345
346 switch (secret->type) {
347 case CEPH_CRYPTO_NONE:
348 if (*dst1_len + *dst2_len < src_len)
349 return -ERANGE;
350 t = min(*dst1_len, src_len);
351 memcpy(dst1, src, t);
352 *dst1_len = t;
353 src += t;
354 src_len -= t;
355 if (src_len) {
356 t = min(*dst2_len, src_len);
357 memcpy(dst2, src, t);
358 *dst2_len = t;
359 }
360 return 0;
361
362 case CEPH_CRYPTO_AES:
363 return ceph_aes_decrypt2(secret->key, secret->len,
364 dst1, dst1_len, dst2, dst2_len,
365 src, src_len);
366
367 default:
368 return -EINVAL;
369 }
370}
371
372int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
373 const void *src, size_t src_len)
374{
375 switch (secret->type) {
376 case CEPH_CRYPTO_NONE:
377 if (*dst_len < src_len)
378 return -ERANGE;
379 memcpy(dst, src, src_len);
380 *dst_len = src_len;
381 return 0;
382
383 case CEPH_CRYPTO_AES:
384 return ceph_aes_encrypt(secret->key, secret->len, dst,
385 dst_len, src, src_len);
386
387 default:
388 return -EINVAL;
389 }
390}
391
392int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
393 const void *src1, size_t src1_len,
394 const void *src2, size_t src2_len)
395{
396 switch (secret->type) {
397 case CEPH_CRYPTO_NONE:
398 if (*dst_len < src1_len + src2_len)
399 return -ERANGE;
400 memcpy(dst, src1, src1_len);
401 memcpy(dst + src1_len, src2, src2_len);
402 *dst_len = src1_len + src2_len;
403 return 0;
404
405 case CEPH_CRYPTO_AES:
406 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
407 src1, src1_len, src2, src2_len);
408
409 default:
410 return -EINVAL;
411 }
412}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
new file mode 100644
index 000000000000..f9eccace592b
--- /dev/null
+++ b/net/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include <linux/ceph/types.h>
5#include <linux/ceph/buffer.h>
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type; /* cipher selector; the encrypt/decrypt entry points switch on CEPH_CRYPTO_NONE / CEPH_CRYPTO_AES */
12 struct ceph_timespec created; /* NOTE(review): presumably the key's creation time -- confirm against the encode/decode helpers */
13 int len; /* handed to the AES helpers together with ->key; presumably the key size in bytes -- confirm */
14 void *key; /* key material; released with kfree() by ceph_crypto_key_destroy() */
15};
16
/*
 * Release the key material attached to @key.  Only key->key is freed;
 * the ceph_crypto_key structure itself is not, and key->key is not
 * reset afterwards.
 */
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const char *src, const char *end);
46extern int ceph_unarmor(char *dst, const char *src, const char *end);
47
48#endif
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 000000000000..27d4ea315d12
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/mon_client.h>
12#include <linux/ceph/auth.h>
13#include <linux/ceph/debugfs.h>
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../monmap - current monmap
23 * .../osdc - active osd requests
24 * .../monc - mon client state
25 * .../dentry_lru - dump contents of dentry lru
26 * .../caps - expose cap (reservation) stats
27 * .../bdi - symlink to ../../bdi/something
28 */
29
30static struct dentry *ceph_debugfs_dir;
31
32static int monmap_show(struct seq_file *s, void *p)
33{
34 int i;
35 struct ceph_client *client = s->private;
36
37 if (client->monc.monmap == NULL)
38 return 0;
39
40 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
41 for (i = 0; i < client->monc.monmap->num_mon; i++) {
42 struct ceph_entity_inst *inst =
43 &client->monc.monmap->mon_inst[i];
44
45 seq_printf(s, "\t%s%lld\t%s\n",
46 ENTITY_NAME(inst->name),
47 ceph_pr_addr(&inst->addr.in_addr));
48 }
49 return 0;
50}
51
52static int osdmap_show(struct seq_file *s, void *p)
53{
54 int i;
55 struct ceph_client *client = s->private;
56 struct rb_node *n;
57
58 if (client->osdc.osdmap == NULL)
59 return 0;
60 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
61 seq_printf(s, "flags%s%s\n",
62 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
63 " NEARFULL" : "",
64 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
65 " FULL" : "");
66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
67 struct ceph_pg_pool_info *pool =
68 rb_entry(n, struct ceph_pg_pool_info, node);
69 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
70 pool->id, pool->v.pg_num, pool->pg_num_mask,
71 pool->v.lpg_num, pool->lpg_num_mask);
72 }
73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
74 struct ceph_entity_addr *addr =
75 &client->osdc.osdmap->osd_addr[i];
76 int state = client->osdc.osdmap->osd_state[i];
77 char sb[64];
78
79 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
80 i, ceph_pr_addr(&addr->in_addr),
81 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
82 ceph_osdmap_state_str(sb, sizeof(sb), state));
83 }
84 return 0;
85}
86
87static int monc_show(struct seq_file *s, void *p)
88{
89 struct ceph_client *client = s->private;
90 struct ceph_mon_generic_request *req;
91 struct ceph_mon_client *monc = &client->monc;
92 struct rb_node *rp;
93
94 mutex_lock(&monc->mutex);
95
96 if (monc->have_mdsmap)
97 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
98 if (monc->have_osdmap)
99 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
100 if (monc->want_next_osdmap)
101 seq_printf(s, "want next osdmap\n");
102
103 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
104 __u16 op;
105 req = rb_entry(rp, struct ceph_mon_generic_request, node);
106 op = le16_to_cpu(req->request->hdr.type);
107 if (op == CEPH_MSG_STATFS)
108 seq_printf(s, "%lld statfs\n", req->tid);
109 else
110 seq_printf(s, "%lld unknown\n", req->tid);
111 }
112
113 mutex_unlock(&monc->mutex);
114 return 0;
115}
116
117static int osdc_show(struct seq_file *s, void *pp)
118{
119 struct ceph_client *client = s->private;
120 struct ceph_osd_client *osdc = &client->osdc;
121 struct rb_node *p;
122
123 mutex_lock(&osdc->request_mutex);
124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
125 struct ceph_osd_request *req;
126 struct ceph_osd_request_head *head;
127 struct ceph_osd_op *op;
128 int num_ops;
129 int opcode, olen;
130 int i;
131
132 req = rb_entry(p, struct ceph_osd_request, r_node);
133
134 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
135 req->r_osd ? req->r_osd->o_osd : -1,
136 le32_to_cpu(req->r_pgid.pool),
137 le16_to_cpu(req->r_pgid.ps));
138
139 head = req->r_request->front.iov_base;
140 op = (void *)(head + 1);
141
142 num_ops = le16_to_cpu(head->num_ops);
143 olen = le32_to_cpu(head->object_len);
144 seq_printf(s, "%.*s", olen,
145 (const char *)(head->ops + num_ops));
146
147 if (req->r_reassert_version.epoch)
148 seq_printf(s, "\t%u'%llu",
149 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
150 le64_to_cpu(req->r_reassert_version.version));
151 else
152 seq_printf(s, "\t");
153
154 for (i = 0; i < num_ops; i++) {
155 opcode = le16_to_cpu(op->op);
156 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
157 op++;
158 }
159
160 seq_printf(s, "\n");
161 }
162 mutex_unlock(&osdc->request_mutex);
163 return 0;
164}
165
166CEPH_DEFINE_SHOW_FUNC(monmap_show)
167CEPH_DEFINE_SHOW_FUNC(osdmap_show)
168CEPH_DEFINE_SHOW_FUNC(monc_show)
169CEPH_DEFINE_SHOW_FUNC(osdc_show)
170
171int ceph_debugfs_init(void)
172{
173 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
174 if (!ceph_debugfs_dir)
175 return -ENOMEM;
176 return 0;
177}
178
179void ceph_debugfs_cleanup(void)
180{
181 debugfs_remove(ceph_debugfs_dir);
182}
183
184int ceph_debugfs_client_init(struct ceph_client *client)
185{
186 int ret = -ENOMEM;
187 char name[80];
188
189 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
190 client->monc.auth->global_id);
191
192 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
193 if (!client->debugfs_dir)
194 goto out;
195
196 client->monc.debugfs_file = debugfs_create_file("monc",
197 0600,
198 client->debugfs_dir,
199 client,
200 &monc_show_fops);
201 if (!client->monc.debugfs_file)
202 goto out;
203
204 client->osdc.debugfs_file = debugfs_create_file("osdc",
205 0600,
206 client->debugfs_dir,
207 client,
208 &osdc_show_fops);
209 if (!client->osdc.debugfs_file)
210 goto out;
211
212 client->debugfs_monmap = debugfs_create_file("monmap",
213 0600,
214 client->debugfs_dir,
215 client,
216 &monmap_show_fops);
217 if (!client->debugfs_monmap)
218 goto out;
219
220 client->debugfs_osdmap = debugfs_create_file("osdmap",
221 0600,
222 client->debugfs_dir,
223 client,
224 &osdmap_show_fops);
225 if (!client->debugfs_osdmap)
226 goto out;
227
228 return 0;
229
230out:
231 ceph_debugfs_client_cleanup(client);
232 return ret;
233}
234
235void ceph_debugfs_client_cleanup(struct ceph_client *client)
236{
237 debugfs_remove(client->debugfs_osdmap);
238 debugfs_remove(client->debugfs_monmap);
239 debugfs_remove(client->osdc.debugfs_file);
240 debugfs_remove(client->monc.debugfs_file);
241 debugfs_remove(client->debugfs_dir);
242}
243
244#else /* CONFIG_DEBUG_FS */
245
246int ceph_debugfs_init(void)
247{
248 return 0;
249}
250
251void ceph_debugfs_cleanup(void)
252{
253}
254
255int ceph_debugfs_client_init(struct ceph_client *client)
256{
257 return 0;
258}
259
260void ceph_debugfs_client_cleanup(struct ceph_client *client)
261{
262}
263
264#endif /* CONFIG_DEBUG_FS */
265
266EXPORT_SYMBOL(ceph_debugfs_init);
267EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
new file mode 100644
index 000000000000..0e8157ee5d43
--- /dev/null
+++ b/net/ceph/messenger.c
@@ -0,0 +1,2453 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <linux/bio.h>
13#include <linux/blkdev.h>
14#include <net/tcp.h>
15
16#include <linux/ceph/libceph.h>
17#include <linux/ceph/messenger.h>
18#include <linux/ceph/decode.h>
19#include <linux/ceph/pagelist.h>
20
21/*
22 * Ceph uses the messenger to exchange ceph_msg messages with other
23 * hosts in the system. The messenger provides ordered and reliable
24 * delivery. We tolerate TCP disconnects by reconnecting (with
25 * exponential backoff) in the case of a fault (disconnection, bad
26 * crc, protocol error). Acks allow sent messages to be discarded by
27 * the sender.
28 */
29
30/* static tag bytes (protocol control messages) */
31static char tag_msg = CEPH_MSGR_TAG_MSG;
32static char tag_ack = CEPH_MSGR_TAG_ACK;
33static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
34
35#ifdef CONFIG_LOCKDEP
36static struct lock_class_key socket_class;
37#endif
38
39
40static void queue_con(struct ceph_connection *con);
41static void con_work(struct work_struct *);
42static void ceph_fault(struct ceph_connection *con);
43
44/*
45 * nicely render a sockaddr as a string.
46 */
47#define MAX_ADDR_STR 20
48#define MAX_ADDR_STR_LEN 60
49static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
50static DEFINE_SPINLOCK(addr_str_lock);
51static int last_addr_str;
52
53const char *ceph_pr_addr(const struct sockaddr_storage *ss)
54{
55 int i;
56 char *s;
57 struct sockaddr_in *in4 = (void *)ss;
58 struct sockaddr_in6 *in6 = (void *)ss;
59
60 spin_lock(&addr_str_lock);
61 i = last_addr_str++;
62 if (last_addr_str == MAX_ADDR_STR)
63 last_addr_str = 0;
64 spin_unlock(&addr_str_lock);
65 s = addr_str[i];
66
67 switch (ss->ss_family) {
68 case AF_INET:
69 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
70 (unsigned int)ntohs(in4->sin_port));
71 break;
72
73 case AF_INET6:
74 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
75 (unsigned int)ntohs(in6->sin6_port));
76 break;
77
78 default:
79 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
80 }
81
82 return s;
83}
84EXPORT_SYMBOL(ceph_pr_addr);
85
86static void encode_my_addr(struct ceph_messenger *msgr)
87{
88 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
89 ceph_encode_addr(&msgr->my_enc_addr);
90}
91
92/*
93 * work queue for all reading and writing to/from the socket.
94 */
95struct workqueue_struct *ceph_msgr_wq;
96
97int ceph_msgr_init(void)
98{
99 ceph_msgr_wq = create_workqueue("ceph-msgr");
100 if (IS_ERR(ceph_msgr_wq)) {
101 int ret = PTR_ERR(ceph_msgr_wq);
102 pr_err("msgr_init failed to create workqueue: %d\n", ret);
103 ceph_msgr_wq = NULL;
104 return ret;
105 }
106 return 0;
107}
108EXPORT_SYMBOL(ceph_msgr_init);
109
110void ceph_msgr_exit(void)
111{
112 destroy_workqueue(ceph_msgr_wq);
113}
114EXPORT_SYMBOL(ceph_msgr_exit);
115
116void ceph_msgr_flush(void)
117{
118 flush_workqueue(ceph_msgr_wq);
119}
120EXPORT_SYMBOL(ceph_msgr_flush);
121
122
123/*
124 * socket callback functions
125 */
126
127/* data available on socket, or listen socket received a connect */
128static void ceph_data_ready(struct sock *sk, int count_unused)
129{
130 struct ceph_connection *con =
131 (struct ceph_connection *)sk->sk_user_data;
132 if (sk->sk_state != TCP_CLOSE_WAIT) {
133 dout("ceph_data_ready on %p state = %lu, queueing work\n",
134 con, con->state);
135 queue_con(con);
136 }
137}
138
139/* socket has buffer space for writing */
140static void ceph_write_space(struct sock *sk)
141{
142 struct ceph_connection *con =
143 (struct ceph_connection *)sk->sk_user_data;
144
145 /* only queue to workqueue if there is data we want to write. */
146 if (test_bit(WRITE_PENDING, &con->state)) {
147 dout("ceph_write_space %p queueing write work\n", con);
148 queue_con(con);
149 } else {
150 dout("ceph_write_space %p nothing to write\n", con);
151 }
152
153 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
154 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
155}
156
157/* socket's state has changed */
158static void ceph_state_change(struct sock *sk)
159{
160 struct ceph_connection *con =
161 (struct ceph_connection *)sk->sk_user_data;
162
163 dout("ceph_state_change %p state = %lu sk_state = %u\n",
164 con, con->state, sk->sk_state);
165
166 if (test_bit(CLOSED, &con->state))
167 return;
168
169 switch (sk->sk_state) {
170 case TCP_CLOSE:
171 dout("ceph_state_change TCP_CLOSE\n");
172 case TCP_CLOSE_WAIT:
173 dout("ceph_state_change TCP_CLOSE_WAIT\n");
174 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
175 if (test_bit(CONNECTING, &con->state))
176 con->error_msg = "connection failed";
177 else
178 con->error_msg = "socket closed";
179 queue_con(con);
180 }
181 break;
182 case TCP_ESTABLISHED:
183 dout("ceph_state_change TCP_ESTABLISHED\n");
184 queue_con(con);
185 break;
186 }
187}
188
189/*
190 * set up socket callbacks
191 */
192static void set_sock_callbacks(struct socket *sock,
193 struct ceph_connection *con)
194{
195 struct sock *sk = sock->sk;
196 sk->sk_user_data = (void *)con;
197 sk->sk_data_ready = ceph_data_ready;
198 sk->sk_write_space = ceph_write_space;
199 sk->sk_state_change = ceph_state_change;
200}
201
202
203/*
204 * socket helpers
205 */
206
207/*
208 * initiate connection to a remote socket.
209 */
210static struct socket *ceph_tcp_connect(struct ceph_connection *con)
211{
212 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
213 struct socket *sock;
214 int ret;
215
216 BUG_ON(con->sock);
217 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
218 IPPROTO_TCP, &sock);
219 if (ret)
220 return ERR_PTR(ret);
221 con->sock = sock;
222 sock->sk->sk_allocation = GFP_NOFS;
223
224#ifdef CONFIG_LOCKDEP
225 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
226#endif
227
228 set_sock_callbacks(sock, con);
229
230 dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
231
232 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
233 O_NONBLOCK);
234 if (ret == -EINPROGRESS) {
235 dout("connect %s EINPROGRESS sk_state = %u\n",
236 ceph_pr_addr(&con->peer_addr.in_addr),
237 sock->sk->sk_state);
238 ret = 0;
239 }
240 if (ret < 0) {
241 pr_err("connect %s error %d\n",
242 ceph_pr_addr(&con->peer_addr.in_addr), ret);
243 sock_release(sock);
244 con->sock = NULL;
245 con->error_msg = "connect error";
246 }
247
248 if (ret < 0)
249 return ERR_PTR(ret);
250 return sock;
251}
252
253static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
254{
255 struct kvec iov = {buf, len};
256 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
257
258 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
259}
260
261/*
262 * write something. @more is true if caller will be sending more data
263 * shortly.
264 */
265static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
266 size_t kvlen, size_t len, int more)
267{
268 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
269
270 if (more)
271 msg.msg_flags |= MSG_MORE;
272 else
273 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
274
275 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
276}
277
278
279/*
280 * Shutdown/close the socket for the given connection.
281 */
282static int con_close_socket(struct ceph_connection *con)
283{
284 int rc;
285
286 dout("con_close_socket on %p sock %p\n", con, con->sock);
287 if (!con->sock)
288 return 0;
289 set_bit(SOCK_CLOSED, &con->state);
290 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
291 sock_release(con->sock);
292 con->sock = NULL;
293 clear_bit(SOCK_CLOSED, &con->state);
294 return rc;
295}
296
297/*
298 * Reset a connection. Discard all incoming and outgoing messages
299 * and clear *_seq state.
300 */
301static void ceph_msg_remove(struct ceph_msg *msg)
302{
303 list_del_init(&msg->list_head);
304 ceph_msg_put(msg);
305}
306static void ceph_msg_remove_list(struct list_head *head)
307{
308 while (!list_empty(head)) {
309 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
310 list_head);
311 ceph_msg_remove(msg);
312 }
313}
314
315static void reset_connection(struct ceph_connection *con)
316{
317 /* reset connection, out_queue, msg_ and connect_seq */
318 /* discard existing out_queue and msg_seq */
319 ceph_msg_remove_list(&con->out_queue);
320 ceph_msg_remove_list(&con->out_sent);
321
322 if (con->in_msg) {
323 ceph_msg_put(con->in_msg);
324 con->in_msg = NULL;
325 }
326
327 con->connect_seq = 0;
328 con->out_seq = 0;
329 if (con->out_msg) {
330 ceph_msg_put(con->out_msg);
331 con->out_msg = NULL;
332 }
333 con->out_keepalive_pending = false;
334 con->in_seq = 0;
335 con->in_seq_acked = 0;
336}
337
338/*
339 * mark a peer down. drop any open connections.
340 */
/*
 * Flags the connection CLOSED and clears the STANDBY, LOSSYTX,
 * KEEPALIVE_PENDING and WRITE_PENDING state bits before taking
 * con->mutex.  Under the mutex it resets the connection, zeroes
 * peer_global_seq and cancels the delayed work.  Finally calls
 * queue_con() with the mutex released.
 */
341void ceph_con_close(struct ceph_connection *con)
342{
343 dout("con_close %p peer %s\n", con,
344 ceph_pr_addr(&con->peer_addr.in_addr));
345 set_bit(CLOSED, &con->state); /* in case there's queued work */
346 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
347 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
348 clear_bit(KEEPALIVE_PENDING, &con->state);
349 clear_bit(WRITE_PENDING, &con->state);
350 mutex_lock(&con->mutex);
351 reset_connection(con); /* drops queued/sent/in-flight messages and zeroes the seq counters */
352 con->peer_global_seq = 0;
353 cancel_delayed_work(&con->work);
354 mutex_unlock(&con->mutex);
355 queue_con(con);
356}
357EXPORT_SYMBOL(ceph_con_close);
358
359/*
360 * Reopen a closed connection, with a new peer address.
361 */
/*
 * Sets OPENING and clears CLOSED, copies *addr into con->peer_addr,
 * zeroes the backoff delay and calls queue_con().
 * NOTE(review): no lock is taken here -- confirm callers serialise
 * against the connection worker.
 */
362void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
363{
364 dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
365 set_bit(OPENING, &con->state);
366 clear_bit(CLOSED, &con->state);
367 memcpy(&con->peer_addr, addr, sizeof(*addr));
368 con->delay = 0; /* reset backoff memory */
369 queue_con(con);
370}
371EXPORT_SYMBOL(ceph_con_open);
372
373/*
374 * return true if this connection ever successfully opened
375 */
376bool ceph_con_opened(struct ceph_connection *con)
377{
378 return con->connect_seq > 0;
379}
380
381/*
382 * generic get/put
383 */
384struct ceph_connection *ceph_con_get(struct ceph_connection *con)
385{
386 dout("con_get %p nref = %d -> %d\n", con,
387 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
388 if (atomic_inc_not_zero(&con->nref))
389 return con;
390 return NULL;
391}
392
393void ceph_con_put(struct ceph_connection *con)
394{
395 dout("con_put %p nref = %d -> %d\n", con,
396 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
397 BUG_ON(atomic_read(&con->nref) == 0);
398 if (atomic_dec_and_test(&con->nref)) {
399 BUG_ON(con->sock);
400 kfree(con);
401 }
402}
403
404/*
405 * initialize a new connection.
406 */
/*
 * Zeroes *con, then sets the reference count to 1, records the owning
 * messenger and initialises the mutex, the out_queue/out_sent lists and
 * the delayed work item (handler: con_work).
 */
407void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
408{
409 dout("con_init %p\n", con);
410 memset(con, 0, sizeof(*con)); /* everything not set below starts out zero/NULL */
411 atomic_set(&con->nref, 1);
412 con->msgr = msgr;
413 mutex_init(&con->mutex);
414 INIT_LIST_HEAD(&con->out_queue);
415 INIT_LIST_HEAD(&con->out_sent);
416 INIT_DELAYED_WORK(&con->work, con_work);
417}
418EXPORT_SYMBOL(ceph_con_init);
419
420
421/*
422 * We maintain a global counter to order connection attempts. Get
423 * a unique seq greater than @gt.
424 */
425static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
426{
427 u32 ret;
428
429 spin_lock(&msgr->global_seq_lock);
430 if (msgr->global_seq < gt)
431 msgr->global_seq = gt;
432 ret = ++msgr->global_seq;
433 spin_unlock(&msgr->global_seq_lock);
434 return ret;
435}
436
437
438/*
439 * Prepare footer for currently outgoing message, and finish things
440 * off. Assumes out_kvec* are already valid.. we just add on to the end.
441 */
442static void prepare_write_message_footer(struct ceph_connection *con, int v)
443{
444 struct ceph_msg *m = con->out_msg;
445
446 dout("prepare_write_message_footer %p\n", con);
447 con->out_kvec_is_msg = true;
448 con->out_kvec[v].iov_base = &m->footer;
449 con->out_kvec[v].iov_len = sizeof(m->footer);
450 con->out_kvec_bytes += sizeof(m->footer);
451 con->out_kvec_left++;
452 con->out_more = m->more_to_follow;
453 con->out_msg_done = true;
454}
455
456/*
457 * Prepare headers for the next outgoing message.
458 */
459static void prepare_write_message(struct ceph_connection *con)
460{
461 struct ceph_msg *m;
462 int v = 0;
463
464 con->out_kvec_bytes = 0;
465 con->out_kvec_is_msg = true;
466 con->out_msg_done = false;
467
468 /* Sneak an ack in there first? If we can get it into the same
469 * TCP packet that's a good thing. */
470 if (con->in_seq > con->in_seq_acked) {
471 con->in_seq_acked = con->in_seq;
472 con->out_kvec[v].iov_base = &tag_ack;
473 con->out_kvec[v++].iov_len = 1;
474 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
475 con->out_kvec[v].iov_base = &con->out_temp_ack;
476 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
477 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
478 }
479
480 m = list_first_entry(&con->out_queue,
481 struct ceph_msg, list_head);
482 con->out_msg = m;
483 if (test_bit(LOSSYTX, &con->state)) {
484 list_del_init(&m->list_head);
485 } else {
486 /* put message on sent list */
487 ceph_msg_get(m);
488 list_move_tail(&m->list_head, &con->out_sent);
489 }
490
491 /*
492 * only assign outgoing seq # if we haven't sent this message
493 * yet. if it is requeued, resend with it's original seq.
494 */
495 if (m->needs_out_seq) {
496 m->hdr.seq = cpu_to_le64(++con->out_seq);
497 m->needs_out_seq = false;
498 }
499
500 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
501 m, con->out_seq, le16_to_cpu(m->hdr.type),
502 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
503 le32_to_cpu(m->hdr.data_len),
504 m->nr_pages);
505 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
506
507 /* tag + hdr + front + middle */
508 con->out_kvec[v].iov_base = &tag_msg;
509 con->out_kvec[v++].iov_len = 1;
510 con->out_kvec[v].iov_base = &m->hdr;
511 con->out_kvec[v++].iov_len = sizeof(m->hdr);
512 con->out_kvec[v++] = m->front;
513 if (m->middle)
514 con->out_kvec[v++] = m->middle->vec;
515 con->out_kvec_left = v;
516 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
517 (m->middle ? m->middle->vec.iov_len : 0);
518 con->out_kvec_cur = con->out_kvec;
519
520 /* fill in crc (except data pages), footer */
521 con->out_msg->hdr.crc =
522 cpu_to_le32(crc32c(0, (void *)&m->hdr,
523 sizeof(m->hdr) - sizeof(m->hdr.crc)));
524 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
525 con->out_msg->footer.front_crc =
526 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
527 if (m->middle)
528 con->out_msg->footer.middle_crc =
529 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
530 m->middle->vec.iov_len));
531 else
532 con->out_msg->footer.middle_crc = 0;
533 con->out_msg->footer.data_crc = 0;
534 dout("prepare_write_message front_crc %u data_crc %u\n",
535 le32_to_cpu(con->out_msg->footer.front_crc),
536 le32_to_cpu(con->out_msg->footer.middle_crc));
537
538 /* is there a data payload? */
539 if (le32_to_cpu(m->hdr.data_len) > 0) {
540 /* initialize page iterator */
541 con->out_msg_pos.page = 0;
542 if (m->pages)
543 con->out_msg_pos.page_pos =
544 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
545 else
546 con->out_msg_pos.page_pos = 0;
547 con->out_msg_pos.data_pos = 0;
548 con->out_msg_pos.did_page_crc = 0;
549 con->out_more = 1; /* data + footer will follow */
550 } else {
551 /* no, queue up footer too and be done */
552 prepare_write_message_footer(con, v);
553 }
554
555 set_bit(WRITE_PENDING, &con->state);
556}
557
558/*
559 * Prepare an ack.
560 */
561static void prepare_write_ack(struct ceph_connection *con)
562{
563 dout("prepare_write_ack %p %llu -> %llu\n", con,
564 con->in_seq_acked, con->in_seq);
565 con->in_seq_acked = con->in_seq;
566
567 con->out_kvec[0].iov_base = &tag_ack;
568 con->out_kvec[0].iov_len = 1;
569 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
570 con->out_kvec[1].iov_base = &con->out_temp_ack;
571 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
572 con->out_kvec_left = 2;
573 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
574 con->out_kvec_cur = con->out_kvec;
575 con->out_more = 1; /* more will follow.. eventually.. */
576 set_bit(WRITE_PENDING, &con->state);
577}
578
579/*
580 * Prepare to write keepalive byte.
581 */
582static void prepare_write_keepalive(struct ceph_connection *con)
583{
584 dout("prepare_write_keepalive %p\n", con);
585 con->out_kvec[0].iov_base = &tag_keepalive;
586 con->out_kvec[0].iov_len = 1;
587 con->out_kvec_left = 1;
588 con->out_kvec_bytes = 1;
589 con->out_kvec_cur = con->out_kvec;
590 set_bit(WRITE_PENDING, &con->state);
591}
592
593/*
594 * Connection negotiation.
595 */
596
597static void prepare_connect_authorizer(struct ceph_connection *con)
598{
599 void *auth_buf;
600 int auth_len = 0;
601 int auth_protocol = 0;
602
603 mutex_unlock(&con->mutex);
604 if (con->ops->get_authorizer)
605 con->ops->get_authorizer(con, &auth_buf, &auth_len,
606 &auth_protocol, &con->auth_reply_buf,
607 &con->auth_reply_buf_len,
608 con->auth_retry);
609 mutex_lock(&con->mutex);
610
611 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
612 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
613
614 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
615 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
616 con->out_kvec_left++;
617 con->out_kvec_bytes += auth_len;
618}
619
620/*
621 * We connected to a peer and are saying hello.
622 */
623static void prepare_write_banner(struct ceph_messenger *msgr,
624 struct ceph_connection *con)
625{
626 int len = strlen(CEPH_BANNER);
627
628 con->out_kvec[0].iov_base = CEPH_BANNER;
629 con->out_kvec[0].iov_len = len;
630 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
631 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
632 con->out_kvec_left = 2;
633 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
634 con->out_kvec_cur = con->out_kvec;
635 con->out_more = 0;
636 set_bit(WRITE_PENDING, &con->state);
637}
638
639static void prepare_write_connect(struct ceph_messenger *msgr,
640 struct ceph_connection *con,
641 int after_banner)
642{
643 unsigned global_seq = get_global_seq(con->msgr, 0);
644 int proto;
645
646 switch (con->peer_name.type) {
647 case CEPH_ENTITY_TYPE_MON:
648 proto = CEPH_MONC_PROTOCOL;
649 break;
650 case CEPH_ENTITY_TYPE_OSD:
651 proto = CEPH_OSDC_PROTOCOL;
652 break;
653 case CEPH_ENTITY_TYPE_MDS:
654 proto = CEPH_MDSC_PROTOCOL;
655 break;
656 default:
657 BUG();
658 }
659
660 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
661 con->connect_seq, global_seq, proto);
662
663 con->out_connect.features = cpu_to_le64(msgr->supported_features);
664 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
665 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
666 con->out_connect.global_seq = cpu_to_le32(global_seq);
667 con->out_connect.protocol_version = cpu_to_le32(proto);
668 con->out_connect.flags = 0;
669
670 if (!after_banner) {
671 con->out_kvec_left = 0;
672 con->out_kvec_bytes = 0;
673 }
674 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
675 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
676 con->out_kvec_left++;
677 con->out_kvec_bytes += sizeof(con->out_connect);
678 con->out_kvec_cur = con->out_kvec;
679 con->out_more = 0;
680 set_bit(WRITE_PENDING, &con->state);
681
682 prepare_connect_authorizer(con);
683}
684
685
686/*
687 * write as much of pending kvecs to the socket as we can.
688 * 1 -> done
689 * 0 -> socket full, but more to do
690 * <0 -> error
691 */
692static int write_partial_kvec(struct ceph_connection *con)
693{
694 int ret;
695
696 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
697 while (con->out_kvec_bytes > 0) {
698 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
699 con->out_kvec_left, con->out_kvec_bytes,
700 con->out_more);
701 if (ret <= 0)
702 goto out;
703 con->out_kvec_bytes -= ret;
704 if (con->out_kvec_bytes == 0)
705 break; /* done */
706 while (ret > 0) {
707 if (ret >= con->out_kvec_cur->iov_len) {
708 ret -= con->out_kvec_cur->iov_len;
709 con->out_kvec_cur++;
710 con->out_kvec_left--;
711 } else {
712 con->out_kvec_cur->iov_len -= ret;
713 con->out_kvec_cur->iov_base += ret;
714 ret = 0;
715 break;
716 }
717 }
718 }
719 con->out_kvec_left = 0;
720 con->out_kvec_is_msg = false;
721 ret = 1;
722out:
723 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
724 con->out_kvec_bytes, con->out_kvec_left, ret);
725 return ret; /* done! */
726}
727
728#ifdef CONFIG_BLOCK
729static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
730{
731 if (!bio) {
732 *iter = NULL;
733 *seg = 0;
734 return;
735 }
736 *iter = bio;
737 *seg = bio->bi_idx;
738}
739
740static void iter_bio_next(struct bio **bio_iter, int *seg)
741{
742 if (*bio_iter == NULL)
743 return;
744
745 BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
746
747 (*seg)++;
748 if (*seg == (*bio_iter)->bi_vcnt)
749 init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
750}
751#endif
752
753/*
754 * Write as much message data payload as we can. If we finish, queue
755 * up the footer.
756 * 1 -> done, footer is now queued in out_kvec[].
757 * 0 -> socket full, but more to do
758 * <0 -> error
759 */
760static int write_partial_msg_pages(struct ceph_connection *con)
761{
762 struct ceph_msg *msg = con->out_msg;
763 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
764 size_t len;
765 int crc = con->msgr->nocrc;
766 int ret;
767 int total_max_write;
768 int in_trail = 0;
769 size_t trail_len = (msg->trail ? msg->trail->length : 0);
770
771 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
772 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
773 con->out_msg_pos.page_pos);
774
775#ifdef CONFIG_BLOCK
776 if (msg->bio && !msg->bio_iter)
777 init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
778#endif
779
780 while (data_len > con->out_msg_pos.data_pos) {
781 struct page *page = NULL;
782 void *kaddr = NULL;
783 int max_write = PAGE_SIZE;
784 int page_shift = 0;
785
786 total_max_write = data_len - trail_len -
787 con->out_msg_pos.data_pos;
788
789 /*
790 * if we are calculating the data crc (the default), we need
791 * to map the page. if our pages[] has been revoked, use the
792 * zero page.
793 */
794
795 /* have we reached the trail part of the data? */
796 if (con->out_msg_pos.data_pos >= data_len - trail_len) {
797 in_trail = 1;
798
799 total_max_write = data_len - con->out_msg_pos.data_pos;
800
801 page = list_first_entry(&msg->trail->head,
802 struct page, lru);
803 if (crc)
804 kaddr = kmap(page);
805 max_write = PAGE_SIZE;
806 } else if (msg->pages) {
807 page = msg->pages[con->out_msg_pos.page];
808 if (crc)
809 kaddr = kmap(page);
810 } else if (msg->pagelist) {
811 page = list_first_entry(&msg->pagelist->head,
812 struct page, lru);
813 if (crc)
814 kaddr = kmap(page);
815#ifdef CONFIG_BLOCK
816 } else if (msg->bio) {
817 struct bio_vec *bv;
818
819 bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
820 page = bv->bv_page;
821 page_shift = bv->bv_offset;
822 if (crc)
823 kaddr = kmap(page) + page_shift;
824 max_write = bv->bv_len;
825#endif
826 } else {
827 page = con->msgr->zero_page;
828 if (crc)
829 kaddr = page_address(con->msgr->zero_page);
830 }
831 len = min_t(int, max_write - con->out_msg_pos.page_pos,
832 total_max_write);
833
834 if (crc && !con->out_msg_pos.did_page_crc) {
835 void *base = kaddr + con->out_msg_pos.page_pos;
836 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
837
838 BUG_ON(kaddr == NULL);
839 con->out_msg->footer.data_crc =
840 cpu_to_le32(crc32c(tmpcrc, base, len));
841 con->out_msg_pos.did_page_crc = 1;
842 }
843 ret = kernel_sendpage(con->sock, page,
844 con->out_msg_pos.page_pos + page_shift,
845 len,
846 MSG_DONTWAIT | MSG_NOSIGNAL |
847 MSG_MORE);
848
849 if (crc &&
850 (msg->pages || msg->pagelist || msg->bio || in_trail))
851 kunmap(page);
852
853 if (ret <= 0)
854 goto out;
855
856 con->out_msg_pos.data_pos += ret;
857 con->out_msg_pos.page_pos += ret;
858 if (ret == len) {
859 con->out_msg_pos.page_pos = 0;
860 con->out_msg_pos.page++;
861 con->out_msg_pos.did_page_crc = 0;
862 if (in_trail)
863 list_move_tail(&page->lru,
864 &msg->trail->head);
865 else if (msg->pagelist)
866 list_move_tail(&page->lru,
867 &msg->pagelist->head);
868#ifdef CONFIG_BLOCK
869 else if (msg->bio)
870 iter_bio_next(&msg->bio_iter, &msg->bio_seg);
871#endif
872 }
873 }
874
875 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
876
877 /* prepare and queue up footer, too */
878 if (!crc)
879 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
880 con->out_kvec_bytes = 0;
881 con->out_kvec_left = 0;
882 con->out_kvec_cur = con->out_kvec;
883 prepare_write_message_footer(con, 0);
884 ret = 1;
885out:
886 return ret;
887}
888
889/*
890 * write some zeros
891 */
892static int write_partial_skip(struct ceph_connection *con)
893{
894 int ret;
895
896 while (con->out_skip > 0) {
897 struct kvec iov = {
898 .iov_base = page_address(con->msgr->zero_page),
899 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
900 };
901
902 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
903 if (ret <= 0)
904 goto out;
905 con->out_skip -= ret;
906 }
907 ret = 1;
908out:
909 return ret;
910}
911
912/*
913 * Prepare to read connection handshake, or an ack.
914 */
915static void prepare_read_banner(struct ceph_connection *con)
916{
917 dout("prepare_read_banner %p\n", con);
918 con->in_base_pos = 0;
919}
920
921static void prepare_read_connect(struct ceph_connection *con)
922{
923 dout("prepare_read_connect %p\n", con);
924 con->in_base_pos = 0;
925}
926
927static void prepare_read_ack(struct ceph_connection *con)
928{
929 dout("prepare_read_ack %p\n", con);
930 con->in_base_pos = 0;
931}
932
933static void prepare_read_tag(struct ceph_connection *con)
934{
935 dout("prepare_read_tag %p\n", con);
936 con->in_base_pos = 0;
937 con->in_tag = CEPH_MSGR_TAG_READY;
938}
939
940/*
941 * Prepare to read a message.
942 */
943static int prepare_read_message(struct ceph_connection *con)
944{
945 dout("prepare_read_message %p\n", con);
946 BUG_ON(con->in_msg != NULL);
947 con->in_base_pos = 0;
948 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
949 return 0;
950}
951
952
953static int read_partial(struct ceph_connection *con,
954 int *to, int size, void *object)
955{
956 *to += size;
957 while (con->in_base_pos < *to) {
958 int left = *to - con->in_base_pos;
959 int have = size - left;
960 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
961 if (ret <= 0)
962 return ret;
963 con->in_base_pos += ret;
964 }
965 return 1;
966}
967
968
969/*
970 * Read all or part of the connect-side handshake on a new connection
971 */
972static int read_partial_banner(struct ceph_connection *con)
973{
974 int ret, to = 0;
975
976 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
977
978 /* peer's banner */
979 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
980 if (ret <= 0)
981 goto out;
982 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
983 &con->actual_peer_addr);
984 if (ret <= 0)
985 goto out;
986 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
987 &con->peer_addr_for_me);
988 if (ret <= 0)
989 goto out;
990out:
991 return ret;
992}
993
994static int read_partial_connect(struct ceph_connection *con)
995{
996 int ret, to = 0;
997
998 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
999
1000 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
1001 if (ret <= 0)
1002 goto out;
1003 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
1004 con->auth_reply_buf);
1005 if (ret <= 0)
1006 goto out;
1007
1008 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
1009 con, (int)con->in_reply.tag,
1010 le32_to_cpu(con->in_reply.connect_seq),
1011 le32_to_cpu(con->in_reply.global_seq));
1012out:
1013 return ret;
1014
1015}
1016
1017/*
1018 * Verify the hello banner looks okay.
1019 */
1020static int verify_hello(struct ceph_connection *con)
1021{
1022 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
1023 pr_err("connect to %s got bad banner\n",
1024 ceph_pr_addr(&con->peer_addr.in_addr));
1025 con->error_msg = "protocol error, bad banner";
1026 return -1;
1027 }
1028 return 0;
1029}
1030
/*
 * Return true if @ss holds an all-zero IPv4 or IPv6 address.  Any other
 * address family is reported as not blank.
 */
static bool addr_is_blank(struct sockaddr_storage *ss)
{
	if (ss->ss_family == AF_INET) {
		struct sockaddr_in *v4 = (struct sockaddr_in *)ss;

		return v4->sin_addr.s_addr == 0;
	}

	if (ss->ss_family == AF_INET6) {
		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)ss;
		int word;

		/* blank only if all four 32-bit words are zero */
		for (word = 0; word < 4; word++) {
			if (v6->sin6_addr.s6_addr32[word] != 0)
				return false;
		}
		return true;
	}

	return false;
}
1045
/*
 * Return the port stored in @ss in host byte order, or 0 if the address
 * family is neither AF_INET nor AF_INET6.
 */
static int addr_port(struct sockaddr_storage *ss)
{
	if (ss->ss_family == AF_INET) {
		struct sockaddr_in *v4 = (struct sockaddr_in *)ss;

		return ntohs(v4->sin_port);
	}
	if (ss->ss_family == AF_INET6) {
		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)ss;

		return ntohs(v6->sin6_port);
	}
	return 0;
}
1056
/*
 * Store port @p (host byte order) into @ss in network byte order.
 * Only AF_INET and AF_INET6 addresses are touched; any other family is
 * left unchanged.
 */
static void addr_set_port(struct sockaddr_storage *ss, int p)
{
	switch (ss->ss_family) {
	case AF_INET:
		((struct sockaddr_in *)ss)->sin_port = htons(p);
		break;	/* was missing: fell through into the IPv6 store */
	case AF_INET6:
		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
		break;
	}
}
1066
1067/*
1068 * Parse an ip[:port] list into an addr array. Use the default
1069 * monitor port if a port isn't specified.
1070 */
1071int ceph_parse_ips(const char *c, const char *end,
1072 struct ceph_entity_addr *addr,
1073 int max_count, int *count)
1074{
1075 int i;
1076 const char *p = c;
1077
1078 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
1079 for (i = 0; i < max_count; i++) {
1080 const char *ipend;
1081 struct sockaddr_storage *ss = &addr[i].in_addr;
1082 struct sockaddr_in *in4 = (void *)ss;
1083 struct sockaddr_in6 *in6 = (void *)ss;
1084 int port;
1085 char delim = ',';
1086
1087 if (*p == '[') {
1088 delim = ']';
1089 p++;
1090 }
1091
1092 memset(ss, 0, sizeof(*ss));
1093 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1094 delim, &ipend))
1095 ss->ss_family = AF_INET;
1096 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1097 delim, &ipend))
1098 ss->ss_family = AF_INET6;
1099 else
1100 goto bad;
1101 p = ipend;
1102
1103 if (delim == ']') {
1104 if (*p != ']') {
1105 dout("missing matching ']'\n");
1106 goto bad;
1107 }
1108 p++;
1109 }
1110
1111 /* port? */
1112 if (p < end && *p == ':') {
1113 port = 0;
1114 p++;
1115 while (p < end && *p >= '0' && *p <= '9') {
1116 port = (port * 10) + (*p - '0');
1117 p++;
1118 }
1119 if (port > 65535 || port == 0)
1120 goto bad;
1121 } else {
1122 port = CEPH_MON_PORT;
1123 }
1124
1125 addr_set_port(ss, port);
1126
1127 dout("parse_ips got %s\n", ceph_pr_addr(ss));
1128
1129 if (p == end)
1130 break;
1131 if (*p != ',')
1132 goto bad;
1133 p++;
1134 }
1135
1136 if (p != end)
1137 goto bad;
1138
1139 if (count)
1140 *count = i + 1;
1141 return 0;
1142
1143bad:
1144 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1145 return -EINVAL;
1146}
1147EXPORT_SYMBOL(ceph_parse_ips);
1148
1149static int process_banner(struct ceph_connection *con)
1150{
1151 dout("process_banner on %p\n", con);
1152
1153 if (verify_hello(con) < 0)
1154 return -1;
1155
1156 ceph_decode_addr(&con->actual_peer_addr);
1157 ceph_decode_addr(&con->peer_addr_for_me);
1158
1159 /*
1160 * Make sure the other end is who we wanted. note that the other
1161 * end may not yet know their ip address, so if it's 0.0.0.0, give
1162 * them the benefit of the doubt.
1163 */
1164 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1165 sizeof(con->peer_addr)) != 0 &&
1166 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1167 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1168 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1169 ceph_pr_addr(&con->peer_addr.in_addr),
1170 (int)le32_to_cpu(con->peer_addr.nonce),
1171 ceph_pr_addr(&con->actual_peer_addr.in_addr),
1172 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1173 con->error_msg = "wrong peer at address";
1174 return -1;
1175 }
1176
1177 /*
1178 * did we learn our address?
1179 */
1180 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1181 int port = addr_port(&con->msgr->inst.addr.in_addr);
1182
1183 memcpy(&con->msgr->inst.addr.in_addr,
1184 &con->peer_addr_for_me.in_addr,
1185 sizeof(con->peer_addr_for_me.in_addr));
1186 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1187 encode_my_addr(con->msgr);
1188 dout("process_banner learned my addr is %s\n",
1189 ceph_pr_addr(&con->msgr->inst.addr.in_addr));
1190 }
1191
1192 set_bit(NEGOTIATING, &con->state);
1193 prepare_read_connect(con);
1194 return 0;
1195}
1196
1197static void fail_protocol(struct ceph_connection *con)
1198{
1199 reset_connection(con);
1200 set_bit(CLOSED, &con->state); /* in case there's queued work */
1201
1202 mutex_unlock(&con->mutex);
1203 if (con->ops->bad_proto)
1204 con->ops->bad_proto(con);
1205 mutex_lock(&con->mutex);
1206}
1207
1208static int process_connect(struct ceph_connection *con)
1209{
1210 u64 sup_feat = con->msgr->supported_features;
1211 u64 req_feat = con->msgr->required_features;
1212 u64 server_feat = le64_to_cpu(con->in_reply.features);
1213
1214 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1215
1216 switch (con->in_reply.tag) {
1217 case CEPH_MSGR_TAG_FEATURES:
1218 pr_err("%s%lld %s feature set mismatch,"
1219 " my %llx < server's %llx, missing %llx\n",
1220 ENTITY_NAME(con->peer_name),
1221 ceph_pr_addr(&con->peer_addr.in_addr),
1222 sup_feat, server_feat, server_feat & ~sup_feat);
1223 con->error_msg = "missing required protocol features";
1224 fail_protocol(con);
1225 return -1;
1226
1227 case CEPH_MSGR_TAG_BADPROTOVER:
1228 pr_err("%s%lld %s protocol version mismatch,"
1229 " my %d != server's %d\n",
1230 ENTITY_NAME(con->peer_name),
1231 ceph_pr_addr(&con->peer_addr.in_addr),
1232 le32_to_cpu(con->out_connect.protocol_version),
1233 le32_to_cpu(con->in_reply.protocol_version));
1234 con->error_msg = "protocol version mismatch";
1235 fail_protocol(con);
1236 return -1;
1237
1238 case CEPH_MSGR_TAG_BADAUTHORIZER:
1239 con->auth_retry++;
1240 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1241 con->auth_retry);
1242 if (con->auth_retry == 2) {
1243 con->error_msg = "connect authorization failure";
1244 reset_connection(con);
1245 set_bit(CLOSED, &con->state);
1246 return -1;
1247 }
1248 con->auth_retry = 1;
1249 prepare_write_connect(con->msgr, con, 0);
1250 prepare_read_connect(con);
1251 break;
1252
1253 case CEPH_MSGR_TAG_RESETSESSION:
1254 /*
1255 * If we connected with a large connect_seq but the peer
1256 * has no record of a session with us (no connection, or
1257 * connect_seq == 0), they will send RESETSESION to indicate
1258 * that they must have reset their session, and may have
1259 * dropped messages.
1260 */
1261 dout("process_connect got RESET peer seq %u\n",
1262 le32_to_cpu(con->in_connect.connect_seq));
1263 pr_err("%s%lld %s connection reset\n",
1264 ENTITY_NAME(con->peer_name),
1265 ceph_pr_addr(&con->peer_addr.in_addr));
1266 reset_connection(con);
1267 prepare_write_connect(con->msgr, con, 0);
1268 prepare_read_connect(con);
1269
1270 /* Tell ceph about it. */
1271 mutex_unlock(&con->mutex);
1272 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1273 if (con->ops->peer_reset)
1274 con->ops->peer_reset(con);
1275 mutex_lock(&con->mutex);
1276 break;
1277
1278 case CEPH_MSGR_TAG_RETRY_SESSION:
1279 /*
1280 * If we sent a smaller connect_seq than the peer has, try
1281 * again with a larger value.
1282 */
1283 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1284 le32_to_cpu(con->out_connect.connect_seq),
1285 le32_to_cpu(con->in_connect.connect_seq));
1286 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1287 prepare_write_connect(con->msgr, con, 0);
1288 prepare_read_connect(con);
1289 break;
1290
1291 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1292 /*
1293 * If we sent a smaller global_seq than the peer has, try
1294 * again with a larger value.
1295 */
1296 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1297 con->peer_global_seq,
1298 le32_to_cpu(con->in_connect.global_seq));
1299 get_global_seq(con->msgr,
1300 le32_to_cpu(con->in_connect.global_seq));
1301 prepare_write_connect(con->msgr, con, 0);
1302 prepare_read_connect(con);
1303 break;
1304
1305 case CEPH_MSGR_TAG_READY:
1306 if (req_feat & ~server_feat) {
1307 pr_err("%s%lld %s protocol feature mismatch,"
1308 " my required %llx > server's %llx, need %llx\n",
1309 ENTITY_NAME(con->peer_name),
1310 ceph_pr_addr(&con->peer_addr.in_addr),
1311 req_feat, server_feat, req_feat & ~server_feat);
1312 con->error_msg = "missing required protocol features";
1313 fail_protocol(con);
1314 return -1;
1315 }
1316 clear_bit(CONNECTING, &con->state);
1317 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1318 con->connect_seq++;
1319 con->peer_features = server_feat;
1320 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1321 con->peer_global_seq,
1322 le32_to_cpu(con->in_reply.connect_seq),
1323 con->connect_seq);
1324 WARN_ON(con->connect_seq !=
1325 le32_to_cpu(con->in_reply.connect_seq));
1326
1327 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1328 set_bit(LOSSYTX, &con->state);
1329
1330 prepare_read_tag(con);
1331 break;
1332
1333 case CEPH_MSGR_TAG_WAIT:
1334 /*
1335 * If there is a connection race (we are opening
1336 * connections to each other), one of us may just have
1337 * to WAIT. This shouldn't happen if we are the
1338 * client.
1339 */
1340 pr_err("process_connect peer connecting WAIT\n");
1341
1342 default:
1343 pr_err("connect protocol error, will retry\n");
1344 con->error_msg = "protocol error, garbage tag during connect";
1345 return -1;
1346 }
1347 return 0;
1348}
1349
1350
1351/*
1352 * read (part of) an ack
1353 */
1354static int read_partial_ack(struct ceph_connection *con)
1355{
1356 int to = 0;
1357
1358 return read_partial(con, &to, sizeof(con->in_temp_ack),
1359 &con->in_temp_ack);
1360}
1361
1362
1363/*
1364 * We can finally discard anything that's been acked.
1365 */
1366static void process_ack(struct ceph_connection *con)
1367{
1368 struct ceph_msg *m;
1369 u64 ack = le64_to_cpu(con->in_temp_ack);
1370 u64 seq;
1371
1372 while (!list_empty(&con->out_sent)) {
1373 m = list_first_entry(&con->out_sent, struct ceph_msg,
1374 list_head);
1375 seq = le64_to_cpu(m->hdr.seq);
1376 if (seq > ack)
1377 break;
1378 dout("got ack for seq %llu type %d at %p\n", seq,
1379 le16_to_cpu(m->hdr.type), m);
1380 ceph_msg_remove(m);
1381 }
1382 prepare_read_tag(con);
1383}
1384
1385
1386
1387
1388static int read_partial_message_section(struct ceph_connection *con,
1389 struct kvec *section,
1390 unsigned int sec_len, u32 *crc)
1391{
1392 int ret, left;
1393
1394 BUG_ON(!section);
1395
1396 while (section->iov_len < sec_len) {
1397 BUG_ON(section->iov_base == NULL);
1398 left = sec_len - section->iov_len;
1399 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1400 section->iov_len, left);
1401 if (ret <= 0)
1402 return ret;
1403 section->iov_len += ret;
1404 if (section->iov_len == sec_len)
1405 *crc = crc32c(0, section->iov_base,
1406 section->iov_len);
1407 }
1408
1409 return 1;
1410}
1411
1412static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1413 struct ceph_msg_header *hdr,
1414 int *skip);
1415
1416
1417static int read_partial_message_pages(struct ceph_connection *con,
1418 struct page **pages,
1419 unsigned data_len, int datacrc)
1420{
1421 void *p;
1422 int ret;
1423 int left;
1424
1425 left = min((int)(data_len - con->in_msg_pos.data_pos),
1426 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1427 /* (page) data */
1428 BUG_ON(pages == NULL);
1429 p = kmap(pages[con->in_msg_pos.page]);
1430 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1431 left);
1432 if (ret > 0 && datacrc)
1433 con->in_data_crc =
1434 crc32c(con->in_data_crc,
1435 p + con->in_msg_pos.page_pos, ret);
1436 kunmap(pages[con->in_msg_pos.page]);
1437 if (ret <= 0)
1438 return ret;
1439 con->in_msg_pos.data_pos += ret;
1440 con->in_msg_pos.page_pos += ret;
1441 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1442 con->in_msg_pos.page_pos = 0;
1443 con->in_msg_pos.page++;
1444 }
1445
1446 return ret;
1447}
1448
1449#ifdef CONFIG_BLOCK
1450static int read_partial_message_bio(struct ceph_connection *con,
1451 struct bio **bio_iter, int *bio_seg,
1452 unsigned data_len, int datacrc)
1453{
1454 struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
1455 void *p;
1456 int ret, left;
1457
1458 if (IS_ERR(bv))
1459 return PTR_ERR(bv);
1460
1461 left = min((int)(data_len - con->in_msg_pos.data_pos),
1462 (int)(bv->bv_len - con->in_msg_pos.page_pos));
1463
1464 p = kmap(bv->bv_page) + bv->bv_offset;
1465
1466 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1467 left);
1468 if (ret > 0 && datacrc)
1469 con->in_data_crc =
1470 crc32c(con->in_data_crc,
1471 p + con->in_msg_pos.page_pos, ret);
1472 kunmap(bv->bv_page);
1473 if (ret <= 0)
1474 return ret;
1475 con->in_msg_pos.data_pos += ret;
1476 con->in_msg_pos.page_pos += ret;
1477 if (con->in_msg_pos.page_pos == bv->bv_len) {
1478 con->in_msg_pos.page_pos = 0;
1479 iter_bio_next(bio_iter, bio_seg);
1480 }
1481
1482 return ret;
1483}
1484#endif
1485
1486/*
1487 * read (part of) a message.
1488 */
1489static int read_partial_message(struct ceph_connection *con)
1490{
1491 struct ceph_msg *m = con->in_msg;
1492 int ret;
1493 int to, left;
1494 unsigned front_len, middle_len, data_len, data_off;
1495 int datacrc = con->msgr->nocrc;
1496 int skip;
1497 u64 seq;
1498
1499 dout("read_partial_message con %p msg %p\n", con, m);
1500
1501 /* header */
1502 while (con->in_base_pos < sizeof(con->in_hdr)) {
1503 left = sizeof(con->in_hdr) - con->in_base_pos;
1504 ret = ceph_tcp_recvmsg(con->sock,
1505 (char *)&con->in_hdr + con->in_base_pos,
1506 left);
1507 if (ret <= 0)
1508 return ret;
1509 con->in_base_pos += ret;
1510 if (con->in_base_pos == sizeof(con->in_hdr)) {
1511 u32 crc = crc32c(0, (void *)&con->in_hdr,
1512 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1513 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1514 pr_err("read_partial_message bad hdr "
1515 " crc %u != expected %u\n",
1516 crc, con->in_hdr.crc);
1517 return -EBADMSG;
1518 }
1519 }
1520 }
1521 front_len = le32_to_cpu(con->in_hdr.front_len);
1522 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1523 return -EIO;
1524 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1525 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1526 return -EIO;
1527 data_len = le32_to_cpu(con->in_hdr.data_len);
1528 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1529 return -EIO;
1530 data_off = le16_to_cpu(con->in_hdr.data_off);
1531
1532 /* verify seq# */
1533 seq = le64_to_cpu(con->in_hdr.seq);
1534 if ((s64)seq - (s64)con->in_seq < 1) {
1535 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1536 ENTITY_NAME(con->peer_name),
1537 ceph_pr_addr(&con->peer_addr.in_addr),
1538 seq, con->in_seq + 1);
1539 con->in_base_pos = -front_len - middle_len - data_len -
1540 sizeof(m->footer);
1541 con->in_tag = CEPH_MSGR_TAG_READY;
1542 con->in_seq++;
1543 return 0;
1544 } else if ((s64)seq - (s64)con->in_seq > 1) {
1545 pr_err("read_partial_message bad seq %lld expected %lld\n",
1546 seq, con->in_seq + 1);
1547 con->error_msg = "bad message sequence # for incoming message";
1548 return -EBADMSG;
1549 }
1550
1551 /* allocate message? */
1552 if (!con->in_msg) {
1553 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1554 con->in_hdr.front_len, con->in_hdr.data_len);
1555 skip = 0;
1556 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1557 if (skip) {
1558 /* skip this message */
1559 dout("alloc_msg said skip message\n");
1560 BUG_ON(con->in_msg);
1561 con->in_base_pos = -front_len - middle_len - data_len -
1562 sizeof(m->footer);
1563 con->in_tag = CEPH_MSGR_TAG_READY;
1564 con->in_seq++;
1565 return 0;
1566 }
1567 if (!con->in_msg) {
1568 con->error_msg =
1569 "error allocating memory for incoming message";
1570 return -ENOMEM;
1571 }
1572 m = con->in_msg;
1573 m->front.iov_len = 0; /* haven't read it yet */
1574 if (m->middle)
1575 m->middle->vec.iov_len = 0;
1576
1577 con->in_msg_pos.page = 0;
1578 if (m->pages)
1579 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1580 else
1581 con->in_msg_pos.page_pos = 0;
1582 con->in_msg_pos.data_pos = 0;
1583 }
1584
1585 /* front */
1586 ret = read_partial_message_section(con, &m->front, front_len,
1587 &con->in_front_crc);
1588 if (ret <= 0)
1589 return ret;
1590
1591 /* middle */
1592 if (m->middle) {
1593 ret = read_partial_message_section(con, &m->middle->vec,
1594 middle_len,
1595 &con->in_middle_crc);
1596 if (ret <= 0)
1597 return ret;
1598 }
1599#ifdef CONFIG_BLOCK
1600 if (m->bio && !m->bio_iter)
1601 init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
1602#endif
1603
1604 /* (page) data */
1605 while (con->in_msg_pos.data_pos < data_len) {
1606 if (m->pages) {
1607 ret = read_partial_message_pages(con, m->pages,
1608 data_len, datacrc);
1609 if (ret <= 0)
1610 return ret;
1611#ifdef CONFIG_BLOCK
1612 } else if (m->bio) {
1613
1614 ret = read_partial_message_bio(con,
1615 &m->bio_iter, &m->bio_seg,
1616 data_len, datacrc);
1617 if (ret <= 0)
1618 return ret;
1619#endif
1620 } else {
1621 BUG_ON(1);
1622 }
1623 }
1624
1625 /* footer */
1626 to = sizeof(m->hdr) + sizeof(m->footer);
1627 while (con->in_base_pos < to) {
1628 left = to - con->in_base_pos;
1629 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1630 (con->in_base_pos - sizeof(m->hdr)),
1631 left);
1632 if (ret <= 0)
1633 return ret;
1634 con->in_base_pos += ret;
1635 }
1636 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1637 m, front_len, m->footer.front_crc, middle_len,
1638 m->footer.middle_crc, data_len, m->footer.data_crc);
1639
1640 /* crc ok? */
1641 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1642 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1643 m, con->in_front_crc, m->footer.front_crc);
1644 return -EBADMSG;
1645 }
1646 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1647 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1648 m, con->in_middle_crc, m->footer.middle_crc);
1649 return -EBADMSG;
1650 }
1651 if (datacrc &&
1652 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1653 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1654 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1655 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1656 return -EBADMSG;
1657 }
1658
1659 return 1; /* done! */
1660}
1661
1662/*
1663 * Process message. This happens in the worker thread. The callback should
1664 * be careful not to do anything that waits on other incoming messages or it
1665 * may deadlock.
1666 */
1667static void process_message(struct ceph_connection *con)
1668{
1669 struct ceph_msg *msg;
1670
1671 msg = con->in_msg;
1672 con->in_msg = NULL;
1673
1674 /* if first message, set peer_name */
1675 if (con->peer_name.type == 0)
1676 con->peer_name = msg->hdr.src;
1677
1678 con->in_seq++;
1679 mutex_unlock(&con->mutex);
1680
1681 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1682 msg, le64_to_cpu(msg->hdr.seq),
1683 ENTITY_NAME(msg->hdr.src),
1684 le16_to_cpu(msg->hdr.type),
1685 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1686 le32_to_cpu(msg->hdr.front_len),
1687 le32_to_cpu(msg->hdr.data_len),
1688 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1689 con->ops->dispatch(con, msg);
1690
1691 mutex_lock(&con->mutex);
1692 prepare_read_tag(con);
1693}
1694
1695
1696/*
1697 * Write something to the socket. Called in a worker thread when the
1698 * socket appears to be writeable and we have something ready to send.
1699 */
1700static int try_write(struct ceph_connection *con)
1701{
1702 struct ceph_messenger *msgr = con->msgr;
1703 int ret = 1;
1704
1705 dout("try_write start %p state %lu nref %d\n", con, con->state,
1706 atomic_read(&con->nref));
1707
1708more:
1709 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1710
1711 /* open the socket first? */
1712 if (con->sock == NULL) {
1713 /*
1714 * if we were STANDBY and are reconnecting _this_
1715 * connection, bump connect_seq now. Always bump
1716 * global_seq.
1717 */
1718 if (test_and_clear_bit(STANDBY, &con->state))
1719 con->connect_seq++;
1720
1721 prepare_write_banner(msgr, con);
1722 prepare_write_connect(msgr, con, 1);
1723 prepare_read_banner(con);
1724 set_bit(CONNECTING, &con->state);
1725 clear_bit(NEGOTIATING, &con->state);
1726
1727 BUG_ON(con->in_msg);
1728 con->in_tag = CEPH_MSGR_TAG_READY;
1729 dout("try_write initiating connect on %p new state %lu\n",
1730 con, con->state);
1731 con->sock = ceph_tcp_connect(con);
1732 if (IS_ERR(con->sock)) {
1733 con->sock = NULL;
1734 con->error_msg = "connect error";
1735 ret = -1;
1736 goto out;
1737 }
1738 }
1739
1740more_kvec:
1741 /* kvec data queued? */
1742 if (con->out_skip) {
1743 ret = write_partial_skip(con);
1744 if (ret <= 0)
1745 goto done;
1746 if (ret < 0) {
1747 dout("try_write write_partial_skip err %d\n", ret);
1748 goto done;
1749 }
1750 }
1751 if (con->out_kvec_left) {
1752 ret = write_partial_kvec(con);
1753 if (ret <= 0)
1754 goto done;
1755 }
1756
1757 /* msg pages? */
1758 if (con->out_msg) {
1759 if (con->out_msg_done) {
1760 ceph_msg_put(con->out_msg);
1761 con->out_msg = NULL; /* we're done with this one */
1762 goto do_next;
1763 }
1764
1765 ret = write_partial_msg_pages(con);
1766 if (ret == 1)
1767 goto more_kvec; /* we need to send the footer, too! */
1768 if (ret == 0)
1769 goto done;
1770 if (ret < 0) {
1771 dout("try_write write_partial_msg_pages err %d\n",
1772 ret);
1773 goto done;
1774 }
1775 }
1776
1777do_next:
1778 if (!test_bit(CONNECTING, &con->state)) {
1779 /* is anything else pending? */
1780 if (!list_empty(&con->out_queue)) {
1781 prepare_write_message(con);
1782 goto more;
1783 }
1784 if (con->in_seq > con->in_seq_acked) {
1785 prepare_write_ack(con);
1786 goto more;
1787 }
1788 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1789 prepare_write_keepalive(con);
1790 goto more;
1791 }
1792 }
1793
1794 /* Nothing to do! */
1795 clear_bit(WRITE_PENDING, &con->state);
1796 dout("try_write nothing else to write.\n");
1797done:
1798 ret = 0;
1799out:
1800 dout("try_write done on %p\n", con);
1801 return ret;
1802}
1803
1804
1805
1806/*
1807 * Read what we can from the socket.
1808 */
1809static int try_read(struct ceph_connection *con)
1810{
1811 int ret = -1;
1812
1813 if (!con->sock)
1814 return 0;
1815
1816 if (test_bit(STANDBY, &con->state))
1817 return 0;
1818
1819 dout("try_read start on %p\n", con);
1820
1821more:
1822 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1823 con->in_base_pos);
1824 if (test_bit(CONNECTING, &con->state)) {
1825 if (!test_bit(NEGOTIATING, &con->state)) {
1826 dout("try_read connecting\n");
1827 ret = read_partial_banner(con);
1828 if (ret <= 0)
1829 goto done;
1830 if (process_banner(con) < 0) {
1831 ret = -1;
1832 goto out;
1833 }
1834 }
1835 ret = read_partial_connect(con);
1836 if (ret <= 0)
1837 goto done;
1838 if (process_connect(con) < 0) {
1839 ret = -1;
1840 goto out;
1841 }
1842 goto more;
1843 }
1844
1845 if (con->in_base_pos < 0) {
1846 /*
1847 * skipping + discarding content.
1848 *
1849 * FIXME: there must be a better way to do this!
1850 */
1851 static char buf[1024];
1852 int skip = min(1024, -con->in_base_pos);
1853 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1854 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1855 if (ret <= 0)
1856 goto done;
1857 con->in_base_pos += ret;
1858 if (con->in_base_pos)
1859 goto more;
1860 }
1861 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1862 /*
1863 * what's next?
1864 */
1865 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1866 if (ret <= 0)
1867 goto done;
1868 dout("try_read got tag %d\n", (int)con->in_tag);
1869 switch (con->in_tag) {
1870 case CEPH_MSGR_TAG_MSG:
1871 prepare_read_message(con);
1872 break;
1873 case CEPH_MSGR_TAG_ACK:
1874 prepare_read_ack(con);
1875 break;
1876 case CEPH_MSGR_TAG_CLOSE:
1877 set_bit(CLOSED, &con->state); /* fixme */
1878 goto done;
1879 default:
1880 goto bad_tag;
1881 }
1882 }
1883 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1884 ret = read_partial_message(con);
1885 if (ret <= 0) {
1886 switch (ret) {
1887 case -EBADMSG:
1888 con->error_msg = "bad crc";
1889 ret = -EIO;
1890 goto out;
1891 case -EIO:
1892 con->error_msg = "io error";
1893 goto out;
1894 default:
1895 goto done;
1896 }
1897 }
1898 if (con->in_tag == CEPH_MSGR_TAG_READY)
1899 goto more;
1900 process_message(con);
1901 goto more;
1902 }
1903 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1904 ret = read_partial_ack(con);
1905 if (ret <= 0)
1906 goto done;
1907 process_ack(con);
1908 goto more;
1909 }
1910
1911done:
1912 ret = 0;
1913out:
1914 dout("try_read done on %p\n", con);
1915 return ret;
1916
1917bad_tag:
1918 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1919 con->error_msg = "protocol error, garbage tag";
1920 ret = -1;
1921 goto out;
1922}
1923
1924
1925/*
1926 * Atomically queue work on a connection. Bump @con reference to
1927 * avoid races with connection teardown.
1928 *
1929 * There is some trickery going on with QUEUED and BUSY because we
1930 * only want a _single_ thread operating on each connection at any
1931 * point in time, but we want to use all available CPUs.
1932 *
1933 * The worker thread only proceeds if it can atomically set BUSY. It
1934 * clears QUEUED and does its thing. When it thinks it's done, it
1935 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1936 * (tries again to set BUSY).
1937 *
1938 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1939 * try to queue work. If that fails (work is already queued, or BUSY)
1940 * we give up (work also already being done or is queued) but leave QUEUED
1941 * set so that the worker thread will loop if necessary.
1942 */
1943static void queue_con(struct ceph_connection *con)
1944{
1945 if (test_bit(DEAD, &con->state)) {
1946 dout("queue_con %p ignoring: DEAD\n",
1947 con);
1948 return;
1949 }
1950
1951 if (!con->ops->get(con)) {
1952 dout("queue_con %p ref count 0\n", con);
1953 return;
1954 }
1955
1956 set_bit(QUEUED, &con->state);
1957 if (test_bit(BUSY, &con->state)) {
1958 dout("queue_con %p - already BUSY\n", con);
1959 con->ops->put(con);
1960 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1961 dout("queue_con %p - already queued\n", con);
1962 con->ops->put(con);
1963 } else {
1964 dout("queue_con %p\n", con);
1965 }
1966}
1967
1968/*
1969 * Do some work on a connection. Drop a connection ref when we're done.
1970 */
1971static void con_work(struct work_struct *work)
1972{
1973 struct ceph_connection *con = container_of(work, struct ceph_connection,
1974 work.work);
1975 int backoff = 0;
1976
1977more:
1978 if (test_and_set_bit(BUSY, &con->state) != 0) {
1979 dout("con_work %p BUSY already set\n", con);
1980 goto out;
1981 }
1982 dout("con_work %p start, clearing QUEUED\n", con);
1983 clear_bit(QUEUED, &con->state);
1984
1985 mutex_lock(&con->mutex);
1986
1987 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1988 dout("con_work CLOSED\n");
1989 con_close_socket(con);
1990 goto done;
1991 }
1992 if (test_and_clear_bit(OPENING, &con->state)) {
1993 /* reopen w/ new peer */
1994 dout("con_work OPENING\n");
1995 con_close_socket(con);
1996 }
1997
1998 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1999 try_read(con) < 0 ||
2000 try_write(con) < 0) {
2001 mutex_unlock(&con->mutex);
2002 backoff = 1;
2003 ceph_fault(con); /* error/fault path */
2004 goto done_unlocked;
2005 }
2006
2007done:
2008 mutex_unlock(&con->mutex);
2009
2010done_unlocked:
2011 clear_bit(BUSY, &con->state);
2012 dout("con->state=%lu\n", con->state);
2013 if (test_bit(QUEUED, &con->state)) {
2014 if (!backoff || test_bit(OPENING, &con->state)) {
2015 dout("con_work %p QUEUED reset, looping\n", con);
2016 goto more;
2017 }
2018 dout("con_work %p QUEUED reset, but just faulted\n", con);
2019 clear_bit(QUEUED, &con->state);
2020 }
2021 dout("con_work %p done\n", con);
2022
2023out:
2024 con->ops->put(con);
2025}
2026
2027
2028/*
2029 * Generic error/fault handler. A retry mechanism is used with
2030 * exponential backoff
2031 */
2032static void ceph_fault(struct ceph_connection *con)
2033{
2034 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
2035 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
2036 dout("fault %p state %lu to peer %s\n",
2037 con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
2038
2039 if (test_bit(LOSSYTX, &con->state)) {
2040 dout("fault on LOSSYTX channel\n");
2041 goto out;
2042 }
2043
2044 mutex_lock(&con->mutex);
2045 if (test_bit(CLOSED, &con->state))
2046 goto out_unlock;
2047
2048 con_close_socket(con);
2049
2050 if (con->in_msg) {
2051 ceph_msg_put(con->in_msg);
2052 con->in_msg = NULL;
2053 }
2054
2055 /* Requeue anything that hasn't been acked */
2056 list_splice_init(&con->out_sent, &con->out_queue);
2057
2058 /* If there are no messages in the queue, place the connection
2059 * in a STANDBY state (i.e., don't try to reconnect just yet). */
2060 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
2061 dout("fault setting STANDBY\n");
2062 set_bit(STANDBY, &con->state);
2063 } else {
2064 /* retry after a delay. */
2065 if (con->delay == 0)
2066 con->delay = BASE_DELAY_INTERVAL;
2067 else if (con->delay < MAX_DELAY_INTERVAL)
2068 con->delay *= 2;
2069 dout("fault queueing %p delay %lu\n", con, con->delay);
2070 con->ops->get(con);
2071 if (queue_delayed_work(ceph_msgr_wq, &con->work,
2072 round_jiffies_relative(con->delay)) == 0)
2073 con->ops->put(con);
2074 }
2075
2076out_unlock:
2077 mutex_unlock(&con->mutex);
2078out:
2079 /*
2080 * in case we faulted due to authentication, invalidate our
2081 * current tickets so that we can get new ones.
2082 */
2083 if (con->auth_retry && con->ops->invalidate_authorizer) {
2084 dout("calling invalidate_authorizer()\n");
2085 con->ops->invalidate_authorizer(con);
2086 }
2087
2088 if (con->ops->fault)
2089 con->ops->fault(con);
2090}
2091
2092
2093
2094/*
2095 * create a new messenger instance
2096 */
2097struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
2098 u32 supported_features,
2099 u32 required_features)
2100{
2101 struct ceph_messenger *msgr;
2102
2103 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
2104 if (msgr == NULL)
2105 return ERR_PTR(-ENOMEM);
2106
2107 msgr->supported_features = supported_features;
2108 msgr->required_features = required_features;
2109
2110 spin_lock_init(&msgr->global_seq_lock);
2111
2112 /* the zero page is needed if a request is "canceled" while the message
2113 * is being written over the socket */
2114 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
2115 if (!msgr->zero_page) {
2116 kfree(msgr);
2117 return ERR_PTR(-ENOMEM);
2118 }
2119 kmap(msgr->zero_page);
2120
2121 if (myaddr)
2122 msgr->inst.addr = *myaddr;
2123
2124 /* select a random nonce */
2125 msgr->inst.addr.type = 0;
2126 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
2127 encode_my_addr(msgr);
2128
2129 dout("messenger_create %p\n", msgr);
2130 return msgr;
2131}
2132EXPORT_SYMBOL(ceph_messenger_create);
2133
2134void ceph_messenger_destroy(struct ceph_messenger *msgr)
2135{
2136 dout("destroy %p\n", msgr);
2137 kunmap(msgr->zero_page);
2138 __free_page(msgr->zero_page);
2139 kfree(msgr);
2140 dout("destroyed messenger %p\n", msgr);
2141}
2142EXPORT_SYMBOL(ceph_messenger_destroy);
2143
2144/*
2145 * Queue up an outgoing message on the given connection.
2146 */
2147void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
2148{
2149 if (test_bit(CLOSED, &con->state)) {
2150 dout("con_send %p closed, dropping %p\n", con, msg);
2151 ceph_msg_put(msg);
2152 return;
2153 }
2154
2155 /* set src+dst */
2156 msg->hdr.src = con->msgr->inst.name;
2157
2158 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
2159
2160 msg->needs_out_seq = true;
2161
2162 /* queue */
2163 mutex_lock(&con->mutex);
2164 BUG_ON(!list_empty(&msg->list_head));
2165 list_add_tail(&msg->list_head, &con->out_queue);
2166 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
2167 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
2168 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
2169 le32_to_cpu(msg->hdr.front_len),
2170 le32_to_cpu(msg->hdr.middle_len),
2171 le32_to_cpu(msg->hdr.data_len));
2172 mutex_unlock(&con->mutex);
2173
2174 /* if there wasn't anything waiting to send before, queue
2175 * new work */
2176 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2177 queue_con(con);
2178}
2179EXPORT_SYMBOL(ceph_con_send);
2180
2181/*
2182 * Revoke a message that was previously queued for send
2183 */
2184void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2185{
2186 mutex_lock(&con->mutex);
2187 if (!list_empty(&msg->list_head)) {
2188 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2189 list_del_init(&msg->list_head);
2190 ceph_msg_put(msg);
2191 msg->hdr.seq = 0;
2192 }
2193 if (con->out_msg == msg) {
2194 dout("con_revoke %p msg %p - was sending\n", con, msg);
2195 con->out_msg = NULL;
2196 if (con->out_kvec_is_msg) {
2197 con->out_skip = con->out_kvec_bytes;
2198 con->out_kvec_is_msg = false;
2199 }
2200 ceph_msg_put(msg);
2201 msg->hdr.seq = 0;
2202 }
2203 mutex_unlock(&con->mutex);
2204}
2205
2206/*
2207 * Revoke a message that we may be reading data into
2208 */
2209void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2210{
2211 mutex_lock(&con->mutex);
2212 if (con->in_msg && con->in_msg == msg) {
2213 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2214 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2215 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2216
2217 /* skip rest of message */
2218 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2219 con->in_base_pos = con->in_base_pos -
2220 sizeof(struct ceph_msg_header) -
2221 front_len -
2222 middle_len -
2223 data_len -
2224 sizeof(struct ceph_msg_footer);
2225 ceph_msg_put(con->in_msg);
2226 con->in_msg = NULL;
2227 con->in_tag = CEPH_MSGR_TAG_READY;
2228 con->in_seq++;
2229 } else {
2230 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2231 con, con->in_msg, msg);
2232 }
2233 mutex_unlock(&con->mutex);
2234}
2235
2236/*
2237 * Queue a keepalive byte to ensure the tcp connection is alive.
2238 */
2239void ceph_con_keepalive(struct ceph_connection *con)
2240{
2241 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2242 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2243 queue_con(con);
2244}
2245EXPORT_SYMBOL(ceph_con_keepalive);
2246
2247
2248/*
2249 * construct a new message with given type, size
2250 * the new msg has a ref count of 1.
2251 */
2252struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2253{
2254 struct ceph_msg *m;
2255
2256 m = kmalloc(sizeof(*m), flags);
2257 if (m == NULL)
2258 goto out;
2259 kref_init(&m->kref);
2260 INIT_LIST_HEAD(&m->list_head);
2261
2262 m->hdr.tid = 0;
2263 m->hdr.type = cpu_to_le16(type);
2264 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2265 m->hdr.version = 0;
2266 m->hdr.front_len = cpu_to_le32(front_len);
2267 m->hdr.middle_len = 0;
2268 m->hdr.data_len = 0;
2269 m->hdr.data_off = 0;
2270 m->hdr.reserved = 0;
2271 m->footer.front_crc = 0;
2272 m->footer.middle_crc = 0;
2273 m->footer.data_crc = 0;
2274 m->footer.flags = 0;
2275 m->front_max = front_len;
2276 m->front_is_vmalloc = false;
2277 m->more_to_follow = false;
2278 m->pool = NULL;
2279
2280 /* front */
2281 if (front_len) {
2282 if (front_len > PAGE_CACHE_SIZE) {
2283 m->front.iov_base = __vmalloc(front_len, flags,
2284 PAGE_KERNEL);
2285 m->front_is_vmalloc = true;
2286 } else {
2287 m->front.iov_base = kmalloc(front_len, flags);
2288 }
2289 if (m->front.iov_base == NULL) {
2290 pr_err("msg_new can't allocate %d bytes\n",
2291 front_len);
2292 goto out2;
2293 }
2294 } else {
2295 m->front.iov_base = NULL;
2296 }
2297 m->front.iov_len = front_len;
2298
2299 /* middle */
2300 m->middle = NULL;
2301
2302 /* data */
2303 m->nr_pages = 0;
2304 m->pages = NULL;
2305 m->pagelist = NULL;
2306 m->bio = NULL;
2307 m->bio_iter = NULL;
2308 m->bio_seg = 0;
2309 m->trail = NULL;
2310
2311 dout("ceph_msg_new %p front %d\n", m, front_len);
2312 return m;
2313
2314out2:
2315 ceph_msg_put(m);
2316out:
2317 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2318 return NULL;
2319}
2320EXPORT_SYMBOL(ceph_msg_new);
2321
2322/*
2323 * Allocate "middle" portion of a message, if it is needed and wasn't
2324 * allocated by alloc_msg. This allows us to read a small fixed-size
2325 * per-type header in the front and then gracefully fail (i.e.,
2326 * propagate the error to the caller based on info in the front) when
2327 * the middle is too large.
2328 */
2329static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2330{
2331 int type = le16_to_cpu(msg->hdr.type);
2332 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2333
2334 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2335 ceph_msg_type_name(type), middle_len);
2336 BUG_ON(!middle_len);
2337 BUG_ON(msg->middle);
2338
2339 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2340 if (!msg->middle)
2341 return -ENOMEM;
2342 return 0;
2343}
2344
2345/*
2346 * Generic message allocator, for incoming messages.
2347 */
2348static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2349 struct ceph_msg_header *hdr,
2350 int *skip)
2351{
2352 int type = le16_to_cpu(hdr->type);
2353 int front_len = le32_to_cpu(hdr->front_len);
2354 int middle_len = le32_to_cpu(hdr->middle_len);
2355 struct ceph_msg *msg = NULL;
2356 int ret;
2357
2358 if (con->ops->alloc_msg) {
2359 mutex_unlock(&con->mutex);
2360 msg = con->ops->alloc_msg(con, hdr, skip);
2361 mutex_lock(&con->mutex);
2362 if (!msg || *skip)
2363 return NULL;
2364 }
2365 if (!msg) {
2366 *skip = 0;
2367 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2368 if (!msg) {
2369 pr_err("unable to allocate msg type %d len %d\n",
2370 type, front_len);
2371 return NULL;
2372 }
2373 }
2374 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2375
2376 if (middle_len && !msg->middle) {
2377 ret = ceph_alloc_middle(con, msg);
2378 if (ret < 0) {
2379 ceph_msg_put(msg);
2380 return NULL;
2381 }
2382 }
2383
2384 return msg;
2385}
2386
2387
2388/*
2389 * Free a generically kmalloc'd message.
2390 */
2391void ceph_msg_kfree(struct ceph_msg *m)
2392{
2393 dout("msg_kfree %p\n", m);
2394 if (m->front_is_vmalloc)
2395 vfree(m->front.iov_base);
2396 else
2397 kfree(m->front.iov_base);
2398 kfree(m);
2399}
2400
2401/*
2402 * Drop a msg ref. Destroy as needed.
2403 */
2404void ceph_msg_last_put(struct kref *kref)
2405{
2406 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2407
2408 dout("ceph_msg_put last one on %p\n", m);
2409 WARN_ON(!list_empty(&m->list_head));
2410
2411 /* drop middle, data, if any */
2412 if (m->middle) {
2413 ceph_buffer_put(m->middle);
2414 m->middle = NULL;
2415 }
2416 m->nr_pages = 0;
2417 m->pages = NULL;
2418
2419 if (m->pagelist) {
2420 ceph_pagelist_release(m->pagelist);
2421 kfree(m->pagelist);
2422 m->pagelist = NULL;
2423 }
2424
2425 m->trail = NULL;
2426
2427 if (m->pool)
2428 ceph_msgpool_put(m->pool, m);
2429 else
2430 ceph_msg_kfree(m);
2431}
2432EXPORT_SYMBOL(ceph_msg_last_put);
2433
2434void ceph_msg_dump(struct ceph_msg *msg)
2435{
2436 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2437 msg->front_max, msg->nr_pages);
2438 print_hex_dump(KERN_DEBUG, "header: ",
2439 DUMP_PREFIX_OFFSET, 16, 1,
2440 &msg->hdr, sizeof(msg->hdr), true);
2441 print_hex_dump(KERN_DEBUG, " front: ",
2442 DUMP_PREFIX_OFFSET, 16, 1,
2443 msg->front.iov_base, msg->front.iov_len, true);
2444 if (msg->middle)
2445 print_hex_dump(KERN_DEBUG, "middle: ",
2446 DUMP_PREFIX_OFFSET, 16, 1,
2447 msg->middle->vec.iov_base,
2448 msg->middle->vec.iov_len, true);
2449 print_hex_dump(KERN_DEBUG, "footer: ",
2450 DUMP_PREFIX_OFFSET, 16, 1,
2451 &msg->footer, sizeof(msg->footer), true);
2452}
2453EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
new file mode 100644
index 000000000000..8a079399174a
--- /dev/null
+++ b/net/ceph/mon_client.c
@@ -0,0 +1,1027 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/types.h>
5#include <linux/slab.h>
6#include <linux/random.h>
7#include <linux/sched.h>
8
9#include <linux/ceph/mon_client.h>
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/decode.h>
12
13#include <linux/ceph/auth.h>
14
15/*
16 * Interact with Ceph monitor cluster. Handle requests for new map
17 * versions, and periodically resend as needed. Also implement
18 * statfs() and umount().
19 *
20 * A small cluster of Ceph "monitors" is responsible for managing critical
21 * cluster configuration and state information. An odd number (e.g., 3, 5)
22 * of cmon daemons use a modified version of the Paxos part-time parliament
23 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
24 * list of clients who have mounted the file system.
25 *
26 * We maintain an open, active session with a monitor at all times in order to
27 * receive timely MDSMap updates. We periodically send a keepalive byte on the
28 * TCP socket to ensure we detect a failure. If the connection does break, we
29 * randomly hunt for a new monitor. Once the connection is reestablished, we
30 * resend any outstanding requests.
31 */
32
33static const struct ceph_connection_operations mon_con_ops;
34
35static int __validate_auth(struct ceph_mon_client *monc);
36
37/*
38 * Decode a monmap blob (e.g., during mount).
39 */
40struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
41{
42 struct ceph_monmap *m = NULL;
43 int i, err = -EINVAL;
44 struct ceph_fsid fsid;
45 u32 epoch, num_mon;
46 u16 version;
47 u32 len;
48
49 ceph_decode_32_safe(&p, end, len, bad);
50 ceph_decode_need(&p, end, len, bad);
51
52 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
53
54 ceph_decode_16_safe(&p, end, version, bad);
55
56 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
57 ceph_decode_copy(&p, &fsid, sizeof(fsid));
58 epoch = ceph_decode_32(&p);
59
60 num_mon = ceph_decode_32(&p);
61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
62
63 if (num_mon >= CEPH_MAX_MON)
64 goto bad;
65 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
66 if (m == NULL)
67 return ERR_PTR(-ENOMEM);
68 m->fsid = fsid;
69 m->epoch = epoch;
70 m->num_mon = num_mon;
71 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
72 for (i = 0; i < num_mon; i++)
73 ceph_decode_addr(&m->mon_inst[i].addr);
74
75 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
76 m->num_mon);
77 for (i = 0; i < m->num_mon; i++)
78 dout("monmap_decode mon%d is %s\n", i,
79 ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
80 return m;
81
82bad:
83 dout("monmap_decode failed with %d\n", err);
84 kfree(m);
85 return ERR_PTR(err);
86}
87
88/*
89 * return true if *addr is included in the monmap.
90 */
91int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
92{
93 int i;
94
95 for (i = 0; i < m->num_mon; i++)
96 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
97 return 1;
98 return 0;
99}
100
101/*
102 * Send an auth request.
103 */
104static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
105{
106 monc->pending_auth = 1;
107 monc->m_auth->front.iov_len = len;
108 monc->m_auth->hdr.front_len = cpu_to_le32(len);
109 ceph_con_revoke(monc->con, monc->m_auth);
110 ceph_msg_get(monc->m_auth); /* keep our ref */
111 ceph_con_send(monc->con, monc->m_auth);
112}
113
114/*
115 * Close monitor session, if any.
116 */
117static void __close_session(struct ceph_mon_client *monc)
118{
119 if (monc->con) {
120 dout("__close_session closing mon%d\n", monc->cur_mon);
121 ceph_con_revoke(monc->con, monc->m_auth);
122 ceph_con_close(monc->con);
123 monc->cur_mon = -1;
124 monc->pending_auth = 0;
125 ceph_auth_reset(monc->auth);
126 }
127}
128
129/*
130 * Open a session with a (new) monitor.
131 */
132static int __open_session(struct ceph_mon_client *monc)
133{
134 char r;
135 int ret;
136
137 if (monc->cur_mon < 0) {
138 get_random_bytes(&r, 1);
139 monc->cur_mon = r % monc->monmap->num_mon;
140 dout("open_session num=%d r=%d -> mon%d\n",
141 monc->monmap->num_mon, r, monc->cur_mon);
142 monc->sub_sent = 0;
143 monc->sub_renew_after = jiffies; /* i.e., expired */
144 monc->want_next_osdmap = !!monc->want_next_osdmap;
145
146 dout("open_session mon%d opening\n", monc->cur_mon);
147 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
148 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
149 ceph_con_open(monc->con,
150 &monc->monmap->mon_inst[monc->cur_mon].addr);
151
152 /* initiatiate authentication handshake */
153 ret = ceph_auth_build_hello(monc->auth,
154 monc->m_auth->front.iov_base,
155 monc->m_auth->front_max);
156 __send_prepared_auth_request(monc, ret);
157 } else {
158 dout("open_session mon%d already open\n", monc->cur_mon);
159 }
160 return 0;
161}
162
163static bool __sub_expired(struct ceph_mon_client *monc)
164{
165 return time_after_eq(jiffies, monc->sub_renew_after);
166}
167
168/*
169 * Reschedule delayed work timer.
170 */
171static void __schedule_delayed(struct ceph_mon_client *monc)
172{
173 unsigned delay;
174
175 if (monc->cur_mon < 0 || __sub_expired(monc))
176 delay = 10 * HZ;
177 else
178 delay = 20 * HZ;
179 dout("__schedule_delayed after %u\n", delay);
180 schedule_delayed_work(&monc->delayed_work, delay);
181}
182
183/*
184 * Send subscribe request for mdsmap and/or osdmap.
185 */
186static void __send_subscribe(struct ceph_mon_client *monc)
187{
188 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
189 (unsigned)monc->sub_sent, __sub_expired(monc),
190 monc->want_next_osdmap);
191 if ((__sub_expired(monc) && !monc->sub_sent) ||
192 monc->want_next_osdmap == 1) {
193 struct ceph_msg *msg = monc->m_subscribe;
194 struct ceph_mon_subscribe_item *i;
195 void *p, *end;
196 int num;
197
198 p = msg->front.iov_base;
199 end = p + msg->front_max;
200
201 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
202 ceph_encode_32(&p, num);
203
204 if (monc->want_next_osdmap) {
205 dout("__send_subscribe to 'osdmap' %u\n",
206 (unsigned)monc->have_osdmap);
207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1;
211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */
213 }
214 if (monc->want_mdsmap) {
215 dout("__send_subscribe to 'mdsmap' %u+\n",
216 (unsigned)monc->have_mdsmap);
217 ceph_encode_string(&p, end, "mdsmap", 6);
218 i = p;
219 i->have = cpu_to_le64(monc->have_mdsmap);
220 i->onetime = 0;
221 p += sizeof(*i);
222 }
223 ceph_encode_string(&p, end, "monmap", 6);
224 i = p;
225 i->have = 0;
226 i->onetime = 0;
227 p += sizeof(*i);
228
229 msg->front.iov_len = p - msg->front.iov_base;
230 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
231 ceph_con_revoke(monc->con, msg);
232 ceph_con_send(monc->con, ceph_msg_get(msg));
233
234 monc->sub_sent = jiffies | 1; /* never 0 */
235 }
236}
237
238static void handle_subscribe_ack(struct ceph_mon_client *monc,
239 struct ceph_msg *msg)
240{
241 unsigned seconds;
242 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
243
244 if (msg->front.iov_len < sizeof(*h))
245 goto bad;
246 seconds = le32_to_cpu(h->duration);
247
248 mutex_lock(&monc->mutex);
249 if (monc->hunting) {
250 pr_info("mon%d %s session established\n",
251 monc->cur_mon,
252 ceph_pr_addr(&monc->con->peer_addr.in_addr));
253 monc->hunting = false;
254 }
255 dout("handle_subscribe_ack after %d seconds\n", seconds);
256 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
257 monc->sub_sent = 0;
258 mutex_unlock(&monc->mutex);
259 return;
260bad:
261 pr_err("got corrupt subscribe-ack msg\n");
262 ceph_msg_dump(msg);
263}
264
265/*
266 * Keep track of which maps we have
267 */
268int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
269{
270 mutex_lock(&monc->mutex);
271 monc->have_mdsmap = got;
272 mutex_unlock(&monc->mutex);
273 return 0;
274}
275EXPORT_SYMBOL(ceph_monc_got_mdsmap);
276
277int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
278{
279 mutex_lock(&monc->mutex);
280 monc->have_osdmap = got;
281 monc->want_next_osdmap = 0;
282 mutex_unlock(&monc->mutex);
283 return 0;
284}
285
286/*
287 * Register interest in the next osdmap
288 */
289void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
290{
291 dout("request_next_osdmap have %u\n", monc->have_osdmap);
292 mutex_lock(&monc->mutex);
293 if (!monc->want_next_osdmap)
294 monc->want_next_osdmap = 1;
295 if (monc->want_next_osdmap < 2)
296 __send_subscribe(monc);
297 mutex_unlock(&monc->mutex);
298}
299
300/*
301 *
302 */
303int ceph_monc_open_session(struct ceph_mon_client *monc)
304{
305 if (!monc->con) {
306 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
307 if (!monc->con)
308 return -ENOMEM;
309 ceph_con_init(monc->client->msgr, monc->con);
310 monc->con->private = monc;
311 monc->con->ops = &mon_con_ops;
312 }
313
314 mutex_lock(&monc->mutex);
315 __open_session(monc);
316 __schedule_delayed(monc);
317 mutex_unlock(&monc->mutex);
318 return 0;
319}
320EXPORT_SYMBOL(ceph_monc_open_session);
321
322/*
323 * The monitor responds with mount ack indicate mount success. The
324 * included client ticket allows the client to talk to MDSs and OSDs.
325 */
326static void ceph_monc_handle_map(struct ceph_mon_client *monc,
327 struct ceph_msg *msg)
328{
329 struct ceph_client *client = monc->client;
330 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
331 void *p, *end;
332
333 mutex_lock(&monc->mutex);
334
335 dout("handle_monmap\n");
336 p = msg->front.iov_base;
337 end = p + msg->front.iov_len;
338
339 monmap = ceph_monmap_decode(p, end);
340 if (IS_ERR(monmap)) {
341 pr_err("problem decoding monmap, %d\n",
342 (int)PTR_ERR(monmap));
343 goto out;
344 }
345
346 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
347 kfree(monmap);
348 goto out;
349 }
350
351 client->monc.monmap = monmap;
352 kfree(old);
353
354out:
355 mutex_unlock(&monc->mutex);
356 wake_up_all(&client->auth_wq);
357}
358
359/*
360 * generic requests (e.g., statfs, poolop)
361 */
362static struct ceph_mon_generic_request *__lookup_generic_req(
363 struct ceph_mon_client *monc, u64 tid)
364{
365 struct ceph_mon_generic_request *req;
366 struct rb_node *n = monc->generic_request_tree.rb_node;
367
368 while (n) {
369 req = rb_entry(n, struct ceph_mon_generic_request, node);
370 if (tid < req->tid)
371 n = n->rb_left;
372 else if (tid > req->tid)
373 n = n->rb_right;
374 else
375 return req;
376 }
377 return NULL;
378}
379
380static void __insert_generic_request(struct ceph_mon_client *monc,
381 struct ceph_mon_generic_request *new)
382{
383 struct rb_node **p = &monc->generic_request_tree.rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_mon_generic_request *req = NULL;
386
387 while (*p) {
388 parent = *p;
389 req = rb_entry(parent, struct ceph_mon_generic_request, node);
390 if (new->tid < req->tid)
391 p = &(*p)->rb_left;
392 else if (new->tid > req->tid)
393 p = &(*p)->rb_right;
394 else
395 BUG();
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, &monc->generic_request_tree);
400}
401
402static void release_generic_request(struct kref *kref)
403{
404 struct ceph_mon_generic_request *req =
405 container_of(kref, struct ceph_mon_generic_request, kref);
406
407 if (req->reply)
408 ceph_msg_put(req->reply);
409 if (req->request)
410 ceph_msg_put(req->request);
411
412 kfree(req);
413}
414
415static void put_generic_request(struct ceph_mon_generic_request *req)
416{
417 kref_put(&req->kref, release_generic_request);
418}
419
420static void get_generic_request(struct ceph_mon_generic_request *req)
421{
422 kref_get(&req->kref);
423}
424
425static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
426 struct ceph_msg_header *hdr,
427 int *skip)
428{
429 struct ceph_mon_client *monc = con->private;
430 struct ceph_mon_generic_request *req;
431 u64 tid = le64_to_cpu(hdr->tid);
432 struct ceph_msg *m;
433
434 mutex_lock(&monc->mutex);
435 req = __lookup_generic_req(monc, tid);
436 if (!req) {
437 dout("get_generic_reply %lld dne\n", tid);
438 *skip = 1;
439 m = NULL;
440 } else {
441 dout("get_generic_reply %lld got %p\n", tid, req->reply);
442 m = ceph_msg_get(req->reply);
443 /*
444 * we don't need to track the connection reading into
445 * this reply because we only have one open connection
446 * at a time, ever.
447 */
448 }
449 mutex_unlock(&monc->mutex);
450 return m;
451}
452
453static int do_generic_request(struct ceph_mon_client *monc,
454 struct ceph_mon_generic_request *req)
455{
456 int err;
457
458 /* register request */
459 mutex_lock(&monc->mutex);
460 req->tid = ++monc->last_tid;
461 req->request->hdr.tid = cpu_to_le64(req->tid);
462 __insert_generic_request(monc, req);
463 monc->num_generic_requests++;
464 ceph_con_send(monc->con, ceph_msg_get(req->request));
465 mutex_unlock(&monc->mutex);
466
467 err = wait_for_completion_interruptible(&req->completion);
468
469 mutex_lock(&monc->mutex);
470 rb_erase(&req->node, &monc->generic_request_tree);
471 monc->num_generic_requests--;
472 mutex_unlock(&monc->mutex);
473
474 if (!err)
475 err = req->result;
476 return err;
477}
478
479/*
480 * statfs
481 */
482static void handle_statfs_reply(struct ceph_mon_client *monc,
483 struct ceph_msg *msg)
484{
485 struct ceph_mon_generic_request *req;
486 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
487 u64 tid = le64_to_cpu(msg->hdr.tid);
488
489 if (msg->front.iov_len != sizeof(*reply))
490 goto bad;
491 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
492
493 mutex_lock(&monc->mutex);
494 req = __lookup_generic_req(monc, tid);
495 if (req) {
496 *(struct ceph_statfs *)req->buf = reply->st;
497 req->result = 0;
498 get_generic_request(req);
499 }
500 mutex_unlock(&monc->mutex);
501 if (req) {
502 complete_all(&req->completion);
503 put_generic_request(req);
504 }
505 return;
506
507bad:
508 pr_err("corrupt generic reply, tid %llu\n", tid);
509 ceph_msg_dump(msg);
510}
511
512/*
513 * Do a synchronous statfs().
514 */
515int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
516{
517 struct ceph_mon_generic_request *req;
518 struct ceph_mon_statfs *h;
519 int err;
520
521 req = kzalloc(sizeof(*req), GFP_NOFS);
522 if (!req)
523 return -ENOMEM;
524
525 kref_init(&req->kref);
526 req->buf = buf;
527 req->buf_len = sizeof(*buf);
528 init_completion(&req->completion);
529
530 err = -ENOMEM;
531 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
532 if (!req->request)
533 goto out;
534 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
535 if (!req->reply)
536 goto out;
537
538 /* fill out request */
539 h = req->request->front.iov_base;
540 h->monhdr.have_version = 0;
541 h->monhdr.session_mon = cpu_to_le16(-1);
542 h->monhdr.session_mon_tid = 0;
543 h->fsid = monc->monmap->fsid;
544
545 err = do_generic_request(monc, req);
546
547out:
548 kref_put(&req->kref, release_generic_request);
549 return err;
550}
551EXPORT_SYMBOL(ceph_monc_do_statfs);
552
553/*
554 * pool ops
555 */
556static int get_poolop_reply_buf(const char *src, size_t src_len,
557 char *dst, size_t dst_len)
558{
559 u32 buf_len;
560
561 if (src_len != sizeof(u32) + dst_len)
562 return -EINVAL;
563
564 buf_len = le32_to_cpu(*(u32 *)src);
565 if (buf_len != dst_len)
566 return -EINVAL;
567
568 memcpy(dst, src + sizeof(u32), dst_len);
569 return 0;
570}
571
572static void handle_poolop_reply(struct ceph_mon_client *monc,
573 struct ceph_msg *msg)
574{
575 struct ceph_mon_generic_request *req;
576 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
577 u64 tid = le64_to_cpu(msg->hdr.tid);
578
579 if (msg->front.iov_len < sizeof(*reply))
580 goto bad;
581 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
582
583 mutex_lock(&monc->mutex);
584 req = __lookup_generic_req(monc, tid);
585 if (req) {
586 if (req->buf_len &&
587 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
588 msg->front.iov_len - sizeof(*reply),
589 req->buf, req->buf_len) < 0) {
590 mutex_unlock(&monc->mutex);
591 goto bad;
592 }
593 req->result = le32_to_cpu(reply->reply_code);
594 get_generic_request(req);
595 }
596 mutex_unlock(&monc->mutex);
597 if (req) {
598 complete(&req->completion);
599 put_generic_request(req);
600 }
601 return;
602
603bad:
604 pr_err("corrupt generic reply, tid %llu\n", tid);
605 ceph_msg_dump(msg);
606}
607
608/*
609 * Do a synchronous pool op.
610 */
611int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
612 u32 pool, u64 snapid,
613 char *buf, int len)
614{
615 struct ceph_mon_generic_request *req;
616 struct ceph_mon_poolop *h;
617 int err;
618
619 req = kzalloc(sizeof(*req), GFP_NOFS);
620 if (!req)
621 return -ENOMEM;
622
623 kref_init(&req->kref);
624 req->buf = buf;
625 req->buf_len = len;
626 init_completion(&req->completion);
627
628 err = -ENOMEM;
629 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
630 if (!req->request)
631 goto out;
632 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
633 if (!req->reply)
634 goto out;
635
636 /* fill out request */
637 req->request->hdr.version = cpu_to_le16(2);
638 h = req->request->front.iov_base;
639 h->monhdr.have_version = 0;
640 h->monhdr.session_mon = cpu_to_le16(-1);
641 h->monhdr.session_mon_tid = 0;
642 h->fsid = monc->monmap->fsid;
643 h->pool = cpu_to_le32(pool);
644 h->op = cpu_to_le32(op);
645 h->auid = 0;
646 h->snapid = cpu_to_le64(snapid);
647 h->name_len = 0;
648
649 err = do_generic_request(monc, req);
650
651out:
652 kref_put(&req->kref, release_generic_request);
653 return err;
654}
655
656int ceph_monc_create_snapid(struct ceph_mon_client *monc,
657 u32 pool, u64 *snapid)
658{
659 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
660 pool, 0, (char *)snapid, sizeof(*snapid));
661
662}
663EXPORT_SYMBOL(ceph_monc_create_snapid);
664
665int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
666 u32 pool, u64 snapid)
667{
668 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
669 pool, snapid, 0, 0);
670
671}
672
673/*
674 * Resend pending generic requests.
675 */
676static void __resend_generic_request(struct ceph_mon_client *monc)
677{
678 struct ceph_mon_generic_request *req;
679 struct rb_node *p;
680
681 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
682 req = rb_entry(p, struct ceph_mon_generic_request, node);
683 ceph_con_revoke(monc->con, req->request);
684 ceph_con_send(monc->con, ceph_msg_get(req->request));
685 }
686}
687
688/*
689 * Delayed work. If we haven't mounted yet, retry. Otherwise,
690 * renew/retry subscription as needed (in case it is timing out, or we
691 * got an ENOMEM). And keep the monitor connection alive.
692 */
693static void delayed_work(struct work_struct *work)
694{
695 struct ceph_mon_client *monc =
696 container_of(work, struct ceph_mon_client, delayed_work.work);
697
698 dout("monc delayed_work\n");
699 mutex_lock(&monc->mutex);
700 if (monc->hunting) {
701 __close_session(monc);
702 __open_session(monc); /* continue hunting */
703 } else {
704 ceph_con_keepalive(monc->con);
705
706 __validate_auth(monc);
707
708 if (monc->auth->ops->is_authenticated(monc->auth))
709 __send_subscribe(monc);
710 }
711 __schedule_delayed(monc);
712 mutex_unlock(&monc->mutex);
713}
714
715/*
716 * On startup, we build a temporary monmap populated with the IPs
717 * provided by mount(2).
718 */
719static int build_initial_monmap(struct ceph_mon_client *monc)
720{
721 struct ceph_options *opt = monc->client->options;
722 struct ceph_entity_addr *mon_addr = opt->mon_addr;
723 int num_mon = opt->num_mon;
724 int i;
725
726 /* build initial monmap */
727 monc->monmap = kzalloc(sizeof(*monc->monmap) +
728 num_mon*sizeof(monc->monmap->mon_inst[0]),
729 GFP_KERNEL);
730 if (!monc->monmap)
731 return -ENOMEM;
732 for (i = 0; i < num_mon; i++) {
733 monc->monmap->mon_inst[i].addr = mon_addr[i];
734 monc->monmap->mon_inst[i].addr.nonce = 0;
735 monc->monmap->mon_inst[i].name.type =
736 CEPH_ENTITY_TYPE_MON;
737 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
738 }
739 monc->monmap->num_mon = num_mon;
740 monc->have_fsid = false;
741 return 0;
742}
743
744int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
745{
746 int err = 0;
747
748 dout("init\n");
749 memset(monc, 0, sizeof(*monc));
750 monc->client = cl;
751 monc->monmap = NULL;
752 mutex_init(&monc->mutex);
753
754 err = build_initial_monmap(monc);
755 if (err)
756 goto out;
757
758 monc->con = NULL;
759
760 /* authentication */
761 monc->auth = ceph_auth_init(cl->options->name,
762 cl->options->secret);
763 if (IS_ERR(monc->auth))
764 return PTR_ERR(monc->auth);
765 monc->auth->want_keys =
766 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
767 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
768
769 /* msgs */
770 err = -ENOMEM;
771 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
772 sizeof(struct ceph_mon_subscribe_ack),
773 GFP_NOFS);
774 if (!monc->m_subscribe_ack)
775 goto out_monmap;
776
777 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
778 if (!monc->m_subscribe)
779 goto out_subscribe_ack;
780
781 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
782 if (!monc->m_auth_reply)
783 goto out_subscribe;
784
785 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
786 monc->pending_auth = 0;
787 if (!monc->m_auth)
788 goto out_auth_reply;
789
790 monc->cur_mon = -1;
791 monc->hunting = true;
792 monc->sub_renew_after = jiffies;
793 monc->sub_sent = 0;
794
795 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
796 monc->generic_request_tree = RB_ROOT;
797 monc->num_generic_requests = 0;
798 monc->last_tid = 0;
799
800 monc->have_mdsmap = 0;
801 monc->have_osdmap = 0;
802 monc->want_next_osdmap = 1;
803 return 0;
804
805out_auth_reply:
806 ceph_msg_put(monc->m_auth_reply);
807out_subscribe:
808 ceph_msg_put(monc->m_subscribe);
809out_subscribe_ack:
810 ceph_msg_put(monc->m_subscribe_ack);
811out_monmap:
812 kfree(monc->monmap);
813out:
814 return err;
815}
816EXPORT_SYMBOL(ceph_monc_init);
817
818void ceph_monc_stop(struct ceph_mon_client *monc)
819{
820 dout("stop\n");
821 cancel_delayed_work_sync(&monc->delayed_work);
822
823 mutex_lock(&monc->mutex);
824 __close_session(monc);
825 if (monc->con) {
826 monc->con->private = NULL;
827 monc->con->ops->put(monc->con);
828 monc->con = NULL;
829 }
830 mutex_unlock(&monc->mutex);
831
832 ceph_auth_destroy(monc->auth);
833
834 ceph_msg_put(monc->m_auth);
835 ceph_msg_put(monc->m_auth_reply);
836 ceph_msg_put(monc->m_subscribe);
837 ceph_msg_put(monc->m_subscribe_ack);
838
839 kfree(monc->monmap);
840}
841EXPORT_SYMBOL(ceph_monc_stop);
842
843static void handle_auth_reply(struct ceph_mon_client *monc,
844 struct ceph_msg *msg)
845{
846 int ret;
847 int was_auth = 0;
848
849 mutex_lock(&monc->mutex);
850 if (monc->auth->ops)
851 was_auth = monc->auth->ops->is_authenticated(monc->auth);
852 monc->pending_auth = 0;
853 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
854 msg->front.iov_len,
855 monc->m_auth->front.iov_base,
856 monc->m_auth->front_max);
857 if (ret < 0) {
858 monc->client->auth_err = ret;
859 wake_up_all(&monc->client->auth_wq);
860 } else if (ret > 0) {
861 __send_prepared_auth_request(monc, ret);
862 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
863 dout("authenticated, starting session\n");
864
865 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
866 monc->client->msgr->inst.name.num =
867 cpu_to_le64(monc->auth->global_id);
868
869 __send_subscribe(monc);
870 __resend_generic_request(monc);
871 }
872 mutex_unlock(&monc->mutex);
873}
874
875static int __validate_auth(struct ceph_mon_client *monc)
876{
877 int ret;
878
879 if (monc->pending_auth)
880 return 0;
881
882 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
883 monc->m_auth->front_max);
884 if (ret <= 0)
885 return ret; /* either an error, or no need to authenticate */
886 __send_prepared_auth_request(monc, ret);
887 return 0;
888}
889
890int ceph_monc_validate_auth(struct ceph_mon_client *monc)
891{
892 int ret;
893
894 mutex_lock(&monc->mutex);
895 ret = __validate_auth(monc);
896 mutex_unlock(&monc->mutex);
897 return ret;
898}
899EXPORT_SYMBOL(ceph_monc_validate_auth);
900
901/*
902 * handle incoming message
903 */
904static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
905{
906 struct ceph_mon_client *monc = con->private;
907 int type = le16_to_cpu(msg->hdr.type);
908
909 if (!monc)
910 return;
911
912 switch (type) {
913 case CEPH_MSG_AUTH_REPLY:
914 handle_auth_reply(monc, msg);
915 break;
916
917 case CEPH_MSG_MON_SUBSCRIBE_ACK:
918 handle_subscribe_ack(monc, msg);
919 break;
920
921 case CEPH_MSG_STATFS_REPLY:
922 handle_statfs_reply(monc, msg);
923 break;
924
925 case CEPH_MSG_POOLOP_REPLY:
926 handle_poolop_reply(monc, msg);
927 break;
928
929 case CEPH_MSG_MON_MAP:
930 ceph_monc_handle_map(monc, msg);
931 break;
932
933 case CEPH_MSG_OSD_MAP:
934 ceph_osdc_handle_map(&monc->client->osdc, msg);
935 break;
936
937 default:
938 /* can the chained handler handle it? */
939 if (monc->client->extra_mon_dispatch &&
940 monc->client->extra_mon_dispatch(monc->client, msg) == 0)
941 break;
942
943 pr_err("received unknown message type %d %s\n", type,
944 ceph_msg_type_name(type));
945 }
946 ceph_msg_put(msg);
947}
948
949/*
950 * Allocate memory for incoming message
951 */
952static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
953 struct ceph_msg_header *hdr,
954 int *skip)
955{
956 struct ceph_mon_client *monc = con->private;
957 int type = le16_to_cpu(hdr->type);
958 int front_len = le32_to_cpu(hdr->front_len);
959 struct ceph_msg *m = NULL;
960
961 *skip = 0;
962
963 switch (type) {
964 case CEPH_MSG_MON_SUBSCRIBE_ACK:
965 m = ceph_msg_get(monc->m_subscribe_ack);
966 break;
967 case CEPH_MSG_POOLOP_REPLY:
968 case CEPH_MSG_STATFS_REPLY:
969 return get_generic_reply(con, hdr, skip);
970 case CEPH_MSG_AUTH_REPLY:
971 m = ceph_msg_get(monc->m_auth_reply);
972 break;
973 case CEPH_MSG_MON_MAP:
974 case CEPH_MSG_MDS_MAP:
975 case CEPH_MSG_OSD_MAP:
976 m = ceph_msg_new(type, front_len, GFP_NOFS);
977 break;
978 }
979
980 if (!m) {
981 pr_info("alloc_msg unknown type %d\n", type);
982 *skip = 1;
983 }
984 return m;
985}
986
987/*
988 * If the monitor connection resets, pick a new monitor and resubmit
989 * any pending requests.
990 */
991static void mon_fault(struct ceph_connection *con)
992{
993 struct ceph_mon_client *monc = con->private;
994
995 if (!monc)
996 return;
997
998 dout("mon_fault\n");
999 mutex_lock(&monc->mutex);
1000 if (!con->private)
1001 goto out;
1002
1003 if (monc->con && !monc->hunting)
1004 pr_info("mon%d %s session lost, "
1005 "hunting for new mon\n", monc->cur_mon,
1006 ceph_pr_addr(&monc->con->peer_addr.in_addr));
1007
1008 __close_session(monc);
1009 if (!monc->hunting) {
1010 /* start hunting */
1011 monc->hunting = true;
1012 __open_session(monc);
1013 } else {
1014 /* already hunting, let's wait a bit */
1015 __schedule_delayed(monc);
1016 }
1017out:
1018 mutex_unlock(&monc->mutex);
1019}
1020
1021static const struct ceph_connection_operations mon_con_ops = {
1022 .get = ceph_con_get,
1023 .put = ceph_con_put,
1024 .dispatch = dispatch,
1025 .fault = mon_fault,
1026 .alloc_msg = mon_alloc_msg,
1027};
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
new file mode 100644
index 000000000000..d5f2d97ac05c
--- /dev/null
+++ b/net/ceph/msgpool.c
@@ -0,0 +1,64 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include <linux/ceph/msgpool.h>
9
10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11{
12 struct ceph_msgpool *pool = arg;
13 void *p;
14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
20
/* mempool free callback: drop a reference on the message; @arg is unused. */
static void free_fn(void *element, void *arg)
{
	struct ceph_msg *msg = element;

	ceph_msg_put(msg);
}
25
26int ceph_msgpool_init(struct ceph_msgpool *pool,
27 int front_len, int size, bool blocking, const char *name)
28{
29 pool->front_len = front_len;
30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
31 if (!pool->pool)
32 return -ENOMEM;
33 pool->name = name;
34 return 0;
35}
36
37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
38{
39 mempool_destroy(pool->pool);
40}
41
42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
44{
45 if (front_len > pool->front_len) {
46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
47 pool->name, front_len, pool->front_len);
48 WARN_ON(1);
49
50 /* try to alloc a fresh message */
51 return ceph_msg_new(0, front_len, GFP_NOFS);
52 }
53
54 return mempool_alloc(pool->pool, GFP_NOFS);
55}
56
57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
58{
59 /* reset msg front_len; user may have changed it */
60 msg->front.iov_len = pool->front_len;
61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
62
63 kref_init(&msg->kref); /* retake single ref */
64}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
new file mode 100644
index 000000000000..79391994b3ed
--- /dev/null
+++ b/net/ceph/osd_client.c
@@ -0,0 +1,1773 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/err.h>
5#include <linux/highmem.h>
6#include <linux/mm.h>
7#include <linux/pagemap.h>
8#include <linux/slab.h>
9#include <linux/uaccess.h>
10#ifdef CONFIG_BLOCK
11#include <linux/bio.h>
12#endif
13
14#include <linux/ceph/libceph.h>
15#include <linux/ceph/osd_client.h>
16#include <linux/ceph/messenger.h>
17#include <linux/ceph/decode.h>
18#include <linux/ceph/auth.h>
19#include <linux/ceph/pagelist.h>
20
21#define OSD_OP_FRONT_LEN 4096
22#define OSD_OPREPLY_FRONT_LEN 512
23
24static const struct ceph_connection_operations osd_con_ops;
25static int __kick_requests(struct ceph_osd_client *osdc,
26 struct ceph_osd *kickosd);
27
28static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
29
30static int op_needs_trail(int op)
31{
32 switch (op) {
33 case CEPH_OSD_OP_GETXATTR:
34 case CEPH_OSD_OP_SETXATTR:
35 case CEPH_OSD_OP_CMPXATTR:
36 case CEPH_OSD_OP_CALL:
37 return 1;
38 default:
39 return 0;
40 }
41}
42
43static int op_has_extent(int op)
44{
45 return (op == CEPH_OSD_OP_READ ||
46 op == CEPH_OSD_OP_WRITE);
47}
48
49void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
50 struct ceph_file_layout *layout,
51 u64 snapid,
52 u64 off, u64 *plen, u64 *bno,
53 struct ceph_osd_request *req,
54 struct ceph_osd_req_op *op)
55{
56 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59
60 reqhead->snapid = cpu_to_le64(snapid);
61
62 /* object extent? */
63 ceph_calc_file_object_mapping(layout, off, plen, bno,
64 &objoff, &objlen);
65 if (*plen < orig_len)
66 dout(" skipping last %llu, final file extent %llu~%llu\n",
67 orig_len - *plen, off, *plen);
68
69 if (op_has_extent(op->op)) {
70 op->extent.offset = objoff;
71 op->extent.length = objlen;
72 }
73 req->r_num_pages = calc_pages_for(off, *plen);
74 if (op->op == CEPH_OSD_OP_WRITE)
75 op->payload_len = *plen;
76
77 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
78 *bno, objoff, objlen, req->r_num_pages);
79
80}
81EXPORT_SYMBOL(ceph_calc_raw_layout);
82
83/*
84 * Implement client access to distributed object storage cluster.
85 *
86 * All data objects are stored within a cluster/cloud of OSDs, or
87 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
88 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
89 * remote daemons serving up and coordinating consistent and safe
90 * access to storage.
91 *
92 * Cluster membership and the mapping of data objects onto storage devices
93 * are described by the osd map.
94 *
95 * We keep track of pending OSD requests (read, write), resubmit
96 * requests to different OSDs when the cluster topology/data layout
97 * change, or retry the affected requests when the communications
98 * channel with an OSD is reset.
99 */
100
101/*
102 * calculate the mapping of a file extent onto an object, and fill out the
103 * request accordingly. shorten extent as necessary if it crosses an
104 * object boundary.
105 *
106 * fill osd op in request message.
107 */
108static void calc_layout(struct ceph_osd_client *osdc,
109 struct ceph_vino vino,
110 struct ceph_file_layout *layout,
111 u64 off, u64 *plen,
112 struct ceph_osd_request *req,
113 struct ceph_osd_req_op *op)
114{
115 u64 bno;
116
117 ceph_calc_raw_layout(osdc, layout, vino.snap, off,
118 plen, &bno, req, op);
119
120 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
121 req->r_oid_len = strlen(req->r_oid);
122}
123
124/*
125 * requests
126 */
127void ceph_osdc_release_request(struct kref *kref)
128{
129 struct ceph_osd_request *req = container_of(kref,
130 struct ceph_osd_request,
131 r_kref);
132
133 if (req->r_request)
134 ceph_msg_put(req->r_request);
135 if (req->r_reply)
136 ceph_msg_put(req->r_reply);
137 if (req->r_con_filling_msg) {
138 dout("release_request revoking pages %p from con %p\n",
139 req->r_pages, req->r_con_filling_msg);
140 ceph_con_revoke_message(req->r_con_filling_msg,
141 req->r_reply);
142 ceph_con_put(req->r_con_filling_msg);
143 }
144 if (req->r_own_pages)
145 ceph_release_page_vector(req->r_pages,
146 req->r_num_pages);
147#ifdef CONFIG_BLOCK
148 if (req->r_bio)
149 bio_put(req->r_bio);
150#endif
151 ceph_put_snap_context(req->r_snapc);
152 if (req->r_trail) {
153 ceph_pagelist_release(req->r_trail);
154 kfree(req->r_trail);
155 }
156 if (req->r_mempool)
157 mempool_free(req, req->r_osdc->req_mempool);
158 else
159 kfree(req);
160}
161EXPORT_SYMBOL(ceph_osdc_release_request);
162
163static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
164{
165 int i = 0;
166
167 if (needs_trail)
168 *needs_trail = 0;
169 while (ops[i].op) {
170 if (needs_trail && op_needs_trail(ops[i].op))
171 *needs_trail = 1;
172 i++;
173 }
174
175 return i;
176}
177
178struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
179 int flags,
180 struct ceph_snap_context *snapc,
181 struct ceph_osd_req_op *ops,
182 bool use_mempool,
183 gfp_t gfp_flags,
184 struct page **pages,
185 struct bio *bio)
186{
187 struct ceph_osd_request *req;
188 struct ceph_msg *msg;
189 int needs_trail;
190 int num_op = get_num_ops(ops, &needs_trail);
191 size_t msg_size = sizeof(struct ceph_osd_request_head);
192
193 msg_size += num_op*sizeof(struct ceph_osd_op);
194
195 if (use_mempool) {
196 req = mempool_alloc(osdc->req_mempool, gfp_flags);
197 memset(req, 0, sizeof(*req));
198 } else {
199 req = kzalloc(sizeof(*req), gfp_flags);
200 }
201 if (req == NULL)
202 return NULL;
203
204 req->r_osdc = osdc;
205 req->r_mempool = use_mempool;
206
207 kref_init(&req->r_kref);
208 init_completion(&req->r_completion);
209 init_completion(&req->r_safe_completion);
210 INIT_LIST_HEAD(&req->r_unsafe_item);
211 req->r_flags = flags;
212
213 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
214
215 /* create reply message */
216 if (use_mempool)
217 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
218 else
219 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
220 OSD_OPREPLY_FRONT_LEN, gfp_flags);
221 if (!msg) {
222 ceph_osdc_put_request(req);
223 return NULL;
224 }
225 req->r_reply = msg;
226
227 /* allocate space for the trailing data */
228 if (needs_trail) {
229 req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
230 if (!req->r_trail) {
231 ceph_osdc_put_request(req);
232 return NULL;
233 }
234 ceph_pagelist_init(req->r_trail);
235 }
236 /* create request message; allow space for oid */
237 msg_size += 40;
238 if (snapc)
239 msg_size += sizeof(u64) * snapc->num_snaps;
240 if (use_mempool)
241 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
242 else
243 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
244 if (!msg) {
245 ceph_osdc_put_request(req);
246 return NULL;
247 }
248
249 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
250 memset(msg->front.iov_base, 0, msg->front.iov_len);
251
252 req->r_request = msg;
253 req->r_pages = pages;
254#ifdef CONFIG_BLOCK
255 if (bio) {
256 req->r_bio = bio;
257 bio_get(req->r_bio);
258 }
259#endif
260
261 return req;
262}
263EXPORT_SYMBOL(ceph_osdc_alloc_request);
264
265static void osd_req_encode_op(struct ceph_osd_request *req,
266 struct ceph_osd_op *dst,
267 struct ceph_osd_req_op *src)
268{
269 dst->op = cpu_to_le16(src->op);
270
271 switch (dst->op) {
272 case CEPH_OSD_OP_READ:
273 case CEPH_OSD_OP_WRITE:
274 dst->extent.offset =
275 cpu_to_le64(src->extent.offset);
276 dst->extent.length =
277 cpu_to_le64(src->extent.length);
278 dst->extent.truncate_size =
279 cpu_to_le64(src->extent.truncate_size);
280 dst->extent.truncate_seq =
281 cpu_to_le32(src->extent.truncate_seq);
282 break;
283
284 case CEPH_OSD_OP_GETXATTR:
285 case CEPH_OSD_OP_SETXATTR:
286 case CEPH_OSD_OP_CMPXATTR:
287 BUG_ON(!req->r_trail);
288
289 dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
290 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
291 dst->xattr.cmp_op = src->xattr.cmp_op;
292 dst->xattr.cmp_mode = src->xattr.cmp_mode;
293 ceph_pagelist_append(req->r_trail, src->xattr.name,
294 src->xattr.name_len);
295 ceph_pagelist_append(req->r_trail, src->xattr.val,
296 src->xattr.value_len);
297 break;
298 case CEPH_OSD_OP_CALL:
299 BUG_ON(!req->r_trail);
300
301 dst->cls.class_len = src->cls.class_len;
302 dst->cls.method_len = src->cls.method_len;
303 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
304
305 ceph_pagelist_append(req->r_trail, src->cls.class_name,
306 src->cls.class_len);
307 ceph_pagelist_append(req->r_trail, src->cls.method_name,
308 src->cls.method_len);
309 ceph_pagelist_append(req->r_trail, src->cls.indata,
310 src->cls.indata_len);
311 break;
312 case CEPH_OSD_OP_ROLLBACK:
313 dst->snap.snapid = cpu_to_le64(src->snap.snapid);
314 break;
315 case CEPH_OSD_OP_STARTSYNC:
316 break;
317 default:
318 pr_err("unrecognized osd opcode %d\n", dst->op);
319 WARN_ON(1);
320 break;
321 }
322 dst->payload_len = cpu_to_le32(src->payload_len);
323}
324
325/*
326 * build new request AND message
327 *
328 */
329void ceph_osdc_build_request(struct ceph_osd_request *req,
330 u64 off, u64 *plen,
331 struct ceph_osd_req_op *src_ops,
332 struct ceph_snap_context *snapc,
333 struct timespec *mtime,
334 const char *oid,
335 int oid_len)
336{
337 struct ceph_msg *msg = req->r_request;
338 struct ceph_osd_request_head *head;
339 struct ceph_osd_req_op *src_op;
340 struct ceph_osd_op *op;
341 void *p;
342 int num_op = get_num_ops(src_ops, NULL);
343 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
344 int flags = req->r_flags;
345 u64 data_len = 0;
346 int i;
347
348 head = msg->front.iov_base;
349 op = (void *)(head + 1);
350 p = (void *)(op + num_op);
351
352 req->r_snapc = ceph_get_snap_context(snapc);
353
354 head->client_inc = cpu_to_le32(1); /* always, for now. */
355 head->flags = cpu_to_le32(flags);
356 if (flags & CEPH_OSD_FLAG_WRITE)
357 ceph_encode_timespec(&head->mtime, mtime);
358 head->num_ops = cpu_to_le16(num_op);
359
360
361 /* fill in oid */
362 head->object_len = cpu_to_le32(oid_len);
363 memcpy(p, oid, oid_len);
364 p += oid_len;
365
366 src_op = src_ops;
367 while (src_op->op) {
368 osd_req_encode_op(req, op, src_op);
369 src_op++;
370 op++;
371 }
372
373 if (req->r_trail)
374 data_len += req->r_trail->length;
375
376 if (snapc) {
377 head->snap_seq = cpu_to_le64(snapc->seq);
378 head->num_snaps = cpu_to_le32(snapc->num_snaps);
379 for (i = 0; i < snapc->num_snaps; i++) {
380 put_unaligned_le64(snapc->snaps[i], p);
381 p += sizeof(u64);
382 }
383 }
384
385 if (flags & CEPH_OSD_FLAG_WRITE) {
386 req->r_request->hdr.data_off = cpu_to_le16(off);
387 req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
388 } else if (data_len) {
389 req->r_request->hdr.data_off = 0;
390 req->r_request->hdr.data_len = cpu_to_le32(data_len);
391 }
392
393 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
394 msg_size = p - msg->front.iov_base;
395 msg->front.iov_len = msg_size;
396 msg->hdr.front_len = cpu_to_le32(msg_size);
397 return;
398}
399EXPORT_SYMBOL(ceph_osdc_build_request);
400
401/*
402 * build new request AND message, calculate layout, and adjust file
403 * extent as needed.
404 *
405 * if the file was recently truncated, we include information about its
406 * old and new size so that the object can be updated appropriately. (we
407 * avoid synchronously deleting truncated objects because it's slow.)
408 *
409 * if @do_sync, include a 'startsync' command so that the osd will flush
410 * data quickly.
411 */
412struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
413 struct ceph_file_layout *layout,
414 struct ceph_vino vino,
415 u64 off, u64 *plen,
416 int opcode, int flags,
417 struct ceph_snap_context *snapc,
418 int do_sync,
419 u32 truncate_seq,
420 u64 truncate_size,
421 struct timespec *mtime,
422 bool use_mempool, int num_reply)
423{
424 struct ceph_osd_req_op ops[3];
425 struct ceph_osd_request *req;
426
427 ops[0].op = opcode;
428 ops[0].extent.truncate_seq = truncate_seq;
429 ops[0].extent.truncate_size = truncate_size;
430 ops[0].payload_len = 0;
431
432 if (do_sync) {
433 ops[1].op = CEPH_OSD_OP_STARTSYNC;
434 ops[1].payload_len = 0;
435 ops[2].op = 0;
436 } else
437 ops[1].op = 0;
438
439 req = ceph_osdc_alloc_request(osdc, flags,
440 snapc, ops,
441 use_mempool,
442 GFP_NOFS, NULL, NULL);
443 if (IS_ERR(req))
444 return req;
445
446 /* calculate max write size */
447 calc_layout(osdc, vino, layout, off, plen, req, ops);
448 req->r_file_layout = *layout; /* keep a copy */
449
450 ceph_osdc_build_request(req, off, plen, ops,
451 snapc,
452 mtime,
453 req->r_oid, req->r_oid_len);
454
455 return req;
456}
457EXPORT_SYMBOL(ceph_osdc_new_request);
458
459/*
460 * We keep osd requests in an rbtree, sorted by ->r_tid.
461 */
462static void __insert_request(struct ceph_osd_client *osdc,
463 struct ceph_osd_request *new)
464{
465 struct rb_node **p = &osdc->requests.rb_node;
466 struct rb_node *parent = NULL;
467 struct ceph_osd_request *req = NULL;
468
469 while (*p) {
470 parent = *p;
471 req = rb_entry(parent, struct ceph_osd_request, r_node);
472 if (new->r_tid < req->r_tid)
473 p = &(*p)->rb_left;
474 else if (new->r_tid > req->r_tid)
475 p = &(*p)->rb_right;
476 else
477 BUG();
478 }
479
480 rb_link_node(&new->r_node, parent, p);
481 rb_insert_color(&new->r_node, &osdc->requests);
482}
483
484static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
485 u64 tid)
486{
487 struct ceph_osd_request *req;
488 struct rb_node *n = osdc->requests.rb_node;
489
490 while (n) {
491 req = rb_entry(n, struct ceph_osd_request, r_node);
492 if (tid < req->r_tid)
493 n = n->rb_left;
494 else if (tid > req->r_tid)
495 n = n->rb_right;
496 else
497 return req;
498 }
499 return NULL;
500}
501
502static struct ceph_osd_request *
503__lookup_request_ge(struct ceph_osd_client *osdc,
504 u64 tid)
505{
506 struct ceph_osd_request *req;
507 struct rb_node *n = osdc->requests.rb_node;
508
509 while (n) {
510 req = rb_entry(n, struct ceph_osd_request, r_node);
511 if (tid < req->r_tid) {
512 if (!n->rb_left)
513 return req;
514 n = n->rb_left;
515 } else if (tid > req->r_tid) {
516 n = n->rb_right;
517 } else {
518 return req;
519 }
520 }
521 return NULL;
522}
523
524
525/*
526 * If the osd connection drops, we need to resubmit all requests.
527 */
528static void osd_reset(struct ceph_connection *con)
529{
530 struct ceph_osd *osd = con->private;
531 struct ceph_osd_client *osdc;
532
533 if (!osd)
534 return;
535 dout("osd_reset osd%d\n", osd->o_osd);
536 osdc = osd->o_osdc;
537 down_read(&osdc->map_sem);
538 kick_requests(osdc, osd);
539 up_read(&osdc->map_sem);
540}
541
542/*
543 * Track open sessions with osds.
544 */
545static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
546{
547 struct ceph_osd *osd;
548
549 osd = kzalloc(sizeof(*osd), GFP_NOFS);
550 if (!osd)
551 return NULL;
552
553 atomic_set(&osd->o_ref, 1);
554 osd->o_osdc = osdc;
555 INIT_LIST_HEAD(&osd->o_requests);
556 INIT_LIST_HEAD(&osd->o_osd_lru);
557 osd->o_incarnation = 1;
558
559 ceph_con_init(osdc->client->msgr, &osd->o_con);
560 osd->o_con.private = osd;
561 osd->o_con.ops = &osd_con_ops;
562 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
563
564 INIT_LIST_HEAD(&osd->o_keepalive_item);
565 return osd;
566}
567
568static struct ceph_osd *get_osd(struct ceph_osd *osd)
569{
570 if (atomic_inc_not_zero(&osd->o_ref)) {
571 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
572 atomic_read(&osd->o_ref));
573 return osd;
574 } else {
575 dout("get_osd %p FAIL\n", osd);
576 return NULL;
577 }
578}
579
580static void put_osd(struct ceph_osd *osd)
581{
582 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
583 atomic_read(&osd->o_ref) - 1);
584 if (atomic_dec_and_test(&osd->o_ref)) {
585 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
586
587 if (osd->o_authorizer)
588 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
589 kfree(osd);
590 }
591}
592
593/*
594 * remove an osd from our map
595 */
596static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
597{
598 dout("__remove_osd %p\n", osd);
599 BUG_ON(!list_empty(&osd->o_requests));
600 rb_erase(&osd->o_node, &osdc->osds);
601 list_del_init(&osd->o_osd_lru);
602 ceph_con_close(&osd->o_con);
603 put_osd(osd);
604}
605
606static void __move_osd_to_lru(struct ceph_osd_client *osdc,
607 struct ceph_osd *osd)
608{
609 dout("__move_osd_to_lru %p\n", osd);
610 BUG_ON(!list_empty(&osd->o_osd_lru));
611 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
612 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
613}
614
615static void __remove_osd_from_lru(struct ceph_osd *osd)
616{
617 dout("__remove_osd_from_lru %p\n", osd);
618 if (!list_empty(&osd->o_osd_lru))
619 list_del_init(&osd->o_osd_lru);
620}
621
622static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
623{
624 struct ceph_osd *osd, *nosd;
625
626 dout("__remove_old_osds %p\n", osdc);
627 mutex_lock(&osdc->request_mutex);
628 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
629 if (!remove_all && time_before(jiffies, osd->lru_ttl))
630 break;
631 __remove_osd(osdc, osd);
632 }
633 mutex_unlock(&osdc->request_mutex);
634}
635
636/*
637 * reset osd connect
638 */
639static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
640{
641 struct ceph_osd_request *req;
642 int ret = 0;
643
644 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
645 if (list_empty(&osd->o_requests)) {
646 __remove_osd(osdc, osd);
647 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
648 &osd->o_con.peer_addr,
649 sizeof(osd->o_con.peer_addr)) == 0 &&
650 !ceph_con_opened(&osd->o_con)) {
651 dout(" osd addr hasn't changed and connection never opened,"
652 " letting msgr retry");
653 /* touch each r_stamp for handle_timeout()'s benfit */
654 list_for_each_entry(req, &osd->o_requests, r_osd_item)
655 req->r_stamp = jiffies;
656 ret = -EAGAIN;
657 } else {
658 ceph_con_close(&osd->o_con);
659 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
660 osd->o_incarnation++;
661 }
662 return ret;
663}
664
665static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
666{
667 struct rb_node **p = &osdc->osds.rb_node;
668 struct rb_node *parent = NULL;
669 struct ceph_osd *osd = NULL;
670
671 while (*p) {
672 parent = *p;
673 osd = rb_entry(parent, struct ceph_osd, o_node);
674 if (new->o_osd < osd->o_osd)
675 p = &(*p)->rb_left;
676 else if (new->o_osd > osd->o_osd)
677 p = &(*p)->rb_right;
678 else
679 BUG();
680 }
681
682 rb_link_node(&new->o_node, parent, p);
683 rb_insert_color(&new->o_node, &osdc->osds);
684}
685
686static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
687{
688 struct ceph_osd *osd;
689 struct rb_node *n = osdc->osds.rb_node;
690
691 while (n) {
692 osd = rb_entry(n, struct ceph_osd, o_node);
693 if (o < osd->o_osd)
694 n = n->rb_left;
695 else if (o > osd->o_osd)
696 n = n->rb_right;
697 else
698 return osd;
699 }
700 return NULL;
701}
702
703static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
704{
705 schedule_delayed_work(&osdc->timeout_work,
706 osdc->client->options->osd_keepalive_timeout * HZ);
707}
708
709static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
710{
711 cancel_delayed_work(&osdc->timeout_work);
712}
713
714/*
715 * Register request, assign tid. If this is the first request, set up
716 * the timeout event.
717 */
718static void register_request(struct ceph_osd_client *osdc,
719 struct ceph_osd_request *req)
720{
721 mutex_lock(&osdc->request_mutex);
722 req->r_tid = ++osdc->last_tid;
723 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
724 INIT_LIST_HEAD(&req->r_req_lru_item);
725
726 dout("register_request %p tid %lld\n", req, req->r_tid);
727 __insert_request(osdc, req);
728 ceph_osdc_get_request(req);
729 osdc->num_requests++;
730
731 if (osdc->num_requests == 1) {
732 dout(" first request, scheduling timeout\n");
733 __schedule_osd_timeout(osdc);
734 }
735 mutex_unlock(&osdc->request_mutex);
736}
737
738/*
739 * called under osdc->request_mutex
740 */
741static void __unregister_request(struct ceph_osd_client *osdc,
742 struct ceph_osd_request *req)
743{
744 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
745 rb_erase(&req->r_node, &osdc->requests);
746 osdc->num_requests--;
747
748 if (req->r_osd) {
749 /* make sure the original request isn't in flight. */
750 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
751
752 list_del_init(&req->r_osd_item);
753 if (list_empty(&req->r_osd->o_requests))
754 __move_osd_to_lru(osdc, req->r_osd);
755 req->r_osd = NULL;
756 }
757
758 ceph_osdc_put_request(req);
759
760 list_del_init(&req->r_req_lru_item);
761 if (osdc->num_requests == 0) {
762 dout(" no requests, canceling timeout\n");
763 __cancel_osd_timeout(osdc);
764 }
765}
766
767/*
768 * Cancel a previously queued request message
769 */
770static void __cancel_request(struct ceph_osd_request *req)
771{
772 if (req->r_sent && req->r_osd) {
773 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
774 req->r_sent = 0;
775 }
776 list_del_init(&req->r_req_lru_item);
777}
778
779/*
780 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
781 * (as needed), and set the request r_osd appropriately. If there is
782 * no up osd, set r_osd to NULL.
783 *
784 * Return 0 if unchanged, 1 if changed, or negative on error.
785 *
786 * Caller should hold map_sem for read and request_mutex.
787 */
788static int __map_osds(struct ceph_osd_client *osdc,
789 struct ceph_osd_request *req)
790{
791 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
792 struct ceph_pg pgid;
793 int acting[CEPH_PG_MAX_SIZE];
794 int o = -1, num = 0;
795 int err;
796
797 dout("map_osds %p tid %lld\n", req, req->r_tid);
798 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
799 &req->r_file_layout, osdc->osdmap);
800 if (err)
801 return err;
802 pgid = reqhead->layout.ol_pgid;
803 req->r_pgid = pgid;
804
805 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
806 if (err > 0) {
807 o = acting[0];
808 num = err;
809 }
810
811 if ((req->r_osd && req->r_osd->o_osd == o &&
812 req->r_sent >= req->r_osd->o_incarnation &&
813 req->r_num_pg_osds == num &&
814 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
815 (req->r_osd == NULL && o == -1))
816 return 0; /* no change */
817
818 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
819 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
820 req->r_osd ? req->r_osd->o_osd : -1);
821
822 /* record full pg acting set */
823 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
824 req->r_num_pg_osds = num;
825
826 if (req->r_osd) {
827 __cancel_request(req);
828 list_del_init(&req->r_osd_item);
829 req->r_osd = NULL;
830 }
831
832 req->r_osd = __lookup_osd(osdc, o);
833 if (!req->r_osd && o >= 0) {
834 err = -ENOMEM;
835 req->r_osd = create_osd(osdc);
836 if (!req->r_osd)
837 goto out;
838
839 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
840 req->r_osd->o_osd = o;
841 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
842 __insert_osd(osdc, req->r_osd);
843
844 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
845 }
846
847 if (req->r_osd) {
848 __remove_osd_from_lru(req->r_osd);
849 list_add(&req->r_osd_item, &req->r_osd->o_requests);
850 }
851 err = 1; /* osd or pg changed */
852
853out:
854 return err;
855}
856
857/*
858 * caller should hold map_sem (for read) and request_mutex
859 */
860static int __send_request(struct ceph_osd_client *osdc,
861 struct ceph_osd_request *req)
862{
863 struct ceph_osd_request_head *reqhead;
864 int err;
865
866 err = __map_osds(osdc, req);
867 if (err < 0)
868 return err;
869 if (req->r_osd == NULL) {
870 dout("send_request %p no up osds in pg\n", req);
871 ceph_monc_request_next_osdmap(&osdc->client->monc);
872 return 0;
873 }
874
875 dout("send_request %p tid %llu to osd%d flags %d\n",
876 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
877
878 reqhead = req->r_request->front.iov_base;
879 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
880 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
881 reqhead->reassert_version = req->r_reassert_version;
882
883 req->r_stamp = jiffies;
884 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
885
886 ceph_msg_get(req->r_request); /* send consumes a ref */
887 ceph_con_send(&req->r_osd->o_con, req->r_request);
888 req->r_sent = req->r_osd->o_incarnation;
889 return 0;
890}
891
892/*
893 * Timeout callback, called every N seconds when 1 or more osd
894 * requests has been active for more than N seconds. When this
895 * happens, we ping all OSDs with requests who have timed out to
896 * ensure any communications channel reset is detected. Reset the
897 * request timeouts another N seconds in the future as we go.
898 * Reschedule the timeout event another N seconds in future (unless
899 * there are no open requests).
900 */
901static void handle_timeout(struct work_struct *work)
902{
903 struct ceph_osd_client *osdc =
904 container_of(work, struct ceph_osd_client, timeout_work.work);
905 struct ceph_osd_request *req, *last_req = NULL;
906 struct ceph_osd *osd;
907 unsigned long timeout = osdc->client->options->osd_timeout * HZ;
908 unsigned long keepalive =
909 osdc->client->options->osd_keepalive_timeout * HZ;
910 unsigned long last_stamp = 0;
911 struct rb_node *p;
912 struct list_head slow_osds;
913
914 dout("timeout\n");
915 down_read(&osdc->map_sem);
916
917 ceph_monc_request_next_osdmap(&osdc->client->monc);
918
919 mutex_lock(&osdc->request_mutex);
920 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
921 req = rb_entry(p, struct ceph_osd_request, r_node);
922
923 if (req->r_resend) {
924 int err;
925
926 dout("osdc resending prev failed %lld\n", req->r_tid);
927 err = __send_request(osdc, req);
928 if (err)
929 dout("osdc failed again on %lld\n", req->r_tid);
930 else
931 req->r_resend = false;
932 continue;
933 }
934 }
935
936 /*
937 * reset osds that appear to be _really_ unresponsive. this
938 * is a failsafe measure.. we really shouldn't be getting to
939 * this point if the system is working properly. the monitors
940 * should mark the osd as failed and we should find out about
941 * it from an updated osd map.
942 */
943 while (timeout && !list_empty(&osdc->req_lru)) {
944 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
945 r_req_lru_item);
946
947 if (time_before(jiffies, req->r_stamp + timeout))
948 break;
949
950 BUG_ON(req == last_req && req->r_stamp == last_stamp);
951 last_req = req;
952 last_stamp = req->r_stamp;
953
954 osd = req->r_osd;
955 BUG_ON(!osd);
956 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
957 req->r_tid, osd->o_osd);
958 __kick_requests(osdc, osd);
959 }
960
961 /*
962 * ping osds that are a bit slow. this ensures that if there
963 * is a break in the TCP connection we will notice, and reopen
964 * a connection with that osd (from the fault callback).
965 */
966 INIT_LIST_HEAD(&slow_osds);
967 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
968 if (time_before(jiffies, req->r_stamp + keepalive))
969 break;
970
971 osd = req->r_osd;
972 BUG_ON(!osd);
973 dout(" tid %llu is slow, will send keepalive on osd%d\n",
974 req->r_tid, osd->o_osd);
975 list_move_tail(&osd->o_keepalive_item, &slow_osds);
976 }
977 while (!list_empty(&slow_osds)) {
978 osd = list_entry(slow_osds.next, struct ceph_osd,
979 o_keepalive_item);
980 list_del_init(&osd->o_keepalive_item);
981 ceph_con_keepalive(&osd->o_con);
982 }
983
984 __schedule_osd_timeout(osdc);
985 mutex_unlock(&osdc->request_mutex);
986
987 up_read(&osdc->map_sem);
988}
989
990static void handle_osds_timeout(struct work_struct *work)
991{
992 struct ceph_osd_client *osdc =
993 container_of(work, struct ceph_osd_client,
994 osds_timeout_work.work);
995 unsigned long delay =
996 osdc->client->options->osd_idle_ttl * HZ >> 2;
997
998 dout("osds timeout\n");
999 down_read(&osdc->map_sem);
1000 remove_old_osds(osdc, 0);
1001 up_read(&osdc->map_sem);
1002
1003 schedule_delayed_work(&osdc->osds_timeout_work,
1004 round_jiffies_relative(delay));
1005}
1006
1007/*
1008 * handle osd op reply. either call the callback if it is specified,
1009 * or do the completion to wake up the waiting thread.
1010 */
1011static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1012 struct ceph_connection *con)
1013{
1014 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
1015 struct ceph_osd_request *req;
1016 u64 tid;
1017 int numops, object_len, flags;
1018 s32 result;
1019
1020 tid = le64_to_cpu(msg->hdr.tid);
1021 if (msg->front.iov_len < sizeof(*rhead))
1022 goto bad;
1023 numops = le32_to_cpu(rhead->num_ops);
1024 object_len = le32_to_cpu(rhead->object_len);
1025 result = le32_to_cpu(rhead->result);
1026 if (msg->front.iov_len != sizeof(*rhead) + object_len +
1027 numops * sizeof(struct ceph_osd_op))
1028 goto bad;
1029 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
1030
1031 /* lookup */
1032 mutex_lock(&osdc->request_mutex);
1033 req = __lookup_request(osdc, tid);
1034 if (req == NULL) {
1035 dout("handle_reply tid %llu dne\n", tid);
1036 mutex_unlock(&osdc->request_mutex);
1037 return;
1038 }
1039 ceph_osdc_get_request(req);
1040 flags = le32_to_cpu(rhead->flags);
1041
1042 /*
1043 * if this connection filled our message, drop our reference now, to
1044 * avoid a (safe but slower) revoke later.
1045 */
1046 if (req->r_con_filling_msg == con && req->r_reply == msg) {
1047 dout(" dropping con_filling_msg ref %p\n", con);
1048 req->r_con_filling_msg = NULL;
1049 ceph_con_put(con);
1050 }
1051
1052 if (!req->r_got_reply) {
1053 unsigned bytes;
1054
1055 req->r_result = le32_to_cpu(rhead->result);
1056 bytes = le32_to_cpu(msg->hdr.data_len);
1057 dout("handle_reply result %d bytes %d\n", req->r_result,
1058 bytes);
1059 if (req->r_result == 0)
1060 req->r_result = bytes;
1061
1062 /* in case this is a write and we need to replay, */
1063 req->r_reassert_version = rhead->reassert_version;
1064
1065 req->r_got_reply = 1;
1066 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
1067 dout("handle_reply tid %llu dup ack\n", tid);
1068 mutex_unlock(&osdc->request_mutex);
1069 goto done;
1070 }
1071
1072 dout("handle_reply tid %llu flags %d\n", tid, flags);
1073
1074 /* either this is a read, or we got the safe response */
1075 if (result < 0 ||
1076 (flags & CEPH_OSD_FLAG_ONDISK) ||
1077 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
1078 __unregister_request(osdc, req);
1079
1080 mutex_unlock(&osdc->request_mutex);
1081
1082 if (req->r_callback)
1083 req->r_callback(req, msg);
1084 else
1085 complete_all(&req->r_completion);
1086
1087 if (flags & CEPH_OSD_FLAG_ONDISK) {
1088 if (req->r_safe_callback)
1089 req->r_safe_callback(req, msg);
1090 complete_all(&req->r_safe_completion); /* fsync waiter */
1091 }
1092
1093done:
1094 ceph_osdc_put_request(req);
1095 return;
1096
1097bad:
1098 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
1099 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
1100 (int)sizeof(*rhead));
1101 ceph_msg_dump(msg);
1102}
1103
1104
1105static int __kick_requests(struct ceph_osd_client *osdc,
1106 struct ceph_osd *kickosd)
1107{
1108 struct ceph_osd_request *req;
1109 struct rb_node *p, *n;
1110 int needmap = 0;
1111 int err;
1112
1113 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
1114 if (kickosd) {
1115 err = __reset_osd(osdc, kickosd);
1116 if (err == -EAGAIN)
1117 return 1;
1118 } else {
1119 for (p = rb_first(&osdc->osds); p; p = n) {
1120 struct ceph_osd *osd =
1121 rb_entry(p, struct ceph_osd, o_node);
1122
1123 n = rb_next(p);
1124 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
1125 memcmp(&osd->o_con.peer_addr,
1126 ceph_osd_addr(osdc->osdmap,
1127 osd->o_osd),
1128 sizeof(struct ceph_entity_addr)) != 0)
1129 __reset_osd(osdc, osd);
1130 }
1131 }
1132
1133 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
1134 req = rb_entry(p, struct ceph_osd_request, r_node);
1135
1136 if (req->r_resend) {
1137 dout(" r_resend set on tid %llu\n", req->r_tid);
1138 __cancel_request(req);
1139 goto kick;
1140 }
1141 if (req->r_osd && kickosd == req->r_osd) {
1142 __cancel_request(req);
1143 goto kick;
1144 }
1145
1146 err = __map_osds(osdc, req);
1147 if (err == 0)
1148 continue; /* no change */
1149 if (err < 0) {
1150 /*
1151 * FIXME: really, we should set the request
1152 * error and fail if this isn't a 'nofail'
1153 * request, but that's a fair bit more
1154 * complicated to do. So retry!
1155 */
1156 dout(" setting r_resend on %llu\n", req->r_tid);
1157 req->r_resend = true;
1158 continue;
1159 }
1160 if (req->r_osd == NULL) {
1161 dout("tid %llu maps to no valid osd\n", req->r_tid);
1162 needmap++; /* request a newer map */
1163 continue;
1164 }
1165
1166kick:
1167 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
1168 req->r_osd ? req->r_osd->o_osd : -1);
1169 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1170 err = __send_request(osdc, req);
1171 if (err) {
1172 dout(" setting r_resend on %llu\n", req->r_tid);
1173 req->r_resend = true;
1174 }
1175 }
1176
1177 return needmap;
1178}
1179
1180/*
1181 * Resubmit osd requests whose osd or osd address has changed. Request
1182 * a new osd map if osds are down, or we are otherwise unable to determine
1183 * how to direct a request.
1184 *
1185 * Close connections to down osds.
1186 *
1187 * If @who is specified, resubmit requests for that specific osd.
1188 *
 * Caller should hold map_sem for read; request_mutex is taken here.
1190 */
1191static void kick_requests(struct ceph_osd_client *osdc,
1192 struct ceph_osd *kickosd)
1193{
1194 int needmap;
1195
1196 mutex_lock(&osdc->request_mutex);
1197 needmap = __kick_requests(osdc, kickosd);
1198 mutex_unlock(&osdc->request_mutex);
1199
1200 if (needmap) {
1201 dout("%d requests for down osds, need new map\n", needmap);
1202 ceph_monc_request_next_osdmap(&osdc->client->monc);
1203 }
1204
1205}
1206/*
1207 * Process updated osd map.
1208 *
1209 * The message contains any number of incremental and full maps, normally
1210 * indicating some sort of topology change in the cluster. Kick requests
1211 * off to different OSDs as needed.
1212 */
1213void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1214{
1215 void *p, *end, *next;
1216 u32 nr_maps, maplen;
1217 u32 epoch;
1218 struct ceph_osdmap *newmap = NULL, *oldmap;
1219 int err;
1220 struct ceph_fsid fsid;
1221
1222 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
1223 p = msg->front.iov_base;
1224 end = p + msg->front.iov_len;
1225
1226 /* verify fsid */
1227 ceph_decode_need(&p, end, sizeof(fsid), bad);
1228 ceph_decode_copy(&p, &fsid, sizeof(fsid));
1229 if (ceph_check_fsid(osdc->client, &fsid) < 0)
1230 return;
1231
1232 down_write(&osdc->map_sem);
1233
1234 /* incremental maps */
1235 ceph_decode_32_safe(&p, end, nr_maps, bad);
1236 dout(" %d inc maps\n", nr_maps);
1237 while (nr_maps > 0) {
1238 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1239 epoch = ceph_decode_32(&p);
1240 maplen = ceph_decode_32(&p);
1241 ceph_decode_need(&p, end, maplen, bad);
1242 next = p + maplen;
1243 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1244 dout("applying incremental map %u len %d\n",
1245 epoch, maplen);
1246 newmap = osdmap_apply_incremental(&p, next,
1247 osdc->osdmap,
1248 osdc->client->msgr);
1249 if (IS_ERR(newmap)) {
1250 err = PTR_ERR(newmap);
1251 goto bad;
1252 }
1253 BUG_ON(!newmap);
1254 if (newmap != osdc->osdmap) {
1255 ceph_osdmap_destroy(osdc->osdmap);
1256 osdc->osdmap = newmap;
1257 }
1258 } else {
1259 dout("ignoring incremental map %u len %d\n",
1260 epoch, maplen);
1261 }
1262 p = next;
1263 nr_maps--;
1264 }
1265 if (newmap)
1266 goto done;
1267
1268 /* full maps */
1269 ceph_decode_32_safe(&p, end, nr_maps, bad);
1270 dout(" %d full maps\n", nr_maps);
1271 while (nr_maps) {
1272 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1273 epoch = ceph_decode_32(&p);
1274 maplen = ceph_decode_32(&p);
1275 ceph_decode_need(&p, end, maplen, bad);
1276 if (nr_maps > 1) {
1277 dout("skipping non-latest full map %u len %d\n",
1278 epoch, maplen);
1279 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1280 dout("skipping full map %u len %d, "
1281 "older than our %u\n", epoch, maplen,
1282 osdc->osdmap->epoch);
1283 } else {
1284 dout("taking full map %u len %d\n", epoch, maplen);
1285 newmap = osdmap_decode(&p, p+maplen);
1286 if (IS_ERR(newmap)) {
1287 err = PTR_ERR(newmap);
1288 goto bad;
1289 }
1290 BUG_ON(!newmap);
1291 oldmap = osdc->osdmap;
1292 osdc->osdmap = newmap;
1293 if (oldmap)
1294 ceph_osdmap_destroy(oldmap);
1295 }
1296 p += maplen;
1297 nr_maps--;
1298 }
1299
1300done:
1301 downgrade_write(&osdc->map_sem);
1302 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1303 if (newmap)
1304 kick_requests(osdc, NULL);
1305 up_read(&osdc->map_sem);
1306 wake_up_all(&osdc->client->auth_wq);
1307 return;
1308
1309bad:
1310 pr_err("osdc handle_map corrupt msg\n");
1311 ceph_msg_dump(msg);
1312 up_write(&osdc->map_sem);
1313 return;
1314}
1315
1316/*
1317 * Register request, send initial attempt.
1318 */
1319int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1320 struct ceph_osd_request *req,
1321 bool nofail)
1322{
1323 int rc = 0;
1324
1325 req->r_request->pages = req->r_pages;
1326 req->r_request->nr_pages = req->r_num_pages;
1327#ifdef CONFIG_BLOCK
1328 req->r_request->bio = req->r_bio;
1329#endif
1330 req->r_request->trail = req->r_trail;
1331
1332 register_request(osdc, req);
1333
1334 down_read(&osdc->map_sem);
1335 mutex_lock(&osdc->request_mutex);
1336 /*
1337 * a racing kick_requests() may have sent the message for us
1338 * while we dropped request_mutex above, so only send now if
1339 * the request still han't been touched yet.
1340 */
1341 if (req->r_sent == 0) {
1342 rc = __send_request(osdc, req);
1343 if (rc) {
1344 if (nofail) {
1345 dout("osdc_start_request failed send, "
1346 " marking %lld\n", req->r_tid);
1347 req->r_resend = true;
1348 rc = 0;
1349 } else {
1350 __unregister_request(osdc, req);
1351 }
1352 }
1353 }
1354 mutex_unlock(&osdc->request_mutex);
1355 up_read(&osdc->map_sem);
1356 return rc;
1357}
1358EXPORT_SYMBOL(ceph_osdc_start_request);
1359
1360/*
1361 * wait for a request to complete
1362 */
1363int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1364 struct ceph_osd_request *req)
1365{
1366 int rc;
1367
1368 rc = wait_for_completion_interruptible(&req->r_completion);
1369 if (rc < 0) {
1370 mutex_lock(&osdc->request_mutex);
1371 __cancel_request(req);
1372 __unregister_request(osdc, req);
1373 mutex_unlock(&osdc->request_mutex);
1374 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1375 return rc;
1376 }
1377
1378 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1379 return req->r_result;
1380}
1381EXPORT_SYMBOL(ceph_osdc_wait_request);
1382
1383/*
1384 * sync - wait for all in-flight requests to flush. avoid starvation.
1385 */
1386void ceph_osdc_sync(struct ceph_osd_client *osdc)
1387{
1388 struct ceph_osd_request *req;
1389 u64 last_tid, next_tid = 0;
1390
1391 mutex_lock(&osdc->request_mutex);
1392 last_tid = osdc->last_tid;
1393 while (1) {
1394 req = __lookup_request_ge(osdc, next_tid);
1395 if (!req)
1396 break;
1397 if (req->r_tid > last_tid)
1398 break;
1399
1400 next_tid = req->r_tid + 1;
1401 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1402 continue;
1403
1404 ceph_osdc_get_request(req);
1405 mutex_unlock(&osdc->request_mutex);
1406 dout("sync waiting on tid %llu (last is %llu)\n",
1407 req->r_tid, last_tid);
1408 wait_for_completion(&req->r_safe_completion);
1409 mutex_lock(&osdc->request_mutex);
1410 ceph_osdc_put_request(req);
1411 }
1412 mutex_unlock(&osdc->request_mutex);
1413 dout("sync done (thru tid %llu)\n", last_tid);
1414}
1415EXPORT_SYMBOL(ceph_osdc_sync);
1416
1417/*
1418 * init, shutdown
1419 */
1420int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1421{
1422 int err;
1423
1424 dout("init\n");
1425 osdc->client = client;
1426 osdc->osdmap = NULL;
1427 init_rwsem(&osdc->map_sem);
1428 init_completion(&osdc->map_waiters);
1429 osdc->last_requested_map = 0;
1430 mutex_init(&osdc->request_mutex);
1431 osdc->last_tid = 0;
1432 osdc->osds = RB_ROOT;
1433 INIT_LIST_HEAD(&osdc->osd_lru);
1434 osdc->requests = RB_ROOT;
1435 INIT_LIST_HEAD(&osdc->req_lru);
1436 osdc->num_requests = 0;
1437 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1438 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1439
1440 schedule_delayed_work(&osdc->osds_timeout_work,
1441 round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
1442
1443 err = -ENOMEM;
1444 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1445 sizeof(struct ceph_osd_request));
1446 if (!osdc->req_mempool)
1447 goto out;
1448
1449 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1450 "osd_op");
1451 if (err < 0)
1452 goto out_mempool;
1453 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1454 OSD_OPREPLY_FRONT_LEN, 10, true,
1455 "osd_op_reply");
1456 if (err < 0)
1457 goto out_msgpool;
1458 return 0;
1459
1460out_msgpool:
1461 ceph_msgpool_destroy(&osdc->msgpool_op);
1462out_mempool:
1463 mempool_destroy(osdc->req_mempool);
1464out:
1465 return err;
1466}
1467EXPORT_SYMBOL(ceph_osdc_init);
1468
1469void ceph_osdc_stop(struct ceph_osd_client *osdc)
1470{
1471 cancel_delayed_work_sync(&osdc->timeout_work);
1472 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1473 if (osdc->osdmap) {
1474 ceph_osdmap_destroy(osdc->osdmap);
1475 osdc->osdmap = NULL;
1476 }
1477 remove_old_osds(osdc, 1);
1478 mempool_destroy(osdc->req_mempool);
1479 ceph_msgpool_destroy(&osdc->msgpool_op);
1480 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1481}
1482EXPORT_SYMBOL(ceph_osdc_stop);
1483
1484/*
1485 * Read some contiguous pages. If we cross a stripe boundary, shorten
1486 * *plen. Return number of bytes read, or error.
1487 */
1488int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1489 struct ceph_vino vino, struct ceph_file_layout *layout,
1490 u64 off, u64 *plen,
1491 u32 truncate_seq, u64 truncate_size,
1492 struct page **pages, int num_pages)
1493{
1494 struct ceph_osd_request *req;
1495 int rc = 0;
1496
1497 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1498 vino.snap, off, *plen);
1499 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1500 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1501 NULL, 0, truncate_seq, truncate_size, NULL,
1502 false, 1);
1503 if (!req)
1504 return -ENOMEM;
1505
1506 /* it may be a short read due to an object boundary */
1507 req->r_pages = pages;
1508
1509 dout("readpages final extent is %llu~%llu (%d pages)\n",
1510 off, *plen, req->r_num_pages);
1511
1512 rc = ceph_osdc_start_request(osdc, req, false);
1513 if (!rc)
1514 rc = ceph_osdc_wait_request(osdc, req);
1515
1516 ceph_osdc_put_request(req);
1517 dout("readpages result %d\n", rc);
1518 return rc;
1519}
1520EXPORT_SYMBOL(ceph_osdc_readpages);
1521
1522/*
1523 * do a synchronous write on N pages
1524 */
1525int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1526 struct ceph_file_layout *layout,
1527 struct ceph_snap_context *snapc,
1528 u64 off, u64 len,
1529 u32 truncate_seq, u64 truncate_size,
1530 struct timespec *mtime,
1531 struct page **pages, int num_pages,
1532 int flags, int do_sync, bool nofail)
1533{
1534 struct ceph_osd_request *req;
1535 int rc = 0;
1536
1537 BUG_ON(vino.snap != CEPH_NOSNAP);
1538 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1539 CEPH_OSD_OP_WRITE,
1540 flags | CEPH_OSD_FLAG_ONDISK |
1541 CEPH_OSD_FLAG_WRITE,
1542 snapc, do_sync,
1543 truncate_seq, truncate_size, mtime,
1544 nofail, 1);
1545 if (!req)
1546 return -ENOMEM;
1547
1548 /* it may be a short write due to an object boundary */
1549 req->r_pages = pages;
1550 dout("writepages %llu~%llu (%d pages)\n", off, len,
1551 req->r_num_pages);
1552
1553 rc = ceph_osdc_start_request(osdc, req, nofail);
1554 if (!rc)
1555 rc = ceph_osdc_wait_request(osdc, req);
1556
1557 ceph_osdc_put_request(req);
1558 if (rc == 0)
1559 rc = len;
1560 dout("writepages result %d\n", rc);
1561 return rc;
1562}
1563EXPORT_SYMBOL(ceph_osdc_writepages);
1564
1565/*
1566 * handle incoming message
1567 */
1568static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1569{
1570 struct ceph_osd *osd = con->private;
1571 struct ceph_osd_client *osdc;
1572 int type = le16_to_cpu(msg->hdr.type);
1573
1574 if (!osd)
1575 goto out;
1576 osdc = osd->o_osdc;
1577
1578 switch (type) {
1579 case CEPH_MSG_OSD_MAP:
1580 ceph_osdc_handle_map(osdc, msg);
1581 break;
1582 case CEPH_MSG_OSD_OPREPLY:
1583 handle_reply(osdc, msg, con);
1584 break;
1585
1586 default:
1587 pr_err("received unknown message type %d %s\n", type,
1588 ceph_msg_type_name(type));
1589 }
1590out:
1591 ceph_msg_put(msg);
1592}
1593
1594/*
1595 * lookup and return message for incoming reply. set up reply message
1596 * pages.
1597 */
1598static struct ceph_msg *get_reply(struct ceph_connection *con,
1599 struct ceph_msg_header *hdr,
1600 int *skip)
1601{
1602 struct ceph_osd *osd = con->private;
1603 struct ceph_osd_client *osdc = osd->o_osdc;
1604 struct ceph_msg *m;
1605 struct ceph_osd_request *req;
1606 int front = le32_to_cpu(hdr->front_len);
1607 int data_len = le32_to_cpu(hdr->data_len);
1608 u64 tid;
1609
1610 tid = le64_to_cpu(hdr->tid);
1611 mutex_lock(&osdc->request_mutex);
1612 req = __lookup_request(osdc, tid);
1613 if (!req) {
1614 *skip = 1;
1615 m = NULL;
1616 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1617 osd->o_osd);
1618 goto out;
1619 }
1620
1621 if (req->r_con_filling_msg) {
1622 dout("get_reply revoking msg %p from old con %p\n",
1623 req->r_reply, req->r_con_filling_msg);
1624 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1625 ceph_con_put(req->r_con_filling_msg);
1626 req->r_con_filling_msg = NULL;
1627 }
1628
1629 if (front > req->r_reply->front.iov_len) {
1630 pr_warning("get_reply front %d > preallocated %d\n",
1631 front, (int)req->r_reply->front.iov_len);
1632 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1633 if (!m)
1634 goto out;
1635 ceph_msg_put(req->r_reply);
1636 req->r_reply = m;
1637 }
1638 m = ceph_msg_get(req->r_reply);
1639
1640 if (data_len > 0) {
1641 unsigned data_off = le16_to_cpu(hdr->data_off);
1642 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1643
1644 if (unlikely(req->r_num_pages < want)) {
1645 pr_warning("tid %lld reply %d > expected %d pages\n",
1646 tid, want, m->nr_pages);
1647 *skip = 1;
1648 ceph_msg_put(m);
1649 m = NULL;
1650 goto out;
1651 }
1652 m->pages = req->r_pages;
1653 m->nr_pages = req->r_num_pages;
1654#ifdef CONFIG_BLOCK
1655 m->bio = req->r_bio;
1656#endif
1657 }
1658 *skip = 0;
1659 req->r_con_filling_msg = ceph_con_get(con);
1660 dout("get_reply tid %lld %p\n", tid, m);
1661
1662out:
1663 mutex_unlock(&osdc->request_mutex);
1664 return m;
1665
1666}
1667
1668static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1669 struct ceph_msg_header *hdr,
1670 int *skip)
1671{
1672 struct ceph_osd *osd = con->private;
1673 int type = le16_to_cpu(hdr->type);
1674 int front = le32_to_cpu(hdr->front_len);
1675
1676 switch (type) {
1677 case CEPH_MSG_OSD_MAP:
1678 return ceph_msg_new(type, front, GFP_NOFS);
1679 case CEPH_MSG_OSD_OPREPLY:
1680 return get_reply(con, hdr, skip);
1681 default:
1682 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1683 osd->o_osd);
1684 *skip = 1;
1685 return NULL;
1686 }
1687}
1688
1689/*
1690 * Wrappers to refcount containing ceph_osd struct
1691 */
1692static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1693{
1694 struct ceph_osd *osd = con->private;
1695 if (get_osd(osd))
1696 return con;
1697 return NULL;
1698}
1699
1700static void put_osd_con(struct ceph_connection *con)
1701{
1702 struct ceph_osd *osd = con->private;
1703 put_osd(osd);
1704}
1705
1706/*
1707 * authentication
1708 */
1709static int get_authorizer(struct ceph_connection *con,
1710 void **buf, int *len, int *proto,
1711 void **reply_buf, int *reply_len, int force_new)
1712{
1713 struct ceph_osd *o = con->private;
1714 struct ceph_osd_client *osdc = o->o_osdc;
1715 struct ceph_auth_client *ac = osdc->client->monc.auth;
1716 int ret = 0;
1717
1718 if (force_new && o->o_authorizer) {
1719 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1720 o->o_authorizer = NULL;
1721 }
1722 if (o->o_authorizer == NULL) {
1723 ret = ac->ops->create_authorizer(
1724 ac, CEPH_ENTITY_TYPE_OSD,
1725 &o->o_authorizer,
1726 &o->o_authorizer_buf,
1727 &o->o_authorizer_buf_len,
1728 &o->o_authorizer_reply_buf,
1729 &o->o_authorizer_reply_buf_len);
1730 if (ret)
1731 return ret;
1732 }
1733
1734 *proto = ac->protocol;
1735 *buf = o->o_authorizer_buf;
1736 *len = o->o_authorizer_buf_len;
1737 *reply_buf = o->o_authorizer_reply_buf;
1738 *reply_len = o->o_authorizer_reply_buf_len;
1739 return 0;
1740}
1741
1742
1743static int verify_authorizer_reply(struct ceph_connection *con, int len)
1744{
1745 struct ceph_osd *o = con->private;
1746 struct ceph_osd_client *osdc = o->o_osdc;
1747 struct ceph_auth_client *ac = osdc->client->monc.auth;
1748
1749 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1750}
1751
1752static int invalidate_authorizer(struct ceph_connection *con)
1753{
1754 struct ceph_osd *o = con->private;
1755 struct ceph_osd_client *osdc = o->o_osdc;
1756 struct ceph_auth_client *ac = osdc->client->monc.auth;
1757
1758 if (ac->ops->invalidate_authorizer)
1759 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1760
1761 return ceph_monc_validate_auth(&osdc->client->monc);
1762}
1763
1764static const struct ceph_connection_operations osd_con_ops = {
1765 .get = get_osd_con,
1766 .put = put_osd_con,
1767 .dispatch = dispatch,
1768 .get_authorizer = get_authorizer,
1769 .verify_authorizer_reply = verify_authorizer_reply,
1770 .invalidate_authorizer = invalidate_authorizer,
1771 .alloc_msg = alloc_msg,
1772 .fault = osd_reset,
1773};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
new file mode 100644
index 000000000000..d73f3f6efa36
--- /dev/null
+++ b/net/ceph/osdmap.c
@@ -0,0 +1,1128 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/module.h>
5#include <linux/slab.h>
6#include <asm/div64.h>
7
8#include <linux/ceph/libceph.h>
9#include <linux/ceph/osdmap.h>
10#include <linux/ceph/decode.h>
11#include <linux/crush/hash.h>
12#include <linux/crush/mapper.h>
13
14char *ceph_osdmap_state_str(char *str, int len, int state)
15{
16 int flag = 0;
17
18 if (!len)
19 goto done;
20
21 *str = '\0';
22 if (state) {
23 if (state & CEPH_OSD_EXISTS) {
24 snprintf(str, len, "exists");
25 flag = 1;
26 }
27 if (state & CEPH_OSD_UP) {
28 snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
29 "up");
30 flag = 1;
31 }
32 } else {
33 snprintf(str, len, "doesn't exist");
34 }
35done:
36 return str;
37}
38
39/* maps */
40
/*
 * Return the number of bits needed to represent @t: the 1-based position
 * of its highest set bit, or 0 when @t is 0.
 */
static int calc_bits_of(unsigned t)
{
	int bits;

	for (bits = 0; t != 0; t >>= 1)
		bits++;
	return bits;
}
50
51/*
52 * the foo_mask is the smallest value 2^n-1 that is >= foo.
53 */
54static void calc_pg_masks(struct ceph_pg_pool_info *pi)
55{
56 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
57 pi->pgp_num_mask =
58 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
59 pi->lpg_num_mask =
60 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
61 pi->lpgp_num_mask =
62 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
63}
64
65/*
66 * decode crush map
67 */
68static int crush_decode_uniform_bucket(void **p, void *end,
69 struct crush_bucket_uniform *b)
70{
71 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
72 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
73 b->item_weight = ceph_decode_32(p);
74 return 0;
75bad:
76 return -EINVAL;
77}
78
79static int crush_decode_list_bucket(void **p, void *end,
80 struct crush_bucket_list *b)
81{
82 int j;
83 dout("crush_decode_list_bucket %p to %p\n", *p, end);
84 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
85 if (b->item_weights == NULL)
86 return -ENOMEM;
87 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
88 if (b->sum_weights == NULL)
89 return -ENOMEM;
90 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
91 for (j = 0; j < b->h.size; j++) {
92 b->item_weights[j] = ceph_decode_32(p);
93 b->sum_weights[j] = ceph_decode_32(p);
94 }
95 return 0;
96bad:
97 return -EINVAL;
98}
99
100static int crush_decode_tree_bucket(void **p, void *end,
101 struct crush_bucket_tree *b)
102{
103 int j;
104 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
105 ceph_decode_32_safe(p, end, b->num_nodes, bad);
106 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
107 if (b->node_weights == NULL)
108 return -ENOMEM;
109 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
110 for (j = 0; j < b->num_nodes; j++)
111 b->node_weights[j] = ceph_decode_32(p);
112 return 0;
113bad:
114 return -EINVAL;
115}
116
117static int crush_decode_straw_bucket(void **p, void *end,
118 struct crush_bucket_straw *b)
119{
120 int j;
121 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
122 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
123 if (b->item_weights == NULL)
124 return -ENOMEM;
125 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
126 if (b->straws == NULL)
127 return -ENOMEM;
128 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
129 for (j = 0; j < b->h.size; j++) {
130 b->item_weights[j] = ceph_decode_32(p);
131 b->straws[j] = ceph_decode_32(p);
132 }
133 return 0;
134bad:
135 return -EINVAL;
136}
137
138static struct crush_map *crush_decode(void *pbyval, void *end)
139{
140 struct crush_map *c;
141 int err = -EINVAL;
142 int i, j;
143 void **p = &pbyval;
144 void *start = pbyval;
145 u32 magic;
146
147 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
148
149 c = kzalloc(sizeof(*c), GFP_NOFS);
150 if (c == NULL)
151 return ERR_PTR(-ENOMEM);
152
153 ceph_decode_need(p, end, 4*sizeof(u32), bad);
154 magic = ceph_decode_32(p);
155 if (magic != CRUSH_MAGIC) {
156 pr_err("crush_decode magic %x != current %x\n",
157 (unsigned)magic, (unsigned)CRUSH_MAGIC);
158 goto bad;
159 }
160 c->max_buckets = ceph_decode_32(p);
161 c->max_rules = ceph_decode_32(p);
162 c->max_devices = ceph_decode_32(p);
163
164 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
165 if (c->device_parents == NULL)
166 goto badmem;
167 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
168 if (c->bucket_parents == NULL)
169 goto badmem;
170
171 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
172 if (c->buckets == NULL)
173 goto badmem;
174 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
175 if (c->rules == NULL)
176 goto badmem;
177
178 /* buckets */
179 for (i = 0; i < c->max_buckets; i++) {
180 int size = 0;
181 u32 alg;
182 struct crush_bucket *b;
183
184 ceph_decode_32_safe(p, end, alg, bad);
185 if (alg == 0) {
186 c->buckets[i] = NULL;
187 continue;
188 }
189 dout("crush_decode bucket %d off %x %p to %p\n",
190 i, (int)(*p-start), *p, end);
191
192 switch (alg) {
193 case CRUSH_BUCKET_UNIFORM:
194 size = sizeof(struct crush_bucket_uniform);
195 break;
196 case CRUSH_BUCKET_LIST:
197 size = sizeof(struct crush_bucket_list);
198 break;
199 case CRUSH_BUCKET_TREE:
200 size = sizeof(struct crush_bucket_tree);
201 break;
202 case CRUSH_BUCKET_STRAW:
203 size = sizeof(struct crush_bucket_straw);
204 break;
205 default:
206 err = -EINVAL;
207 goto bad;
208 }
209 BUG_ON(size == 0);
210 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
211 if (b == NULL)
212 goto badmem;
213
214 ceph_decode_need(p, end, 4*sizeof(u32), bad);
215 b->id = ceph_decode_32(p);
216 b->type = ceph_decode_16(p);
217 b->alg = ceph_decode_8(p);
218 b->hash = ceph_decode_8(p);
219 b->weight = ceph_decode_32(p);
220 b->size = ceph_decode_32(p);
221
222 dout("crush_decode bucket size %d off %x %p to %p\n",
223 b->size, (int)(*p-start), *p, end);
224
225 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
226 if (b->items == NULL)
227 goto badmem;
228 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
229 if (b->perm == NULL)
230 goto badmem;
231 b->perm_n = 0;
232
233 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
234 for (j = 0; j < b->size; j++)
235 b->items[j] = ceph_decode_32(p);
236
237 switch (b->alg) {
238 case CRUSH_BUCKET_UNIFORM:
239 err = crush_decode_uniform_bucket(p, end,
240 (struct crush_bucket_uniform *)b);
241 if (err < 0)
242 goto bad;
243 break;
244 case CRUSH_BUCKET_LIST:
245 err = crush_decode_list_bucket(p, end,
246 (struct crush_bucket_list *)b);
247 if (err < 0)
248 goto bad;
249 break;
250 case CRUSH_BUCKET_TREE:
251 err = crush_decode_tree_bucket(p, end,
252 (struct crush_bucket_tree *)b);
253 if (err < 0)
254 goto bad;
255 break;
256 case CRUSH_BUCKET_STRAW:
257 err = crush_decode_straw_bucket(p, end,
258 (struct crush_bucket_straw *)b);
259 if (err < 0)
260 goto bad;
261 break;
262 }
263 }
264
265 /* rules */
266 dout("rule vec is %p\n", c->rules);
267 for (i = 0; i < c->max_rules; i++) {
268 u32 yes;
269 struct crush_rule *r;
270
271 ceph_decode_32_safe(p, end, yes, bad);
272 if (!yes) {
273 dout("crush_decode NO rule %d off %x %p to %p\n",
274 i, (int)(*p-start), *p, end);
275 c->rules[i] = NULL;
276 continue;
277 }
278
279 dout("crush_decode rule %d off %x %p to %p\n",
280 i, (int)(*p-start), *p, end);
281
282 /* len */
283 ceph_decode_32_safe(p, end, yes, bad);
284#if BITS_PER_LONG == 32
285 err = -EINVAL;
286 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
287 goto bad;
288#endif
289 r = c->rules[i] = kmalloc(sizeof(*r) +
290 yes*sizeof(struct crush_rule_step),
291 GFP_NOFS);
292 if (r == NULL)
293 goto badmem;
294 dout(" rule %d is at %p\n", i, r);
295 r->len = yes;
296 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
297 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
298 for (j = 0; j < r->len; j++) {
299 r->steps[j].op = ceph_decode_32(p);
300 r->steps[j].arg1 = ceph_decode_32(p);
301 r->steps[j].arg2 = ceph_decode_32(p);
302 }
303 }
304
305 /* ignore trailing name maps. */
306
307 dout("crush_decode success\n");
308 return c;
309
310badmem:
311 err = -ENOMEM;
312bad:
313 dout("crush_decode fail %d\n", err);
314 crush_destroy(c);
315 return ERR_PTR(err);
316}
317
318/*
319 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
320 * to a set of osds)
321 */
322static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
323{
324 u64 a = *(u64 *)&l;
325 u64 b = *(u64 *)&r;
326
327 if (a < b)
328 return -1;
329 if (a > b)
330 return 1;
331 return 0;
332}
333
334static int __insert_pg_mapping(struct ceph_pg_mapping *new,
335 struct rb_root *root)
336{
337 struct rb_node **p = &root->rb_node;
338 struct rb_node *parent = NULL;
339 struct ceph_pg_mapping *pg = NULL;
340 int c;
341
342 while (*p) {
343 parent = *p;
344 pg = rb_entry(parent, struct ceph_pg_mapping, node);
345 c = pgid_cmp(new->pgid, pg->pgid);
346 if (c < 0)
347 p = &(*p)->rb_left;
348 else if (c > 0)
349 p = &(*p)->rb_right;
350 else
351 return -EEXIST;
352 }
353
354 rb_link_node(&new->node, parent, p);
355 rb_insert_color(&new->node, root);
356 return 0;
357}
358
359static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
360 struct ceph_pg pgid)
361{
362 struct rb_node *n = root->rb_node;
363 struct ceph_pg_mapping *pg;
364 int c;
365
366 while (n) {
367 pg = rb_entry(n, struct ceph_pg_mapping, node);
368 c = pgid_cmp(pgid, pg->pgid);
369 if (c < 0)
370 n = n->rb_left;
371 else if (c > 0)
372 n = n->rb_right;
373 else
374 return pg;
375 }
376 return NULL;
377}
378
379/*
380 * rbtree of pg pool info
381 */
382static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
383{
384 struct rb_node **p = &root->rb_node;
385 struct rb_node *parent = NULL;
386 struct ceph_pg_pool_info *pi = NULL;
387
388 while (*p) {
389 parent = *p;
390 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
391 if (new->id < pi->id)
392 p = &(*p)->rb_left;
393 else if (new->id > pi->id)
394 p = &(*p)->rb_right;
395 else
396 return -EEXIST;
397 }
398
399 rb_link_node(&new->node, parent, p);
400 rb_insert_color(&new->node, root);
401 return 0;
402}
403
404static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
405{
406 struct ceph_pg_pool_info *pi;
407 struct rb_node *n = root->rb_node;
408
409 while (n) {
410 pi = rb_entry(n, struct ceph_pg_pool_info, node);
411 if (id < pi->id)
412 n = n->rb_left;
413 else if (id > pi->id)
414 n = n->rb_right;
415 else
416 return pi;
417 }
418 return NULL;
419}
420
421int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
422{
423 struct rb_node *rbp;
424
425 for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
426 struct ceph_pg_pool_info *pi =
427 rb_entry(rbp, struct ceph_pg_pool_info, node);
428 if (pi->name && strcmp(pi->name, name) == 0)
429 return pi->id;
430 }
431 return -ENOENT;
432}
433EXPORT_SYMBOL(ceph_pg_poolid_by_name);
434
435static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
436{
437 rb_erase(&pi->node, root);
438 kfree(pi->name);
439 kfree(pi);
440}
441
442static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
443{
444 unsigned n, m;
445
446 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
447 calc_pg_masks(pi);
448
449 /* num_snaps * snap_info_t */
450 n = le32_to_cpu(pi->v.num_snaps);
451 while (n--) {
452 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
453 sizeof(struct ceph_timespec), bad);
454 *p += sizeof(u64) + /* key */
455 1 + sizeof(u64) + /* u8, snapid */
456 sizeof(struct ceph_timespec);
457 m = ceph_decode_32(p); /* snap name */
458 *p += m;
459 }
460
461 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
462 return 0;
463
464bad:
465 return -EINVAL;
466}
467
468static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
469{
470 struct ceph_pg_pool_info *pi;
471 u32 num, len, pool;
472
473 ceph_decode_32_safe(p, end, num, bad);
474 dout(" %d pool names\n", num);
475 while (num--) {
476 ceph_decode_32_safe(p, end, pool, bad);
477 ceph_decode_32_safe(p, end, len, bad);
478 dout(" pool %d len %d\n", pool, len);
479 pi = __lookup_pg_pool(&map->pg_pools, pool);
480 if (pi) {
481 kfree(pi->name);
482 pi->name = kmalloc(len + 1, GFP_NOFS);
483 if (pi->name) {
484 memcpy(pi->name, *p, len);
485 pi->name[len] = '\0';
486 dout(" name is %s\n", pi->name);
487 }
488 }
489 *p += len;
490 }
491 return 0;
492
493bad:
494 return -EINVAL;
495}
496
497/*
498 * osd map
499 */
500void ceph_osdmap_destroy(struct ceph_osdmap *map)
501{
502 dout("osdmap_destroy %p\n", map);
503 if (map->crush)
504 crush_destroy(map->crush);
505 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
506 struct ceph_pg_mapping *pg =
507 rb_entry(rb_first(&map->pg_temp),
508 struct ceph_pg_mapping, node);
509 rb_erase(&pg->node, &map->pg_temp);
510 kfree(pg);
511 }
512 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
513 struct ceph_pg_pool_info *pi =
514 rb_entry(rb_first(&map->pg_pools),
515 struct ceph_pg_pool_info, node);
516 __remove_pg_pool(&map->pg_pools, pi);
517 }
518 kfree(map->osd_state);
519 kfree(map->osd_weight);
520 kfree(map->osd_addr);
521 kfree(map);
522}
523
524/*
525 * adjust max osd value. reallocate arrays.
526 */
527static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
528{
529 u8 *state;
530 struct ceph_entity_addr *addr;
531 u32 *weight;
532
533 state = kcalloc(max, sizeof(*state), GFP_NOFS);
534 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
535 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
536 if (state == NULL || addr == NULL || weight == NULL) {
537 kfree(state);
538 kfree(addr);
539 kfree(weight);
540 return -ENOMEM;
541 }
542
543 /* copy old? */
544 if (map->osd_state) {
545 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
546 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
547 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
548 kfree(map->osd_state);
549 kfree(map->osd_addr);
550 kfree(map->osd_weight);
551 }
552
553 map->osd_state = state;
554 map->osd_weight = weight;
555 map->osd_addr = addr;
556 map->max_osd = max;
557 return 0;
558}
559
560/*
561 * decode a full map.
562 */
563struct ceph_osdmap *osdmap_decode(void **p, void *end)
564{
565 struct ceph_osdmap *map;
566 u16 version;
567 u32 len, max, i;
568 u8 ev;
569 int err = -EINVAL;
570 void *start = *p;
571 struct ceph_pg_pool_info *pi;
572
573 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
574
575 map = kzalloc(sizeof(*map), GFP_NOFS);
576 if (map == NULL)
577 return ERR_PTR(-ENOMEM);
578 map->pg_temp = RB_ROOT;
579
580 ceph_decode_16_safe(p, end, version, bad);
581 if (version > CEPH_OSDMAP_VERSION) {
582 pr_warning("got unknown v %d > %d of osdmap\n", version,
583 CEPH_OSDMAP_VERSION);
584 goto bad;
585 }
586
587 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
588 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
589 map->epoch = ceph_decode_32(p);
590 ceph_decode_copy(p, &map->created, sizeof(map->created));
591 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
592
593 ceph_decode_32_safe(p, end, max, bad);
594 while (max--) {
595 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
596 pi = kzalloc(sizeof(*pi), GFP_NOFS);
597 if (!pi)
598 goto bad;
599 pi->id = ceph_decode_32(p);
600 ev = ceph_decode_8(p); /* encoding version */
601 if (ev > CEPH_PG_POOL_VERSION) {
602 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
603 ev, CEPH_PG_POOL_VERSION);
604 kfree(pi);
605 goto bad;
606 }
607 err = __decode_pool(p, end, pi);
608 if (err < 0)
609 goto bad;
610 __insert_pg_pool(&map->pg_pools, pi);
611 }
612
613 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
614 goto bad;
615
616 ceph_decode_32_safe(p, end, map->pool_max, bad);
617
618 ceph_decode_32_safe(p, end, map->flags, bad);
619
620 max = ceph_decode_32(p);
621
622 /* (re)alloc osd arrays */
623 err = osdmap_set_max_osd(map, max);
624 if (err < 0)
625 goto bad;
626 dout("osdmap_decode max_osd = %d\n", map->max_osd);
627
628 /* osds */
629 err = -EINVAL;
630 ceph_decode_need(p, end, 3*sizeof(u32) +
631 map->max_osd*(1 + sizeof(*map->osd_weight) +
632 sizeof(*map->osd_addr)), bad);
633 *p += 4; /* skip length field (should match max) */
634 ceph_decode_copy(p, map->osd_state, map->max_osd);
635
636 *p += 4; /* skip length field (should match max) */
637 for (i = 0; i < map->max_osd; i++)
638 map->osd_weight[i] = ceph_decode_32(p);
639
640 *p += 4; /* skip length field (should match max) */
641 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
642 for (i = 0; i < map->max_osd; i++)
643 ceph_decode_addr(&map->osd_addr[i]);
644
645 /* pg_temp */
646 ceph_decode_32_safe(p, end, len, bad);
647 for (i = 0; i < len; i++) {
648 int n, j;
649 struct ceph_pg pgid;
650 struct ceph_pg_mapping *pg;
651
652 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
653 ceph_decode_copy(p, &pgid, sizeof(pgid));
654 n = ceph_decode_32(p);
655 ceph_decode_need(p, end, n * sizeof(u32), bad);
656 err = -ENOMEM;
657 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
658 if (!pg)
659 goto bad;
660 pg->pgid = pgid;
661 pg->len = n;
662 for (j = 0; j < n; j++)
663 pg->osds[j] = ceph_decode_32(p);
664
665 err = __insert_pg_mapping(pg, &map->pg_temp);
666 if (err)
667 goto bad;
668 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
669 }
670
671 /* crush */
672 ceph_decode_32_safe(p, end, len, bad);
673 dout("osdmap_decode crush len %d from off 0x%x\n", len,
674 (int)(*p - start));
675 ceph_decode_need(p, end, len, bad);
676 map->crush = crush_decode(*p, end);
677 *p += len;
678 if (IS_ERR(map->crush)) {
679 err = PTR_ERR(map->crush);
680 map->crush = NULL;
681 goto bad;
682 }
683
684 /* ignore the rest of the map */
685 *p = end;
686
687 dout("osdmap_decode done %p %p\n", *p, end);
688 return map;
689
690bad:
691 dout("osdmap_decode fail\n");
692 ceph_osdmap_destroy(map);
693 return ERR_PTR(err);
694}
695
696/*
697 * decode and apply an incremental map update.
698 */
699struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
700 struct ceph_osdmap *map,
701 struct ceph_messenger *msgr)
702{
703 struct crush_map *newcrush = NULL;
704 struct ceph_fsid fsid;
705 u32 epoch = 0;
706 struct ceph_timespec modified;
707 u32 len, pool;
708 __s32 new_pool_max, new_flags, max;
709 void *start = *p;
710 int err = -EINVAL;
711 u16 version;
712 struct rb_node *rbp;
713
714 ceph_decode_16_safe(p, end, version, bad);
715 if (version > CEPH_OSDMAP_INC_VERSION) {
716 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
717 CEPH_OSDMAP_INC_VERSION);
718 goto bad;
719 }
720
721 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
722 bad);
723 ceph_decode_copy(p, &fsid, sizeof(fsid));
724 epoch = ceph_decode_32(p);
725 BUG_ON(epoch != map->epoch+1);
726 ceph_decode_copy(p, &modified, sizeof(modified));
727 new_pool_max = ceph_decode_32(p);
728 new_flags = ceph_decode_32(p);
729
730 /* full map? */
731 ceph_decode_32_safe(p, end, len, bad);
732 if (len > 0) {
733 dout("apply_incremental full map len %d, %p to %p\n",
734 len, *p, end);
735 return osdmap_decode(p, min(*p+len, end));
736 }
737
738 /* new crush? */
739 ceph_decode_32_safe(p, end, len, bad);
740 if (len > 0) {
741 dout("apply_incremental new crush map len %d, %p to %p\n",
742 len, *p, end);
743 newcrush = crush_decode(*p, min(*p+len, end));
744 if (IS_ERR(newcrush))
745 return ERR_CAST(newcrush);
746 *p += len;
747 }
748
749 /* new flags? */
750 if (new_flags >= 0)
751 map->flags = new_flags;
752 if (new_pool_max >= 0)
753 map->pool_max = new_pool_max;
754
755 ceph_decode_need(p, end, 5*sizeof(u32), bad);
756
757 /* new max? */
758 max = ceph_decode_32(p);
759 if (max >= 0) {
760 err = osdmap_set_max_osd(map, max);
761 if (err < 0)
762 goto bad;
763 }
764
765 map->epoch++;
766 map->modified = map->modified;
767 if (newcrush) {
768 if (map->crush)
769 crush_destroy(map->crush);
770 map->crush = newcrush;
771 newcrush = NULL;
772 }
773
774 /* new_pool */
775 ceph_decode_32_safe(p, end, len, bad);
776 while (len--) {
777 __u8 ev;
778 struct ceph_pg_pool_info *pi;
779
780 ceph_decode_32_safe(p, end, pool, bad);
781 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
782 ev = ceph_decode_8(p); /* encoding version */
783 if (ev > CEPH_PG_POOL_VERSION) {
784 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
785 ev, CEPH_PG_POOL_VERSION);
786 goto bad;
787 }
788 pi = __lookup_pg_pool(&map->pg_pools, pool);
789 if (!pi) {
790 pi = kzalloc(sizeof(*pi), GFP_NOFS);
791 if (!pi) {
792 err = -ENOMEM;
793 goto bad;
794 }
795 pi->id = pool;
796 __insert_pg_pool(&map->pg_pools, pi);
797 }
798 err = __decode_pool(p, end, pi);
799 if (err < 0)
800 goto bad;
801 }
802 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
803 goto bad;
804
805 /* old_pool */
806 ceph_decode_32_safe(p, end, len, bad);
807 while (len--) {
808 struct ceph_pg_pool_info *pi;
809
810 ceph_decode_32_safe(p, end, pool, bad);
811 pi = __lookup_pg_pool(&map->pg_pools, pool);
812 if (pi)
813 __remove_pg_pool(&map->pg_pools, pi);
814 }
815
816 /* new_up */
817 err = -EINVAL;
818 ceph_decode_32_safe(p, end, len, bad);
819 while (len--) {
820 u32 osd;
821 struct ceph_entity_addr addr;
822 ceph_decode_32_safe(p, end, osd, bad);
823 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
824 ceph_decode_addr(&addr);
825 pr_info("osd%d up\n", osd);
826 BUG_ON(osd >= map->max_osd);
827 map->osd_state[osd] |= CEPH_OSD_UP;
828 map->osd_addr[osd] = addr;
829 }
830
831 /* new_down */
832 ceph_decode_32_safe(p, end, len, bad);
833 while (len--) {
834 u32 osd;
835 ceph_decode_32_safe(p, end, osd, bad);
836 (*p)++; /* clean flag */
837 pr_info("osd%d down\n", osd);
838 if (osd < map->max_osd)
839 map->osd_state[osd] &= ~CEPH_OSD_UP;
840 }
841
842 /* new_weight */
843 ceph_decode_32_safe(p, end, len, bad);
844 while (len--) {
845 u32 osd, off;
846 ceph_decode_need(p, end, sizeof(u32)*2, bad);
847 osd = ceph_decode_32(p);
848 off = ceph_decode_32(p);
849 pr_info("osd%d weight 0x%x %s\n", osd, off,
850 off == CEPH_OSD_IN ? "(in)" :
851 (off == CEPH_OSD_OUT ? "(out)" : ""));
852 if (osd < map->max_osd)
853 map->osd_weight[osd] = off;
854 }
855
856 /* new_pg_temp */
857 rbp = rb_first(&map->pg_temp);
858 ceph_decode_32_safe(p, end, len, bad);
859 while (len--) {
860 struct ceph_pg_mapping *pg;
861 int j;
862 struct ceph_pg pgid;
863 u32 pglen;
864 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
865 ceph_decode_copy(p, &pgid, sizeof(pgid));
866 pglen = ceph_decode_32(p);
867
868 /* remove any? */
869 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
870 node)->pgid, pgid) <= 0) {
871 struct ceph_pg_mapping *cur =
872 rb_entry(rbp, struct ceph_pg_mapping, node);
873
874 rbp = rb_next(rbp);
875 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
876 rb_erase(&cur->node, &map->pg_temp);
877 kfree(cur);
878 }
879
880 if (pglen) {
881 /* insert */
882 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
883 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
884 if (!pg) {
885 err = -ENOMEM;
886 goto bad;
887 }
888 pg->pgid = pgid;
889 pg->len = pglen;
890 for (j = 0; j < pglen; j++)
891 pg->osds[j] = ceph_decode_32(p);
892 err = __insert_pg_mapping(pg, &map->pg_temp);
893 if (err) {
894 kfree(pg);
895 goto bad;
896 }
897 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
898 pglen);
899 }
900 }
901 while (rbp) {
902 struct ceph_pg_mapping *cur =
903 rb_entry(rbp, struct ceph_pg_mapping, node);
904
905 rbp = rb_next(rbp);
906 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
907 rb_erase(&cur->node, &map->pg_temp);
908 kfree(cur);
909 }
910
911 /* ignore the rest */
912 *p = end;
913 return map;
914
915bad:
916 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
917 epoch, (int)(*p - start), *p, start, end);
918 print_hex_dump(KERN_DEBUG, "osdmap: ",
919 DUMP_PREFIX_OFFSET, 16, 1,
920 start, end - start, true);
921 if (newcrush)
922 crush_destroy(newcrush);
923 return ERR_PTR(err);
924}
925
926
927
928
929/*
930 * calculate file layout from given offset, length.
931 * fill in correct oid, logical length, and object extent
932 * offset, length.
933 *
934 * for now, we write only a single su, until we can
935 * pass a stride back to the caller.
936 */
937void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
938 u64 off, u64 *plen,
939 u64 *ono,
940 u64 *oxoff, u64 *oxlen)
941{
942 u32 osize = le32_to_cpu(layout->fl_object_size);
943 u32 su = le32_to_cpu(layout->fl_stripe_unit);
944 u32 sc = le32_to_cpu(layout->fl_stripe_count);
945 u32 bl, stripeno, stripepos, objsetno;
946 u32 su_per_object;
947 u64 t, su_offset;
948
949 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
950 osize, su);
951 su_per_object = osize / su;
952 dout("osize %u / su %u = su_per_object %u\n", osize, su,
953 su_per_object);
954
955 BUG_ON((su & ~PAGE_MASK) != 0);
956 /* bl = *off / su; */
957 t = off;
958 do_div(t, su);
959 bl = t;
960 dout("off %llu / su %u = bl %u\n", off, su, bl);
961
962 stripeno = bl / sc;
963 stripepos = bl % sc;
964 objsetno = stripeno / su_per_object;
965
966 *ono = objsetno * sc + stripepos;
967 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
968
969 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
970 t = off;
971 su_offset = do_div(t, su);
972 *oxoff = su_offset + (stripeno % su_per_object) * su;
973
974 /*
975 * Calculate the length of the extent being written to the selected
976 * object. This is the minimum of the full length requested (plen) or
977 * the remainder of the current stripe being written to.
978 */
979 *oxlen = min_t(u64, *plen, su - su_offset);
980 *plen = *oxlen;
981
982 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
983}
984EXPORT_SYMBOL(ceph_calc_file_object_mapping);
985
986/*
987 * calculate an object layout (i.e. pgid) from an oid,
988 * file_layout, and osdmap
989 */
990int ceph_calc_object_layout(struct ceph_object_layout *ol,
991 const char *oid,
992 struct ceph_file_layout *fl,
993 struct ceph_osdmap *osdmap)
994{
995 unsigned num, num_mask;
996 struct ceph_pg pgid;
997 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
998 int poolid = le32_to_cpu(fl->fl_pg_pool);
999 struct ceph_pg_pool_info *pool;
1000 unsigned ps;
1001
1002 BUG_ON(!osdmap);
1003
1004 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1005 if (!pool)
1006 return -EIO;
1007 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
1008 if (preferred >= 0) {
1009 ps += preferred;
1010 num = le32_to_cpu(pool->v.lpg_num);
1011 num_mask = pool->lpg_num_mask;
1012 } else {
1013 num = le32_to_cpu(pool->v.pg_num);
1014 num_mask = pool->pg_num_mask;
1015 }
1016
1017 pgid.ps = cpu_to_le16(ps);
1018 pgid.preferred = cpu_to_le16(preferred);
1019 pgid.pool = fl->fl_pg_pool;
1020 if (preferred >= 0)
1021 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
1022 (int)preferred);
1023 else
1024 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
1025
1026 ol->ol_pgid = pgid;
1027 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
1028 return 0;
1029}
1030EXPORT_SYMBOL(ceph_calc_object_layout);
1031
1032/*
1033 * Calculate raw osd vector for the given pgid. Return pointer to osd
1034 * array, or NULL on failure.
1035 */
1036static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1037 int *osds, int *num)
1038{
1039 struct ceph_pg_mapping *pg;
1040 struct ceph_pg_pool_info *pool;
1041 int ruleno;
1042 unsigned poolid, ps, pps;
1043 int preferred;
1044
1045 /* pg_temp? */
1046 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1047 if (pg) {
1048 *num = pg->len;
1049 return pg->osds;
1050 }
1051
1052 /* crush */
1053 poolid = le32_to_cpu(pgid.pool);
1054 ps = le16_to_cpu(pgid.ps);
1055 preferred = (s16)le16_to_cpu(pgid.preferred);
1056
1057 /* don't forcefeed bad device ids to crush */
1058 if (preferred >= osdmap->max_osd ||
1059 preferred >= osdmap->crush->max_devices)
1060 preferred = -1;
1061
1062 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1063 if (!pool)
1064 return NULL;
1065 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1066 pool->v.type, pool->v.size);
1067 if (ruleno < 0) {
1068 pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
1069 poolid, pool->v.crush_ruleset, pool->v.type,
1070 pool->v.size);
1071 return NULL;
1072 }
1073
1074 if (preferred >= 0)
1075 pps = ceph_stable_mod(ps,
1076 le32_to_cpu(pool->v.lpgp_num),
1077 pool->lpgp_num_mask);
1078 else
1079 pps = ceph_stable_mod(ps,
1080 le32_to_cpu(pool->v.pgp_num),
1081 pool->pgp_num_mask);
1082 pps += poolid;
1083 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1084 min_t(int, pool->v.size, *num),
1085 preferred, osdmap->osd_weight);
1086 return osds;
1087}
1088
1089/*
1090 * Return acting set for given pgid.
1091 */
1092int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1093 int *acting)
1094{
1095 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1096 int i, o, num = CEPH_PG_MAX_SIZE;
1097
1098 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1099 if (!osds)
1100 return -1;
1101
1102 /* primary is first up osd */
1103 o = 0;
1104 for (i = 0; i < num; i++)
1105 if (ceph_osd_is_up(osdmap, osds[i]))
1106 acting[o++] = osds[i];
1107 return o;
1108}
1109
1110/*
1111 * Return primary osd for given pgid, or -1 if none.
1112 */
1113int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1114{
1115 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1116 int i, num = CEPH_PG_MAX_SIZE;
1117
1118 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1119 if (!osds)
1120 return -1;
1121
1122 /* primary is first up osd */
1123 for (i = 0; i < num; i++)
1124 if (ceph_osd_is_up(osdmap, osds[i]))
1125 return osds[i];
1126 return -1;
1127}
1128EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 000000000000..13cb409a7bba
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,154 @@
1
2#include <linux/module.h>
3#include <linux/gfp.h>
4#include <linux/pagemap.h>
5#include <linux/highmem.h>
6#include <linux/ceph/pagelist.h>
7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail) {
11 struct page *page = list_entry(pl->head.prev, struct page, lru);
12 kunmap(page);
13 pl->mapped_tail = NULL;
14 }
15}
16
17int ceph_pagelist_release(struct ceph_pagelist *pl)
18{
19 ceph_pagelist_unmap_tail(pl);
20 while (!list_empty(&pl->head)) {
21 struct page *page = list_first_entry(&pl->head, struct page,
22 lru);
23 list_del(&page->lru);
24 __free_page(page);
25 }
26 ceph_pagelist_free_reserve(pl);
27 return 0;
28}
29EXPORT_SYMBOL(ceph_pagelist_release);
30
31static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
32{
33 struct page *page;
34
35 if (!pl->num_pages_free) {
36 page = __page_cache_alloc(GFP_NOFS);
37 } else {
38 page = list_first_entry(&pl->free_list, struct page, lru);
39 list_del(&page->lru);
40 --pl->num_pages_free;
41 }
42 if (!page)
43 return -ENOMEM;
44 pl->room += PAGE_SIZE;
45 ceph_pagelist_unmap_tail(pl);
46 list_add_tail(&page->lru, &pl->head);
47 pl->mapped_tail = kmap(page);
48 return 0;
49}
50
51int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
52{
53 while (pl->room < len) {
54 size_t bit = pl->room;
55 int ret;
56
57 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
58 buf, bit);
59 pl->length += bit;
60 pl->room -= bit;
61 buf += bit;
62 len -= bit;
63 ret = ceph_pagelist_addpage(pl);
64 if (ret)
65 return ret;
66 }
67
68 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
69 pl->length += len;
70 pl->room -= len;
71 return 0;
72}
73EXPORT_SYMBOL(ceph_pagelist_append);
74
75/**
76 * Allocate enough pages for a pagelist to append the given amount
77 * of data without without allocating.
78 * Returns: 0 on success, -ENOMEM on error.
79 */
80int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
81{
82 if (space <= pl->room)
83 return 0;
84 space -= pl->room;
85 space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */
86
87 while (space > pl->num_pages_free) {
88 struct page *page = __page_cache_alloc(GFP_NOFS);
89 if (!page)
90 return -ENOMEM;
91 list_add_tail(&page->lru, &pl->free_list);
92 ++pl->num_pages_free;
93 }
94 return 0;
95}
96EXPORT_SYMBOL(ceph_pagelist_reserve);
97
98/**
99 * Free any pages that have been preallocated.
100 */
101int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
102{
103 while (!list_empty(&pl->free_list)) {
104 struct page *page = list_first_entry(&pl->free_list,
105 struct page, lru);
106 list_del(&page->lru);
107 __free_page(page);
108 --pl->num_pages_free;
109 }
110 BUG_ON(pl->num_pages_free);
111 return 0;
112}
113EXPORT_SYMBOL(ceph_pagelist_free_reserve);
114
115/**
116 * Create a truncation point.
117 */
118void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
119 struct ceph_pagelist_cursor *c)
120{
121 c->pl = pl;
122 c->page_lru = pl->head.prev;
123 c->room = pl->room;
124}
125EXPORT_SYMBOL(ceph_pagelist_set_cursor);
126
127/**
128 * Truncate a pagelist to the given point. Move extra pages to reserve.
129 * This won't sleep.
130 * Returns: 0 on success,
131 * -EINVAL if the pagelist doesn't match the trunc point pagelist
132 */
133int ceph_pagelist_truncate(struct ceph_pagelist *pl,
134 struct ceph_pagelist_cursor *c)
135{
136 struct page *page;
137
138 if (pl != c->pl)
139 return -EINVAL;
140 ceph_pagelist_unmap_tail(pl);
141 while (pl->head.prev != c->page_lru) {
142 page = list_entry(pl->head.prev, struct page, lru);
143 list_del(&page->lru); /* remove from pagelist */
144 list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
145 ++pl->num_pages_free;
146 }
147 pl->room = c->room;
148 if (!list_empty(&pl->head)) {
149 page = list_entry(pl->head.prev, struct page, lru);
150 pl->mapped_tail = kmap(page);
151 }
152 return 0;
153}
154EXPORT_SYMBOL(ceph_pagelist_truncate);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 000000000000..54caf0687155
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,223 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/sched.h>
5#include <linux/slab.h>
6#include <linux/file.h>
7#include <linux/namei.h>
8#include <linux/writeback.h>
9
10#include <linux/ceph/libceph.h>
11
12/*
13 * build a vector of user pages
14 */
15struct page **ceph_get_direct_page_vector(const char __user *data,
16 int num_pages,
17 loff_t off, size_t len)
18{
19 struct page **pages;
20 int rc;
21
22 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
23 if (!pages)
24 return ERR_PTR(-ENOMEM);
25
26 down_read(&current->mm->mmap_sem);
27 rc = get_user_pages(current, current->mm, (unsigned long)data,
28 num_pages, 0, 0, pages, NULL);
29 up_read(&current->mm->mmap_sem);
30 if (rc < 0)
31 goto fail;
32 return pages;
33
34fail:
35 kfree(pages);
36 return ERR_PTR(rc);
37}
38EXPORT_SYMBOL(ceph_get_direct_page_vector);
39
/*
 * drop one reference on each of the first @num_pages entries of @pages
 * with put_page(), then free the pointer array itself
 */
void ceph_put_page_vector(struct page **pages, int num_pages)
{
	int idx = 0;

	while (idx < num_pages) {
		put_page(pages[idx]);
		idx++;
	}
	kfree(pages);
}
EXPORT_SYMBOL(ceph_put_page_vector);
49
/*
 * free the first @num_pages single (order-0) pages referenced by @pages,
 * then free the pointer array itself
 */
void ceph_release_page_vector(struct page **pages, int num_pages)
{
	int idx = 0;

	while (idx < num_pages) {
		__free_page(pages[idx]);
		idx++;
	}
	kfree(pages);
}
EXPORT_SYMBOL(ceph_release_page_vector);
59
60/*
61 * allocate a vector new pages
62 */
63struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
64{
65 struct page **pages;
66 int i;
67
68 pages = kmalloc(sizeof(*pages) * num_pages, flags);
69 if (!pages)
70 return ERR_PTR(-ENOMEM);
71 for (i = 0; i < num_pages; i++) {
72 pages[i] = __page_cache_alloc(flags);
73 if (pages[i] == NULL) {
74 ceph_release_page_vector(pages, i);
75 return ERR_PTR(-ENOMEM);
76 }
77 }
78 return pages;
79}
80EXPORT_SYMBOL(ceph_alloc_page_vector);
81
82/*
83 * copy user data into a page vector
84 */
85int ceph_copy_user_to_page_vector(struct page **pages,
86 const char __user *data,
87 loff_t off, size_t len)
88{
89 int i = 0;
90 int po = off & ~PAGE_CACHE_MASK;
91 int left = len;
92 int l, bad;
93
94 while (left > 0) {
95 l = min_t(int, PAGE_CACHE_SIZE-po, left);
96 bad = copy_from_user(page_address(pages[i]) + po, data, l);
97 if (bad == l)
98 return -EFAULT;
99 data += l - bad;
100 left -= l - bad;
101 po += l - bad;
102 if (po == PAGE_CACHE_SIZE) {
103 po = 0;
104 i++;
105 }
106 }
107 return len;
108}
109EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
110
111int ceph_copy_to_page_vector(struct page **pages,
112 const char *data,
113 loff_t off, size_t len)
114{
115 int i = 0;
116 size_t po = off & ~PAGE_CACHE_MASK;
117 size_t left = len;
118 size_t l;
119
120 while (left > 0) {
121 l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
122 memcpy(page_address(pages[i]) + po, data, l);
123 data += l;
124 left -= l;
125 po += l;
126 if (po == PAGE_CACHE_SIZE) {
127 po = 0;
128 i++;
129 }
130 }
131 return len;
132}
133EXPORT_SYMBOL(ceph_copy_to_page_vector);
134
135int ceph_copy_from_page_vector(struct page **pages,
136 char *data,
137 loff_t off, size_t len)
138{
139 int i = 0;
140 size_t po = off & ~PAGE_CACHE_MASK;
141 size_t left = len;
142 size_t l;
143
144 while (left > 0) {
145 l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
146 memcpy(data, page_address(pages[i]) + po, l);
147 data += l;
148 left -= l;
149 po += l;
150 if (po == PAGE_CACHE_SIZE) {
151 po = 0;
152 i++;
153 }
154 }
155 return len;
156}
157EXPORT_SYMBOL(ceph_copy_from_page_vector);
158
159/*
160 * copy user data from a page vector into a user pointer
161 */
162int ceph_copy_page_vector_to_user(struct page **pages,
163 char __user *data,
164 loff_t off, size_t len)
165{
166 int i = 0;
167 int po = off & ~PAGE_CACHE_MASK;
168 int left = len;
169 int l, bad;
170
171 while (left > 0) {
172 l = min_t(int, left, PAGE_CACHE_SIZE-po);
173 bad = copy_to_user(data, page_address(pages[i]) + po, l);
174 if (bad == l)
175 return -EFAULT;
176 data += l - bad;
177 left -= l - bad;
178 if (po) {
179 po += l - bad;
180 if (po == PAGE_CACHE_SIZE)
181 po = 0;
182 }
183 i++;
184 }
185 return len;
186}
187EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
188
189/*
190 * Zero an extent within a page vector. Offset is relative to the
191 * start of the first page.
192 */
193void ceph_zero_page_vector_range(int off, int len, struct page **pages)
194{
195 int i = off >> PAGE_CACHE_SHIFT;
196
197 off &= ~PAGE_CACHE_MASK;
198
199 dout("zero_page_vector_page %u~%u\n", off, len);
200
201 /* leading partial page? */
202 if (off) {
203 int end = min((int)PAGE_CACHE_SIZE, off + len);
204 dout("zeroing %d %p head from %d\n", i, pages[i],
205 (int)off);
206 zero_user_segment(pages[i], off, end);
207 len -= (end - off);
208 i++;
209 }
210 while (len >= PAGE_CACHE_SIZE) {
211 dout("zeroing %d %p len=%d\n", i, pages[i], len);
212 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
213 len -= PAGE_CACHE_SIZE;
214 i++;
215 }
216 /* trailing partial page? */
217 if (len) {
218 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
219 zero_user_segment(pages[i], 0, len);
220 }
221}
222EXPORT_SYMBOL(ceph_zero_page_vector_range);
223