aboutsummaryrefslogtreecommitdiffstats
path: root/net/ceph
diff options
context:
space:
mode:
authorJonathan Herman <hermanjl@cs.unc.edu>2013-01-17 16:15:55 -0500
committerJonathan Herman <hermanjl@cs.unc.edu>2013-01-17 16:15:55 -0500
commit8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
treea8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /net/ceph
parent406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'net/ceph')
-rw-r--r--net/ceph/Kconfig14
-rw-r--r--net/ceph/auth_none.c15
-rw-r--r--net/ceph/auth_x.c15
-rw-r--r--net/ceph/auth_x.h6
-rw-r--r--net/ceph/ceph_common.c81
-rw-r--r--net/ceph/ceph_hash.c6
-rw-r--r--net/ceph/crush/crush.c39
-rw-r--r--net/ceph/crush/mapper.c148
-rw-r--r--net/ceph/crypto.c13
-rw-r--r--net/ceph/crypto.h3
-rw-r--r--net/ceph/debugfs.c10
-rw-r--r--net/ceph/messenger.c1688
-rw-r--r--net/ceph/mon_client.c204
-rw-r--r--net/ceph/msgpool.c7
-rw-r--r--net/ceph/osd_client.c328
-rw-r--r--net/ceph/osdmap.c206
-rw-r--r--net/ceph/pagelist.c19
17 files changed, 1167 insertions, 1635 deletions
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
index cc04dd667a1..be683f2d401 100644
--- a/net/ceph/Kconfig
+++ b/net/ceph/Kconfig
@@ -27,17 +27,3 @@ config CEPH_LIB_PRETTYDEBUG
27 27
28 If unsure, say N. 28 If unsure, say N.
29 29
30config CEPH_LIB_USE_DNS_RESOLVER
31 bool "Use in-kernel support for DNS lookup"
32 depends on CEPH_LIB
33 select DNS_RESOLVER
34 default n
35 help
36 If you say Y here, hostnames (e.g. monitor addresses) will
37 be resolved using the CONFIG_DNS_RESOLVER facility.
38
39 For information on how to use CONFIG_DNS_RESOLVER consult
40 Documentation/networking/dns_resolver.txt
41
42 If unsure, say N.
43
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index 925ca583c09..214c2bb43d6 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -59,7 +59,9 @@ static int handle_reply(struct ceph_auth_client *ac, int result,
59 */ 59 */
60static int ceph_auth_none_create_authorizer( 60static int ceph_auth_none_create_authorizer(
61 struct ceph_auth_client *ac, int peer_type, 61 struct ceph_auth_client *ac, int peer_type,
62 struct ceph_auth_handshake *auth) 62 struct ceph_authorizer **a,
63 void **buf, size_t *len,
64 void **reply_buf, size_t *reply_len)
63{ 65{
64 struct ceph_auth_none_info *ai = ac->private; 66 struct ceph_auth_none_info *ai = ac->private;
65 struct ceph_none_authorizer *au = &ai->au; 67 struct ceph_none_authorizer *au = &ai->au;
@@ -80,12 +82,11 @@ static int ceph_auth_none_create_authorizer(
80 dout("built authorizer len %d\n", au->buf_len); 82 dout("built authorizer len %d\n", au->buf_len);
81 } 83 }
82 84
83 auth->authorizer = (struct ceph_authorizer *) au; 85 *a = (struct ceph_authorizer *)au;
84 auth->authorizer_buf = au->buf; 86 *buf = au->buf;
85 auth->authorizer_buf_len = au->buf_len; 87 *len = au->buf_len;
86 auth->authorizer_reply_buf = au->reply_buf; 88 *reply_buf = au->reply_buf;
87 auth->authorizer_reply_buf_len = sizeof (au->reply_buf); 89 *reply_len = sizeof(au->reply_buf);
88
89 return 0; 90 return 0;
90 91
91bad2: 92bad2:
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index a16bf14eb02..1587dc6010c 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -526,7 +526,9 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
526 526
527static int ceph_x_create_authorizer( 527static int ceph_x_create_authorizer(
528 struct ceph_auth_client *ac, int peer_type, 528 struct ceph_auth_client *ac, int peer_type,
529 struct ceph_auth_handshake *auth) 529 struct ceph_authorizer **a,
530 void **buf, size_t *len,
531 void **reply_buf, size_t *reply_len)
530{ 532{
531 struct ceph_x_authorizer *au; 533 struct ceph_x_authorizer *au;
532 struct ceph_x_ticket_handler *th; 534 struct ceph_x_ticket_handler *th;
@@ -546,12 +548,11 @@ static int ceph_x_create_authorizer(
546 return ret; 548 return ret;
547 } 549 }
548 550
549 auth->authorizer = (struct ceph_authorizer *) au; 551 *a = (struct ceph_authorizer *)au;
550 auth->authorizer_buf = au->buf->vec.iov_base; 552 *buf = au->buf->vec.iov_base;
551 auth->authorizer_buf_len = au->buf->vec.iov_len; 553 *len = au->buf->vec.iov_len;
552 auth->authorizer_reply_buf = au->reply_buf; 554 *reply_buf = au->reply_buf;
553 auth->authorizer_reply_buf_len = sizeof (au->reply_buf); 555 *reply_len = sizeof(au->reply_buf);
554
555 return 0; 556 return 0;
556} 557}
557 558
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index f459e93b774..e02da7a5c5a 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -13,7 +13,7 @@
13 */ 13 */
14struct ceph_x_ticket_handler { 14struct ceph_x_ticket_handler {
15 struct rb_node node; 15 struct rb_node node;
16 unsigned int service; 16 unsigned service;
17 17
18 struct ceph_crypto_key session_key; 18 struct ceph_crypto_key session_key;
19 struct ceph_timespec validity; 19 struct ceph_timespec validity;
@@ -27,7 +27,7 @@ struct ceph_x_ticket_handler {
27 27
28struct ceph_x_authorizer { 28struct ceph_x_authorizer {
29 struct ceph_buffer *buf; 29 struct ceph_buffer *buf;
30 unsigned int service; 30 unsigned service;
31 u64 nonce; 31 u64 nonce;
32 char reply_buf[128]; /* big enough for encrypted blob */ 32 char reply_buf[128]; /* big enough for encrypted blob */
33}; 33};
@@ -38,7 +38,7 @@ struct ceph_x_info {
38 bool starting; 38 bool starting;
39 u64 server_challenge; 39 u64 server_challenge;
40 40
41 unsigned int have_keys; 41 unsigned have_keys;
42 struct rb_root ticket_handlers; 42 struct rb_root ticket_handlers;
43 43
44 struct ceph_x_authorizer auth_authorizer; 44 struct ceph_x_authorizer auth_authorizer;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index ee71ea26777..2883ea01e68 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -17,7 +17,6 @@
17#include <linux/string.h> 17#include <linux/string.h>
18 18
19 19
20#include <linux/ceph/ceph_features.h>
21#include <linux/ceph/libceph.h> 20#include <linux/ceph/libceph.h>
22#include <linux/ceph/debugfs.h> 21#include <linux/ceph/debugfs.h>
23#include <linux/ceph/decode.h> 22#include <linux/ceph/decode.h>
@@ -84,7 +83,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
84 return -1; 83 return -1;
85 } 84 }
86 } else { 85 } else {
86 pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
87 memcpy(&client->fsid, fsid, sizeof(*fsid)); 87 memcpy(&client->fsid, fsid, sizeof(*fsid));
88 ceph_debugfs_client_init(client);
89 client->have_fsid = true;
88 } 90 }
89 return 0; 91 return 0;
90} 92}
@@ -201,9 +203,7 @@ enum {
201 Opt_ip, 203 Opt_ip,
202 Opt_last_string, 204 Opt_last_string,
203 /* string args above */ 205 /* string args above */
204 Opt_share,
205 Opt_noshare, 206 Opt_noshare,
206 Opt_crc,
207 Opt_nocrc, 207 Opt_nocrc,
208}; 208};
209 209
@@ -219,9 +219,7 @@ static match_table_t opt_tokens = {
219 {Opt_key, "key=%s"}, 219 {Opt_key, "key=%s"},
220 {Opt_ip, "ip=%s"}, 220 {Opt_ip, "ip=%s"},
221 /* string args above */ 221 /* string args above */
222 {Opt_share, "share"},
223 {Opt_noshare, "noshare"}, 222 {Opt_noshare, "noshare"},
224 {Opt_crc, "crc"},
225 {Opt_nocrc, "nocrc"}, 223 {Opt_nocrc, "nocrc"},
226 {-1, NULL} 224 {-1, NULL}
227}; 225};
@@ -281,11 +279,10 @@ out:
281 return err; 279 return err;
282} 280}
283 281
284struct ceph_options * 282int ceph_parse_options(struct ceph_options **popt, char *options,
285ceph_parse_options(char *options, const char *dev_name, 283 const char *dev_name, const char *dev_name_end,
286 const char *dev_name_end, 284 int (*parse_extra_token)(char *c, void *private),
287 int (*parse_extra_token)(char *c, void *private), 285 void *private)
288 void *private)
289{ 286{
290 struct ceph_options *opt; 287 struct ceph_options *opt;
291 const char *c; 288 const char *c;
@@ -294,7 +291,7 @@ ceph_parse_options(char *options, const char *dev_name,
294 291
295 opt = kzalloc(sizeof(*opt), GFP_KERNEL); 292 opt = kzalloc(sizeof(*opt), GFP_KERNEL);
296 if (!opt) 293 if (!opt)
297 return ERR_PTR(-ENOMEM); 294 return err;
298 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), 295 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
299 GFP_KERNEL); 296 GFP_KERNEL);
300 if (!opt->mon_addr) 297 if (!opt->mon_addr)
@@ -305,6 +302,7 @@ ceph_parse_options(char *options, const char *dev_name,
305 302
306 /* start with defaults */ 303 /* start with defaults */
307 opt->flags = CEPH_OPT_DEFAULT; 304 opt->flags = CEPH_OPT_DEFAULT;
305 opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
308 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 306 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
309 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ 307 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
310 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ 308 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
@@ -390,7 +388,7 @@ ceph_parse_options(char *options, const char *dev_name,
390 388
391 /* misc */ 389 /* misc */
392 case Opt_osdtimeout: 390 case Opt_osdtimeout:
393 pr_warning("ignoring deprecated osdtimeout option\n"); 391 opt->osd_timeout = intval;
394 break; 392 break;
395 case Opt_osdkeepalivetimeout: 393 case Opt_osdkeepalivetimeout:
396 opt->osd_keepalive_timeout = intval; 394 opt->osd_keepalive_timeout = intval;
@@ -402,16 +400,10 @@ ceph_parse_options(char *options, const char *dev_name,
402 opt->mount_timeout = intval; 400 opt->mount_timeout = intval;
403 break; 401 break;
404 402
405 case Opt_share:
406 opt->flags &= ~CEPH_OPT_NOSHARE;
407 break;
408 case Opt_noshare: 403 case Opt_noshare:
409 opt->flags |= CEPH_OPT_NOSHARE; 404 opt->flags |= CEPH_OPT_NOSHARE;
410 break; 405 break;
411 406
412 case Opt_crc:
413 opt->flags &= ~CEPH_OPT_NOCRC;
414 break;
415 case Opt_nocrc: 407 case Opt_nocrc:
416 opt->flags |= CEPH_OPT_NOCRC; 408 opt->flags |= CEPH_OPT_NOCRC;
417 break; 409 break;
@@ -422,11 +414,12 @@ ceph_parse_options(char *options, const char *dev_name,
422 } 414 }
423 415
424 /* success */ 416 /* success */
425 return opt; 417 *popt = opt;
418 return 0;
426 419
427out: 420out:
428 ceph_destroy_options(opt); 421 ceph_destroy_options(opt);
429 return ERR_PTR(err); 422 return err;
430} 423}
431EXPORT_SYMBOL(ceph_parse_options); 424EXPORT_SYMBOL(ceph_parse_options);
432 425
@@ -439,12 +432,9 @@ EXPORT_SYMBOL(ceph_client_id);
439/* 432/*
440 * create a fresh client instance 433 * create a fresh client instance
441 */ 434 */
442struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, 435struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
443 unsigned int supported_features,
444 unsigned int required_features)
445{ 436{
446 struct ceph_client *client; 437 struct ceph_client *client;
447 struct ceph_entity_addr *myaddr = NULL;
448 int err = -ENOMEM; 438 int err = -ENOMEM;
449 439
450 client = kzalloc(sizeof(*client), GFP_KERNEL); 440 client = kzalloc(sizeof(*client), GFP_KERNEL);
@@ -459,18 +449,10 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
459 client->auth_err = 0; 449 client->auth_err = 0;
460 450
461 client->extra_mon_dispatch = NULL; 451 client->extra_mon_dispatch = NULL;
462 client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT | 452 client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
463 supported_features; 453 client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
464 client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT | 454
465 required_features; 455 client->msgr = NULL;
466
467 /* msgr */
468 if (ceph_test_opt(client, MYIP))
469 myaddr = &client->options->my_addr;
470 ceph_messenger_init(&client->msgr, myaddr,
471 client->supported_features,
472 client->required_features,
473 ceph_test_opt(client, NOCRC));
474 456
475 /* subsystems */ 457 /* subsystems */
476 err = ceph_monc_init(&client->monc, client); 458 err = ceph_monc_init(&client->monc, client);
@@ -494,15 +476,23 @@ void ceph_destroy_client(struct ceph_client *client)
494{ 476{
495 dout("destroy_client %p\n", client); 477 dout("destroy_client %p\n", client);
496 478
497 atomic_set(&client->msgr.stopping, 1);
498
499 /* unmount */ 479 /* unmount */
500 ceph_osdc_stop(&client->osdc); 480 ceph_osdc_stop(&client->osdc);
501 481
482 /*
483 * make sure osd connections close out before destroying the
484 * auth module, which is needed to free those connections'
485 * ceph_authorizers.
486 */
487 ceph_msgr_flush();
488
502 ceph_monc_stop(&client->monc); 489 ceph_monc_stop(&client->monc);
503 490
504 ceph_debugfs_client_cleanup(client); 491 ceph_debugfs_client_cleanup(client);
505 492
493 if (client->msgr)
494 ceph_messenger_destroy(client->msgr);
495
506 ceph_destroy_options(client->options); 496 ceph_destroy_options(client->options);
507 497
508 kfree(client); 498 kfree(client);
@@ -524,9 +514,24 @@ static int have_mon_and_osd_map(struct ceph_client *client)
524 */ 514 */
525int __ceph_open_session(struct ceph_client *client, unsigned long started) 515int __ceph_open_session(struct ceph_client *client, unsigned long started)
526{ 516{
517 struct ceph_entity_addr *myaddr = NULL;
527 int err; 518 int err;
528 unsigned long timeout = client->options->mount_timeout * HZ; 519 unsigned long timeout = client->options->mount_timeout * HZ;
529 520
521 /* initialize the messenger */
522 if (client->msgr == NULL) {
523 if (ceph_test_opt(client, MYIP))
524 myaddr = &client->options->my_addr;
525 client->msgr = ceph_messenger_create(myaddr,
526 client->supported_features,
527 client->required_features);
528 if (IS_ERR(client->msgr)) {
529 client->msgr = NULL;
530 return PTR_ERR(client->msgr);
531 }
532 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
533 }
534
530 /* open session, and wait for mon and osd maps */ 535 /* open session, and wait for mon and osd maps */
531 err = ceph_monc_open_session(&client->monc); 536 err = ceph_monc_open_session(&client->monc);
532 if (err < 0) 537 if (err < 0)
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
index 67bb1f11e61..0a1b53bce76 100644
--- a/net/ceph/ceph_hash.c
+++ b/net/ceph/ceph_hash.c
@@ -20,7 +20,7 @@
20 c = c - a; c = c - b; c = c ^ (b >> 15); \ 20 c = c - a; c = c - b; c = c ^ (b >> 15); \
21 } while (0) 21 } while (0)
22 22
23unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length) 23unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
24{ 24{
25 const unsigned char *k = (const unsigned char *)str; 25 const unsigned char *k = (const unsigned char *)str;
26 __u32 a, b, c; /* the internal state */ 26 __u32 a, b, c; /* the internal state */
@@ -81,7 +81,7 @@ unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length)
81/* 81/*
82 * linux dcache hash 82 * linux dcache hash
83 */ 83 */
84unsigned int ceph_str_hash_linux(const char *str, unsigned int length) 84unsigned ceph_str_hash_linux(const char *str, unsigned length)
85{ 85{
86 unsigned long hash = 0; 86 unsigned long hash = 0;
87 unsigned char c; 87 unsigned char c;
@@ -94,7 +94,7 @@ unsigned int ceph_str_hash_linux(const char *str, unsigned int length)
94} 94}
95 95
96 96
97unsigned int ceph_str_hash(int type, const char *s, unsigned int len) 97unsigned ceph_str_hash(int type, const char *s, unsigned len)
98{ 98{
99 switch (type) { 99 switch (type) {
100 case CEPH_STR_HASH_LINUX: 100 case CEPH_STR_HASH_LINUX:
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 089613234f0..d6ebb13a18a 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -26,9 +26,9 @@ const char *crush_bucket_alg_name(int alg)
26 * @b: bucket pointer 26 * @b: bucket pointer
27 * @p: item index in bucket 27 * @p: item index in bucket
28 */ 28 */
29int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) 29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{ 30{
31 if ((__u32)p >= b->size) 31 if (p >= b->size)
32 return 0; 32 return 0;
33 33
34 switch (b->alg) { 34 switch (b->alg) {
@@ -37,13 +37,38 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
37 case CRUSH_BUCKET_LIST: 37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p]; 38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE: 39 case CRUSH_BUCKET_TREE:
40 return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)]; 40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
41 case CRUSH_BUCKET_STRAW: 43 case CRUSH_BUCKET_STRAW:
42 return ((struct crush_bucket_straw *)b)->item_weights[p]; 44 return ((struct crush_bucket_straw *)b)->item_weights[p];
43 } 45 }
44 return 0; 46 return 0;
45} 47}
46 48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
47void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) 72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
48{ 73{
49 kfree(b->h.perm); 74 kfree(b->h.perm);
@@ -62,8 +87,6 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
62 87
63void crush_destroy_bucket_tree(struct crush_bucket_tree *b) 88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
64{ 89{
65 kfree(b->h.perm);
66 kfree(b->h.items);
67 kfree(b->node_weights); 90 kfree(b->node_weights);
68 kfree(b); 91 kfree(b);
69} 92}
@@ -101,9 +124,10 @@ void crush_destroy_bucket(struct crush_bucket *b)
101 */ 124 */
102void crush_destroy(struct crush_map *map) 125void crush_destroy(struct crush_map *map)
103{ 126{
127 int b;
128
104 /* buckets */ 129 /* buckets */
105 if (map->buckets) { 130 if (map->buckets) {
106 __s32 b;
107 for (b = 0; b < map->max_buckets; b++) { 131 for (b = 0; b < map->max_buckets; b++) {
108 if (map->buckets[b] == NULL) 132 if (map->buckets[b] == NULL)
109 continue; 133 continue;
@@ -114,12 +138,13 @@ void crush_destroy(struct crush_map *map)
114 138
115 /* rules */ 139 /* rules */
116 if (map->rules) { 140 if (map->rules) {
117 __u32 b;
118 for (b = 0; b < map->max_rules; b++) 141 for (b = 0; b < map->max_rules; b++)
119 kfree(map->rules[b]); 142 kfree(map->rules[b]);
120 kfree(map->rules); 143 kfree(map->rules);
121 } 144 }
122 145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
123 kfree(map); 148 kfree(map);
124} 149}
125 150
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 35fce755ce1..42599e31dca 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -20,7 +20,6 @@
20 20
21#include <linux/crush/crush.h> 21#include <linux/crush/crush.h>
22#include <linux/crush/hash.h> 22#include <linux/crush/hash.h>
23#include <linux/crush/mapper.h>
24 23
25/* 24/*
26 * Implement the core CRUSH mapping algorithm. 25 * Implement the core CRUSH mapping algorithm.
@@ -33,9 +32,9 @@
33 * @type: storage ruleset type (user defined) 32 * @type: storage ruleset type (user defined)
34 * @size: output set size 33 * @size: output set size
35 */ 34 */
36int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size) 35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
37{ 36{
38 __u32 i; 37 int i;
39 38
40 for (i = 0; i < map->max_rules; i++) { 39 for (i = 0; i < map->max_rules; i++) {
41 if (map->rules[i] && 40 if (map->rules[i] &&
@@ -69,11 +68,11 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
69static int bucket_perm_choose(struct crush_bucket *bucket, 68static int bucket_perm_choose(struct crush_bucket *bucket,
70 int x, int r) 69 int x, int r)
71{ 70{
72 unsigned int pr = r % bucket->size; 71 unsigned pr = r % bucket->size;
73 unsigned int i, s; 72 unsigned i, s;
74 73
75 /* start a new permutation if @x has changed */ 74 /* start a new permutation if @x has changed */
76 if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { 75 if (bucket->perm_x != x || bucket->perm_n == 0) {
77 dprintk("bucket %d new x=%d\n", bucket->id, x); 76 dprintk("bucket %d new x=%d\n", bucket->id, x);
78 bucket->perm_x = x; 77 bucket->perm_x = x;
79 78
@@ -101,13 +100,13 @@ static int bucket_perm_choose(struct crush_bucket *bucket,
101 for (i = 0; i < bucket->perm_n; i++) 100 for (i = 0; i < bucket->perm_n; i++)
102 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); 101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
103 while (bucket->perm_n <= pr) { 102 while (bucket->perm_n <= pr) {
104 unsigned int p = bucket->perm_n; 103 unsigned p = bucket->perm_n;
105 /* no point in swapping the final entry */ 104 /* no point in swapping the final entry */
106 if (p < bucket->size - 1) { 105 if (p < bucket->size - 1) {
107 i = crush_hash32_3(bucket->hash, x, bucket->id, p) % 106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
108 (bucket->size - p); 107 (bucket->size - p);
109 if (i) { 108 if (i) {
110 unsigned int t = bucket->perm[p + i]; 109 unsigned t = bucket->perm[p + i];
111 bucket->perm[p + i] = bucket->perm[p]; 110 bucket->perm[p + i] = bucket->perm[p];
112 bucket->perm[p] = t; 111 bucket->perm[p] = t;
113 } 112 }
@@ -153,8 +152,8 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
153 return bucket->h.items[i]; 152 return bucket->h.items[i];
154 } 153 }
155 154
156 dprintk("bad list sums for bucket %d\n", bucket->h.id); 155 BUG_ON(1);
157 return bucket->h.items[0]; 156 return 0;
158} 157}
159 158
160 159
@@ -220,7 +219,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
220static int bucket_straw_choose(struct crush_bucket_straw *bucket, 219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
221 int x, int r) 220 int x, int r)
222{ 221{
223 __u32 i; 222 int i;
224 int high = 0; 223 int high = 0;
225 __u64 high_draw = 0; 224 __u64 high_draw = 0;
226 __u64 draw; 225 __u64 draw;
@@ -240,7 +239,6 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
240static int crush_bucket_choose(struct crush_bucket *in, int x, int r) 239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
241{ 240{
242 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); 241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
243 BUG_ON(in->size == 0);
244 switch (in->alg) { 242 switch (in->alg) {
245 case CRUSH_BUCKET_UNIFORM: 243 case CRUSH_BUCKET_UNIFORM:
246 return bucket_uniform_choose((struct crush_bucket_uniform *)in, 244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
@@ -255,7 +253,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
255 return bucket_straw_choose((struct crush_bucket_straw *)in, 253 return bucket_straw_choose((struct crush_bucket_straw *)in,
256 x, r); 254 x, r);
257 default: 255 default:
258 dprintk("unknown bucket %d alg %d\n", in->id, in->alg); 256 BUG_ON(1);
259 return in->items[0]; 257 return in->items[0];
260 } 258 }
261} 259}
@@ -264,7 +262,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
264 * true if device is marked "out" (failed, fully offloaded) 262 * true if device is marked "out" (failed, fully offloaded)
265 * of the cluster 263 * of the cluster
266 */ 264 */
267static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) 265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
268{ 266{
269 if (weight[item] >= 0x10000) 267 if (weight[item] >= 0x10000)
270 return 0; 268 return 0;
@@ -289,16 +287,16 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
289 * @recurse_to_leaf: true if we want one device under each item of given type 287 * @recurse_to_leaf: true if we want one device under each item of given type
290 * @out2: second output vector for leaf items (if @recurse_to_leaf) 288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
291 */ 289 */
292static int crush_choose(const struct crush_map *map, 290static int crush_choose(struct crush_map *map,
293 struct crush_bucket *bucket, 291 struct crush_bucket *bucket,
294 const __u32 *weight, 292 __u32 *weight,
295 int x, int numrep, int type, 293 int x, int numrep, int type,
296 int *out, int outpos, 294 int *out, int outpos,
297 int firstn, int recurse_to_leaf, 295 int firstn, int recurse_to_leaf,
298 int *out2) 296 int *out2)
299{ 297{
300 int rep; 298 int rep;
301 unsigned int ftotal, flocal; 299 int ftotal, flocal;
302 int retry_descent, retry_bucket, skip_rep; 300 int retry_descent, retry_bucket, skip_rep;
303 struct crush_bucket *in = bucket; 301 struct crush_bucket *in = bucket;
304 int r; 302 int r;
@@ -306,6 +304,7 @@ static int crush_choose(const struct crush_map *map,
306 int item = 0; 304 int item = 0;
307 int itemtype; 305 int itemtype;
308 int collide, reject; 306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
309 308
310 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
311 bucket->id, x, outpos, numrep); 310 bucket->id, x, outpos, numrep);
@@ -326,7 +325,7 @@ static int crush_choose(const struct crush_map *map,
326 r = rep; 325 r = rep;
327 if (in->alg == CRUSH_BUCKET_UNIFORM) { 326 if (in->alg == CRUSH_BUCKET_UNIFORM) {
328 /* be careful */ 327 /* be careful */
329 if (firstn || (__u32)numrep >= in->size) 328 if (firstn || numrep >= in->size)
330 /* r' = r + f_total */ 329 /* r' = r + f_total */
331 r += ftotal; 330 r += ftotal;
332 else if (in->size % numrep == 0) 331 else if (in->size % numrep == 0)
@@ -350,17 +349,12 @@ static int crush_choose(const struct crush_map *map,
350 reject = 1; 349 reject = 1;
351 goto reject; 350 goto reject;
352 } 351 }
353 if (map->choose_local_fallback_tries > 0 && 352 if (flocal >= (in->size>>1) &&
354 flocal >= (in->size>>1) && 353 flocal > orig_tries)
355 flocal > map->choose_local_fallback_tries)
356 item = bucket_perm_choose(in, x, r); 354 item = bucket_perm_choose(in, x, r);
357 else 355 else
358 item = crush_bucket_choose(in, x, r); 356 item = crush_bucket_choose(in, x, r);
359 if (item >= map->max_devices) { 357 BUG_ON(item >= map->max_devices);
360 dprintk(" bad item %d\n", item);
361 skip_rep = 1;
362 break;
363 }
364 358
365 /* desired type? */ 359 /* desired type? */
366 if (item < 0) 360 if (item < 0)
@@ -371,12 +365,8 @@ static int crush_choose(const struct crush_map *map,
371 365
372 /* keep going? */ 366 /* keep going? */
373 if (itemtype != type) { 367 if (itemtype != type) {
374 if (item >= 0 || 368 BUG_ON(item >= 0 ||
375 (-1-item) >= map->max_buckets) { 369 (-1-item) >= map->max_buckets);
376 dprintk(" bad item type %d\n", type);
377 skip_rep = 1;
378 break;
379 }
380 in = map->buckets[-1-item]; 370 in = map->buckets[-1-item];
381 retry_bucket = 1; 371 retry_bucket = 1;
382 continue; 372 continue;
@@ -422,21 +412,20 @@ reject:
422 ftotal++; 412 ftotal++;
423 flocal++; 413 flocal++;
424 414
425 if (collide && flocal <= map->choose_local_tries) 415 if (collide && flocal < 3)
426 /* retry locally a few times */ 416 /* retry locally a few times */
427 retry_bucket = 1; 417 retry_bucket = 1;
428 else if (map->choose_local_fallback_tries > 0 && 418 else if (flocal < in->size + orig_tries)
429 flocal <= in->size + map->choose_local_fallback_tries)
430 /* exhaustive bucket search */ 419 /* exhaustive bucket search */
431 retry_bucket = 1; 420 retry_bucket = 1;
432 else if (ftotal <= map->choose_total_tries) 421 else if (ftotal < 20)
433 /* then retry descent */ 422 /* then retry descent */
434 retry_descent = 1; 423 retry_descent = 1;
435 else 424 else
436 /* else give up */ 425 /* else give up */
437 skip_rep = 1; 426 skip_rep = 1;
438 dprintk(" reject %d collide %d " 427 dprintk(" reject %d collide %d "
439 "ftotal %u flocal %u\n", 428 "ftotal %d flocal %d\n",
440 reject, collide, ftotal, 429 reject, collide, ftotal,
441 flocal); 430 flocal);
442 } 431 }
@@ -465,12 +454,15 @@ reject:
465 * @x: hash input 454 * @x: hash input
466 * @result: pointer to result vector 455 * @result: pointer to result vector
467 * @result_max: maximum result size 456 * @result_max: maximum result size
457 * @force: force initial replica choice; -1 for none
468 */ 458 */
469int crush_do_rule(const struct crush_map *map, 459int crush_do_rule(struct crush_map *map,
470 int ruleno, int x, int *result, int result_max, 460 int ruleno, int x, int *result, int result_max,
471 const __u32 *weight) 461 int force, __u32 *weight)
472{ 462{
473 int result_len; 463 int result_len;
464 int force_context[CRUSH_MAX_DEPTH];
465 int force_pos = -1;
474 int a[CRUSH_MAX_SET]; 466 int a[CRUSH_MAX_SET];
475 int b[CRUSH_MAX_SET]; 467 int b[CRUSH_MAX_SET];
476 int c[CRUSH_MAX_SET]; 468 int c[CRUSH_MAX_SET];
@@ -481,44 +473,67 @@ int crush_do_rule(const struct crush_map *map,
481 int osize; 473 int osize;
482 int *tmp; 474 int *tmp;
483 struct crush_rule *rule; 475 struct crush_rule *rule;
484 __u32 step; 476 int step;
485 int i, j; 477 int i, j;
486 int numrep; 478 int numrep;
487 int firstn; 479 int firstn;
480 int rc = -1;
488 481
489 if ((__u32)ruleno >= map->max_rules) { 482 BUG_ON(ruleno >= map->max_rules);
490 dprintk(" bad ruleno %d\n", ruleno);
491 return 0;
492 }
493 483
494 rule = map->rules[ruleno]; 484 rule = map->rules[ruleno];
495 result_len = 0; 485 result_len = 0;
496 w = a; 486 w = a;
497 o = b; 487 o = b;
498 488
499 for (step = 0; step < rule->len; step++) { 489 /*
500 struct crush_rule_step *curstep = &rule->steps[step]; 490 * determine hierarchical context of force, if any. note
491 * that this may or may not correspond to the specific types
492 * referenced by the crush rule.
493 */
494 if (force >= 0) {
495 if (force >= map->max_devices ||
496 map->device_parents[force] == 0) {
497 /*dprintk("CRUSH: forcefed device dne\n");*/
498 rc = -1; /* force fed device dne */
499 goto out;
500 }
501 if (!is_out(map, weight, force, x)) {
502 while (1) {
503 force_context[++force_pos] = force;
504 if (force >= 0)
505 force = map->device_parents[force];
506 else
507 force = map->bucket_parents[-1-force];
508 if (force == 0)
509 break;
510 }
511 }
512 }
501 513
514 for (step = 0; step < rule->len; step++) {
502 firstn = 0; 515 firstn = 0;
503 switch (curstep->op) { 516 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE: 517 case CRUSH_RULE_TAKE:
505 w[0] = curstep->arg1; 518 w[0] = rule->steps[step].arg1;
519 if (force_pos >= 0) {
520 BUG_ON(force_context[force_pos] != w[0]);
521 force_pos--;
522 }
506 wsize = 1; 523 wsize = 1;
507 break; 524 break;
508 525
509 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: 526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
510 case CRUSH_RULE_CHOOSE_FIRSTN: 527 case CRUSH_RULE_CHOOSE_FIRSTN:
511 firstn = 1; 528 firstn = 1;
512 /* fall through */
513 case CRUSH_RULE_CHOOSE_LEAF_INDEP: 529 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
514 case CRUSH_RULE_CHOOSE_INDEP: 530 case CRUSH_RULE_CHOOSE_INDEP:
515 if (wsize == 0) 531 BUG_ON(wsize == 0);
516 break;
517 532
518 recurse_to_leaf = 533 recurse_to_leaf =
519 curstep->op == 534 rule->steps[step].op ==
520 CRUSH_RULE_CHOOSE_LEAF_FIRSTN || 535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
521 curstep->op == 536 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_INDEP; 537 CRUSH_RULE_CHOOSE_LEAF_INDEP;
523 538
524 /* reset output */ 539 /* reset output */
@@ -530,18 +545,32 @@ int crush_do_rule(const struct crush_map *map,
530 * basically, numrep <= 0 means relative to 545 * basically, numrep <= 0 means relative to
531 * the provided result_max 546 * the provided result_max
532 */ 547 */
533 numrep = curstep->arg1; 548 numrep = rule->steps[step].arg1;
534 if (numrep <= 0) { 549 if (numrep <= 0) {
535 numrep += result_max; 550 numrep += result_max;
536 if (numrep <= 0) 551 if (numrep <= 0)
537 continue; 552 continue;
538 } 553 }
539 j = 0; 554 j = 0;
555 if (osize == 0 && force_pos >= 0) {
556 /* skip any intermediate types */
557 while (force_pos &&
558 force_context[force_pos] < 0 &&
559 rule->steps[step].arg2 !=
560 map->buckets[-1 -
561 force_context[force_pos]]->type)
562 force_pos--;
563 o[osize] = force_context[force_pos];
564 if (recurse_to_leaf)
565 c[osize] = force_context[0];
566 j++;
567 force_pos--;
568 }
540 osize += crush_choose(map, 569 osize += crush_choose(map,
541 map->buckets[-1-w[i]], 570 map->buckets[-1-w[i]],
542 weight, 571 weight,
543 x, numrep, 572 x, numrep,
544 curstep->arg2, 573 rule->steps[step].arg2,
545 o+osize, j, 574 o+osize, j,
546 firstn, 575 firstn,
547 recurse_to_leaf, c+osize); 576 recurse_to_leaf, c+osize);
@@ -568,12 +597,13 @@ int crush_do_rule(const struct crush_map *map,
568 break; 597 break;
569 598
570 default: 599 default:
571 dprintk(" unknown op %d at step %d\n", 600 BUG_ON(1);
572 curstep->op, step);
573 break;
574 } 601 }
575 } 602 }
576 return result_len; 603 rc = result_len;
604
605out:
606 return rc;
577} 607}
578 608
579 609
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index af14cb42516..85f3bc0a706 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -15,9 +15,10 @@ int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
15 const struct ceph_crypto_key *src) 15 const struct ceph_crypto_key *src)
16{ 16{
17 memcpy(dst, src, sizeof(struct ceph_crypto_key)); 17 memcpy(dst, src, sizeof(struct ceph_crypto_key));
18 dst->key = kmemdup(src->key, src->len, GFP_NOFS); 18 dst->key = kmalloc(src->len, GFP_NOFS);
19 if (!dst->key) 19 if (!dst->key)
20 return -ENOMEM; 20 return -ENOMEM;
21 memcpy(dst->key, src->key, src->len);
21 return 0; 22 return 0;
22} 23}
23 24
@@ -423,15 +424,14 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
423 } 424 }
424} 425}
425 426
426int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep) 427int ceph_key_instantiate(struct key *key, const void *data, size_t datalen)
427{ 428{
428 struct ceph_crypto_key *ckey; 429 struct ceph_crypto_key *ckey;
429 size_t datalen = prep->datalen;
430 int ret; 430 int ret;
431 void *p; 431 void *p;
432 432
433 ret = -EINVAL; 433 ret = -EINVAL;
434 if (datalen <= 0 || datalen > 32767 || !prep->data) 434 if (datalen <= 0 || datalen > 32767 || !data)
435 goto err; 435 goto err;
436 436
437 ret = key_payload_reserve(key, datalen); 437 ret = key_payload_reserve(key, datalen);
@@ -444,8 +444,8 @@ int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
444 goto err; 444 goto err;
445 445
446 /* TODO ceph_crypto_key_decode should really take const input */ 446 /* TODO ceph_crypto_key_decode should really take const input */
447 p = (void *)prep->data; 447 p = (void *)data;
448 ret = ceph_crypto_key_decode(ckey, &p, (char*)prep->data+datalen); 448 ret = ceph_crypto_key_decode(ckey, &p, (char*)data+datalen);
449 if (ret < 0) 449 if (ret < 0)
450 goto err_ckey; 450 goto err_ckey;
451 451
@@ -467,7 +467,6 @@ void ceph_key_destroy(struct key *key) {
467 struct ceph_crypto_key *ckey = key->payload.data; 467 struct ceph_crypto_key *ckey = key->payload.data;
468 468
469 ceph_crypto_key_destroy(ckey); 469 ceph_crypto_key_destroy(ckey);
470 kfree(ckey);
471} 470}
472 471
473struct key_type key_type_ceph = { 472struct key_type key_type_ceph = {
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index 3572dc518bc..1919d1550d7 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -16,8 +16,7 @@ struct ceph_crypto_key {
16 16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) 17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{ 18{
19 if (key) 19 kfree(key->key);
20 kfree(key->key);
21} 20}
22 21
23extern int ceph_crypto_key_clone(struct ceph_crypto_key *dst, 22extern int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 38b5dc1823d..27d4ea315d1 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -94,9 +94,9 @@ static int monc_show(struct seq_file *s, void *p)
94 mutex_lock(&monc->mutex); 94 mutex_lock(&monc->mutex);
95 95
96 if (monc->have_mdsmap) 96 if (monc->have_mdsmap)
97 seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap); 97 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
98 if (monc->have_osdmap) 98 if (monc->have_osdmap)
99 seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap); 99 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
100 if (monc->want_next_osdmap) 100 if (monc->want_next_osdmap)
101 seq_printf(s, "want next osdmap\n"); 101 seq_printf(s, "want next osdmap\n");
102 102
@@ -146,7 +146,7 @@ static int osdc_show(struct seq_file *s, void *pp)
146 146
147 if (req->r_reassert_version.epoch) 147 if (req->r_reassert_version.epoch)
148 seq_printf(s, "\t%u'%llu", 148 seq_printf(s, "\t%u'%llu",
149 (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), 149 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
150 le64_to_cpu(req->r_reassert_version.version)); 150 le64_to_cpu(req->r_reassert_version.version));
151 else 151 else
152 seq_printf(s, "\t"); 152 seq_printf(s, "\t");
@@ -189,9 +189,6 @@ int ceph_debugfs_client_init(struct ceph_client *client)
189 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, 189 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
190 client->monc.auth->global_id); 190 client->monc.auth->global_id);
191 191
192 dout("ceph_debugfs_client_init %p %s\n", client, name);
193
194 BUG_ON(client->debugfs_dir);
195 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); 192 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
196 if (!client->debugfs_dir) 193 if (!client->debugfs_dir)
197 goto out; 194 goto out;
@@ -237,7 +234,6 @@ out:
237 234
238void ceph_debugfs_client_cleanup(struct ceph_client *client) 235void ceph_debugfs_client_cleanup(struct ceph_client *client)
239{ 236{
240 dout("ceph_debugfs_client_cleanup %p\n", client);
241 debugfs_remove(client->debugfs_osdmap); 237 debugfs_remove(client->debugfs_osdmap);
242 debugfs_remove(client->debugfs_monmap); 238 debugfs_remove(client->debugfs_monmap);
243 debugfs_remove(client->osdc.debugfs_file); 239 debugfs_remove(client->osdc.debugfs_file);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 5ccf87ed8d6..9918e9eb276 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -11,14 +11,12 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/blkdev.h> 13#include <linux/blkdev.h>
14#include <linux/dns_resolver.h>
15#include <net/tcp.h> 14#include <net/tcp.h>
16 15
17#include <linux/ceph/libceph.h> 16#include <linux/ceph/libceph.h>
18#include <linux/ceph/messenger.h> 17#include <linux/ceph/messenger.h>
19#include <linux/ceph/decode.h> 18#include <linux/ceph/decode.h>
20#include <linux/ceph/pagelist.h> 19#include <linux/ceph/pagelist.h>
21#include <linux/export.h>
22 20
23/* 21/*
24 * Ceph uses the messenger to exchange ceph_msg messages with other 22 * Ceph uses the messenger to exchange ceph_msg messages with other
@@ -29,74 +27,6 @@
29 * the sender. 27 * the sender.
30 */ 28 */
31 29
32/*
33 * We track the state of the socket on a given connection using
34 * values defined below. The transition to a new socket state is
35 * handled by a function which verifies we aren't coming from an
36 * unexpected state.
37 *
38 * --------
39 * | NEW* | transient initial state
40 * --------
41 * | con_sock_state_init()
42 * v
43 * ----------
44 * | CLOSED | initialized, but no socket (and no
45 * ---------- TCP connection)
46 * ^ \
47 * | \ con_sock_state_connecting()
48 * | ----------------------
49 * | \
50 * + con_sock_state_closed() \
51 * |+--------------------------- \
52 * | \ \ \
53 * | ----------- \ \
54 * | | CLOSING | socket event; \ \
55 * | ----------- await close \ \
56 * | ^ \ |
57 * | | \ |
58 * | + con_sock_state_closing() \ |
59 * | / \ | |
60 * | / --------------- | |
61 * | / \ v v
62 * | / --------------
63 * | / -----------------| CONNECTING | socket created, TCP
64 * | | / -------------- connect initiated
65 * | | | con_sock_state_connected()
66 * | | v
67 * -------------
68 * | CONNECTED | TCP connection established
69 * -------------
70 *
71 * State values for ceph_connection->sock_state; NEW is assumed to be 0.
72 */
73
74#define CON_SOCK_STATE_NEW 0 /* -> CLOSED */
75#define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */
76#define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */
77#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */
78#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */
79
80/*
81 * connection states
82 */
83#define CON_STATE_CLOSED 1 /* -> PREOPEN */
84#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */
85#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */
86#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */
87#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */
88#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */
89
90/*
91 * ceph_connection flag bits
92 */
93#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop
94 * messages on errors */
95#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */
96#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */
97#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */
98#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */
99
100/* static tag bytes (protocol control messages) */ 30/* static tag bytes (protocol control messages) */
101static char tag_msg = CEPH_MSGR_TAG_MSG; 31static char tag_msg = CEPH_MSGR_TAG_MSG;
102static char tag_ack = CEPH_MSGR_TAG_ACK; 32static char tag_ack = CEPH_MSGR_TAG_ACK;
@@ -106,54 +36,48 @@ static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
106static struct lock_class_key socket_class; 36static struct lock_class_key socket_class;
107#endif 37#endif
108 38
109/*
110 * When skipping (ignoring) a block of input we read it into a "skip
111 * buffer," which is this many bytes in size.
112 */
113#define SKIP_BUF_SIZE 1024
114 39
115static void queue_con(struct ceph_connection *con); 40static void queue_con(struct ceph_connection *con);
116static void con_work(struct work_struct *); 41static void con_work(struct work_struct *);
117static void ceph_fault(struct ceph_connection *con); 42static void ceph_fault(struct ceph_connection *con);
118 43
119/* 44/*
120 * Nicely render a sockaddr as a string. An array of formatted 45 * nicely render a sockaddr as a string.
121 * strings is used, to approximate reentrancy.
122 */ 46 */
123#define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */ 47#define MAX_ADDR_STR 20
124#define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG) 48#define MAX_ADDR_STR_LEN 60
125#define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1) 49static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
126#define MAX_ADDR_STR_LEN 64 /* 54 is enough */ 50static DEFINE_SPINLOCK(addr_str_lock);
127 51static int last_addr_str;
128static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
129static atomic_t addr_str_seq = ATOMIC_INIT(0);
130
131static struct page *zero_page; /* used in certain error cases */
132 52
133const char *ceph_pr_addr(const struct sockaddr_storage *ss) 53const char *ceph_pr_addr(const struct sockaddr_storage *ss)
134{ 54{
135 int i; 55 int i;
136 char *s; 56 char *s;
137 struct sockaddr_in *in4 = (struct sockaddr_in *) ss; 57 struct sockaddr_in *in4 = (void *)ss;
138 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; 58 struct sockaddr_in6 *in6 = (void *)ss;
139 59
140 i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; 60 spin_lock(&addr_str_lock);
61 i = last_addr_str++;
62 if (last_addr_str == MAX_ADDR_STR)
63 last_addr_str = 0;
64 spin_unlock(&addr_str_lock);
141 s = addr_str[i]; 65 s = addr_str[i];
142 66
143 switch (ss->ss_family) { 67 switch (ss->ss_family) {
144 case AF_INET: 68 case AF_INET:
145 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr, 69 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
146 ntohs(in4->sin_port)); 70 (unsigned int)ntohs(in4->sin_port));
147 break; 71 break;
148 72
149 case AF_INET6: 73 case AF_INET6:
150 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr, 74 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
151 ntohs(in6->sin6_port)); 75 (unsigned int)ntohs(in6->sin6_port));
152 break; 76 break;
153 77
154 default: 78 default:
155 snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", 79 snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)",
156 ss->ss_family); 80 (int)ss->ss_family);
157 } 81 }
158 82
159 return s; 83 return s;
@@ -169,43 +93,22 @@ static void encode_my_addr(struct ceph_messenger *msgr)
169/* 93/*
170 * work queue for all reading and writing to/from the socket. 94 * work queue for all reading and writing to/from the socket.
171 */ 95 */
172static struct workqueue_struct *ceph_msgr_wq; 96struct workqueue_struct *ceph_msgr_wq;
173
174void _ceph_msgr_exit(void)
175{
176 if (ceph_msgr_wq) {
177 destroy_workqueue(ceph_msgr_wq);
178 ceph_msgr_wq = NULL;
179 }
180
181 BUG_ON(zero_page == NULL);
182 kunmap(zero_page);
183 page_cache_release(zero_page);
184 zero_page = NULL;
185}
186 97
187int ceph_msgr_init(void) 98int ceph_msgr_init(void)
188{ 99{
189 BUG_ON(zero_page != NULL);
190 zero_page = ZERO_PAGE(0);
191 page_cache_get(zero_page);
192
193 ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); 100 ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
194 if (ceph_msgr_wq) 101 if (!ceph_msgr_wq) {
195 return 0; 102 pr_err("msgr_init failed to create workqueue\n");
196 103 return -ENOMEM;
197 pr_err("msgr_init failed to create workqueue\n"); 104 }
198 _ceph_msgr_exit(); 105 return 0;
199
200 return -ENOMEM;
201} 106}
202EXPORT_SYMBOL(ceph_msgr_init); 107EXPORT_SYMBOL(ceph_msgr_init);
203 108
204void ceph_msgr_exit(void) 109void ceph_msgr_exit(void)
205{ 110{
206 BUG_ON(ceph_msgr_wq == NULL); 111 destroy_workqueue(ceph_msgr_wq);
207
208 _ceph_msgr_exit();
209} 112}
210EXPORT_SYMBOL(ceph_msgr_exit); 113EXPORT_SYMBOL(ceph_msgr_exit);
211 114
@@ -215,134 +118,70 @@ void ceph_msgr_flush(void)
215} 118}
216EXPORT_SYMBOL(ceph_msgr_flush); 119EXPORT_SYMBOL(ceph_msgr_flush);
217 120
218/* Connection socket state transition functions */
219
220static void con_sock_state_init(struct ceph_connection *con)
221{
222 int old_state;
223
224 old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
225 if (WARN_ON(old_state != CON_SOCK_STATE_NEW))
226 printk("%s: unexpected old state %d\n", __func__, old_state);
227 dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
228 CON_SOCK_STATE_CLOSED);
229}
230
231static void con_sock_state_connecting(struct ceph_connection *con)
232{
233 int old_state;
234
235 old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING);
236 if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED))
237 printk("%s: unexpected old state %d\n", __func__, old_state);
238 dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
239 CON_SOCK_STATE_CONNECTING);
240}
241
242static void con_sock_state_connected(struct ceph_connection *con)
243{
244 int old_state;
245
246 old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED);
247 if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING))
248 printk("%s: unexpected old state %d\n", __func__, old_state);
249 dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
250 CON_SOCK_STATE_CONNECTED);
251}
252
253static void con_sock_state_closing(struct ceph_connection *con)
254{
255 int old_state;
256
257 old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING);
258 if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING &&
259 old_state != CON_SOCK_STATE_CONNECTED &&
260 old_state != CON_SOCK_STATE_CLOSING))
261 printk("%s: unexpected old state %d\n", __func__, old_state);
262 dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
263 CON_SOCK_STATE_CLOSING);
264}
265
266static void con_sock_state_closed(struct ceph_connection *con)
267{
268 int old_state;
269
270 old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
271 if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED &&
272 old_state != CON_SOCK_STATE_CLOSING &&
273 old_state != CON_SOCK_STATE_CONNECTING &&
274 old_state != CON_SOCK_STATE_CLOSED))
275 printk("%s: unexpected old state %d\n", __func__, old_state);
276 dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
277 CON_SOCK_STATE_CLOSED);
278}
279 121
280/* 122/*
281 * socket callback functions 123 * socket callback functions
282 */ 124 */
283 125
284/* data available on socket, or listen socket received a connect */ 126/* data available on socket, or listen socket received a connect */
285static void ceph_sock_data_ready(struct sock *sk, int count_unused) 127static void ceph_data_ready(struct sock *sk, int count_unused)
286{ 128{
287 struct ceph_connection *con = sk->sk_user_data; 129 struct ceph_connection *con =
288 if (atomic_read(&con->msgr->stopping)) { 130 (struct ceph_connection *)sk->sk_user_data;
289 return;
290 }
291
292 if (sk->sk_state != TCP_CLOSE_WAIT) { 131 if (sk->sk_state != TCP_CLOSE_WAIT) {
293 dout("%s on %p state = %lu, queueing work\n", __func__, 132 dout("ceph_data_ready on %p state = %lu, queueing work\n",
294 con, con->state); 133 con, con->state);
295 queue_con(con); 134 queue_con(con);
296 } 135 }
297} 136}
298 137
299/* socket has buffer space for writing */ 138/* socket has buffer space for writing */
300static void ceph_sock_write_space(struct sock *sk) 139static void ceph_write_space(struct sock *sk)
301{ 140{
302 struct ceph_connection *con = sk->sk_user_data; 141 struct ceph_connection *con =
142 (struct ceph_connection *)sk->sk_user_data;
303 143
304 /* only queue to workqueue if there is data we want to write, 144 /* only queue to workqueue if there is data we want to write. */
305 * and there is sufficient space in the socket buffer to accept 145 if (test_bit(WRITE_PENDING, &con->state)) {
306 * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() 146 dout("ceph_write_space %p queueing write work\n", con);
307 * doesn't get called again until try_write() fills the socket 147 queue_con(con);
308 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
309 * and net/core/stream.c:sk_stream_write_space().
310 */
311 if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) {
312 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
313 dout("%s %p queueing write work\n", __func__, con);
314 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
315 queue_con(con);
316 }
317 } else { 148 } else {
318 dout("%s %p nothing to write\n", __func__, con); 149 dout("ceph_write_space %p nothing to write\n", con);
319 } 150 }
151
152 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
153 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
320} 154}
321 155
322/* socket's state has changed */ 156/* socket's state has changed */
323static void ceph_sock_state_change(struct sock *sk) 157static void ceph_state_change(struct sock *sk)
324{ 158{
325 struct ceph_connection *con = sk->sk_user_data; 159 struct ceph_connection *con =
160 (struct ceph_connection *)sk->sk_user_data;
326 161
327 dout("%s %p state = %lu sk_state = %u\n", __func__, 162 dout("ceph_state_change %p state = %lu sk_state = %u\n",
328 con, con->state, sk->sk_state); 163 con, con->state, sk->sk_state);
329 164
165 if (test_bit(CLOSED, &con->state))
166 return;
167
330 switch (sk->sk_state) { 168 switch (sk->sk_state) {
331 case TCP_CLOSE: 169 case TCP_CLOSE:
332 dout("%s TCP_CLOSE\n", __func__); 170 dout("ceph_state_change TCP_CLOSE\n");
333 case TCP_CLOSE_WAIT: 171 case TCP_CLOSE_WAIT:
334 dout("%s TCP_CLOSE_WAIT\n", __func__); 172 dout("ceph_state_change TCP_CLOSE_WAIT\n");
335 con_sock_state_closing(con); 173 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
336 set_bit(CON_FLAG_SOCK_CLOSED, &con->flags); 174 if (test_bit(CONNECTING, &con->state))
337 queue_con(con); 175 con->error_msg = "connection failed";
176 else
177 con->error_msg = "socket closed";
178 queue_con(con);
179 }
338 break; 180 break;
339 case TCP_ESTABLISHED: 181 case TCP_ESTABLISHED:
340 dout("%s TCP_ESTABLISHED\n", __func__); 182 dout("ceph_state_change TCP_ESTABLISHED\n");
341 con_sock_state_connected(con);
342 queue_con(con); 183 queue_con(con);
343 break; 184 break;
344 default: /* Everything else is uninteresting */
345 break;
346 } 185 }
347} 186}
348 187
@@ -353,10 +192,10 @@ static void set_sock_callbacks(struct socket *sock,
353 struct ceph_connection *con) 192 struct ceph_connection *con)
354{ 193{
355 struct sock *sk = sock->sk; 194 struct sock *sk = sock->sk;
356 sk->sk_user_data = con; 195 sk->sk_user_data = (void *)con;
357 sk->sk_data_ready = ceph_sock_data_ready; 196 sk->sk_data_ready = ceph_data_ready;
358 sk->sk_write_space = ceph_sock_write_space; 197 sk->sk_write_space = ceph_write_space;
359 sk->sk_state_change = ceph_sock_state_change; 198 sk->sk_state_change = ceph_state_change;
360} 199}
361 200
362 201
@@ -367,7 +206,7 @@ static void set_sock_callbacks(struct socket *sock,
367/* 206/*
368 * initiate connection to a remote socket. 207 * initiate connection to a remote socket.
369 */ 208 */
370static int ceph_tcp_connect(struct ceph_connection *con) 209static struct socket *ceph_tcp_connect(struct ceph_connection *con)
371{ 210{
372 struct sockaddr_storage *paddr = &con->peer_addr.in_addr; 211 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
373 struct socket *sock; 212 struct socket *sock;
@@ -377,7 +216,8 @@ static int ceph_tcp_connect(struct ceph_connection *con)
377 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, 216 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
378 IPPROTO_TCP, &sock); 217 IPPROTO_TCP, &sock);
379 if (ret) 218 if (ret)
380 return ret; 219 return ERR_PTR(ret);
220 con->sock = sock;
381 sock->sk->sk_allocation = GFP_NOFS; 221 sock->sk->sk_allocation = GFP_NOFS;
382 222
383#ifdef CONFIG_LOCKDEP 223#ifdef CONFIG_LOCKDEP
@@ -388,23 +228,25 @@ static int ceph_tcp_connect(struct ceph_connection *con)
388 228
389 dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); 229 dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
390 230
391 con_sock_state_connecting(con);
392 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), 231 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
393 O_NONBLOCK); 232 O_NONBLOCK);
394 if (ret == -EINPROGRESS) { 233 if (ret == -EINPROGRESS) {
395 dout("connect %s EINPROGRESS sk_state = %u\n", 234 dout("connect %s EINPROGRESS sk_state = %u\n",
396 ceph_pr_addr(&con->peer_addr.in_addr), 235 ceph_pr_addr(&con->peer_addr.in_addr),
397 sock->sk->sk_state); 236 sock->sk->sk_state);
398 } else if (ret < 0) { 237 ret = 0;
238 }
239 if (ret < 0) {
399 pr_err("connect %s error %d\n", 240 pr_err("connect %s error %d\n",
400 ceph_pr_addr(&con->peer_addr.in_addr), ret); 241 ceph_pr_addr(&con->peer_addr.in_addr), ret);
401 sock_release(sock); 242 sock_release(sock);
243 con->sock = NULL;
402 con->error_msg = "connect error"; 244 con->error_msg = "connect error";
403
404 return ret;
405 } 245 }
406 con->sock = sock; 246
407 return 0; 247 if (ret < 0)
248 return ERR_PTR(ret);
249 return sock;
408} 250}
409 251
410static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) 252static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
@@ -440,43 +282,22 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
440 return r; 282 return r;
441} 283}
442 284
443static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
444 int offset, size_t size, int more)
445{
446 int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR);
447 int ret;
448
449 ret = kernel_sendpage(sock, page, offset, size, flags);
450 if (ret == -EAGAIN)
451 ret = 0;
452
453 return ret;
454}
455
456 285
457/* 286/*
458 * Shutdown/close the socket for the given connection. 287 * Shutdown/close the socket for the given connection.
459 */ 288 */
460static int con_close_socket(struct ceph_connection *con) 289static int con_close_socket(struct ceph_connection *con)
461{ 290{
462 int rc = 0; 291 int rc;
463 292
464 dout("con_close_socket on %p sock %p\n", con, con->sock); 293 dout("con_close_socket on %p sock %p\n", con, con->sock);
465 if (con->sock) { 294 if (!con->sock)
466 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); 295 return 0;
467 sock_release(con->sock); 296 set_bit(SOCK_CLOSED, &con->state);
468 con->sock = NULL; 297 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
469 } 298 sock_release(con->sock);
470 299 con->sock = NULL;
471 /* 300 clear_bit(SOCK_CLOSED, &con->state);
472 * Forcibly clear the SOCK_CLOSED flag. It gets set
473 * independent of the connection mutex, and we could have
474 * received a socket close event before we had the chance to
475 * shut the socket down.
476 */
477 clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
478
479 con_sock_state_closed(con);
480 return rc; 301 return rc;
481} 302}
482 303
@@ -487,10 +308,6 @@ static int con_close_socket(struct ceph_connection *con)
487static void ceph_msg_remove(struct ceph_msg *msg) 308static void ceph_msg_remove(struct ceph_msg *msg)
488{ 309{
489 list_del_init(&msg->list_head); 310 list_del_init(&msg->list_head);
490 BUG_ON(msg->con == NULL);
491 msg->con->ops->put(msg->con);
492 msg->con = NULL;
493
494 ceph_msg_put(msg); 311 ceph_msg_put(msg);
495} 312}
496static void ceph_msg_remove_list(struct list_head *head) 313static void ceph_msg_remove_list(struct list_head *head)
@@ -506,16 +323,12 @@ static void reset_connection(struct ceph_connection *con)
506{ 323{
507 /* reset connection, out_queue, msg_ and connect_seq */ 324 /* reset connection, out_queue, msg_ and connect_seq */
508 /* discard existing out_queue and msg_seq */ 325 /* discard existing out_queue and msg_seq */
509 dout("reset_connection %p\n", con);
510 ceph_msg_remove_list(&con->out_queue); 326 ceph_msg_remove_list(&con->out_queue);
511 ceph_msg_remove_list(&con->out_sent); 327 ceph_msg_remove_list(&con->out_sent);
512 328
513 if (con->in_msg) { 329 if (con->in_msg) {
514 BUG_ON(con->in_msg->con != con);
515 con->in_msg->con = NULL;
516 ceph_msg_put(con->in_msg); 330 ceph_msg_put(con->in_msg);
517 con->in_msg = NULL; 331 con->in_msg = NULL;
518 con->ops->put(con);
519 } 332 }
520 333
521 con->connect_seq = 0; 334 con->connect_seq = 0;
@@ -533,44 +346,32 @@ static void reset_connection(struct ceph_connection *con)
533 */ 346 */
534void ceph_con_close(struct ceph_connection *con) 347void ceph_con_close(struct ceph_connection *con)
535{ 348{
536 mutex_lock(&con->mutex);
537 dout("con_close %p peer %s\n", con, 349 dout("con_close %p peer %s\n", con,
538 ceph_pr_addr(&con->peer_addr.in_addr)); 350 ceph_pr_addr(&con->peer_addr.in_addr));
539 con->state = CON_STATE_CLOSED; 351 set_bit(CLOSED, &con->state); /* in case there's queued work */
540 352 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
541 clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */ 353 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
542 clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); 354 clear_bit(KEEPALIVE_PENDING, &con->state);
543 clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); 355 clear_bit(WRITE_PENDING, &con->state);
544 clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags); 356 mutex_lock(&con->mutex);
545 clear_bit(CON_FLAG_BACKOFF, &con->flags);
546
547 reset_connection(con); 357 reset_connection(con);
548 con->peer_global_seq = 0; 358 con->peer_global_seq = 0;
549 cancel_delayed_work(&con->work); 359 cancel_delayed_work(&con->work);
550 con_close_socket(con);
551 mutex_unlock(&con->mutex); 360 mutex_unlock(&con->mutex);
361 queue_con(con);
552} 362}
553EXPORT_SYMBOL(ceph_con_close); 363EXPORT_SYMBOL(ceph_con_close);
554 364
555/* 365/*
556 * Reopen a closed connection, with a new peer address. 366 * Reopen a closed connection, with a new peer address.
557 */ 367 */
558void ceph_con_open(struct ceph_connection *con, 368void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
559 __u8 entity_type, __u64 entity_num,
560 struct ceph_entity_addr *addr)
561{ 369{
562 mutex_lock(&con->mutex);
563 dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); 370 dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
564 371 set_bit(OPENING, &con->state);
565 WARN_ON(con->state != CON_STATE_CLOSED); 372 clear_bit(CLOSED, &con->state);
566 con->state = CON_STATE_PREOPEN;
567
568 con->peer_name.type = (__u8) entity_type;
569 con->peer_name.num = cpu_to_le64(entity_num);
570
571 memcpy(&con->peer_addr, addr, sizeof(*addr)); 373 memcpy(&con->peer_addr, addr, sizeof(*addr));
572 con->delay = 0; /* reset backoff memory */ 374 con->delay = 0; /* reset backoff memory */
573 mutex_unlock(&con->mutex);
574 queue_con(con); 375 queue_con(con);
575} 376}
576EXPORT_SYMBOL(ceph_con_open); 377EXPORT_SYMBOL(ceph_con_open);
@@ -584,26 +385,41 @@ bool ceph_con_opened(struct ceph_connection *con)
584} 385}
585 386
586/* 387/*
388 * generic get/put
389 */
390struct ceph_connection *ceph_con_get(struct ceph_connection *con)
391{
392 dout("con_get %p nref = %d -> %d\n", con,
393 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
394 if (atomic_inc_not_zero(&con->nref))
395 return con;
396 return NULL;
397}
398
399void ceph_con_put(struct ceph_connection *con)
400{
401 dout("con_put %p nref = %d -> %d\n", con,
402 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
403 BUG_ON(atomic_read(&con->nref) == 0);
404 if (atomic_dec_and_test(&con->nref)) {
405 BUG_ON(con->sock);
406 kfree(con);
407 }
408}
409
410/*
587 * initialize a new connection. 411 * initialize a new connection.
588 */ 412 */
589void ceph_con_init(struct ceph_connection *con, void *private, 413void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
590 const struct ceph_connection_operations *ops,
591 struct ceph_messenger *msgr)
592{ 414{
593 dout("con_init %p\n", con); 415 dout("con_init %p\n", con);
594 memset(con, 0, sizeof(*con)); 416 memset(con, 0, sizeof(*con));
595 con->private = private; 417 atomic_set(&con->nref, 1);
596 con->ops = ops;
597 con->msgr = msgr; 418 con->msgr = msgr;
598
599 con_sock_state_init(con);
600
601 mutex_init(&con->mutex); 419 mutex_init(&con->mutex);
602 INIT_LIST_HEAD(&con->out_queue); 420 INIT_LIST_HEAD(&con->out_queue);
603 INIT_LIST_HEAD(&con->out_sent); 421 INIT_LIST_HEAD(&con->out_sent);
604 INIT_DELAYED_WORK(&con->work, con_work); 422 INIT_DELAYED_WORK(&con->work, con_work);
605
606 con->state = CON_STATE_CLOSED;
607} 423}
608EXPORT_SYMBOL(ceph_con_init); 424EXPORT_SYMBOL(ceph_con_init);
609 425
@@ -624,84 +440,14 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
624 return ret; 440 return ret;
625} 441}
626 442
627static void con_out_kvec_reset(struct ceph_connection *con)
628{
629 con->out_kvec_left = 0;
630 con->out_kvec_bytes = 0;
631 con->out_kvec_cur = &con->out_kvec[0];
632}
633
634static void con_out_kvec_add(struct ceph_connection *con,
635 size_t size, void *data)
636{
637 int index;
638
639 index = con->out_kvec_left;
640 BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
641
642 con->out_kvec[index].iov_len = size;
643 con->out_kvec[index].iov_base = data;
644 con->out_kvec_left++;
645 con->out_kvec_bytes += size;
646}
647
648#ifdef CONFIG_BLOCK
649static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
650{
651 if (!bio) {
652 *iter = NULL;
653 *seg = 0;
654 return;
655 }
656 *iter = bio;
657 *seg = bio->bi_idx;
658}
659
660static void iter_bio_next(struct bio **bio_iter, int *seg)
661{
662 if (*bio_iter == NULL)
663 return;
664
665 BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
666
667 (*seg)++;
668 if (*seg == (*bio_iter)->bi_vcnt)
669 init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
670}
671#endif
672
673static void prepare_write_message_data(struct ceph_connection *con)
674{
675 struct ceph_msg *msg = con->out_msg;
676
677 BUG_ON(!msg);
678 BUG_ON(!msg->hdr.data_len);
679
680 /* initialize page iterator */
681 con->out_msg_pos.page = 0;
682 if (msg->pages)
683 con->out_msg_pos.page_pos = msg->page_alignment;
684 else
685 con->out_msg_pos.page_pos = 0;
686#ifdef CONFIG_BLOCK
687 if (msg->bio)
688 init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
689#endif
690 con->out_msg_pos.data_pos = 0;
691 con->out_msg_pos.did_page_crc = false;
692 con->out_more = 1; /* data + footer will follow */
693}
694 443
695/* 444/*
696 * Prepare footer for currently outgoing message, and finish things 445 * Prepare footer for currently outgoing message, and finish things
697 * off. Assumes out_kvec* are already valid.. we just add on to the end. 446 * off. Assumes out_kvec* are already valid.. we just add on to the end.
698 */ 447 */
699static void prepare_write_message_footer(struct ceph_connection *con) 448static void prepare_write_message_footer(struct ceph_connection *con, int v)
700{ 449{
701 struct ceph_msg *m = con->out_msg; 450 struct ceph_msg *m = con->out_msg;
702 int v = con->out_kvec_left;
703
704 m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
705 451
706 dout("prepare_write_message_footer %p\n", con); 452 dout("prepare_write_message_footer %p\n", con);
707 con->out_kvec_is_msg = true; 453 con->out_kvec_is_msg = true;
@@ -719,9 +465,9 @@ static void prepare_write_message_footer(struct ceph_connection *con)
719static void prepare_write_message(struct ceph_connection *con) 465static void prepare_write_message(struct ceph_connection *con)
720{ 466{
721 struct ceph_msg *m; 467 struct ceph_msg *m;
722 u32 crc; 468 int v = 0;
723 469
724 con_out_kvec_reset(con); 470 con->out_kvec_bytes = 0;
725 con->out_kvec_is_msg = true; 471 con->out_kvec_is_msg = true;
726 con->out_msg_done = false; 472 con->out_msg_done = false;
727 473
@@ -729,16 +475,17 @@ static void prepare_write_message(struct ceph_connection *con)
729 * TCP packet that's a good thing. */ 475 * TCP packet that's a good thing. */
730 if (con->in_seq > con->in_seq_acked) { 476 if (con->in_seq > con->in_seq_acked) {
731 con->in_seq_acked = con->in_seq; 477 con->in_seq_acked = con->in_seq;
732 con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); 478 con->out_kvec[v].iov_base = &tag_ack;
479 con->out_kvec[v++].iov_len = 1;
733 con->out_temp_ack = cpu_to_le64(con->in_seq_acked); 480 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
734 con_out_kvec_add(con, sizeof (con->out_temp_ack), 481 con->out_kvec[v].iov_base = &con->out_temp_ack;
735 &con->out_temp_ack); 482 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
483 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
736 } 484 }
737 485
738 BUG_ON(list_empty(&con->out_queue)); 486 m = list_first_entry(&con->out_queue,
739 m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); 487 struct ceph_msg, list_head);
740 con->out_msg = m; 488 con->out_msg = m;
741 BUG_ON(m->con != con);
742 489
743 /* put message on sent list */ 490 /* put message on sent list */
744 ceph_msg_get(m); 491 ceph_msg_get(m);
@@ -752,10 +499,6 @@ static void prepare_write_message(struct ceph_connection *con)
752 m->hdr.seq = cpu_to_le64(++con->out_seq); 499 m->hdr.seq = cpu_to_le64(++con->out_seq);
753 m->needs_out_seq = false; 500 m->needs_out_seq = false;
754 } 501 }
755#ifdef CONFIG_BLOCK
756 else
757 m->bio_iter = NULL;
758#endif
759 502
760 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", 503 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
761 m, con->out_seq, le16_to_cpu(m->hdr.type), 504 m, con->out_seq, le16_to_cpu(m->hdr.type),
@@ -765,40 +508,53 @@ static void prepare_write_message(struct ceph_connection *con)
765 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); 508 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
766 509
767 /* tag + hdr + front + middle */ 510 /* tag + hdr + front + middle */
768 con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); 511 con->out_kvec[v].iov_base = &tag_msg;
769 con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); 512 con->out_kvec[v++].iov_len = 1;
770 con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); 513 con->out_kvec[v].iov_base = &m->hdr;
771 514 con->out_kvec[v++].iov_len = sizeof(m->hdr);
515 con->out_kvec[v++] = m->front;
772 if (m->middle) 516 if (m->middle)
773 con_out_kvec_add(con, m->middle->vec.iov_len, 517 con->out_kvec[v++] = m->middle->vec;
774 m->middle->vec.iov_base); 518 con->out_kvec_left = v;
519 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
520 (m->middle ? m->middle->vec.iov_len : 0);
521 con->out_kvec_cur = con->out_kvec;
775 522
776 /* fill in crc (except data pages), footer */ 523 /* fill in crc (except data pages), footer */
777 crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); 524 con->out_msg->hdr.crc =
778 con->out_msg->hdr.crc = cpu_to_le32(crc); 525 cpu_to_le32(crc32c(0, (void *)&m->hdr,
779 con->out_msg->footer.flags = 0; 526 sizeof(m->hdr) - sizeof(m->hdr.crc)));
780 527 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
781 crc = crc32c(0, m->front.iov_base, m->front.iov_len); 528 con->out_msg->footer.front_crc =
782 con->out_msg->footer.front_crc = cpu_to_le32(crc); 529 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
783 if (m->middle) { 530 if (m->middle)
784 crc = crc32c(0, m->middle->vec.iov_base, 531 con->out_msg->footer.middle_crc =
785 m->middle->vec.iov_len); 532 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
786 con->out_msg->footer.middle_crc = cpu_to_le32(crc); 533 m->middle->vec.iov_len));
787 } else 534 else
788 con->out_msg->footer.middle_crc = 0; 535 con->out_msg->footer.middle_crc = 0;
789 dout("%s front_crc %u middle_crc %u\n", __func__, 536 con->out_msg->footer.data_crc = 0;
537 dout("prepare_write_message front_crc %u data_crc %u\n",
790 le32_to_cpu(con->out_msg->footer.front_crc), 538 le32_to_cpu(con->out_msg->footer.front_crc),
791 le32_to_cpu(con->out_msg->footer.middle_crc)); 539 le32_to_cpu(con->out_msg->footer.middle_crc));
792 540
793 /* is there a data payload? */ 541 /* is there a data payload? */
794 con->out_msg->footer.data_crc = 0; 542 if (le32_to_cpu(m->hdr.data_len) > 0) {
795 if (m->hdr.data_len) 543 /* initialize page iterator */
796 prepare_write_message_data(con); 544 con->out_msg_pos.page = 0;
797 else 545 if (m->pages)
546 con->out_msg_pos.page_pos = m->page_alignment;
547 else
548 con->out_msg_pos.page_pos = 0;
549 con->out_msg_pos.data_pos = 0;
550 con->out_msg_pos.did_page_crc = 0;
551 con->out_more = 1; /* data + footer will follow */
552 } else {
798 /* no, queue up footer too and be done */ 553 /* no, queue up footer too and be done */
799 prepare_write_message_footer(con); 554 prepare_write_message_footer(con, v);
555 }
800 556
801 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 557 set_bit(WRITE_PENDING, &con->state);
802} 558}
803 559
804/* 560/*
@@ -810,16 +566,16 @@ static void prepare_write_ack(struct ceph_connection *con)
810 con->in_seq_acked, con->in_seq); 566 con->in_seq_acked, con->in_seq);
811 con->in_seq_acked = con->in_seq; 567 con->in_seq_acked = con->in_seq;
812 568
813 con_out_kvec_reset(con); 569 con->out_kvec[0].iov_base = &tag_ack;
814 570 con->out_kvec[0].iov_len = 1;
815 con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
816
817 con->out_temp_ack = cpu_to_le64(con->in_seq_acked); 571 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
818 con_out_kvec_add(con, sizeof (con->out_temp_ack), 572 con->out_kvec[1].iov_base = &con->out_temp_ack;
819 &con->out_temp_ack); 573 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
820 574 con->out_kvec_left = 2;
575 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
576 con->out_kvec_cur = con->out_kvec;
821 con->out_more = 1; /* more will follow.. eventually.. */ 577 con->out_more = 1; /* more will follow.. eventually.. */
822 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 578 set_bit(WRITE_PENDING, &con->state);
823} 579}
824 580
825/* 581/*
@@ -828,60 +584,73 @@ static void prepare_write_ack(struct ceph_connection *con)
828static void prepare_write_keepalive(struct ceph_connection *con) 584static void prepare_write_keepalive(struct ceph_connection *con)
829{ 585{
830 dout("prepare_write_keepalive %p\n", con); 586 dout("prepare_write_keepalive %p\n", con);
831 con_out_kvec_reset(con); 587 con->out_kvec[0].iov_base = &tag_keepalive;
832 con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); 588 con->out_kvec[0].iov_len = 1;
833 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 589 con->out_kvec_left = 1;
590 con->out_kvec_bytes = 1;
591 con->out_kvec_cur = con->out_kvec;
592 set_bit(WRITE_PENDING, &con->state);
834} 593}
835 594
836/* 595/*
837 * Connection negotiation. 596 * Connection negotiation.
838 */ 597 */
839 598
840static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con, 599static int prepare_connect_authorizer(struct ceph_connection *con)
841 int *auth_proto)
842{ 600{
843 struct ceph_auth_handshake *auth; 601 void *auth_buf;
844 602 int auth_len = 0;
845 if (!con->ops->get_authorizer) { 603 int auth_protocol = 0;
846 con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
847 con->out_connect.authorizer_len = 0;
848 return NULL;
849 }
850 604
851 /* Can't hold the mutex while getting authorizer */
852 mutex_unlock(&con->mutex); 605 mutex_unlock(&con->mutex);
853 auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); 606 if (con->ops->get_authorizer)
607 con->ops->get_authorizer(con, &auth_buf, &auth_len,
608 &auth_protocol, &con->auth_reply_buf,
609 &con->auth_reply_buf_len,
610 con->auth_retry);
854 mutex_lock(&con->mutex); 611 mutex_lock(&con->mutex);
855 612
856 if (IS_ERR(auth)) 613 if (test_bit(CLOSED, &con->state) ||
857 return auth; 614 test_bit(OPENING, &con->state))
858 if (con->state != CON_STATE_NEGOTIATING) 615 return -EAGAIN;
859 return ERR_PTR(-EAGAIN);
860 616
861 con->auth_reply_buf = auth->authorizer_reply_buf; 617 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
862 con->auth_reply_buf_len = auth->authorizer_reply_buf_len; 618 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
863 return auth; 619
620 if (auth_len) {
621 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
622 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
623 con->out_kvec_left++;
624 con->out_kvec_bytes += auth_len;
625 }
626 return 0;
864} 627}
865 628
866/* 629/*
867 * We connected to a peer and are saying hello. 630 * We connected to a peer and are saying hello.
868 */ 631 */
869static void prepare_write_banner(struct ceph_connection *con) 632static void prepare_write_banner(struct ceph_messenger *msgr,
870{ 633 struct ceph_connection *con)
871 con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); 634{
872 con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), 635 int len = strlen(CEPH_BANNER);
873 &con->msgr->my_enc_addr); 636
874 637 con->out_kvec[0].iov_base = CEPH_BANNER;
638 con->out_kvec[0].iov_len = len;
639 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
640 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
641 con->out_kvec_left = 2;
642 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
643 con->out_kvec_cur = con->out_kvec;
875 con->out_more = 0; 644 con->out_more = 0;
876 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 645 set_bit(WRITE_PENDING, &con->state);
877} 646}
878 647
879static int prepare_write_connect(struct ceph_connection *con) 648static int prepare_write_connect(struct ceph_messenger *msgr,
649 struct ceph_connection *con,
650 int after_banner)
880{ 651{
881 unsigned int global_seq = get_global_seq(con->msgr, 0); 652 unsigned global_seq = get_global_seq(con->msgr, 0);
882 int proto; 653 int proto;
883 int auth_proto;
884 struct ceph_auth_handshake *auth;
885 654
886 switch (con->peer_name.type) { 655 switch (con->peer_name.type) {
887 case CEPH_ENTITY_TYPE_MON: 656 case CEPH_ENTITY_TYPE_MON:
@@ -900,34 +669,29 @@ static int prepare_write_connect(struct ceph_connection *con)
900 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 669 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
901 con->connect_seq, global_seq, proto); 670 con->connect_seq, global_seq, proto);
902 671
903 con->out_connect.features = cpu_to_le64(con->msgr->supported_features); 672 con->out_connect.features = cpu_to_le64(msgr->supported_features);
904 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 673 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
905 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 674 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
906 con->out_connect.global_seq = cpu_to_le32(global_seq); 675 con->out_connect.global_seq = cpu_to_le32(global_seq);
907 con->out_connect.protocol_version = cpu_to_le32(proto); 676 con->out_connect.protocol_version = cpu_to_le32(proto);
908 con->out_connect.flags = 0; 677 con->out_connect.flags = 0;
909 678
910 auth_proto = CEPH_AUTH_UNKNOWN; 679 if (!after_banner) {
911 auth = get_connect_authorizer(con, &auth_proto); 680 con->out_kvec_left = 0;
912 if (IS_ERR(auth)) 681 con->out_kvec_bytes = 0;
913 return PTR_ERR(auth); 682 }
914 683 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
915 con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); 684 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
916 con->out_connect.authorizer_len = auth ? 685 con->out_kvec_left++;
917 cpu_to_le32(auth->authorizer_buf_len) : 0; 686 con->out_kvec_bytes += sizeof(con->out_connect);
918 687 con->out_kvec_cur = con->out_kvec;
919 con_out_kvec_add(con, sizeof (con->out_connect),
920 &con->out_connect);
921 if (auth && auth->authorizer_buf_len)
922 con_out_kvec_add(con, auth->authorizer_buf_len,
923 auth->authorizer_buf);
924
925 con->out_more = 0; 688 con->out_more = 0;
926 set_bit(CON_FLAG_WRITE_PENDING, &con->flags); 689 set_bit(WRITE_PENDING, &con->state);
927 690
928 return 0; 691 return prepare_connect_authorizer(con);
929} 692}
930 693
694
931/* 695/*
932 * write as much of pending kvecs to the socket as we can. 696 * write as much of pending kvecs to the socket as we can.
933 * 1 -> done 697 * 1 -> done
@@ -948,18 +712,17 @@ static int write_partial_kvec(struct ceph_connection *con)
948 con->out_kvec_bytes -= ret; 712 con->out_kvec_bytes -= ret;
949 if (con->out_kvec_bytes == 0) 713 if (con->out_kvec_bytes == 0)
950 break; /* done */ 714 break; /* done */
951 715 while (ret > 0) {
952 /* account for full iov entries consumed */ 716 if (ret >= con->out_kvec_cur->iov_len) {
953 while (ret >= con->out_kvec_cur->iov_len) { 717 ret -= con->out_kvec_cur->iov_len;
954 BUG_ON(!con->out_kvec_left); 718 con->out_kvec_cur++;
955 ret -= con->out_kvec_cur->iov_len; 719 con->out_kvec_left--;
956 con->out_kvec_cur++; 720 } else {
957 con->out_kvec_left--; 721 con->out_kvec_cur->iov_len -= ret;
958 } 722 con->out_kvec_cur->iov_base += ret;
959 /* and for a partially-consumed entry */ 723 ret = 0;
960 if (ret) { 724 break;
961 con->out_kvec_cur->iov_len -= ret; 725 }
962 con->out_kvec_cur->iov_base += ret;
963 } 726 }
964 } 727 }
965 con->out_kvec_left = 0; 728 con->out_kvec_left = 0;
@@ -971,34 +734,30 @@ out:
971 return ret; /* done! */ 734 return ret; /* done! */
972} 735}
973 736
974static void out_msg_pos_next(struct ceph_connection *con, struct page *page, 737#ifdef CONFIG_BLOCK
975 size_t len, size_t sent, bool in_trail) 738static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
976{ 739{
977 struct ceph_msg *msg = con->out_msg; 740 if (!bio) {
978 741 *iter = NULL;
979 BUG_ON(!msg); 742 *seg = 0;
980 BUG_ON(!sent); 743 return;
744 }
745 *iter = bio;
746 *seg = bio->bi_idx;
747}
981 748
982 con->out_msg_pos.data_pos += sent; 749static void iter_bio_next(struct bio **bio_iter, int *seg)
983 con->out_msg_pos.page_pos += sent; 750{
984 if (sent < len) 751 if (*bio_iter == NULL)
985 return; 752 return;
986 753
987 BUG_ON(sent != len); 754 BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
988 con->out_msg_pos.page_pos = 0; 755
989 con->out_msg_pos.page++; 756 (*seg)++;
990 con->out_msg_pos.did_page_crc = false; 757 if (*seg == (*bio_iter)->bi_vcnt)
991 if (in_trail) 758 init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
992 list_move_tail(&page->lru,
993 &msg->trail->head);
994 else if (msg->pagelist)
995 list_move_tail(&page->lru,
996 &msg->pagelist->head);
997#ifdef CONFIG_BLOCK
998 else if (msg->bio)
999 iter_bio_next(&msg->bio_iter, &msg->bio_seg);
1000#endif
1001} 759}
760#endif
1002 761
1003/* 762/*
1004 * Write as much message data payload as we can. If we finish, queue 763 * Write as much message data payload as we can. If we finish, queue
@@ -1010,90 +769,129 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
1010static int write_partial_msg_pages(struct ceph_connection *con) 769static int write_partial_msg_pages(struct ceph_connection *con)
1011{ 770{
1012 struct ceph_msg *msg = con->out_msg; 771 struct ceph_msg *msg = con->out_msg;
1013 unsigned int data_len = le32_to_cpu(msg->hdr.data_len); 772 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
1014 size_t len; 773 size_t len;
1015 bool do_datacrc = !con->msgr->nocrc; 774 int crc = con->msgr->nocrc;
1016 int ret; 775 int ret;
1017 int total_max_write; 776 int total_max_write;
1018 bool in_trail = false; 777 int in_trail = 0;
1019 const size_t trail_len = (msg->trail ? msg->trail->length : 0); 778 size_t trail_len = (msg->trail ? msg->trail->length : 0);
1020 const size_t trail_off = data_len - trail_len;
1021 779
1022 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", 780 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
1023 con, msg, con->out_msg_pos.page, msg->nr_pages, 781 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
1024 con->out_msg_pos.page_pos); 782 con->out_msg_pos.page_pos);
1025 783
1026 /* 784#ifdef CONFIG_BLOCK
1027 * Iterate through each page that contains data to be 785 if (msg->bio && !msg->bio_iter)
1028 * written, and send as much as possible for each. 786 init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
1029 * 787#endif
1030 * If we are calculating the data crc (the default), we will 788
1031 * need to map the page. If we have no pages, they have
1032 * been revoked, so use the zero page.
1033 */
1034 while (data_len > con->out_msg_pos.data_pos) { 789 while (data_len > con->out_msg_pos.data_pos) {
1035 struct page *page = NULL; 790 struct page *page = NULL;
791 void *kaddr = NULL;
1036 int max_write = PAGE_SIZE; 792 int max_write = PAGE_SIZE;
1037 int bio_offset = 0; 793 int page_shift = 0;
1038 794
1039 in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; 795 total_max_write = data_len - trail_len -
1040 if (!in_trail) 796 con->out_msg_pos.data_pos;
1041 total_max_write = trail_off - con->out_msg_pos.data_pos; 797
798 /*
799 * if we are calculating the data crc (the default), we need
800 * to map the page. if our pages[] has been revoked, use the
801 * zero page.
802 */
803
804 /* have we reached the trail part of the data? */
805 if (con->out_msg_pos.data_pos >= data_len - trail_len) {
806 in_trail = 1;
1042 807
1043 if (in_trail) {
1044 total_max_write = data_len - con->out_msg_pos.data_pos; 808 total_max_write = data_len - con->out_msg_pos.data_pos;
1045 809
1046 page = list_first_entry(&msg->trail->head, 810 page = list_first_entry(&msg->trail->head,
1047 struct page, lru); 811 struct page, lru);
812 if (crc)
813 kaddr = kmap(page);
814 max_write = PAGE_SIZE;
1048 } else if (msg->pages) { 815 } else if (msg->pages) {
1049 page = msg->pages[con->out_msg_pos.page]; 816 page = msg->pages[con->out_msg_pos.page];
817 if (crc)
818 kaddr = kmap(page);
1050 } else if (msg->pagelist) { 819 } else if (msg->pagelist) {
1051 page = list_first_entry(&msg->pagelist->head, 820 page = list_first_entry(&msg->pagelist->head,
1052 struct page, lru); 821 struct page, lru);
822 if (crc)
823 kaddr = kmap(page);
1053#ifdef CONFIG_BLOCK 824#ifdef CONFIG_BLOCK
1054 } else if (msg->bio) { 825 } else if (msg->bio) {
1055 struct bio_vec *bv; 826 struct bio_vec *bv;
1056 827
1057 bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); 828 bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
1058 page = bv->bv_page; 829 page = bv->bv_page;
1059 bio_offset = bv->bv_offset; 830 page_shift = bv->bv_offset;
831 if (crc)
832 kaddr = kmap(page) + page_shift;
1060 max_write = bv->bv_len; 833 max_write = bv->bv_len;
1061#endif 834#endif
1062 } else { 835 } else {
1063 page = zero_page; 836 page = con->msgr->zero_page;
837 if (crc)
838 kaddr = page_address(con->msgr->zero_page);
1064 } 839 }
1065 len = min_t(int, max_write - con->out_msg_pos.page_pos, 840 len = min_t(int, max_write - con->out_msg_pos.page_pos,
1066 total_max_write); 841 total_max_write);
1067 842
1068 if (do_datacrc && !con->out_msg_pos.did_page_crc) { 843 if (crc && !con->out_msg_pos.did_page_crc) {
1069 void *base; 844 void *base = kaddr + con->out_msg_pos.page_pos;
1070 u32 crc = le32_to_cpu(msg->footer.data_crc); 845 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
1071 char *kaddr;
1072 846
1073 kaddr = kmap(page);
1074 BUG_ON(kaddr == NULL); 847 BUG_ON(kaddr == NULL);
1075 base = kaddr + con->out_msg_pos.page_pos + bio_offset; 848 con->out_msg->footer.data_crc =
1076 crc = crc32c(crc, base, len); 849 cpu_to_le32(crc32c(tmpcrc, base, len));
1077 kunmap(page); 850 con->out_msg_pos.did_page_crc = 1;
1078 msg->footer.data_crc = cpu_to_le32(crc);
1079 con->out_msg_pos.did_page_crc = true;
1080 } 851 }
1081 ret = ceph_tcp_sendpage(con->sock, page, 852 ret = kernel_sendpage(con->sock, page,
1082 con->out_msg_pos.page_pos + bio_offset, 853 con->out_msg_pos.page_pos + page_shift,
1083 len, 1); 854 len,
855 MSG_DONTWAIT | MSG_NOSIGNAL |
856 MSG_MORE);
857
858 if (crc &&
859 (msg->pages || msg->pagelist || msg->bio || in_trail))
860 kunmap(page);
861
862 if (ret == -EAGAIN)
863 ret = 0;
1084 if (ret <= 0) 864 if (ret <= 0)
1085 goto out; 865 goto out;
1086 866
1087 out_msg_pos_next(con, page, len, (size_t) ret, in_trail); 867 con->out_msg_pos.data_pos += ret;
868 con->out_msg_pos.page_pos += ret;
869 if (ret == len) {
870 con->out_msg_pos.page_pos = 0;
871 con->out_msg_pos.page++;
872 con->out_msg_pos.did_page_crc = 0;
873 if (in_trail)
874 list_move_tail(&page->lru,
875 &msg->trail->head);
876 else if (msg->pagelist)
877 list_move_tail(&page->lru,
878 &msg->pagelist->head);
879#ifdef CONFIG_BLOCK
880 else if (msg->bio)
881 iter_bio_next(&msg->bio_iter, &msg->bio_seg);
882#endif
883 }
1088 } 884 }
1089 885
1090 dout("write_partial_msg_pages %p msg %p done\n", con, msg); 886 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
1091 887
1092 /* prepare and queue up footer, too */ 888 /* prepare and queue up footer, too */
1093 if (!do_datacrc) 889 if (!crc)
1094 msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; 890 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
1095 con_out_kvec_reset(con); 891 con->out_kvec_bytes = 0;
1096 prepare_write_message_footer(con); 892 con->out_kvec_left = 0;
893 con->out_kvec_cur = con->out_kvec;
894 prepare_write_message_footer(con, 0);
1097 ret = 1; 895 ret = 1;
1098out: 896out:
1099 return ret; 897 return ret;
@@ -1107,9 +905,12 @@ static int write_partial_skip(struct ceph_connection *con)
1107 int ret; 905 int ret;
1108 906
1109 while (con->out_skip > 0) { 907 while (con->out_skip > 0) {
1110 size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); 908 struct kvec iov = {
909 .iov_base = page_address(con->msgr->zero_page),
910 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
911 };
1111 912
1112 ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1); 913 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
1113 if (ret <= 0) 914 if (ret <= 0)
1114 goto out; 915 goto out;
1115 con->out_skip -= ret; 916 con->out_skip -= ret;
@@ -1161,10 +962,11 @@ static int prepare_read_message(struct ceph_connection *con)
1161 962
1162 963
1163static int read_partial(struct ceph_connection *con, 964static int read_partial(struct ceph_connection *con,
1164 int end, int size, void *object) 965 int *to, int size, void *object)
1165{ 966{
1166 while (con->in_base_pos < end) { 967 *to += size;
1167 int left = end - con->in_base_pos; 968 while (con->in_base_pos < *to) {
969 int left = *to - con->in_base_pos;
1168 int have = size - left; 970 int have = size - left;
1169 int ret = ceph_tcp_recvmsg(con->sock, object + have, left); 971 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
1170 if (ret <= 0) 972 if (ret <= 0)
@@ -1180,52 +982,37 @@ static int read_partial(struct ceph_connection *con,
1180 */ 982 */
1181static int read_partial_banner(struct ceph_connection *con) 983static int read_partial_banner(struct ceph_connection *con)
1182{ 984{
1183 int size; 985 int ret, to = 0;
1184 int end;
1185 int ret;
1186 986
1187 dout("read_partial_banner %p at %d\n", con, con->in_base_pos); 987 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
1188 988
1189 /* peer's banner */ 989 /* peer's banner */
1190 size = strlen(CEPH_BANNER); 990 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
1191 end = size;
1192 ret = read_partial(con, end, size, con->in_banner);
1193 if (ret <= 0) 991 if (ret <= 0)
1194 goto out; 992 goto out;
1195 993 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
1196 size = sizeof (con->actual_peer_addr); 994 &con->actual_peer_addr);
1197 end += size;
1198 ret = read_partial(con, end, size, &con->actual_peer_addr);
1199 if (ret <= 0) 995 if (ret <= 0)
1200 goto out; 996 goto out;
1201 997 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
1202 size = sizeof (con->peer_addr_for_me); 998 &con->peer_addr_for_me);
1203 end += size;
1204 ret = read_partial(con, end, size, &con->peer_addr_for_me);
1205 if (ret <= 0) 999 if (ret <= 0)
1206 goto out; 1000 goto out;
1207
1208out: 1001out:
1209 return ret; 1002 return ret;
1210} 1003}
1211 1004
1212static int read_partial_connect(struct ceph_connection *con) 1005static int read_partial_connect(struct ceph_connection *con)
1213{ 1006{
1214 int size; 1007 int ret, to = 0;
1215 int end;
1216 int ret;
1217 1008
1218 dout("read_partial_connect %p at %d\n", con, con->in_base_pos); 1009 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
1219 1010
1220 size = sizeof (con->in_reply); 1011 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
1221 end = size;
1222 ret = read_partial(con, end, size, &con->in_reply);
1223 if (ret <= 0) 1012 if (ret <= 0)
1224 goto out; 1013 goto out;
1225 1014 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
1226 size = le32_to_cpu(con->in_reply.authorizer_len); 1015 con->auth_reply_buf);
1227 end += size;
1228 ret = read_partial(con, end, size, con->auth_reply_buf);
1229 if (ret <= 0) 1016 if (ret <= 0)
1230 goto out; 1017 goto out;
1231 1018
@@ -1291,101 +1078,6 @@ static void addr_set_port(struct sockaddr_storage *ss, int p)
1291} 1078}
1292 1079
1293/* 1080/*
1294 * Unlike other *_pton function semantics, zero indicates success.
1295 */
1296static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss,
1297 char delim, const char **ipend)
1298{
1299 struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
1300 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
1301
1302 memset(ss, 0, sizeof(*ss));
1303
1304 if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) {
1305 ss->ss_family = AF_INET;
1306 return 0;
1307 }
1308
1309 if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) {
1310 ss->ss_family = AF_INET6;
1311 return 0;
1312 }
1313
1314 return -EINVAL;
1315}
1316
1317/*
1318 * Extract hostname string and resolve using kernel DNS facility.
1319 */
1320#ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER
1321static int ceph_dns_resolve_name(const char *name, size_t namelen,
1322 struct sockaddr_storage *ss, char delim, const char **ipend)
1323{
1324 const char *end, *delim_p;
1325 char *colon_p, *ip_addr = NULL;
1326 int ip_len, ret;
1327
1328 /*
1329 * The end of the hostname occurs immediately preceding the delimiter or
1330 * the port marker (':') where the delimiter takes precedence.
1331 */
1332 delim_p = memchr(name, delim, namelen);
1333 colon_p = memchr(name, ':', namelen);
1334
1335 if (delim_p && colon_p)
1336 end = delim_p < colon_p ? delim_p : colon_p;
1337 else if (!delim_p && colon_p)
1338 end = colon_p;
1339 else {
1340 end = delim_p;
1341 if (!end) /* case: hostname:/ */
1342 end = name + namelen;
1343 }
1344
1345 if (end <= name)
1346 return -EINVAL;
1347
1348 /* do dns_resolve upcall */
1349 ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL);
1350 if (ip_len > 0)
1351 ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL);
1352 else
1353 ret = -ESRCH;
1354
1355 kfree(ip_addr);
1356
1357 *ipend = end;
1358
1359 pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name,
1360 ret, ret ? "failed" : ceph_pr_addr(ss));
1361
1362 return ret;
1363}
1364#else
1365static inline int ceph_dns_resolve_name(const char *name, size_t namelen,
1366 struct sockaddr_storage *ss, char delim, const char **ipend)
1367{
1368 return -EINVAL;
1369}
1370#endif
1371
1372/*
1373 * Parse a server name (IP or hostname). If a valid IP address is not found
1374 * then try to extract a hostname to resolve using userspace DNS upcall.
1375 */
1376static int ceph_parse_server_name(const char *name, size_t namelen,
1377 struct sockaddr_storage *ss, char delim, const char **ipend)
1378{
1379 int ret;
1380
1381 ret = ceph_pton(name, namelen, ss, delim, ipend);
1382 if (ret)
1383 ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend);
1384
1385 return ret;
1386}
1387
1388/*
1389 * Parse an ip[:port] list into an addr array. Use the default 1081 * Parse an ip[:port] list into an addr array. Use the default
1390 * monitor port if a port isn't specified. 1082 * monitor port if a port isn't specified.
1391 */ 1083 */
@@ -1393,13 +1085,15 @@ int ceph_parse_ips(const char *c, const char *end,
1393 struct ceph_entity_addr *addr, 1085 struct ceph_entity_addr *addr,
1394 int max_count, int *count) 1086 int max_count, int *count)
1395{ 1087{
1396 int i, ret = -EINVAL; 1088 int i;
1397 const char *p = c; 1089 const char *p = c;
1398 1090
1399 dout("parse_ips on '%.*s'\n", (int)(end-c), c); 1091 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
1400 for (i = 0; i < max_count; i++) { 1092 for (i = 0; i < max_count; i++) {
1401 const char *ipend; 1093 const char *ipend;
1402 struct sockaddr_storage *ss = &addr[i].in_addr; 1094 struct sockaddr_storage *ss = &addr[i].in_addr;
1095 struct sockaddr_in *in4 = (void *)ss;
1096 struct sockaddr_in6 *in6 = (void *)ss;
1403 int port; 1097 int port;
1404 char delim = ','; 1098 char delim = ',';
1405 1099
@@ -1408,11 +1102,15 @@ int ceph_parse_ips(const char *c, const char *end,
1408 p++; 1102 p++;
1409 } 1103 }
1410 1104
1411 ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend); 1105 memset(ss, 0, sizeof(*ss));
1412 if (ret) 1106 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1107 delim, &ipend))
1108 ss->ss_family = AF_INET;
1109 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1110 delim, &ipend))
1111 ss->ss_family = AF_INET6;
1112 else
1413 goto bad; 1113 goto bad;
1414 ret = -EINVAL;
1415
1416 p = ipend; 1114 p = ipend;
1417 1115
1418 if (delim == ']') { 1116 if (delim == ']') {
@@ -1457,7 +1155,7 @@ int ceph_parse_ips(const char *c, const char *end,
1457 1155
1458bad: 1156bad:
1459 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); 1157 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1460 return ret; 1158 return -EINVAL;
1461} 1159}
1462EXPORT_SYMBOL(ceph_parse_ips); 1160EXPORT_SYMBOL(ceph_parse_ips);
1463 1161
@@ -1504,9 +1202,22 @@ static int process_banner(struct ceph_connection *con)
1504 ceph_pr_addr(&con->msgr->inst.addr.in_addr)); 1202 ceph_pr_addr(&con->msgr->inst.addr.in_addr));
1505 } 1203 }
1506 1204
1205 set_bit(NEGOTIATING, &con->state);
1206 prepare_read_connect(con);
1507 return 0; 1207 return 0;
1508} 1208}
1509 1209
1210static void fail_protocol(struct ceph_connection *con)
1211{
1212 reset_connection(con);
1213 set_bit(CLOSED, &con->state); /* in case there's queued work */
1214
1215 mutex_unlock(&con->mutex);
1216 if (con->ops->bad_proto)
1217 con->ops->bad_proto(con);
1218 mutex_lock(&con->mutex);
1219}
1220
1510static int process_connect(struct ceph_connection *con) 1221static int process_connect(struct ceph_connection *con)
1511{ 1222{
1512 u64 sup_feat = con->msgr->supported_features; 1223 u64 sup_feat = con->msgr->supported_features;
@@ -1524,7 +1235,7 @@ static int process_connect(struct ceph_connection *con)
1524 ceph_pr_addr(&con->peer_addr.in_addr), 1235 ceph_pr_addr(&con->peer_addr.in_addr),
1525 sup_feat, server_feat, server_feat & ~sup_feat); 1236 sup_feat, server_feat, server_feat & ~sup_feat);
1526 con->error_msg = "missing required protocol features"; 1237 con->error_msg = "missing required protocol features";
1527 reset_connection(con); 1238 fail_protocol(con);
1528 return -1; 1239 return -1;
1529 1240
1530 case CEPH_MSGR_TAG_BADPROTOVER: 1241 case CEPH_MSGR_TAG_BADPROTOVER:
@@ -1535,7 +1246,7 @@ static int process_connect(struct ceph_connection *con)
1535 le32_to_cpu(con->out_connect.protocol_version), 1246 le32_to_cpu(con->out_connect.protocol_version),
1536 le32_to_cpu(con->in_reply.protocol_version)); 1247 le32_to_cpu(con->in_reply.protocol_version));
1537 con->error_msg = "protocol version mismatch"; 1248 con->error_msg = "protocol version mismatch";
1538 reset_connection(con); 1249 fail_protocol(con);
1539 return -1; 1250 return -1;
1540 1251
1541 case CEPH_MSGR_TAG_BADAUTHORIZER: 1252 case CEPH_MSGR_TAG_BADAUTHORIZER:
@@ -1547,8 +1258,7 @@ static int process_connect(struct ceph_connection *con)
1547 return -1; 1258 return -1;
1548 } 1259 }
1549 con->auth_retry = 1; 1260 con->auth_retry = 1;
1550 con_out_kvec_reset(con); 1261 ret = prepare_write_connect(con->msgr, con, 0);
1551 ret = prepare_write_connect(con);
1552 if (ret < 0) 1262 if (ret < 0)
1553 return ret; 1263 return ret;
1554 prepare_read_connect(con); 1264 prepare_read_connect(con);
@@ -1563,15 +1273,12 @@ static int process_connect(struct ceph_connection *con)
1563 * dropped messages. 1273 * dropped messages.
1564 */ 1274 */
1565 dout("process_connect got RESET peer seq %u\n", 1275 dout("process_connect got RESET peer seq %u\n",
1566 le32_to_cpu(con->in_reply.connect_seq)); 1276 le32_to_cpu(con->in_connect.connect_seq));
1567 pr_err("%s%lld %s connection reset\n", 1277 pr_err("%s%lld %s connection reset\n",
1568 ENTITY_NAME(con->peer_name), 1278 ENTITY_NAME(con->peer_name),
1569 ceph_pr_addr(&con->peer_addr.in_addr)); 1279 ceph_pr_addr(&con->peer_addr.in_addr));
1570 reset_connection(con); 1280 reset_connection(con);
1571 con_out_kvec_reset(con); 1281 prepare_write_connect(con->msgr, con, 0);
1572 ret = prepare_write_connect(con);
1573 if (ret < 0)
1574 return ret;
1575 prepare_read_connect(con); 1282 prepare_read_connect(con);
1576 1283
1577 /* Tell ceph about it. */ 1284 /* Tell ceph about it. */
@@ -1580,7 +1287,8 @@ static int process_connect(struct ceph_connection *con)
1580 if (con->ops->peer_reset) 1287 if (con->ops->peer_reset)
1581 con->ops->peer_reset(con); 1288 con->ops->peer_reset(con);
1582 mutex_lock(&con->mutex); 1289 mutex_lock(&con->mutex);
1583 if (con->state != CON_STATE_NEGOTIATING) 1290 if (test_bit(CLOSED, &con->state) ||
1291 test_bit(OPENING, &con->state))
1584 return -EAGAIN; 1292 return -EAGAIN;
1585 break; 1293 break;
1586 1294
@@ -1589,14 +1297,11 @@ static int process_connect(struct ceph_connection *con)
1589 * If we sent a smaller connect_seq than the peer has, try 1297 * If we sent a smaller connect_seq than the peer has, try
1590 * again with a larger value. 1298 * again with a larger value.
1591 */ 1299 */
1592 dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", 1300 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1593 le32_to_cpu(con->out_connect.connect_seq), 1301 le32_to_cpu(con->out_connect.connect_seq),
1594 le32_to_cpu(con->in_reply.connect_seq)); 1302 le32_to_cpu(con->in_connect.connect_seq));
1595 con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); 1303 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1596 con_out_kvec_reset(con); 1304 prepare_write_connect(con->msgr, con, 0);
1597 ret = prepare_write_connect(con);
1598 if (ret < 0)
1599 return ret;
1600 prepare_read_connect(con); 1305 prepare_read_connect(con);
1601 break; 1306 break;
1602 1307
@@ -1607,13 +1312,10 @@ static int process_connect(struct ceph_connection *con)
1607 */ 1312 */
1608 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", 1313 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1609 con->peer_global_seq, 1314 con->peer_global_seq,
1610 le32_to_cpu(con->in_reply.global_seq)); 1315 le32_to_cpu(con->in_connect.global_seq));
1611 get_global_seq(con->msgr, 1316 get_global_seq(con->msgr,
1612 le32_to_cpu(con->in_reply.global_seq)); 1317 le32_to_cpu(con->in_connect.global_seq));
1613 con_out_kvec_reset(con); 1318 prepare_write_connect(con->msgr, con, 0);
1614 ret = prepare_write_connect(con);
1615 if (ret < 0)
1616 return ret;
1617 prepare_read_connect(con); 1319 prepare_read_connect(con);
1618 break; 1320 break;
1619 1321
@@ -1625,13 +1327,10 @@ static int process_connect(struct ceph_connection *con)
1625 ceph_pr_addr(&con->peer_addr.in_addr), 1327 ceph_pr_addr(&con->peer_addr.in_addr),
1626 req_feat, server_feat, req_feat & ~server_feat); 1328 req_feat, server_feat, req_feat & ~server_feat);
1627 con->error_msg = "missing required protocol features"; 1329 con->error_msg = "missing required protocol features";
1628 reset_connection(con); 1330 fail_protocol(con);
1629 return -1; 1331 return -1;
1630 } 1332 }
1631 1333 clear_bit(CONNECTING, &con->state);
1632 WARN_ON(con->state != CON_STATE_NEGOTIATING);
1633 con->state = CON_STATE_OPEN;
1634
1635 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 1334 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1636 con->connect_seq++; 1335 con->connect_seq++;
1637 con->peer_features = server_feat; 1336 con->peer_features = server_feat;
@@ -1643,9 +1342,7 @@ static int process_connect(struct ceph_connection *con)
1643 le32_to_cpu(con->in_reply.connect_seq)); 1342 le32_to_cpu(con->in_reply.connect_seq));
1644 1343
1645 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) 1344 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1646 set_bit(CON_FLAG_LOSSYTX, &con->flags); 1345 set_bit(LOSSYTX, &con->state);
1647
1648 con->delay = 0; /* reset backoff memory */
1649 1346
1650 prepare_read_tag(con); 1347 prepare_read_tag(con);
1651 break; 1348 break;
@@ -1675,10 +1372,10 @@ static int process_connect(struct ceph_connection *con)
1675 */ 1372 */
1676static int read_partial_ack(struct ceph_connection *con) 1373static int read_partial_ack(struct ceph_connection *con)
1677{ 1374{
1678 int size = sizeof (con->in_temp_ack); 1375 int to = 0;
1679 int end = size;
1680 1376
1681 return read_partial(con, end, size, &con->in_temp_ack); 1377 return read_partial(con, &to, sizeof(con->in_temp_ack),
1378 &con->in_temp_ack);
1682} 1379}
1683 1380
1684 1381
@@ -1724,18 +1421,22 @@ static int read_partial_message_section(struct ceph_connection *con,
1724 if (ret <= 0) 1421 if (ret <= 0)
1725 return ret; 1422 return ret;
1726 section->iov_len += ret; 1423 section->iov_len += ret;
1424 if (section->iov_len == sec_len)
1425 *crc = crc32c(0, section->iov_base,
1426 section->iov_len);
1727 } 1427 }
1728 if (section->iov_len == sec_len)
1729 *crc = crc32c(0, section->iov_base, section->iov_len);
1730 1428
1731 return 1; 1429 return 1;
1732} 1430}
1733 1431
1734static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); 1432static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1433 struct ceph_msg_header *hdr,
1434 int *skip);
1435
1735 1436
1736static int read_partial_message_pages(struct ceph_connection *con, 1437static int read_partial_message_pages(struct ceph_connection *con,
1737 struct page **pages, 1438 struct page **pages,
1738 unsigned int data_len, bool do_datacrc) 1439 unsigned data_len, int datacrc)
1739{ 1440{
1740 void *p; 1441 void *p;
1741 int ret; 1442 int ret;
@@ -1748,7 +1449,7 @@ static int read_partial_message_pages(struct ceph_connection *con,
1748 p = kmap(pages[con->in_msg_pos.page]); 1449 p = kmap(pages[con->in_msg_pos.page]);
1749 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, 1450 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1750 left); 1451 left);
1751 if (ret > 0 && do_datacrc) 1452 if (ret > 0 && datacrc)
1752 con->in_data_crc = 1453 con->in_data_crc =
1753 crc32c(con->in_data_crc, 1454 crc32c(con->in_data_crc,
1754 p + con->in_msg_pos.page_pos, ret); 1455 p + con->in_msg_pos.page_pos, ret);
@@ -1768,12 +1469,15 @@ static int read_partial_message_pages(struct ceph_connection *con,
1768#ifdef CONFIG_BLOCK 1469#ifdef CONFIG_BLOCK
1769static int read_partial_message_bio(struct ceph_connection *con, 1470static int read_partial_message_bio(struct ceph_connection *con,
1770 struct bio **bio_iter, int *bio_seg, 1471 struct bio **bio_iter, int *bio_seg,
1771 unsigned int data_len, bool do_datacrc) 1472 unsigned data_len, int datacrc)
1772{ 1473{
1773 struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); 1474 struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
1774 void *p; 1475 void *p;
1775 int ret, left; 1476 int ret, left;
1776 1477
1478 if (IS_ERR(bv))
1479 return PTR_ERR(bv);
1480
1777 left = min((int)(data_len - con->in_msg_pos.data_pos), 1481 left = min((int)(data_len - con->in_msg_pos.data_pos),
1778 (int)(bv->bv_len - con->in_msg_pos.page_pos)); 1482 (int)(bv->bv_len - con->in_msg_pos.page_pos));
1779 1483
@@ -1781,7 +1485,7 @@ static int read_partial_message_bio(struct ceph_connection *con,
1781 1485
1782 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, 1486 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1783 left); 1487 left);
1784 if (ret > 0 && do_datacrc) 1488 if (ret > 0 && datacrc)
1785 con->in_data_crc = 1489 con->in_data_crc =
1786 crc32c(con->in_data_crc, 1490 crc32c(con->in_data_crc,
1787 p + con->in_msg_pos.page_pos, ret); 1491 p + con->in_msg_pos.page_pos, ret);
@@ -1805,31 +1509,35 @@ static int read_partial_message_bio(struct ceph_connection *con,
1805static int read_partial_message(struct ceph_connection *con) 1509static int read_partial_message(struct ceph_connection *con)
1806{ 1510{
1807 struct ceph_msg *m = con->in_msg; 1511 struct ceph_msg *m = con->in_msg;
1808 int size;
1809 int end;
1810 int ret; 1512 int ret;
1811 unsigned int front_len, middle_len, data_len; 1513 int to, left;
1812 bool do_datacrc = !con->msgr->nocrc; 1514 unsigned front_len, middle_len, data_len;
1515 int datacrc = con->msgr->nocrc;
1516 int skip;
1813 u64 seq; 1517 u64 seq;
1814 u32 crc;
1815 1518
1816 dout("read_partial_message con %p msg %p\n", con, m); 1519 dout("read_partial_message con %p msg %p\n", con, m);
1817 1520
1818 /* header */ 1521 /* header */
1819 size = sizeof (con->in_hdr); 1522 while (con->in_base_pos < sizeof(con->in_hdr)) {
1820 end = size; 1523 left = sizeof(con->in_hdr) - con->in_base_pos;
1821 ret = read_partial(con, end, size, &con->in_hdr); 1524 ret = ceph_tcp_recvmsg(con->sock,
1822 if (ret <= 0) 1525 (char *)&con->in_hdr + con->in_base_pos,
1823 return ret; 1526 left);
1824 1527 if (ret <= 0)
1825 crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); 1528 return ret;
1826 if (cpu_to_le32(crc) != con->in_hdr.crc) { 1529 con->in_base_pos += ret;
1827 pr_err("read_partial_message bad hdr " 1530 if (con->in_base_pos == sizeof(con->in_hdr)) {
1828 " crc %u != expected %u\n", 1531 u32 crc = crc32c(0, (void *)&con->in_hdr,
1829 crc, con->in_hdr.crc); 1532 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1830 return -EBADMSG; 1533 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1534 pr_err("read_partial_message bad hdr "
1535 " crc %u != expected %u\n",
1536 crc, con->in_hdr.crc);
1537 return -EBADMSG;
1538 }
1539 }
1831 } 1540 }
1832
1833 front_len = le32_to_cpu(con->in_hdr.front_len); 1541 front_len = le32_to_cpu(con->in_hdr.front_len);
1834 if (front_len > CEPH_MSG_MAX_FRONT_LEN) 1542 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1835 return -EIO; 1543 return -EIO;
@@ -1860,13 +1568,10 @@ static int read_partial_message(struct ceph_connection *con)
1860 1568
1861 /* allocate message? */ 1569 /* allocate message? */
1862 if (!con->in_msg) { 1570 if (!con->in_msg) {
1863 int skip = 0;
1864
1865 dout("got hdr type %d front %d data %d\n", con->in_hdr.type, 1571 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1866 con->in_hdr.front_len, con->in_hdr.data_len); 1572 con->in_hdr.front_len, con->in_hdr.data_len);
1867 ret = ceph_con_in_msg_alloc(con, &skip); 1573 skip = 0;
1868 if (ret < 0) 1574 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1869 return ret;
1870 if (skip) { 1575 if (skip) {
1871 /* skip this message */ 1576 /* skip this message */
1872 dout("alloc_msg said skip message\n"); 1577 dout("alloc_msg said skip message\n");
@@ -1877,9 +1582,11 @@ static int read_partial_message(struct ceph_connection *con)
1877 con->in_seq++; 1582 con->in_seq++;
1878 return 0; 1583 return 0;
1879 } 1584 }
1880 1585 if (!con->in_msg) {
1881 BUG_ON(!con->in_msg); 1586 con->error_msg =
1882 BUG_ON(con->in_msg->con != con); 1587 "error allocating memory for incoming message";
1588 return -ENOMEM;
1589 }
1883 m = con->in_msg; 1590 m = con->in_msg;
1884 m->front.iov_len = 0; /* haven't read it yet */ 1591 m->front.iov_len = 0; /* haven't read it yet */
1885 if (m->middle) 1592 if (m->middle)
@@ -1891,11 +1598,6 @@ static int read_partial_message(struct ceph_connection *con)
1891 else 1598 else
1892 con->in_msg_pos.page_pos = 0; 1599 con->in_msg_pos.page_pos = 0;
1893 con->in_msg_pos.data_pos = 0; 1600 con->in_msg_pos.data_pos = 0;
1894
1895#ifdef CONFIG_BLOCK
1896 if (m->bio)
1897 init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
1898#endif
1899 } 1601 }
1900 1602
1901 /* front */ 1603 /* front */
@@ -1912,20 +1614,24 @@ static int read_partial_message(struct ceph_connection *con)
1912 if (ret <= 0) 1614 if (ret <= 0)
1913 return ret; 1615 return ret;
1914 } 1616 }
1617#ifdef CONFIG_BLOCK
1618 if (m->bio && !m->bio_iter)
1619 init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
1620#endif
1915 1621
1916 /* (page) data */ 1622 /* (page) data */
1917 while (con->in_msg_pos.data_pos < data_len) { 1623 while (con->in_msg_pos.data_pos < data_len) {
1918 if (m->pages) { 1624 if (m->pages) {
1919 ret = read_partial_message_pages(con, m->pages, 1625 ret = read_partial_message_pages(con, m->pages,
1920 data_len, do_datacrc); 1626 data_len, datacrc);
1921 if (ret <= 0) 1627 if (ret <= 0)
1922 return ret; 1628 return ret;
1923#ifdef CONFIG_BLOCK 1629#ifdef CONFIG_BLOCK
1924 } else if (m->bio) { 1630 } else if (m->bio) {
1925 BUG_ON(!m->bio_iter); 1631
1926 ret = read_partial_message_bio(con, 1632 ret = read_partial_message_bio(con,
1927 &m->bio_iter, &m->bio_seg, 1633 &m->bio_iter, &m->bio_seg,
1928 data_len, do_datacrc); 1634 data_len, datacrc);
1929 if (ret <= 0) 1635 if (ret <= 0)
1930 return ret; 1636 return ret;
1931#endif 1637#endif
@@ -1935,12 +1641,16 @@ static int read_partial_message(struct ceph_connection *con)
1935 } 1641 }
1936 1642
1937 /* footer */ 1643 /* footer */
1938 size = sizeof (m->footer); 1644 to = sizeof(m->hdr) + sizeof(m->footer);
1939 end += size; 1645 while (con->in_base_pos < to) {
1940 ret = read_partial(con, end, size, &m->footer); 1646 left = to - con->in_base_pos;
1941 if (ret <= 0) 1647 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1942 return ret; 1648 (con->in_base_pos - sizeof(m->hdr)),
1943 1649 left);
1650 if (ret <= 0)
1651 return ret;
1652 con->in_base_pos += ret;
1653 }
1944 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", 1654 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1945 m, front_len, m->footer.front_crc, middle_len, 1655 m, front_len, m->footer.front_crc, middle_len,
1946 m->footer.middle_crc, data_len, m->footer.data_crc); 1656 m->footer.middle_crc, data_len, m->footer.data_crc);
@@ -1956,7 +1666,7 @@ static int read_partial_message(struct ceph_connection *con)
1956 m, con->in_middle_crc, m->footer.middle_crc); 1666 m, con->in_middle_crc, m->footer.middle_crc);
1957 return -EBADMSG; 1667 return -EBADMSG;
1958 } 1668 }
1959 if (do_datacrc && 1669 if (datacrc &&
1960 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && 1670 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1961 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { 1671 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1962 pr_err("read_partial_message %p data crc %u != exp. %u\n", m, 1672 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
@@ -1976,11 +1686,8 @@ static void process_message(struct ceph_connection *con)
1976{ 1686{
1977 struct ceph_msg *msg; 1687 struct ceph_msg *msg;
1978 1688
1979 BUG_ON(con->in_msg->con != con);
1980 con->in_msg->con = NULL;
1981 msg = con->in_msg; 1689 msg = con->in_msg;
1982 con->in_msg = NULL; 1690 con->in_msg = NULL;
1983 con->ops->put(con);
1984 1691
1985 /* if first message, set peer_name */ 1692 /* if first message, set peer_name */
1986 if (con->peer_name.type == 0) 1693 if (con->peer_name.type == 0)
@@ -2000,6 +1707,7 @@ static void process_message(struct ceph_connection *con)
2000 con->ops->dispatch(con, msg); 1707 con->ops->dispatch(con, msg);
2001 1708
2002 mutex_lock(&con->mutex); 1709 mutex_lock(&con->mutex);
1710 prepare_read_tag(con);
2003} 1711}
2004 1712
2005 1713
@@ -2009,29 +1717,32 @@ static void process_message(struct ceph_connection *con)
2009 */ 1717 */
2010static int try_write(struct ceph_connection *con) 1718static int try_write(struct ceph_connection *con)
2011{ 1719{
1720 struct ceph_messenger *msgr = con->msgr;
2012 int ret = 1; 1721 int ret = 1;
2013 1722
2014 dout("try_write start %p state %lu\n", con, con->state); 1723 dout("try_write start %p state %lu nref %d\n", con, con->state,
1724 atomic_read(&con->nref));
2015 1725
2016more: 1726more:
2017 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 1727 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
2018 1728
2019 /* open the socket first? */ 1729 /* open the socket first? */
2020 if (con->state == CON_STATE_PREOPEN) { 1730 if (con->sock == NULL) {
2021 BUG_ON(con->sock); 1731 prepare_write_banner(msgr, con);
2022 con->state = CON_STATE_CONNECTING; 1732 prepare_write_connect(msgr, con, 1);
2023
2024 con_out_kvec_reset(con);
2025 prepare_write_banner(con);
2026 prepare_read_banner(con); 1733 prepare_read_banner(con);
1734 set_bit(CONNECTING, &con->state);
1735 clear_bit(NEGOTIATING, &con->state);
2027 1736
2028 BUG_ON(con->in_msg); 1737 BUG_ON(con->in_msg);
2029 con->in_tag = CEPH_MSGR_TAG_READY; 1738 con->in_tag = CEPH_MSGR_TAG_READY;
2030 dout("try_write initiating connect on %p new state %lu\n", 1739 dout("try_write initiating connect on %p new state %lu\n",
2031 con, con->state); 1740 con, con->state);
2032 ret = ceph_tcp_connect(con); 1741 con->sock = ceph_tcp_connect(con);
2033 if (ret < 0) { 1742 if (IS_ERR(con->sock)) {
1743 con->sock = NULL;
2034 con->error_msg = "connect error"; 1744 con->error_msg = "connect error";
1745 ret = -1;
2035 goto out; 1746 goto out;
2036 } 1747 }
2037 } 1748 }
@@ -2070,7 +1781,7 @@ more_kvec:
2070 } 1781 }
2071 1782
2072do_next: 1783do_next:
2073 if (con->state == CON_STATE_OPEN) { 1784 if (!test_bit(CONNECTING, &con->state)) {
2074 /* is anything else pending? */ 1785 /* is anything else pending? */
2075 if (!list_empty(&con->out_queue)) { 1786 if (!list_empty(&con->out_queue)) {
2076 prepare_write_message(con); 1787 prepare_write_message(con);
@@ -2080,15 +1791,14 @@ do_next:
2080 prepare_write_ack(con); 1791 prepare_write_ack(con);
2081 goto more; 1792 goto more;
2082 } 1793 }
2083 if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING, 1794 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
2084 &con->flags)) {
2085 prepare_write_keepalive(con); 1795 prepare_write_keepalive(con);
2086 goto more; 1796 goto more;
2087 } 1797 }
2088 } 1798 }
2089 1799
2090 /* Nothing to do! */ 1800 /* Nothing to do! */
2091 clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); 1801 clear_bit(WRITE_PENDING, &con->state);
2092 dout("try_write nothing else to write.\n"); 1802 dout("try_write nothing else to write.\n");
2093 ret = 0; 1803 ret = 0;
2094out: 1804out:
@@ -2105,45 +1815,38 @@ static int try_read(struct ceph_connection *con)
2105{ 1815{
2106 int ret = -1; 1816 int ret = -1;
2107 1817
2108more: 1818 if (!con->sock)
2109 dout("try_read start on %p state %lu\n", con, con->state); 1819 return 0;
2110 if (con->state != CON_STATE_CONNECTING && 1820
2111 con->state != CON_STATE_NEGOTIATING && 1821 if (test_bit(STANDBY, &con->state))
2112 con->state != CON_STATE_OPEN)
2113 return 0; 1822 return 0;
2114 1823
2115 BUG_ON(!con->sock); 1824 dout("try_read start on %p\n", con);
2116 1825
1826more:
2117 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, 1827 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
2118 con->in_base_pos); 1828 con->in_base_pos);
2119 1829
2120 if (con->state == CON_STATE_CONNECTING) { 1830 /*
2121 dout("try_read connecting\n"); 1831 * process_connect and process_message drop and re-take
2122 ret = read_partial_banner(con); 1832 * con->mutex. make sure we handle a racing close or reopen.
2123 if (ret <= 0) 1833 */
2124 goto out; 1834 if (test_bit(CLOSED, &con->state) ||
2125 ret = process_banner(con); 1835 test_bit(OPENING, &con->state)) {
2126 if (ret < 0) 1836 ret = -EAGAIN;
2127 goto out;
2128
2129 con->state = CON_STATE_NEGOTIATING;
2130
2131 /*
2132 * Received banner is good, exchange connection info.
2133 * Do not reset out_kvec, as sending our banner raced
2134 * with receiving peer banner after connect completed.
2135 */
2136 ret = prepare_write_connect(con);
2137 if (ret < 0)
2138 goto out;
2139 prepare_read_connect(con);
2140
2141 /* Send connection info before awaiting response */
2142 goto out; 1837 goto out;
2143 } 1838 }
2144 1839
2145 if (con->state == CON_STATE_NEGOTIATING) { 1840 if (test_bit(CONNECTING, &con->state)) {
2146 dout("try_read negotiating\n"); 1841 if (!test_bit(NEGOTIATING, &con->state)) {
1842 dout("try_read connecting\n");
1843 ret = read_partial_banner(con);
1844 if (ret <= 0)
1845 goto out;
1846 ret = process_banner(con);
1847 if (ret < 0)
1848 goto out;
1849 }
2147 ret = read_partial_connect(con); 1850 ret = read_partial_connect(con);
2148 if (ret <= 0) 1851 if (ret <= 0)
2149 goto out; 1852 goto out;
@@ -2153,17 +1856,14 @@ more:
2153 goto more; 1856 goto more;
2154 } 1857 }
2155 1858
2156 WARN_ON(con->state != CON_STATE_OPEN);
2157
2158 if (con->in_base_pos < 0) { 1859 if (con->in_base_pos < 0) {
2159 /* 1860 /*
2160 * skipping + discarding content. 1861 * skipping + discarding content.
2161 * 1862 *
2162 * FIXME: there must be a better way to do this! 1863 * FIXME: there must be a better way to do this!
2163 */ 1864 */
2164 static char buf[SKIP_BUF_SIZE]; 1865 static char buf[1024];
2165 int skip = min((int) sizeof (buf), -con->in_base_pos); 1866 int skip = min(1024, -con->in_base_pos);
2166
2167 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); 1867 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
2168 ret = ceph_tcp_recvmsg(con->sock, buf, skip); 1868 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
2169 if (ret <= 0) 1869 if (ret <= 0)
@@ -2188,8 +1888,7 @@ more:
2188 prepare_read_ack(con); 1888 prepare_read_ack(con);
2189 break; 1889 break;
2190 case CEPH_MSGR_TAG_CLOSE: 1890 case CEPH_MSGR_TAG_CLOSE:
2191 con_close_socket(con); 1891 set_bit(CLOSED, &con->state); /* fixme */
2192 con->state = CON_STATE_CLOSED;
2193 goto out; 1892 goto out;
2194 default: 1893 default:
2195 goto bad_tag; 1894 goto bad_tag;
@@ -2212,8 +1911,6 @@ more:
2212 if (con->in_tag == CEPH_MSGR_TAG_READY) 1911 if (con->in_tag == CEPH_MSGR_TAG_READY)
2213 goto more; 1912 goto more;
2214 process_message(con); 1913 process_message(con);
2215 if (con->state == CON_STATE_OPEN)
2216 prepare_read_tag(con);
2217 goto more; 1914 goto more;
2218 } 1915 }
2219 if (con->in_tag == CEPH_MSGR_TAG_ACK) { 1916 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
@@ -2237,62 +1934,28 @@ bad_tag:
2237 1934
2238 1935
2239/* 1936/*
2240 * Atomically queue work on a connection after the specified delay. 1937 * Atomically queue work on a connection. Bump @con reference to
2241 * Bump @con reference to avoid races with connection teardown. 1938 * avoid races with connection teardown.
2242 * Returns 0 if work was queued, or an error code otherwise.
2243 */ 1939 */
2244static int queue_con_delay(struct ceph_connection *con, unsigned long delay) 1940static void queue_con(struct ceph_connection *con)
2245{ 1941{
2246 if (!con->ops->get(con)) { 1942 if (test_bit(DEAD, &con->state)) {
2247 dout("%s %p ref count 0\n", __func__, con); 1943 dout("queue_con %p ignoring: DEAD\n",
2248 1944 con);
2249 return -ENOENT; 1945 return;
2250 } 1946 }
2251 1947
2252 if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { 1948 if (!con->ops->get(con)) {
2253 dout("%s %p - already queued\n", __func__, con); 1949 dout("queue_con %p ref count 0\n", con);
2254 con->ops->put(con); 1950 return;
2255
2256 return -EBUSY;
2257 } 1951 }
2258 1952
2259 dout("%s %p %lu\n", __func__, con, delay); 1953 if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) {
2260 1954 dout("queue_con %p - already queued\n", con);
2261 return 0; 1955 con->ops->put(con);
2262} 1956 } else {
2263 1957 dout("queue_con %p\n", con);
2264static void queue_con(struct ceph_connection *con)
2265{
2266 (void) queue_con_delay(con, 0);
2267}
2268
2269static bool con_sock_closed(struct ceph_connection *con)
2270{
2271 if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags))
2272 return false;
2273
2274#define CASE(x) \
2275 case CON_STATE_ ## x: \
2276 con->error_msg = "socket closed (con state " #x ")"; \
2277 break;
2278
2279 switch (con->state) {
2280 CASE(CLOSED);
2281 CASE(PREOPEN);
2282 CASE(CONNECTING);
2283 CASE(NEGOTIATING);
2284 CASE(OPEN);
2285 CASE(STANDBY);
2286 default:
2287 pr_warning("%s con %p unrecognized state %lu\n",
2288 __func__, con, con->state);
2289 con->error_msg = "unrecognized con state";
2290 BUG();
2291 break;
2292 } 1958 }
2293#undef CASE
2294
2295 return true;
2296} 1959}
2297 1960
2298/* 1961/*
@@ -2306,50 +1969,49 @@ static void con_work(struct work_struct *work)
2306 1969
2307 mutex_lock(&con->mutex); 1970 mutex_lock(&con->mutex);
2308restart: 1971restart:
2309 if (con_sock_closed(con)) 1972 if (test_and_clear_bit(BACKOFF, &con->state)) {
2310 goto fault;
2311
2312 if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
2313 dout("con_work %p backing off\n", con); 1973 dout("con_work %p backing off\n", con);
2314 ret = queue_con_delay(con, round_jiffies_relative(con->delay)); 1974 if (queue_delayed_work(ceph_msgr_wq, &con->work,
2315 if (ret) { 1975 round_jiffies_relative(con->delay))) {
1976 dout("con_work %p backoff %lu\n", con, con->delay);
1977 mutex_unlock(&con->mutex);
1978 return;
1979 } else {
1980 con->ops->put(con);
2316 dout("con_work %p FAILED to back off %lu\n", con, 1981 dout("con_work %p FAILED to back off %lu\n", con,
2317 con->delay); 1982 con->delay);
2318 BUG_ON(ret == -ENOENT);
2319 set_bit(CON_FLAG_BACKOFF, &con->flags);
2320 } 1983 }
2321 goto done;
2322 } 1984 }
2323 1985
2324 if (con->state == CON_STATE_STANDBY) { 1986 if (test_bit(STANDBY, &con->state)) {
2325 dout("con_work %p STANDBY\n", con); 1987 dout("con_work %p STANDBY\n", con);
2326 goto done; 1988 goto done;
2327 } 1989 }
2328 if (con->state == CON_STATE_CLOSED) { 1990 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
2329 dout("con_work %p CLOSED\n", con); 1991 dout("con_work CLOSED\n");
2330 BUG_ON(con->sock); 1992 con_close_socket(con);
2331 goto done; 1993 goto done;
2332 } 1994 }
2333 if (con->state == CON_STATE_PREOPEN) { 1995 if (test_and_clear_bit(OPENING, &con->state)) {
1996 /* reopen w/ new peer */
2334 dout("con_work OPENING\n"); 1997 dout("con_work OPENING\n");
2335 BUG_ON(con->sock); 1998 con_close_socket(con);
2336 } 1999 }
2337 2000
2001 if (test_and_clear_bit(SOCK_CLOSED, &con->state))
2002 goto fault;
2003
2338 ret = try_read(con); 2004 ret = try_read(con);
2339 if (ret == -EAGAIN) 2005 if (ret == -EAGAIN)
2340 goto restart; 2006 goto restart;
2341 if (ret < 0) { 2007 if (ret < 0)
2342 con->error_msg = "socket error on read";
2343 goto fault; 2008 goto fault;
2344 }
2345 2009
2346 ret = try_write(con); 2010 ret = try_write(con);
2347 if (ret == -EAGAIN) 2011 if (ret == -EAGAIN)
2348 goto restart; 2012 goto restart;
2349 if (ret < 0) { 2013 if (ret < 0)
2350 con->error_msg = "socket error on write";
2351 goto fault; 2014 goto fault;
2352 }
2353 2015
2354done: 2016done:
2355 mutex_unlock(&con->mutex); 2017 mutex_unlock(&con->mutex);
@@ -2358,6 +2020,7 @@ done_unlocked:
2358 return; 2020 return;
2359 2021
2360fault: 2022fault:
2023 mutex_unlock(&con->mutex);
2361 ceph_fault(con); /* error/fault path */ 2024 ceph_fault(con); /* error/fault path */
2362 goto done_unlocked; 2025 goto done_unlocked;
2363} 2026}
@@ -2368,31 +2031,26 @@ fault:
2368 * exponential backoff 2031 * exponential backoff
2369 */ 2032 */
2370static void ceph_fault(struct ceph_connection *con) 2033static void ceph_fault(struct ceph_connection *con)
2371 __releases(con->mutex)
2372{ 2034{
2373 pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), 2035 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
2374 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); 2036 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
2375 dout("fault %p state %lu to peer %s\n", 2037 dout("fault %p state %lu to peer %s\n",
2376 con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); 2038 con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
2377 2039
2378 WARN_ON(con->state != CON_STATE_CONNECTING && 2040 if (test_bit(LOSSYTX, &con->state)) {
2379 con->state != CON_STATE_NEGOTIATING && 2041 dout("fault on LOSSYTX channel\n");
2380 con->state != CON_STATE_OPEN); 2042 goto out;
2381 2043 }
2382 con_close_socket(con);
2383 2044
2384 if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) { 2045 mutex_lock(&con->mutex);
2385 dout("fault on LOSSYTX channel, marking CLOSED\n"); 2046 if (test_bit(CLOSED, &con->state))
2386 con->state = CON_STATE_CLOSED;
2387 goto out_unlock; 2047 goto out_unlock;
2388 } 2048
2049 con_close_socket(con);
2389 2050
2390 if (con->in_msg) { 2051 if (con->in_msg) {
2391 BUG_ON(con->in_msg->con != con);
2392 con->in_msg->con = NULL;
2393 ceph_msg_put(con->in_msg); 2052 ceph_msg_put(con->in_msg);
2394 con->in_msg = NULL; 2053 con->in_msg = NULL;
2395 con->ops->put(con);
2396 } 2054 }
2397 2055
2398 /* Requeue anything that hasn't been acked */ 2056 /* Requeue anything that hasn't been acked */
@@ -2401,23 +2059,39 @@ static void ceph_fault(struct ceph_connection *con)
2401 /* If there are no messages queued or keepalive pending, place 2059 /* If there are no messages queued or keepalive pending, place
2402 * the connection in a STANDBY state */ 2060 * the connection in a STANDBY state */
2403 if (list_empty(&con->out_queue) && 2061 if (list_empty(&con->out_queue) &&
2404 !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) { 2062 !test_bit(KEEPALIVE_PENDING, &con->state)) {
2405 dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); 2063 dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
2406 clear_bit(CON_FLAG_WRITE_PENDING, &con->flags); 2064 clear_bit(WRITE_PENDING, &con->state);
2407 con->state = CON_STATE_STANDBY; 2065 set_bit(STANDBY, &con->state);
2408 } else { 2066 } else {
2409 /* retry after a delay. */ 2067 /* retry after a delay. */
2410 con->state = CON_STATE_PREOPEN;
2411 if (con->delay == 0) 2068 if (con->delay == 0)
2412 con->delay = BASE_DELAY_INTERVAL; 2069 con->delay = BASE_DELAY_INTERVAL;
2413 else if (con->delay < MAX_DELAY_INTERVAL) 2070 else if (con->delay < MAX_DELAY_INTERVAL)
2414 con->delay *= 2; 2071 con->delay *= 2;
2415 set_bit(CON_FLAG_BACKOFF, &con->flags); 2072 con->ops->get(con);
2416 queue_con(con); 2073 if (queue_delayed_work(ceph_msgr_wq, &con->work,
2074 round_jiffies_relative(con->delay))) {
2075 dout("fault queued %p delay %lu\n", con, con->delay);
2076 } else {
2077 con->ops->put(con);
2078 dout("fault failed to queue %p delay %lu, backoff\n",
2079 con, con->delay);
2080 /*
2081 * In many cases we see a socket state change
2082 * while con_work is running and end up
2083 * queuing (non-delayed) work, such that we
2084 * can't backoff with a delay. Set a flag so
2085 * that when con_work restarts we schedule the
2086 * delay then.
2087 */
2088 set_bit(BACKOFF, &con->state);
2089 }
2417 } 2090 }
2418 2091
2419out_unlock: 2092out_unlock:
2420 mutex_unlock(&con->mutex); 2093 mutex_unlock(&con->mutex);
2094out:
2421 /* 2095 /*
2422 * in case we faulted due to authentication, invalidate our 2096 * in case we faulted due to authentication, invalidate our
2423 * current tickets so that we can get new ones. 2097 * current tickets so that we can get new ones.
@@ -2434,19 +2108,32 @@ out_unlock:
2434 2108
2435 2109
2436/* 2110/*
2437 * initialize a new messenger instance 2111 * create a new messenger instance
2438 */ 2112 */
2439void ceph_messenger_init(struct ceph_messenger *msgr, 2113struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
2440 struct ceph_entity_addr *myaddr, 2114 u32 supported_features,
2441 u32 supported_features, 2115 u32 required_features)
2442 u32 required_features,
2443 bool nocrc)
2444{ 2116{
2117 struct ceph_messenger *msgr;
2118
2119 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
2120 if (msgr == NULL)
2121 return ERR_PTR(-ENOMEM);
2122
2445 msgr->supported_features = supported_features; 2123 msgr->supported_features = supported_features;
2446 msgr->required_features = required_features; 2124 msgr->required_features = required_features;
2447 2125
2448 spin_lock_init(&msgr->global_seq_lock); 2126 spin_lock_init(&msgr->global_seq_lock);
2449 2127
2128 /* the zero page is needed if a request is "canceled" while the message
2129 * is being written over the socket */
2130 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
2131 if (!msgr->zero_page) {
2132 kfree(msgr);
2133 return ERR_PTR(-ENOMEM);
2134 }
2135 kmap(msgr->zero_page);
2136
2450 if (myaddr) 2137 if (myaddr)
2451 msgr->inst.addr = *myaddr; 2138 msgr->inst.addr = *myaddr;
2452 2139
@@ -2454,23 +2141,32 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
2454 msgr->inst.addr.type = 0; 2141 msgr->inst.addr.type = 0;
2455 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); 2142 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
2456 encode_my_addr(msgr); 2143 encode_my_addr(msgr);
2457 msgr->nocrc = nocrc;
2458 2144
2459 atomic_set(&msgr->stopping, 0); 2145 dout("messenger_create %p\n", msgr);
2146 return msgr;
2147}
2148EXPORT_SYMBOL(ceph_messenger_create);
2460 2149
2461 dout("%s %p\n", __func__, msgr); 2150void ceph_messenger_destroy(struct ceph_messenger *msgr)
2151{
2152 dout("destroy %p\n", msgr);
2153 kunmap(msgr->zero_page);
2154 __free_page(msgr->zero_page);
2155 kfree(msgr);
2156 dout("destroyed messenger %p\n", msgr);
2462} 2157}
2463EXPORT_SYMBOL(ceph_messenger_init); 2158EXPORT_SYMBOL(ceph_messenger_destroy);
2464 2159
2465static void clear_standby(struct ceph_connection *con) 2160static void clear_standby(struct ceph_connection *con)
2466{ 2161{
2467 /* come back from STANDBY? */ 2162 /* come back from STANDBY? */
2468 if (con->state == CON_STATE_STANDBY) { 2163 if (test_and_clear_bit(STANDBY, &con->state)) {
2164 mutex_lock(&con->mutex);
2469 dout("clear_standby %p and ++connect_seq\n", con); 2165 dout("clear_standby %p and ++connect_seq\n", con);
2470 con->state = CON_STATE_PREOPEN;
2471 con->connect_seq++; 2166 con->connect_seq++;
2472 WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags)); 2167 WARN_ON(test_bit(WRITE_PENDING, &con->state));
2473 WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)); 2168 WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state));
2169 mutex_unlock(&con->mutex);
2474 } 2170 }
2475} 2171}
2476 2172
@@ -2479,24 +2175,21 @@ static void clear_standby(struct ceph_connection *con)
2479 */ 2175 */
2480void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) 2176void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
2481{ 2177{
2482 /* set src+dst */ 2178 if (test_bit(CLOSED, &con->state)) {
2483 msg->hdr.src = con->msgr->inst.name;
2484 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
2485 msg->needs_out_seq = true;
2486
2487 mutex_lock(&con->mutex);
2488
2489 if (con->state == CON_STATE_CLOSED) {
2490 dout("con_send %p closed, dropping %p\n", con, msg); 2179 dout("con_send %p closed, dropping %p\n", con, msg);
2491 ceph_msg_put(msg); 2180 ceph_msg_put(msg);
2492 mutex_unlock(&con->mutex);
2493 return; 2181 return;
2494 } 2182 }
2495 2183
2496 BUG_ON(msg->con != NULL); 2184 /* set src+dst */
2497 msg->con = con->ops->get(con); 2185 msg->hdr.src = con->msgr->inst.name;
2498 BUG_ON(msg->con == NULL); 2186
2187 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
2499 2188
2189 msg->needs_out_seq = true;
2190
2191 /* queue */
2192 mutex_lock(&con->mutex);
2500 BUG_ON(!list_empty(&msg->list_head)); 2193 BUG_ON(!list_empty(&msg->list_head));
2501 list_add_tail(&msg->list_head, &con->out_queue); 2194 list_add_tail(&msg->list_head, &con->out_queue);
2502 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, 2195 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
@@ -2505,13 +2198,12 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
2505 le32_to_cpu(msg->hdr.front_len), 2198 le32_to_cpu(msg->hdr.front_len),
2506 le32_to_cpu(msg->hdr.middle_len), 2199 le32_to_cpu(msg->hdr.middle_len),
2507 le32_to_cpu(msg->hdr.data_len)); 2200 le32_to_cpu(msg->hdr.data_len));
2508
2509 clear_standby(con);
2510 mutex_unlock(&con->mutex); 2201 mutex_unlock(&con->mutex);
2511 2202
2512 /* if there wasn't anything waiting to send before, queue 2203 /* if there wasn't anything waiting to send before, queue
2513 * new work */ 2204 * new work */
2514 if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0) 2205 clear_standby(con);
2206 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2515 queue_con(con); 2207 queue_con(con);
2516} 2208}
2517EXPORT_SYMBOL(ceph_con_send); 2209EXPORT_SYMBOL(ceph_con_send);
@@ -2519,34 +2211,24 @@ EXPORT_SYMBOL(ceph_con_send);
2519/* 2211/*
2520 * Revoke a message that was previously queued for send 2212 * Revoke a message that was previously queued for send
2521 */ 2213 */
2522void ceph_msg_revoke(struct ceph_msg *msg) 2214void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2523{ 2215{
2524 struct ceph_connection *con = msg->con;
2525
2526 if (!con)
2527 return; /* Message not in our possession */
2528
2529 mutex_lock(&con->mutex); 2216 mutex_lock(&con->mutex);
2530 if (!list_empty(&msg->list_head)) { 2217 if (!list_empty(&msg->list_head)) {
2531 dout("%s %p msg %p - was on queue\n", __func__, con, msg); 2218 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2532 list_del_init(&msg->list_head); 2219 list_del_init(&msg->list_head);
2533 BUG_ON(msg->con == NULL);
2534 msg->con->ops->put(msg->con);
2535 msg->con = NULL;
2536 msg->hdr.seq = 0;
2537
2538 ceph_msg_put(msg); 2220 ceph_msg_put(msg);
2221 msg->hdr.seq = 0;
2539 } 2222 }
2540 if (con->out_msg == msg) { 2223 if (con->out_msg == msg) {
2541 dout("%s %p msg %p - was sending\n", __func__, con, msg); 2224 dout("con_revoke %p msg %p - was sending\n", con, msg);
2542 con->out_msg = NULL; 2225 con->out_msg = NULL;
2543 if (con->out_kvec_is_msg) { 2226 if (con->out_kvec_is_msg) {
2544 con->out_skip = con->out_kvec_bytes; 2227 con->out_skip = con->out_kvec_bytes;
2545 con->out_kvec_is_msg = false; 2228 con->out_kvec_is_msg = false;
2546 } 2229 }
2547 msg->hdr.seq = 0;
2548
2549 ceph_msg_put(msg); 2230 ceph_msg_put(msg);
2231 msg->hdr.seq = 0;
2550 } 2232 }
2551 mutex_unlock(&con->mutex); 2233 mutex_unlock(&con->mutex);
2552} 2234}
@@ -2554,27 +2236,17 @@ void ceph_msg_revoke(struct ceph_msg *msg)
2554/* 2236/*
2555 * Revoke a message that we may be reading data into 2237 * Revoke a message that we may be reading data into
2556 */ 2238 */
2557void ceph_msg_revoke_incoming(struct ceph_msg *msg) 2239void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2558{ 2240{
2559 struct ceph_connection *con;
2560
2561 BUG_ON(msg == NULL);
2562 if (!msg->con) {
2563 dout("%s msg %p null con\n", __func__, msg);
2564
2565 return; /* Message not in our possession */
2566 }
2567
2568 con = msg->con;
2569 mutex_lock(&con->mutex); 2241 mutex_lock(&con->mutex);
2570 if (con->in_msg == msg) { 2242 if (con->in_msg && con->in_msg == msg) {
2571 unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); 2243 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2572 unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); 2244 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2573 unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); 2245 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2574 2246
2575 /* skip rest of message */ 2247 /* skip rest of message */
2576 dout("%s %p msg %p revoked\n", __func__, con, msg); 2248 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2577 con->in_base_pos = con->in_base_pos - 2249 con->in_base_pos = con->in_base_pos -
2578 sizeof(struct ceph_msg_header) - 2250 sizeof(struct ceph_msg_header) -
2579 front_len - 2251 front_len -
2580 middle_len - 2252 middle_len -
@@ -2585,8 +2257,8 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg)
2585 con->in_tag = CEPH_MSGR_TAG_READY; 2257 con->in_tag = CEPH_MSGR_TAG_READY;
2586 con->in_seq++; 2258 con->in_seq++;
2587 } else { 2259 } else {
2588 dout("%s %p in_msg %p msg %p no-op\n", 2260 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2589 __func__, con, con->in_msg, msg); 2261 con, con->in_msg, msg);
2590 } 2262 }
2591 mutex_unlock(&con->mutex); 2263 mutex_unlock(&con->mutex);
2592} 2264}
@@ -2597,11 +2269,9 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg)
2597void ceph_con_keepalive(struct ceph_connection *con) 2269void ceph_con_keepalive(struct ceph_connection *con)
2598{ 2270{
2599 dout("con_keepalive %p\n", con); 2271 dout("con_keepalive %p\n", con);
2600 mutex_lock(&con->mutex);
2601 clear_standby(con); 2272 clear_standby(con);
2602 mutex_unlock(&con->mutex); 2273 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2603 if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 && 2274 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2604 test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
2605 queue_con(con); 2275 queue_con(con);
2606} 2276}
2607EXPORT_SYMBOL(ceph_con_keepalive); 2277EXPORT_SYMBOL(ceph_con_keepalive);
@@ -2611,8 +2281,7 @@ EXPORT_SYMBOL(ceph_con_keepalive);
2611 * construct a new message with given type, size 2281 * construct a new message with given type, size
2612 * the new msg has a ref count of 1. 2282 * the new msg has a ref count of 1.
2613 */ 2283 */
2614struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, 2284struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2615 bool can_fail)
2616{ 2285{
2617 struct ceph_msg *m; 2286 struct ceph_msg *m;
2618 2287
@@ -2620,8 +2289,6 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
2620 if (m == NULL) 2289 if (m == NULL)
2621 goto out; 2290 goto out;
2622 kref_init(&m->kref); 2291 kref_init(&m->kref);
2623
2624 m->con = NULL;
2625 INIT_LIST_HEAD(&m->list_head); 2292 INIT_LIST_HEAD(&m->list_head);
2626 2293
2627 m->hdr.tid = 0; 2294 m->hdr.tid = 0;
@@ -2666,7 +2333,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
2666 m->front.iov_base = kmalloc(front_len, flags); 2333 m->front.iov_base = kmalloc(front_len, flags);
2667 } 2334 }
2668 if (m->front.iov_base == NULL) { 2335 if (m->front.iov_base == NULL) {
2669 dout("ceph_msg_new can't allocate %d bytes\n", 2336 pr_err("msg_new can't allocate %d bytes\n",
2670 front_len); 2337 front_len);
2671 goto out2; 2338 goto out2;
2672 } 2339 }
@@ -2681,14 +2348,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
2681out2: 2348out2:
2682 ceph_msg_put(m); 2349 ceph_msg_put(m);
2683out: 2350out:
2684 if (!can_fail) { 2351 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2685 pr_err("msg_new can't create type %d front %d\n", type,
2686 front_len);
2687 WARN_ON(1);
2688 } else {
2689 dout("msg_new can't create type %d front %d\n", type,
2690 front_len);
2691 }
2692 return NULL; 2352 return NULL;
2693} 2353}
2694EXPORT_SYMBOL(ceph_msg_new); 2354EXPORT_SYMBOL(ceph_msg_new);
@@ -2717,78 +2377,46 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2717} 2377}
2718 2378
2719/* 2379/*
2720 * Allocate a message for receiving an incoming message on a 2380 * Generic message allocator, for incoming messages.
2721 * connection, and save the result in con->in_msg. Uses the
2722 * connection's private alloc_msg op if available.
2723 *
2724 * Returns 0 on success, or a negative error code.
2725 *
2726 * On success, if we set *skip = 1:
2727 * - the next message should be skipped and ignored.
2728 * - con->in_msg == NULL
2729 * or if we set *skip = 0:
2730 * - con->in_msg is non-null.
2731 * On error (ENOMEM, EAGAIN, ...),
2732 * - con->in_msg == NULL
2733 */ 2381 */
2734static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) 2382static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2383 struct ceph_msg_header *hdr,
2384 int *skip)
2735{ 2385{
2736 struct ceph_msg_header *hdr = &con->in_hdr;
2737 int type = le16_to_cpu(hdr->type); 2386 int type = le16_to_cpu(hdr->type);
2738 int front_len = le32_to_cpu(hdr->front_len); 2387 int front_len = le32_to_cpu(hdr->front_len);
2739 int middle_len = le32_to_cpu(hdr->middle_len); 2388 int middle_len = le32_to_cpu(hdr->middle_len);
2740 int ret = 0; 2389 struct ceph_msg *msg = NULL;
2741 2390 int ret;
2742 BUG_ON(con->in_msg != NULL);
2743 2391
2744 if (con->ops->alloc_msg) { 2392 if (con->ops->alloc_msg) {
2745 struct ceph_msg *msg;
2746
2747 mutex_unlock(&con->mutex); 2393 mutex_unlock(&con->mutex);
2748 msg = con->ops->alloc_msg(con, hdr, skip); 2394 msg = con->ops->alloc_msg(con, hdr, skip);
2749 mutex_lock(&con->mutex); 2395 mutex_lock(&con->mutex);
2750 if (con->state != CON_STATE_OPEN) { 2396 if (!msg || *skip)
2751 if (msg) 2397 return NULL;
2752 ceph_msg_put(msg);
2753 return -EAGAIN;
2754 }
2755 con->in_msg = msg;
2756 if (con->in_msg) {
2757 con->in_msg->con = con->ops->get(con);
2758 BUG_ON(con->in_msg->con == NULL);
2759 }
2760 if (*skip) {
2761 con->in_msg = NULL;
2762 return 0;
2763 }
2764 if (!con->in_msg) {
2765 con->error_msg =
2766 "error allocating memory for incoming message";
2767 return -ENOMEM;
2768 }
2769 } 2398 }
2770 if (!con->in_msg) { 2399 if (!msg) {
2771 con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false); 2400 *skip = 0;
2772 if (!con->in_msg) { 2401 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2402 if (!msg) {
2773 pr_err("unable to allocate msg type %d len %d\n", 2403 pr_err("unable to allocate msg type %d len %d\n",
2774 type, front_len); 2404 type, front_len);
2775 return -ENOMEM; 2405 return NULL;
2776 } 2406 }
2777 con->in_msg->con = con->ops->get(con); 2407 msg->page_alignment = le16_to_cpu(hdr->data_off);
2778 BUG_ON(con->in_msg->con == NULL);
2779 con->in_msg->page_alignment = le16_to_cpu(hdr->data_off);
2780 } 2408 }
2781 memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2409 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2782 2410
2783 if (middle_len && !con->in_msg->middle) { 2411 if (middle_len && !msg->middle) {
2784 ret = ceph_alloc_middle(con, con->in_msg); 2412 ret = ceph_alloc_middle(con, msg);
2785 if (ret < 0) { 2413 if (ret < 0) {
2786 ceph_msg_put(con->in_msg); 2414 ceph_msg_put(msg);
2787 con->in_msg = NULL; 2415 return NULL;
2788 } 2416 }
2789 } 2417 }
2790 2418
2791 return ret; 2419 return msg;
2792} 2420}
2793 2421
2794 2422
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 812eb3b46c1..cbe31fa4550 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -8,8 +8,8 @@
8 8
9#include <linux/ceph/mon_client.h> 9#include <linux/ceph/mon_client.h>
10#include <linux/ceph/libceph.h> 10#include <linux/ceph/libceph.h>
11#include <linux/ceph/debugfs.h>
12#include <linux/ceph/decode.h> 11#include <linux/ceph/decode.h>
12
13#include <linux/ceph/auth.h> 13#include <linux/ceph/auth.h>
14 14
15/* 15/*
@@ -106,9 +106,9 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
106 monc->pending_auth = 1; 106 monc->pending_auth = 1;
107 monc->m_auth->front.iov_len = len; 107 monc->m_auth->front.iov_len = len;
108 monc->m_auth->hdr.front_len = cpu_to_le32(len); 108 monc->m_auth->hdr.front_len = cpu_to_le32(len);
109 ceph_msg_revoke(monc->m_auth); 109 ceph_con_revoke(monc->con, monc->m_auth);
110 ceph_msg_get(monc->m_auth); /* keep our ref */ 110 ceph_msg_get(monc->m_auth); /* keep our ref */
111 ceph_con_send(&monc->con, monc->m_auth); 111 ceph_con_send(monc->con, monc->m_auth);
112} 112}
113 113
114/* 114/*
@@ -116,15 +116,14 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
116 */ 116 */
117static void __close_session(struct ceph_mon_client *monc) 117static void __close_session(struct ceph_mon_client *monc)
118{ 118{
119 dout("__close_session closing mon%d\n", monc->cur_mon); 119 if (monc->con) {
120 ceph_msg_revoke(monc->m_auth); 120 dout("__close_session closing mon%d\n", monc->cur_mon);
121 ceph_msg_revoke_incoming(monc->m_auth_reply); 121 ceph_con_revoke(monc->con, monc->m_auth);
122 ceph_msg_revoke(monc->m_subscribe); 122 ceph_con_close(monc->con);
123 ceph_msg_revoke_incoming(monc->m_subscribe_ack); 123 monc->cur_mon = -1;
124 ceph_con_close(&monc->con); 124 monc->pending_auth = 0;
125 monc->cur_mon = -1; 125 ceph_auth_reset(monc->auth);
126 monc->pending_auth = 0; 126 }
127 ceph_auth_reset(monc->auth);
128} 127}
129 128
130/* 129/*
@@ -145,8 +144,9 @@ static int __open_session(struct ceph_mon_client *monc)
145 monc->want_next_osdmap = !!monc->want_next_osdmap; 144 monc->want_next_osdmap = !!monc->want_next_osdmap;
146 145
147 dout("open_session mon%d opening\n", monc->cur_mon); 146 dout("open_session mon%d opening\n", monc->cur_mon);
148 ceph_con_open(&monc->con, 147 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
149 CEPH_ENTITY_TYPE_MON, monc->cur_mon, 148 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
149 ceph_con_open(monc->con,
150 &monc->monmap->mon_inst[monc->cur_mon].addr); 150 &monc->monmap->mon_inst[monc->cur_mon].addr);
151 151
152 /* initiatiate authentication handshake */ 152 /* initiatiate authentication handshake */
@@ -170,7 +170,7 @@ static bool __sub_expired(struct ceph_mon_client *monc)
170 */ 170 */
171static void __schedule_delayed(struct ceph_mon_client *monc) 171static void __schedule_delayed(struct ceph_mon_client *monc)
172{ 172{
173 unsigned int delay; 173 unsigned delay;
174 174
175 if (monc->cur_mon < 0 || __sub_expired(monc)) 175 if (monc->cur_mon < 0 || __sub_expired(monc))
176 delay = 10 * HZ; 176 delay = 10 * HZ;
@@ -186,7 +186,7 @@ static void __schedule_delayed(struct ceph_mon_client *monc)
186static void __send_subscribe(struct ceph_mon_client *monc) 186static void __send_subscribe(struct ceph_mon_client *monc)
187{ 187{
188 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", 188 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
189 (unsigned int)monc->sub_sent, __sub_expired(monc), 189 (unsigned)monc->sub_sent, __sub_expired(monc),
190 monc->want_next_osdmap); 190 monc->want_next_osdmap);
191 if ((__sub_expired(monc) && !monc->sub_sent) || 191 if ((__sub_expired(monc) && !monc->sub_sent) ||
192 monc->want_next_osdmap == 1) { 192 monc->want_next_osdmap == 1) {
@@ -203,7 +203,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
203 203
204 if (monc->want_next_osdmap) { 204 if (monc->want_next_osdmap) {
205 dout("__send_subscribe to 'osdmap' %u\n", 205 dout("__send_subscribe to 'osdmap' %u\n",
206 (unsigned int)monc->have_osdmap); 206 (unsigned)monc->have_osdmap);
207 ceph_encode_string(&p, end, "osdmap", 6); 207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p; 208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap); 209 i->have = cpu_to_le64(monc->have_osdmap);
@@ -213,7 +213,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
213 } 213 }
214 if (monc->want_mdsmap) { 214 if (monc->want_mdsmap) {
215 dout("__send_subscribe to 'mdsmap' %u+\n", 215 dout("__send_subscribe to 'mdsmap' %u+\n",
216 (unsigned int)monc->have_mdsmap); 216 (unsigned)monc->have_mdsmap);
217 ceph_encode_string(&p, end, "mdsmap", 6); 217 ceph_encode_string(&p, end, "mdsmap", 6);
218 i = p; 218 i = p;
219 i->have = cpu_to_le64(monc->have_mdsmap); 219 i->have = cpu_to_le64(monc->have_mdsmap);
@@ -228,8 +228,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
228 228
229 msg->front.iov_len = p - msg->front.iov_base; 229 msg->front.iov_len = p - msg->front.iov_base;
230 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 230 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
231 ceph_msg_revoke(msg); 231 ceph_con_revoke(monc->con, msg);
232 ceph_con_send(&monc->con, ceph_msg_get(msg)); 232 ceph_con_send(monc->con, ceph_msg_get(msg));
233 233
234 monc->sub_sent = jiffies | 1; /* never 0 */ 234 monc->sub_sent = jiffies | 1; /* never 0 */
235 } 235 }
@@ -238,7 +238,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
238static void handle_subscribe_ack(struct ceph_mon_client *monc, 238static void handle_subscribe_ack(struct ceph_mon_client *monc,
239 struct ceph_msg *msg) 239 struct ceph_msg *msg)
240{ 240{
241 unsigned int seconds; 241 unsigned seconds;
242 struct ceph_mon_subscribe_ack *h = msg->front.iov_base; 242 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
243 243
244 if (msg->front.iov_len < sizeof(*h)) 244 if (msg->front.iov_len < sizeof(*h))
@@ -249,7 +249,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
249 if (monc->hunting) { 249 if (monc->hunting) {
250 pr_info("mon%d %s session established\n", 250 pr_info("mon%d %s session established\n",
251 monc->cur_mon, 251 monc->cur_mon,
252 ceph_pr_addr(&monc->con.peer_addr.in_addr)); 252 ceph_pr_addr(&monc->con->peer_addr.in_addr));
253 monc->hunting = false; 253 monc->hunting = false;
254 } 254 }
255 dout("handle_subscribe_ack after %d seconds\n", seconds); 255 dout("handle_subscribe_ack after %d seconds\n", seconds);
@@ -302,6 +302,15 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
302 */ 302 */
303int ceph_monc_open_session(struct ceph_mon_client *monc) 303int ceph_monc_open_session(struct ceph_mon_client *monc)
304{ 304{
305 if (!monc->con) {
306 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
307 if (!monc->con)
308 return -ENOMEM;
309 ceph_con_init(monc->client->msgr, monc->con);
310 monc->con->private = monc;
311 monc->con->ops = &mon_con_ops;
312 }
313
305 mutex_lock(&monc->mutex); 314 mutex_lock(&monc->mutex);
306 __open_session(monc); 315 __open_session(monc);
307 __schedule_delayed(monc); 316 __schedule_delayed(monc);
@@ -311,17 +320,6 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
311EXPORT_SYMBOL(ceph_monc_open_session); 320EXPORT_SYMBOL(ceph_monc_open_session);
312 321
313/* 322/*
314 * We require the fsid and global_id in order to initialize our
315 * debugfs dir.
316 */
317static bool have_debugfs_info(struct ceph_mon_client *monc)
318{
319 dout("have_debugfs_info fsid %d globalid %lld\n",
320 (int)monc->client->have_fsid, monc->auth->global_id);
321 return monc->client->have_fsid && monc->auth->global_id > 0;
322}
323
324/*
325 * The monitor responds with mount ack indicate mount success. The 323 * The monitor responds with mount ack indicate mount success. The
326 * included client ticket allows the client to talk to MDSs and OSDs. 324 * included client ticket allows the client to talk to MDSs and OSDs.
327 */ 325 */
@@ -331,12 +329,9 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
331 struct ceph_client *client = monc->client; 329 struct ceph_client *client = monc->client;
332 struct ceph_monmap *monmap = NULL, *old = monc->monmap; 330 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
333 void *p, *end; 331 void *p, *end;
334 int had_debugfs_info, init_debugfs = 0;
335 332
336 mutex_lock(&monc->mutex); 333 mutex_lock(&monc->mutex);
337 334
338 had_debugfs_info = have_debugfs_info(monc);
339
340 dout("handle_monmap\n"); 335 dout("handle_monmap\n");
341 p = msg->front.iov_base; 336 p = msg->front.iov_base;
342 end = p + msg->front.iov_len; 337 end = p + msg->front.iov_len;
@@ -356,29 +351,8 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
356 client->monc.monmap = monmap; 351 client->monc.monmap = monmap;
357 kfree(old); 352 kfree(old);
358 353
359 if (!client->have_fsid) {
360 client->have_fsid = true;
361 if (!had_debugfs_info && have_debugfs_info(monc)) {
362 pr_info("client%lld fsid %pU\n",
363 ceph_client_id(monc->client),
364 &monc->client->fsid);
365 init_debugfs = 1;
366 }
367 mutex_unlock(&monc->mutex);
368
369 if (init_debugfs) {
370 /*
371 * do debugfs initialization without mutex to avoid
372 * creating a locking dependency
373 */
374 ceph_debugfs_client_init(monc->client);
375 }
376
377 goto out_unlocked;
378 }
379out: 354out:
380 mutex_unlock(&monc->mutex); 355 mutex_unlock(&monc->mutex);
381out_unlocked:
382 wake_up_all(&client->auth_wq); 356 wake_up_all(&client->auth_wq);
383} 357}
384 358
@@ -465,7 +439,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
465 m = NULL; 439 m = NULL;
466 } else { 440 } else {
467 dout("get_generic_reply %lld got %p\n", tid, req->reply); 441 dout("get_generic_reply %lld got %p\n", tid, req->reply);
468 *skip = 0;
469 m = ceph_msg_get(req->reply); 442 m = ceph_msg_get(req->reply);
470 /* 443 /*
471 * we don't need to track the connection reading into 444 * we don't need to track the connection reading into
@@ -488,7 +461,7 @@ static int do_generic_request(struct ceph_mon_client *monc,
488 req->request->hdr.tid = cpu_to_le64(req->tid); 461 req->request->hdr.tid = cpu_to_le64(req->tid);
489 __insert_generic_request(monc, req); 462 __insert_generic_request(monc, req);
490 monc->num_generic_requests++; 463 monc->num_generic_requests++;
491 ceph_con_send(&monc->con, ceph_msg_get(req->request)); 464 ceph_con_send(monc->con, ceph_msg_get(req->request));
492 mutex_unlock(&monc->mutex); 465 mutex_unlock(&monc->mutex);
493 466
494 err = wait_for_completion_interruptible(&req->completion); 467 err = wait_for_completion_interruptible(&req->completion);
@@ -555,12 +528,10 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
555 init_completion(&req->completion); 528 init_completion(&req->completion);
556 529
557 err = -ENOMEM; 530 err = -ENOMEM;
558 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, 531 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
559 true);
560 if (!req->request) 532 if (!req->request)
561 goto out; 533 goto out;
562 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, 534 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
563 true);
564 if (!req->reply) 535 if (!req->reply)
565 goto out; 536 goto out;
566 537
@@ -637,7 +608,7 @@ bad:
637/* 608/*
638 * Do a synchronous pool op. 609 * Do a synchronous pool op.
639 */ 610 */
640static int do_poolop(struct ceph_mon_client *monc, u32 op, 611int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
641 u32 pool, u64 snapid, 612 u32 pool, u64 snapid,
642 char *buf, int len) 613 char *buf, int len)
643{ 614{
@@ -655,12 +626,10 @@ static int do_poolop(struct ceph_mon_client *monc, u32 op,
655 init_completion(&req->completion); 626 init_completion(&req->completion);
656 627
657 err = -ENOMEM; 628 err = -ENOMEM;
658 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, 629 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
659 true);
660 if (!req->request) 630 if (!req->request)
661 goto out; 631 goto out;
662 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, 632 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
663 true);
664 if (!req->reply) 633 if (!req->reply)
665 goto out; 634 goto out;
666 635
@@ -687,7 +656,7 @@ out:
687int ceph_monc_create_snapid(struct ceph_mon_client *monc, 656int ceph_monc_create_snapid(struct ceph_mon_client *monc,
688 u32 pool, u64 *snapid) 657 u32 pool, u64 *snapid)
689{ 658{
690 return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 659 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
691 pool, 0, (char *)snapid, sizeof(*snapid)); 660 pool, 0, (char *)snapid, sizeof(*snapid));
692 661
693} 662}
@@ -696,7 +665,7 @@ EXPORT_SYMBOL(ceph_monc_create_snapid);
696int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 665int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
697 u32 pool, u64 snapid) 666 u32 pool, u64 snapid)
698{ 667{
699 return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 668 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
700 pool, snapid, 0, 0); 669 pool, snapid, 0, 0);
701 670
702} 671}
@@ -711,9 +680,8 @@ static void __resend_generic_request(struct ceph_mon_client *monc)
711 680
712 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { 681 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
713 req = rb_entry(p, struct ceph_mon_generic_request, node); 682 req = rb_entry(p, struct ceph_mon_generic_request, node);
714 ceph_msg_revoke(req->request); 683 ceph_con_revoke(monc->con, req->request);
715 ceph_msg_revoke_incoming(req->reply); 684 ceph_con_send(monc->con, ceph_msg_get(req->request));
716 ceph_con_send(&monc->con, ceph_msg_get(req->request));
717 } 685 }
718} 686}
719 687
@@ -733,7 +701,7 @@ static void delayed_work(struct work_struct *work)
733 __close_session(monc); 701 __close_session(monc);
734 __open_session(monc); /* continue hunting */ 702 __open_session(monc); /* continue hunting */
735 } else { 703 } else {
736 ceph_con_keepalive(&monc->con); 704 ceph_con_keepalive(monc->con);
737 705
738 __validate_auth(monc); 706 __validate_auth(monc);
739 707
@@ -769,6 +737,7 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
769 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); 737 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
770 } 738 }
771 monc->monmap->num_mon = num_mon; 739 monc->monmap->num_mon = num_mon;
740 monc->have_fsid = false;
772 return 0; 741 return 0;
773} 742}
774 743
@@ -786,14 +755,13 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
786 if (err) 755 if (err)
787 goto out; 756 goto out;
788 757
789 /* connection */ 758 monc->con = NULL;
759
790 /* authentication */ 760 /* authentication */
791 monc->auth = ceph_auth_init(cl->options->name, 761 monc->auth = ceph_auth_init(cl->options->name,
792 cl->options->key); 762 cl->options->key);
793 if (IS_ERR(monc->auth)) { 763 if (IS_ERR(monc->auth))
794 err = PTR_ERR(monc->auth); 764 return PTR_ERR(monc->auth);
795 goto out_monmap;
796 }
797 monc->auth->want_keys = 765 monc->auth->want_keys =
798 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 766 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
799 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; 767 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
@@ -802,28 +770,23 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
802 err = -ENOMEM; 770 err = -ENOMEM;
803 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, 771 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
804 sizeof(struct ceph_mon_subscribe_ack), 772 sizeof(struct ceph_mon_subscribe_ack),
805 GFP_NOFS, true); 773 GFP_NOFS);
806 if (!monc->m_subscribe_ack) 774 if (!monc->m_subscribe_ack)
807 goto out_auth; 775 goto out_monmap;
808 776
809 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, 777 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
810 true);
811 if (!monc->m_subscribe) 778 if (!monc->m_subscribe)
812 goto out_subscribe_ack; 779 goto out_subscribe_ack;
813 780
814 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS, 781 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
815 true);
816 if (!monc->m_auth_reply) 782 if (!monc->m_auth_reply)
817 goto out_subscribe; 783 goto out_subscribe;
818 784
819 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true); 785 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
820 monc->pending_auth = 0; 786 monc->pending_auth = 0;
821 if (!monc->m_auth) 787 if (!monc->m_auth)
822 goto out_auth_reply; 788 goto out_auth_reply;
823 789
824 ceph_con_init(&monc->con, monc, &mon_con_ops,
825 &monc->client->msgr);
826
827 monc->cur_mon = -1; 790 monc->cur_mon = -1;
828 monc->hunting = true; 791 monc->hunting = true;
829 monc->sub_renew_after = jiffies; 792 monc->sub_renew_after = jiffies;
@@ -845,8 +808,6 @@ out_subscribe:
845 ceph_msg_put(monc->m_subscribe); 808 ceph_msg_put(monc->m_subscribe);
846out_subscribe_ack: 809out_subscribe_ack:
847 ceph_msg_put(monc->m_subscribe_ack); 810 ceph_msg_put(monc->m_subscribe_ack);
848out_auth:
849 ceph_auth_destroy(monc->auth);
850out_monmap: 811out_monmap:
851 kfree(monc->monmap); 812 kfree(monc->monmap);
852out: 813out:
@@ -861,17 +822,13 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
861 822
862 mutex_lock(&monc->mutex); 823 mutex_lock(&monc->mutex);
863 __close_session(monc); 824 __close_session(monc);
864 825 if (monc->con) {
826 monc->con->private = NULL;
827 monc->con->ops->put(monc->con);
828 monc->con = NULL;
829 }
865 mutex_unlock(&monc->mutex); 830 mutex_unlock(&monc->mutex);
866 831
867 /*
868 * flush msgr queue before we destroy ourselves to ensure that:
869 * - any work that references our embedded con is finished.
870 * - any osd_client or other work that may reference an authorizer
871 * finishes before we shut down the auth subsystem.
872 */
873 ceph_msgr_flush();
874
875 ceph_auth_destroy(monc->auth); 832 ceph_auth_destroy(monc->auth);
876 833
877 ceph_msg_put(monc->m_auth); 834 ceph_msg_put(monc->m_auth);
@@ -888,10 +845,8 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
888{ 845{
889 int ret; 846 int ret;
890 int was_auth = 0; 847 int was_auth = 0;
891 int had_debugfs_info, init_debugfs = 0;
892 848
893 mutex_lock(&monc->mutex); 849 mutex_lock(&monc->mutex);
894 had_debugfs_info = have_debugfs_info(monc);
895 if (monc->auth->ops) 850 if (monc->auth->ops)
896 was_auth = monc->auth->ops->is_authenticated(monc->auth); 851 was_auth = monc->auth->ops->is_authenticated(monc->auth);
897 monc->pending_auth = 0; 852 monc->pending_auth = 0;
@@ -907,29 +862,14 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
907 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { 862 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
908 dout("authenticated, starting session\n"); 863 dout("authenticated, starting session\n");
909 864
910 monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 865 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
911 monc->client->msgr.inst.name.num = 866 monc->client->msgr->inst.name.num =
912 cpu_to_le64(monc->auth->global_id); 867 cpu_to_le64(monc->auth->global_id);
913 868
914 __send_subscribe(monc); 869 __send_subscribe(monc);
915 __resend_generic_request(monc); 870 __resend_generic_request(monc);
916 } 871 }
917
918 if (!had_debugfs_info && have_debugfs_info(monc)) {
919 pr_info("client%lld fsid %pU\n",
920 ceph_client_id(monc->client),
921 &monc->client->fsid);
922 init_debugfs = 1;
923 }
924 mutex_unlock(&monc->mutex); 872 mutex_unlock(&monc->mutex);
925
926 if (init_debugfs) {
927 /*
928 * do debugfs initialization without mutex to avoid
929 * creating a locking dependency
930 */
931 ceph_debugfs_client_init(monc->client);
932 }
933} 873}
934 874
935static int __validate_auth(struct ceph_mon_client *monc) 875static int __validate_auth(struct ceph_mon_client *monc)
@@ -1033,9 +973,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
1033 case CEPH_MSG_MON_MAP: 973 case CEPH_MSG_MON_MAP:
1034 case CEPH_MSG_MDS_MAP: 974 case CEPH_MSG_MDS_MAP:
1035 case CEPH_MSG_OSD_MAP: 975 case CEPH_MSG_OSD_MAP:
1036 m = ceph_msg_new(type, front_len, GFP_NOFS, false); 976 m = ceph_msg_new(type, front_len, GFP_NOFS);
1037 if (!m)
1038 return NULL; /* ENOMEM--return skip == 0 */
1039 break; 977 break;
1040 } 978 }
1041 979
@@ -1062,10 +1000,10 @@ static void mon_fault(struct ceph_connection *con)
1062 if (!con->private) 1000 if (!con->private)
1063 goto out; 1001 goto out;
1064 1002
1065 if (!monc->hunting) 1003 if (monc->con && !monc->hunting)
1066 pr_info("mon%d %s session lost, " 1004 pr_info("mon%d %s session lost, "
1067 "hunting for new mon\n", monc->cur_mon, 1005 "hunting for new mon\n", monc->cur_mon,
1068 ceph_pr_addr(&monc->con.peer_addr.in_addr)); 1006 ceph_pr_addr(&monc->con->peer_addr.in_addr));
1069 1007
1070 __close_session(monc); 1008 __close_session(monc);
1071 if (!monc->hunting) { 1009 if (!monc->hunting) {
@@ -1080,23 +1018,9 @@ out:
1080 mutex_unlock(&monc->mutex); 1018 mutex_unlock(&monc->mutex);
1081} 1019}
1082 1020
1083/*
1084 * We can ignore refcounting on the connection struct, as all references
1085 * will come from the messenger workqueue, which is drained prior to
1086 * mon_client destruction.
1087 */
1088static struct ceph_connection *con_get(struct ceph_connection *con)
1089{
1090 return con;
1091}
1092
1093static void con_put(struct ceph_connection *con)
1094{
1095}
1096
1097static const struct ceph_connection_operations mon_con_ops = { 1021static const struct ceph_connection_operations mon_con_ops = {
1098 .get = con_get, 1022 .get = ceph_con_get,
1099 .put = con_put, 1023 .put = ceph_con_put,
1100 .dispatch = dispatch, 1024 .dispatch = dispatch,
1101 .fault = mon_fault, 1025 .fault = mon_fault,
1102 .alloc_msg = mon_alloc_msg, 1026 .alloc_msg = mon_alloc_msg,
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
index ddec1c10ac8..1f4cb30a42c 100644
--- a/net/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
12 struct ceph_msgpool *pool = arg; 12 struct ceph_msgpool *pool = arg;
13 struct ceph_msg *msg; 13 struct ceph_msg *msg;
14 14
15 msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true); 15 msg = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!msg) { 16 if (!msg) {
17 dout("msgpool_alloc %s failed\n", pool->name); 17 dout("msgpool_alloc %s failed\n", pool->name);
18 } else { 18 } else {
@@ -32,11 +32,10 @@ static void msgpool_free(void *element, void *arg)
32 ceph_msg_put(msg); 32 ceph_msg_put(msg);
33} 33}
34 34
35int ceph_msgpool_init(struct ceph_msgpool *pool, int type, 35int ceph_msgpool_init(struct ceph_msgpool *pool,
36 int front_len, int size, bool blocking, const char *name) 36 int front_len, int size, bool blocking, const char *name)
37{ 37{
38 dout("msgpool %s init\n", name); 38 dout("msgpool %s init\n", name);
39 pool->type = type;
40 pool->front_len = front_len; 39 pool->front_len = front_len;
41 pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); 40 pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool);
42 if (!pool->pool) 41 if (!pool->pool)
@@ -62,7 +61,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
62 WARN_ON(1); 61 WARN_ON(1);
63 62
64 /* try to alloc a fresh message */ 63 /* try to alloc a fresh message */
65 return ceph_msg_new(pool->type, front_len, GFP_NOFS, false); 64 return ceph_msg_new(0, front_len, GFP_NOFS);
66 } 65 }
67 66
68 msg = mempool_alloc(pool->pool, GFP_NOFS); 67 msg = mempool_alloc(pool->pool, GFP_NOFS);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index eb9a4447876..88ad8a2501b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -29,8 +29,8 @@ static void __register_request(struct ceph_osd_client *osdc,
29 struct ceph_osd_request *req); 29 struct ceph_osd_request *req);
30static void __unregister_linger_request(struct ceph_osd_client *osdc, 30static void __unregister_linger_request(struct ceph_osd_client *osdc,
31 struct ceph_osd_request *req); 31 struct ceph_osd_request *req);
32static void __send_request(struct ceph_osd_client *osdc, 32static int __send_request(struct ceph_osd_client *osdc,
33 struct ceph_osd_request *req); 33 struct ceph_osd_request *req);
34 34
35static int op_needs_trail(int op) 35static int op_needs_trail(int op)
36{ 36{
@@ -52,7 +52,7 @@ static int op_has_extent(int op)
52 op == CEPH_OSD_OP_WRITE); 52 op == CEPH_OSD_OP_WRITE);
53} 53}
54 54
55int ceph_calc_raw_layout(struct ceph_osd_client *osdc, 55void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
56 struct ceph_file_layout *layout, 56 struct ceph_file_layout *layout,
57 u64 snapid, 57 u64 snapid,
58 u64 off, u64 *plen, u64 *bno, 58 u64 off, u64 *plen, u64 *bno,
@@ -62,15 +62,12 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
62 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; 62 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
63 u64 orig_len = *plen; 63 u64 orig_len = *plen;
64 u64 objoff, objlen; /* extent in object */ 64 u64 objoff, objlen; /* extent in object */
65 int r;
66 65
67 reqhead->snapid = cpu_to_le64(snapid); 66 reqhead->snapid = cpu_to_le64(snapid);
68 67
69 /* object extent? */ 68 /* object extent? */
70 r = ceph_calc_file_object_mapping(layout, off, plen, bno, 69 ceph_calc_file_object_mapping(layout, off, plen, bno,
71 &objoff, &objlen); 70 &objoff, &objlen);
72 if (r < 0)
73 return r;
74 if (*plen < orig_len) 71 if (*plen < orig_len)
75 dout(" skipping last %llu, final file extent %llu~%llu\n", 72 dout(" skipping last %llu, final file extent %llu~%llu\n",
76 orig_len - *plen, off, *plen); 73 orig_len - *plen, off, *plen);
@@ -86,7 +83,7 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
86 83
87 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", 84 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
88 *bno, objoff, objlen, req->r_num_pages); 85 *bno, objoff, objlen, req->r_num_pages);
89 return 0; 86
90} 87}
91EXPORT_SYMBOL(ceph_calc_raw_layout); 88EXPORT_SYMBOL(ceph_calc_raw_layout);
92 89
@@ -115,25 +112,20 @@ EXPORT_SYMBOL(ceph_calc_raw_layout);
115 * 112 *
116 * fill osd op in request message. 113 * fill osd op in request message.
117 */ 114 */
118static int calc_layout(struct ceph_osd_client *osdc, 115static void calc_layout(struct ceph_osd_client *osdc,
119 struct ceph_vino vino, 116 struct ceph_vino vino,
120 struct ceph_file_layout *layout, 117 struct ceph_file_layout *layout,
121 u64 off, u64 *plen, 118 u64 off, u64 *plen,
122 struct ceph_osd_request *req, 119 struct ceph_osd_request *req,
123 struct ceph_osd_req_op *op) 120 struct ceph_osd_req_op *op)
124{ 121{
125 u64 bno; 122 u64 bno;
126 int r;
127 123
128 r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, 124 ceph_calc_raw_layout(osdc, layout, vino.snap, off,
129 plen, &bno, req, op); 125 plen, &bno, req, op);
130 if (r < 0)
131 return r;
132 126
133 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); 127 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno);
134 req->r_oid_len = strlen(req->r_oid); 128 req->r_oid_len = strlen(req->r_oid);
135
136 return r;
137} 129}
138 130
139/* 131/*
@@ -147,14 +139,15 @@ void ceph_osdc_release_request(struct kref *kref)
147 139
148 if (req->r_request) 140 if (req->r_request)
149 ceph_msg_put(req->r_request); 141 ceph_msg_put(req->r_request);
142 if (req->r_reply)
143 ceph_msg_put(req->r_reply);
150 if (req->r_con_filling_msg) { 144 if (req->r_con_filling_msg) {
151 dout("%s revoking pages %p from con %p\n", __func__, 145 dout("release_request revoking pages %p from con %p\n",
152 req->r_pages, req->r_con_filling_msg); 146 req->r_pages, req->r_con_filling_msg);
153 ceph_msg_revoke_incoming(req->r_reply); 147 ceph_con_revoke_message(req->r_con_filling_msg,
154 req->r_con_filling_msg->ops->put(req->r_con_filling_msg); 148 req->r_reply);
149 ceph_con_put(req->r_con_filling_msg);
155 } 150 }
156 if (req->r_reply)
157 ceph_msg_put(req->r_reply);
158 if (req->r_own_pages) 151 if (req->r_own_pages)
159 ceph_release_page_vector(req->r_pages, 152 ceph_release_page_vector(req->r_pages,
160 req->r_num_pages); 153 req->r_num_pages);
@@ -221,13 +214,10 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
221 kref_init(&req->r_kref); 214 kref_init(&req->r_kref);
222 init_completion(&req->r_completion); 215 init_completion(&req->r_completion);
223 init_completion(&req->r_safe_completion); 216 init_completion(&req->r_safe_completion);
224 RB_CLEAR_NODE(&req->r_node);
225 INIT_LIST_HEAD(&req->r_unsafe_item); 217 INIT_LIST_HEAD(&req->r_unsafe_item);
226 INIT_LIST_HEAD(&req->r_linger_item); 218 INIT_LIST_HEAD(&req->r_linger_item);
227 INIT_LIST_HEAD(&req->r_linger_osd); 219 INIT_LIST_HEAD(&req->r_linger_osd);
228 INIT_LIST_HEAD(&req->r_req_lru_item); 220 INIT_LIST_HEAD(&req->r_req_lru_item);
229 INIT_LIST_HEAD(&req->r_osd_item);
230
231 req->r_flags = flags; 221 req->r_flags = flags;
232 222
233 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); 223 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
@@ -237,7 +227,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
237 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 227 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
238 else 228 else
239 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 229 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
240 OSD_OPREPLY_FRONT_LEN, gfp_flags, true); 230 OSD_OPREPLY_FRONT_LEN, gfp_flags);
241 if (!msg) { 231 if (!msg) {
242 ceph_osdc_put_request(req); 232 ceph_osdc_put_request(req);
243 return NULL; 233 return NULL;
@@ -253,20 +243,20 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
253 } 243 }
254 ceph_pagelist_init(req->r_trail); 244 ceph_pagelist_init(req->r_trail);
255 } 245 }
256
257 /* create request message; allow space for oid */ 246 /* create request message; allow space for oid */
258 msg_size += MAX_OBJ_NAME_SIZE; 247 msg_size += 40;
259 if (snapc) 248 if (snapc)
260 msg_size += sizeof(u64) * snapc->num_snaps; 249 msg_size += sizeof(u64) * snapc->num_snaps;
261 if (use_mempool) 250 if (use_mempool)
262 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 251 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
263 else 252 else
264 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); 253 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
265 if (!msg) { 254 if (!msg) {
266 ceph_osdc_put_request(req); 255 ceph_osdc_put_request(req);
267 return NULL; 256 return NULL;
268 } 257 }
269 258
259 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
270 memset(msg->front.iov_base, 0, msg->front.iov_len); 260 memset(msg->front.iov_base, 0, msg->front.iov_len);
271 261
272 req->r_request = msg; 262 req->r_request = msg;
@@ -288,7 +278,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
288{ 278{
289 dst->op = cpu_to_le16(src->op); 279 dst->op = cpu_to_le16(src->op);
290 280
291 switch (src->op) { 281 switch (dst->op) {
292 case CEPH_OSD_OP_READ: 282 case CEPH_OSD_OP_READ:
293 case CEPH_OSD_OP_WRITE: 283 case CEPH_OSD_OP_WRITE:
294 dst->extent.offset = 284 dst->extent.offset =
@@ -464,7 +454,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
464{ 454{
465 struct ceph_osd_req_op ops[3]; 455 struct ceph_osd_req_op ops[3];
466 struct ceph_osd_request *req; 456 struct ceph_osd_request *req;
467 int r;
468 457
469 ops[0].op = opcode; 458 ops[0].op = opcode;
470 ops[0].extent.truncate_seq = truncate_seq; 459 ops[0].extent.truncate_seq = truncate_seq;
@@ -483,12 +472,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
483 use_mempool, 472 use_mempool,
484 GFP_NOFS, NULL, NULL); 473 GFP_NOFS, NULL, NULL);
485 if (!req) 474 if (!req)
486 return ERR_PTR(-ENOMEM); 475 return NULL;
487 476
488 /* calculate max write size */ 477 /* calculate max write size */
489 r = calc_layout(osdc, vino, layout, off, plen, req, ops); 478 calc_layout(osdc, vino, layout, off, plen, req, ops);
490 if (r < 0)
491 return ERR_PTR(r);
492 req->r_file_layout = *layout; /* keep a copy */ 479 req->r_file_layout = *layout; /* keep a copy */
493 480
494 /* in case it differs from natural (file) alignment that 481 /* in case it differs from natural (file) alignment that
@@ -581,7 +568,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
581 568
582 dout("__kick_osd_requests osd%d\n", osd->o_osd); 569 dout("__kick_osd_requests osd%d\n", osd->o_osd);
583 err = __reset_osd(osdc, osd); 570 err = __reset_osd(osdc, osd);
584 if (err) 571 if (err == -EAGAIN)
585 return; 572 return;
586 573
587 list_for_each_entry(req, &osd->o_requests, r_osd_item) { 574 list_for_each_entry(req, &osd->o_requests, r_osd_item) {
@@ -608,6 +595,14 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
608 } 595 }
609} 596}
610 597
598static void kick_osd_requests(struct ceph_osd_client *osdc,
599 struct ceph_osd *kickosd)
600{
601 mutex_lock(&osdc->request_mutex);
602 __kick_osd_requests(osdc, kickosd);
603 mutex_unlock(&osdc->request_mutex);
604}
605
611/* 606/*
612 * If the osd connection drops, we need to resubmit all requests. 607 * If the osd connection drops, we need to resubmit all requests.
613 */ 608 */
@@ -621,9 +616,7 @@ static void osd_reset(struct ceph_connection *con)
621 dout("osd_reset osd%d\n", osd->o_osd); 616 dout("osd_reset osd%d\n", osd->o_osd);
622 osdc = osd->o_osdc; 617 osdc = osd->o_osdc;
623 down_read(&osdc->map_sem); 618 down_read(&osdc->map_sem);
624 mutex_lock(&osdc->request_mutex); 619 kick_osd_requests(osdc, osd);
625 __kick_osd_requests(osdc, osd);
626 mutex_unlock(&osdc->request_mutex);
627 send_queued(osdc); 620 send_queued(osdc);
628 up_read(&osdc->map_sem); 621 up_read(&osdc->map_sem);
629} 622}
@@ -631,7 +624,7 @@ static void osd_reset(struct ceph_connection *con)
631/* 624/*
632 * Track open sessions with osds. 625 * Track open sessions with osds.
633 */ 626 */
634static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) 627static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
635{ 628{
636 struct ceph_osd *osd; 629 struct ceph_osd *osd;
637 630
@@ -641,14 +634,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
641 634
642 atomic_set(&osd->o_ref, 1); 635 atomic_set(&osd->o_ref, 1);
643 osd->o_osdc = osdc; 636 osd->o_osdc = osdc;
644 osd->o_osd = onum;
645 RB_CLEAR_NODE(&osd->o_node);
646 INIT_LIST_HEAD(&osd->o_requests); 637 INIT_LIST_HEAD(&osd->o_requests);
647 INIT_LIST_HEAD(&osd->o_linger_requests); 638 INIT_LIST_HEAD(&osd->o_linger_requests);
648 INIT_LIST_HEAD(&osd->o_osd_lru); 639 INIT_LIST_HEAD(&osd->o_osd_lru);
649 osd->o_incarnation = 1; 640 osd->o_incarnation = 1;
650 641
651 ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); 642 ceph_con_init(osdc->client->msgr, &osd->o_con);
643 osd->o_con.private = osd;
644 osd->o_con.ops = &osd_con_ops;
645 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
652 646
653 INIT_LIST_HEAD(&osd->o_keepalive_item); 647 INIT_LIST_HEAD(&osd->o_keepalive_item);
654 return osd; 648 return osd;
@@ -670,11 +664,11 @@ static void put_osd(struct ceph_osd *osd)
670{ 664{
671 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 665 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
672 atomic_read(&osd->o_ref) - 1); 666 atomic_read(&osd->o_ref) - 1);
673 if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { 667 if (atomic_dec_and_test(&osd->o_ref)) {
674 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; 668 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
675 669
676 if (ac->ops && ac->ops->destroy_authorizer) 670 if (osd->o_authorizer)
677 ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); 671 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
678 kfree(osd); 672 kfree(osd);
679 } 673 }
680} 674}
@@ -694,7 +688,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
694 688
695static void remove_all_osds(struct ceph_osd_client *osdc) 689static void remove_all_osds(struct ceph_osd_client *osdc)
696{ 690{
697 dout("%s %p\n", __func__, osdc); 691 dout("__remove_old_osds %p\n", osdc);
698 mutex_lock(&osdc->request_mutex); 692 mutex_lock(&osdc->request_mutex);
699 while (!RB_EMPTY_ROOT(&osdc->osds)) { 693 while (!RB_EMPTY_ROOT(&osdc->osds)) {
700 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), 694 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
@@ -746,7 +740,6 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
746 if (list_empty(&osd->o_requests) && 740 if (list_empty(&osd->o_requests) &&
747 list_empty(&osd->o_linger_requests)) { 741 list_empty(&osd->o_linger_requests)) {
748 __remove_osd(osdc, osd); 742 __remove_osd(osdc, osd);
749 ret = -ENODEV;
750 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], 743 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
751 &osd->o_con.peer_addr, 744 &osd->o_con.peer_addr,
752 sizeof(osd->o_con.peer_addr)) == 0 && 745 sizeof(osd->o_con.peer_addr)) == 0 &&
@@ -759,8 +752,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
759 ret = -EAGAIN; 752 ret = -EAGAIN;
760 } else { 753 } else {
761 ceph_con_close(&osd->o_con); 754 ceph_con_close(&osd->o_con);
762 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, 755 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
763 &osdc->osdmap->osd_addr[osd->o_osd]);
764 osd->o_incarnation++; 756 osd->o_incarnation++;
765 } 757 }
766 return ret; 758 return ret;
@@ -849,19 +841,13 @@ static void register_request(struct ceph_osd_client *osdc,
849static void __unregister_request(struct ceph_osd_client *osdc, 841static void __unregister_request(struct ceph_osd_client *osdc,
850 struct ceph_osd_request *req) 842 struct ceph_osd_request *req)
851{ 843{
852 if (RB_EMPTY_NODE(&req->r_node)) {
853 dout("__unregister_request %p tid %lld not registered\n",
854 req, req->r_tid);
855 return;
856 }
857
858 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 844 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
859 rb_erase(&req->r_node, &osdc->requests); 845 rb_erase(&req->r_node, &osdc->requests);
860 osdc->num_requests--; 846 osdc->num_requests--;
861 847
862 if (req->r_osd) { 848 if (req->r_osd) {
863 /* make sure the original request isn't in flight. */ 849 /* make sure the original request isn't in flight. */
864 ceph_msg_revoke(req->r_request); 850 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
865 851
866 list_del_init(&req->r_osd_item); 852 list_del_init(&req->r_osd_item);
867 if (list_empty(&req->r_osd->o_requests) && 853 if (list_empty(&req->r_osd->o_requests) &&
@@ -873,9 +859,9 @@ static void __unregister_request(struct ceph_osd_client *osdc,
873 req->r_osd = NULL; 859 req->r_osd = NULL;
874 } 860 }
875 861
876 list_del_init(&req->r_req_lru_item);
877 ceph_osdc_put_request(req); 862 ceph_osdc_put_request(req);
878 863
864 list_del_init(&req->r_req_lru_item);
879 if (osdc->num_requests == 0) { 865 if (osdc->num_requests == 0) {
880 dout(" no requests, canceling timeout\n"); 866 dout(" no requests, canceling timeout\n");
881 __cancel_osd_timeout(osdc); 867 __cancel_osd_timeout(osdc);
@@ -888,7 +874,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
888static void __cancel_request(struct ceph_osd_request *req) 874static void __cancel_request(struct ceph_osd_request *req)
889{ 875{
890 if (req->r_sent && req->r_osd) { 876 if (req->r_sent && req->r_osd) {
891 ceph_msg_revoke(req->r_request); 877 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
892 req->r_sent = 0; 878 req->r_sent = 0;
893 } 879 }
894} 880}
@@ -898,17 +884,15 @@ static void __register_linger_request(struct ceph_osd_client *osdc,
898{ 884{
899 dout("__register_linger_request %p\n", req); 885 dout("__register_linger_request %p\n", req);
900 list_add_tail(&req->r_linger_item, &osdc->req_linger); 886 list_add_tail(&req->r_linger_item, &osdc->req_linger);
901 if (req->r_osd) 887 list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests);
902 list_add_tail(&req->r_linger_osd,
903 &req->r_osd->o_linger_requests);
904} 888}
905 889
906static void __unregister_linger_request(struct ceph_osd_client *osdc, 890static void __unregister_linger_request(struct ceph_osd_client *osdc,
907 struct ceph_osd_request *req) 891 struct ceph_osd_request *req)
908{ 892{
909 dout("__unregister_linger_request %p\n", req); 893 dout("__unregister_linger_request %p\n", req);
910 list_del_init(&req->r_linger_item);
911 if (req->r_osd) { 894 if (req->r_osd) {
895 list_del_init(&req->r_linger_item);
912 list_del_init(&req->r_linger_osd); 896 list_del_init(&req->r_linger_osd);
913 897
914 if (list_empty(&req->r_osd->o_requests) && 898 if (list_empty(&req->r_osd->o_requests) &&
@@ -959,7 +943,7 @@ EXPORT_SYMBOL(ceph_osdc_set_request_linger);
959 * Caller should hold map_sem for read and request_mutex. 943 * Caller should hold map_sem for read and request_mutex.
960 */ 944 */
961static int __map_request(struct ceph_osd_client *osdc, 945static int __map_request(struct ceph_osd_client *osdc,
962 struct ceph_osd_request *req, int force_resend) 946 struct ceph_osd_request *req)
963{ 947{
964 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; 948 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
965 struct ceph_pg pgid; 949 struct ceph_pg pgid;
@@ -983,8 +967,7 @@ static int __map_request(struct ceph_osd_client *osdc,
983 num = err; 967 num = err;
984 } 968 }
985 969
986 if ((!force_resend && 970 if ((req->r_osd && req->r_osd->o_osd == o &&
987 req->r_osd && req->r_osd->o_osd == o &&
988 req->r_sent >= req->r_osd->o_incarnation && 971 req->r_sent >= req->r_osd->o_incarnation &&
989 req->r_num_pg_osds == num && 972 req->r_num_pg_osds == num &&
990 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || 973 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
@@ -1008,18 +991,18 @@ static int __map_request(struct ceph_osd_client *osdc,
1008 req->r_osd = __lookup_osd(osdc, o); 991 req->r_osd = __lookup_osd(osdc, o);
1009 if (!req->r_osd && o >= 0) { 992 if (!req->r_osd && o >= 0) {
1010 err = -ENOMEM; 993 err = -ENOMEM;
1011 req->r_osd = create_osd(osdc, o); 994 req->r_osd = create_osd(osdc);
1012 if (!req->r_osd) { 995 if (!req->r_osd) {
1013 list_move(&req->r_req_lru_item, &osdc->req_notarget); 996 list_move(&req->r_req_lru_item, &osdc->req_notarget);
1014 goto out; 997 goto out;
1015 } 998 }
1016 999
1017 dout("map_request osd %p is osd%d\n", req->r_osd, o); 1000 dout("map_request osd %p is osd%d\n", req->r_osd, o);
1001 req->r_osd->o_osd = o;
1002 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
1018 __insert_osd(osdc, req->r_osd); 1003 __insert_osd(osdc, req->r_osd);
1019 1004
1020 ceph_con_open(&req->r_osd->o_con, 1005 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
1021 CEPH_ENTITY_TYPE_OSD, o,
1022 &osdc->osdmap->osd_addr[o]);
1023 } 1006 }
1024 1007
1025 if (req->r_osd) { 1008 if (req->r_osd) {
@@ -1038,8 +1021,8 @@ out:
1038/* 1021/*
1039 * caller should hold map_sem (for read) and request_mutex 1022 * caller should hold map_sem (for read) and request_mutex
1040 */ 1023 */
1041static void __send_request(struct ceph_osd_client *osdc, 1024static int __send_request(struct ceph_osd_client *osdc,
1042 struct ceph_osd_request *req) 1025 struct ceph_osd_request *req)
1043{ 1026{
1044 struct ceph_osd_request_head *reqhead; 1027 struct ceph_osd_request_head *reqhead;
1045 1028
@@ -1057,6 +1040,7 @@ static void __send_request(struct ceph_osd_client *osdc,
1057 ceph_msg_get(req->r_request); /* send consumes a ref */ 1040 ceph_msg_get(req->r_request); /* send consumes a ref */
1058 ceph_con_send(&req->r_osd->o_con, req->r_request); 1041 ceph_con_send(&req->r_osd->o_con, req->r_request);
1059 req->r_sent = req->r_osd->o_incarnation; 1042 req->r_sent = req->r_osd->o_incarnation;
1043 return 0;
1060} 1044}
1061 1045
1062/* 1046/*
@@ -1087,10 +1071,12 @@ static void handle_timeout(struct work_struct *work)
1087{ 1071{
1088 struct ceph_osd_client *osdc = 1072 struct ceph_osd_client *osdc =
1089 container_of(work, struct ceph_osd_client, timeout_work.work); 1073 container_of(work, struct ceph_osd_client, timeout_work.work);
1090 struct ceph_osd_request *req; 1074 struct ceph_osd_request *req, *last_req = NULL;
1091 struct ceph_osd *osd; 1075 struct ceph_osd *osd;
1076 unsigned long timeout = osdc->client->options->osd_timeout * HZ;
1092 unsigned long keepalive = 1077 unsigned long keepalive =
1093 osdc->client->options->osd_keepalive_timeout * HZ; 1078 osdc->client->options->osd_keepalive_timeout * HZ;
1079 unsigned long last_stamp = 0;
1094 struct list_head slow_osds; 1080 struct list_head slow_osds;
1095 dout("timeout\n"); 1081 dout("timeout\n");
1096 down_read(&osdc->map_sem); 1082 down_read(&osdc->map_sem);
@@ -1100,6 +1086,37 @@ static void handle_timeout(struct work_struct *work)
1100 mutex_lock(&osdc->request_mutex); 1086 mutex_lock(&osdc->request_mutex);
1101 1087
1102 /* 1088 /*
1089 * reset osds that appear to be _really_ unresponsive. this
1090 * is a failsafe measure.. we really shouldn't be getting to
1091 * this point if the system is working properly. the monitors
1092 * should mark the osd as failed and we should find out about
1093 * it from an updated osd map.
1094 */
1095 while (timeout && !list_empty(&osdc->req_lru)) {
1096 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
1097 r_req_lru_item);
1098
1099 /* hasn't been long enough since we sent it? */
1100 if (time_before(jiffies, req->r_stamp + timeout))
1101 break;
1102
1103 /* hasn't been long enough since it was acked? */
1104 if (req->r_request->ack_stamp == 0 ||
1105 time_before(jiffies, req->r_request->ack_stamp + timeout))
1106 break;
1107
1108 BUG_ON(req == last_req && req->r_stamp == last_stamp);
1109 last_req = req;
1110 last_stamp = req->r_stamp;
1111
1112 osd = req->r_osd;
1113 BUG_ON(!osd);
1114 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
1115 req->r_tid, osd->o_osd);
1116 __kick_osd_requests(osdc, osd);
1117 }
1118
1119 /*
1103 * ping osds that are a bit slow. this ensures that if there 1120 * ping osds that are a bit slow. this ensures that if there
1104 * is a break in the TCP connection we will notice, and reopen 1121 * is a break in the TCP connection we will notice, and reopen
1105 * a connection with that osd (from the fault callback). 1122 * a connection with that osd (from the fault callback).
@@ -1193,11 +1210,11 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1193 if (req->r_con_filling_msg == con && req->r_reply == msg) { 1210 if (req->r_con_filling_msg == con && req->r_reply == msg) {
1194 dout(" dropping con_filling_msg ref %p\n", con); 1211 dout(" dropping con_filling_msg ref %p\n", con);
1195 req->r_con_filling_msg = NULL; 1212 req->r_con_filling_msg = NULL;
1196 con->ops->put(con); 1213 ceph_con_put(con);
1197 } 1214 }
1198 1215
1199 if (!req->r_got_reply) { 1216 if (!req->r_got_reply) {
1200 unsigned int bytes; 1217 unsigned bytes;
1201 1218
1202 req->r_result = le32_to_cpu(rhead->result); 1219 req->r_result = le32_to_cpu(rhead->result);
1203 bytes = le32_to_cpu(msg->hdr.data_len); 1220 bytes = le32_to_cpu(msg->hdr.data_len);
@@ -1270,51 +1287,30 @@ static void reset_changed_osds(struct ceph_osd_client *osdc)
1270 * Requeue requests whose mapping to an OSD has changed. If requests map to 1287 * Requeue requests whose mapping to an OSD has changed. If requests map to
1271 * no osd, request a new map. 1288 * no osd, request a new map.
1272 * 1289 *
1273 * Caller should hold map_sem for read. 1290 * Caller should hold map_sem for read and request_mutex.
1274 */ 1291 */
1275static void kick_requests(struct ceph_osd_client *osdc, int force_resend) 1292static void kick_requests(struct ceph_osd_client *osdc)
1276{ 1293{
1277 struct ceph_osd_request *req, *nreq; 1294 struct ceph_osd_request *req, *nreq;
1278 struct rb_node *p; 1295 struct rb_node *p;
1279 int needmap = 0; 1296 int needmap = 0;
1280 int err; 1297 int err;
1281 1298
1282 dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); 1299 dout("kick_requests\n");
1283 mutex_lock(&osdc->request_mutex); 1300 mutex_lock(&osdc->request_mutex);
1284 for (p = rb_first(&osdc->requests); p; ) { 1301 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
1285 req = rb_entry(p, struct ceph_osd_request, r_node); 1302 req = rb_entry(p, struct ceph_osd_request, r_node);
1286 p = rb_next(p); 1303 err = __map_request(osdc, req);
1287
1288 /*
1289 * For linger requests that have not yet been
1290 * registered, move them to the linger list; they'll
1291 * be sent to the osd in the loop below. Unregister
1292 * the request before re-registering it as a linger
1293 * request to ensure the __map_request() below
1294 * will decide it needs to be sent.
1295 */
1296 if (req->r_linger && list_empty(&req->r_linger_item)) {
1297 dout("%p tid %llu restart on osd%d\n",
1298 req, req->r_tid,
1299 req->r_osd ? req->r_osd->o_osd : -1);
1300 __unregister_request(osdc, req);
1301 __register_linger_request(osdc, req);
1302 continue;
1303 }
1304
1305 err = __map_request(osdc, req, force_resend);
1306 if (err < 0) 1304 if (err < 0)
1307 continue; /* error */ 1305 continue; /* error */
1308 if (req->r_osd == NULL) { 1306 if (req->r_osd == NULL) {
1309 dout("%p tid %llu maps to no osd\n", req, req->r_tid); 1307 dout("%p tid %llu maps to no osd\n", req, req->r_tid);
1310 needmap++; /* request a newer map */ 1308 needmap++; /* request a newer map */
1311 } else if (err > 0) { 1309 } else if (err > 0) {
1312 if (!req->r_linger) { 1310 dout("%p tid %llu requeued on osd%d\n", req, req->r_tid,
1313 dout("%p tid %llu requeued on osd%d\n", req, 1311 req->r_osd ? req->r_osd->o_osd : -1);
1314 req->r_tid, 1312 if (!req->r_linger)
1315 req->r_osd ? req->r_osd->o_osd : -1);
1316 req->r_flags |= CEPH_OSD_FLAG_RETRY; 1313 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1317 }
1318 } 1314 }
1319 } 1315 }
1320 1316
@@ -1322,8 +1318,7 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
1322 r_linger_item) { 1318 r_linger_item) {
1323 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); 1319 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
1324 1320
1325 err = __map_request(osdc, req, force_resend); 1321 err = __map_request(osdc, req);
1326 dout("__map_request returned %d\n", err);
1327 if (err == 0) 1322 if (err == 0)
1328 continue; /* no change and no osd was specified */ 1323 continue; /* no change and no osd was specified */
1329 if (err < 0) 1324 if (err < 0)
@@ -1336,8 +1331,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
1336 1331
1337 dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, 1332 dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,
1338 req->r_osd ? req->r_osd->o_osd : -1); 1333 req->r_osd ? req->r_osd->o_osd : -1);
1339 __register_request(osdc, req);
1340 __unregister_linger_request(osdc, req); 1334 __unregister_linger_request(osdc, req);
1335 __register_request(osdc, req);
1341 } 1336 }
1342 mutex_unlock(&osdc->request_mutex); 1337 mutex_unlock(&osdc->request_mutex);
1343 1338
@@ -1345,7 +1340,6 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
1345 dout("%d requests for down osds, need new map\n", needmap); 1340 dout("%d requests for down osds, need new map\n", needmap);
1346 ceph_monc_request_next_osdmap(&osdc->client->monc); 1341 ceph_monc_request_next_osdmap(&osdc->client->monc);
1347 } 1342 }
1348 reset_changed_osds(osdc);
1349} 1343}
1350 1344
1351 1345
@@ -1391,7 +1385,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1391 epoch, maplen); 1385 epoch, maplen);
1392 newmap = osdmap_apply_incremental(&p, next, 1386 newmap = osdmap_apply_incremental(&p, next,
1393 osdc->osdmap, 1387 osdc->osdmap,
1394 &osdc->client->msgr); 1388 osdc->client->msgr);
1395 if (IS_ERR(newmap)) { 1389 if (IS_ERR(newmap)) {
1396 err = PTR_ERR(newmap); 1390 err = PTR_ERR(newmap);
1397 goto bad; 1391 goto bad;
@@ -1401,7 +1395,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1401 ceph_osdmap_destroy(osdc->osdmap); 1395 ceph_osdmap_destroy(osdc->osdmap);
1402 osdc->osdmap = newmap; 1396 osdc->osdmap = newmap;
1403 } 1397 }
1404 kick_requests(osdc, 0); 1398 kick_requests(osdc);
1399 reset_changed_osds(osdc);
1405 } else { 1400 } else {
1406 dout("ignoring incremental map %u len %d\n", 1401 dout("ignoring incremental map %u len %d\n",
1407 epoch, maplen); 1402 epoch, maplen);
@@ -1428,8 +1423,6 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1428 "older than our %u\n", epoch, maplen, 1423 "older than our %u\n", epoch, maplen,
1429 osdc->osdmap->epoch); 1424 osdc->osdmap->epoch);
1430 } else { 1425 } else {
1431 int skipped_map = 0;
1432
1433 dout("taking full map %u len %d\n", epoch, maplen); 1426 dout("taking full map %u len %d\n", epoch, maplen);
1434 newmap = osdmap_decode(&p, p+maplen); 1427 newmap = osdmap_decode(&p, p+maplen);
1435 if (IS_ERR(newmap)) { 1428 if (IS_ERR(newmap)) {
@@ -1439,12 +1432,9 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1439 BUG_ON(!newmap); 1432 BUG_ON(!newmap);
1440 oldmap = osdc->osdmap; 1433 oldmap = osdc->osdmap;
1441 osdc->osdmap = newmap; 1434 osdc->osdmap = newmap;
1442 if (oldmap) { 1435 if (oldmap)
1443 if (oldmap->epoch + 1 < newmap->epoch)
1444 skipped_map = 1;
1445 ceph_osdmap_destroy(oldmap); 1436 ceph_osdmap_destroy(oldmap);
1446 } 1437 kick_requests(osdc);
1447 kick_requests(osdc, skipped_map);
1448 } 1438 }
1449 p += maplen; 1439 p += maplen;
1450 nr_maps--; 1440 nr_maps--;
@@ -1571,7 +1561,6 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc,
1571 event->data = data; 1561 event->data = data;
1572 event->osdc = osdc; 1562 event->osdc = osdc;
1573 INIT_LIST_HEAD(&event->osd_node); 1563 INIT_LIST_HEAD(&event->osd_node);
1574 RB_CLEAR_NODE(&event->node);
1575 kref_init(&event->kref); /* one ref for us */ 1564 kref_init(&event->kref); /* one ref for us */
1576 kref_get(&event->kref); /* one ref for the caller */ 1565 kref_get(&event->kref); /* one ref for the caller */
1577 init_completion(&event->completion); 1566 init_completion(&event->completion);
@@ -1718,7 +1707,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1718 * the request still han't been touched yet. 1707 * the request still han't been touched yet.
1719 */ 1708 */
1720 if (req->r_sent == 0) { 1709 if (req->r_sent == 0) {
1721 rc = __map_request(osdc, req, 0); 1710 rc = __map_request(osdc, req);
1722 if (rc < 0) { 1711 if (rc < 0) {
1723 if (nofail) { 1712 if (nofail) {
1724 dout("osdc_start_request failed map, " 1713 dout("osdc_start_request failed map, "
@@ -1731,9 +1720,17 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1731 dout("send_request %p no up osds in pg\n", req); 1720 dout("send_request %p no up osds in pg\n", req);
1732 ceph_monc_request_next_osdmap(&osdc->client->monc); 1721 ceph_monc_request_next_osdmap(&osdc->client->monc);
1733 } else { 1722 } else {
1734 __send_request(osdc, req); 1723 rc = __send_request(osdc, req);
1724 if (rc) {
1725 if (nofail) {
1726 dout("osdc_start_request failed send, "
1727 " will retry %lld\n", req->r_tid);
1728 rc = 0;
1729 } else {
1730 __unregister_request(osdc, req);
1731 }
1732 }
1735 } 1733 }
1736 rc = 0;
1737 } 1734 }
1738 1735
1739out_unlock: 1736out_unlock:
@@ -1839,12 +1836,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1839 if (!osdc->req_mempool) 1836 if (!osdc->req_mempool)
1840 goto out; 1837 goto out;
1841 1838
1842 err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, 1839 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1843 OSD_OP_FRONT_LEN, 10, true,
1844 "osd_op"); 1840 "osd_op");
1845 if (err < 0) 1841 if (err < 0)
1846 goto out_mempool; 1842 goto out_mempool;
1847 err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, 1843 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1848 OSD_OPREPLY_FRONT_LEN, 10, true, 1844 OSD_OPREPLY_FRONT_LEN, 10, true,
1849 "osd_op_reply"); 1845 "osd_op_reply");
1850 if (err < 0) 1846 if (err < 0)
@@ -1903,8 +1899,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1903 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1899 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1904 NULL, 0, truncate_seq, truncate_size, NULL, 1900 NULL, 0, truncate_seq, truncate_size, NULL,
1905 false, 1, page_align); 1901 false, 1, page_align);
1906 if (IS_ERR(req)) 1902 if (!req)
1907 return PTR_ERR(req); 1903 return -ENOMEM;
1908 1904
1909 /* it may be a short read due to an object boundary */ 1905 /* it may be a short read due to an object boundary */
1910 req->r_pages = pages; 1906 req->r_pages = pages;
@@ -1946,8 +1942,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1946 snapc, do_sync, 1942 snapc, do_sync,
1947 truncate_seq, truncate_size, mtime, 1943 truncate_seq, truncate_size, mtime,
1948 nofail, 1, page_align); 1944 nofail, 1, page_align);
1949 if (IS_ERR(req)) 1945 if (!req)
1950 return PTR_ERR(req); 1946 return -ENOMEM;
1951 1947
1952 /* it may be a short write due to an object boundary */ 1948 /* it may be a short write due to an object boundary */
1953 req->r_pages = pages; 1949 req->r_pages = pages;
@@ -2020,23 +2016,23 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2020 if (!req) { 2016 if (!req) {
2021 *skip = 1; 2017 *skip = 1;
2022 m = NULL; 2018 m = NULL;
2023 dout("get_reply unknown tid %llu from osd%d\n", tid, 2019 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
2024 osd->o_osd); 2020 osd->o_osd);
2025 goto out; 2021 goto out;
2026 } 2022 }
2027 2023
2028 if (req->r_con_filling_msg) { 2024 if (req->r_con_filling_msg) {
2029 dout("%s revoking msg %p from old con %p\n", __func__, 2025 dout("get_reply revoking msg %p from old con %p\n",
2030 req->r_reply, req->r_con_filling_msg); 2026 req->r_reply, req->r_con_filling_msg);
2031 ceph_msg_revoke_incoming(req->r_reply); 2027 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
2032 req->r_con_filling_msg->ops->put(req->r_con_filling_msg); 2028 ceph_con_put(req->r_con_filling_msg);
2033 req->r_con_filling_msg = NULL; 2029 req->r_con_filling_msg = NULL;
2034 } 2030 }
2035 2031
2036 if (front > req->r_reply->front.iov_len) { 2032 if (front > req->r_reply->front.iov_len) {
2037 pr_warning("get_reply front %d > preallocated %d\n", 2033 pr_warning("get_reply front %d > preallocated %d\n",
2038 front, (int)req->r_reply->front.iov_len); 2034 front, (int)req->r_reply->front.iov_len);
2039 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); 2035 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
2040 if (!m) 2036 if (!m)
2041 goto out; 2037 goto out;
2042 ceph_msg_put(req->r_reply); 2038 ceph_msg_put(req->r_reply);
@@ -2064,7 +2060,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2064#endif 2060#endif
2065 } 2061 }
2066 *skip = 0; 2062 *skip = 0;
2067 req->r_con_filling_msg = con->ops->get(con); 2063 req->r_con_filling_msg = ceph_con_get(con);
2068 dout("get_reply tid %lld %p\n", tid, m); 2064 dout("get_reply tid %lld %p\n", tid, m);
2069 2065
2070out: 2066out:
@@ -2081,11 +2077,10 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
2081 int type = le16_to_cpu(hdr->type); 2077 int type = le16_to_cpu(hdr->type);
2082 int front = le32_to_cpu(hdr->front_len); 2078 int front = le32_to_cpu(hdr->front_len);
2083 2079
2084 *skip = 0;
2085 switch (type) { 2080 switch (type) {
2086 case CEPH_MSG_OSD_MAP: 2081 case CEPH_MSG_OSD_MAP:
2087 case CEPH_MSG_WATCH_NOTIFY: 2082 case CEPH_MSG_WATCH_NOTIFY:
2088 return ceph_msg_new(type, front, GFP_NOFS, false); 2083 return ceph_msg_new(type, front, GFP_NOFS);
2089 case CEPH_MSG_OSD_OPREPLY: 2084 case CEPH_MSG_OSD_OPREPLY:
2090 return get_reply(con, hdr, skip); 2085 return get_reply(con, hdr, skip);
2091 default: 2086 default:
@@ -2116,32 +2111,37 @@ static void put_osd_con(struct ceph_connection *con)
2116/* 2111/*
2117 * authentication 2112 * authentication
2118 */ 2113 */
2119/* 2114static int get_authorizer(struct ceph_connection *con,
2120 * Note: returned pointer is the address of a structure that's 2115 void **buf, int *len, int *proto,
2121 * managed separately. Caller must *not* attempt to free it. 2116 void **reply_buf, int *reply_len, int force_new)
2122 */
2123static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
2124 int *proto, int force_new)
2125{ 2117{
2126 struct ceph_osd *o = con->private; 2118 struct ceph_osd *o = con->private;
2127 struct ceph_osd_client *osdc = o->o_osdc; 2119 struct ceph_osd_client *osdc = o->o_osdc;
2128 struct ceph_auth_client *ac = osdc->client->monc.auth; 2120 struct ceph_auth_client *ac = osdc->client->monc.auth;
2129 struct ceph_auth_handshake *auth = &o->o_auth; 2121 int ret = 0;
2130 2122
2131 if (force_new && auth->authorizer) { 2123 if (force_new && o->o_authorizer) {
2132 if (ac->ops && ac->ops->destroy_authorizer) 2124 ac->ops->destroy_authorizer(ac, o->o_authorizer);
2133 ac->ops->destroy_authorizer(ac, auth->authorizer); 2125 o->o_authorizer = NULL;
2134 auth->authorizer = NULL; 2126 }
2135 } 2127 if (o->o_authorizer == NULL) {
2136 if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { 2128 ret = ac->ops->create_authorizer(
2137 int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, 2129 ac, CEPH_ENTITY_TYPE_OSD,
2138 auth); 2130 &o->o_authorizer,
2131 &o->o_authorizer_buf,
2132 &o->o_authorizer_buf_len,
2133 &o->o_authorizer_reply_buf,
2134 &o->o_authorizer_reply_buf_len);
2139 if (ret) 2135 if (ret)
2140 return ERR_PTR(ret); 2136 return ret;
2141 } 2137 }
2142 *proto = ac->protocol;
2143 2138
2144 return auth; 2139 *proto = ac->protocol;
2140 *buf = o->o_authorizer_buf;
2141 *len = o->o_authorizer_buf_len;
2142 *reply_buf = o->o_authorizer_reply_buf;
2143 *reply_len = o->o_authorizer_reply_buf_len;
2144 return 0;
2145} 2145}
2146 2146
2147 2147
@@ -2151,11 +2151,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
2151 struct ceph_osd_client *osdc = o->o_osdc; 2151 struct ceph_osd_client *osdc = o->o_osdc;
2152 struct ceph_auth_client *ac = osdc->client->monc.auth; 2152 struct ceph_auth_client *ac = osdc->client->monc.auth;
2153 2153
2154 /* 2154 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
2155 * XXX If ac->ops or ac->ops->verify_authorizer_reply is null,
2156 * XXX which do we do: succeed or fail?
2157 */
2158 return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len);
2159} 2155}
2160 2156
2161static int invalidate_authorizer(struct ceph_connection *con) 2157static int invalidate_authorizer(struct ceph_connection *con)
@@ -2164,7 +2160,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
2164 struct ceph_osd_client *osdc = o->o_osdc; 2160 struct ceph_osd_client *osdc = o->o_osdc;
2165 struct ceph_auth_client *ac = osdc->client->monc.auth; 2161 struct ceph_auth_client *ac = osdc->client->monc.auth;
2166 2162
2167 if (ac->ops && ac->ops->invalidate_authorizer) 2163 if (ac->ops->invalidate_authorizer)
2168 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); 2164 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
2169 2165
2170 return ceph_monc_validate_auth(&osdc->client->monc); 2166 return ceph_monc_validate_auth(&osdc->client->monc);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index de73214b5d2..fd863fe7693 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -38,7 +38,7 @@ done:
38 38
39/* maps */ 39/* maps */
40 40
41static int calc_bits_of(unsigned int t) 41static int calc_bits_of(unsigned t)
42{ 42{
43 int b = 0; 43 int b = 0;
44 while (t) { 44 while (t) {
@@ -135,21 +135,6 @@ bad:
135 return -EINVAL; 135 return -EINVAL;
136} 136}
137 137
138static int skip_name_map(void **p, void *end)
139{
140 int len;
141 ceph_decode_32_safe(p, end, len ,bad);
142 while (len--) {
143 int strlen;
144 *p += sizeof(u32);
145 ceph_decode_32_safe(p, end, strlen, bad);
146 *p += strlen;
147}
148 return 0;
149bad:
150 return -EINVAL;
151}
152
153static struct crush_map *crush_decode(void *pbyval, void *end) 138static struct crush_map *crush_decode(void *pbyval, void *end)
154{ 139{
155 struct crush_map *c; 140 struct crush_map *c;
@@ -158,7 +143,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
158 void **p = &pbyval; 143 void **p = &pbyval;
159 void *start = pbyval; 144 void *start = pbyval;
160 u32 magic; 145 u32 magic;
161 u32 num_name_maps;
162 146
163 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); 147 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
164 148
@@ -166,22 +150,24 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
166 if (c == NULL) 150 if (c == NULL)
167 return ERR_PTR(-ENOMEM); 151 return ERR_PTR(-ENOMEM);
168 152
169 /* set tunables to default values */
170 c->choose_local_tries = 2;
171 c->choose_local_fallback_tries = 5;
172 c->choose_total_tries = 19;
173
174 ceph_decode_need(p, end, 4*sizeof(u32), bad); 153 ceph_decode_need(p, end, 4*sizeof(u32), bad);
175 magic = ceph_decode_32(p); 154 magic = ceph_decode_32(p);
176 if (magic != CRUSH_MAGIC) { 155 if (magic != CRUSH_MAGIC) {
177 pr_err("crush_decode magic %x != current %x\n", 156 pr_err("crush_decode magic %x != current %x\n",
178 (unsigned int)magic, (unsigned int)CRUSH_MAGIC); 157 (unsigned)magic, (unsigned)CRUSH_MAGIC);
179 goto bad; 158 goto bad;
180 } 159 }
181 c->max_buckets = ceph_decode_32(p); 160 c->max_buckets = ceph_decode_32(p);
182 c->max_rules = ceph_decode_32(p); 161 c->max_rules = ceph_decode_32(p);
183 c->max_devices = ceph_decode_32(p); 162 c->max_devices = ceph_decode_32(p);
184 163
164 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
165 if (c->device_parents == NULL)
166 goto badmem;
167 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
168 if (c->bucket_parents == NULL)
169 goto badmem;
170
185 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); 171 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
186 if (c->buckets == NULL) 172 if (c->buckets == NULL)
187 goto badmem; 173 goto badmem;
@@ -297,8 +283,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
297 ceph_decode_32_safe(p, end, yes, bad); 283 ceph_decode_32_safe(p, end, yes, bad);
298#if BITS_PER_LONG == 32 284#if BITS_PER_LONG == 32
299 err = -EINVAL; 285 err = -EINVAL;
300 if (yes > (ULONG_MAX - sizeof(*r)) 286 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
301 / sizeof(struct crush_rule_step))
302 goto bad; 287 goto bad;
303#endif 288#endif
304 r = c->rules[i] = kmalloc(sizeof(*r) + 289 r = c->rules[i] = kmalloc(sizeof(*r) +
@@ -318,25 +303,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
318 } 303 }
319 304
320 /* ignore trailing name maps. */ 305 /* ignore trailing name maps. */
321 for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
322 err = skip_name_map(p, end);
323 if (err < 0)
324 goto done;
325 }
326
327 /* tunables */
328 ceph_decode_need(p, end, 3*sizeof(u32), done);
329 c->choose_local_tries = ceph_decode_32(p);
330 c->choose_local_fallback_tries = ceph_decode_32(p);
331 c->choose_total_tries = ceph_decode_32(p);
332 dout("crush decode tunable choose_local_tries = %d",
333 c->choose_local_tries);
334 dout("crush decode tunable choose_local_fallback_tries = %d",
335 c->choose_local_fallback_tries);
336 dout("crush decode tunable choose_total_tries = %d",
337 c->choose_total_tries);
338 306
339done:
340 dout("crush_decode success\n"); 307 dout("crush_decode success\n");
341 return c; 308 return c;
342 309
@@ -469,22 +436,6 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
469 return NULL; 436 return NULL;
470} 437}
471 438
472const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
473{
474 struct ceph_pg_pool_info *pi;
475
476 if (id == CEPH_NOPOOL)
477 return NULL;
478
479 if (WARN_ON_ONCE(id > (u64) INT_MAX))
480 return NULL;
481
482 pi = __lookup_pg_pool(&map->pg_pools, (int) id);
483
484 return pi ? pi->name : NULL;
485}
486EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
487
488int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) 439int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
489{ 440{
490 struct rb_node *rbp; 441 struct rb_node *rbp;
@@ -508,7 +459,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
508 459
509static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) 460static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
510{ 461{
511 unsigned int n, m; 462 unsigned n, m;
512 463
513 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 464 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
514 calc_pg_masks(pi); 465 calc_pg_masks(pi);
@@ -543,16 +494,15 @@ static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
543 ceph_decode_32_safe(p, end, pool, bad); 494 ceph_decode_32_safe(p, end, pool, bad);
544 ceph_decode_32_safe(p, end, len, bad); 495 ceph_decode_32_safe(p, end, len, bad);
545 dout(" pool %d len %d\n", pool, len); 496 dout(" pool %d len %d\n", pool, len);
546 ceph_decode_need(p, end, len, bad);
547 pi = __lookup_pg_pool(&map->pg_pools, pool); 497 pi = __lookup_pg_pool(&map->pg_pools, pool);
548 if (pi) { 498 if (pi) {
549 char *name = kstrndup(*p, len, GFP_NOFS);
550
551 if (!name)
552 return -ENOMEM;
553 kfree(pi->name); 499 kfree(pi->name);
554 pi->name = name; 500 pi->name = kmalloc(len + 1, GFP_NOFS);
555 dout(" name is %s\n", pi->name); 501 if (pi->name) {
502 memcpy(pi->name, *p, len);
503 pi->name[len] = '\0';
504 dout(" name is %s\n", pi->name);
505 }
556 } 506 }
557 *p += len; 507 *p += len;
558 } 508 }
@@ -661,12 +611,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
661 ceph_decode_32_safe(p, end, max, bad); 611 ceph_decode_32_safe(p, end, max, bad);
662 while (max--) { 612 while (max--) {
663 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); 613 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
664 err = -ENOMEM;
665 pi = kzalloc(sizeof(*pi), GFP_NOFS); 614 pi = kzalloc(sizeof(*pi), GFP_NOFS);
666 if (!pi) 615 if (!pi)
667 goto bad; 616 goto bad;
668 pi->id = ceph_decode_32(p); 617 pi->id = ceph_decode_32(p);
669 err = -EINVAL;
670 ev = ceph_decode_8(p); /* encoding version */ 618 ev = ceph_decode_8(p); /* encoding version */
671 if (ev > CEPH_PG_POOL_VERSION) { 619 if (ev > CEPH_PG_POOL_VERSION) {
672 pr_warning("got unknown v %d > %d of ceph_pg_pool\n", 620 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
@@ -682,13 +630,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
682 __insert_pg_pool(&map->pg_pools, pi); 630 __insert_pg_pool(&map->pg_pools, pi);
683 } 631 }
684 632
685 if (version >= 5) { 633 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
686 err = __decode_pool_names(p, end, map); 634 goto bad;
687 if (err < 0) {
688 dout("fail to decode pool names");
689 goto bad;
690 }
691 }
692 635
693 ceph_decode_32_safe(p, end, map->pool_max, bad); 636 ceph_decode_32_safe(p, end, map->pool_max, bad);
694 637
@@ -729,9 +672,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
729 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); 672 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
730 ceph_decode_copy(p, &pgid, sizeof(pgid)); 673 ceph_decode_copy(p, &pgid, sizeof(pgid));
731 n = ceph_decode_32(p); 674 n = ceph_decode_32(p);
732 err = -EINVAL;
733 if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
734 goto bad;
735 ceph_decode_need(p, end, n * sizeof(u32), bad); 675 ceph_decode_need(p, end, n * sizeof(u32), bad);
736 err = -ENOMEM; 676 err = -ENOMEM;
737 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); 677 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
@@ -768,7 +708,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
768 return map; 708 return map;
769 709
770bad: 710bad:
771 dout("osdmap_decode fail err %d\n", err); 711 dout("osdmap_decode fail\n");
772 ceph_osdmap_destroy(map); 712 ceph_osdmap_destroy(map);
773 return ERR_PTR(err); 713 return ERR_PTR(err);
774} 714}
@@ -862,7 +802,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
862 if (ev > CEPH_PG_POOL_VERSION) { 802 if (ev > CEPH_PG_POOL_VERSION) {
863 pr_warning("got unknown v %d > %d of ceph_pg_pool\n", 803 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
864 ev, CEPH_PG_POOL_VERSION); 804 ev, CEPH_PG_POOL_VERSION);
865 err = -EINVAL;
866 goto bad; 805 goto bad;
867 } 806 }
868 pi = __lookup_pg_pool(&map->pg_pools, pool); 807 pi = __lookup_pg_pool(&map->pg_pools, pool);
@@ -879,11 +818,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
879 if (err < 0) 818 if (err < 0)
880 goto bad; 819 goto bad;
881 } 820 }
882 if (version >= 5) { 821 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
883 err = __decode_pool_names(p, end, map); 822 goto bad;
884 if (err < 0)
885 goto bad;
886 }
887 823
888 /* old_pool */ 824 /* old_pool */
889 ceph_decode_32_safe(p, end, len, bad); 825 ceph_decode_32_safe(p, end, len, bad);
@@ -953,19 +889,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
953 pglen = ceph_decode_32(p); 889 pglen = ceph_decode_32(p);
954 890
955 if (pglen) { 891 if (pglen) {
956 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
957
958 /* removing existing (if any) */
959 (void) __remove_pg_mapping(&map->pg_temp, pgid);
960
961 /* insert */ 892 /* insert */
962 err = -EINVAL; 893 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
963 if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
964 goto bad;
965 err = -ENOMEM;
966 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); 894 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
967 if (!pg) 895 if (!pg) {
896 err = -ENOMEM;
968 goto bad; 897 goto bad;
898 }
969 pg->pgid = pgid; 899 pg->pgid = pgid;
970 pg->len = pglen; 900 pg->len = pglen;
971 for (j = 0; j < pglen; j++) 901 for (j = 0; j < pglen; j++)
@@ -1009,7 +939,7 @@ bad:
1009 * for now, we write only a single su, until we can 939 * for now, we write only a single su, until we can
1010 * pass a stride back to the caller. 940 * pass a stride back to the caller.
1011 */ 941 */
1012int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 942void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
1013 u64 off, u64 *plen, 943 u64 off, u64 *plen,
1014 u64 *ono, 944 u64 *ono,
1015 u64 *oxoff, u64 *oxlen) 945 u64 *oxoff, u64 *oxlen)
@@ -1023,17 +953,11 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
1023 953
1024 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, 954 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
1025 osize, su); 955 osize, su);
1026 if (su == 0 || sc == 0)
1027 goto invalid;
1028 su_per_object = osize / su; 956 su_per_object = osize / su;
1029 if (su_per_object == 0)
1030 goto invalid;
1031 dout("osize %u / su %u = su_per_object %u\n", osize, su, 957 dout("osize %u / su %u = su_per_object %u\n", osize, su,
1032 su_per_object); 958 su_per_object);
1033 959
1034 if ((su & ~PAGE_MASK) != 0) 960 BUG_ON((su & ~PAGE_MASK) != 0);
1035 goto invalid;
1036
1037 /* bl = *off / su; */ 961 /* bl = *off / su; */
1038 t = off; 962 t = off;
1039 do_div(t, su); 963 do_div(t, su);
@@ -1045,7 +969,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
1045 objsetno = stripeno / su_per_object; 969 objsetno = stripeno / su_per_object;
1046 970
1047 *ono = objsetno * sc + stripepos; 971 *ono = objsetno * sc + stripepos;
1048 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono); 972 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
1049 973
1050 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ 974 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
1051 t = off; 975 t = off;
@@ -1061,14 +985,6 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
1061 *plen = *oxlen; 985 *plen = *oxlen;
1062 986
1063 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); 987 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
1064 return 0;
1065
1066invalid:
1067 dout(" invalid layout\n");
1068 *ono = 0;
1069 *oxoff = 0;
1070 *oxlen = 0;
1071 return -EINVAL;
1072} 988}
1073EXPORT_SYMBOL(ceph_calc_file_object_mapping); 989EXPORT_SYMBOL(ceph_calc_file_object_mapping);
1074 990
@@ -1081,11 +997,12 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
1081 struct ceph_file_layout *fl, 997 struct ceph_file_layout *fl,
1082 struct ceph_osdmap *osdmap) 998 struct ceph_osdmap *osdmap)
1083{ 999{
1084 unsigned int num, num_mask; 1000 unsigned num, num_mask;
1085 struct ceph_pg pgid; 1001 struct ceph_pg pgid;
1002 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
1086 int poolid = le32_to_cpu(fl->fl_pg_pool); 1003 int poolid = le32_to_cpu(fl->fl_pg_pool);
1087 struct ceph_pg_pool_info *pool; 1004 struct ceph_pg_pool_info *pool;
1088 unsigned int ps; 1005 unsigned ps;
1089 1006
1090 BUG_ON(!osdmap); 1007 BUG_ON(!osdmap);
1091 1008
@@ -1093,13 +1010,23 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
1093 if (!pool) 1010 if (!pool)
1094 return -EIO; 1011 return -EIO;
1095 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); 1012 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
1096 num = le32_to_cpu(pool->v.pg_num); 1013 if (preferred >= 0) {
1097 num_mask = pool->pg_num_mask; 1014 ps += preferred;
1015 num = le32_to_cpu(pool->v.lpg_num);
1016 num_mask = pool->lpg_num_mask;
1017 } else {
1018 num = le32_to_cpu(pool->v.pg_num);
1019 num_mask = pool->pg_num_mask;
1020 }
1098 1021
1099 pgid.ps = cpu_to_le16(ps); 1022 pgid.ps = cpu_to_le16(ps);
1100 pgid.preferred = cpu_to_le16(-1); 1023 pgid.preferred = cpu_to_le16(preferred);
1101 pgid.pool = fl->fl_pg_pool; 1024 pgid.pool = fl->fl_pg_pool;
1102 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); 1025 if (preferred >= 0)
1026 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
1027 (int)preferred);
1028 else
1029 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
1103 1030
1104 ol->ol_pgid = pgid; 1031 ol->ol_pgid = pgid;
1105 ol->ol_stripe_unit = fl->fl_object_stripe_unit; 1032 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
@@ -1117,18 +1044,24 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1117 struct ceph_pg_mapping *pg; 1044 struct ceph_pg_mapping *pg;
1118 struct ceph_pg_pool_info *pool; 1045 struct ceph_pg_pool_info *pool;
1119 int ruleno; 1046 int ruleno;
1120 unsigned int poolid, ps, pps, t, r; 1047 unsigned poolid, ps, pps, t;
1048 int preferred;
1121 1049
1122 poolid = le32_to_cpu(pgid.pool); 1050 poolid = le32_to_cpu(pgid.pool);
1123 ps = le16_to_cpu(pgid.ps); 1051 ps = le16_to_cpu(pgid.ps);
1052 preferred = (s16)le16_to_cpu(pgid.preferred);
1124 1053
1125 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); 1054 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1126 if (!pool) 1055 if (!pool)
1127 return NULL; 1056 return NULL;
1128 1057
1129 /* pg_temp? */ 1058 /* pg_temp? */
1130 t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), 1059 if (preferred >= 0)
1131 pool->pgp_num_mask); 1060 t = ceph_stable_mod(ps, le32_to_cpu(pool->v.lpg_num),
1061 pool->lpgp_num_mask);
1062 else
1063 t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num),
1064 pool->pgp_num_mask);
1132 pgid.ps = cpu_to_le16(t); 1065 pgid.ps = cpu_to_le16(t);
1133 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1066 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1134 if (pg) { 1067 if (pg) {
@@ -1146,20 +1079,23 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1146 return NULL; 1079 return NULL;
1147 } 1080 }
1148 1081
1149 pps = ceph_stable_mod(ps, 1082 /* don't forcefeed bad device ids to crush */
1150 le32_to_cpu(pool->v.pgp_num), 1083 if (preferred >= osdmap->max_osd ||
1151 pool->pgp_num_mask); 1084 preferred >= osdmap->crush->max_devices)
1085 preferred = -1;
1086
1087 if (preferred >= 0)
1088 pps = ceph_stable_mod(ps,
1089 le32_to_cpu(pool->v.lpgp_num),
1090 pool->lpgp_num_mask);
1091 else
1092 pps = ceph_stable_mod(ps,
1093 le32_to_cpu(pool->v.pgp_num),
1094 pool->pgp_num_mask);
1152 pps += poolid; 1095 pps += poolid;
1153 r = crush_do_rule(osdmap->crush, ruleno, pps, osds, 1096 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1154 min_t(int, pool->v.size, *num), 1097 min_t(int, pool->v.size, *num),
1155 osdmap->osd_weight); 1098 preferred, osdmap->osd_weight);
1156 if (r < 0) {
1157 pr_err("error %d from crush rule: pool %d ruleset %d type %d"
1158 " size %d\n", r, poolid, pool->v.crush_ruleset,
1159 pool->v.type, pool->v.size);
1160 return NULL;
1161 }
1162 *num = r;
1163 return osds; 1099 return osds;
1164} 1100}
1165 1101
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index 92866bebb65..13cb409a7bb 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -1,3 +1,4 @@
1
1#include <linux/module.h> 2#include <linux/module.h>
2#include <linux/gfp.h> 3#include <linux/gfp.h>
3#include <linux/pagemap.h> 4#include <linux/pagemap.h>
@@ -71,7 +72,8 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
71} 72}
72EXPORT_SYMBOL(ceph_pagelist_append); 73EXPORT_SYMBOL(ceph_pagelist_append);
73 74
74/* Allocate enough pages for a pagelist to append the given amount 75/**
76 * Allocate enough pages for a pagelist to append the given amount
75 * of data without without allocating. 77 * of data without without allocating.
76 * Returns: 0 on success, -ENOMEM on error. 78 * Returns: 0 on success, -ENOMEM on error.
77 */ 79 */
@@ -93,7 +95,9 @@ int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
93} 95}
94EXPORT_SYMBOL(ceph_pagelist_reserve); 96EXPORT_SYMBOL(ceph_pagelist_reserve);
95 97
96/* Free any pages that have been preallocated. */ 98/**
99 * Free any pages that have been preallocated.
100 */
97int ceph_pagelist_free_reserve(struct ceph_pagelist *pl) 101int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
98{ 102{
99 while (!list_empty(&pl->free_list)) { 103 while (!list_empty(&pl->free_list)) {
@@ -108,7 +112,9 @@ int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
108} 112}
109EXPORT_SYMBOL(ceph_pagelist_free_reserve); 113EXPORT_SYMBOL(ceph_pagelist_free_reserve);
110 114
111/* Create a truncation point. */ 115/**
116 * Create a truncation point.
117 */
112void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, 118void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
113 struct ceph_pagelist_cursor *c) 119 struct ceph_pagelist_cursor *c)
114{ 120{
@@ -118,7 +124,8 @@ void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
118} 124}
119EXPORT_SYMBOL(ceph_pagelist_set_cursor); 125EXPORT_SYMBOL(ceph_pagelist_set_cursor);
120 126
121/* Truncate a pagelist to the given point. Move extra pages to reserve. 127/**
128 * Truncate a pagelist to the given point. Move extra pages to reserve.
122 * This won't sleep. 129 * This won't sleep.
123 * Returns: 0 on success, 130 * Returns: 0 on success,
124 * -EINVAL if the pagelist doesn't match the trunc point pagelist 131 * -EINVAL if the pagelist doesn't match the trunc point pagelist
@@ -133,8 +140,8 @@ int ceph_pagelist_truncate(struct ceph_pagelist *pl,
133 ceph_pagelist_unmap_tail(pl); 140 ceph_pagelist_unmap_tail(pl);
134 while (pl->head.prev != c->page_lru) { 141 while (pl->head.prev != c->page_lru) {
135 page = list_entry(pl->head.prev, struct page, lru); 142 page = list_entry(pl->head.prev, struct page, lru);
136 /* move from pagelist to reserve */ 143 list_del(&page->lru); /* remove from pagelist */
137 list_move_tail(&page->lru, &pl->free_list); 144 list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
138 ++pl->num_pages_free; 145 ++pl->num_pages_free;
139 } 146 }
140 pl->room = c->room; 147 pl->room = c->room;